summaryrefslogtreecommitdiff
path: root/drivers/infiniband/core
diff options
context:
space:
mode:
authorAndré Fabian Silva Delgado <emulatorman@parabola.nu>2015-08-05 17:04:01 -0300
committerAndré Fabian Silva Delgado <emulatorman@parabola.nu>2015-08-05 17:04:01 -0300
commit57f0f512b273f60d52568b8c6b77e17f5636edc0 (patch)
tree5e910f0e82173f4ef4f51111366a3f1299037a7b /drivers/infiniband/core
Initial import
Diffstat (limited to 'drivers/infiniband/core')
-rw-r--r--drivers/infiniband/core/Makefile34
-rw-r--r--drivers/infiniband/core/addr.c558
-rw-r--r--drivers/infiniband/core/agent.c217
-rw-r--r--drivers/infiniband/core/agent.h51
-rw-r--r--drivers/infiniband/core/cache.c439
-rw-r--r--drivers/infiniband/core/cm.c3931
-rw-r--r--drivers/infiniband/core/cm_msgs.h836
-rw-r--r--drivers/infiniband/core/cma.c3720
-rw-r--r--drivers/infiniband/core/core_priv.h54
-rw-r--r--drivers/infiniband/core/device.c785
-rw-r--r--drivers/infiniband/core/fmr_pool.c544
-rw-r--r--drivers/infiniband/core/iwcm.c1069
-rw-r--r--drivers/infiniband/core/iwcm.h62
-rw-r--r--drivers/infiniband/core/iwpm_msg.c756
-rw-r--r--drivers/infiniband/core/iwpm_util.c749
-rw-r--r--drivers/infiniband/core/iwpm_util.h253
-rw-r--r--drivers/infiniband/core/mad.c3176
-rw-r--r--drivers/infiniband/core/mad_priv.h227
-rw-r--r--drivers/infiniband/core/mad_rmpp.c953
-rw-r--r--drivers/infiniband/core/mad_rmpp.h58
-rw-r--r--drivers/infiniband/core/multicast.c903
-rw-r--r--drivers/infiniband/core/netlink.c216
-rw-r--r--drivers/infiniband/core/packer.c203
-rw-r--r--drivers/infiniband/core/sa.h66
-rw-r--r--drivers/infiniband/core/sa_query.c1280
-rw-r--r--drivers/infiniband/core/smi.c253
-rw-r--r--drivers/infiniband/core/smi.h90
-rw-r--r--drivers/infiniband/core/sysfs.c922
-rw-r--r--drivers/infiniband/core/ucm.c1370
-rw-r--r--drivers/infiniband/core/ucma.c1635
-rw-r--r--drivers/infiniband/core/ud_header.c414
-rw-r--r--drivers/infiniband/core/umem.c365
-rw-r--r--drivers/infiniband/core/umem_odp.c669
-rw-r--r--drivers/infiniband/core/umem_rbtree.c94
-rw-r--r--drivers/infiniband/core/user_mad.c1390
-rw-r--r--drivers/infiniband/core/uverbs.h263
-rw-r--r--drivers/infiniband/core/uverbs_cmd.c3357
-rw-r--r--drivers/infiniband/core/uverbs_main.c1039
-rw-r--r--drivers/infiniband/core/uverbs_marshall.c148
-rw-r--r--drivers/infiniband/core/verbs.c1448
40 files changed, 34597 insertions, 0 deletions
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
new file mode 100644
index 000000000..acf736764
--- /dev/null
+++ b/drivers/infiniband/core/Makefile
@@ -0,0 +1,34 @@
+# Optional objects.  Each variable name ends in the value of
+# CONFIG_INFINIBAND_ADDR_TRANS; only the "-y" form is referenced below.
+infiniband-$(CONFIG_INFINIBAND_ADDR_TRANS) := rdma_cm.o
+user_access-$(CONFIG_INFINIBAND_ADDR_TRANS) := rdma_ucm.o
+
+# Modules built for each configuration option.
+obj-$(CONFIG_INFINIBAND) += ib_core.o ib_mad.o ib_sa.o \
+ ib_cm.o iw_cm.o ib_addr.o \
+ $(infiniband-y)
+obj-$(CONFIG_INFINIBAND_USER_MAD) += ib_umad.o
+obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \
+ $(user_access-y)
+
+# Object files that make up each of the modules listed above.
+ib_core-y := packer.o ud_header.o verbs.o sysfs.o \
+ device.o fmr_pool.o cache.o netlink.o
+ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
+ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o
+
+ib_mad-y := mad.o smi.o agent.o mad_rmpp.o
+
+ib_sa-y := sa_query.o multicast.o
+
+ib_cm-y := cm.o
+
+iw_cm-y := iwcm.o iwpm_util.o iwpm_msg.o
+
+rdma_cm-y := cma.o
+
+rdma_ucm-y := ucma.o
+
+ib_addr-y := addr.o
+
+ib_umad-y := user_mad.o
+
+ib_ucm-y := ucm.o
+
+ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o
diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c
new file mode 100644
index 000000000..38339d220
--- /dev/null
+++ b/drivers/infiniband/core/addr.c
@@ -0,0 +1,558 @@
+/*
+ * Copyright (c) 2005 Voltaire Inc. All rights reserved.
+ * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved.
+ * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mutex.h>
+#include <linux/inetdevice.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include <linux/module.h>
+#include <net/arp.h>
+#include <net/neighbour.h>
+#include <net/route.h>
+#include <net/netevent.h>
+#include <net/addrconf.h>
+#include <net/ip6_route.h>
+#include <rdma/ib_addr.h>
+#include <rdma/ib.h>
+
+MODULE_AUTHOR("Sean Hefty");
+MODULE_DESCRIPTION("IB Address Translation");
+MODULE_LICENSE("Dual BSD/GPL");
+
+/*
+ * One pending address-resolution request.
+ *
+ * list:     link in req_list while pending, then in process_req()'s local
+ *           done list
+ * src_addr: the request's own copy of the source address
+ * dst_addr: the request's own copy of the destination address
+ * addr:     caller-owned result buffer handed to addr_resolve()
+ * client:   client whose refcount is raised for the life of the request
+ * context:  opaque value passed back to callback
+ * callback: completion routine invoked from process_req()
+ * timeout:  absolute time in jiffies after which a still-failing request
+ *           completes with -ETIMEDOUT
+ * status:   latest resolution result; -ENODATA means "not resolved yet"
+ */
+struct addr_req {
+ struct list_head list;
+ struct sockaddr_storage src_addr;
+ struct sockaddr_storage dst_addr;
+ struct rdma_dev_addr *addr;
+ struct rdma_addr_client *client;
+ void *context;
+ void (*callback)(int status, struct sockaddr *src_addr,
+ struct rdma_dev_addr *addr, void *context);
+ unsigned long timeout;
+ int status;
+};
+
+/* Handler of the delayed work declared below; defined later in this file. */
+static void process_req(struct work_struct *work);
+
+/* lock is held around every traversal or modification of req_list. */
+static DEFINE_MUTEX(lock);
+/* Pending requests; queue_req() keeps them ordered by ascending timeout. */
+static LIST_HEAD(req_list);
+/* Runs process_req(); (re)armed through set_timeout(). */
+static DECLARE_DELAYED_WORK(work, process_req);
+/* Workqueue on which the delayed work is queued; created in addr_init(). */
+static struct workqueue_struct *addr_wq;
+
+/*
+ * rdma_addr_size - size of the concrete sockaddr structure behind @addr
+ * @addr: address whose sa_family selects the structure
+ *
+ * Returns the size of sockaddr_in, sockaddr_in6 or sockaddr_ib for
+ * AF_INET, AF_INET6 and AF_IB respectively, and 0 for any other family.
+ */
+int rdma_addr_size(struct sockaddr *addr)
+{
+	int size;
+
+	switch (addr->sa_family) {
+	case AF_INET:
+		size = sizeof(struct sockaddr_in);
+		break;
+	case AF_INET6:
+		size = sizeof(struct sockaddr_in6);
+		break;
+	case AF_IB:
+		size = sizeof(struct sockaddr_ib);
+		break;
+	default:
+		/* Unknown family: nothing can be copied safely. */
+		size = 0;
+		break;
+	}
+
+	return size;
+}
+EXPORT_SYMBOL(rdma_addr_size);
+
+/* Client used for the requests issued by this file itself. */
+static struct rdma_addr_client self;
+
+/*
+ * rdma_addr_register_client - prepare @client for use with rdma_resolve_ip()
+ *
+ * The refcount starts at 1; that initial reference is dropped by
+ * rdma_addr_unregister_client().
+ */
+void rdma_addr_register_client(struct rdma_addr_client *client)
+{
+ atomic_set(&client->refcount, 1);
+ init_completion(&client->comp);
+}
+EXPORT_SYMBOL(rdma_addr_register_client);
+
+/* Drop one reference on @client and signal client->comp on the last one. */
+static inline void put_client(struct rdma_addr_client *client)
+{
+ if (atomic_dec_and_test(&client->refcount))
+ complete(&client->comp);
+}
+
+/*
+ * rdma_addr_unregister_client - drop the registration reference and wait
+ *
+ * Blocks until the refcount reaches zero, i.e. until every request that
+ * took a reference on @client in rdma_resolve_ip() has been released by
+ * process_req().
+ */
+void rdma_addr_unregister_client(struct rdma_addr_client *client)
+{
+ put_client(client);
+ wait_for_completion(&client->comp);
+}
+EXPORT_SYMBOL(rdma_addr_unregister_client);
+
+/*
+ * rdma_copy_addr - describe @dev (and optionally a peer) in @dev_addr
+ * @dev_addr: structure to fill
+ * @dev: local network device supplying type, addresses and ifindex
+ * @dst_dev_addr: peer hardware address, or NULL to leave dst_dev_addr as is
+ *
+ * Always returns 0.
+ */
+int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev,
+		   const unsigned char *dst_dev_addr)
+{
+	/* Scalar identity of the local interface. */
+	dev_addr->dev_type = dev->type;
+	dev_addr->bound_dev_if = dev->ifindex;
+
+	/* Local hardware and broadcast addresses, copied at full width. */
+	memcpy(dev_addr->src_dev_addr, dev->dev_addr, MAX_ADDR_LEN);
+	memcpy(dev_addr->broadcast, dev->broadcast, MAX_ADDR_LEN);
+
+	if (dst_dev_addr != NULL)
+		memcpy(dev_addr->dst_dev_addr, dst_dev_addr, MAX_ADDR_LEN);
+
+	return 0;
+}
+EXPORT_SYMBOL(rdma_copy_addr);
+
+/*
+ * rdma_translate_ip - fill @dev_addr from the local interface owning @addr
+ * @addr: local IPv4 or IPv6 address to look up in init_net
+ * @dev_addr: receives device type, hardware/broadcast addresses and ifindex
+ * @vlan_id: optional; receives rdma_vlan_dev_vlan_id() of the matching device
+ *
+ * If @dev_addr->bound_dev_if is already set, that interface is used directly
+ * and neither @addr nor @vlan_id is consulted.
+ *
+ * Returns 0 on success, -ENODEV if the bound interface does not exist, or
+ * -EADDRNOTAVAIL if no interface owns @addr or the address family is not
+ * handled here.
+ */
+int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr,
+ u16 *vlan_id)
+{
+ struct net_device *dev;
+ int ret = -EADDRNOTAVAIL;
+
+ if (dev_addr->bound_dev_if) {
+ dev = dev_get_by_index(&init_net, dev_addr->bound_dev_if);
+ if (!dev)
+ return -ENODEV;
+ ret = rdma_copy_addr(dev_addr, dev, NULL);
+ dev_put(dev);
+ return ret;
+ }
+
+ switch (addr->sa_family) {
+ case AF_INET:
+ dev = ip_dev_find(&init_net,
+ ((struct sockaddr_in *) addr)->sin_addr.s_addr);
+
+ if (!dev)
+ return ret;
+
+ ret = rdma_copy_addr(dev_addr, dev, NULL);
+ if (vlan_id)
+ *vlan_id = rdma_vlan_dev_vlan_id(dev);
+ dev_put(dev);
+ break;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ /* Walk all devices under RCU; the first one owning the address wins. */
+ rcu_read_lock();
+ for_each_netdev_rcu(&init_net, dev) {
+ if (ipv6_chk_addr(&init_net,
+ &((struct sockaddr_in6 *) addr)->sin6_addr,
+ dev, 1)) {
+ ret = rdma_copy_addr(dev_addr, dev, NULL);
+ if (vlan_id)
+ *vlan_id = rdma_vlan_dev_vlan_id(dev);
+ break;
+ }
+ }
+ rcu_read_unlock();
+ break;
+#endif
+ }
+ return ret;
+}
+EXPORT_SYMBOL(rdma_translate_ip);
+
+/*
+ * (Re)arm the delayed work so that it fires at absolute time @time, given
+ * in jiffies.  A @time that already lies in the past fires with no delay.
+ */
+static void set_timeout(unsigned long time)
+{
+	/* Signed view of the distance so that "already expired" is negative. */
+	long remaining = (long)(time - jiffies);
+	unsigned long delay = remaining < 0 ? 0 : (unsigned long)remaining;
+
+	mod_delayed_work(addr_wq, &work, delay);
+}
+
+/*
+ * Insert @req into req_list, keeping the list sorted by ascending timeout,
+ * and re-arm the delayed work if @req became the earliest entry.
+ */
+static void queue_req(struct addr_req *req)
+{
+ struct addr_req *temp_req;
+
+ mutex_lock(&lock);
+ /* Scan from the tail for the last entry that does not expire after req. */
+ list_for_each_entry_reverse(temp_req, &req_list, list) {
+ if (time_after_eq(req->timeout, temp_req->timeout))
+ break;
+ }
+
+ /*
+ * Insert right after that entry.  If the scan ran off the front,
+ * &temp_req->list is the list head and req becomes the first entry.
+ */
+ list_add(&req->list, &temp_req->list);
+
+ if (req_list.next == &req->list)
+ set_timeout(req->timeout);
+ mutex_unlock(&lock);
+}
+
+/*
+ * Look up the neighbour entry for @daddr on @dst and, if it is valid, copy
+ * its hardware address (together with the details of dst->dev) into
+ * @dev_addr.
+ *
+ * Returns the result of rdma_copy_addr() on success, or -ENODATA when there
+ * is no neighbour entry or the entry is not NUD_VALID.
+ */
+static int dst_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr, void *daddr)
+{
+ struct neighbour *n;
+ int ret;
+
+ n = dst_neigh_lookup(dst, daddr);
+
+ rcu_read_lock();
+ if (!n || !(n->nud_state & NUD_VALID)) {
+ /* presumably this starts neighbour discovery for the entry - confirm */
+ if (n)
+ neigh_event_send(n, NULL);
+ ret = -ENODATA;
+ } else {
+ ret = rdma_copy_addr(dev_addr, dst->dev, n->ha);
+ }
+ rcu_read_unlock();
+
+ if (n)
+ neigh_release(n);
+
+ return ret;
+}
+
+/*
+ * Resolve an IPv4 destination.
+ *
+ * Looks up a route to @dst_in in init_net, using @addr->bound_dev_if as the
+ * output interface of the flow.  The flow's source address, as left by the
+ * route lookup, is written back to @src_in.  @addr is then filled from the
+ * route's device: via rdma_translate_ip() for loopback, rdma_copy_addr()
+ * for IFF_NOARP devices, and the neighbour table otherwise.
+ *
+ * Returns 0 on success, the route lookup error, or the error of the helper
+ * used (-ENODATA from dst_fetch_ha() while the neighbour is not valid).
+ */
+static int addr4_resolve(struct sockaddr_in *src_in,
+ struct sockaddr_in *dst_in,
+ struct rdma_dev_addr *addr)
+{
+ __be32 src_ip = src_in->sin_addr.s_addr;
+ __be32 dst_ip = dst_in->sin_addr.s_addr;
+ struct rtable *rt;
+ struct flowi4 fl4;
+ int ret;
+
+ memset(&fl4, 0, sizeof(fl4));
+ fl4.daddr = dst_ip;
+ fl4.saddr = src_ip;
+ fl4.flowi4_oif = addr->bound_dev_if;
+ rt = ip_route_output_key(&init_net, &fl4);
+ if (IS_ERR(rt)) {
+ ret = PTR_ERR(rt);
+ goto out;
+ }
+ src_in->sin_family = AF_INET;
+ src_in->sin_addr.s_addr = fl4.saddr;
+
+ /* Loopback route: the destination is local, so dst MAC == src MAC. */
+ if (rt->dst.dev->flags & IFF_LOOPBACK) {
+ ret = rdma_translate_ip((struct sockaddr *)dst_in, addr, NULL);
+ if (!ret)
+ memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN);
+ goto put;
+ }
+
+ /* If the device does ARP internally, return 'done' */
+ if (rt->dst.dev->flags & IFF_NOARP) {
+ ret = rdma_copy_addr(addr, rt->dst.dev, NULL);
+ goto put;
+ }
+
+ ret = dst_fetch_ha(&rt->dst, addr, &fl4.daddr);
+put:
+ ip_rt_put(rt);
+out:
+ return ret;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+/*
+ * Resolve an IPv6 destination.
+ *
+ * Looks up a route to @dst_in in init_net, using @addr->bound_dev_if as the
+ * output interface of the flow.  When the source address is unspecified,
+ * one is selected with ipv6_dev_get_saddr() and written back to @src_in.
+ * @addr is then filled from the route's device: via rdma_translate_ip()
+ * for loopback, rdma_copy_addr() for IFF_NOARP devices, and the neighbour
+ * table otherwise.
+ *
+ * Returns 0 on success, dst->error, or the error of the helper used
+ * (-ENODATA from dst_fetch_ha() while the neighbour is not valid).
+ */
+static int addr6_resolve(struct sockaddr_in6 *src_in,
+ struct sockaddr_in6 *dst_in,
+ struct rdma_dev_addr *addr)
+{
+ struct flowi6 fl6;
+ struct dst_entry *dst;
+ int ret;
+
+ memset(&fl6, 0, sizeof fl6);
+ fl6.daddr = dst_in->sin6_addr;
+ fl6.saddr = src_in->sin6_addr;
+ fl6.flowi6_oif = addr->bound_dev_if;
+
+ /* A failed lookup is reported through dst->error; dst is still released. */
+ dst = ip6_route_output(&init_net, NULL, &fl6);
+ if ((ret = dst->error))
+ goto put;
+
+ if (ipv6_addr_any(&fl6.saddr)) {
+ ret = ipv6_dev_get_saddr(&init_net, ip6_dst_idev(dst)->dev,
+ &fl6.daddr, 0, &fl6.saddr);
+ if (ret)
+ goto put;
+
+ src_in->sin6_family = AF_INET6;
+ src_in->sin6_addr = fl6.saddr;
+ }
+
+ /* Loopback route: the destination is local, so dst MAC == src MAC. */
+ if (dst->dev->flags & IFF_LOOPBACK) {
+ ret = rdma_translate_ip((struct sockaddr *)dst_in, addr, NULL);
+ if (!ret)
+ memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN);
+ goto put;
+ }
+
+ /* If the device does ARP internally, return 'done' */
+ if (dst->dev->flags & IFF_NOARP) {
+ ret = rdma_copy_addr(addr, dst->dev, NULL);
+ goto put;
+ }
+
+ ret = dst_fetch_ha(dst, addr, &fl6.daddr);
+put:
+ dst_release(dst);
+ return ret;
+}
+#else
+/* Without IPv6 support every IPv6 resolution fails with -EADDRNOTAVAIL. */
+static int addr6_resolve(struct sockaddr_in6 *src_in,
+ struct sockaddr_in6 *dst_in,
+ struct rdma_dev_addr *addr)
+{
+ return -EADDRNOTAVAIL;
+}
+#endif
+
+/*
+ * Dispatch to the resolver matching the source address family: AF_INET is
+ * handled by addr4_resolve(), every other family by addr6_resolve().
+ */
+static int addr_resolve(struct sockaddr *src_in,
+			struct sockaddr *dst_in,
+			struct rdma_dev_addr *addr)
+{
+	if (src_in->sa_family != AF_INET)
+		return addr6_resolve((struct sockaddr_in6 *) src_in,
+				     (struct sockaddr_in6 *) dst_in, addr);
+
+	return addr4_resolve((struct sockaddr_in *) src_in,
+			     (struct sockaddr_in *) dst_in, addr);
+}
+
+/*
+ * Delayed-work handler: retry unresolved requests and complete finished ones.
+ *
+ * Under lock, every request still marked -ENODATA is resolved again.  A
+ * request that is still -ENODATA and has not reached its timeout stays on
+ * req_list; every other request is moved to a local list.  The callbacks
+ * run only after the lock has been dropped.
+ */
+static void process_req(struct work_struct *work)
+{
+ struct addr_req *req, *temp_req;
+ struct sockaddr *src_in, *dst_in;
+ struct list_head done_list;
+
+ INIT_LIST_HEAD(&done_list);
+
+ mutex_lock(&lock);
+ list_for_each_entry_safe(req, temp_req, &req_list, list) {
+ if (req->status == -ENODATA) {
+ src_in = (struct sockaddr *) &req->src_addr;
+ dst_in = (struct sockaddr *) &req->dst_addr;
+ req->status = addr_resolve(src_in, dst_in, req->addr);
+ /* Any failure at or past the deadline is reported as a timeout. */
+ if (req->status && time_after_eq(jiffies, req->timeout))
+ req->status = -ETIMEDOUT;
+ else if (req->status == -ENODATA)
+ continue;
+ }
+ list_move_tail(&req->list, &done_list);
+ }
+
+ /* Re-arm for the earliest request that is still pending. */
+ if (!list_empty(&req_list)) {
+ req = list_entry(req_list.next, struct addr_req, list);
+ set_timeout(req->timeout);
+ }
+ mutex_unlock(&lock);
+
+ list_for_each_entry_safe(req, temp_req, &done_list, list) {
+ list_del(&req->list);
+ req->callback(req->status, (struct sockaddr *) &req->src_addr,
+ req->addr, req->context);
+ put_client(req->client);
+ kfree(req);
+ }
+}
+
+/*
+ * rdma_resolve_ip - resolve source/destination IP addresses to device info
+ * @client: registered client; a reference is held while the request exists
+ * @src_addr: optional source address; must have the same family as @dst_addr
+ * @dst_addr: destination address
+ * @addr: result buffer; the pointer is kept in the request until @callback
+ *        has been called
+ * @timeout_ms: how long an unresolved (-ENODATA) request keeps being retried
+ * @callback: called from process_req() with the final status
+ * @context: opaque pointer passed to @callback
+ *
+ * Returns 0 when the request was queued; the outcome is then always reported
+ * through @callback, even if the first resolution attempt already succeeded.
+ * Returns -ENOMEM, -EINVAL on a family mismatch, or the error of the first
+ * resolution attempt; in those cases @callback is not called.
+ */
+int rdma_resolve_ip(struct rdma_addr_client *client,
+ struct sockaddr *src_addr, struct sockaddr *dst_addr,
+ struct rdma_dev_addr *addr, int timeout_ms,
+ void (*callback)(int status, struct sockaddr *src_addr,
+ struct rdma_dev_addr *addr, void *context),
+ void *context)
+{
+ struct sockaddr *src_in, *dst_in;
+ struct addr_req *req;
+ int ret = 0;
+
+ req = kzalloc(sizeof *req, GFP_KERNEL);
+ if (!req)
+ return -ENOMEM;
+
+ src_in = (struct sockaddr *) &req->src_addr;
+ dst_in = (struct sockaddr *) &req->dst_addr;
+
+ /*
+ * NOTE(review): the family itself is not validated here.  For a family
+ * unknown to rdma_addr_size() nothing is copied, and addr_resolve()
+ * treats every family other than AF_INET as IPv6 - confirm that all
+ * callers pass only AF_INET/AF_INET6.
+ */
+ if (src_addr) {
+ if (src_addr->sa_family != dst_addr->sa_family) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ memcpy(src_in, src_addr, rdma_addr_size(src_addr));
+ } else {
+ src_in->sa_family = dst_addr->sa_family;
+ }
+
+ memcpy(dst_in, dst_addr, rdma_addr_size(dst_addr));
+ req->addr = addr;
+ req->callback = callback;
+ req->context = context;
+ req->client = client;
+ atomic_inc(&client->refcount);
+
+ req->status = addr_resolve(src_in, dst_in, addr);
+ switch (req->status) {
+ case 0:
+ /* Already resolved: queue with an expired timeout. */
+ req->timeout = jiffies;
+ queue_req(req);
+ break;
+ case -ENODATA:
+ req->timeout = msecs_to_jiffies(timeout_ms) + jiffies;
+ queue_req(req);
+ break;
+ default:
+ ret = req->status;
+ /* Undo the reference taken above; the request is never queued. */
+ atomic_dec(&client->refcount);
+ goto err;
+ }
+ return ret;
+err:
+ kfree(req);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_resolve_ip);
+
+/*
+ * rdma_addr_cancel - cancel the pending request whose result buffer is @addr
+ *
+ * The first matching request is marked -ECANCELED, moved to the head of
+ * req_list, and the delayed work is re-armed with no delay.  The request's
+ * callback is invoked by process_req(), not by this function.
+ */
+void rdma_addr_cancel(struct rdma_dev_addr *addr)
+{
+ struct addr_req *req, *temp_req;
+
+ mutex_lock(&lock);
+ list_for_each_entry_safe(req, temp_req, &req_list, list) {
+ if (req->addr == addr) {
+ req->status = -ECANCELED;
+ req->timeout = jiffies;
+ list_move(&req->list, &req_list);
+ set_timeout(req->timeout);
+ break;
+ }
+ }
+ mutex_unlock(&lock);
+}
+EXPORT_SYMBOL(rdma_addr_cancel);
+
+/*
+ * Hand-off between rdma_addr_find_dmac_by_grh() and resolve_cb().
+ *
+ * addr:   where the callback stores the resolved device address
+ * comp:   completed by the callback once the request has finished
+ * status: final status reported by the resolver (0 on success)
+ */
+struct resolve_cb_context {
+	struct rdma_dev_addr *addr;
+	struct completion comp;
+	int status;
+};
+
+/*
+ * Completion callback for rdma_resolve_ip(): record the outcome in the
+ * waiter's context and wake the waiter up.
+ */
+static void resolve_cb(int status, struct sockaddr *src_addr,
+		       struct rdma_dev_addr *addr, void *context)
+{
+	struct resolve_cb_context *ctx = context;
+
+	/*
+	 * Only a successful resolution carries a usable address.  The waiter
+	 * in this file passes the same buffer as request target and as
+	 * ctx->addr, so skip the copy then rather than memcpy() an object
+	 * onto itself.
+	 */
+	if (!status && ctx->addr != addr)
+		memcpy(ctx->addr, addr, sizeof(*addr));
+
+	/* Publish the status before waking the waiter; ctx lives on its stack. */
+	ctx->status = status;
+	complete(&ctx->comp);
+}
+
+/*
+ * rdma_addr_find_dmac_by_grh - resolve the destination MAC for a GID pair
+ * @sgid: source GID, converted to an IP address with rdma_gid2ip()
+ * @dgid: destination GID, converted to an IP address with rdma_gid2ip()
+ * @dmac: receives ETH_ALEN bytes of the resolved destination address
+ * @vlan_id: optional; receives rdma_vlan_dev_vlan_id() of the bound device
+ *
+ * Issues an rdma_resolve_ip() request with a 1000 ms timeout and sleeps
+ * until its callback has run.
+ *
+ * Returns 0 on success, the error from rdma_resolve_ip(), the failure
+ * status reported to the callback (for example -ETIMEDOUT), or -ENODEV if
+ * the resolved interface no longer exists.  @dmac and @vlan_id are written
+ * only when resolution succeeded.
+ */
+int rdma_addr_find_dmac_by_grh(union ib_gid *sgid, union ib_gid *dgid, u8 *dmac,
+			       u16 *vlan_id)
+{
+	int ret = 0;
+	struct rdma_dev_addr dev_addr;
+	struct resolve_cb_context ctx;
+	struct net_device *dev;
+
+	union {
+		struct sockaddr     _sockaddr;
+		struct sockaddr_in  _sockaddr_in;
+		struct sockaddr_in6 _sockaddr_in6;
+	} sgid_addr, dgid_addr;
+
+	rdma_gid2ip(&sgid_addr._sockaddr, sgid);
+	rdma_gid2ip(&dgid_addr._sockaddr, dgid);
+
+	memset(&dev_addr, 0, sizeof(dev_addr));
+
+	ctx.addr = &dev_addr;
+	ctx.status = 0;
+	init_completion(&ctx.comp);
+	ret = rdma_resolve_ip(&self, &sgid_addr._sockaddr, &dgid_addr._sockaddr,
+			      &dev_addr, 1000, resolve_cb, &ctx);
+	if (ret)
+		return ret;
+
+	wait_for_completion(&ctx.comp);
+
+	/* The request was queued, so the real outcome arrives via the callback. */
+	ret = ctx.status;
+	if (ret)
+		return ret;
+
+	memcpy(dmac, dev_addr.dst_dev_addr, ETH_ALEN);
+	dev = dev_get_by_index(&init_net, dev_addr.bound_dev_if);
+	if (!dev)
+		return -ENODEV;
+	if (vlan_id)
+		*vlan_id = rdma_vlan_dev_vlan_id(dev);
+	dev_put(dev);
+	return ret;
+}
+EXPORT_SYMBOL(rdma_addr_find_dmac_by_grh);
+
+/*
+ * rdma_addr_find_smac_by_sgid - look up the source MAC belonging to a GID
+ * @sgid: source GID, converted to an IP address with rdma_gid2ip()
+ * @smac: receives ETH_ALEN bytes of the local device address on success
+ * @vlan_id: optional; passed through to rdma_translate_ip()
+ *
+ * Returns 0 on success or the error from rdma_translate_ip(), in which
+ * case @smac is left untouched.
+ */
+int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id)
+{
+	union {
+		struct sockaddr     _sockaddr;
+		struct sockaddr_in  _sockaddr_in;
+		struct sockaddr_in6 _sockaddr_in6;
+	} gid_addr;
+	struct rdma_dev_addr dev_addr;
+	int ret;
+
+	rdma_gid2ip(&gid_addr._sockaddr, sgid);
+
+	/* Start from a zeroed result so that bound_dev_if is unset. */
+	memset(&dev_addr, 0, sizeof(dev_addr));
+	ret = rdma_translate_ip(&gid_addr._sockaddr, &dev_addr, vlan_id);
+	if (!ret)
+		memcpy(smac, dev_addr.src_dev_addr, ETH_ALEN);
+
+	return ret;
+}
+EXPORT_SYMBOL(rdma_addr_find_smac_by_sgid);
+
+/*
+ * Netevent notifier: when a neighbour entry is updated and is NUD_VALID,
+ * run the request processing work with no delay.  Always returns 0.
+ */
+static int netevent_callback(struct notifier_block *self, unsigned long event,
+			     void *ctx)
+{
+	struct neighbour *neigh;
+
+	if (event != NETEVENT_NEIGH_UPDATE)
+		return 0;
+
+	neigh = ctx;
+	if (neigh->nud_state & NUD_VALID)
+		set_timeout(jiffies);
+
+	return 0;
+}
+
+/* Registered in addr_init() and removed again in addr_cleanup(). */
+static struct notifier_block nb = {
+	.notifier_call = netevent_callback
+};
+
+/*
+ * Module init: create the "ib_addr" workqueue, register the netevent
+ * notifier and register the file-local client.
+ *
+ * Returns 0 on success or -ENOMEM if the workqueue cannot be created.
+ */
+static int __init addr_init(void)
+{
+ addr_wq = create_singlethread_workqueue("ib_addr");
+ if (!addr_wq)
+ return -ENOMEM;
+
+ register_netevent_notifier(&nb);
+ rdma_addr_register_client(&self);
+ return 0;
+}
+
+/*
+ * Module exit: undo addr_init() in reverse order.  Unregistering the
+ * file-local client waits until its refcount has dropped to zero.
+ */
+static void __exit addr_cleanup(void)
+{
+ rdma_addr_unregister_client(&self);
+ unregister_netevent_notifier(&nb);
+ destroy_workqueue(addr_wq);
+}
+
+module_init(addr_init);
+module_exit(addr_cleanup);
diff --git a/drivers/infiniband/core/agent.c b/drivers/infiniband/core/agent.c
new file mode 100644
index 000000000..f6d29614c
--- /dev/null
+++ b/drivers/infiniband/core/agent.c
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2004, 2005 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2004, 2005 Infinicon Corporation. All rights reserved.
+ * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004, 2005 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004-2007 Voltaire Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/string.h>
+
+#include "agent.h"
+#include "smi.h"
+#include "mad_priv.h"
+
+/* Log prefix for this file. */
+#define SPFX "ib_agent: "
+
+/*
+ * Per-port agent state.
+ *
+ * port_list: link in ib_agent_port_list
+ * agent:     [0] is the SMI agent and stays NULL unless the port's link
+ *            layer is InfiniBand; [1] is the GSI agent
+ */
+struct ib_agent_port_private {
+ struct list_head port_list;
+ struct ib_mad_agent *agent[2];
+};
+
+/* Taken with interrupts disabled around every access to the list. */
+static DEFINE_SPINLOCK(ib_agent_port_list_lock);
+static LIST_HEAD(ib_agent_port_list);
+
+/*
+ * Find the list entry for (@device, @port_num), or return NULL.
+ *
+ * Entries are matched through the GSI agent (agent[1]).  No lock is taken
+ * here; both callers in this file hold ib_agent_port_list_lock.
+ */
+static struct ib_agent_port_private *
+__ib_get_agent_port(struct ib_device *device, int port_num)
+{
+ struct ib_agent_port_private *entry;
+
+ list_for_each_entry(entry, &ib_agent_port_list, port_list) {
+ if (entry->agent[1]->device == device &&
+ entry->agent[1]->port_num == port_num)
+ return entry;
+ }
+ return NULL;
+}
+
+/*
+ * Locked wrapper around __ib_get_agent_port().  The lock is released before
+ * returning, so the entry is used by the caller without the lock held.
+ */
+static struct ib_agent_port_private *
+ib_get_agent_port(struct ib_device *device, int port_num)
+{
+ struct ib_agent_port_private *entry;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ib_agent_port_list_lock, flags);
+ entry = __ib_get_agent_port(device, port_num);
+ spin_unlock_irqrestore(&ib_agent_port_list_lock, flags);
+ return entry;
+}
+
+/*
+ * agent_send_response - send @mad back to the originator of a received MAD
+ * @mad: MAD to send; copied into a newly created send buffer
+ * @grh: GRH of the received MAD, used to build the address handle
+ * @wc: work completion of the received MAD (source QP, P_Key index, ...)
+ * @device: device the MAD was received on
+ * @port_num: port the MAD was received on
+ * @qpn: index into the per-port agent array (0 = SMI, 1 = GSI)
+ *
+ * Failures are logged and otherwise ignored.  On a successful post the
+ * address handle and the send buffer are released by agent_send_handler().
+ */
+void agent_send_response(struct ib_mad *mad, struct ib_grh *grh,
+			 struct ib_wc *wc, struct ib_device *device,
+			 int port_num, int qpn)
+{
+	struct ib_agent_port_private *port_priv;
+	struct ib_mad_agent *agent;
+	struct ib_mad_send_buf *send_buf;
+	struct ib_ah *ah;
+	struct ib_mad_send_wr_private *mad_send_wr;
+
+	/* On a switch the agents are looked up under port 0. */
+	if (device->node_type == RDMA_NODE_IB_SWITCH)
+		port_priv = ib_get_agent_port(device, 0);
+	else
+		port_priv = ib_get_agent_port(device, port_num);
+
+	if (!port_priv) {
+		dev_err(&device->dev, "Unable to find port agent\n");
+		return;
+	}
+
+	/*
+	 * Reject an out-of-range index, and a slot without an agent:
+	 * agent[0] stays NULL when ib_agent_port_open() did not register an
+	 * SMI agent for this port.
+	 */
+	if (qpn < 0 || qpn >= (int)ARRAY_SIZE(port_priv->agent) ||
+	    !port_priv->agent[qpn]) {
+		dev_err(&device->dev, "No MAD agent for QP%d\n", qpn);
+		return;
+	}
+
+	agent = port_priv->agent[qpn];
+	ah = ib_create_ah_from_wc(agent->qp->pd, wc, grh, port_num);
+	if (IS_ERR(ah)) {
+		dev_err(&device->dev, "ib_create_ah_from_wc error %ld\n",
+			PTR_ERR(ah));
+		return;
+	}
+
+	send_buf = ib_create_send_mad(agent, wc->src_qp, wc->pkey_index, 0,
+				      IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
+				      GFP_KERNEL);
+	if (IS_ERR(send_buf)) {
+		dev_err(&device->dev, "ib_create_send_mad error\n");
+		goto err1;
+	}
+
+	memcpy(send_buf->mad, mad, sizeof *mad);
+	send_buf->ah = ah;
+
+	/* The agent came from port 0 on a switch; send on the receiving port. */
+	if (device->node_type == RDMA_NODE_IB_SWITCH) {
+		mad_send_wr = container_of(send_buf,
+					   struct ib_mad_send_wr_private,
+					   send_buf);
+		mad_send_wr->send_wr.wr.ud.port_num = port_num;
+	}
+
+	if (ib_post_send_mad(send_buf, NULL)) {
+		dev_err(&device->dev, "ib_post_send_mad error\n");
+		goto err2;
+	}
+	return;
+err2:
+	ib_free_send_mad(send_buf);
+err1:
+	ib_destroy_ah(ah);
+}
+
+/*
+ * Send handler registered for both agents: release the address handle and
+ * the send buffer that agent_send_response() created for the MAD.
+ */
+static void agent_send_handler(struct ib_mad_agent *mad_agent,
+ struct ib_mad_send_wc *mad_send_wc)
+{
+ ib_destroy_ah(mad_send_wc->send_buf->ah);
+ ib_free_send_mad(mad_send_wc->send_buf);
+}
+
+/*
+ * ib_agent_port_open - register the send-only MAD agents of a port
+ * @device: device owning the port
+ * @port_num: port to open
+ *
+ * Always registers a GSI agent, and additionally an SMI agent when the
+ * port's link layer is InfiniBand, then adds the port to
+ * ib_agent_port_list.
+ *
+ * Returns 0 on success, -ENOMEM, or the error from ib_register_mad_agent().
+ */
+int ib_agent_port_open(struct ib_device *device, int port_num)
+{
+ struct ib_agent_port_private *port_priv;
+ unsigned long flags;
+ int ret;
+
+ /* Create new device info */
+ port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL);
+ if (!port_priv) {
+ dev_err(&device->dev, "No memory for ib_agent_port_private\n");
+ ret = -ENOMEM;
+ goto error1;
+ }
+
+ if (rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_INFINIBAND) {
+ /* Obtain send only MAD agent for SMI QP */
+ port_priv->agent[0] = ib_register_mad_agent(device, port_num,
+ IB_QPT_SMI, NULL, 0,
+ &agent_send_handler,
+ NULL, NULL, 0);
+ if (IS_ERR(port_priv->agent[0])) {
+ ret = PTR_ERR(port_priv->agent[0]);
+ goto error2;
+ }
+ }
+
+ /* Obtain send only MAD agent for GSI QP */
+ port_priv->agent[1] = ib_register_mad_agent(device, port_num,
+ IB_QPT_GSI, NULL, 0,
+ &agent_send_handler,
+ NULL, NULL, 0);
+ if (IS_ERR(port_priv->agent[1])) {
+ ret = PTR_ERR(port_priv->agent[1]);
+ goto error3;
+ }
+
+ spin_lock_irqsave(&ib_agent_port_list_lock, flags);
+ list_add_tail(&port_priv->port_list, &ib_agent_port_list);
+ spin_unlock_irqrestore(&ib_agent_port_list_lock, flags);
+
+ return 0;
+
+error3:
+ /* agent[0] is NULL (from kzalloc) when no SMI agent was registered. */
+ if (port_priv->agent[0])
+ ib_unregister_mad_agent(port_priv->agent[0]);
+error2:
+ kfree(port_priv);
+error1:
+ return ret;
+}
+
+/*
+ * ib_agent_port_close - unregister the MAD agents of a port
+ * @device: device owning the port
+ * @port_num: port to close
+ *
+ * Removes the port from ib_agent_port_list under the lock, then unregisters
+ * its agents and frees the entry with the lock dropped.
+ *
+ * Returns 0 on success or -ENODEV if the port is not on the list.
+ */
+int ib_agent_port_close(struct ib_device *device, int port_num)
+{
+ struct ib_agent_port_private *port_priv;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ib_agent_port_list_lock, flags);
+ port_priv = __ib_get_agent_port(device, port_num);
+ if (port_priv == NULL) {
+ spin_unlock_irqrestore(&ib_agent_port_list_lock, flags);
+ dev_err(&device->dev, "Port %d not found\n", port_num);
+ return -ENODEV;
+ }
+ list_del(&port_priv->port_list);
+ spin_unlock_irqrestore(&ib_agent_port_list_lock, flags);
+
+ ib_unregister_mad_agent(port_priv->agent[1]);
+ if (port_priv->agent[0])
+ ib_unregister_mad_agent(port_priv->agent[0]);
+
+ kfree(port_priv);
+ return 0;
+}
diff --git a/drivers/infiniband/core/agent.h b/drivers/infiniband/core/agent.h
new file mode 100644
index 000000000..666928700
--- /dev/null
+++ b/drivers/infiniband/core/agent.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation. All rights reserved.
+ * Copyright (c) 2004 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __AGENT_H_
+#define __AGENT_H_
+
+#include <linux/err.h>
+#include <rdma/ib_mad.h>
+
+/* Register the per-port MAD agents; returns 0 or a negative errno. */
+extern int ib_agent_port_open(struct ib_device *device, int port_num);
+
+/* Unregister the per-port MAD agents; returns 0 or -ENODEV. */
+extern int ib_agent_port_close(struct ib_device *device, int port_num);
+
+/* Send @mad as a response to the MAD described by @wc and @grh. */
+extern void agent_send_response(struct ib_mad *mad, struct ib_grh *grh,
+ struct ib_wc *wc, struct ib_device *device,
+ int port_num, int qpn);
+
+#endif /* __AGENT_H_ */
diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
new file mode 100644
index 000000000..80f6cf244
--- /dev/null
+++ b/drivers/infiniband/core/cache.c
@@ -0,0 +1,439 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+
+#include <rdma/ib_cache.h>
+
+#include "core_priv.h"
+
+/*
+ * P_Key table snapshot for one port: table_len entries stored in the
+ * trailing table[] array, which is allocated together with this header.
+ */
+struct ib_pkey_cache {
+ int table_len;
+ u16 table[0];
+};
+
+/*
+ * GID table snapshot for one port: table_len entries stored in the
+ * trailing table[] array, which is allocated together with this header.
+ */
+struct ib_gid_cache {
+ int table_len;
+ union ib_gid table[0];
+};
+
+/*
+ * Deferred cache-refresh request.  Allocated and queued on ib_wq by
+ * ib_cache_event(); consumed and freed by ib_cache_task().
+ */
+struct ib_update_work {
+ struct work_struct work;
+ struct ib_device *device;
+ u8 port_num;
+};
+
+/* First cached port number: 0 for an IB switch, 1 for every other node type. */
+static inline int start_port(struct ib_device *device)
+{
+	if (device->node_type == RDMA_NODE_IB_SWITCH)
+		return 0;
+
+	return 1;
+}
+
+/* Last cached port number: 0 for an IB switch, else the physical port count. */
+static inline int end_port(struct ib_device *device)
+{
+	if (device->node_type == RDMA_NODE_IB_SWITCH)
+		return 0;
+
+	return device->phys_port_cnt;
+}
+
+/*
+ * Copy entry @index of the cached GID table of @port_num into @gid.
+ *
+ * Returns 0 on success, or -EINVAL if the port number or the index is out
+ * of range.  The cache is read under device->cache.lock.
+ */
+int ib_get_cached_gid(struct ib_device *device,
+		      u8 port_num,
+		      int index,
+		      union ib_gid *gid)
+{
+	const int first_port = start_port(device);
+	struct ib_gid_cache *gids;
+	unsigned long irq_flags;
+	int status = -EINVAL;
+
+	if (port_num < first_port || port_num > end_port(device))
+		return -EINVAL;
+
+	read_lock_irqsave(&device->cache.lock, irq_flags);
+
+	gids = device->cache.gid_cache[port_num - first_port];
+	if (index >= 0 && index < gids->table_len) {
+		*gid = gids->table[index];
+		status = 0;
+	}
+
+	read_unlock_irqrestore(&device->cache.lock, irq_flags);
+
+	return status;
+}
+EXPORT_SYMBOL(ib_get_cached_gid);
+
+/*
+ * Search every port's cached GID table for @gid.
+ *
+ * On a match, *port_num receives the port number and, if @index is
+ * non-NULL, *index receives the table position; returns 0.  When nothing
+ * matches, both outputs are left at -1 (truncated to their unsigned types)
+ * and -ENOENT is returned.
+ */
+int ib_find_cached_gid(struct ib_device *device,
+		       union ib_gid *gid,
+		       u8 *port_num,
+		       u16 *index)
+{
+	const int first_port = start_port(device);
+	const int last_slot = end_port(device) - first_port;
+	struct ib_gid_cache *gids;
+	unsigned long irq_flags;
+	int slot, pos;
+	int status = -ENOENT;
+
+	*port_num = -1;
+	if (index)
+		*index = -1;
+
+	read_lock_irqsave(&device->cache.lock, irq_flags);
+
+	/* The outer loop stops as soon as the inner loop records a match. */
+	for (slot = 0; status && slot <= last_slot; ++slot) {
+		gids = device->cache.gid_cache[slot];
+		for (pos = 0; pos < gids->table_len; ++pos) {
+			if (memcmp(gid, &gids->table[pos], sizeof *gid))
+				continue;
+
+			*port_num = slot + first_port;
+			if (index)
+				*index = pos;
+			status = 0;
+			break;
+		}
+	}
+
+	read_unlock_irqrestore(&device->cache.lock, irq_flags);
+
+	return status;
+}
+EXPORT_SYMBOL(ib_find_cached_gid);
+
+/*
+ * Copy entry @index of the cached P_Key table of @port_num into @pkey.
+ *
+ * Returns 0 on success, or -EINVAL if the port number or the index is out
+ * of range.  The cache is read under device->cache.lock.
+ */
+int ib_get_cached_pkey(struct ib_device *device,
+		       u8 port_num,
+		       int index,
+		       u16 *pkey)
+{
+	const int first_port = start_port(device);
+	struct ib_pkey_cache *pkeys;
+	unsigned long irq_flags;
+	int status = -EINVAL;
+
+	if (port_num < first_port || port_num > end_port(device))
+		return -EINVAL;
+
+	read_lock_irqsave(&device->cache.lock, irq_flags);
+
+	pkeys = device->cache.pkey_cache[port_num - first_port];
+	if (index >= 0 && index < pkeys->table_len) {
+		*pkey = pkeys->table[index];
+		status = 0;
+	}
+
+	read_unlock_irqrestore(&device->cache.lock, irq_flags);
+
+	return status;
+}
+EXPORT_SYMBOL(ib_get_cached_pkey);
+
+/*
+ * Look up @pkey in the cached P_Key table of @port_num, comparing only the
+ * low 15 bits.
+ *
+ * The first matching entry with bit 15 set wins.  If no such entry exists,
+ * the last matching entry with bit 15 clear is used instead.  Returns 0
+ * and stores the position in *index, -ENOENT (with *index set to -1) when
+ * nothing matches, or -EINVAL for an out-of-range port.
+ */
+int ib_find_cached_pkey(struct ib_device *device,
+			u8 port_num,
+			u16 pkey,
+			u16 *index)
+{
+	const int first_port = start_port(device);
+	struct ib_pkey_cache *pkeys;
+	unsigned long irq_flags;
+	int pos;
+	int full_pos = -1;
+	int partial_pos = -1;
+	int status = -ENOENT;
+
+	if (port_num < first_port || port_num > end_port(device))
+		return -EINVAL;
+
+	read_lock_irqsave(&device->cache.lock, irq_flags);
+
+	pkeys = device->cache.pkey_cache[port_num - first_port];
+
+	*index = -1;
+
+	for (pos = 0; pos < pkeys->table_len; ++pos) {
+		u16 entry = pkeys->table[pos];
+
+		if ((entry & 0x7fff) != (pkey & 0x7fff))
+			continue;
+
+		if (entry & 0x8000) {
+			full_pos = pos;
+			break;
+		}
+		partial_pos = pos;
+	}
+
+	if (full_pos >= 0) {
+		*index = full_pos;
+		status = 0;
+	} else if (partial_pos >= 0) {
+		*index = partial_pos;
+		status = 0;
+	}
+
+	read_unlock_irqrestore(&device->cache.lock, irq_flags);
+
+	return status;
+}
+EXPORT_SYMBOL(ib_find_cached_pkey);
+
+/*
+ * Look up @pkey in the cached P_Key table of @port_num, requiring all 16
+ * bits to match.
+ *
+ * Returns 0 and stores the position of the first match in *index,
+ * -ENOENT (with *index set to -1) when nothing matches, or -EINVAL for an
+ * out-of-range port.
+ */
+int ib_find_exact_cached_pkey(struct ib_device *device,
+			      u8 port_num,
+			      u16 pkey,
+			      u16 *index)
+{
+	const int first_port = start_port(device);
+	struct ib_pkey_cache *pkeys;
+	unsigned long irq_flags;
+	int pos;
+	int status = -ENOENT;
+
+	if (port_num < first_port || port_num > end_port(device))
+		return -EINVAL;
+
+	read_lock_irqsave(&device->cache.lock, irq_flags);
+
+	pkeys = device->cache.pkey_cache[port_num - first_port];
+
+	*index = -1;
+
+	for (pos = 0; pos < pkeys->table_len; ++pos) {
+		if (pkeys->table[pos] != pkey)
+			continue;
+
+		*index = pos;
+		status = 0;
+		break;
+	}
+
+	read_unlock_irqrestore(&device->cache.lock, irq_flags);
+
+	return status;
+}
+EXPORT_SYMBOL(ib_find_exact_cached_pkey);
+
+/*
+ * Read the cached LMC value of @port_num into *lmc.
+ *
+ * Returns 0 on success or -EINVAL for an out-of-range port.
+ */
+int ib_get_cached_lmc(struct ib_device *device,
+		      u8 port_num,
+		      u8 *lmc)
+{
+	const int first_port = start_port(device);
+	unsigned long irq_flags;
+
+	if (port_num < first_port || port_num > end_port(device))
+		return -EINVAL;
+
+	read_lock_irqsave(&device->cache.lock, irq_flags);
+	*lmc = device->cache.lmc_cache[port_num - first_port];
+	read_unlock_irqrestore(&device->cache.lock, irq_flags);
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_get_cached_lmc);
+
+/*
+ * Re-read the P_Key table, the GID table and the LMC of @port from the
+ * device and swap them into device->cache under the write lock.
+ *
+ * New tables are built completely before the lock is taken.  On any
+ * allocation or query failure the function returns without touching the
+ * currently cached data.
+ */
+static void ib_cache_update(struct ib_device *device,
+			    u8 port)
+{
+	struct ib_port_attr *attr;
+	struct ib_pkey_cache *new_pkeys = NULL, *stale_pkeys;
+	struct ib_gid_cache *new_gids = NULL, *stale_gids;
+	int slot;
+	int n;
+	int rc;
+
+	attr = kmalloc(sizeof *attr, GFP_KERNEL);
+	if (!attr)
+		return;
+
+	rc = ib_query_port(device, port, attr);
+	if (rc) {
+		printk(KERN_WARNING "ib_query_port failed (%d) for %s\n",
+		       rc, device->name);
+		goto err;
+	}
+
+	new_pkeys = kmalloc(sizeof *new_pkeys + attr->pkey_tbl_len *
+			    sizeof *new_pkeys->table, GFP_KERNEL);
+	if (!new_pkeys)
+		goto err;
+
+	new_pkeys->table_len = attr->pkey_tbl_len;
+
+	new_gids = kmalloc(sizeof *new_gids + attr->gid_tbl_len *
+			   sizeof *new_gids->table, GFP_KERNEL);
+	if (!new_gids)
+		goto err;
+
+	new_gids->table_len = attr->gid_tbl_len;
+
+	for (n = 0; n < new_pkeys->table_len; ++n) {
+		rc = ib_query_pkey(device, port, n, &new_pkeys->table[n]);
+		if (rc) {
+			printk(KERN_WARNING "ib_query_pkey failed (%d) for %s (index %d)\n",
+			       rc, device->name, n);
+			goto err;
+		}
+	}
+
+	for (n = 0; n < new_gids->table_len; ++n) {
+		rc = ib_query_gid(device, port, n, &new_gids->table[n]);
+		if (rc) {
+			printk(KERN_WARNING "ib_query_gid failed (%d) for %s (index %d)\n",
+			       rc, device->name, n);
+			goto err;
+		}
+	}
+
+	slot = port - start_port(device);
+
+	write_lock_irq(&device->cache.lock);
+
+	stale_pkeys = device->cache.pkey_cache[slot];
+	stale_gids = device->cache.gid_cache[slot];
+
+	device->cache.pkey_cache[slot] = new_pkeys;
+	device->cache.gid_cache[slot] = new_gids;
+	device->cache.lmc_cache[slot] = attr->lmc;
+
+	write_unlock_irq(&device->cache.lock);
+
+	/* The previous tables are released only after the lock is dropped. */
+	kfree(stale_pkeys);
+	kfree(stale_gids);
+	kfree(attr);
+	return;
+
+err:
+	kfree(new_pkeys);
+	kfree(new_gids);
+	kfree(attr);
+}
+
+/*
+ * Work handler: refresh the cache of the port recorded in the request and
+ * then free the request.
+ */
+static void ib_cache_task(struct work_struct *_work)
+{
+	struct ib_update_work *req;
+
+	req = container_of(_work, struct ib_update_work, work);
+	ib_cache_update(req->device, req->port_num);
+	kfree(req);
+}
+
+/*
+ * Event callback: for the event types listed below, queue a cache refresh
+ * of the affected port on ib_wq.  The request is allocated with GFP_ATOMIC
+ * and the event is silently dropped if that allocation fails.
+ */
+static void ib_cache_event(struct ib_event_handler *handler,
+			   struct ib_event *event)
+{
+	struct ib_update_work *req;
+
+	switch (event->event) {
+	case IB_EVENT_PORT_ERR:
+	case IB_EVENT_PORT_ACTIVE:
+	case IB_EVENT_LID_CHANGE:
+	case IB_EVENT_PKEY_CHANGE:
+	case IB_EVENT_SM_CHANGE:
+	case IB_EVENT_CLIENT_REREGISTER:
+	case IB_EVENT_GID_CHANGE:
+		break;
+	default:
+		return;
+	}
+
+	req = kmalloc(sizeof *req, GFP_ATOMIC);
+	if (!req)
+		return;
+
+	INIT_WORK(&req->work, ib_cache_task);
+	req->device = event->device;
+	req->port_num = event->element.port_num;
+	queue_work(ib_wq, &req->work);
+}
+
+/*
+ * Client "add" callback: allocate the per-port cache arrays, populate them
+ * and register for port events.
+ *
+ * This callback returns void, so a failure cannot be reported to the
+ * caller.  Every cache pointer is therefore reset to NULL on failure, which
+ * lets ib_cache_cleanup_one() recognise that there is nothing to tear down
+ * instead of dereferencing and freeing already-freed memory.
+ */
+static void ib_cache_setup_one(struct ib_device *device)
+{
+	int num_ports = end_port(device) - start_port(device) + 1;
+	int p;
+
+	rwlock_init(&device->cache.lock);
+
+	device->cache.pkey_cache =
+		kmalloc(sizeof *device->cache.pkey_cache * num_ports,
+			GFP_KERNEL);
+	device->cache.gid_cache =
+		kmalloc(sizeof *device->cache.gid_cache * num_ports,
+			GFP_KERNEL);
+	device->cache.lmc_cache =
+		kmalloc(sizeof *device->cache.lmc_cache * num_ports,
+			GFP_KERNEL);
+
+	if (!device->cache.pkey_cache || !device->cache.gid_cache ||
+	    !device->cache.lmc_cache) {
+		printk(KERN_WARNING "Couldn't allocate cache "
+		       "for %s\n", device->name);
+		goto err;
+	}
+
+	for (p = 0; p < num_ports; ++p) {
+		device->cache.pkey_cache[p] = NULL;
+		device->cache.gid_cache [p] = NULL;
+		ib_cache_update(device, p + start_port(device));
+	}
+
+	INIT_IB_EVENT_HANDLER(&device->cache.event_handler,
+			      device, ib_cache_event);
+	if (ib_register_event_handler(&device->cache.event_handler))
+		goto err_cache;
+
+	return;
+
+err_cache:
+	for (p = 0; p < num_ports; ++p) {
+		kfree(device->cache.pkey_cache[p]);
+		kfree(device->cache.gid_cache[p]);
+	}
+
+err:
+	kfree(device->cache.pkey_cache);
+	kfree(device->cache.gid_cache);
+	kfree(device->cache.lmc_cache);
+
+	/* Leave no dangling pointers behind for ib_cache_cleanup_one(). */
+	device->cache.pkey_cache = NULL;
+	device->cache.gid_cache = NULL;
+	device->cache.lmc_cache = NULL;
+}
+
+/*
+ * Client "remove" callback: stop receiving events, wait for queued refresh
+ * work to finish, then release every per-port table and the arrays holding
+ * them.  Does nothing if ib_cache_setup_one() failed for this device.
+ */
+static void ib_cache_cleanup_one(struct ib_device *device)
+{
+	int p;
+
+	/*
+	 * Setup failed: the arrays were already freed and the event handler
+	 * was never registered (and possibly never initialised).
+	 */
+	if (!device->cache.pkey_cache)
+		return;
+
+	ib_unregister_event_handler(&device->cache.event_handler);
+	flush_workqueue(ib_wq);
+
+	for (p = 0; p <= end_port(device) - start_port(device); ++p) {
+		kfree(device->cache.pkey_cache[p]);
+		kfree(device->cache.gid_cache[p]);
+	}
+
+	kfree(device->cache.pkey_cache);
+	kfree(device->cache.gid_cache);
+	kfree(device->cache.lmc_cache);
+}
+
+/* Client descriptor handed to ib_register_client() by ib_cache_setup(). */
+static struct ib_client cache_client = {
+ .name = "cache",
+ .add = ib_cache_setup_one,
+ .remove = ib_cache_cleanup_one
+};
+
+/* Register the cache client; returns the result of ib_register_client(). */
+int __init ib_cache_setup(void)
+{
+ return ib_register_client(&cache_client);
+}
+
+/* Unregister the cache client registered by ib_cache_setup(). */
+void __exit ib_cache_cleanup(void)
+{
+ ib_unregister_client(&cache_client);
+}
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
new file mode 100644
index 000000000..0271608a5
--- /dev/null
+++ b/drivers/infiniband/core/cm.c
@@ -0,0 +1,3931 @@
+/*
+ * Copyright (c) 2004-2007 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004, 2005 Voltaire Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/completion.h>
+#include <linux/dma-mapping.h>
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/idr.h>
+#include <linux/interrupt.h>
+#include <linux/random.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/sysfs.h>
+#include <linux/workqueue.h>
+#include <linux/kdev_t.h>
+#include <linux/etherdevice.h>
+
+#include <rdma/ib_cache.h>
+#include <rdma/ib_cm.h>
+#include "cm_msgs.h"
+
+MODULE_AUTHOR("Sean Hefty");
+MODULE_DESCRIPTION("InfiniBand CM");
+MODULE_LICENSE("Dual BSD/GPL");
+
+static void cm_add_one(struct ib_device *device);
+static void cm_remove_one(struct ib_device *device);
+
+/* Client descriptor for the CM, wiring in the add/remove device callbacks. */
+static struct ib_client cm_client = {
+ .name = "cm",
+ .add = cm_add_one,
+ .remove = cm_remove_one
+};
+
+/*
+ * Module-wide CM state; "cm" is the single instance.
+ *
+ * As used in this part of the file: "lock" is held around accesses to
+ * local_id_table and around timewait list/tree manipulation, and
+ * "device_lock" is read-held while walking device_list.
+ */
+static struct ib_cm {
+ spinlock_t lock;
+ struct list_head device_list;
+ rwlock_t device_lock;
+ struct rb_root listen_service_table;
+ u64 listen_service_id;
+ /* struct rb_root peer_service_table; todo: fix peer to peer */
+ struct rb_root remote_qp_table;
+ struct rb_root remote_id_table;
+ struct rb_root remote_sidr_table;
+ struct idr local_id_table;
+ /* XORed with the idr index to form/recover a cm_id's local_id. */
+ __be32 random_id_operand;
+ struct list_head timewait_list;
+ struct workqueue_struct *wq;
+} cm;
+
+/* Counter indexes ordered by attribute ID */
+enum {
+ CM_REQ_COUNTER,
+ CM_MRA_COUNTER,
+ CM_REJ_COUNTER,
+ CM_REP_COUNTER,
+ CM_RTU_COUNTER,
+ CM_DREQ_COUNTER,
+ CM_DREP_COUNTER,
+ CM_SIDR_REQ_COUNTER,
+ CM_SIDR_REP_COUNTER,
+ CM_LAP_COUNTER,
+ CM_APR_COUNTER,
+ /* Number of counters above; sizes cm_counter_group.counter[]. */
+ CM_ATTR_COUNT,
+ /* NOTE(review): presumably the attribute ID of the first counter - confirm against its users. */
+ CM_ATTR_ID_OFFSET = 0x0010,
+};
+
+/*
+ * Counter group indexes.  CM_COUNTER_GROUPS is the number of groups and
+ * sizes both counter_group_names[] and cm_port.counter_group[].
+ */
+enum {
+ CM_XMIT,
+ CM_XMIT_RETRIES,
+ CM_RECV,
+ CM_RECV_DUPLICATES,
+ CM_COUNTER_GROUPS
+};
+
+/*
+ * Name of each counter group, in the order of the CM_XMIT..CM_RECV_DUPLICATES
+ * enum.  The inner dimension is sized to hold the longest name.
+ */
+static char const counter_group_names[CM_COUNTER_GROUPS]
+ [sizeof("cm_rx_duplicates")] = {
+ "cm_tx_msgs", "cm_tx_retries",
+ "cm_rx_msgs", "cm_rx_duplicates"
+};
+
+/* One group of per-message-type counters, with a kobject embedded in it. */
+struct cm_counter_group {
+ struct kobject obj;
+ atomic_long_t counter[CM_ATTR_COUNT];
+};
+
+/* An attribute paired with the counter index it refers to. */
+struct cm_counter_attribute {
+ struct attribute attr;
+ int index;
+};
+
+/*
+ * Define cm_<_name>_counter_attr: an attribute named "<_name>" with mode
+ * 0444, bound to counter index _index.
+ */
+#define CM_COUNTER_ATTR(_name, _index) \
+struct cm_counter_attribute cm_##_name##_counter_attr = { \
+ .attr = { .name = __stringify(_name), .mode = 0444 }, \
+ .index = _index \
+}
+
+/* One attribute per counter index, in the same order as the counter enum. */
+static CM_COUNTER_ATTR(req, CM_REQ_COUNTER);
+static CM_COUNTER_ATTR(mra, CM_MRA_COUNTER);
+static CM_COUNTER_ATTR(rej, CM_REJ_COUNTER);
+static CM_COUNTER_ATTR(rep, CM_REP_COUNTER);
+static CM_COUNTER_ATTR(rtu, CM_RTU_COUNTER);
+static CM_COUNTER_ATTR(dreq, CM_DREQ_COUNTER);
+static CM_COUNTER_ATTR(drep, CM_DREP_COUNTER);
+static CM_COUNTER_ATTR(sidr_req, CM_SIDR_REQ_COUNTER);
+static CM_COUNTER_ATTR(sidr_rep, CM_SIDR_REP_COUNTER);
+static CM_COUNTER_ATTR(lap, CM_LAP_COUNTER);
+static CM_COUNTER_ATTR(apr, CM_APR_COUNTER);
+
+/* NULL-terminated list of all counter attributes defined above. */
+static struct attribute *cm_counter_default_attrs[] = {
+ &cm_req_counter_attr.attr,
+ &cm_mra_counter_attr.attr,
+ &cm_rej_counter_attr.attr,
+ &cm_rep_counter_attr.attr,
+ &cm_rtu_counter_attr.attr,
+ &cm_dreq_counter_attr.attr,
+ &cm_drep_counter_attr.attr,
+ &cm_sidr_req_counter_attr.attr,
+ &cm_sidr_rep_counter_attr.attr,
+ &cm_lap_counter_attr.attr,
+ &cm_apr_counter_attr.attr,
+ NULL
+};
+
+/*
+ * Per-port CM state: back-pointer to the owning cm_device, the MAD agent
+ * used to build and send CM MADs on this port, and one counter group per
+ * CM_COUNTER_GROUPS index.
+ */
+struct cm_port {
+ struct cm_device *cm_dev;
+ struct ib_mad_agent *mad_agent;
+ struct kobject port_obj;
+ u8 port_num;
+ struct cm_counter_group counter_group[CM_COUNTER_GROUPS];
+};
+
+/*
+ * Per-device CM state, linked into cm.device_list through "list".
+ * port[] is a trailing variable-length array of per-port pointers;
+ * cm_init_av_by_path() indexes it with (port number - 1).
+ */
+struct cm_device {
+ struct list_head list;
+ struct ib_device *ib_device;
+ struct device *device;
+ u8 ack_delay;
+ struct cm_port *port[0];
+};
+
+/*
+ * Addressing information for reaching a remote CM: the local port to send
+ * from, the address-handle attributes, and the P_Key index to use.
+ * cm_init_av_by_path() sets "valid" to 1 once the fields are filled in.
+ */
+struct cm_av {
+ struct cm_port *port;
+ union ib_gid dgid;
+ struct ib_ah_attr ah_attr;
+ u16 pkey_index;
+ u8 timeout;
+ u8 valid;
+ u8 smac[ETH_ALEN];
+};
+
+/*
+ * A unit of CM work.  "list" links it into a cm_id's work_list (see
+ * cm_dequeue_work()); mad_recv_wc, when set, is released by cm_free_work().
+ * path[] is a trailing variable-length array.
+ */
+struct cm_work {
+ struct delayed_work work;
+ struct list_head list;
+ struct cm_port *port;
+ struct ib_mad_recv_wc *mad_recv_wc; /* Received MADs */
+ __be32 local_id; /* Established / timewait */
+ __be32 remote_id;
+ struct ib_cm_event cm_event;
+ struct ib_sa_path_rec path[0];
+};
+
+/*
+ * Timewait tracking for a connection.  The two rb_nodes place the entry in
+ * cm.remote_qp_table and cm.remote_id_table; the inserted_* flags record
+ * whether each node is currently linked so cm_cleanup_timewait() knows
+ * what to erase.  "list" links the entry into cm.timewait_list.
+ */
+struct cm_timewait_info {
+ struct cm_work work; /* Must be first. */
+ struct list_head list;
+ struct rb_node remote_qp_node;
+ struct rb_node remote_id_node;
+ __be64 remote_ca_guid;
+ __be32 remote_qpn;
+ u8 inserted_remote_qp;
+ u8 inserted_remote_id;
+};
+
+/*
+ * Private state behind a public struct ib_cm_id (embedded as "id").
+ *
+ * "refcount" is raised by cm_get_id()/cm_alloc_msg() and dropped by
+ * cm_deref_id(), which completes "comp" when the count reaches zero.
+ * service_node and sidr_id_node link the id into cm.listen_service_table
+ * and cm.remote_sidr_table respectively.
+ */
+struct cm_id_private {
+ struct ib_cm_id id;
+
+ struct rb_node service_node;
+ struct rb_node sidr_id_node;
+ spinlock_t lock; /* Do not acquire inside cm.lock */
+ struct completion comp;
+ atomic_t refcount;
+
+ struct ib_mad_send_buf *msg;
+ struct cm_timewait_info *timewait_info;
+ /* todo: use alternate port on send failure */
+ struct cm_av av;
+ struct cm_av alt_av;
+ struct ib_cm_compare_data *compare_data;
+
+ void *private_data;
+ __be64 tid;
+ __be32 local_qpn;
+ __be32 remote_qpn;
+ enum ib_qp_type qp_type;
+ __be32 sq_psn;
+ __be32 rq_psn;
+ int timeout_ms;
+ enum ib_mtu path_mtu;
+ __be16 pkey;
+ u8 private_data_len;
+ u8 max_cm_retries;
+ u8 peer_to_peer;
+ u8 responder_resources;
+ u8 initiator_depth;
+ u8 retry_count;
+ u8 rnr_retry_count;
+ u8 service_timeout;
+ u8 target_ack_delay;
+
+ struct list_head work_list;
+ atomic_t work_count;
+};
+
+static void cm_work_handler(struct work_struct *work);
+
+/* Drop one reference; signal cm_id_priv->comp when the last one is gone. */
+static inline void cm_deref_id(struct cm_id_private *cm_id_priv)
+{
+	if (!atomic_dec_and_test(&cm_id_priv->refcount))
+		return;
+
+	complete(&cm_id_priv->comp);
+}
+
+/*
+ * Allocate a send MAD addressed through cm_id_priv->av.
+ *
+ * On success *msg owns a freshly created address handle and holds a
+ * reference on @cm_id_priv (stored in context[0]); returns 0.  On failure
+ * returns the error from address-handle or MAD creation and leaves *msg
+ * untouched.
+ */
+static int cm_alloc_msg(struct cm_id_private *cm_id_priv,
+			struct ib_mad_send_buf **msg)
+{
+	struct ib_mad_agent *agent = cm_id_priv->av.port->mad_agent;
+	struct ib_mad_send_buf *send_buf;
+	struct ib_ah *addr_handle;
+
+	addr_handle = ib_create_ah(agent->qp->pd, &cm_id_priv->av.ah_attr);
+	if (IS_ERR(addr_handle))
+		return PTR_ERR(addr_handle);
+
+	send_buf = ib_create_send_mad(agent, cm_id_priv->id.remote_cm_qpn,
+				      cm_id_priv->av.pkey_index,
+				      0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
+				      GFP_ATOMIC);
+	if (IS_ERR(send_buf)) {
+		ib_destroy_ah(addr_handle);
+		return PTR_ERR(send_buf);
+	}
+
+	/* Timeout set by caller if response is expected. */
+	send_buf->ah = addr_handle;
+	send_buf->retries = cm_id_priv->max_cm_retries;
+
+	/* The message pins the cm_id until cm_free_msg() drops it. */
+	atomic_inc(&cm_id_priv->refcount);
+	send_buf->context[0] = cm_id_priv;
+
+	*msg = send_buf;
+	return 0;
+}
+
+/*
+ * Allocate a send MAD that replies to a received MAD, with the address
+ * handle derived from the receive work completion.
+ *
+ * Returns 0 and stores the buffer in *msg, or the error from
+ * address-handle or MAD creation.  No cm_id reference is taken here.
+ */
+static int cm_alloc_response_msg(struct cm_port *port,
+				 struct ib_mad_recv_wc *mad_recv_wc,
+				 struct ib_mad_send_buf **msg)
+{
+	struct ib_mad_send_buf *reply;
+	struct ib_ah *addr_handle;
+
+	addr_handle = ib_create_ah_from_wc(port->mad_agent->qp->pd,
+					   mad_recv_wc->wc,
+					   mad_recv_wc->recv_buf.grh,
+					   port->port_num);
+	if (IS_ERR(addr_handle))
+		return PTR_ERR(addr_handle);
+
+	reply = ib_create_send_mad(port->mad_agent, 1,
+				   mad_recv_wc->wc->pkey_index,
+				   0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
+				   GFP_ATOMIC);
+	if (IS_ERR(reply)) {
+		ib_destroy_ah(addr_handle);
+		return PTR_ERR(reply);
+	}
+
+	reply->ah = addr_handle;
+	*msg = reply;
+	return 0;
+}
+
+/*
+ * Release a send MAD: destroy its address handle, drop the cm_id reference
+ * stored in context[0] (if any), then free the buffer itself.
+ */
+static void cm_free_msg(struct ib_mad_send_buf *msg)
+{
+	struct cm_id_private *owner;
+
+	ib_destroy_ah(msg->ah);
+
+	owner = msg->context[0];
+	if (owner)
+		cm_deref_id(owner);
+
+	ib_free_send_mad(msg);
+}
+
+/*
+ * Duplicate a caller-supplied private data buffer.
+ *
+ * Returns NULL when there is nothing to copy (NULL pointer or zero
+ * length), ERR_PTR(-ENOMEM) when the allocation fails, otherwise the
+ * kmemdup()'d copy.
+ */
+static void * cm_copy_private_data(const void *private_data,
+				   u8 private_data_len)
+{
+	void *copy;
+
+	if (!private_data || !private_data_len)
+		return NULL;
+
+	copy = kmemdup(private_data, private_data_len, GFP_KERNEL);
+
+	return copy ? copy : ERR_PTR(-ENOMEM);
+}
+
+/*
+ * Replace the private data attached to @cm_id_priv.  The previous buffer
+ * is freed only when both its pointer and its recorded length are
+ * non-zero; the new pointer/length are then stored as given.
+ */
+static void cm_set_private_data(struct cm_id_private *cm_id_priv,
+				 void *private_data, u8 private_data_len)
+{
+	void *previous = cm_id_priv->private_data;
+
+	if (previous && cm_id_priv->private_data_len)
+		kfree(previous);
+
+	cm_id_priv->private_data_len = private_data_len;
+	cm_id_priv->private_data = private_data;
+}
+
+/*
+ * Fill @av so that a reply goes back the way the request described by
+ * @wc/@grh arrived: same port, same P_Key index, address-handle attributes
+ * derived from the work completion.
+ */
+static void cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc,
+				    struct ib_grh *grh, struct cm_av *av)
+{
+	struct ib_device *ibdev = port->cm_dev->ib_device;
+
+	av->port = port;
+	av->pkey_index = wc->pkey_index;
+	ib_init_ah_from_wc(ibdev, port->port_num, wc, grh, &av->ah_attr);
+}
+
+/*
+ * Initialise @av from a path record.
+ *
+ * The local port is the first one, across all CM devices, whose cached GID
+ * table contains path->sgid.  Returns -EINVAL if no port matches, the
+ * error from the P_Key lookup if the path's P_Key is not cached on that
+ * port, or 0 after filling in the port, address-handle attributes,
+ * timeout, source MAC and marking the av valid.
+ */
+static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av)
+{
+	struct cm_device *cm_dev;
+	struct cm_port *match = NULL;
+	unsigned long irq_flags;
+	u8 port_nr;
+	int rc;
+
+	read_lock_irqsave(&cm.device_lock, irq_flags);
+	list_for_each_entry(cm_dev, &cm.device_list, list) {
+		if (ib_find_cached_gid(cm_dev->ib_device, &path->sgid,
+				       &port_nr, NULL))
+			continue;
+
+		match = cm_dev->port[port_nr - 1];
+		break;
+	}
+	read_unlock_irqrestore(&cm.device_lock, irq_flags);
+
+	if (!match)
+		return -EINVAL;
+
+	/* cm_dev still refers to the device on which the match was found. */
+	rc = ib_find_cached_pkey(cm_dev->ib_device, match->port_num,
+				 be16_to_cpu(path->pkey), &av->pkey_index);
+	if (rc)
+		return rc;
+
+	av->port = match;
+	ib_init_ah_from_path(cm_dev->ib_device, match->port_num, path,
+			     &av->ah_attr);
+	av->timeout = path->packet_life_time + 1;
+	memcpy(av->smac, path->smac, sizeof(av->smac));
+
+	av->valid = 1;
+	return 0;
+}
+
+/*
+ * Reserve a slot for @cm_id_priv in cm.local_id_table and derive its
+ * local_id by XORing the slot number with cm.random_id_operand.
+ *
+ * Returns 0 on success or the negative value returned by
+ * idr_alloc_cyclic().
+ */
+static int cm_alloc_id(struct cm_id_private *cm_id_priv)
+{
+	unsigned long irq_flags;
+	int slot;
+
+	/* Preload outside the spinlock so the allocation itself can be GFP_NOWAIT. */
+	idr_preload(GFP_KERNEL);
+	spin_lock_irqsave(&cm.lock, irq_flags);
+
+	slot = idr_alloc_cyclic(&cm.local_id_table, cm_id_priv, 0, 0,
+				GFP_NOWAIT);
+
+	spin_unlock_irqrestore(&cm.lock, irq_flags);
+	idr_preload_end();
+
+	cm_id_priv->id.local_id = (__force __be32)slot ^ cm.random_id_operand;
+
+	if (slot < 0)
+		return slot;
+
+	return 0;
+}
+
+/* Remove the cm.local_id_table entry that corresponds to @local_id. */
+static void cm_free_id(__be32 local_id)
+{
+	int slot = (__force int) (local_id ^ cm.random_id_operand);
+
+	spin_lock_irq(&cm.lock);
+	idr_remove(&cm.local_id_table, slot);
+	spin_unlock_irq(&cm.lock);
+}
+
+/*
+ * Look up a cm_id by @local_id and take a reference on it, but only if its
+ * remote_id equals @remote_id.  Returns NULL when there is no such entry
+ * or the remote id differs.  Does no locking itself; cm_acquire_id() wraps
+ * it in cm.lock.
+ */
+static struct cm_id_private * cm_get_id(__be32 local_id, __be32 remote_id)
+{
+	struct cm_id_private *found;
+
+	found = idr_find(&cm.local_id_table,
+			 (__force int) (local_id ^ cm.random_id_operand));
+	if (!found || found->id.remote_id != remote_id)
+		return NULL;
+
+	atomic_inc(&found->refcount);
+	return found;
+}
+
+/* Same lookup as cm_get_id(), performed with cm.lock held. */
+static struct cm_id_private * cm_acquire_id(__be32 local_id, __be32 remote_id)
+{
+	struct cm_id_private *found;
+
+	spin_lock_irq(&cm.lock);
+	found = cm_get_id(local_id, remote_id);
+	spin_unlock_irq(&cm.lock);
+
+	return found;
+}
+
+/* dst[i] = src[i] & mask[i] for each of the IB_CM_COMPARE_SIZE words. */
+static void cm_mask_copy(u32 *dst, const u32 *src, const u32 *mask)
+{
+	int word = 0;
+
+	while (word < IB_CM_COMPARE_SIZE) {
+		dst[word] = src[word] & mask[word];
+		word++;
+	}
+}
+
+/*
+ * Compare two compare-data blocks, each masked with the *other* block's
+ * mask.  Returns the memcmp() result of the masked words, or 0 when either
+ * argument is NULL.
+ */
+static int cm_compare_data(struct ib_cm_compare_data *src_data,
+			   struct ib_cm_compare_data *dst_data)
+{
+	u32 masked_src[IB_CM_COMPARE_SIZE];
+	u32 masked_dst[IB_CM_COMPARE_SIZE];
+
+	if (src_data && dst_data) {
+		cm_mask_copy(masked_src, src_data->data, dst_data->mask);
+		cm_mask_copy(masked_dst, dst_data->data, src_data->mask);
+		return memcmp(masked_src, masked_dst, sizeof(masked_src));
+	}
+
+	return 0;
+}
+
+/*
+ * Compare received private data, masked with dst_data->mask, against
+ * dst_data->data.  Returns the memcmp() result, or 0 when @dst_data is
+ * NULL.
+ */
+static int cm_compare_private_data(u32 *private_data,
+				   struct ib_cm_compare_data *dst_data)
+{
+	u32 masked[IB_CM_COMPARE_SIZE];
+
+	if (dst_data) {
+		cm_mask_copy(masked, private_data, dst_data->mask);
+		return memcmp(masked, dst_data->data, sizeof(masked));
+	}
+
+	return 0;
+}
+
+/*
+ * Trivial helpers to strip endian annotation and compare; the
+ * endianness doesn't actually matter since we just need a stable
+ * order for the RB tree.
+ */
+/* Non-zero when a sorts before b (raw 32-bit comparison). */
+static int be32_lt(__be32 a, __be32 b)
+{
+	u32 lhs = (__force u32) a;
+	u32 rhs = (__force u32) b;
+
+	return lhs < rhs;
+}
+
+/* Non-zero when a sorts after b (raw 32-bit comparison). */
+static int be32_gt(__be32 a, __be32 b)
+{
+	u32 lhs = (__force u32) a;
+	u32 rhs = (__force u32) b;
+
+	return lhs > rhs;
+}
+
+/* Non-zero when a sorts before b (raw 64-bit comparison). */
+static int be64_lt(__be64 a, __be64 b)
+{
+	u64 lhs = (__force u64) a;
+	u64 rhs = (__force u64) b;
+
+	return lhs < rhs;
+}
+
+/* Non-zero when a sorts after b (raw 64-bit comparison). */
+static int be64_gt(__be64 a, __be64 b)
+{
+	u64 lhs = (__force u64) a;
+	u64 rhs = (__force u64) b;
+
+	return lhs > rhs;
+}
+
+/*
+ * Insert @cm_id_priv into cm.listen_service_table.
+ *
+ * Returns NULL after a successful insertion.  If a node met on the way
+ * down conflicts with the new entry - same device, service IDs equal once
+ * each is masked with the other's service_mask, and compare data equal
+ * per cm_compare_data() - that existing node is returned and nothing is
+ * inserted.
+ *
+ * Tree order: device pointer, then service_id, then the compare-data
+ * result.
+ */
+static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv)
+{
+ struct rb_node **link = &cm.listen_service_table.rb_node;
+ struct rb_node *parent = NULL;
+ struct cm_id_private *cur_cm_id_priv;
+ __be64 service_id = cm_id_priv->id.service_id;
+ __be64 service_mask = cm_id_priv->id.service_mask;
+ int data_cmp;
+
+ while (*link) {
+ parent = *link;
+ cur_cm_id_priv = rb_entry(parent, struct cm_id_private,
+ service_node);
+ data_cmp = cm_compare_data(cm_id_priv->compare_data,
+ cur_cm_id_priv->compare_data);
+ /* Conflict check uses the masked IDs, not the raw ordering keys. */
+ if ((cur_cm_id_priv->id.service_mask & service_id) ==
+ (service_mask & cur_cm_id_priv->id.service_id) &&
+ (cm_id_priv->id.device == cur_cm_id_priv->id.device) &&
+ !data_cmp)
+ return cur_cm_id_priv;
+
+ if (cm_id_priv->id.device < cur_cm_id_priv->id.device)
+ link = &(*link)->rb_left;
+ else if (cm_id_priv->id.device > cur_cm_id_priv->id.device)
+ link = &(*link)->rb_right;
+ else if (be64_lt(service_id, cur_cm_id_priv->id.service_id))
+ link = &(*link)->rb_left;
+ else if (be64_gt(service_id, cur_cm_id_priv->id.service_id))
+ link = &(*link)->rb_right;
+ else if (data_cmp < 0)
+ link = &(*link)->rb_left;
+ else
+ link = &(*link)->rb_right;
+ }
+ rb_link_node(&cm_id_priv->service_node, parent, link);
+ rb_insert_color(&cm_id_priv->service_node, &cm.listen_service_table);
+ return NULL;
+}
+
+/*
+ * Find a listener for (@device, @service_id, @private_data) in
+ * cm.listen_service_table.
+ *
+ * A node matches when its device equals @device, @service_id masked with
+ * the node's service_mask equals the node's service_id, and the private
+ * data matches the node's compare data per cm_compare_private_data().
+ * Returns the first matching node met while descending, or NULL.  The
+ * descent uses the same key order as cm_insert_listen().
+ */
+static struct cm_id_private * cm_find_listen(struct ib_device *device,
+ __be64 service_id,
+ u32 *private_data)
+{
+ struct rb_node *node = cm.listen_service_table.rb_node;
+ struct cm_id_private *cm_id_priv;
+ int data_cmp;
+
+ while (node) {
+ cm_id_priv = rb_entry(node, struct cm_id_private, service_node);
+ data_cmp = cm_compare_private_data(private_data,
+ cm_id_priv->compare_data);
+ if ((cm_id_priv->id.service_mask & service_id) ==
+ cm_id_priv->id.service_id &&
+ (cm_id_priv->id.device == device) && !data_cmp)
+ return cm_id_priv;
+
+ if (device < cm_id_priv->id.device)
+ node = node->rb_left;
+ else if (device > cm_id_priv->id.device)
+ node = node->rb_right;
+ else if (be64_lt(service_id, cm_id_priv->id.service_id))
+ node = node->rb_left;
+ else if (be64_gt(service_id, cm_id_priv->id.service_id))
+ node = node->rb_right;
+ else if (data_cmp < 0)
+ node = node->rb_left;
+ else
+ node = node->rb_right;
+ }
+ return NULL;
+}
+
+/*
+ * Insert @timewait_info into cm.remote_id_table, keyed by
+ * (remote_id, remote_ca_guid).
+ *
+ * Returns NULL after inserting (and sets inserted_remote_id), or the
+ * existing entry with the same key, in which case nothing is inserted.
+ */
+static struct cm_timewait_info * cm_insert_remote_id(struct cm_timewait_info
+						     *timewait_info)
+{
+	struct rb_node **slot = &cm.remote_id_table.rb_node;
+	struct rb_node *above = NULL;
+	struct cm_timewait_info *cur;
+	__be64 guid = timewait_info->remote_ca_guid;
+	__be32 id = timewait_info->work.remote_id;
+
+	while (*slot) {
+		above = *slot;
+		cur = rb_entry(above, struct cm_timewait_info,
+			       remote_id_node);
+
+		if (be32_lt(id, cur->work.remote_id))
+			slot = &above->rb_left;
+		else if (be32_gt(id, cur->work.remote_id))
+			slot = &above->rb_right;
+		else if (be64_lt(guid, cur->remote_ca_guid))
+			slot = &above->rb_left;
+		else if (be64_gt(guid, cur->remote_ca_guid))
+			slot = &above->rb_right;
+		else
+			return cur;
+	}
+
+	timewait_info->inserted_remote_id = 1;
+	rb_link_node(&timewait_info->remote_id_node, above, slot);
+	rb_insert_color(&timewait_info->remote_id_node, &cm.remote_id_table);
+	return NULL;
+}
+
+/*
+ * Look up the cm.remote_id_table entry keyed by
+ * (@remote_id, @remote_ca_guid).  Returns it, or NULL if absent.
+ */
+static struct cm_timewait_info * cm_find_remote_id(__be64 remote_ca_guid,
+						   __be32 remote_id)
+{
+	struct rb_node *pos = cm.remote_id_table.rb_node;
+	struct cm_timewait_info *cur;
+
+	while (pos) {
+		cur = rb_entry(pos, struct cm_timewait_info, remote_id_node);
+
+		if (be32_lt(remote_id, cur->work.remote_id)) {
+			pos = pos->rb_left;
+			continue;
+		}
+		if (be32_gt(remote_id, cur->work.remote_id)) {
+			pos = pos->rb_right;
+			continue;
+		}
+		if (be64_lt(remote_ca_guid, cur->remote_ca_guid)) {
+			pos = pos->rb_left;
+			continue;
+		}
+		if (be64_gt(remote_ca_guid, cur->remote_ca_guid)) {
+			pos = pos->rb_right;
+			continue;
+		}
+		return cur;
+	}
+
+	return NULL;
+}
+
+/*
+ * Insert @timewait_info into cm.remote_qp_table, keyed by
+ * (remote_qpn, remote_ca_guid).
+ *
+ * Returns NULL after inserting (and sets inserted_remote_qp), or the
+ * existing entry with the same key, in which case nothing is inserted.
+ */
+static struct cm_timewait_info * cm_insert_remote_qpn(struct cm_timewait_info
+						      *timewait_info)
+{
+	struct rb_node **slot = &cm.remote_qp_table.rb_node;
+	struct rb_node *above = NULL;
+	struct cm_timewait_info *cur;
+	__be64 guid = timewait_info->remote_ca_guid;
+	__be32 qpn = timewait_info->remote_qpn;
+
+	while (*slot) {
+		above = *slot;
+		cur = rb_entry(above, struct cm_timewait_info,
+			       remote_qp_node);
+
+		if (be32_lt(qpn, cur->remote_qpn))
+			slot = &above->rb_left;
+		else if (be32_gt(qpn, cur->remote_qpn))
+			slot = &above->rb_right;
+		else if (be64_lt(guid, cur->remote_ca_guid))
+			slot = &above->rb_left;
+		else if (be64_gt(guid, cur->remote_ca_guid))
+			slot = &above->rb_right;
+		else
+			return cur;
+	}
+
+	timewait_info->inserted_remote_qp = 1;
+	rb_link_node(&timewait_info->remote_qp_node, above, slot);
+	rb_insert_color(&timewait_info->remote_qp_node, &cm.remote_qp_table);
+	return NULL;
+}
+
+/*
+ * Insert @cm_id_priv into cm.remote_sidr_table, keyed by
+ * (remote_id, av.dgid).
+ *
+ * Returns NULL after inserting, or the existing entry with the same key,
+ * in which case nothing is inserted.
+ */
+static struct cm_id_private * cm_insert_remote_sidr(struct cm_id_private
+						    *cm_id_priv)
+{
+	struct rb_node **slot = &cm.remote_sidr_table.rb_node;
+	struct rb_node *above = NULL;
+	struct cm_id_private *cur;
+	union ib_gid *port_gid = &cm_id_priv->av.dgid;
+	__be32 remote_id = cm_id_priv->id.remote_id;
+
+	while (*slot) {
+		int gid_cmp;
+
+		above = *slot;
+		cur = rb_entry(above, struct cm_id_private, sidr_id_node);
+
+		if (be32_lt(remote_id, cur->id.remote_id)) {
+			slot = &above->rb_left;
+			continue;
+		}
+		if (be32_gt(remote_id, cur->id.remote_id)) {
+			slot = &above->rb_right;
+			continue;
+		}
+
+		/* Same remote id: fall back to ordering by GID bytes. */
+		gid_cmp = memcmp(port_gid, &cur->av.dgid, sizeof *port_gid);
+		if (!gid_cmp)
+			return cur;
+
+		slot = (gid_cmp < 0) ? &above->rb_left : &above->rb_right;
+	}
+
+	rb_link_node(&cm_id_priv->sidr_id_node, above, slot);
+	rb_insert_color(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table);
+	return NULL;
+}
+
+/*
+ * Send a SIDR reply for @cm_id_priv that carries only @status; every other
+ * reply parameter is zeroed.  The return value of ib_send_cm_sidr_rep() is
+ * not checked.
+ */
+static void cm_reject_sidr_req(struct cm_id_private *cm_id_priv,
+ enum ib_cm_sidr_status status)
+{
+ struct ib_cm_sidr_rep_param param;
+
+ memset(&param, 0, sizeof param);
+ param.status = status;
+ ib_send_cm_sidr_rep(&cm_id_priv->id, &param);
+}
+
+/*
+ * Allocate and initialise a new communication identifier in the IDLE
+ * state.
+ *
+ * @device:     device the id is bound to
+ * @cm_handler: callback stored in the id
+ * @context:    opaque caller context stored in the id
+ *
+ * Returns the new id (holding one reference), ERR_PTR(-ENOMEM) if the
+ * private structure cannot be allocated, or ERR_PTR() of the error
+ * reported by cm_alloc_id() if no local id could be reserved.
+ */
+struct ib_cm_id *ib_create_cm_id(struct ib_device *device,
+				 ib_cm_handler cm_handler,
+				 void *context)
+{
+	struct cm_id_private *cm_id_priv;
+	int ret;
+
+	cm_id_priv = kzalloc(sizeof *cm_id_priv, GFP_KERNEL);
+	if (!cm_id_priv)
+		return ERR_PTR(-ENOMEM);
+
+	cm_id_priv->id.state = IB_CM_IDLE;
+	cm_id_priv->id.device = device;
+	cm_id_priv->id.cm_handler = cm_handler;
+	cm_id_priv->id.context = context;
+	cm_id_priv->id.remote_cm_qpn = 1;
+	ret = cm_alloc_id(cm_id_priv);
+	if (ret)
+		goto error;
+
+	spin_lock_init(&cm_id_priv->lock);
+	init_completion(&cm_id_priv->comp);
+	INIT_LIST_HEAD(&cm_id_priv->work_list);
+	atomic_set(&cm_id_priv->work_count, -1);
+	atomic_set(&cm_id_priv->refcount, 1);
+	return &cm_id_priv->id;
+
+error:
+	kfree(cm_id_priv);
+	/* Report the real cause instead of a blanket -ENOMEM. */
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(ib_create_cm_id);
+
+/*
+ * Unlink and return the first entry of cm_id_priv->work_list, or NULL if
+ * the list is empty.  No locking is done here.
+ */
+static struct cm_work * cm_dequeue_work(struct cm_id_private *cm_id_priv)
+{
+	struct list_head *queue = &cm_id_priv->work_list;
+	struct cm_work *first;
+
+	if (list_empty(queue))
+		return NULL;
+
+	first = list_entry(queue->next, struct cm_work, list);
+	list_del(&first->list);
+
+	return first;
+}
+
+/* Free a work item, releasing its received MAD first if it has one. */
+static void cm_free_work(struct cm_work *work)
+{
+	struct ib_mad_recv_wc *recv = work->mad_recv_wc;
+
+	if (recv)
+		ib_free_recv_mad(recv);
+
+	kfree(work);
+}
+
+/*
+ * Approximate conversion to ms from 4.096us x 2^iba_time: returns
+ * 2^(iba_time - 8), with the exponent clamped at 0.
+ */
+static inline int cm_convert_to_ms(int iba_time)
+{
+	int exponent = iba_time - 8;
+
+	if (exponent < 0)
+		exponent = 0;
+
+	return 1 << exponent;
+}
+
+/*
+ * calculate: 4.096x2^ack_timeout = 4.096x2^ack_delay + 2x4.096x2^life_time
+ * Because of how ack_timeout is stored, adding one doubles the timeout.
+ * To avoid large timeouts, select the max(ack_delay, life_time + 1), and
+ * increment it (round up) only if the other is within 50%.
+ */
+static u8 cm_ack_timeout(u8 ca_ack_delay, u8 packet_life_time)
+{
+	int life = packet_life_time + 1;
+	int delay = ca_ack_delay;
+	int larger, smaller;
+	int ack_timeout;
+
+	if (life >= delay) {
+		larger = life;
+		smaller = delay;
+	} else {
+		larger = delay;
+		smaller = life;
+	}
+
+	/* Round up by one step when the smaller term is within one of the larger. */
+	ack_timeout = larger;
+	if (smaller >= larger - 1)
+		ack_timeout++;
+
+	/* Result is capped at 31. */
+	if (ack_timeout > 31)
+		ack_timeout = 31;
+
+	return ack_timeout;
+}
+
+/*
+ * Remove a timewait entry from the remote-id and remote-QP trees, for each
+ * tree it is currently marked as inserted in, and clear the markers.
+ */
+static void cm_cleanup_timewait(struct cm_timewait_info *timewait_info)
+{
+	struct cm_timewait_info *tw = timewait_info;
+
+	/* Remote comm-id index. */
+	if (tw->inserted_remote_id) {
+		rb_erase(&tw->remote_id_node, &cm.remote_id_table);
+		tw->inserted_remote_id = 0;
+	}
+
+	/* Remote QP index. */
+	if (tw->inserted_remote_qp) {
+		rb_erase(&tw->remote_qp_node, &cm.remote_qp_table);
+		tw->inserted_remote_qp = 0;
+	}
+}
+
+/*
+ * Allocate a zeroed timewait record for the given local id.  Its embedded
+ * work item is set up to run cm_work_handler and to report
+ * IB_CM_TIMEWAIT_EXIT.  Returns ERR_PTR(-ENOMEM) on allocation failure.
+ */
+static struct cm_timewait_info * cm_create_timewait_info(__be32 local_id)
+{
+	struct cm_timewait_info *tw = kzalloc(sizeof *tw, GFP_KERNEL);
+
+	if (tw == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_DELAYED_WORK(&tw->work.work, cm_work_handler);
+	tw->work.local_id = local_id;
+	tw->work.cm_event.event = IB_CM_TIMEWAIT_EXIT;
+	return tw;
+}
+
+/*
+ * Move cm_id_priv into IB_CM_TIMEWAIT: put its timewait record on the
+ * global timewait list, schedule the record's delayed work, and detach the
+ * record from the cm_id.
+ */
+static void cm_enter_timewait(struct cm_id_private *cm_id_priv)
+{
+	struct cm_timewait_info *tw = cm_id_priv->timewait_info;
+	unsigned long irq_flags;
+	int delay_ms;
+
+	spin_lock_irqsave(&cm.lock, irq_flags);
+	cm_cleanup_timewait(tw);
+	list_add_tail(&tw->list, &cm.timewait_list);
+	spin_unlock_irqrestore(&cm.lock, irq_flags);
+
+	/*
+	 * The user may destroy the cm_id before timewait ends.  For that
+	 * reason the cm_id is looked up again once timewait has expired,
+	 * and only then is the user told that timewait was exited.
+	 */
+	cm_id_priv->id.state = IB_CM_TIMEWAIT;
+	delay_ms = cm_convert_to_ms(cm_id_priv->av.timeout);
+	queue_delayed_work(cm.wq, &tw->work.work, msecs_to_jiffies(delay_ms));
+	cm_id_priv->timewait_info = NULL;
+}
+
+/*
+ * Put cm_id_priv back into IB_CM_IDLE.  If it owns a timewait record, the
+ * record is removed from the lookup trees and freed.
+ */
+static void cm_reset_to_idle(struct cm_id_private *cm_id_priv)
+{
+	unsigned long irq_flags;
+
+	cm_id_priv->id.state = IB_CM_IDLE;
+	if (!cm_id_priv->timewait_info)
+		return;
+
+	spin_lock_irqsave(&cm.lock, irq_flags);
+	cm_cleanup_timewait(cm_id_priv->timewait_info);
+	spin_unlock_irqrestore(&cm.lock, irq_flags);
+
+	kfree(cm_id_priv->timewait_info);
+	cm_id_priv->timewait_info = NULL;
+}
+
+/*
+ * Tear down a CM identifier.
+ *
+ * Depending on the current state, this cancels the outstanding MAD and/or
+ * sends the matching REJ, DREQ, DREP or SIDR reject.  It then releases the
+ * local id, drops the caller's reference, waits on cm_id_priv->comp, frees
+ * any queued work and finally frees the private structure.
+ *
+ * @err is the value that triggered destruction (0 from ib_destroy_cm_id).
+ * It is only examined in IB_CM_REQ_RCVD, where -ENOMEM suppresses the REJ.
+ */
+static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_work *work;
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+retest:
+ spin_lock_irq(&cm_id_priv->lock);
+ switch (cm_id->state) {
+ case IB_CM_LISTEN:
+ cm_id->state = IB_CM_IDLE;
+ spin_unlock_irq(&cm_id_priv->lock);
+ /* The listen table is protected by the global cm.lock. */
+ spin_lock_irq(&cm.lock);
+ rb_erase(&cm_id_priv->service_node, &cm.listen_service_table);
+ spin_unlock_irq(&cm.lock);
+ break;
+ case IB_CM_SIDR_REQ_SENT:
+ cm_id->state = IB_CM_IDLE;
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ spin_unlock_irq(&cm_id_priv->lock);
+ break;
+ case IB_CM_SIDR_REQ_RCVD:
+ spin_unlock_irq(&cm_id_priv->lock);
+ cm_reject_sidr_req(cm_id_priv, IB_SIDR_REJECT);
+ break;
+ case IB_CM_REQ_SENT:
+ case IB_CM_MRA_REQ_RCVD:
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ spin_unlock_irq(&cm_id_priv->lock);
+ /* The REJ carries the local node GUID as additional reject info. */
+ ib_send_cm_rej(cm_id, IB_CM_REJ_TIMEOUT,
+ &cm_id_priv->id.device->node_guid,
+ sizeof cm_id_priv->id.device->node_guid,
+ NULL, 0);
+ break;
+ case IB_CM_REQ_RCVD:
+ if (err == -ENOMEM) {
+ /* Do not reject to allow future retries. */
+ cm_reset_to_idle(cm_id_priv);
+ spin_unlock_irq(&cm_id_priv->lock);
+ } else {
+ spin_unlock_irq(&cm_id_priv->lock);
+ ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED,
+ NULL, 0, NULL, 0);
+ }
+ break;
+ case IB_CM_REP_SENT:
+ case IB_CM_MRA_REP_RCVD:
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ /* Fall through */
+ case IB_CM_MRA_REQ_SENT:
+ case IB_CM_REP_RCVD:
+ case IB_CM_MRA_REP_SENT:
+ spin_unlock_irq(&cm_id_priv->lock);
+ ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED,
+ NULL, 0, NULL, 0);
+ break;
+ case IB_CM_ESTABLISHED:
+ spin_unlock_irq(&cm_id_priv->lock);
+ if (cm_id_priv->qp_type == IB_QPT_XRC_TGT)
+ break;
+ /* Send a DREQ, then re-evaluate whatever state that left us in. */
+ ib_send_cm_dreq(cm_id, NULL, 0);
+ goto retest;
+ case IB_CM_DREQ_SENT:
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ cm_enter_timewait(cm_id_priv);
+ spin_unlock_irq(&cm_id_priv->lock);
+ break;
+ case IB_CM_DREQ_RCVD:
+ spin_unlock_irq(&cm_id_priv->lock);
+ ib_send_cm_drep(cm_id, NULL, 0);
+ break;
+ default:
+ spin_unlock_irq(&cm_id_priv->lock);
+ break;
+ }
+
+ /*
+ * Release the local id and our reference, then wait on the completion
+ * before freeing queued work and the structure itself.
+ */
+ cm_free_id(cm_id->local_id);
+ cm_deref_id(cm_id_priv);
+ wait_for_completion(&cm_id_priv->comp);
+ while ((work = cm_dequeue_work(cm_id_priv)) != NULL)
+ cm_free_work(work);
+ kfree(cm_id_priv->compare_data);
+ kfree(cm_id_priv->private_data);
+ kfree(cm_id_priv);
+}
+
+/*
+ * ib_destroy_cm_id - destroy a CM identifier.
+ * @cm_id: identifier to destroy
+ *
+ * Exported wrapper around cm_destroy_id() that passes an error value of 0.
+ */
+void ib_destroy_cm_id(struct ib_cm_id *cm_id)
+{
+ cm_destroy_id(cm_id, 0);
+}
+EXPORT_SYMBOL(ib_destroy_cm_id);
+
+/*
+ * ib_cm_listen - start listening for requests on a service id.
+ * @cm_id: idle identifier to turn into a listener
+ * @service_id: service id to match; IB_CM_ASSIGN_SERVICE_ID asks the CM to
+ *   pick one
+ * @service_mask: bits of @service_id that must match; 0 means all bits
+ * @compare_data: optional data/mask pair copied into the identifier
+ *
+ * Returns 0 on success, -EINVAL for a bad service id or a non-idle
+ * identifier, -ENOMEM if the compare data cannot be copied, and -EBUSY if
+ * cm_insert_listen() reports an existing listener.
+ */
+int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
+		 struct ib_cm_compare_data *compare_data)
+{
+	struct cm_id_private *cm_id_priv, *conflict;
+	unsigned long irq_flags;
+
+	if (!service_mask)
+		service_mask = ~cpu_to_be64(0);
+	service_id &= service_mask;
+
+	/* Ids in the CM-assigned range are refused, except the wildcard. */
+	if (service_id != IB_CM_ASSIGN_SERVICE_ID &&
+	    (service_id & IB_SERVICE_ID_AGN_MASK) == IB_CM_ASSIGN_SERVICE_ID)
+		return -EINVAL;
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	if (cm_id->state != IB_CM_IDLE)
+		return -EINVAL;
+
+	if (compare_data) {
+		cm_id_priv->compare_data = kzalloc(sizeof *compare_data,
+						   GFP_KERNEL);
+		if (!cm_id_priv->compare_data)
+			return -ENOMEM;
+
+		/* Keep the masked data together with the mask itself. */
+		cm_mask_copy(cm_id_priv->compare_data->data,
+			     compare_data->data, compare_data->mask);
+		memcpy(cm_id_priv->compare_data->mask, compare_data->mask,
+		       sizeof(compare_data->mask));
+	}
+
+	cm_id->state = IB_CM_LISTEN;
+
+	spin_lock_irqsave(&cm.lock, irq_flags);
+	if (service_id == IB_CM_ASSIGN_SERVICE_ID) {
+		cm_id->service_id = cpu_to_be64(cm.listen_service_id++);
+		cm_id->service_mask = ~cpu_to_be64(0);
+	} else {
+		cm_id->service_id = service_id;
+		cm_id->service_mask = service_mask;
+	}
+	conflict = cm_insert_listen(cm_id_priv);
+	spin_unlock_irqrestore(&cm.lock, irq_flags);
+
+	if (!conflict)
+		return 0;
+
+	/* Insertion was refused: undo the state change and the copy. */
+	cm_id->state = IB_CM_IDLE;
+	kfree(cm_id_priv->compare_data);
+	cm_id_priv->compare_data = NULL;
+	return -EBUSY;
+}
+EXPORT_SYMBOL(ib_cm_listen);
+
+/*
+ * Build the 64-bit transaction id for an outgoing MAD.  The upper 32 bits
+ * come from the MAD agent's hi_tid.  The lower 32 bits are the local comm
+ * id with the message sequence OR-ed into bits 30-31.
+ */
+static __be64 cm_form_tid(struct cm_id_private *cm_id_priv,
+			  enum cm_msg_sequence msg_seq)
+{
+	u64 hi_tid, low_tid;
+
+	hi_tid = ((u64) cm_id_priv->av.port->mad_agent->hi_tid) << 32;
+	/*
+	 * Shift as an unsigned 32-bit value: a sequence number of 2 or more
+	 * would otherwise be shifted into the sign bit of a signed int.
+	 */
+	low_tid = (u64) ((__force u32) cm_id_priv->id.local_id |
+			 ((u32) msg_seq << 30));
+	return cpu_to_be64(hi_tid | low_tid);
+}
+
+/*
+ * Fill in the common MAD header for a CM message: fixed base version,
+ * CM class and class version, SEND method, plus the given attribute id
+ * and transaction id.
+ */
+static void cm_format_mad_hdr(struct ib_mad_hdr *hdr,
+			      __be16 attr_id, __be64 tid)
+{
+	/* Per-message values. */
+	hdr->attr_id = attr_id;
+	hdr->tid = tid;
+
+	/* Values shared by every CM MAD. */
+	hdr->method = IB_MGMT_METHOD_SEND;
+	hdr->mgmt_class = IB_MGMT_CLASS_CM;
+	hdr->class_version = IB_CM_CLASS_VERSION;
+	hdr->base_version = IB_MGMT_BASE_VERSION;
+}
+
+/*
+ * Build a REQ message from the caller's request parameters: MAD header,
+ * connection attributes, primary path, optional alternate path and
+ * optional private data.
+ */
+static void cm_format_req(struct cm_req_msg *req_msg,
+			  struct cm_id_private *cm_id_priv,
+			  struct ib_cm_req_param *param)
+{
+	struct ib_sa_path_rec *pri_path = param->primary_path;
+	struct ib_sa_path_rec *alt_path = param->alternate_path;
+	int pri_subnet_local = (pri_path->hop_limit <= 1);
+
+	cm_format_mad_hdr(&req_msg->hdr, CM_REQ_ATTR_ID,
+			  cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_REQ));
+
+	/* Connection-wide attributes. */
+	req_msg->local_comm_id = cm_id_priv->id.local_id;
+	req_msg->service_id = param->service_id;
+	req_msg->local_ca_guid = cm_id_priv->id.device->node_guid;
+	cm_req_set_local_qpn(req_msg, cpu_to_be32(param->qp_num));
+	cm_req_set_init_depth(req_msg, param->initiator_depth);
+	cm_req_set_remote_resp_timeout(req_msg,
+				       param->remote_cm_response_timeout);
+	cm_req_set_qp_type(req_msg, param->qp_type);
+	cm_req_set_flow_ctrl(req_msg, param->flow_control);
+	cm_req_set_starting_psn(req_msg, cpu_to_be32(param->starting_psn));
+	cm_req_set_local_resp_timeout(req_msg,
+				      param->local_cm_response_timeout);
+	req_msg->pkey = pri_path->pkey;
+	cm_req_set_path_mtu(req_msg, pri_path->mtu);
+	cm_req_set_max_cm_retries(req_msg, param->max_cm_retries);
+
+	/* These fields are left untouched for XRC initiator QPs. */
+	if (param->qp_type != IB_QPT_XRC_INI) {
+		cm_req_set_resp_res(req_msg, param->responder_resources);
+		cm_req_set_retry_count(req_msg, param->retry_count);
+		cm_req_set_rnr_retry_count(req_msg, param->rnr_retry_count);
+		cm_req_set_srq(req_msg, param->srq);
+	}
+
+	/* Primary path. */
+	if (pri_subnet_local) {
+		req_msg->primary_local_lid = pri_path->slid;
+		req_msg->primary_remote_lid = pri_path->dlid;
+	} else {
+		/* Work-around until there's a way to obtain remote LID info */
+		req_msg->primary_local_lid = IB_LID_PERMISSIVE;
+		req_msg->primary_remote_lid = IB_LID_PERMISSIVE;
+	}
+	req_msg->primary_local_gid = pri_path->sgid;
+	req_msg->primary_remote_gid = pri_path->dgid;
+	cm_req_set_primary_flow_label(req_msg, pri_path->flow_label);
+	cm_req_set_primary_packet_rate(req_msg, pri_path->rate);
+	req_msg->primary_traffic_class = pri_path->traffic_class;
+	req_msg->primary_hop_limit = pri_path->hop_limit;
+	cm_req_set_primary_sl(req_msg, pri_path->sl);
+	cm_req_set_primary_subnet_local(req_msg, pri_subnet_local);
+	cm_req_set_primary_local_ack_timeout(req_msg,
+		cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay,
+			       pri_path->packet_life_time));
+
+	/* Alternate path, when one was supplied. */
+	if (alt_path) {
+		int alt_subnet_local = (alt_path->hop_limit <= 1);
+
+		if (alt_subnet_local) {
+			req_msg->alt_local_lid = alt_path->slid;
+			req_msg->alt_remote_lid = alt_path->dlid;
+		} else {
+			req_msg->alt_local_lid = IB_LID_PERMISSIVE;
+			req_msg->alt_remote_lid = IB_LID_PERMISSIVE;
+		}
+		req_msg->alt_local_gid = alt_path->sgid;
+		req_msg->alt_remote_gid = alt_path->dgid;
+		cm_req_set_alt_flow_label(req_msg, alt_path->flow_label);
+		cm_req_set_alt_packet_rate(req_msg, alt_path->rate);
+		req_msg->alt_traffic_class = alt_path->traffic_class;
+		req_msg->alt_hop_limit = alt_path->hop_limit;
+		cm_req_set_alt_sl(req_msg, alt_path->sl);
+		cm_req_set_alt_subnet_local(req_msg, alt_subnet_local);
+		cm_req_set_alt_local_ack_timeout(req_msg,
+			cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay,
+				       alt_path->packet_life_time));
+	}
+
+	if (param->private_data && param->private_data_len)
+		memcpy(req_msg->private_data, param->private_data,
+		       param->private_data_len);
+}
+
+/*
+ * Sanity-check the parameters of an outgoing REQ.  Returns 0 when they are
+ * acceptable and -EINVAL otherwise.
+ */
+static int cm_validate_req_param(struct ib_cm_req_param *param)
+{
+	struct ib_sa_path_rec *pri = param->primary_path;
+	struct ib_sa_path_rec *alt = param->alternate_path;
+
+	/* peer-to-peer not supported */
+	if (param->peer_to_peer)
+		return -EINVAL;
+
+	/* A primary path is mandatory. */
+	if (!pri)
+		return -EINVAL;
+
+	/* Only RC, UC and XRC initiator QPs may issue a REQ. */
+	switch (param->qp_type) {
+	case IB_QPT_RC:
+	case IB_QPT_UC:
+	case IB_QPT_XRC_INI:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	/* Private data must fit into the REQ message. */
+	if (param->private_data &&
+	    param->private_data_len > IB_CM_REQ_PRIVATE_DATA_SIZE)
+		return -EINVAL;
+
+	/* An alternate path has to share the primary's pkey and MTU. */
+	if (alt && (alt->pkey != pri->pkey || alt->mtu != pri->mtu))
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * ib_send_cm_req - send a connection request (REQ).
+ * @cm_id: idle identifier to connect with
+ * @param: request parameters (paths, QP attributes, private data)
+ *
+ * Validates @param, allocates the timewait record and the address
+ * vector(s), formats and posts the REQ MAD, and moves @cm_id to
+ * IB_CM_REQ_SENT.  Returns 0 on success or a negative errno.
+ *
+ * On failure cm_id_priv->timewait_info is always reset to NULL, so no
+ * later code can see an ERR_PTR() value or a pointer to freed memory.
+ */
+int ib_send_cm_req(struct ib_cm_id *cm_id,
+		   struct ib_cm_req_param *param)
+{
+	struct cm_id_private *cm_id_priv;
+	struct cm_req_msg *req_msg;
+	unsigned long flags;
+	int ret;
+
+	ret = cm_validate_req_param(param);
+	if (ret)
+		return ret;
+
+	/* Verify that we're not in timewait. */
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	if (cm_id->state != IB_CM_IDLE) {
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		ret = -EINVAL;
+		goto out;
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+	cm_id_priv->timewait_info = cm_create_timewait_info(cm_id_priv->
+							    id.local_id);
+	if (IS_ERR(cm_id_priv->timewait_info)) {
+		ret = PTR_ERR(cm_id_priv->timewait_info);
+		/* Do not leave an error cookie behind as a pointer. */
+		cm_id_priv->timewait_info = NULL;
+		goto out;
+	}
+
+	ret = cm_init_av_by_path(param->primary_path, &cm_id_priv->av);
+	if (ret)
+		goto error1;
+	if (param->alternate_path) {
+		ret = cm_init_av_by_path(param->alternate_path,
+					 &cm_id_priv->alt_av);
+		if (ret)
+			goto error1;
+	}
+	cm_id->service_id = param->service_id;
+	cm_id->service_mask = ~cpu_to_be64(0);
+	/* Two packet life times plus the remote CM's response time. */
+	cm_id_priv->timeout_ms = cm_convert_to_ms(
+				    param->primary_path->packet_life_time) * 2 +
+				 cm_convert_to_ms(
+				    param->remote_cm_response_timeout);
+	cm_id_priv->max_cm_retries = param->max_cm_retries;
+	cm_id_priv->initiator_depth = param->initiator_depth;
+	cm_id_priv->responder_resources = param->responder_resources;
+	cm_id_priv->retry_count = param->retry_count;
+	cm_id_priv->path_mtu = param->primary_path->mtu;
+	cm_id_priv->pkey = param->primary_path->pkey;
+	cm_id_priv->qp_type = param->qp_type;
+
+	ret = cm_alloc_msg(cm_id_priv, &cm_id_priv->msg);
+	if (ret)
+		goto error1;
+
+	req_msg = (struct cm_req_msg *) cm_id_priv->msg->mad;
+	cm_format_req(req_msg, cm_id_priv, param);
+	cm_id_priv->tid = req_msg->hdr.tid;
+	cm_id_priv->msg->timeout_ms = cm_id_priv->timeout_ms;
+	cm_id_priv->msg->context[1] = (void *) (unsigned long) IB_CM_REQ_SENT;
+
+	cm_id_priv->local_qpn = cm_req_get_local_qpn(req_msg);
+	cm_id_priv->rq_psn = cm_req_get_starting_psn(req_msg);
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	ret = ib_post_send_mad(cm_id_priv->msg, NULL);
+	if (ret) {
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		goto error2;
+	}
+	BUG_ON(cm_id->state != IB_CM_IDLE);
+	cm_id->state = IB_CM_REQ_SENT;
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return 0;
+
+error2:	cm_free_msg(cm_id_priv->msg);
+error1:	kfree(cm_id_priv->timewait_info);
+	/* Clear the pointer so nothing can reuse the freed record. */
+	cm_id_priv->timewait_info = NULL;
+out:	return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_req);
+
+/*
+ * Send a REJ in direct response to a received MAD, without a cm_id.  The
+ * comm ids are taken from the received message and swapped.  Optional
+ * additional reject information (@ari) is copied into the REJ.
+ *
+ * Returns 0 on success or the error from allocating or posting the MAD.
+ */
+static int cm_issue_rej(struct cm_port *port,
+			struct ib_mad_recv_wc *mad_recv_wc,
+			enum ib_cm_rej_reason reason,
+			enum cm_msg_response msg_rejected,
+			void *ari, u8 ari_length)
+{
+	struct ib_mad_send_buf *reply = NULL;
+	struct cm_rej_msg *rej_msg, *rcv_msg;
+	int err;
+
+	err = cm_alloc_response_msg(port, mad_recv_wc, &reply);
+	if (err)
+		return err;
+
+	/* We just need common CM header information.  Cast to any message. */
+	rcv_msg = (struct cm_rej_msg *) mad_recv_wc->recv_buf.mad;
+	rej_msg = (struct cm_rej_msg *) reply->mad;
+
+	cm_format_mad_hdr(&rej_msg->hdr, CM_REJ_ATTR_ID, rcv_msg->hdr.tid);
+	/* Our remote is the sender's local id, and vice versa. */
+	rej_msg->remote_comm_id = rcv_msg->local_comm_id;
+	rej_msg->local_comm_id = rcv_msg->remote_comm_id;
+	cm_rej_set_msg_rejected(rej_msg, msg_rejected);
+	rej_msg->reason = cpu_to_be16(reason);
+
+	if (ari && ari_length) {
+		cm_rej_set_reject_info_len(rej_msg, ari_length);
+		memcpy(rej_msg->ari, ari, ari_length);
+	}
+
+	err = ib_post_send_mad(reply, NULL);
+	if (err)
+		cm_free_msg(reply);
+
+	return err;
+}
+
+/*
+ * Decide whether the local side is the active peer.  The side with the
+ * numerically larger CA GUID wins; on equal GUIDs, the side with the
+ * larger QPN does.  Returns 1 if the local side is active, otherwise 0.
+ */
+static inline int cm_is_active_peer(__be64 local_ca_guid, __be64 remote_ca_guid,
+				    __be32 local_qpn, __be32 remote_qpn)
+{
+	if (be64_to_cpu(local_ca_guid) > be64_to_cpu(remote_ca_guid))
+		return 1;
+
+	if (local_ca_guid != remote_ca_guid)
+		return 0;
+
+	/* Same GUID: fall back to comparing queue pair numbers. */
+	return be32_to_cpu(local_qpn) > be32_to_cpu(remote_qpn);
+}
+
+/*
+ * Rebuild path records from a received REQ, seen from the receiver's
+ * side: the sender's "local" GID/LID become the destination and its
+ * "remote" GID/LID become the source.  The alternate path is filled in
+ * only when the REQ has a non-zero alternate local LID.
+ *
+ * packet_life_time is taken from the REQ's local ack timeout, reduced by
+ * one unless it is already zero.
+ */
+static void cm_format_paths_from_req(struct cm_req_msg *req_msg,
+				     struct ib_sa_path_rec *primary_path,
+				     struct ib_sa_path_rec *alt_path)
+{
+	memset(primary_path, 0, sizeof *primary_path);
+
+	/* Addressing, with source and destination swapped. */
+	primary_path->dgid = req_msg->primary_local_gid;
+	primary_path->sgid = req_msg->primary_remote_gid;
+	primary_path->dlid = req_msg->primary_local_lid;
+	primary_path->slid = req_msg->primary_remote_lid;
+
+	/* Routing and QoS attributes. */
+	primary_path->flow_label = cm_req_get_primary_flow_label(req_msg);
+	primary_path->hop_limit = req_msg->primary_hop_limit;
+	primary_path->traffic_class = req_msg->primary_traffic_class;
+	primary_path->reversible = 1;
+	primary_path->pkey = req_msg->pkey;
+	primary_path->sl = cm_req_get_primary_sl(req_msg);
+
+	/* MTU, rate and life time are exact values (IB_SA_EQ). */
+	primary_path->mtu_selector = IB_SA_EQ;
+	primary_path->mtu = cm_req_get_path_mtu(req_msg);
+	primary_path->rate_selector = IB_SA_EQ;
+	primary_path->rate = cm_req_get_primary_packet_rate(req_msg);
+	primary_path->packet_life_time_selector = IB_SA_EQ;
+	primary_path->packet_life_time =
+		cm_req_get_primary_local_ack_timeout(req_msg);
+	if (primary_path->packet_life_time > 0)
+		primary_path->packet_life_time--;
+
+	if (!req_msg->alt_local_lid)
+		return;
+
+	memset(alt_path, 0, sizeof *alt_path);
+
+	alt_path->dgid = req_msg->alt_local_gid;
+	alt_path->sgid = req_msg->alt_remote_gid;
+	alt_path->dlid = req_msg->alt_local_lid;
+	alt_path->slid = req_msg->alt_remote_lid;
+
+	alt_path->flow_label = cm_req_get_alt_flow_label(req_msg);
+	alt_path->hop_limit = req_msg->alt_hop_limit;
+	alt_path->traffic_class = req_msg->alt_traffic_class;
+	alt_path->reversible = 1;
+	alt_path->pkey = req_msg->pkey;
+	alt_path->sl = cm_req_get_alt_sl(req_msg);
+
+	alt_path->mtu_selector = IB_SA_EQ;
+	alt_path->mtu = cm_req_get_path_mtu(req_msg);
+	alt_path->rate_selector = IB_SA_EQ;
+	alt_path->rate = cm_req_get_alt_packet_rate(req_msg);
+	alt_path->packet_life_time_selector = IB_SA_EQ;
+	alt_path->packet_life_time =
+		cm_req_get_alt_local_ack_timeout(req_msg);
+	if (alt_path->packet_life_time > 0)
+		alt_path->packet_life_time--;
+}
+
+/*
+ * Fill in the REQ-received event carried by @work from the received REQ
+ * message.  Values are reported from the receiver's point of view, so the
+ * sender's initiator depth becomes our responder resources, its local
+ * response timeout becomes our remote one, and so on.
+ */
+static void cm_format_req_event(struct cm_work *work,
+				struct cm_id_private *cm_id_priv,
+				struct ib_cm_id *listen_id)
+{
+	struct cm_req_msg *req_msg =
+		(struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad;
+	struct ib_cm_req_event_param *param = &work->cm_event.param.req_rcvd;
+
+	param->listen_id = listen_id;
+	param->port = cm_id_priv->av.port->port_num;
+
+	/* Path records live in the work item itself. */
+	param->primary_path = &work->path[0];
+	param->alternate_path = req_msg->alt_local_lid ? &work->path[1] : NULL;
+
+	/* Remote endpoint identity. */
+	param->remote_ca_guid = req_msg->local_ca_guid;
+	param->remote_qkey = be32_to_cpu(req_msg->local_qkey);
+	param->remote_qpn = be32_to_cpu(cm_req_get_local_qpn(req_msg));
+	param->qp_type = cm_req_get_qp_type(req_msg);
+	param->starting_psn = be32_to_cpu(cm_req_get_starting_psn(req_msg));
+
+	/* Mirrored resource and timeout values. */
+	param->responder_resources = cm_req_get_init_depth(req_msg);
+	param->initiator_depth = cm_req_get_resp_res(req_msg);
+	param->local_cm_response_timeout =
+		cm_req_get_remote_resp_timeout(req_msg);
+	param->flow_control = cm_req_get_flow_ctrl(req_msg);
+	param->remote_cm_response_timeout =
+		cm_req_get_local_resp_timeout(req_msg);
+
+	param->retry_count = cm_req_get_retry_count(req_msg);
+	param->rnr_retry_count = cm_req_get_rnr_retry_count(req_msg);
+	param->srq = cm_req_get_srq(req_msg);
+
+	work->cm_event.private_data = &req_msg->private_data;
+}
+
+/*
+ * Deliver @work to the identifier's cm_handler, then keep delivering any
+ * further work queued on the identifier for as long as the handler returns
+ * 0 and the decremented work_count stays non-negative.  Each work item is
+ * freed after its handler call.
+ *
+ * Drops one reference on @cm_id_priv when done.  If a handler returned a
+ * non-zero value, the identifier is destroyed with that value as the error.
+ */
+static void cm_process_work(struct cm_id_private *cm_id_priv,
+ struct cm_work *work)
+{
+ int ret;
+
+ /* We will typically only have the current event to report. */
+ ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &work->cm_event);
+ cm_free_work(work);
+
+ while (!ret && !atomic_add_negative(-1, &cm_id_priv->work_count)) {
+ /* The work list is only touched with the cm_id lock held. */
+ spin_lock_irq(&cm_id_priv->lock);
+ work = cm_dequeue_work(cm_id_priv);
+ spin_unlock_irq(&cm_id_priv->lock);
+ BUG_ON(!work);
+ ret = cm_id_priv->id.cm_handler(&cm_id_priv->id,
+ &work->cm_event);
+ cm_free_work(work);
+ }
+ cm_deref_id(cm_id_priv);
+ if (ret)
+ cm_destroy_id(&cm_id_priv->id, ret);
+}
+
+/*
+ * Build an MRA message for cm_id_priv: MAD header with the stored
+ * transaction id, the type of message being acknowledged, both comm ids,
+ * the service timeout and optional private data.
+ */
+static void cm_format_mra(struct cm_mra_msg *mra_msg,
+			  struct cm_id_private *cm_id_priv,
+			  enum cm_msg_response msg_mraed, u8 service_timeout,
+			  const void *private_data, u8 private_data_len)
+{
+	struct ib_cm_id *id = &cm_id_priv->id;
+
+	cm_format_mad_hdr(&mra_msg->hdr, CM_MRA_ATTR_ID, cm_id_priv->tid);
+	cm_mra_set_msg_mraed(mra_msg, msg_mraed);
+	mra_msg->local_comm_id = id->local_id;
+	mra_msg->remote_comm_id = id->remote_id;
+	cm_mra_set_service_timeout(mra_msg, service_timeout);
+
+	if (!private_data || !private_data_len)
+		return;
+
+	memcpy(mra_msg->private_data, private_data, private_data_len);
+}
+
+/*
+ * Build a REJ message for cm_id_priv.  The current state selects which
+ * message type is reported as rejected (REQ, REP or other).  In
+ * IB_CM_REQ_RCVD the local comm id is sent as 0; in every other state it
+ * is the identifier's local id.  Additional reject information and
+ * private data are copied in when supplied.
+ */
+static void cm_format_rej(struct cm_rej_msg *rej_msg,
+			  struct cm_id_private *cm_id_priv,
+			  enum ib_cm_rej_reason reason,
+			  void *ari,
+			  u8 ari_length,
+			  const void *private_data,
+			  u8 private_data_len)
+{
+	__be32 local_comm_id = cm_id_priv->id.local_id;
+	enum cm_msg_response rejected;
+
+	cm_format_mad_hdr(&rej_msg->hdr, CM_REJ_ATTR_ID, cm_id_priv->tid);
+	rej_msg->remote_comm_id = cm_id_priv->id.remote_id;
+
+	switch (cm_id_priv->id.state) {
+	case IB_CM_REQ_RCVD:
+		local_comm_id = 0;
+		rejected = CM_MSG_RESPONSE_REQ;
+		break;
+	case IB_CM_MRA_REQ_SENT:
+		rejected = CM_MSG_RESPONSE_REQ;
+		break;
+	case IB_CM_REP_RCVD:
+	case IB_CM_MRA_REP_SENT:
+		rejected = CM_MSG_RESPONSE_REP;
+		break;
+	default:
+		rejected = CM_MSG_RESPONSE_OTHER;
+		break;
+	}
+	rej_msg->local_comm_id = local_comm_id;
+	cm_rej_set_msg_rejected(rej_msg, rejected);
+
+	rej_msg->reason = cpu_to_be16(reason);
+	if (ari && ari_length) {
+		cm_rej_set_reject_info_len(rej_msg, ari_length);
+		memcpy(rej_msg->ari, ari, ari_length);
+	}
+
+	if (private_data && private_data_len)
+		memcpy(rej_msg->private_data, private_data, private_data_len);
+}
+
+/*
+ * Handle a REQ that duplicates one already known for @cm_id_priv.
+ *
+ * Counts the duplicate, then answers according to the existing
+ * identifier's state: an MRA is re-sent in IB_CM_MRA_REQ_SENT and a
+ * stale-connection REJ is sent in IB_CM_TIMEWAIT.  In any other state
+ * nothing is sent.
+ */
+static void cm_dup_req_handler(struct cm_work *work,
+ struct cm_id_private *cm_id_priv)
+{
+ struct ib_mad_send_buf *msg = NULL;
+ int ret;
+
+ atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+ counter[CM_REQ_COUNTER]);
+
+ /* Quick state check to discard duplicate REQs. */
+ if (cm_id_priv->id.state == IB_CM_REQ_RCVD)
+ return;
+
+ ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg);
+ if (ret)
+ return;
+
+ /* Format the reply under the cm_id lock; post it after unlocking. */
+ spin_lock_irq(&cm_id_priv->lock);
+ switch (cm_id_priv->id.state) {
+ case IB_CM_MRA_REQ_SENT:
+ cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
+ CM_MSG_RESPONSE_REQ, cm_id_priv->service_timeout,
+ cm_id_priv->private_data,
+ cm_id_priv->private_data_len);
+ break;
+ case IB_CM_TIMEWAIT:
+ cm_format_rej((struct cm_rej_msg *) msg->mad, cm_id_priv,
+ IB_CM_REJ_STALE_CONN, NULL, 0, NULL, 0);
+ break;
+ default:
+ goto unlock;
+ }
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret)
+ goto free;
+ return;
+
+unlock: spin_unlock_irq(&cm_id_priv->lock);
+free: cm_free_msg(msg);
+}
+
+/*
+ * Match a newly received REQ against existing connections and listeners.
+ *
+ * With cm.lock held, the new identifier's timewait record is inserted into
+ * the remote-id table (duplicate REQ detection) and the remote-QPN table
+ * (stale connection detection), and a matching listener is looked up.
+ *
+ * On success the listener is returned with its reference count raised.
+ * @cm_id_priv also gains a reference, moves to IB_CM_REQ_RCVD and has its
+ * work_count incremented.  Returns NULL when the REQ is a duplicate, is
+ * stale, or matches no listener; a REJ is issued in the last two cases.
+ */
+static struct cm_id_private * cm_match_req(struct cm_work *work,
+ struct cm_id_private *cm_id_priv)
+{
+ struct cm_id_private *listen_cm_id_priv, *cur_cm_id_priv;
+ struct cm_timewait_info *timewait_info;
+ struct cm_req_msg *req_msg;
+
+ req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad;
+
+ /* Check for possible duplicate REQ. */
+ spin_lock_irq(&cm.lock);
+ timewait_info = cm_insert_remote_id(cm_id_priv->timewait_info);
+ if (timewait_info) {
+ cur_cm_id_priv = cm_get_id(timewait_info->work.local_id,
+ timewait_info->work.remote_id);
+ spin_unlock_irq(&cm.lock);
+ if (cur_cm_id_priv) {
+ cm_dup_req_handler(work, cur_cm_id_priv);
+ cm_deref_id(cur_cm_id_priv);
+ }
+ return NULL;
+ }
+
+ /* Check for stale connections. */
+ timewait_info = cm_insert_remote_qpn(cm_id_priv->timewait_info);
+ if (timewait_info) {
+ /* Undo the remote-id insertion made above before rejecting. */
+ cm_cleanup_timewait(cm_id_priv->timewait_info);
+ spin_unlock_irq(&cm.lock);
+ cm_issue_rej(work->port, work->mad_recv_wc,
+ IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REQ,
+ NULL, 0);
+ return NULL;
+ }
+
+ /* Find matching listen request. */
+ listen_cm_id_priv = cm_find_listen(cm_id_priv->id.device,
+ req_msg->service_id,
+ req_msg->private_data);
+ if (!listen_cm_id_priv) {
+ cm_cleanup_timewait(cm_id_priv->timewait_info);
+ spin_unlock_irq(&cm.lock);
+ cm_issue_rej(work->port, work->mad_recv_wc,
+ IB_CM_REJ_INVALID_SERVICE_ID, CM_MSG_RESPONSE_REQ,
+ NULL, 0);
+ goto out;
+ }
+ atomic_inc(&listen_cm_id_priv->refcount);
+ atomic_inc(&cm_id_priv->refcount);
+ cm_id_priv->id.state = IB_CM_REQ_RCVD;
+ atomic_inc(&cm_id_priv->work_count);
+ spin_unlock_irq(&cm.lock);
+out:
+ return listen_cm_id_priv;
+}
+
+/*
+ * Work-around for inter-subnet connections.  When a path in the REQ is not
+ * subnet-local and carries permissive LIDs, replace the LID/SL values in
+ * the REQ with the ones found in the work completion.
+ */
+static void cm_process_routed_req(struct cm_req_msg *req_msg, struct ib_wc *wc)
+{
+	int routed;
+
+	/* Primary path. */
+	routed = !cm_req_get_primary_subnet_local(req_msg);
+	if (routed && req_msg->primary_local_lid == IB_LID_PERMISSIVE) {
+		req_msg->primary_local_lid = cpu_to_be16(wc->slid);
+		cm_req_set_primary_sl(req_msg, wc->sl);
+	}
+	if (routed && req_msg->primary_remote_lid == IB_LID_PERMISSIVE)
+		req_msg->primary_remote_lid = cpu_to_be16(wc->dlid_path_bits);
+
+	/* Alternate path. */
+	routed = !cm_req_get_alt_subnet_local(req_msg);
+	if (routed && req_msg->alt_local_lid == IB_LID_PERMISSIVE) {
+		req_msg->alt_local_lid = cpu_to_be16(wc->slid);
+		cm_req_set_alt_sl(req_msg, wc->sl);
+	}
+	if (routed && req_msg->alt_remote_lid == IB_LID_PERMISSIVE)
+		req_msg->alt_remote_lid = cpu_to_be16(wc->dlid_path_bits);
+}
+
+/*
+ * Process a received REQ.
+ *
+ * Creates a new identifier for the connection, matches the REQ against
+ * existing connections and listeners, initialises the address vectors from
+ * the paths in the REQ, records the connection attributes and delivers a
+ * REQ-received event through the listener's handler.
+ *
+ * Returns 0 when the event was handed to cm_process_work(), otherwise a
+ * negative errno; on those paths the new identifier is destroyed.
+ *
+ * cm_id_priv->timewait_info is reset to NULL whenever it is freed here or
+ * holds an ERR_PTR(), so the identifier is never destroyed with a stale
+ * pointer in that field.
+ */
+static int cm_req_handler(struct cm_work *work)
+{
+	struct ib_cm_id *cm_id;
+	struct cm_id_private *cm_id_priv, *listen_cm_id_priv;
+	struct cm_req_msg *req_msg;
+	int ret;
+
+	req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad;
+
+	cm_id = ib_create_cm_id(work->port->cm_dev->ib_device, NULL, NULL);
+	if (IS_ERR(cm_id))
+		return PTR_ERR(cm_id);
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	cm_id_priv->id.remote_id = req_msg->local_comm_id;
+	cm_init_av_for_response(work->port, work->mad_recv_wc->wc,
+				work->mad_recv_wc->recv_buf.grh,
+				&cm_id_priv->av);
+	cm_id_priv->timewait_info = cm_create_timewait_info(cm_id_priv->
+							    id.local_id);
+	if (IS_ERR(cm_id_priv->timewait_info)) {
+		ret = PTR_ERR(cm_id_priv->timewait_info);
+		/* Do not keep an error cookie in a pointer field. */
+		cm_id_priv->timewait_info = NULL;
+		goto destroy;
+	}
+	cm_id_priv->timewait_info->work.remote_id = req_msg->local_comm_id;
+	cm_id_priv->timewait_info->remote_ca_guid = req_msg->local_ca_guid;
+	cm_id_priv->timewait_info->remote_qpn = cm_req_get_local_qpn(req_msg);
+
+	listen_cm_id_priv = cm_match_req(work, cm_id_priv);
+	if (!listen_cm_id_priv) {
+		ret = -EINVAL;
+		kfree(cm_id_priv->timewait_info);
+		/* Avoid a dangling pointer while the id is torn down. */
+		cm_id_priv->timewait_info = NULL;
+		goto destroy;
+	}
+
+	/* The new connection inherits the listener's handler and context. */
+	cm_id_priv->id.cm_handler = listen_cm_id_priv->id.cm_handler;
+	cm_id_priv->id.context = listen_cm_id_priv->id.context;
+	cm_id_priv->id.service_id = req_msg->service_id;
+	cm_id_priv->id.service_mask = ~cpu_to_be64(0);
+
+	cm_process_routed_req(req_msg, work->mad_recv_wc->wc);
+	cm_format_paths_from_req(req_msg, &work->path[0], &work->path[1]);
+
+	memcpy(work->path[0].dmac, cm_id_priv->av.ah_attr.dmac, ETH_ALEN);
+	work->path[0].vlan_id = cm_id_priv->av.ah_attr.vlan_id;
+	ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av);
+	if (ret) {
+		/* Report the port's GID at index 0 in the REJ. */
+		ib_get_cached_gid(work->port->cm_dev->ib_device,
+				  work->port->port_num, 0, &work->path[0].sgid);
+		ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID,
+			       &work->path[0].sgid, sizeof work->path[0].sgid,
+			       NULL, 0);
+		goto rejected;
+	}
+	if (req_msg->alt_local_lid) {
+		ret = cm_init_av_by_path(&work->path[1], &cm_id_priv->alt_av);
+		if (ret) {
+			ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_ALT_GID,
+				       &work->path[0].sgid,
+				       sizeof work->path[0].sgid, NULL, 0);
+			goto rejected;
+		}
+	}
+	cm_id_priv->tid = req_msg->hdr.tid;
+	cm_id_priv->timeout_ms = cm_convert_to_ms(
+					cm_req_get_local_resp_timeout(req_msg));
+	cm_id_priv->max_cm_retries = cm_req_get_max_cm_retries(req_msg);
+	cm_id_priv->remote_qpn = cm_req_get_local_qpn(req_msg);
+	cm_id_priv->initiator_depth = cm_req_get_resp_res(req_msg);
+	cm_id_priv->responder_resources = cm_req_get_init_depth(req_msg);
+	cm_id_priv->path_mtu = cm_req_get_path_mtu(req_msg);
+	cm_id_priv->pkey = req_msg->pkey;
+	cm_id_priv->sq_psn = cm_req_get_starting_psn(req_msg);
+	cm_id_priv->retry_count = cm_req_get_retry_count(req_msg);
+	cm_id_priv->rnr_retry_count = cm_req_get_rnr_retry_count(req_msg);
+	cm_id_priv->qp_type = cm_req_get_qp_type(req_msg);
+
+	cm_format_req_event(work, cm_id_priv, &listen_cm_id_priv->id);
+	cm_process_work(cm_id_priv, work);
+	cm_deref_id(listen_cm_id_priv);
+	return 0;
+
+rejected:
+	/* Give back the references taken by cm_match_req(). */
+	atomic_dec(&cm_id_priv->refcount);
+	cm_deref_id(listen_cm_id_priv);
+destroy:
+	ib_destroy_cm_id(cm_id);
+	return ret;
+}
+
+/*
+ * Build a REP message for cm_id_priv from the caller's reply parameters.
+ * For XRC target QPs the SRQ bit is forced to 1 and the QP number is
+ * written with cm_rep_set_local_eecn() instead of cm_rep_set_local_qpn().
+ */
+static void cm_format_rep(struct cm_rep_msg *rep_msg,
+			  struct cm_id_private *cm_id_priv,
+			  struct ib_cm_rep_param *param)
+{
+	__be32 qpn = cpu_to_be32(param->qp_num);
+
+	cm_format_mad_hdr(&rep_msg->hdr, CM_REP_ATTR_ID, cm_id_priv->tid);
+	rep_msg->local_comm_id = cm_id_priv->id.local_id;
+	rep_msg->remote_comm_id = cm_id_priv->id.remote_id;
+	cm_rep_set_starting_psn(rep_msg, cpu_to_be32(param->starting_psn));
+	rep_msg->resp_resources = param->responder_resources;
+	cm_rep_set_target_ack_delay(rep_msg,
+				    cm_id_priv->av.port->cm_dev->ack_delay);
+	cm_rep_set_failover(rep_msg, param->failover_accepted);
+	cm_rep_set_rnr_retry_count(rep_msg, param->rnr_retry_count);
+	rep_msg->local_ca_guid = cm_id_priv->id.device->node_guid;
+
+	if (cm_id_priv->qp_type == IB_QPT_XRC_TGT) {
+		cm_rep_set_srq(rep_msg, 1);
+		cm_rep_set_local_eecn(rep_msg, qpn);
+	} else {
+		rep_msg->initiator_depth = param->initiator_depth;
+		cm_rep_set_flow_ctrl(rep_msg, param->flow_control);
+		cm_rep_set_srq(rep_msg, param->srq);
+		cm_rep_set_local_qpn(rep_msg, qpn);
+	}
+
+	if (param->private_data && param->private_data_len)
+		memcpy(rep_msg->private_data, param->private_data,
+		       param->private_data_len);
+}
+
+/*
+ * ib_send_cm_rep - send a connection reply (REP).
+ * @cm_id: identifier in IB_CM_REQ_RCVD or IB_CM_MRA_REQ_SENT
+ * @param: reply parameters
+ *
+ * Returns 0 on success and moves @cm_id to IB_CM_REP_SENT.  Returns
+ * -EINVAL for oversized private data or a wrong state, or the error from
+ * allocating or posting the MAD.
+ */
+int ib_send_cm_rep(struct ib_cm_id *cm_id,
+		   struct ib_cm_rep_param *param)
+{
+	struct cm_id_private *cm_id_priv;
+	struct ib_mad_send_buf *msg;
+	struct cm_rep_msg *rep_msg;
+	unsigned long irq_flags;
+	int ret;
+
+	if (param->private_data &&
+	    param->private_data_len > IB_CM_REP_PRIVATE_DATA_SIZE)
+		return -EINVAL;
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	spin_lock_irqsave(&cm_id_priv->lock, irq_flags);
+
+	/* A REP is only valid as the answer to a pending REQ. */
+	switch (cm_id->state) {
+	case IB_CM_REQ_RCVD:
+	case IB_CM_MRA_REQ_SENT:
+		break;
+	default:
+		ret = -EINVAL;
+		goto unlock;
+	}
+
+	ret = cm_alloc_msg(cm_id_priv, &msg);
+	if (ret)
+		goto unlock;
+
+	rep_msg = (struct cm_rep_msg *) msg->mad;
+	cm_format_rep(rep_msg, cm_id_priv, param);
+	msg->timeout_ms = cm_id_priv->timeout_ms;
+	msg->context[1] = (void *) (unsigned long) IB_CM_REP_SENT;
+
+	ret = ib_post_send_mad(msg, NULL);
+	if (ret) {
+		/* Free the unsent MAD only after the lock is released. */
+		spin_unlock_irqrestore(&cm_id_priv->lock, irq_flags);
+		cm_free_msg(msg);
+		return ret;
+	}
+
+	/* Record what was sent. */
+	cm_id->state = IB_CM_REP_SENT;
+	cm_id_priv->msg = msg;
+	cm_id_priv->initiator_depth = param->initiator_depth;
+	cm_id_priv->responder_resources = param->responder_resources;
+	cm_id_priv->rq_psn = cm_rep_get_starting_psn(rep_msg);
+	cm_id_priv->local_qpn = cpu_to_be32(param->qp_num & 0xFFFFFF);
+
+unlock:
+	spin_unlock_irqrestore(&cm_id_priv->lock, irq_flags);
+	return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_rep);
+
+/*
+ * Build an RTU message for cm_id_priv: MAD header with the stored
+ * transaction id, both comm ids and optional private data.
+ */
+static void cm_format_rtu(struct cm_rtu_msg *rtu_msg,
+			  struct cm_id_private *cm_id_priv,
+			  const void *private_data,
+			  u8 private_data_len)
+{
+	struct ib_cm_id *id = &cm_id_priv->id;
+
+	cm_format_mad_hdr(&rtu_msg->hdr, CM_RTU_ATTR_ID, cm_id_priv->tid);
+	rtu_msg->local_comm_id = id->local_id;
+	rtu_msg->remote_comm_id = id->remote_id;
+
+	if (!private_data || !private_data_len)
+		return;
+
+	memcpy(rtu_msg->private_data, private_data, private_data_len);
+}
+
+/*
+ * ib_send_cm_rtu - send a ready-to-use (RTU) message.
+ * @cm_id: identifier in IB_CM_REP_RCVD or IB_CM_MRA_REP_SENT
+ * @private_data: optional private data to include
+ * @private_data_len: length of @private_data
+ *
+ * On success the identifier moves to IB_CM_ESTABLISHED and a copy of the
+ * private data is stored in it.  Returns 0 on success.  Returns -EINVAL
+ * for oversized private data or a wrong state, or the error from copying
+ * the data or from allocating or posting the MAD.
+ */
+int ib_send_cm_rtu(struct ib_cm_id *cm_id,
+		   const void *private_data,
+		   u8 private_data_len)
+{
+	struct cm_id_private *cm_id_priv;
+	struct ib_mad_send_buf *msg;
+	unsigned long irq_flags;
+	void *copy;
+	int ret;
+
+	if (private_data && private_data_len > IB_CM_RTU_PRIVATE_DATA_SIZE)
+		return -EINVAL;
+
+	/* Copy the private data before the lock is taken. */
+	copy = cm_copy_private_data(private_data, private_data_len);
+	if (IS_ERR(copy))
+		return PTR_ERR(copy);
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	spin_lock_irqsave(&cm_id_priv->lock, irq_flags);
+
+	/* An RTU is only valid as the answer to a received REP. */
+	switch (cm_id->state) {
+	case IB_CM_REP_RCVD:
+	case IB_CM_MRA_REP_SENT:
+		break;
+	default:
+		ret = -EINVAL;
+		goto unlock_free;
+	}
+
+	ret = cm_alloc_msg(cm_id_priv, &msg);
+	if (ret)
+		goto unlock_free;
+
+	cm_format_rtu((struct cm_rtu_msg *) msg->mad, cm_id_priv,
+		      private_data, private_data_len);
+
+	ret = ib_post_send_mad(msg, NULL);
+	if (ret) {
+		spin_unlock_irqrestore(&cm_id_priv->lock, irq_flags);
+		cm_free_msg(msg);
+		kfree(copy);
+		return ret;
+	}
+
+	cm_id->state = IB_CM_ESTABLISHED;
+	cm_set_private_data(cm_id_priv, copy, private_data_len);
+	spin_unlock_irqrestore(&cm_id_priv->lock, irq_flags);
+	return 0;
+
+unlock_free:
+	spin_unlock_irqrestore(&cm_id_priv->lock, irq_flags);
+	kfree(copy);
+	return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_rtu);
+
+/*
+ * Fill in the REP-received event carried by @work from the received REP
+ * message.  Initiator depth and responder resources are swapped so that
+ * they read from the receiver's point of view.  @qp_type is passed to
+ * cm_rep_get_qpn() to extract the remote QP number.
+ */
+static void cm_format_rep_event(struct cm_work *work, enum ib_qp_type qp_type)
+{
+	struct cm_rep_msg *rep_msg =
+		(struct cm_rep_msg *)work->mad_recv_wc->recv_buf.mad;
+	struct ib_cm_rep_event_param *param = &work->cm_event.param.rep_rcvd;
+
+	/* Remote endpoint identity. */
+	param->remote_ca_guid = rep_msg->local_ca_guid;
+	param->remote_qkey = be32_to_cpu(rep_msg->local_qkey);
+	param->remote_qpn = be32_to_cpu(cm_rep_get_qpn(rep_msg, qp_type));
+	param->starting_psn = be32_to_cpu(cm_rep_get_starting_psn(rep_msg));
+
+	/* Mirrored resource values. */
+	param->responder_resources = rep_msg->initiator_depth;
+	param->initiator_depth = rep_msg->resp_resources;
+
+	/* Remaining connection attributes. */
+	param->target_ack_delay = cm_rep_get_target_ack_delay(rep_msg);
+	param->failover_accepted = cm_rep_get_failover(rep_msg);
+	param->flow_control = cm_rep_get_flow_ctrl(rep_msg);
+	param->rnr_retry_count = cm_rep_get_rnr_retry_count(rep_msg);
+	param->srq = cm_rep_get_srq(rep_msg);
+
+	work->cm_event.private_data = &rep_msg->private_data;
+}
+
+/*
+ * Answer a REP that cm_rep_handler() could not match by remote_comm_id
+ * alone.  The id is looked up again using both comm ids; if it exists the
+ * duplicate-REP counter is bumped and the reply already given for this
+ * connection is repeated:
+ *   - IB_CM_ESTABLISHED:   an RTU built from the stored private data,
+ *   - IB_CM_MRA_REP_SENT:  an MRA built from the stored service timeout
+ *                          and private data.
+ * In any other state nothing is sent.
+ */
+static void cm_dup_rep_handler(struct cm_work *work)
+{
+	struct cm_rep_msg *rep_msg =
+		(struct cm_rep_msg *) work->mad_recv_wc->recv_buf.mad;
+	struct ib_mad_send_buf *msg = NULL;
+	struct cm_id_private *cm_id_priv;
+
+	cm_id_priv = cm_acquire_id(rep_msg->remote_comm_id,
+				   rep_msg->local_comm_id);
+	if (!cm_id_priv)
+		return;
+
+	atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+			counter[CM_REP_COUNTER]);
+	if (cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg))
+		goto deref;
+
+	spin_lock_irq(&cm_id_priv->lock);
+	switch (cm_id_priv->id.state) {
+	case IB_CM_ESTABLISHED:
+		cm_format_rtu((struct cm_rtu_msg *) msg->mad, cm_id_priv,
+			      cm_id_priv->private_data,
+			      cm_id_priv->private_data_len);
+		break;
+	case IB_CM_MRA_REP_SENT:
+		cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
+			      CM_MSG_RESPONSE_REP, cm_id_priv->service_timeout,
+			      cm_id_priv->private_data,
+			      cm_id_priv->private_data_len);
+		break;
+	default:
+		/* No reply applies in this state; drop the response buffer. */
+		spin_unlock_irq(&cm_id_priv->lock);
+		goto free;
+	}
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	if (!ib_post_send_mad(msg, NULL))
+		goto deref;
+
+free:	cm_free_msg(msg);
+deref:	cm_deref_id(cm_id_priv);
+}
+
+/*
+ * Handle a received REP.
+ *
+ * The id is looked up by the REP's remote_comm_id only (second lookup
+ * argument is 0).  If none is found the MAD is passed to
+ * cm_dup_rep_handler() and -EINVAL is returned.  The REP is accepted only
+ * while the id is in IB_CM_REQ_SENT or IB_CM_MRA_REQ_RCVD; it is dropped
+ * when cm_insert_remote_id() or cm_insert_remote_qpn() returns non-zero,
+ * and in the latter case an IB_CM_REJ_STALE_CONN reject is issued.
+ *
+ * Returns 0 once the event has been processed or queued, -EINVAL otherwise.
+ */
+static int cm_rep_handler(struct cm_work *work)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_rep_msg *rep_msg;
+ int ret;
+
+ rep_msg = (struct cm_rep_msg *)work->mad_recv_wc->recv_buf.mad;
+ cm_id_priv = cm_acquire_id(rep_msg->remote_comm_id, 0);
+ if (!cm_id_priv) {
+ cm_dup_rep_handler(work);
+ return -EINVAL;
+ }
+
+ cm_format_rep_event(work, cm_id_priv->qp_type);
+
+ spin_lock_irq(&cm_id_priv->lock);
+ switch (cm_id_priv->id.state) {
+ case IB_CM_REQ_SENT:
+ case IB_CM_MRA_REQ_RCVD:
+ break;
+ default:
+ spin_unlock_irq(&cm_id_priv->lock);
+ ret = -EINVAL;
+ goto error;
+ }
+
+ /* Record the sender's identity; these fields key the two inserts below. */
+ cm_id_priv->timewait_info->work.remote_id = rep_msg->local_comm_id;
+ cm_id_priv->timewait_info->remote_ca_guid = rep_msg->local_ca_guid;
+ cm_id_priv->timewait_info->remote_qpn = cm_rep_get_qpn(rep_msg, cm_id_priv->qp_type);
+
+ /* cm.lock nests inside the id lock; interrupts are already disabled. */
+ spin_lock(&cm.lock);
+ /* Check for duplicate REP. */
+ if (cm_insert_remote_id(cm_id_priv->timewait_info)) {
+ spin_unlock(&cm.lock);
+ spin_unlock_irq(&cm_id_priv->lock);
+ ret = -EINVAL;
+ goto error;
+ }
+ /* Check for a stale connection. */
+ if (cm_insert_remote_qpn(cm_id_priv->timewait_info)) {
+ /* Undo the remote id insertion made just above. */
+ rb_erase(&cm_id_priv->timewait_info->remote_id_node,
+ &cm.remote_id_table);
+ cm_id_priv->timewait_info->inserted_remote_id = 0;
+ spin_unlock(&cm.lock);
+ spin_unlock_irq(&cm_id_priv->lock);
+ cm_issue_rej(work->port, work->mad_recv_wc,
+ IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REP,
+ NULL, 0);
+ ret = -EINVAL;
+ goto error;
+ }
+ spin_unlock(&cm.lock);
+
+ /* REP accepted: adopt the connection parameters it carries. */
+ cm_id_priv->id.state = IB_CM_REP_RCVD;
+ cm_id_priv->id.remote_id = rep_msg->local_comm_id;
+ cm_id_priv->remote_qpn = cm_rep_get_qpn(rep_msg, cm_id_priv->qp_type);
+ /* The sender's resp_resources / initiator_depth are stored swapped. */
+ cm_id_priv->initiator_depth = rep_msg->resp_resources;
+ cm_id_priv->responder_resources = rep_msg->initiator_depth;
+ cm_id_priv->sq_psn = cm_rep_get_starting_psn(rep_msg);
+ cm_id_priv->rnr_retry_count = cm_rep_get_rnr_retry_count(rep_msg);
+ cm_id_priv->target_ack_delay = cm_rep_get_target_ack_delay(rep_msg);
+ /*
+ * Recompute both path timeouts with the sender's target ack delay.
+ * NOTE(review): the "- 1" presumably turns the stored timeout back into
+ * a packet life time before cm_ack_timeout() is reapplied - confirm.
+ */
+ cm_id_priv->av.timeout =
+ cm_ack_timeout(cm_id_priv->target_ack_delay,
+ cm_id_priv->av.timeout - 1);
+ cm_id_priv->alt_av.timeout =
+ cm_ack_timeout(cm_id_priv->target_ack_delay,
+ cm_id_priv->alt_av.timeout - 1);
+
+ /* todo: handle peer_to_peer */
+
+ /* Cancel the send tracked in cm_id_priv->msg (presumably the REQ). */
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ /*
+ * atomic_inc_and_test() is true when work_count becomes zero: this
+ * context then processes the event itself.  Otherwise the work item is
+ * queued on work_list and only the lookup reference is dropped.
+ */
+ ret = atomic_inc_and_test(&cm_id_priv->work_count);
+ if (!ret)
+ list_add_tail(&work->list, &cm_id_priv->work_list);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ if (ret)
+ cm_process_work(cm_id_priv, work);
+ else
+ cm_deref_id(cm_id_priv);
+ return 0;
+
+error:
+ cm_deref_id(cm_id_priv);
+ return ret;
+}
+
+/*
+ * Deliver an "established" work item to the id identified by @work's
+ * local/remote ids.  The event is only delivered while the id is in
+ * IB_CM_ESTABLISHED.
+ *
+ * Returns 0 when the event was processed or queued, -EINVAL when the id
+ * was not found or is in another state.
+ */
+static int cm_establish_handler(struct cm_work *work)
+{
+	struct cm_id_private *cm_id_priv;
+	int process_now;
+
+	/* See comment in cm_establish about lookup. */
+	cm_id_priv = cm_acquire_id(work->local_id, work->remote_id);
+	if (!cm_id_priv)
+		return -EINVAL;
+
+	spin_lock_irq(&cm_id_priv->lock);
+	if (cm_id_priv->id.state != IB_CM_ESTABLISHED) {
+		spin_unlock_irq(&cm_id_priv->lock);
+		cm_deref_id(cm_id_priv);
+		return -EINVAL;
+	}
+
+	ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+
+	/*
+	 * True when work_count reaches zero, in which case this context runs
+	 * the event; otherwise the work is queued on the id's work_list.
+	 */
+	process_now = atomic_inc_and_test(&cm_id_priv->work_count);
+	if (!process_now)
+		list_add_tail(&work->list, &cm_id_priv->work_list);
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	if (!process_now) {
+		cm_deref_id(cm_id_priv);
+		return 0;
+	}
+
+	cm_process_work(cm_id_priv, work);
+	return 0;
+}
+
+/*
+ * Handle a received RTU.  The RTU is accepted only while the id is in
+ * IB_CM_REP_SENT or IB_CM_MRA_REP_RCVD, in which case the id becomes
+ * IB_CM_ESTABLISHED and the outstanding send is cancelled.  In any other
+ * state the duplicate-RTU counter is incremented and the message dropped.
+ *
+ * Returns 0 when the event was processed or queued, -EINVAL otherwise.
+ */
+static int cm_rtu_handler(struct cm_work *work)
+{
+	struct cm_rtu_msg *rtu_msg =
+		(struct cm_rtu_msg *)work->mad_recv_wc->recv_buf.mad;
+	struct cm_id_private *cm_id_priv;
+	int process_now;
+
+	cm_id_priv = cm_acquire_id(rtu_msg->remote_comm_id,
+				   rtu_msg->local_comm_id);
+	if (!cm_id_priv)
+		return -EINVAL;
+
+	/* Points into the received MAD; no copy is made. */
+	work->cm_event.private_data = &rtu_msg->private_data;
+
+	spin_lock_irq(&cm_id_priv->lock);
+	switch (cm_id_priv->id.state) {
+	case IB_CM_REP_SENT:
+	case IB_CM_MRA_REP_RCVD:
+		break;
+	default:
+		spin_unlock_irq(&cm_id_priv->lock);
+		atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+				counter[CM_RTU_COUNTER]);
+		cm_deref_id(cm_id_priv);
+		return -EINVAL;
+	}
+	cm_id_priv->id.state = IB_CM_ESTABLISHED;
+
+	ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+
+	/* Process here if work_count hits zero, else queue on work_list. */
+	process_now = atomic_inc_and_test(&cm_id_priv->work_count);
+	if (!process_now)
+		list_add_tail(&work->list, &cm_id_priv->work_list);
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	if (!process_now) {
+		cm_deref_id(cm_id_priv);
+		return 0;
+	}
+
+	cm_process_work(cm_id_priv, work);
+	return 0;
+}
+
+/*
+ * Build a DREQ in @dreq_msg for @cm_id_priv: MAD header with a DREQ
+ * sequence TID, both comm ids, the remote QPN and, when supplied, the
+ * caller's private data.  The caller must already have bounded
+ * @private_data_len.
+ */
+static void cm_format_dreq(struct cm_dreq_msg *dreq_msg,
+			  struct cm_id_private *cm_id_priv,
+			  const void *private_data,
+			  u8 private_data_len)
+{
+	cm_format_mad_hdr(&dreq_msg->hdr, CM_DREQ_ATTR_ID,
+			  cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_DREQ));
+
+	dreq_msg->local_comm_id = cm_id_priv->id.local_id;
+	dreq_msg->remote_comm_id = cm_id_priv->id.remote_id;
+	cm_dreq_set_remote_qpn(dreq_msg, cm_id_priv->remote_qpn);
+
+	/* Private data is optional. */
+	if (!private_data || !private_data_len)
+		return;
+
+	memcpy(dreq_msg->private_data, private_data, private_data_len);
+}
+
+/*
+ * ib_send_cm_dreq - send a disconnect request (DREQ).
+ * @cm_id: connection identifier; must be in IB_CM_ESTABLISHED.
+ * @private_data: optional user data placed in the DREQ.
+ * @private_data_len: length of @private_data, at most
+ *   IB_CM_DREQ_PRIVATE_DATA_SIZE.
+ *
+ * On success the id moves to IB_CM_DREQ_SENT and the posted message is
+ * remembered in cm_id_priv->msg.  If the message cannot be allocated or
+ * posted, the id is moved to timewait via cm_enter_timewait() and the
+ * error is returned.
+ *
+ * Returns 0 on success, -EINVAL for oversized private data or a wrong
+ * state, or the error from cm_alloc_msg() / ib_post_send_mad().
+ */
+int ib_send_cm_dreq(struct ib_cm_id *cm_id,
+		    const void *private_data,
+		    u8 private_data_len)
+{
+	struct cm_id_private *cm_id_priv;
+	struct ib_mad_send_buf *msg;
+	unsigned long flags;
+	int ret;
+
+	if (private_data && private_data_len > IB_CM_DREQ_PRIVATE_DATA_SIZE)
+		return -EINVAL;
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	if (cm_id->state != IB_CM_ESTABLISHED) {
+		ret = -EINVAL;
+		goto unlock;
+	}
+
+	/* A LAP exchange still in flight is abandoned by the disconnect. */
+	switch (cm_id->lap_state) {
+	case IB_CM_LAP_SENT:
+	case IB_CM_MRA_LAP_RCVD:
+		ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+		break;
+	default:
+		break;
+	}
+
+	ret = cm_alloc_msg(cm_id_priv, &msg);
+	if (ret) {
+		cm_enter_timewait(cm_id_priv);
+		goto unlock;
+	}
+
+	cm_format_dreq((struct cm_dreq_msg *) msg->mad, cm_id_priv,
+		       private_data, private_data_len);
+	msg->timeout_ms = cm_id_priv->timeout_ms;
+	/* context[1] carries the state this send belongs to. */
+	msg->context[1] = (void *) (unsigned long) IB_CM_DREQ_SENT;
+
+	ret = ib_post_send_mad(msg, NULL);
+	if (!ret) {
+		cm_id->state = IB_CM_DREQ_SENT;
+		cm_id_priv->msg = msg;
+		goto unlock;
+	}
+
+	/* Post failed: enter timewait, then free the message unlocked. */
+	cm_enter_timewait(cm_id_priv);
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	cm_free_msg(msg);
+	return ret;
+
+unlock:
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_dreq);
+
+/*
+ * Build a DREP in @drep_msg for @cm_id_priv.  The MAD header reuses the
+ * TID stored in cm_id_priv->tid; both comm ids are filled in and the
+ * optional private data is copied.  The caller must already have bounded
+ * @private_data_len.
+ */
+static void cm_format_drep(struct cm_drep_msg *drep_msg,
+			  struct cm_id_private *cm_id_priv,
+			  const void *private_data,
+			  u8 private_data_len)
+{
+	cm_format_mad_hdr(&drep_msg->hdr, CM_DREP_ATTR_ID, cm_id_priv->tid);
+
+	drep_msg->local_comm_id = cm_id_priv->id.local_id;
+	drep_msg->remote_comm_id = cm_id_priv->id.remote_id;
+
+	/* Private data is optional. */
+	if (!private_data || !private_data_len)
+		return;
+
+	memcpy(drep_msg->private_data, private_data, private_data_len);
+}
+
+/*
+ * ib_send_cm_drep - send a disconnect reply (DREP).
+ * @cm_id: connection identifier; must be in IB_CM_DREQ_RCVD.
+ * @private_data: optional user data placed in the DREP.
+ * @private_data_len: length of @private_data, at most
+ *   IB_CM_DREP_PRIVATE_DATA_SIZE.
+ *
+ * Once the state check passes, the copied private data is attached to the
+ * id and the id enters timewait whether or not the DREP can actually be
+ * sent; the stored copy is what cm_dreq_handler() uses to answer a
+ * repeated DREQ.
+ *
+ * Returns 0 on success, -EINVAL for oversized private data or a wrong
+ * state, or the error from cm_copy_private_data(), cm_alloc_msg() or
+ * ib_post_send_mad().
+ */
+int ib_send_cm_drep(struct ib_cm_id *cm_id,
+		    const void *private_data,
+		    u8 private_data_len)
+{
+	struct cm_id_private *cm_id_priv;
+	struct ib_mad_send_buf *msg;
+	unsigned long flags;
+	void *data;
+	int ret;
+
+	if (private_data && private_data_len > IB_CM_DREP_PRIVATE_DATA_SIZE)
+		return -EINVAL;
+
+	data = cm_copy_private_data(private_data, private_data_len);
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	if (cm_id->state != IB_CM_DREQ_RCVD) {
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		kfree(data);
+		return -EINVAL;
+	}
+
+	/* From here on @data is held by the id; do not free it on error. */
+	cm_set_private_data(cm_id_priv, data, private_data_len);
+	cm_enter_timewait(cm_id_priv);
+
+	ret = cm_alloc_msg(cm_id_priv, &msg);
+	if (!ret) {
+		cm_format_drep((struct cm_drep_msg *) msg->mad, cm_id_priv,
+			       private_data, private_data_len);
+
+		ret = ib_post_send_mad(msg, NULL);
+		if (ret) {
+			/* Free the message only after dropping the lock. */
+			spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+			cm_free_msg(msg);
+			return ret;
+		}
+	}
+
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_drep);
+
+/*
+ * Send a DREP in response to the DREQ in @mad_recv_wc without involving
+ * any cm id: the reply reuses the DREQ's TID and mirrors its comm ids.
+ *
+ * Returns 0 on success or the error from cm_alloc_response_msg() /
+ * ib_post_send_mad().
+ */
+static int cm_issue_drep(struct cm_port *port,
+			 struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct cm_dreq_msg *dreq_msg =
+		(struct cm_dreq_msg *) mad_recv_wc->recv_buf.mad;
+	struct ib_mad_send_buf *msg = NULL;
+	struct cm_drep_msg *drep_msg;
+	int ret;
+
+	ret = cm_alloc_response_msg(port, mad_recv_wc, &msg);
+	if (ret)
+		return ret;
+
+	drep_msg = (struct cm_drep_msg *) msg->mad;
+	cm_format_mad_hdr(&drep_msg->hdr, CM_DREP_ATTR_ID, dreq_msg->hdr.tid);
+
+	/* The sender's local id is our remote id, and vice versa. */
+	drep_msg->remote_comm_id = dreq_msg->local_comm_id;
+	drep_msg->local_comm_id = dreq_msg->remote_comm_id;
+
+	ret = ib_post_send_mad(msg, NULL);
+	if (!ret)
+		return 0;
+
+	cm_free_msg(msg);
+	return ret;
+}
+
+/*
+ * Handle a received DREQ.
+ *
+ * If no id matches the DREQ's comm ids, the duplicate-DREQ counter is
+ * incremented and a DREP is sent via cm_issue_drep().  A DREQ whose
+ * remote QPN differs from the id's local QPN is ignored.  Otherwise:
+ *   - REP_SENT, DREQ_SENT, ESTABLISHED, MRA_REP_RCVD: the id moves to
+ *     IB_CM_DREQ_RCVD and the event is delivered;
+ *   - TIMEWAIT: counted as duplicate and answered with a DREP built from
+ *     the stored private data;
+ *   - DREQ_RCVD: counted as duplicate and dropped;
+ *   - anything else: dropped.
+ *
+ * Returns 0 only when the event was processed or queued, -EINVAL in all
+ * other cases (including the ones that sent a DREP).
+ */
+static int cm_dreq_handler(struct cm_work *work)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_dreq_msg *dreq_msg;
+ struct ib_mad_send_buf *msg = NULL;
+ int ret;
+
+ dreq_msg = (struct cm_dreq_msg *)work->mad_recv_wc->recv_buf.mad;
+ cm_id_priv = cm_acquire_id(dreq_msg->remote_comm_id,
+ dreq_msg->local_comm_id);
+ if (!cm_id_priv) {
+ atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+ counter[CM_DREQ_COUNTER]);
+ cm_issue_drep(work->port, work->mad_recv_wc);
+ return -EINVAL;
+ }
+
+ /* Points into the received MAD; no copy is made. */
+ work->cm_event.private_data = &dreq_msg->private_data;
+
+ spin_lock_irq(&cm_id_priv->lock);
+ /* The DREQ must name our QP. */
+ if (cm_id_priv->local_qpn != cm_dreq_get_remote_qpn(dreq_msg))
+ goto unlock;
+
+ switch (cm_id_priv->id.state) {
+ case IB_CM_REP_SENT:
+ case IB_CM_DREQ_SENT:
+ /* Cancel the send tracked in cm_id_priv->msg. */
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ break;
+ case IB_CM_ESTABLISHED:
+ /* Only a LAP exchange in flight has a send to cancel here. */
+ if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT ||
+ cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD)
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ break;
+ case IB_CM_MRA_REP_RCVD:
+ break;
+ case IB_CM_TIMEWAIT:
+ atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+ counter[CM_DREQ_COUNTER]);
+ if (cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg))
+ goto unlock;
+
+ /* Repeat the DREP using the private data stored on the id. */
+ cm_format_drep((struct cm_drep_msg *) msg->mad, cm_id_priv,
+ cm_id_priv->private_data,
+ cm_id_priv->private_data_len);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ if (ib_post_send_mad(msg, NULL))
+ cm_free_msg(msg);
+ /* Lock already released above, so skip the unlock label. */
+ goto deref;
+ case IB_CM_DREQ_RCVD:
+ atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+ counter[CM_DREQ_COUNTER]);
+ goto unlock;
+ default:
+ goto unlock;
+ }
+ cm_id_priv->id.state = IB_CM_DREQ_RCVD;
+ /* Remember the DREQ's TID; cm_format_drep() reuses cm_id_priv->tid. */
+ cm_id_priv->tid = dreq_msg->hdr.tid;
+ /*
+ * atomic_inc_and_test() is true when work_count becomes zero: this
+ * context then processes the event itself.  Otherwise the work item is
+ * queued on work_list and only the lookup reference is dropped.
+ */
+ ret = atomic_inc_and_test(&cm_id_priv->work_count);
+ if (!ret)
+ list_add_tail(&work->list, &cm_id_priv->work_list);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ if (ret)
+ cm_process_work(cm_id_priv, work);
+ else
+ cm_deref_id(cm_id_priv);
+ return 0;
+
+unlock: spin_unlock_irq(&cm_id_priv->lock);
+deref: cm_deref_id(cm_id_priv);
+ return -EINVAL;
+}
+
+/*
+ * Handle a received DREP.  It is accepted only while the id is in
+ * IB_CM_DREQ_SENT or IB_CM_DREQ_RCVD; the id then enters timewait and the
+ * outstanding send is cancelled.
+ *
+ * Returns 0 when the event was processed or queued, -EINVAL when no id
+ * matched or the id is in another state.
+ */
+static int cm_drep_handler(struct cm_work *work)
+{
+	struct cm_drep_msg *drep_msg =
+		(struct cm_drep_msg *)work->mad_recv_wc->recv_buf.mad;
+	struct cm_id_private *cm_id_priv;
+	int process_now;
+
+	cm_id_priv = cm_acquire_id(drep_msg->remote_comm_id,
+				   drep_msg->local_comm_id);
+	if (!cm_id_priv)
+		return -EINVAL;
+
+	/* Points into the received MAD; no copy is made. */
+	work->cm_event.private_data = &drep_msg->private_data;
+
+	spin_lock_irq(&cm_id_priv->lock);
+	switch (cm_id_priv->id.state) {
+	case IB_CM_DREQ_SENT:
+	case IB_CM_DREQ_RCVD:
+		break;
+	default:
+		spin_unlock_irq(&cm_id_priv->lock);
+		cm_deref_id(cm_id_priv);
+		return -EINVAL;
+	}
+	cm_enter_timewait(cm_id_priv);
+
+	ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+
+	/* Process here if work_count hits zero, else queue on work_list. */
+	process_now = atomic_inc_and_test(&cm_id_priv->work_count);
+	if (!process_now)
+		list_add_tail(&work->list, &cm_id_priv->work_list);
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	if (!process_now) {
+		cm_deref_id(cm_id_priv);
+		return 0;
+	}
+
+	cm_process_work(cm_id_priv, work);
+	return 0;
+}
+
+/*
+ * ib_send_cm_rej - reject a connection.
+ * @cm_id: connection identifier.
+ * @reason: reject reason to report.
+ * @ari: optional additional rejection information.
+ * @ari_length: length of @ari, at most IB_CM_REJ_ARI_LENGTH.
+ * @private_data: optional user data placed in the REJ.
+ * @private_data_len: length of @private_data, at most
+ *   IB_CM_REJ_PRIVATE_DATA_SIZE.
+ *
+ * In the REQ_SENT, MRA_REQ_RCVD, REQ_RCVD, MRA_REQ_SENT, REP_RCVD and
+ * MRA_REP_SENT states the id is reset to idle; in REP_SENT and
+ * MRA_REP_RCVD it enters timewait.  The state transition happens even
+ * when the REJ message cannot be allocated.  Any other state is refused.
+ *
+ * Returns 0 on success, -EINVAL for bad lengths or a wrong state, or the
+ * error from cm_alloc_msg() / ib_post_send_mad().
+ */
+int ib_send_cm_rej(struct ib_cm_id *cm_id,
+		   enum ib_cm_rej_reason reason,
+		   void *ari,
+		   u8 ari_length,
+		   const void *private_data,
+		   u8 private_data_len)
+{
+	struct cm_id_private *cm_id_priv;
+	struct ib_mad_send_buf *msg;
+	unsigned long flags;
+	int to_timewait;
+	int ret;
+
+	if ((private_data && private_data_len > IB_CM_REJ_PRIVATE_DATA_SIZE) ||
+	    (ari && ari_length > IB_CM_REJ_ARI_LENGTH))
+		return -EINVAL;
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+
+	/* Decide which state the id ends up in after the reject. */
+	switch (cm_id->state) {
+	case IB_CM_REQ_SENT:
+	case IB_CM_MRA_REQ_RCVD:
+	case IB_CM_REQ_RCVD:
+	case IB_CM_MRA_REQ_SENT:
+	case IB_CM_REP_RCVD:
+	case IB_CM_MRA_REP_SENT:
+		to_timewait = 0;
+		break;
+	case IB_CM_REP_SENT:
+	case IB_CM_MRA_REP_RCVD:
+		to_timewait = 1;
+		break;
+	default:
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* Build the REJ before changing state, as the formatter reads the id. */
+	ret = cm_alloc_msg(cm_id_priv, &msg);
+	if (!ret)
+		cm_format_rej((struct cm_rej_msg *) msg->mad,
+			      cm_id_priv, reason, ari, ari_length,
+			      private_data, private_data_len);
+
+	if (to_timewait)
+		cm_enter_timewait(cm_id_priv);
+	else
+		cm_reset_to_idle(cm_id_priv);
+
+	if (ret)
+		goto out;
+
+	ret = ib_post_send_mad(msg, NULL);
+	if (ret)
+		cm_free_msg(msg);
+
+out:	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_rej);
+
+/*
+ * Populate the rej_rcvd event parameters of @work from the REJ MAD that
+ * @work carries.  The ari and private_data pointers refer into the
+ * received MAD; nothing is copied.
+ */
+static void cm_format_rej_event(struct cm_work *work)
+{
+	struct cm_rej_msg *rej =
+		(struct cm_rej_msg *)work->mad_recv_wc->recv_buf.mad;
+	struct ib_cm_rej_event_param *ev = &work->cm_event.param.rej_rcvd;
+
+	ev->ari = rej->ari;
+	ev->ari_length = cm_rej_get_reject_info_len(rej);
+	ev->reason = __be16_to_cpu(rej->reason);
+
+	work->cm_event.private_data = &rej->private_data;
+}
+
+/*
+ * Find and take a reference on the id a received REJ refers to.
+ *
+ * - Reason IB_CM_REJ_TIMEOUT: the first 8 bytes of the ARI field together
+ *   with the sender's comm id are passed to cm_find_remote_id(); the
+ *   resulting timewait entry gives the local id, which is accepted only
+ *   if its remote_id matches the sender's comm id.
+ * - REJ of a REQ: lookup by remote_comm_id alone (second argument 0).
+ * - Otherwise: lookup by both comm ids.
+ *
+ * Returns the referenced id, or NULL when nothing matches.
+ */
+static struct cm_id_private * cm_acquire_rejected_id(struct cm_rej_msg *rej_msg)
+{
+	struct cm_timewait_info *timewait_info;
+	struct cm_id_private *cm_id_priv;
+	__be64 ari_guid;
+	__be32 remote_id;
+
+	remote_id = rej_msg->local_comm_id;
+
+	if (__be16_to_cpu(rej_msg->reason) == IB_CM_REJ_TIMEOUT) {
+		/*
+		 * ari is a byte field inside a wire-format message, so it is
+		 * not guaranteed to be 8-byte aligned.  Copy the value out
+		 * instead of dereferencing it through a __be64 pointer.
+		 */
+		memcpy(&ari_guid, rej_msg->ari, sizeof(ari_guid));
+
+		spin_lock_irq(&cm.lock);
+		timewait_info = cm_find_remote_id(ari_guid, remote_id);
+		if (!timewait_info) {
+			spin_unlock_irq(&cm.lock);
+			return NULL;
+		}
+		/* Local ids are stored XOR-ed with cm.random_id_operand. */
+		cm_id_priv = idr_find(&cm.local_id_table, (__force int)
+				      (timewait_info->work.local_id ^
+				       cm.random_id_operand));
+		if (cm_id_priv) {
+			if (cm_id_priv->id.remote_id == remote_id)
+				atomic_inc(&cm_id_priv->refcount);
+			else
+				cm_id_priv = NULL;
+		}
+		spin_unlock_irq(&cm.lock);
+	} else if (cm_rej_get_msg_rejected(rej_msg) == CM_MSG_RESPONSE_REQ) {
+		cm_id_priv = cm_acquire_id(rej_msg->remote_comm_id, 0);
+	} else {
+		cm_id_priv = cm_acquire_id(rej_msg->remote_comm_id, remote_id);
+	}
+
+	return cm_id_priv;
+}
+
+/*
+ * Handle a received REJ.
+ *
+ * The target id is resolved by cm_acquire_rejected_id().  Depending on the
+ * id's state the outstanding send is cancelled and the id is either reset
+ * to idle or moved to timewait:
+ *   - REQ_SENT, MRA_REQ_RCVD, REP_SENT, MRA_REP_RCVD (send cancelled),
+ *     REQ_RCVD, MRA_REQ_SENT: timewait if the reason is
+ *     IB_CM_REJ_STALE_CONN, idle otherwise;
+ *   - DREQ_SENT (send cancelled), REP_RCVD, MRA_REP_SENT: timewait;
+ *   - ESTABLISHED with lap_state LAP_UNINIT or LAP_SENT: timewait (a sent
+ *     LAP is cancelled first);
+ *   - anything else: the REJ is dropped.
+ *
+ * Returns 0 when the event was processed or queued, -EINVAL otherwise.
+ */
+static int cm_rej_handler(struct cm_work *work)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_rej_msg *rej_msg;
+ int ret;
+
+ rej_msg = (struct cm_rej_msg *)work->mad_recv_wc->recv_buf.mad;
+ cm_id_priv = cm_acquire_rejected_id(rej_msg);
+ if (!cm_id_priv)
+ return -EINVAL;
+
+ cm_format_rej_event(work);
+
+ spin_lock_irq(&cm_id_priv->lock);
+ switch (cm_id_priv->id.state) {
+ case IB_CM_REQ_SENT:
+ case IB_CM_MRA_REQ_RCVD:
+ case IB_CM_REP_SENT:
+ case IB_CM_MRA_REP_RCVD:
+ /* These states have a send outstanding; cancel it first. */
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ /* fall through */
+ case IB_CM_REQ_RCVD:
+ case IB_CM_MRA_REQ_SENT:
+ if (__be16_to_cpu(rej_msg->reason) == IB_CM_REJ_STALE_CONN)
+ cm_enter_timewait(cm_id_priv);
+ else
+ cm_reset_to_idle(cm_id_priv);
+ break;
+ case IB_CM_DREQ_SENT:
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ /* fall through */
+ case IB_CM_REP_RCVD:
+ case IB_CM_MRA_REP_SENT:
+ cm_enter_timewait(cm_id_priv);
+ break;
+ case IB_CM_ESTABLISHED:
+ if (cm_id_priv->id.lap_state == IB_CM_LAP_UNINIT ||
+ cm_id_priv->id.lap_state == IB_CM_LAP_SENT) {
+ /* Only a sent LAP has a send to cancel. */
+ if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT)
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent,
+ cm_id_priv->msg);
+ cm_enter_timewait(cm_id_priv);
+ break;
+ }
+ /* fall through */
+ default:
+ spin_unlock_irq(&cm_id_priv->lock);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * atomic_inc_and_test() is true when work_count becomes zero: this
+ * context then processes the event itself.  Otherwise the work item is
+ * queued on work_list and only the lookup reference is dropped.
+ */
+ ret = atomic_inc_and_test(&cm_id_priv->work_count);
+ if (!ret)
+ list_add_tail(&work->list, &cm_id_priv->work_list);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ if (ret)
+ cm_process_work(cm_id_priv, work);
+ else
+ cm_deref_id(cm_id_priv);
+ return 0;
+out:
+ cm_deref_id(cm_id_priv);
+ return -EINVAL;
+}
+
+/*
+ * ib_send_cm_mra - send a Message Receipt Acknowledgement (MRA).
+ * @cm_id: connection identifier.
+ * @service_timeout: service timeout to advertise; if IB_CM_MRA_FLAG_DELAY
+ *   is set no MRA is sent now, but the state and stored data are still
+ *   updated.
+ * @private_data: optional user data placed in the MRA.
+ * @private_data_len: length of @private_data, at most
+ *   IB_CM_MRA_PRIVATE_DATA_SIZE.
+ *
+ * Valid when the id is in IB_CM_REQ_RCVD (-> IB_CM_MRA_REQ_SENT),
+ * IB_CM_REP_RCVD (-> IB_CM_MRA_REP_SENT), or IB_CM_ESTABLISHED with
+ * lap_state IB_CM_LAP_RCVD (lap_state -> IB_CM_MRA_LAP_SENT).
+ *
+ * Returns 0 on success, -EINVAL for oversized private data or a wrong
+ * state, or the error from cm_copy_private_data(), cm_alloc_msg() or
+ * ib_post_send_mad().
+ */
+int ib_send_cm_mra(struct ib_cm_id *cm_id,
+		   u8 service_timeout,
+		   const void *private_data,
+		   u8 private_data_len)
+{
+	struct cm_id_private *cm_id_priv;
+	struct ib_mad_send_buf *msg;
+	enum ib_cm_state cm_state;
+	enum ib_cm_lap_state lap_state;
+	enum cm_msg_response msg_response;
+	void *data;
+	unsigned long flags;
+	int ret;
+
+	if (private_data && private_data_len > IB_CM_MRA_PRIVATE_DATA_SIZE)
+		return -EINVAL;
+
+	data = cm_copy_private_data(private_data, private_data_len);
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+
+	/* Work out the state to enter and which message is being MRA'ed. */
+	switch (cm_id_priv->id.state) {
+	case IB_CM_REQ_RCVD:
+		cm_state = IB_CM_MRA_REQ_SENT;
+		lap_state = cm_id->lap_state;
+		msg_response = CM_MSG_RESPONSE_REQ;
+		break;
+	case IB_CM_REP_RCVD:
+		cm_state = IB_CM_MRA_REP_SENT;
+		lap_state = cm_id->lap_state;
+		msg_response = CM_MSG_RESPONSE_REP;
+		break;
+	case IB_CM_ESTABLISHED:
+		/* Once established, only a received LAP can be MRA'ed. */
+		if (cm_id->lap_state != IB_CM_LAP_RCVD) {
+			ret = -EINVAL;
+			goto err_unlock;
+		}
+		cm_state = cm_id->state;
+		lap_state = IB_CM_MRA_LAP_SENT;
+		msg_response = CM_MSG_RESPONSE_OTHER;
+		break;
+	default:
+		ret = -EINVAL;
+		goto err_unlock;
+	}
+
+	if (!(service_timeout & IB_CM_MRA_FLAG_DELAY)) {
+		ret = cm_alloc_msg(cm_id_priv, &msg);
+		if (ret)
+			goto err_unlock;
+
+		cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
+			      msg_response, service_timeout,
+			      private_data, private_data_len);
+		ret = ib_post_send_mad(msg, NULL);
+		if (ret) {
+			/* Release the message only after unlocking. */
+			spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+			kfree(data);
+			cm_free_msg(msg);
+			return ret;
+		}
+	}
+
+	cm_id->state = cm_state;
+	cm_id->lap_state = lap_state;
+	cm_id_priv->service_timeout = service_timeout;
+	cm_set_private_data(cm_id_priv, data, private_data_len);
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return 0;
+
+err_unlock:
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	kfree(data);
+	return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_mra);
+
+/*
+ * Find and take a reference on the id a received MRA refers to.  An MRA
+ * for a REQ is looked up by remote_comm_id alone (second argument 0);
+ * an MRA for a REP or for another message is looked up by both comm ids.
+ * Returns NULL for an unknown "message MRAed" value or when no id matches.
+ */
+static struct cm_id_private * cm_acquire_mraed_id(struct cm_mra_msg *mra_msg)
+{
+	int mraed = cm_mra_get_msg_mraed(mra_msg);
+
+	if (mraed == CM_MSG_RESPONSE_REQ)
+		return cm_acquire_id(mra_msg->remote_comm_id, 0);
+
+	if (mraed == CM_MSG_RESPONSE_REP || mraed == CM_MSG_RESPONSE_OTHER)
+		return cm_acquire_id(mra_msg->remote_comm_id,
+				     mra_msg->local_comm_id);
+
+	return NULL;
+}
+
+/*
+ * Handle a received MRA.
+ *
+ * The MRA must acknowledge the message the id is currently waiting on:
+ *   - REQ_SENT + MRA of REQ   -> state IB_CM_MRA_REQ_RCVD
+ *   - REP_SENT + MRA of REP   -> state IB_CM_MRA_REP_RCVD
+ *   - ESTABLISHED with lap_state LAP_SENT + MRA of "other"
+ *                             -> lap_state IB_CM_MRA_LAP_RCVD
+ * In each case the timeout of the outstanding send is first changed via
+ * ib_modify_mad(); if that fails the MRA is dropped.  MRAs arriving when
+ * one was already received are counted as duplicates.
+ *
+ * Returns 0 when the event was processed or queued, -EINVAL otherwise.
+ */
+static int cm_mra_handler(struct cm_work *work)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_mra_msg *mra_msg;
+ int timeout, ret;
+
+ mra_msg = (struct cm_mra_msg *)work->mad_recv_wc->recv_buf.mad;
+ cm_id_priv = cm_acquire_mraed_id(mra_msg);
+ if (!cm_id_priv)
+ return -EINVAL;
+
+ work->cm_event.private_data = &mra_msg->private_data;
+ work->cm_event.param.mra_rcvd.service_timeout =
+ cm_mra_get_service_timeout(mra_msg);
+ /* New send timeout: advertised service timeout plus the path timeout. */
+ timeout = cm_convert_to_ms(cm_mra_get_service_timeout(mra_msg)) +
+ cm_convert_to_ms(cm_id_priv->av.timeout);
+
+ spin_lock_irq(&cm_id_priv->lock);
+ switch (cm_id_priv->id.state) {
+ case IB_CM_REQ_SENT:
+ /* ib_modify_mad() only runs if the MRA type matches (short-circuit). */
+ if (cm_mra_get_msg_mraed(mra_msg) != CM_MSG_RESPONSE_REQ ||
+ ib_modify_mad(cm_id_priv->av.port->mad_agent,
+ cm_id_priv->msg, timeout))
+ goto out;
+ cm_id_priv->id.state = IB_CM_MRA_REQ_RCVD;
+ break;
+ case IB_CM_REP_SENT:
+ if (cm_mra_get_msg_mraed(mra_msg) != CM_MSG_RESPONSE_REP ||
+ ib_modify_mad(cm_id_priv->av.port->mad_agent,
+ cm_id_priv->msg, timeout))
+ goto out;
+ cm_id_priv->id.state = IB_CM_MRA_REP_RCVD;
+ break;
+ case IB_CM_ESTABLISHED:
+ if (cm_mra_get_msg_mraed(mra_msg) != CM_MSG_RESPONSE_OTHER ||
+ cm_id_priv->id.lap_state != IB_CM_LAP_SENT ||
+ ib_modify_mad(cm_id_priv->av.port->mad_agent,
+ cm_id_priv->msg, timeout)) {
+ /* An MRA for the LAP was already received: count the repeat. */
+ if (cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD)
+ atomic_long_inc(&work->port->
+ counter_group[CM_RECV_DUPLICATES].
+ counter[CM_MRA_COUNTER]);
+ goto out;
+ }
+ cm_id_priv->id.lap_state = IB_CM_MRA_LAP_RCVD;
+ break;
+ case IB_CM_MRA_REQ_RCVD:
+ case IB_CM_MRA_REP_RCVD:
+ atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+ counter[CM_MRA_COUNTER]);
+ /* fall through */
+ default:
+ goto out;
+ }
+
+ /* Keep the state tag in the send's context[1] in step with the id. */
+ cm_id_priv->msg->context[1] = (void *) (unsigned long)
+ cm_id_priv->id.state;
+ /*
+ * atomic_inc_and_test() is true when work_count becomes zero: this
+ * context then processes the event itself.  Otherwise the work item is
+ * queued on work_list and only the lookup reference is dropped.
+ */
+ ret = atomic_inc_and_test(&cm_id_priv->work_count);
+ if (!ret)
+ list_add_tail(&work->list, &cm_id_priv->work_list);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ if (ret)
+ cm_process_work(cm_id_priv, work);
+ else
+ cm_deref_id(cm_id_priv);
+ return 0;
+out:
+ spin_unlock_irq(&cm_id_priv->lock);
+ cm_deref_id(cm_id_priv);
+ return -EINVAL;
+}
+
+/*
+ * Build a LAP (load alternate path) message in @lap_msg for @cm_id_priv,
+ * describing @alternate_path, with optional private data.  The caller
+ * must already have bounded @private_data_len.
+ */
+static void cm_format_lap(struct cm_lap_msg *lap_msg,
+			  struct cm_id_private *cm_id_priv,
+			  struct ib_sa_path_rec *alternate_path,
+			  const void *private_data,
+			  u8 private_data_len)
+{
+	struct ib_sa_path_rec *alt = alternate_path;
+
+	/* Header and connection identification. */
+	cm_format_mad_hdr(&lap_msg->hdr, CM_LAP_ATTR_ID,
+			  cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_LAP));
+	lap_msg->local_comm_id = cm_id_priv->id.local_id;
+	lap_msg->remote_comm_id = cm_id_priv->id.remote_id;
+	cm_lap_set_remote_qpn(lap_msg, cm_id_priv->remote_qpn);
+	/* todo: need remote CM response timeout */
+	cm_lap_set_remote_resp_timeout(lap_msg, 0x1F);
+
+	/* Alternate path addressing. */
+	lap_msg->alt_local_lid = alt->slid;
+	lap_msg->alt_remote_lid = alt->dlid;
+	lap_msg->alt_local_gid = alt->sgid;
+	lap_msg->alt_remote_gid = alt->dgid;
+
+	/* Alternate path attributes. */
+	cm_lap_set_flow_label(lap_msg, alt->flow_label);
+	cm_lap_set_traffic_class(lap_msg, alt->traffic_class);
+	lap_msg->alt_hop_limit = alt->hop_limit;
+	cm_lap_set_packet_rate(lap_msg, alt->rate);
+	cm_lap_set_sl(lap_msg, alt->sl);
+	cm_lap_set_subnet_local(lap_msg, 1); /* local only... */
+	cm_lap_set_local_ack_timeout(lap_msg,
+		cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay,
+			       alt->packet_life_time));
+
+	/* Private data is optional. */
+	if (!private_data || !private_data_len)
+		return;
+
+	memcpy(lap_msg->private_data, private_data, private_data_len);
+}
+
+/*
+ * ib_send_cm_lap - send a load alternate path (LAP) request.
+ * @cm_id: connection identifier; must be IB_CM_ESTABLISHED with lap_state
+ *   IB_CM_LAP_UNINIT or IB_CM_LAP_IDLE.
+ * @alternate_path: path record describing the alternate path.
+ * @private_data: optional user data placed in the LAP.
+ * @private_data_len: length of @private_data, at most
+ *   IB_CM_LAP_PRIVATE_DATA_SIZE.
+ *
+ * On success lap_state becomes IB_CM_LAP_SENT and the posted message is
+ * remembered in cm_id_priv->msg.  Note that cm_id_priv->alt_av has already
+ * been rewritten from @alternate_path by the time the message is
+ * allocated, so it stays modified if allocation or posting fails.
+ *
+ * Returns 0 on success, -EINVAL for oversized private data or a wrong
+ * state, or the error from cm_init_av_by_path(), cm_alloc_msg() or
+ * ib_post_send_mad().
+ */
+int ib_send_cm_lap(struct ib_cm_id *cm_id,
+		   struct ib_sa_path_rec *alternate_path,
+		   const void *private_data,
+		   u8 private_data_len)
+{
+	struct cm_id_private *cm_id_priv;
+	struct ib_mad_send_buf *msg;
+	unsigned long flags;
+	int ret;
+
+	if (private_data && private_data_len > IB_CM_LAP_PRIVATE_DATA_SIZE)
+		return -EINVAL;
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+
+	if (cm_id->state != IB_CM_ESTABLISHED) {
+		ret = -EINVAL;
+		goto unlock;
+	}
+	if (cm_id->lap_state != IB_CM_LAP_UNINIT &&
+	    cm_id->lap_state != IB_CM_LAP_IDLE) {
+		ret = -EINVAL;
+		goto unlock;
+	}
+
+	ret = cm_init_av_by_path(alternate_path, &cm_id_priv->alt_av);
+	if (ret)
+		goto unlock;
+	/* Same timeout adjustment that cm_rep_handler() applies. */
+	cm_id_priv->alt_av.timeout =
+			cm_ack_timeout(cm_id_priv->target_ack_delay,
+				       cm_id_priv->alt_av.timeout - 1);
+
+	ret = cm_alloc_msg(cm_id_priv, &msg);
+	if (ret)
+		goto unlock;
+
+	cm_format_lap((struct cm_lap_msg *) msg->mad, cm_id_priv,
+		      alternate_path, private_data, private_data_len);
+	msg->timeout_ms = cm_id_priv->timeout_ms;
+	/* context[1] carries the state this send belongs to. */
+	msg->context[1] = (void *) (unsigned long) IB_CM_ESTABLISHED;
+
+	ret = ib_post_send_mad(msg, NULL);
+	if (!ret) {
+		cm_id->lap_state = IB_CM_LAP_SENT;
+		cm_id_priv->msg = msg;
+		goto unlock;
+	}
+
+	/* Release the message only after dropping the lock. */
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	cm_free_msg(msg);
+	return ret;
+
+unlock:
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_lap);
+
+/*
+ * Convert the alternate path carried by a received LAP into a path record
+ * seen from the local side: the sender's "local" GID/LID become the
+ * destination and its "remote" GID/LID become the source.  pkey and MTU
+ * come from @cm_id_priv; everything else from @lap_msg.  @path is zeroed
+ * first.
+ */
+static void cm_format_path_from_lap(struct cm_id_private *cm_id_priv,
+				    struct ib_sa_path_rec *path,
+				    struct cm_lap_msg *lap_msg)
+{
+	memset(path, 0, sizeof(*path));
+
+	/* Addressing, mirrored relative to the sender. */
+	path->dgid = lap_msg->alt_local_gid;
+	path->sgid = lap_msg->alt_remote_gid;
+	path->dlid = lap_msg->alt_local_lid;
+	path->slid = lap_msg->alt_remote_lid;
+
+	path->flow_label = cm_lap_get_flow_label(lap_msg);
+	path->hop_limit = lap_msg->alt_hop_limit;
+	path->traffic_class = cm_lap_get_traffic_class(lap_msg);
+	path->reversible = 1;
+	path->pkey = cm_id_priv->pkey;
+	path->sl = cm_lap_get_sl(lap_msg);
+
+	path->mtu_selector = IB_SA_EQ;
+	path->mtu = cm_id_priv->path_mtu;
+	path->rate_selector = IB_SA_EQ;
+	path->rate = cm_lap_get_packet_rate(lap_msg);
+
+	/* Life time is the LAP's local ack timeout minus one, floored at 0. */
+	path->packet_life_time_selector = IB_SA_EQ;
+	path->packet_life_time = cm_lap_get_local_ack_timeout(lap_msg);
+	if (path->packet_life_time > 0)
+		path->packet_life_time--;
+}
+
+/*
+ * Handle a received LAP (load alternate path) request.
+ *
+ * The LAP is accepted only while the id is IB_CM_ESTABLISHED with
+ * lap_state IB_CM_LAP_UNINIT or IB_CM_LAP_IDLE.  A LAP arriving in
+ * IB_CM_MRA_LAP_SENT is counted as a duplicate and answered by repeating
+ * the MRA; one arriving in IB_CM_LAP_RCVD is counted as a duplicate and
+ * dropped.
+ *
+ * The alternate path is resolved with cm_init_av_by_path() before any id
+ * state is changed.  If that fails the LAP is dropped, so the id is never
+ * left in IB_CM_LAP_RCVD with an alt_av that was not set up.
+ *
+ * Returns 0 when the event was processed or queued, -EINVAL otherwise.
+ */
+static int cm_lap_handler(struct cm_work *work)
+{
+	struct cm_id_private *cm_id_priv;
+	struct cm_lap_msg *lap_msg;
+	struct ib_cm_lap_event_param *param;
+	struct ib_mad_send_buf *msg = NULL;
+	int ret;
+
+	/* todo: verify LAP request and send reject APR if invalid. */
+	lap_msg = (struct cm_lap_msg *)work->mad_recv_wc->recv_buf.mad;
+	cm_id_priv = cm_acquire_id(lap_msg->remote_comm_id,
+				   lap_msg->local_comm_id);
+	if (!cm_id_priv)
+		return -EINVAL;
+
+	/* The event's path record lives in the work item itself. */
+	param = &work->cm_event.param.lap_rcvd;
+	param->alternate_path = &work->path[0];
+	cm_format_path_from_lap(cm_id_priv, param->alternate_path, lap_msg);
+	work->cm_event.private_data = &lap_msg->private_data;
+
+	spin_lock_irq(&cm_id_priv->lock);
+	if (cm_id_priv->id.state != IB_CM_ESTABLISHED)
+		goto unlock;
+
+	switch (cm_id_priv->id.lap_state) {
+	case IB_CM_LAP_UNINIT:
+	case IB_CM_LAP_IDLE:
+		break;
+	case IB_CM_MRA_LAP_SENT:
+		atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+				counter[CM_LAP_COUNTER]);
+		if (cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg))
+			goto unlock;
+
+		/* Repeat the MRA using the values stored on the id. */
+		cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
+			      CM_MSG_RESPONSE_OTHER,
+			      cm_id_priv->service_timeout,
+			      cm_id_priv->private_data,
+			      cm_id_priv->private_data_len);
+		spin_unlock_irq(&cm_id_priv->lock);
+
+		if (ib_post_send_mad(msg, NULL))
+			cm_free_msg(msg);
+		/* Lock already released above, so skip the unlock label. */
+		goto deref;
+	case IB_CM_LAP_RCVD:
+		atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+				counter[CM_LAP_COUNTER]);
+		goto unlock;
+	default:
+		goto unlock;
+	}
+
+	/* Resolve the alternate path first; drop the LAP if it fails. */
+	ret = cm_init_av_by_path(param->alternate_path, &cm_id_priv->alt_av);
+	if (ret)
+		goto unlock;
+
+	cm_id_priv->id.lap_state = IB_CM_LAP_RCVD;
+	/* Remember the LAP's TID; cm_format_apr() reuses cm_id_priv->tid. */
+	cm_id_priv->tid = lap_msg->hdr.tid;
+	cm_init_av_for_response(work->port, work->mad_recv_wc->wc,
+				work->mad_recv_wc->recv_buf.grh,
+				&cm_id_priv->av);
+
+	/*
+	 * atomic_inc_and_test() is true when work_count becomes zero: this
+	 * context then processes the event itself.  Otherwise the work item
+	 * is queued on work_list and only the lookup reference is dropped.
+	 */
+	ret = atomic_inc_and_test(&cm_id_priv->work_count);
+	if (!ret)
+		list_add_tail(&work->list, &cm_id_priv->work_list);
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	if (ret)
+		cm_process_work(cm_id_priv, work);
+	else
+		cm_deref_id(cm_id_priv);
+	return 0;
+
+unlock:	spin_unlock_irq(&cm_id_priv->lock);
+deref:	cm_deref_id(cm_id_priv);
+	return -EINVAL;
+}
+
+/*
+ * Build an APR (alternate path response) in @apr_msg for @cm_id_priv.
+ * The MAD header reuses the TID stored in cm_id_priv->tid.  @info and
+ * @private_data are optional; the caller must already have bounded both
+ * lengths.
+ */
+static void cm_format_apr(struct cm_apr_msg *apr_msg,
+			  struct cm_id_private *cm_id_priv,
+			  enum ib_cm_apr_status status,
+			  void *info,
+			  u8 info_length,
+			  const void *private_data,
+			  u8 private_data_len)
+{
+	cm_format_mad_hdr(&apr_msg->hdr, CM_APR_ATTR_ID, cm_id_priv->tid);
+
+	apr_msg->local_comm_id = cm_id_priv->id.local_id;
+	apr_msg->remote_comm_id = cm_id_priv->id.remote_id;
+	apr_msg->ap_status = (u8) status;
+
+	/* info_length is only written when additional info is supplied. */
+	if (info && info_length) {
+		apr_msg->info_length = info_length;
+		memcpy(apr_msg->info, info, info_length);
+	}
+
+	if (!private_data || !private_data_len)
+		return;
+
+	memcpy(apr_msg->private_data, private_data, private_data_len);
+}
+
+/*
+ * ib_send_cm_apr - send an alternate path response (APR).
+ * @cm_id: connection identifier; must be IB_CM_ESTABLISHED with lap_state
+ *   IB_CM_LAP_RCVD or IB_CM_MRA_LAP_SENT.
+ * @status: status to report for the LAP.
+ * @info: optional additional information.
+ * @info_length: length of @info, at most IB_CM_APR_INFO_LENGTH.
+ * @private_data: optional user data placed in the APR.
+ * @private_data_len: length of @private_data, at most
+ *   IB_CM_APR_PRIVATE_DATA_SIZE.
+ *
+ * On success lap_state returns to IB_CM_LAP_IDLE.
+ *
+ * Returns 0 on success, -EINVAL for bad lengths or a wrong state, or the
+ * error from cm_alloc_msg() / ib_post_send_mad().
+ */
+int ib_send_cm_apr(struct ib_cm_id *cm_id,
+		   enum ib_cm_apr_status status,
+		   void *info,
+		   u8 info_length,
+		   const void *private_data,
+		   u8 private_data_len)
+{
+	struct cm_id_private *cm_id_priv;
+	struct ib_mad_send_buf *msg;
+	unsigned long flags;
+	int ret;
+
+	if ((private_data && private_data_len > IB_CM_APR_PRIVATE_DATA_SIZE) ||
+	    (info && info_length > IB_CM_APR_INFO_LENGTH))
+		return -EINVAL;
+
+	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+
+	if (cm_id->state != IB_CM_ESTABLISHED) {
+		ret = -EINVAL;
+		goto unlock;
+	}
+	if (cm_id->lap_state != IB_CM_LAP_RCVD &&
+	    cm_id->lap_state != IB_CM_MRA_LAP_SENT) {
+		ret = -EINVAL;
+		goto unlock;
+	}
+
+	ret = cm_alloc_msg(cm_id_priv, &msg);
+	if (ret)
+		goto unlock;
+
+	cm_format_apr((struct cm_apr_msg *) msg->mad, cm_id_priv, status,
+		      info, info_length, private_data, private_data_len);
+
+	ret = ib_post_send_mad(msg, NULL);
+	if (!ret) {
+		cm_id->lap_state = IB_CM_LAP_IDLE;
+		goto unlock;
+	}
+
+	/* Release the message only after dropping the lock. */
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	cm_free_msg(msg);
+	return ret;
+
+unlock:
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_apr);
+
+/*
+ * Handle a received APR.  It is accepted only while the id is
+ * IB_CM_ESTABLISHED with lap_state IB_CM_LAP_SENT or IB_CM_MRA_LAP_RCVD;
+ * lap_state then returns to IB_CM_LAP_IDLE, the outstanding send is
+ * cancelled and cm_id_priv->msg is cleared.
+ *
+ * Returns 0 when the event was processed or queued, -EINVAL when no id
+ * matched or the id is not waiting for an APR.
+ */
+static int cm_apr_handler(struct cm_work *work)
+{
+	struct cm_apr_msg *apr_msg =
+		(struct cm_apr_msg *)work->mad_recv_wc->recv_buf.mad;
+	struct cm_id_private *cm_id_priv;
+	int process_now;
+
+	cm_id_priv = cm_acquire_id(apr_msg->remote_comm_id,
+				   apr_msg->local_comm_id);
+	if (!cm_id_priv)
+		return -EINVAL; /* Unmatched reply. */
+
+	/* Event fields point into the received MAD; nothing is copied. */
+	work->cm_event.param.apr_rcvd.ap_status = apr_msg->ap_status;
+	work->cm_event.param.apr_rcvd.apr_info = &apr_msg->info;
+	work->cm_event.param.apr_rcvd.info_len = apr_msg->info_length;
+	work->cm_event.private_data = &apr_msg->private_data;
+
+	spin_lock_irq(&cm_id_priv->lock);
+	if (cm_id_priv->id.state != IB_CM_ESTABLISHED ||
+	    (cm_id_priv->id.lap_state != IB_CM_LAP_SENT &&
+	     cm_id_priv->id.lap_state != IB_CM_MRA_LAP_RCVD)) {
+		spin_unlock_irq(&cm_id_priv->lock);
+		cm_deref_id(cm_id_priv);
+		return -EINVAL;
+	}
+
+	cm_id_priv->id.lap_state = IB_CM_LAP_IDLE;
+	ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+	cm_id_priv->msg = NULL;
+
+	/* Process here if work_count hits zero, else queue on work_list. */
+	process_now = atomic_inc_and_test(&cm_id_priv->work_count);
+	if (!process_now)
+		list_add_tail(&work->list, &cm_id_priv->work_list);
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	if (!process_now) {
+		cm_deref_id(cm_id_priv);
+		return 0;
+	}
+
+	cm_process_work(cm_id_priv, work);
+	return 0;
+}
+
+/*
+ * Handle a timewait work item.  @work is reinterpreted as the enclosing
+ * struct cm_timewait_info, which is first unlinked from its list under
+ * cm.lock.  If the owning id still exists, is in IB_CM_TIMEWAIT and has
+ * the same remote QPN as the timewait entry, it is moved to IB_CM_IDLE
+ * and the event is delivered.
+ *
+ * Returns 0 when the event was processed or queued, -EINVAL otherwise.
+ */
+static int cm_timewait_handler(struct cm_work *work)
+{
+	struct cm_timewait_info *timewait_info =
+		(struct cm_timewait_info *)work;
+	struct cm_id_private *cm_id_priv;
+	int process_now;
+
+	spin_lock_irq(&cm.lock);
+	list_del(&timewait_info->list);
+	spin_unlock_irq(&cm.lock);
+
+	cm_id_priv = cm_acquire_id(timewait_info->work.local_id,
+				   timewait_info->work.remote_id);
+	if (!cm_id_priv)
+		return -EINVAL;
+
+	spin_lock_irq(&cm_id_priv->lock);
+	if (cm_id_priv->id.state != IB_CM_TIMEWAIT ||
+	    cm_id_priv->remote_qpn != timewait_info->remote_qpn) {
+		spin_unlock_irq(&cm_id_priv->lock);
+		cm_deref_id(cm_id_priv);
+		return -EINVAL;
+	}
+	cm_id_priv->id.state = IB_CM_IDLE;
+
+	/* Process here if work_count hits zero, else queue on work_list. */
+	process_now = atomic_inc_and_test(&cm_id_priv->work_count);
+	if (!process_now)
+		list_add_tail(&work->list, &cm_id_priv->work_list);
+	spin_unlock_irq(&cm_id_priv->lock);
+
+	if (!process_now) {
+		cm_deref_id(cm_id_priv);
+		return 0;
+	}
+
+	cm_process_work(cm_id_priv, work);
+	return 0;
+}
+
+/*
+ * Build a SIDR REQ in @sidr_req_msg for @cm_id_priv from @param: MAD
+ * header with a SIDR sequence TID, the local id as request id, the path's
+ * pkey, the service id and optional private data.  The caller must
+ * already have checked @param->path and bounded the private data length.
+ */
+static void cm_format_sidr_req(struct cm_sidr_req_msg *sidr_req_msg,
+			       struct cm_id_private *cm_id_priv,
+			       struct ib_cm_sidr_req_param *param)
+{
+	cm_format_mad_hdr(&sidr_req_msg->hdr, CM_SIDR_REQ_ATTR_ID,
+			  cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_SIDR));
+
+	sidr_req_msg->request_id = cm_id_priv->id.local_id;
+	sidr_req_msg->pkey = param->path->pkey;
+	sidr_req_msg->service_id = param->service_id;
+
+	/* Private data is optional. */
+	if (!param->private_data || !param->private_data_len)
+		return;
+
+	memcpy(sidr_req_msg->private_data, param->private_data,
+	       param->private_data_len);
+}
+
+/*
+ * Send a SIDR REQ on behalf of @cm_id.
+ *
+ * Fails with -EINVAL when no path is given, when the private data is
+ * larger than IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE, or when the cm_id is not
+ * in IB_CM_IDLE at the time of posting.  Errors from cm_init_av_by_path(),
+ * cm_alloc_msg() and ib_post_send_mad() are returned unchanged.  On
+ * success the cm_id moves to IB_CM_SIDR_REQ_SENT and remembers the MAD.
+ */
+int ib_send_cm_sidr_req(struct ib_cm_id *cm_id,
+			struct ib_cm_sidr_req_param *param)
+{
+	struct cm_id_private *priv;
+	struct ib_mad_send_buf *mad_buf;
+	unsigned long irq_flags;
+	int err;
+
+	if (!param->path)
+		return -EINVAL;
+	if (param->private_data &&
+	    param->private_data_len > IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE)
+		return -EINVAL;
+
+	priv = container_of(cm_id, struct cm_id_private, id);
+	err = cm_init_av_by_path(param->path, &priv->av);
+	if (err)
+		return err;
+
+	cm_id->service_id = param->service_id;
+	cm_id->service_mask = ~cpu_to_be64(0);
+	priv->timeout_ms = param->timeout_ms;
+	priv->max_cm_retries = param->max_cm_retries;
+
+	err = cm_alloc_msg(priv, &mad_buf);
+	if (err)
+		return err;
+
+	cm_format_sidr_req((struct cm_sidr_req_msg *) mad_buf->mad, priv,
+			   param);
+	mad_buf->timeout_ms = priv->timeout_ms;
+	mad_buf->context[1] = (void *) (unsigned long) IB_CM_SIDR_REQ_SENT;
+
+	/* The state check and the post happen under the cm_id lock. */
+	spin_lock_irqsave(&priv->lock, irq_flags);
+	if (cm_id->state != IB_CM_IDLE)
+		err = -EINVAL;
+	else
+		err = ib_post_send_mad(mad_buf, NULL);
+
+	if (!err) {
+		cm_id->state = IB_CM_SIDR_REQ_SENT;
+		priv->msg = mad_buf;
+	}
+	spin_unlock_irqrestore(&priv->lock, irq_flags);
+
+	if (err)
+		cm_free_msg(mad_buf);
+	return err;
+}
+EXPORT_SYMBOL(ib_send_cm_sidr_req);
+
+/*
+ * Fill in the sidr_req_rcvd event parameters of @work from the received
+ * SIDR REQ MAD, and point the event's private data into that MAD.
+ */
+static void cm_format_sidr_req_event(struct cm_work *work,
+				     struct ib_cm_id *listen_id)
+{
+	struct cm_sidr_req_msg *req = (struct cm_sidr_req_msg *)
+					work->mad_recv_wc->recv_buf.mad;
+	struct ib_cm_sidr_req_event_param *ev =
+					&work->cm_event.param.sidr_req_rcvd;
+
+	ev->listen_id = listen_id;
+	ev->port = work->port->port_num;
+	ev->pkey = __be16_to_cpu(req->pkey);
+
+	work->cm_event.private_data = &req->private_data;
+}
+
+/*
+ * Handle a received SIDR REQ.
+ *
+ * A new cm_id is created for the request and inserted into the remote
+ * SIDR table.  Duplicates are counted and dropped; requests with no
+ * matching listener are rejected with IB_SIDR_UNSUPPORTED.  Otherwise the
+ * new cm_id takes over the listener's handler and context and the event
+ * is processed.
+ *
+ * Returns 0 when the work item was handed to cm_process_work(), the
+ * ib_create_cm_id() error if creation failed, or -EINVAL after the new
+ * cm_id has been destroyed.
+ */
+static int cm_sidr_req_handler(struct cm_work *work)
+{
+	struct cm_id_private *new_priv, *listen_priv;
+	struct cm_sidr_req_msg *req;
+	struct ib_cm_id *new_id;
+	struct ib_wc *wc;
+
+	new_id = ib_create_cm_id(work->port->cm_dev->ib_device, NULL, NULL);
+	if (IS_ERR(new_id))
+		return PTR_ERR(new_id);
+	new_priv = container_of(new_id, struct cm_id_private, id);
+
+	/* Record SGID/SLID and request ID for lookup. */
+	req = (struct cm_sidr_req_msg *) work->mad_recv_wc->recv_buf.mad;
+	wc = work->mad_recv_wc->wc;
+	new_priv->av.dgid.global.subnet_prefix = cpu_to_be64(wc->slid);
+	new_priv->av.dgid.global.interface_id = 0;
+	cm_init_av_for_response(work->port, work->mad_recv_wc->wc,
+				work->mad_recv_wc->recv_buf.grh,
+				&new_priv->av);
+	new_priv->id.remote_id = req->request_id;
+	new_priv->tid = req->hdr.tid;
+	atomic_inc(&new_priv->work_count);
+
+	spin_lock_irq(&cm.lock);
+	if (cm_insert_remote_sidr(new_priv)) {
+		/* Duplicate message. */
+		spin_unlock_irq(&cm.lock);
+		atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+				counter[CM_SIDR_REQ_COUNTER]);
+		goto destroy;
+	}
+	new_priv->id.state = IB_CM_SIDR_REQ_RCVD;
+
+	listen_priv = cm_find_listen(new_id->device, req->service_id,
+				     req->private_data);
+	if (!listen_priv) {
+		/* No match. */
+		spin_unlock_irq(&cm.lock);
+		cm_reject_sidr_req(new_priv, IB_SIDR_UNSUPPORTED);
+		goto destroy;
+	}
+	atomic_inc(&listen_priv->refcount);
+	atomic_inc(&new_priv->refcount);
+	spin_unlock_irq(&cm.lock);
+
+	/* The new cm_id inherits the listener's callback and context. */
+	new_priv->id.service_mask = ~cpu_to_be64(0);
+	new_priv->id.service_id = req->service_id;
+	new_priv->id.context = listen_priv->id.context;
+	new_priv->id.cm_handler = listen_priv->id.cm_handler;
+
+	cm_format_sidr_req_event(work, &listen_priv->id);
+	cm_process_work(new_priv, work);
+	cm_deref_id(listen_priv);
+	return 0;
+
+destroy:
+	ib_destroy_cm_id(&new_priv->id);
+	return -EINVAL;
+}
+
+/*
+ * Populate an outgoing SIDR REP message.  The transaction ID, request ID
+ * and service ID come from the cm_id; status, QPN and Q_Key come from
+ * @param.  The info and private data areas are only copied when the
+ * caller supplied a buffer with a non-zero length.
+ */
+static void cm_format_sidr_rep(struct cm_sidr_rep_msg *sidr_rep_msg,
+			       struct cm_id_private *cm_id_priv,
+			       struct ib_cm_sidr_rep_param *param)
+{
+	cm_format_mad_hdr(&sidr_rep_msg->hdr, CM_SIDR_REP_ATTR_ID,
+			  cm_id_priv->tid);
+	sidr_rep_msg->status = param->status;
+	sidr_rep_msg->request_id = cm_id_priv->id.remote_id;
+	cm_sidr_rep_set_qpn(sidr_rep_msg, cpu_to_be32(param->qp_num));
+	sidr_rep_msg->qkey = cpu_to_be32(param->qkey);
+	sidr_rep_msg->service_id = cm_id_priv->id.service_id;
+
+	if (param->info_length && param->info)
+		memcpy(sidr_rep_msg->info, param->info, param->info_length);
+
+	if (param->private_data_len && param->private_data)
+		memcpy(sidr_rep_msg->private_data, param->private_data,
+		       param->private_data_len);
+}
+
+/*
+ * Send a SIDR REP for a cm_id that is in IB_CM_SIDR_REQ_RCVD.
+ *
+ * Returns -EINVAL for oversized info/private data or a cm_id in the wrong
+ * state; errors from cm_alloc_msg() and ib_post_send_mad() are returned
+ * unchanged.  On success the cm_id returns to IB_CM_IDLE and is removed
+ * from the remote SIDR table.
+ */
+int ib_send_cm_sidr_rep(struct ib_cm_id *cm_id,
+			struct ib_cm_sidr_rep_param *param)
+{
+	struct cm_id_private *priv;
+	struct ib_mad_send_buf *reply;
+	unsigned long irq_flags;
+	int err;
+
+	if (param->info && param->info_length > IB_CM_SIDR_REP_INFO_LENGTH)
+		return -EINVAL;
+	if (param->private_data &&
+	    param->private_data_len > IB_CM_SIDR_REP_PRIVATE_DATA_SIZE)
+		return -EINVAL;
+
+	priv = container_of(cm_id, struct cm_id_private, id);
+	spin_lock_irqsave(&priv->lock, irq_flags);
+	if (cm_id->state != IB_CM_SIDR_REQ_RCVD) {
+		err = -EINVAL;
+		goto unlock;
+	}
+
+	err = cm_alloc_msg(priv, &reply);
+	if (err)
+		goto unlock;
+
+	cm_format_sidr_rep((struct cm_sidr_rep_msg *) reply->mad, priv,
+			   param);
+	err = ib_post_send_mad(reply, NULL);
+	if (err)
+		goto free_msg;
+
+	cm_id->state = IB_CM_IDLE;
+	spin_unlock_irqrestore(&priv->lock, irq_flags);
+
+	/* The request has been answered; forget the remote SIDR entry. */
+	spin_lock_irqsave(&cm.lock, irq_flags);
+	rb_erase(&priv->sidr_id_node, &cm.remote_sidr_table);
+	spin_unlock_irqrestore(&cm.lock, irq_flags);
+	return 0;
+
+free_msg:
+	spin_unlock_irqrestore(&priv->lock, irq_flags);
+	cm_free_msg(reply);
+	return err;
+unlock:
+	spin_unlock_irqrestore(&priv->lock, irq_flags);
+	return err;
+}
+EXPORT_SYMBOL(ib_send_cm_sidr_rep);
+
+/*
+ * Fill in the sidr_rep_rcvd event parameters of @work from the received
+ * SIDR REP MAD.  QPN and Q_Key are converted to CPU byte order; the info
+ * and private data pointers refer directly into the MAD.
+ */
+static void cm_format_sidr_rep_event(struct cm_work *work)
+{
+	struct cm_sidr_rep_msg *rep = (struct cm_sidr_rep_msg *)
+					work->mad_recv_wc->recv_buf.mad;
+	struct ib_cm_sidr_rep_event_param *ev =
+					&work->cm_event.param.sidr_rep_rcvd;
+
+	ev->status = rep->status;
+	ev->qpn = be32_to_cpu(cm_sidr_rep_get_qpn(rep));
+	ev->qkey = be32_to_cpu(rep->qkey);
+	ev->info_len = rep->info_length;
+	ev->info = &rep->info;
+
+	work->cm_event.private_data = &rep->private_data;
+}
+
+/*
+ * Handle a received SIDR REP.
+ *
+ * The reply is matched to a cm_id through its request ID.  If that cm_id
+ * is waiting in IB_CM_SIDR_REQ_SENT it returns to IB_CM_IDLE, the pending
+ * request MAD is cancelled and the event is processed.
+ *
+ * Returns 0 when the work item was consumed, -EINVAL otherwise.
+ */
+static int cm_sidr_rep_handler(struct cm_work *work)
+{
+	struct cm_sidr_rep_msg *rep = (struct cm_sidr_rep_msg *)
+					work->mad_recv_wc->recv_buf.mad;
+	struct cm_id_private *priv;
+
+	priv = cm_acquire_id(rep->request_id, 0);
+	if (!priv)
+		return -EINVAL; /* Unmatched reply. */
+
+	spin_lock_irq(&priv->lock);
+	if (priv->id.state == IB_CM_SIDR_REQ_SENT) {
+		priv->id.state = IB_CM_IDLE;
+		ib_cancel_mad(priv->av.port->mad_agent, priv->msg);
+		spin_unlock_irq(&priv->lock);
+
+		cm_format_sidr_rep_event(work);
+		cm_process_work(priv, work);
+		return 0;
+	}
+	spin_unlock_irq(&priv->lock);
+
+	/* Not expecting a reply: drop the reference from the lookup. */
+	cm_deref_id(priv);
+	return -EINVAL;
+}
+
+/*
+ * React to a failed send of a CM MAD that expected a response.
+ *
+ * The failure is only reported when @msg is still the cm_id's current
+ * message and the cm_id is still in the state recorded in context[1];
+ * otherwise the message is just freed.  Depending on that state the
+ * cm_id is reset to idle or moved to timewait, and the user's handler is
+ * called with the matching *_ERROR event.  A non-zero return from the
+ * handler destroys the cm_id.
+ */
+static void cm_process_send_error(struct ib_mad_send_buf *msg,
+				  enum ib_wc_status wc_status)
+{
+	struct cm_id_private *priv = msg->context[0];
+	struct ib_cm_event event;
+	enum ib_cm_state sent_state;
+	int destroy;
+
+	memset(&event, 0, sizeof event);
+
+	/* Discard old sends or ones without a response. */
+	spin_lock_irq(&priv->lock);
+	sent_state = (enum ib_cm_state) (unsigned long) msg->context[1];
+	if (msg != priv->msg || sent_state != priv->id.state)
+		goto discard;
+
+	if (sent_state == IB_CM_REQ_SENT ||
+	    sent_state == IB_CM_MRA_REQ_RCVD) {
+		cm_reset_to_idle(priv);
+		event.event = IB_CM_REQ_ERROR;
+	} else if (sent_state == IB_CM_REP_SENT ||
+		   sent_state == IB_CM_MRA_REP_RCVD) {
+		cm_reset_to_idle(priv);
+		event.event = IB_CM_REP_ERROR;
+	} else if (sent_state == IB_CM_DREQ_SENT) {
+		cm_enter_timewait(priv);
+		event.event = IB_CM_DREQ_ERROR;
+	} else if (sent_state == IB_CM_SIDR_REQ_SENT) {
+		priv->id.state = IB_CM_IDLE;
+		event.event = IB_CM_SIDR_REQ_ERROR;
+	} else {
+		goto discard;
+	}
+	spin_unlock_irq(&priv->lock);
+	event.param.send_status = wc_status;
+
+	/* No other events can occur on the cm_id at this point. */
+	destroy = priv->id.cm_handler(&priv->id, &event);
+	cm_free_msg(msg);
+	if (destroy)
+		ib_destroy_cm_id(&priv->id);
+	return;
+
+discard:
+	spin_unlock_irq(&priv->lock);
+	cm_free_msg(msg);
+}
+
+/*
+ * MAD send completion callback for the CM agent.
+ *
+ * Updates the per-port transmit and retry counters for the message's
+ * attribute, then frees the message or, for a failed send that carries
+ * both a cm_id and a recorded state, hands it to cm_process_send_error().
+ */
+static void cm_send_handler(struct ib_mad_agent *mad_agent,
+			    struct ib_mad_send_wc *mad_send_wc)
+{
+	struct ib_mad_send_buf *msg = mad_send_wc->send_buf;
+	struct cm_port *port = mad_agent->context;
+	u16 attr_index;
+	int failed;
+
+	attr_index = be16_to_cpu(((struct ib_mad_hdr *)
+				  msg->mad)->attr_id) - CM_ATTR_ID_OFFSET;
+
+	/*
+	 * If the send was in response to a received message (context[0] is not
+	 * set to a cm_id), and is not a REJ, then it is a send that was
+	 * manually retried.
+	 */
+	if (!msg->context[0] && (attr_index != CM_REJ_COUNTER))
+		msg->retries = 1;
+
+	atomic_long_add(1 + msg->retries,
+			&port->counter_group[CM_XMIT].counter[attr_index]);
+	if (msg->retries)
+		atomic_long_add(msg->retries,
+				&port->counter_group[CM_XMIT_RETRIES].
+				counter[attr_index]);
+
+	/* Success and flush completions need no error processing. */
+	failed = mad_send_wc->status != IB_WC_SUCCESS &&
+		 mad_send_wc->status != IB_WC_WR_FLUSH_ERR;
+
+	if (failed && msg->context[0] && msg->context[1])
+		cm_process_send_error(msg, mad_send_wc->status);
+	else
+		cm_free_msg(msg);
+}
+
+/*
+ * Route a work item to the handler for its CM event type.  Unknown event
+ * types yield -EINVAL.
+ */
+static int cm_dispatch_work(struct cm_work *work)
+{
+	switch (work->cm_event.event) {
+	case IB_CM_REQ_RECEIVED:
+		return cm_req_handler(work);
+	case IB_CM_MRA_RECEIVED:
+		return cm_mra_handler(work);
+	case IB_CM_REJ_RECEIVED:
+		return cm_rej_handler(work);
+	case IB_CM_REP_RECEIVED:
+		return cm_rep_handler(work);
+	case IB_CM_RTU_RECEIVED:
+		return cm_rtu_handler(work);
+	case IB_CM_USER_ESTABLISHED:
+		return cm_establish_handler(work);
+	case IB_CM_DREQ_RECEIVED:
+		return cm_dreq_handler(work);
+	case IB_CM_DREP_RECEIVED:
+		return cm_drep_handler(work);
+	case IB_CM_SIDR_REQ_RECEIVED:
+		return cm_sidr_req_handler(work);
+	case IB_CM_SIDR_REP_RECEIVED:
+		return cm_sidr_rep_handler(work);
+	case IB_CM_LAP_RECEIVED:
+		return cm_lap_handler(work);
+	case IB_CM_APR_RECEIVED:
+		return cm_apr_handler(work);
+	case IB_CM_TIMEWAIT_EXIT:
+		return cm_timewait_handler(work);
+	default:
+		return -EINVAL;
+	}
+}
+
+/*
+ * Workqueue entry point for CM work items.  A handler that returns
+ * non-zero did not take ownership of the work item, so it is freed here.
+ */
+static void cm_work_handler(struct work_struct *_work)
+{
+	struct cm_work *work = container_of(_work, struct cm_work, work.work);
+
+	if (cm_dispatch_work(work))
+		cm_free_work(work);
+}
+
+/*
+ * Mark a connection established on user notification and queue an
+ * IB_CM_USER_ESTABLISHED work item.
+ *
+ * Returns 0 on success, -ENOMEM if the work item cannot be allocated,
+ * -EISCONN if the cm_id is already established, and -EINVAL for any
+ * state other than IB_CM_REP_SENT or IB_CM_MRA_REP_RCVD.
+ */
+static int cm_establish(struct ib_cm_id *cm_id)
+{
+	struct cm_id_private *priv;
+	struct cm_work *est_work;
+	unsigned long irq_flags;
+	int err = 0;
+
+	est_work = kmalloc(sizeof *est_work, GFP_ATOMIC);
+	if (!est_work)
+		return -ENOMEM;
+
+	priv = container_of(cm_id, struct cm_id_private, id);
+	spin_lock_irqsave(&priv->lock, irq_flags);
+	if (cm_id->state == IB_CM_REP_SENT ||
+	    cm_id->state == IB_CM_MRA_REP_RCVD)
+		cm_id->state = IB_CM_ESTABLISHED;
+	else if (cm_id->state == IB_CM_ESTABLISHED)
+		err = -EISCONN;
+	else
+		err = -EINVAL;
+	spin_unlock_irqrestore(&priv->lock, irq_flags);
+
+	if (err) {
+		kfree(est_work);
+		return err;
+	}
+
+	/*
+	 * Only the local and remote IDs are stored in the work item, not a
+	 * reference on the cm_id.  The CM worker thread may try to destroy
+	 * the cm_id before this item runs, so to avoid a potential deadlock
+	 * the cm_id is looked up again from the worker's context.
+	 */
+	INIT_DELAYED_WORK(&est_work->work, cm_work_handler);
+	est_work->local_id = cm_id->local_id;
+	est_work->remote_id = cm_id->remote_id;
+	est_work->mad_recv_wc = NULL;
+	est_work->cm_event.event = IB_CM_USER_ESTABLISHED;
+	queue_delayed_work(cm.wq, &est_work->work, 0);
+	return 0;
+}
+
+/*
+ * Switch an established connection over to its alternate path.
+ *
+ * Allowed only while the cm_id is IB_CM_ESTABLISHED with a LAP state of
+ * IB_CM_LAP_UNINIT or IB_CM_LAP_IDLE; the alternate address vector then
+ * replaces the primary one.  Returns 0 on success, -EINVAL otherwise.
+ */
+static int cm_migrate(struct ib_cm_id *cm_id)
+{
+	struct cm_id_private *priv =
+		container_of(cm_id, struct cm_id_private, id);
+	unsigned long irq_flags;
+	int err = -EINVAL;
+
+	spin_lock_irqsave(&priv->lock, irq_flags);
+	if (cm_id->state == IB_CM_ESTABLISHED &&
+	    (cm_id->lap_state == IB_CM_LAP_UNINIT ||
+	     cm_id->lap_state == IB_CM_LAP_IDLE)) {
+		cm_id->lap_state = IB_CM_LAP_IDLE;
+		priv->av = priv->alt_av;
+		err = 0;
+	}
+	spin_unlock_irqrestore(&priv->lock, irq_flags);
+
+	return err;
+}
+
+/*
+ * Notify the CM of a QP event.  IB_EVENT_COMM_EST is forwarded to
+ * cm_establish() and IB_EVENT_PATH_MIG to cm_migrate(); any other event
+ * yields -EINVAL.
+ */
+int ib_cm_notify(struct ib_cm_id *cm_id, enum ib_event_type event)
+{
+	if (event == IB_EVENT_COMM_EST)
+		return cm_establish(cm_id);
+
+	if (event == IB_EVENT_PATH_MIG)
+		return cm_migrate(cm_id);
+
+	return -EINVAL;
+}
+EXPORT_SYMBOL(ib_cm_notify);
+
+/*
+ * MAD receive callback for the CM agent.
+ *
+ * Maps the attribute ID of the received MAD to a CM event, bumps the
+ * per-port receive counter and queues a work item for cm_work_handler().
+ * MADs with an unknown attribute ID, or for which no work item can be
+ * allocated, are freed here.  REQ and LAP messages reserve trailing space
+ * for path records (one, or two for a REQ with an alternate LID).
+ */
+static void cm_recv_handler(struct ib_mad_agent *mad_agent,
+			    struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct cm_port *port = mad_agent->context;
+	struct cm_req_msg *req_msg;
+	struct cm_work *work;
+	enum ib_cm_event_type event;
+	int path_cnt = 0;
+	int counter_idx;
+
+	switch (mad_recv_wc->recv_buf.mad->mad_hdr.attr_id) {
+	case CM_REQ_ATTR_ID:
+		req_msg = (struct cm_req_msg *) mad_recv_wc->recv_buf.mad;
+		path_cnt = req_msg->alt_local_lid ? 2 : 1;
+		event = IB_CM_REQ_RECEIVED;
+		break;
+	case CM_MRA_ATTR_ID:
+		event = IB_CM_MRA_RECEIVED;
+		break;
+	case CM_REJ_ATTR_ID:
+		event = IB_CM_REJ_RECEIVED;
+		break;
+	case CM_REP_ATTR_ID:
+		event = IB_CM_REP_RECEIVED;
+		break;
+	case CM_RTU_ATTR_ID:
+		event = IB_CM_RTU_RECEIVED;
+		break;
+	case CM_DREQ_ATTR_ID:
+		event = IB_CM_DREQ_RECEIVED;
+		break;
+	case CM_DREP_ATTR_ID:
+		event = IB_CM_DREP_RECEIVED;
+		break;
+	case CM_SIDR_REQ_ATTR_ID:
+		event = IB_CM_SIDR_REQ_RECEIVED;
+		break;
+	case CM_SIDR_REP_ATTR_ID:
+		event = IB_CM_SIDR_REP_RECEIVED;
+		break;
+	case CM_LAP_ATTR_ID:
+		path_cnt = 1;
+		event = IB_CM_LAP_RECEIVED;
+		break;
+	case CM_APR_ATTR_ID:
+		event = IB_CM_APR_RECEIVED;
+		break;
+	default:
+		goto drop;
+	}
+
+	counter_idx = be16_to_cpu(mad_recv_wc->recv_buf.mad->mad_hdr.attr_id) -
+		      CM_ATTR_ID_OFFSET;
+	atomic_long_inc(&port->counter_group[CM_RECV].counter[counter_idx]);
+
+	work = kmalloc(sizeof *work + sizeof(struct ib_sa_path_rec) * path_cnt,
+		       GFP_KERNEL);
+	if (!work)
+		goto drop;
+
+	INIT_DELAYED_WORK(&work->work, cm_work_handler);
+	work->cm_event.event = event;
+	work->mad_recv_wc = mad_recv_wc;
+	work->port = port;
+	queue_delayed_work(cm.wq, &work->work, 0);
+	return;
+
+drop:
+	ib_free_recv_mad(mad_recv_wc);
+}
+
+/*
+ * Fill in the QP attributes and mask for the transition to INIT.
+ *
+ * Remote write access is always granted; remote read and atomic access
+ * are added when the cm_id has responder resources.  Returns -EINVAL if
+ * the cm_id is not in one of the connection states listed below.
+ */
+static int cm_init_qp_init_attr(struct cm_id_private *cm_id_priv,
+				struct ib_qp_attr *qp_attr,
+				int *qp_attr_mask)
+{
+	unsigned long irq_flags;
+	int access;
+	int ret = -EINVAL;
+
+	spin_lock_irqsave(&cm_id_priv->lock, irq_flags);
+	switch (cm_id_priv->id.state) {
+	case IB_CM_REQ_SENT:
+	case IB_CM_MRA_REQ_RCVD:
+	case IB_CM_REQ_RCVD:
+	case IB_CM_MRA_REQ_SENT:
+	case IB_CM_REP_RCVD:
+	case IB_CM_MRA_REP_SENT:
+	case IB_CM_REP_SENT:
+	case IB_CM_MRA_REP_RCVD:
+	case IB_CM_ESTABLISHED:
+		access = IB_ACCESS_REMOTE_WRITE;
+		if (cm_id_priv->responder_resources)
+			access |= IB_ACCESS_REMOTE_READ |
+				  IB_ACCESS_REMOTE_ATOMIC;
+
+		*qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS |
+				IB_QP_PKEY_INDEX | IB_QP_PORT;
+		qp_attr->qp_access_flags = access;
+		qp_attr->pkey_index = cm_id_priv->av.pkey_index;
+		qp_attr->port_num = cm_id_priv->av.port->port_num;
+		ret = 0;
+		break;
+	default:
+		break;
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, irq_flags);
+	return ret;
+}
+
+/*
+ * Fill in the QP attributes and mask for the transition to RTR.
+ *
+ * Returns -EINVAL when the cm_id is in a state not listed below, or when
+ * the primary address vector is not marked valid.  In the latter case
+ * the base mask and ah_attr have already been written to the caller's
+ * buffers.
+ */
+static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv,
+			       struct ib_qp_attr *qp_attr,
+			       int *qp_attr_mask)
+{
+	unsigned long irq_flags;
+	int mask;
+	int ret = -EINVAL;
+
+	spin_lock_irqsave(&cm_id_priv->lock, irq_flags);
+	switch (cm_id_priv->id.state) {
+	case IB_CM_REQ_RCVD:
+	case IB_CM_MRA_REQ_SENT:
+	case IB_CM_REP_RCVD:
+	case IB_CM_MRA_REP_SENT:
+	case IB_CM_REP_SENT:
+	case IB_CM_MRA_REP_RCVD:
+	case IB_CM_ESTABLISHED:
+		mask = IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU |
+		       IB_QP_DEST_QPN | IB_QP_RQ_PSN;
+		qp_attr->ah_attr = cm_id_priv->av.ah_attr;
+		if (!cm_id_priv->av.valid) {
+			/* ret stays -EINVAL; report what was set so far. */
+			*qp_attr_mask = mask;
+			break;
+		}
+
+		/* Primary path: optional VLAN and source MAC. */
+		if (cm_id_priv->av.ah_attr.vlan_id != 0xffff) {
+			qp_attr->vlan_id = cm_id_priv->av.ah_attr.vlan_id;
+			mask |= IB_QP_VID;
+		}
+		if (!is_zero_ether_addr(cm_id_priv->av.smac)) {
+			memcpy(qp_attr->smac, cm_id_priv->av.smac,
+			       sizeof(qp_attr->smac));
+			mask |= IB_QP_SMAC;
+		}
+
+		/* Alternate path: same optional fields, if it is valid. */
+		if (cm_id_priv->alt_av.valid) {
+			if (cm_id_priv->alt_av.ah_attr.vlan_id != 0xffff) {
+				qp_attr->alt_vlan_id =
+					cm_id_priv->alt_av.ah_attr.vlan_id;
+				mask |= IB_QP_ALT_VID;
+			}
+			if (!is_zero_ether_addr(cm_id_priv->alt_av.smac)) {
+				memcpy(qp_attr->alt_smac,
+				       cm_id_priv->alt_av.smac,
+				       sizeof(qp_attr->alt_smac));
+				mask |= IB_QP_ALT_SMAC;
+			}
+		}
+
+		qp_attr->path_mtu = cm_id_priv->path_mtu;
+		qp_attr->dest_qp_num = be32_to_cpu(cm_id_priv->remote_qpn);
+		qp_attr->rq_psn = be32_to_cpu(cm_id_priv->rq_psn);
+
+		if (cm_id_priv->qp_type == IB_QPT_RC ||
+		    cm_id_priv->qp_type == IB_QPT_XRC_TGT) {
+			mask |= IB_QP_MAX_DEST_RD_ATOMIC |
+				IB_QP_MIN_RNR_TIMER;
+			qp_attr->max_dest_rd_atomic =
+					cm_id_priv->responder_resources;
+			qp_attr->min_rnr_timer = 0;
+		}
+
+		if (cm_id_priv->alt_av.ah_attr.dlid) {
+			mask |= IB_QP_ALT_PATH;
+			qp_attr->alt_port_num = cm_id_priv->alt_av.port->port_num;
+			qp_attr->alt_pkey_index = cm_id_priv->alt_av.pkey_index;
+			qp_attr->alt_timeout = cm_id_priv->alt_av.timeout;
+			qp_attr->alt_ah_attr = cm_id_priv->alt_av.ah_attr;
+		}
+
+		*qp_attr_mask = mask;
+		ret = 0;
+		break;
+	default:
+		break;
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, irq_flags);
+	return ret;
+}
+
+/*
+ * Fill in the QP attributes and mask for the transition to RTS.
+ *
+ * While the LAP state is IB_CM_LAP_UNINIT this is the initial RTS
+ * transition (send PSN plus retry/timeout values depending on QP type).
+ * Otherwise only the alternate path is loaded and path migration is
+ * re-armed.  Returns -EINVAL if the cm_id is in a state not listed below.
+ */
+static int cm_init_qp_rts_attr(struct cm_id_private *cm_id_priv,
+			       struct ib_qp_attr *qp_attr,
+			       int *qp_attr_mask)
+{
+	unsigned long irq_flags;
+	int mask;
+	int ret = -EINVAL;
+
+	spin_lock_irqsave(&cm_id_priv->lock, irq_flags);
+	switch (cm_id_priv->id.state) {
+	/* Allow transition to RTS before sending REP */
+	case IB_CM_REQ_RCVD:
+	case IB_CM_MRA_REQ_SENT:
+
+	case IB_CM_REP_RCVD:
+	case IB_CM_MRA_REP_SENT:
+	case IB_CM_REP_SENT:
+	case IB_CM_MRA_REP_RCVD:
+	case IB_CM_ESTABLISHED:
+		if (cm_id_priv->id.lap_state != IB_CM_LAP_UNINIT) {
+			mask = IB_QP_ALT_PATH | IB_QP_PATH_MIG_STATE;
+			qp_attr->alt_port_num = cm_id_priv->alt_av.port->port_num;
+			qp_attr->alt_pkey_index = cm_id_priv->alt_av.pkey_index;
+			qp_attr->alt_timeout = cm_id_priv->alt_av.timeout;
+			qp_attr->alt_ah_attr = cm_id_priv->alt_av.ah_attr;
+			qp_attr->path_mig_state = IB_MIG_REARM;
+		} else {
+			mask = IB_QP_STATE | IB_QP_SQ_PSN;
+			qp_attr->sq_psn = be32_to_cpu(cm_id_priv->sq_psn);
+
+			/* RC and XRC initiator QPs carry retry settings ... */
+			if (cm_id_priv->qp_type == IB_QPT_RC ||
+			    cm_id_priv->qp_type == IB_QPT_XRC_INI) {
+				mask |= IB_QP_RETRY_CNT | IB_QP_RNR_RETRY |
+					IB_QP_MAX_QP_RD_ATOMIC;
+				qp_attr->retry_cnt = cm_id_priv->retry_count;
+				qp_attr->rnr_retry = cm_id_priv->rnr_retry_count;
+				qp_attr->max_rd_atomic = cm_id_priv->initiator_depth;
+			}
+			/* ... and, like XRC target QPs, a timeout. */
+			if (cm_id_priv->qp_type == IB_QPT_RC ||
+			    cm_id_priv->qp_type == IB_QPT_XRC_INI ||
+			    cm_id_priv->qp_type == IB_QPT_XRC_TGT) {
+				mask |= IB_QP_TIMEOUT;
+				qp_attr->timeout = cm_id_priv->av.timeout;
+			}
+
+			if (cm_id_priv->alt_av.ah_attr.dlid) {
+				mask |= IB_QP_PATH_MIG_STATE;
+				qp_attr->path_mig_state = IB_MIG_REARM;
+			}
+		}
+		*qp_attr_mask = mask;
+		ret = 0;
+		break;
+	default:
+		break;
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, irq_flags);
+	return ret;
+}
+
+/*
+ * Initialise @qp_attr and @qp_attr_mask for the QP state requested in
+ * qp_attr->qp_state.  Only INIT, RTR and RTS are supported; any other
+ * target state yields -EINVAL.
+ */
+int ib_cm_init_qp_attr(struct ib_cm_id *cm_id,
+		       struct ib_qp_attr *qp_attr,
+		       int *qp_attr_mask)
+{
+	struct cm_id_private *priv =
+		container_of(cm_id, struct cm_id_private, id);
+
+	switch (qp_attr->qp_state) {
+	case IB_QPS_INIT:
+		return cm_init_qp_init_attr(priv, qp_attr, qp_attr_mask);
+	case IB_QPS_RTR:
+		return cm_init_qp_rtr_attr(priv, qp_attr, qp_attr_mask);
+	case IB_QPS_RTS:
+		return cm_init_qp_rts_attr(priv, qp_attr, qp_attr_mask);
+	default:
+		return -EINVAL;
+	}
+}
+EXPORT_SYMBOL(ib_cm_init_qp_attr);
+
+/*
+ * Cache the device's local CA ack delay in @cm_dev.  If the device query
+ * fails the delay is set to 0, so acks will rely on packet life time.
+ */
+static void cm_get_ack_delay(struct cm_device *cm_dev)
+{
+	struct ib_device_attr attr;
+
+	if (ib_query_device(cm_dev->ib_device, &attr)) {
+		cm_dev->ack_delay = 0;
+		return;
+	}
+
+	cm_dev->ack_delay = attr.local_ca_ack_delay;
+}
+
+/*
+ * sysfs show callback: print the counter selected by the attribute's
+ * index, from the counter group that embeds @obj, as a decimal followed
+ * by a newline.
+ */
+static ssize_t cm_show_counter(struct kobject *obj, struct attribute *attr,
+			       char *buf)
+{
+	struct cm_counter_attribute *cm_attr =
+		container_of(attr, struct cm_counter_attribute, attr);
+	struct cm_counter_group *group =
+		container_of(obj, struct cm_counter_group, obj);
+	long value = atomic_long_read(&group->counter[cm_attr->index]);
+
+	return sprintf(buf, "%ld\n", value);
+}
+
+/* Counters are read-only in sysfs: only a show callback is provided. */
+static const struct sysfs_ops cm_counter_ops = {
+	.show = cm_show_counter,
+};
+
+/* kobject type for a counter group directory and its default attributes. */
+static struct kobj_type cm_counter_obj_type = {
+	.default_attrs = cm_counter_default_attrs,
+	.sysfs_ops = &cm_counter_ops,
+};
+
+/* kobject release callback: free the cm_port that embeds @obj. */
+static void cm_release_port_obj(struct kobject *obj)
+{
+	kfree(container_of(obj, struct cm_port, port_obj));
+}
+
+/* kobject type for a port directory; releasing it frees the cm_port. */
+static struct kobj_type cm_port_obj_type = {
+	.release = cm_release_port_obj,
+};
+
+/*
+ * devnode callback for cm_class: returns the allocated node name
+ * "infiniband/<device name>" and, when @mode is supplied, sets it to 0666.
+ */
+static char *cm_devnode(struct device *dev, umode_t *mode)
+{
+	if (mode != NULL)
+		*mode = 0666;
+
+	return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev));
+}
+
+/* Device class under which per-device CM objects are created. */
+struct class cm_class = {
+	.name    = "infiniband_cm",
+	.owner   = THIS_MODULE,
+	.devnode = cm_devnode,
+};
+EXPORT_SYMBOL(cm_class);
+
+/*
+ * Create the sysfs objects for @port: a directory named after the port
+ * number under the CM device, containing one kobject per counter group.
+ *
+ * Returns 0 on success or the kobject_init_and_add() error.  On failure
+ * every kobject initialised here has been dropped with kobject_put();
+ * dropping port_obj frees @port through cm_release_port_obj(), so the
+ * caller must not touch @port after an error.
+ */
+static int cm_create_port_fs(struct cm_port *port)
+{
+	int i, ret;
+
+	ret = kobject_init_and_add(&port->port_obj, &cm_port_obj_type,
+				   &port->cm_dev->device->kobj,
+				   "%d", port->port_num);
+	if (ret) {
+		/*
+		 * A kobject that failed kobject_init_and_add() must be
+		 * released with kobject_put() so that its name is freed;
+		 * the release callback then frees the port itself.
+		 */
+		kobject_put(&port->port_obj);
+		return ret;
+	}
+
+	for (i = 0; i < CM_COUNTER_GROUPS; i++) {
+		ret = kobject_init_and_add(&port->counter_group[i].obj,
+					   &cm_counter_obj_type,
+					   &port->port_obj,
+					   "%s", counter_group_names[i]);
+		if (ret)
+			goto error;
+	}
+
+	return 0;
+
+error:
+	/*
+	 * Group i was initialised but failed to add; it still needs a
+	 * kobject_put(), as do the groups 0..i-1 that were added.
+	 */
+	for (; i >= 0; i--)
+		kobject_put(&port->counter_group[i].obj);
+	kobject_put(&port->port_obj);
+	return ret;
+
+}
+
+/*
+ * Drop the sysfs objects of @port: every counter group kobject first,
+ * then the port kobject itself.
+ */
+static void cm_remove_port_fs(struct cm_port *port)
+{
+	int group = 0;
+
+	while (group < CM_COUNTER_GROUPS) {
+		kobject_put(&port->counter_group[group].obj);
+		group++;
+	}
+
+	kobject_put(&port->port_obj);
+}
+
+/*
+ * ib_client "add" callback: set up CM state for a newly added IB device.
+ *
+ * Devices whose transport is not RDMA_TRANSPORT_IB are ignored.  For each
+ * physical port a cm_port is allocated, its sysfs objects are created, a
+ * GSI MAD agent is registered and the port's CM capability bit is set.
+ * Any failure unwinds the ports set up so far and frees the cm_device.
+ */
+static void cm_add_one(struct ib_device *ib_device)
+{
+	struct ib_port_modify port_modify = {
+		.set_port_cap_mask = IB_PORT_CM_SUP
+	};
+	struct ib_mad_reg_req reg_req = {
+		.mgmt_class = IB_MGMT_CLASS_CM,
+		.mgmt_class_version = IB_CM_CLASS_VERSION,
+	};
+	struct cm_device *cm_dev;
+	struct cm_port *port;
+	unsigned long flags;
+	u8 port_num;
+
+	if (rdma_node_get_transport(ib_device->node_type) != RDMA_TRANSPORT_IB)
+		return;
+
+	cm_dev = kzalloc(sizeof(*cm_dev) + sizeof(*port) *
+			 ib_device->phys_port_cnt, GFP_KERNEL);
+	if (!cm_dev)
+		return;
+
+	cm_dev->ib_device = ib_device;
+	cm_get_ack_delay(cm_dev);
+
+	cm_dev->device = device_create(&cm_class, &ib_device->dev,
+				       MKDEV(0, 0), NULL,
+				       "%s", ib_device->name);
+	if (IS_ERR(cm_dev->device)) {
+		kfree(cm_dev);
+		return;
+	}
+
+	set_bit(IB_MGMT_METHOD_SEND, reg_req.method_mask);
+	for (port_num = 1; port_num <= ib_device->phys_port_cnt; port_num++) {
+		port = kzalloc(sizeof *port, GFP_KERNEL);
+		if (!port)
+			goto unwind_ports;
+
+		cm_dev->port[port_num - 1] = port;
+		port->cm_dev = cm_dev;
+		port->port_num = port_num;
+
+		if (cm_create_port_fs(port))
+			goto unwind_ports;
+
+		port->mad_agent = ib_register_mad_agent(ib_device, port_num,
+							IB_QPT_GSI,
+							&reg_req,
+							0,
+							cm_send_handler,
+							cm_recv_handler,
+							port,
+							0);
+		if (IS_ERR(port->mad_agent))
+			goto remove_port_fs;
+
+		if (ib_modify_port(ib_device, port_num, 0, &port_modify))
+			goto unregister_agent;
+	}
+	ib_set_client_data(ib_device, &cm_client, cm_dev);
+
+	write_lock_irqsave(&cm.device_lock, flags);
+	list_add_tail(&cm_dev->list, &cm.device_list);
+	write_unlock_irqrestore(&cm.device_lock, flags);
+	return;
+
+unregister_agent:
+	ib_unregister_mad_agent(port->mad_agent);
+remove_port_fs:
+	cm_remove_port_fs(port);
+unwind_ports:
+	/* Undo every port before the one that failed (port_num is 1-based). */
+	port_modify.set_port_cap_mask = 0;
+	port_modify.clr_port_cap_mask = IB_PORT_CM_SUP;
+	for (--port_num; port_num; --port_num) {
+		port = cm_dev->port[port_num - 1];
+		ib_modify_port(ib_device, port->port_num, 0, &port_modify);
+		ib_unregister_mad_agent(port->mad_agent);
+		cm_remove_port_fs(port);
+	}
+	device_unregister(cm_dev->device);
+	kfree(cm_dev);
+}
+
+/*
+ * ib_client "remove" callback: tear down the CM state of an IB device.
+ *
+ * The device is unlinked from the global device list.  Then, for each
+ * port, the CM capability bit is cleared, the MAD agent is unregistered,
+ * the CM workqueue is flushed and the port's sysfs objects are dropped.
+ */
+static void cm_remove_one(struct ib_device *ib_device)
+{
+	struct ib_port_modify port_modify = {
+		.clr_port_cap_mask = IB_PORT_CM_SUP
+	};
+	struct cm_device *cm_dev;
+	struct cm_port *port;
+	unsigned long flags;
+	int idx;
+
+	cm_dev = ib_get_client_data(ib_device, &cm_client);
+	if (!cm_dev)
+		return;
+
+	write_lock_irqsave(&cm.device_lock, flags);
+	list_del(&cm_dev->list);
+	write_unlock_irqrestore(&cm.device_lock, flags);
+
+	for (idx = 0; idx < ib_device->phys_port_cnt; idx++) {
+		port = cm_dev->port[idx];
+		ib_modify_port(ib_device, port->port_num, 0, &port_modify);
+		ib_unregister_mad_agent(port->mad_agent);
+		flush_workqueue(cm.wq);
+		cm_remove_port_fs(port);
+	}
+	device_unregister(cm_dev->device);
+	kfree(cm_dev);
+}
+
+/*
+ * Module init: reset the global CM state, register the device class,
+ * create the "ib_cm" workqueue and register as an IB client.
+ *
+ * Returns 0 on success.  Errors from class_register() and
+ * ib_register_client() are returned unchanged; -ENOMEM is returned when
+ * the workqueue cannot be created.  Each failure undoes the steps that
+ * had already succeeded.
+ */
+static int __init ib_cm_init(void)
+{
+	int ret;
+
+	memset(&cm, 0, sizeof cm);
+	INIT_LIST_HEAD(&cm.device_list);
+	rwlock_init(&cm.device_lock);
+	spin_lock_init(&cm.lock);
+	cm.listen_service_table = RB_ROOT;
+	cm.listen_service_id = be64_to_cpu(IB_CM_ASSIGN_SERVICE_ID);
+	cm.remote_id_table = RB_ROOT;
+	cm.remote_qp_table = RB_ROOT;
+	cm.remote_sidr_table = RB_ROOT;
+	idr_init(&cm.local_id_table);
+	get_random_bytes(&cm.random_id_operand, sizeof cm.random_id_operand);
+	INIT_LIST_HEAD(&cm.timewait_list);
+
+	/* Report the real class_register() error instead of masking it. */
+	ret = class_register(&cm_class);
+	if (ret)
+		goto error1;
+
+	cm.wq = create_workqueue("ib_cm");
+	if (!cm.wq) {
+		ret = -ENOMEM;
+		goto error2;
+	}
+
+	ret = ib_register_client(&cm_client);
+	if (ret)
+		goto error3;
+
+	return 0;
+error3:
+	destroy_workqueue(cm.wq);
+error2:
+	class_unregister(&cm_class);
+error1:
+	idr_destroy(&cm.local_id_table);
+	return ret;
+}
+
+/*
+ * Module exit: cancel the delayed work of every entry on the timewait
+ * list, unregister the IB client, destroy the workqueue, free whatever
+ * is left on the timewait list, then unregister the class and destroy
+ * the local ID table.
+ */
+static void __exit ib_cm_cleanup(void)
+{
+	struct cm_timewait_info *tw;
+
+	spin_lock_irq(&cm.lock);
+	list_for_each_entry(tw, &cm.timewait_list, list)
+		cancel_delayed_work(&tw->work.work);
+	spin_unlock_irq(&cm.lock);
+
+	ib_unregister_client(&cm_client);
+	destroy_workqueue(cm.wq);
+
+	/* Free the timewait entries that are still on the list. */
+	while (!list_empty(&cm.timewait_list)) {
+		tw = list_first_entry(&cm.timewait_list,
+				      struct cm_timewait_info, list);
+		list_del(&tw->list);
+		kfree(tw);
+	}
+
+	class_unregister(&cm_class);
+	idr_destroy(&cm.local_id_table);
+}
+
+module_init(ib_cm_init); /* ib_cm_init() is the module's init entry point */
+module_exit(ib_cm_cleanup); /* ib_cm_cleanup() is the module's exit entry point */
+
diff --git a/drivers/infiniband/core/cm_msgs.h b/drivers/infiniband/core/cm_msgs.h
new file mode 100644
index 000000000..8b76f0ef9
--- /dev/null
+++ b/drivers/infiniband/core/cm_msgs.h
@@ -0,0 +1,836 @@
+/*
+ * Copyright (c) 2004, 2011 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if !defined(CM_MSGS_H)
+#define CM_MSGS_H
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_cm.h>
+
+/*
+ * Parameters to routines below should be in network-byte order, and values
+ * are returned in network-byte order.
+ */
+
+#define IB_CM_CLASS_VERSION 2 /* IB specification 1.2 */
+
+ /*
+ * Identifiers for the CM message sequences (REQ, LAP, DREQ, SIDR).
+ * NOTE(review): no user is visible in this header; presumably consumed by
+ * cm.c when building transaction ids -- confirm there.
+ */
+ enum cm_msg_sequence {
+ CM_MSG_SEQUENCE_REQ,
+ CM_MSG_SEQUENCE_LAP,
+ CM_MSG_SEQUENCE_DREQ,
+ CM_MSG_SEQUENCE_SIDR
+ };
+
+ /*
+ * Packed layout of a CM REQ message: the MAD header followed by the REQ
+ * fields. Members named offsetNN hold several sub-fields packed together
+ * (listed MSB first in the comment above each member) and are read and
+ * written through the cm_req_get_*()/cm_req_set_*() helpers.
+ * NOTE(review): NN appears to be the byte offset of the member after the
+ * MAD header -- confirm before relying on it.
+ */
+ struct cm_req_msg {
+ struct ib_mad_hdr hdr;
+
+ __be32 local_comm_id;
+ __be32 rsvd4;
+ __be64 service_id;
+ __be64 local_ca_guid;
+ __be32 rsvd24;
+ __be32 local_qkey;
+ /* local QPN:24, responder resources:8 */
+ __be32 offset32;
+ /* local EECN:24, initiator depth:8 */
+ __be32 offset36;
+ /*
+ * remote EECN:24, remote CM response timeout:5,
+ * transport service type:2, end-to-end flow control:1
+ */
+ __be32 offset40;
+ /* starting PSN:24, local CM response timeout:5, retry count:3 */
+ __be32 offset44;
+ __be16 pkey;
+ /* path MTU:4, RDC exists:1, RNR retry count:3. */
+ u8 offset50;
+ /* max CM Retries:4, SRQ:1, extended transport type:3 */
+ u8 offset51;
+
+ __be16 primary_local_lid;
+ __be16 primary_remote_lid;
+ union ib_gid primary_local_gid;
+ union ib_gid primary_remote_gid;
+ /* flow label:20, rsvd:6, packet rate:6 */
+ __be32 primary_offset88;
+ u8 primary_traffic_class;
+ u8 primary_hop_limit;
+ /* SL:4, subnet local:1, rsvd:3 */
+ u8 primary_offset94;
+ /* local ACK timeout:5, rsvd:3 */
+ u8 primary_offset95;
+
+ __be16 alt_local_lid;
+ __be16 alt_remote_lid;
+ union ib_gid alt_local_gid;
+ union ib_gid alt_remote_gid;
+ /* flow label:20, rsvd:6, packet rate:6 */
+ __be32 alt_offset132;
+ u8 alt_traffic_class;
+ u8 alt_hop_limit;
+ /* SL:4, subnet local:1, rsvd:3 */
+ u8 alt_offset138;
+ /* local ACK timeout:5, rsvd:3 */
+ u8 alt_offset139;
+
+ u32 private_data[IB_CM_REQ_PRIVATE_DATA_SIZE / sizeof(u32)];
+
+ } __attribute__ ((packed));
+
+ /* Local QPN: upper 24 bits of offset32, returned in network byte order. */
+ static inline __be32 cm_req_get_local_qpn(struct cm_req_msg *req_msg)
+ {
+ u32 word = be32_to_cpu(req_msg->offset32);
+
+ return cpu_to_be32(word >> 8);
+ }
+
+ /* Store @qpn in the upper 24 bits of offset32, keeping the low byte. */
+ static inline void cm_req_set_local_qpn(struct cm_req_msg *req_msg, __be32 qpn)
+ {
+ u32 low_byte = be32_to_cpu(req_msg->offset32) & 0x000000FF;
+
+ req_msg->offset32 = cpu_to_be32((be32_to_cpu(qpn) << 8) | low_byte);
+ }
+
+ /* Responder resources: low byte of offset32. */
+ static inline u8 cm_req_get_resp_res(struct cm_req_msg *req_msg)
+ {
+ u32 word = be32_to_cpu(req_msg->offset32);
+
+ return (u8) word;
+ }
+
+ /* Store @resp_res in the low byte of offset32, keeping the upper 24 bits. */
+ static inline void cm_req_set_resp_res(struct cm_req_msg *req_msg, u8 resp_res)
+ {
+ u32 upper = be32_to_cpu(req_msg->offset32) & 0xFFFFFF00;
+
+ req_msg->offset32 = cpu_to_be32(resp_res | upper);
+ }
+
+ /* Initiator depth: low byte of offset36. */
+ static inline u8 cm_req_get_init_depth(struct cm_req_msg *req_msg)
+ {
+ u32 word = be32_to_cpu(req_msg->offset36);
+
+ return (u8) word;
+ }
+
+ /* Store @init_depth in the low byte of offset36, keeping the upper 24 bits. */
+ static inline void cm_req_set_init_depth(struct cm_req_msg *req_msg,
+ u8 init_depth)
+ {
+ u32 upper = be32_to_cpu(req_msg->offset36) & 0xFFFFFF00;
+
+ req_msg->offset36 = cpu_to_be32(init_depth | upper);
+ }
+
+ /* Remote CM response timeout: bits 7:3 of offset40. */
+ static inline u8 cm_req_get_remote_resp_timeout(struct cm_req_msg *req_msg)
+ {
+ u32 word = be32_to_cpu(req_msg->offset40);
+
+ return (u8) ((word & 0xF8) >> 3);
+ }
+
+ /* Store @resp_timeout in bits 7:3 of offset40, keeping all other bits. */
+ static inline void cm_req_set_remote_resp_timeout(struct cm_req_msg *req_msg,
+ u8 resp_timeout)
+ {
+ u32 rest = be32_to_cpu(req_msg->offset40) & 0xFFFFFF07;
+
+ req_msg->offset40 = cpu_to_be32((resp_timeout << 3) | rest);
+ }
+
+ /*
+ * Decode the QP type from the 2-bit transport service type (bits 2:1 of
+ * offset40): 0 is RC, 1 is UC, and 3 is XRC target when the low 3 bits of
+ * offset51 equal 1. Every other encoding yields 0.
+ */
+ static inline enum ib_qp_type cm_req_get_qp_type(struct cm_req_msg *req_msg)
+ {
+ u32 word = be32_to_cpu(req_msg->offset40);
+ u8 transport_type = (u8) ((word & 0x06) >> 1);
+
+ if (transport_type == 0)
+ return IB_QPT_RC;
+ if (transport_type == 1)
+ return IB_QPT_UC;
+ if (transport_type == 3 && (req_msg->offset51 & 0x7) == 1)
+ return IB_QPT_XRC_TGT;
+ return 0;
+ }
+
+ /*
+ * Encode @qp_type into the transport service type (bits 2:1 of offset40).
+ * UC is written as 1 and XRC initiator as 3 (which also sets the low 3 bits
+ * of offset51 to 1); every other type clears the field to 0.
+ */
+ static inline void cm_req_set_qp_type(struct cm_req_msg *req_msg,
+ enum ib_qp_type qp_type)
+ {
+ u32 word = be32_to_cpu(req_msg->offset40) & 0xFFFFFFF9;
+
+ if (qp_type == IB_QPT_UC) {
+ word |= 0x2;
+ } else if (qp_type == IB_QPT_XRC_INI) {
+ word |= 0x6;
+ req_msg->offset51 = (req_msg->offset51 & 0xF8) | 1;
+ }
+ req_msg->offset40 = cpu_to_be32(word);
+ }
+
+ /* End-to-end flow control: bit 0 of offset40. */
+ static inline u8 cm_req_get_flow_ctrl(struct cm_req_msg *req_msg)
+ {
+ u32 word = be32_to_cpu(req_msg->offset40);
+
+ return word & 0x1;
+ }
+
+ /* Store the low bit of @flow_ctrl in bit 0 of offset40. */
+ static inline void cm_req_set_flow_ctrl(struct cm_req_msg *req_msg,
+ u8 flow_ctrl)
+ {
+ u32 rest = be32_to_cpu(req_msg->offset40) & 0xFFFFFFFE;
+
+ req_msg->offset40 = cpu_to_be32((flow_ctrl & 0x1) | rest);
+ }
+
+ /* Starting PSN: upper 24 bits of offset44, returned in network byte order. */
+ static inline __be32 cm_req_get_starting_psn(struct cm_req_msg *req_msg)
+ {
+ u32 word = be32_to_cpu(req_msg->offset44);
+
+ return cpu_to_be32(word >> 8);
+ }
+
+ /* Store @starting_psn in the upper 24 bits of offset44, keeping the low byte. */
+ static inline void cm_req_set_starting_psn(struct cm_req_msg *req_msg,
+ __be32 starting_psn)
+ {
+ u32 low_byte = be32_to_cpu(req_msg->offset44) & 0x000000FF;
+
+ req_msg->offset44 = cpu_to_be32((be32_to_cpu(starting_psn) << 8) | low_byte);
+ }
+
+ /* Local CM response timeout: bits 7:3 of offset44. */
+ static inline u8 cm_req_get_local_resp_timeout(struct cm_req_msg *req_msg)
+ {
+ u32 word = be32_to_cpu(req_msg->offset44);
+
+ return (u8) ((word & 0xF8) >> 3);
+ }
+
+ /* Store @resp_timeout in bits 7:3 of offset44, keeping all other bits. */
+ static inline void cm_req_set_local_resp_timeout(struct cm_req_msg *req_msg,
+ u8 resp_timeout)
+ {
+ u32 rest = be32_to_cpu(req_msg->offset44) & 0xFFFFFF07;
+
+ req_msg->offset44 = cpu_to_be32((resp_timeout << 3) | rest);
+ }
+
+ /* Retry count: bits 2:0 of offset44. */
+ static inline u8 cm_req_get_retry_count(struct cm_req_msg *req_msg)
+ {
+ u32 word = be32_to_cpu(req_msg->offset44);
+
+ return (u8) (word & 0x7);
+ }
+
+ /* Store the low 3 bits of @retry_count in bits 2:0 of offset44. */
+ static inline void cm_req_set_retry_count(struct cm_req_msg *req_msg,
+ u8 retry_count)
+ {
+ u32 rest = be32_to_cpu(req_msg->offset44) & 0xFFFFFFF8;
+
+ req_msg->offset44 = cpu_to_be32((retry_count & 0x7) | rest);
+ }
+
+ /* Path MTU: upper nibble of offset50. */
+ static inline u8 cm_req_get_path_mtu(struct cm_req_msg *req_msg)
+ {
+ u8 byte = req_msg->offset50;
+
+ return byte >> 4;
+ }
+
+ /* Store @path_mtu in the upper nibble of offset50, keeping the lower nibble. */
+ static inline void cm_req_set_path_mtu(struct cm_req_msg *req_msg, u8 path_mtu)
+ {
+ u8 low_nibble = req_msg->offset50 & 0xF;
+
+ req_msg->offset50 = (u8) (low_nibble | (path_mtu << 4));
+ }
+
+ /* RNR retry count: bits 2:0 of offset50. */
+ static inline u8 cm_req_get_rnr_retry_count(struct cm_req_msg *req_msg)
+ {
+ u8 byte = req_msg->offset50;
+
+ return byte & 0x7;
+ }
+
+ /* Store the low 3 bits of @rnr_retry_count in bits 2:0 of offset50. */
+ static inline void cm_req_set_rnr_retry_count(struct cm_req_msg *req_msg,
+ u8 rnr_retry_count)
+ {
+ u8 upper = req_msg->offset50 & 0xF8;
+
+ req_msg->offset50 = (u8) (upper | (rnr_retry_count & 0x7));
+ }
+
+ /* Max CM retries: upper nibble of offset51. */
+ static inline u8 cm_req_get_max_cm_retries(struct cm_req_msg *req_msg)
+ {
+ u8 byte = req_msg->offset51;
+
+ return byte >> 4;
+ }
+
+ /* Store @retries in the upper nibble of offset51, keeping the lower nibble. */
+ static inline void cm_req_set_max_cm_retries(struct cm_req_msg *req_msg,
+ u8 retries)
+ {
+ u8 low_nibble = req_msg->offset51 & 0xF;
+
+ req_msg->offset51 = (u8) (low_nibble | (retries << 4));
+ }
+
+ /* SRQ flag: bit 3 of offset51. */
+ static inline u8 cm_req_get_srq(struct cm_req_msg *req_msg)
+ {
+ u8 byte = req_msg->offset51;
+
+ return (byte & 0x8) >> 3;
+ }
+
+ /* Store the low bit of @srq in bit 3 of offset51. */
+ static inline void cm_req_set_srq(struct cm_req_msg *req_msg, u8 srq)
+ {
+ u8 rest = req_msg->offset51 & 0xF7;
+
+ req_msg->offset51 = (u8) (rest | ((srq & 0x1) << 3));
+ }
+
+ /* Primary flow label: upper 20 bits of primary_offset88, network order. */
+ static inline __be32 cm_req_get_primary_flow_label(struct cm_req_msg *req_msg)
+ {
+ u32 word = be32_to_cpu(req_msg->primary_offset88);
+
+ return cpu_to_be32(word >> 12);
+ }
+
+ /* Store @flow_label in the upper 20 bits of primary_offset88. */
+ static inline void cm_req_set_primary_flow_label(struct cm_req_msg *req_msg,
+ __be32 flow_label)
+ {
+ u32 low_bits = be32_to_cpu(req_msg->primary_offset88) & 0x00000FFF;
+
+ req_msg->primary_offset88 = cpu_to_be32(low_bits |
+ (be32_to_cpu(flow_label) << 12));
+ }
+
+ /* Primary packet rate: low 6 bits of primary_offset88. */
+ static inline u8 cm_req_get_primary_packet_rate(struct cm_req_msg *req_msg)
+ {
+ u32 word = be32_to_cpu(req_msg->primary_offset88);
+
+ return (u8) (word & 0x3F);
+ }
+
+ /* Store the low 6 bits of @rate in the low 6 bits of primary_offset88. */
+ static inline void cm_req_set_primary_packet_rate(struct cm_req_msg *req_msg,
+ u8 rate)
+ {
+ u32 upper = be32_to_cpu(req_msg->primary_offset88) & 0xFFFFFFC0;
+
+ req_msg->primary_offset88 = cpu_to_be32(upper | (rate & 0x3F));
+ }
+
+ /* Primary SL: upper nibble of primary_offset94. */
+ static inline u8 cm_req_get_primary_sl(struct cm_req_msg *req_msg)
+ {
+ u8 byte = req_msg->primary_offset94;
+
+ return (u8) (byte >> 4);
+ }
+
+ /* Store @sl in the upper nibble of primary_offset94, keeping the lower nibble. */
+ static inline void cm_req_set_primary_sl(struct cm_req_msg *req_msg, u8 sl)
+ {
+ u8 low_nibble = req_msg->primary_offset94 & 0x0F;
+
+ req_msg->primary_offset94 = (u8) (low_nibble | (sl << 4));
+ }
+
+ /* Primary subnet-local flag: bit 3 of primary_offset94. */
+ static inline u8 cm_req_get_primary_subnet_local(struct cm_req_msg *req_msg)
+ {
+ u8 byte = req_msg->primary_offset94;
+
+ return (u8) ((byte & 0x08) >> 3);
+ }
+
+ /* Store the low bit of @subnet_local in bit 3 of primary_offset94. */
+ static inline void cm_req_set_primary_subnet_local(struct cm_req_msg *req_msg,
+ u8 subnet_local)
+ {
+ u8 rest = req_msg->primary_offset94 & 0xF7;
+
+ req_msg->primary_offset94 = (u8) (rest | ((subnet_local & 0x1) << 3));
+ }
+
+ /* Primary local ACK timeout: upper 5 bits of primary_offset95. */
+ static inline u8 cm_req_get_primary_local_ack_timeout(struct cm_req_msg *req_msg)
+ {
+ u8 byte = req_msg->primary_offset95;
+
+ return (u8) (byte >> 3);
+ }
+
+ /* Store @local_ack_timeout in the upper 5 bits of primary_offset95. */
+ static inline void cm_req_set_primary_local_ack_timeout(struct cm_req_msg *req_msg,
+ u8 local_ack_timeout)
+ {
+ u8 low_bits = req_msg->primary_offset95 & 0x07;
+
+ req_msg->primary_offset95 = (u8) (low_bits | (local_ack_timeout << 3));
+ }
+
+ /* Alternate flow label: upper 20 bits of alt_offset132, network order. */
+ static inline __be32 cm_req_get_alt_flow_label(struct cm_req_msg *req_msg)
+ {
+ u32 word = be32_to_cpu(req_msg->alt_offset132);
+
+ return cpu_to_be32(word >> 12);
+ }
+
+ /* Store @flow_label in the upper 20 bits of alt_offset132. */
+ static inline void cm_req_set_alt_flow_label(struct cm_req_msg *req_msg,
+ __be32 flow_label)
+ {
+ u32 low_bits = be32_to_cpu(req_msg->alt_offset132) & 0x00000FFF;
+
+ req_msg->alt_offset132 = cpu_to_be32(low_bits |
+ (be32_to_cpu(flow_label) << 12));
+ }
+
+ /* Alternate packet rate: low 6 bits of alt_offset132. */
+ static inline u8 cm_req_get_alt_packet_rate(struct cm_req_msg *req_msg)
+ {
+ u32 word = be32_to_cpu(req_msg->alt_offset132);
+
+ return (u8) (word & 0x3F);
+ }
+
+ /* Store the low 6 bits of @rate in the low 6 bits of alt_offset132. */
+ static inline void cm_req_set_alt_packet_rate(struct cm_req_msg *req_msg,
+ u8 rate)
+ {
+ u32 upper = be32_to_cpu(req_msg->alt_offset132) & 0xFFFFFFC0;
+
+ req_msg->alt_offset132 = cpu_to_be32(upper | (rate & 0x3F));
+ }
+
+ /* Alternate SL: upper nibble of alt_offset138. */
+ static inline u8 cm_req_get_alt_sl(struct cm_req_msg *req_msg)
+ {
+ u8 byte = req_msg->alt_offset138;
+
+ return (u8) (byte >> 4);
+ }
+
+ /* Store @sl in the upper nibble of alt_offset138, keeping the lower nibble. */
+ static inline void cm_req_set_alt_sl(struct cm_req_msg *req_msg, u8 sl)
+ {
+ u8 low_nibble = req_msg->alt_offset138 & 0x0F;
+
+ req_msg->alt_offset138 = (u8) (low_nibble | (sl << 4));
+ }
+
+ /* Alternate subnet-local flag: bit 3 of alt_offset138. */
+ static inline u8 cm_req_get_alt_subnet_local(struct cm_req_msg *req_msg)
+ {
+ u8 byte = req_msg->alt_offset138;
+
+ return (u8) ((byte & 0x08) >> 3);
+ }
+
+ /* Store the low bit of @subnet_local in bit 3 of alt_offset138. */
+ static inline void cm_req_set_alt_subnet_local(struct cm_req_msg *req_msg,
+ u8 subnet_local)
+ {
+ u8 rest = req_msg->alt_offset138 & 0xF7;
+
+ req_msg->alt_offset138 = (u8) (rest | ((subnet_local & 0x1) << 3));
+ }
+
+ /* Alternate local ACK timeout: upper 5 bits of alt_offset139. */
+ static inline u8 cm_req_get_alt_local_ack_timeout(struct cm_req_msg *req_msg)
+ {
+ u8 byte = req_msg->alt_offset139;
+
+ return (u8) (byte >> 3);
+ }
+
+ /* Store @local_ack_timeout in the upper 5 bits of alt_offset139. */
+ static inline void cm_req_set_alt_local_ack_timeout(struct cm_req_msg *req_msg,
+ u8 local_ack_timeout)
+ {
+ u8 low_bits = req_msg->alt_offset139 & 0x07;
+
+ req_msg->alt_offset139 = (u8) (low_bits | (local_ack_timeout << 3));
+ }
+
+ /*
+ * Identifies which message was REJected or MRAed. All values fit the
+ * 2-bit "message MRAed" / "message REJected" fields of the MRA and REJ
+ * messages.
+ */
+ enum cm_msg_response {
+ CM_MSG_RESPONSE_REQ = 0x0,
+ CM_MSG_RESPONSE_REP = 0x1,
+ CM_MSG_RESPONSE_OTHER = 0x2
+ };
+
+ /*
+ * Packed layout of a CM MRA message. The sub-byte fields in offset8 and
+ * offset9 are accessed through the cm_mra_get_*()/cm_mra_set_*() helpers.
+ */
+ struct cm_mra_msg {
+ struct ib_mad_hdr hdr;
+
+ __be32 local_comm_id;
+ __be32 remote_comm_id;
+ /* message MRAed:2, rsvd:6 */
+ u8 offset8;
+ /* service timeout:5, rsvd:3 */
+ u8 offset9;
+
+ u8 private_data[IB_CM_MRA_PRIVATE_DATA_SIZE];
+
+ } __attribute__ ((packed));
+
+ /* Message MRAed: top 2 bits of offset8. */
+ static inline u8 cm_mra_get_msg_mraed(struct cm_mra_msg *mra_msg)
+ {
+ u8 byte = mra_msg->offset8;
+
+ return (u8) (byte >> 6);
+ }
+
+ /* Store @msg in the top 2 bits of offset8, keeping the low 6 bits. */
+ static inline void cm_mra_set_msg_mraed(struct cm_mra_msg *mra_msg, u8 msg)
+ {
+ u8 low_bits = mra_msg->offset8 & 0x3F;
+
+ mra_msg->offset8 = (u8) (low_bits | (msg << 6));
+ }
+
+ /* Service timeout: upper 5 bits of offset9. */
+ static inline u8 cm_mra_get_service_timeout(struct cm_mra_msg *mra_msg)
+ {
+ u8 byte = mra_msg->offset9;
+
+ return (u8) (byte >> 3);
+ }
+
+ /* Store @service_timeout in the upper 5 bits of offset9. */
+ static inline void cm_mra_set_service_timeout(struct cm_mra_msg *mra_msg,
+ u8 service_timeout)
+ {
+ u8 low_bits = mra_msg->offset9 & 0x07;
+
+ mra_msg->offset9 = (u8) (low_bits | (service_timeout << 3));
+ }
+
+ /*
+ * Packed layout of a CM REJ message. The sub-byte fields in offset8 and
+ * offset9 are accessed through the cm_rej_get_*()/cm_rej_set_*() helpers.
+ */
+ struct cm_rej_msg {
+ struct ib_mad_hdr hdr;
+
+ __be32 local_comm_id;
+ __be32 remote_comm_id;
+ /* message REJected:2, rsvd:6 */
+ u8 offset8;
+ /* reject info length:7, rsvd:1. */
+ u8 offset9;
+ __be16 reason;
+ u8 ari[IB_CM_REJ_ARI_LENGTH];
+
+ u8 private_data[IB_CM_REJ_PRIVATE_DATA_SIZE];
+
+ } __attribute__ ((packed));
+
+ /* Message REJected: top 2 bits of offset8. */
+ static inline u8 cm_rej_get_msg_rejected(struct cm_rej_msg *rej_msg)
+ {
+ u8 byte = rej_msg->offset8;
+
+ return (u8) (byte >> 6);
+ }
+
+ /* Store @msg in the top 2 bits of offset8, keeping the low 6 bits. */
+ static inline void cm_rej_set_msg_rejected(struct cm_rej_msg *rej_msg, u8 msg)
+ {
+ u8 low_bits = rej_msg->offset8 & 0x3F;
+
+ rej_msg->offset8 = (u8) (low_bits | (msg << 6));
+ }
+
+ /* Reject info length: upper 7 bits of offset9. */
+ static inline u8 cm_rej_get_reject_info_len(struct cm_rej_msg *rej_msg)
+ {
+ u8 byte = rej_msg->offset9;
+
+ return (u8) (byte >> 1);
+ }
+
+ /* Store @len in the upper 7 bits of offset9, keeping bit 0. */
+ static inline void cm_rej_set_reject_info_len(struct cm_rej_msg *rej_msg,
+ u8 len)
+ {
+ u8 low_bit = rej_msg->offset9 & 0x1;
+
+ rej_msg->offset9 = (u8) (low_bit | (len << 1));
+ }
+
+ /*
+ * Packed layout of a CM REP message. Members named offsetNN hold several
+ * sub-fields (listed MSB first) and are accessed through the
+ * cm_rep_get_*()/cm_rep_set_*() helpers.
+ */
+ struct cm_rep_msg {
+ struct ib_mad_hdr hdr;
+
+ __be32 local_comm_id;
+ __be32 remote_comm_id;
+ __be32 local_qkey;
+ /* local QPN:24, rsvd:8 */
+ __be32 offset12;
+ /* local EECN:24, rsvd:8 */
+ __be32 offset16;
+ /* starting PSN:24 rsvd:8 */
+ __be32 offset20;
+ u8 resp_resources;
+ u8 initiator_depth;
+ /* target ACK delay:5, failover accepted:2, end-to-end flow control:1 */
+ u8 offset26;
+ /* RNR retry count:3, SRQ:1, rsvd:4 */
+ u8 offset27;
+ __be64 local_ca_guid;
+
+ u8 private_data[IB_CM_REP_PRIVATE_DATA_SIZE];
+
+ } __attribute__ ((packed));
+
+ /* Local QPN: upper 24 bits of offset12, returned in network byte order. */
+ static inline __be32 cm_rep_get_local_qpn(struct cm_rep_msg *rep_msg)
+ {
+ u32 word = be32_to_cpu(rep_msg->offset12);
+
+ return cpu_to_be32(word >> 8);
+ }
+
+ /* Store @qpn in the upper 24 bits of offset12, keeping the low byte. */
+ static inline void cm_rep_set_local_qpn(struct cm_rep_msg *rep_msg, __be32 qpn)
+ {
+ u32 low_byte = be32_to_cpu(rep_msg->offset12) & 0x000000FF;
+
+ rep_msg->offset12 = cpu_to_be32((be32_to_cpu(qpn) << 8) | low_byte);
+ }
+
+ /* Local EECN: upper 24 bits of offset16, returned in network byte order. */
+ static inline __be32 cm_rep_get_local_eecn(struct cm_rep_msg *rep_msg)
+ {
+ u32 word = be32_to_cpu(rep_msg->offset16);
+
+ return cpu_to_be32(word >> 8);
+ }
+
+ /* Store @eecn in the upper 24 bits of offset16, keeping the low byte. */
+ static inline void cm_rep_set_local_eecn(struct cm_rep_msg *rep_msg, __be32 eecn)
+ {
+ u32 low_byte = be32_to_cpu(rep_msg->offset16) & 0x000000FF;
+
+ rep_msg->offset16 = cpu_to_be32((be32_to_cpu(eecn) << 8) | low_byte);
+ }
+
+ /*
+ * Return the QPN carried in the REP for @qp_type: the local EECN field
+ * for an XRC initiator, the local QPN field for every other type.
+ */
+ static inline __be32 cm_rep_get_qpn(struct cm_rep_msg *rep_msg, enum ib_qp_type qp_type)
+ {
+ if (qp_type == IB_QPT_XRC_INI)
+ return cm_rep_get_local_eecn(rep_msg);
+
+ return cm_rep_get_local_qpn(rep_msg);
+ }
+
+ /* Starting PSN: upper 24 bits of offset20, returned in network byte order. */
+ static inline __be32 cm_rep_get_starting_psn(struct cm_rep_msg *rep_msg)
+ {
+ u32 word = be32_to_cpu(rep_msg->offset20);
+
+ return cpu_to_be32(word >> 8);
+ }
+
+ /* Store @starting_psn in the upper 24 bits of offset20, keeping the low byte. */
+ static inline void cm_rep_set_starting_psn(struct cm_rep_msg *rep_msg,
+ __be32 starting_psn)
+ {
+ u32 low_byte = be32_to_cpu(rep_msg->offset20) & 0x000000FF;
+
+ rep_msg->offset20 = cpu_to_be32((be32_to_cpu(starting_psn) << 8) | low_byte);
+ }
+
+ /* Target ACK delay: upper 5 bits of offset26. */
+ static inline u8 cm_rep_get_target_ack_delay(struct cm_rep_msg *rep_msg)
+ {
+ u8 byte = rep_msg->offset26;
+
+ return (u8) (byte >> 3);
+ }
+
+ /* Store @target_ack_delay in the upper 5 bits of offset26. */
+ static inline void cm_rep_set_target_ack_delay(struct cm_rep_msg *rep_msg,
+ u8 target_ack_delay)
+ {
+ u8 low_bits = rep_msg->offset26 & 0x07;
+
+ rep_msg->offset26 = (u8) (low_bits | (target_ack_delay << 3));
+ }
+
+ /* Failover accepted: bits 2:1 of offset26. */
+ static inline u8 cm_rep_get_failover(struct cm_rep_msg *rep_msg)
+ {
+ u8 byte = rep_msg->offset26;
+
+ return (u8) ((byte & 0x06) >> 1);
+ }
+
+ /* Store the low 2 bits of @failover in bits 2:1 of offset26. */
+ static inline void cm_rep_set_failover(struct cm_rep_msg *rep_msg, u8 failover)
+ {
+ u8 rest = rep_msg->offset26 & 0xF9;
+
+ rep_msg->offset26 = (u8) (rest | ((failover & 0x3) << 1));
+ }
+
+ /* End-to-end flow control: bit 0 of offset26. */
+ static inline u8 cm_rep_get_flow_ctrl(struct cm_rep_msg *rep_msg)
+ {
+ u8 byte = rep_msg->offset26;
+
+ return (u8) (byte & 0x01);
+ }
+
+ /* Store the low bit of @flow_ctrl in bit 0 of offset26. */
+ static inline void cm_rep_set_flow_ctrl(struct cm_rep_msg *rep_msg,
+ u8 flow_ctrl)
+ {
+ u8 rest = rep_msg->offset26 & 0xFE;
+
+ rep_msg->offset26 = (u8) (rest | (flow_ctrl & 0x1));
+ }
+
+ /* RNR retry count: top 3 bits of offset27. */
+ static inline u8 cm_rep_get_rnr_retry_count(struct cm_rep_msg *rep_msg)
+ {
+ u8 byte = rep_msg->offset27;
+
+ return (u8) (byte >> 5);
+ }
+
+ /* Store @rnr_retry_count in the top 3 bits of offset27. */
+ static inline void cm_rep_set_rnr_retry_count(struct cm_rep_msg *rep_msg,
+ u8 rnr_retry_count)
+ {
+ u8 low_bits = rep_msg->offset27 & 0x1F;
+
+ rep_msg->offset27 = (u8) (low_bits | (rnr_retry_count << 5));
+ }
+
+ /* SRQ flag: bit 4 of offset27. */
+ static inline u8 cm_rep_get_srq(struct cm_rep_msg *rep_msg)
+ {
+ u8 byte = rep_msg->offset27;
+
+ return (u8) ((byte >> 4) & 0x1);
+ }
+
+ /* Store the low bit of @srq in bit 4 of offset27. */
+ static inline void cm_rep_set_srq(struct cm_rep_msg *rep_msg, u8 srq)
+ {
+ u8 rest = rep_msg->offset27 & 0xEF;
+
+ rep_msg->offset27 = (u8) (rest | ((srq & 0x1) << 4));
+ }
+
+ /* Packed layout of a CM RTU message: MAD header, comm ids, private data. */
+ struct cm_rtu_msg {
+ struct ib_mad_hdr hdr;
+
+ __be32 local_comm_id;
+ __be32 remote_comm_id;
+
+ u8 private_data[IB_CM_RTU_PRIVATE_DATA_SIZE];
+
+ } __attribute__ ((packed));
+
+ /*
+ * Packed layout of a CM DREQ message. The remote QPN in offset8 is
+ * accessed through cm_dreq_get_remote_qpn()/cm_dreq_set_remote_qpn().
+ */
+ struct cm_dreq_msg {
+ struct ib_mad_hdr hdr;
+
+ __be32 local_comm_id;
+ __be32 remote_comm_id;
+ /* remote QPN/EECN:24, rsvd:8 */
+ __be32 offset8;
+
+ u8 private_data[IB_CM_DREQ_PRIVATE_DATA_SIZE];
+
+ } __attribute__ ((packed));
+
+ /* Remote QPN/EECN: upper 24 bits of offset8, returned in network byte order. */
+ static inline __be32 cm_dreq_get_remote_qpn(struct cm_dreq_msg *dreq_msg)
+ {
+ u32 word = be32_to_cpu(dreq_msg->offset8);
+
+ return cpu_to_be32(word >> 8);
+ }
+
+ /* Store @qpn in the upper 24 bits of offset8, keeping the low byte. */
+ static inline void cm_dreq_set_remote_qpn(struct cm_dreq_msg *dreq_msg, __be32 qpn)
+ {
+ u32 low_byte = be32_to_cpu(dreq_msg->offset8) & 0x000000FF;
+
+ dreq_msg->offset8 = cpu_to_be32((be32_to_cpu(qpn) << 8) | low_byte);
+ }
+
+ /* Packed layout of a CM DREP message: MAD header, comm ids, private data. */
+ struct cm_drep_msg {
+ struct ib_mad_hdr hdr;
+
+ __be32 local_comm_id;
+ __be32 remote_comm_id;
+
+ u8 private_data[IB_CM_DREP_PRIVATE_DATA_SIZE];
+
+ } __attribute__ ((packed));
+
+ /*
+ * Packed layout of a CM LAP message. Members named offsetNN hold several
+ * sub-fields (listed MSB first) and are accessed through the
+ * cm_lap_get_*()/cm_lap_set_*() helpers.
+ */
+ struct cm_lap_msg {
+ struct ib_mad_hdr hdr;
+
+ __be32 local_comm_id;
+ __be32 remote_comm_id;
+
+ __be32 rsvd8;
+ /* remote QPN/EECN:24, remote CM response timeout:5, rsvd:3 */
+ __be32 offset12;
+ __be32 rsvd16;
+
+ __be16 alt_local_lid;
+ __be16 alt_remote_lid;
+ union ib_gid alt_local_gid;
+ union ib_gid alt_remote_gid;
+ /* flow label:20, rsvd:4, traffic class:8 */
+ __be32 offset56;
+ u8 alt_hop_limit;
+ /* rsvd:2, packet rate:6 */
+ u8 offset61;
+ /* SL:4, subnet local:1, rsvd:3 */
+ u8 offset62;
+ /* local ACK timeout:5, rsvd:3 */
+ u8 offset63;
+
+ u8 private_data[IB_CM_LAP_PRIVATE_DATA_SIZE];
+ } __attribute__ ((packed));
+
+ /* Remote QPN/EECN: upper 24 bits of offset12, returned in network byte order. */
+ static inline __be32 cm_lap_get_remote_qpn(struct cm_lap_msg *lap_msg)
+ {
+ u32 word = be32_to_cpu(lap_msg->offset12);
+
+ return cpu_to_be32(word >> 8);
+ }
+
+ /* Store @qpn in the upper 24 bits of offset12, keeping the low byte. */
+ static inline void cm_lap_set_remote_qpn(struct cm_lap_msg *lap_msg, __be32 qpn)
+ {
+ u32 low_byte = be32_to_cpu(lap_msg->offset12) & 0x000000FF;
+
+ lap_msg->offset12 = cpu_to_be32((be32_to_cpu(qpn) << 8) | low_byte);
+ }
+
+ /* Remote CM response timeout: bits 7:3 of offset12. */
+ static inline u8 cm_lap_get_remote_resp_timeout(struct cm_lap_msg *lap_msg)
+ {
+ u32 word = be32_to_cpu(lap_msg->offset12);
+
+ return (u8) ((word & 0xF8) >> 3);
+ }
+
+ /* Store @resp_timeout in bits 7:3 of offset12, keeping all other bits. */
+ static inline void cm_lap_set_remote_resp_timeout(struct cm_lap_msg *lap_msg,
+ u8 resp_timeout)
+ {
+ u32 rest = be32_to_cpu(lap_msg->offset12) & 0xFFFFFF07;
+
+ lap_msg->offset12 = cpu_to_be32((resp_timeout << 3) | rest);
+ }
+
+ /* Flow label: upper 20 bits of offset56, returned in network byte order. */
+ static inline __be32 cm_lap_get_flow_label(struct cm_lap_msg *lap_msg)
+ {
+ u32 word = be32_to_cpu(lap_msg->offset56);
+
+ return cpu_to_be32(word >> 12);
+ }
+
+ /* Store @flow_label in the upper 20 bits of offset56, keeping the low 12 bits. */
+ static inline void cm_lap_set_flow_label(struct cm_lap_msg *lap_msg,
+ __be32 flow_label)
+ {
+ u32 low_bits = be32_to_cpu(lap_msg->offset56) & 0x00000FFF;
+
+ lap_msg->offset56 = cpu_to_be32(low_bits |
+ (be32_to_cpu(flow_label) << 12));
+ }
+
+ /* Traffic class: low byte of offset56. */
+ static inline u8 cm_lap_get_traffic_class(struct cm_lap_msg *lap_msg)
+ {
+ u32 word = be32_to_cpu(lap_msg->offset56);
+
+ return (u8) word;
+ }
+
+ /* Store @traffic_class in the low byte of offset56, keeping the upper 24 bits. */
+ static inline void cm_lap_set_traffic_class(struct cm_lap_msg *lap_msg,
+ u8 traffic_class)
+ {
+ u32 upper = be32_to_cpu(lap_msg->offset56) & 0xFFFFFF00;
+
+ lap_msg->offset56 = cpu_to_be32(traffic_class | upper);
+ }
+
+ /* Packet rate: low 6 bits of offset61. */
+ static inline u8 cm_lap_get_packet_rate(struct cm_lap_msg *lap_msg)
+ {
+ u8 byte = lap_msg->offset61;
+
+ return byte & 0x3F;
+ }
+
+ /* Store the low 6 bits of @packet_rate in offset61, keeping its top 2 bits. */
+ static inline void cm_lap_set_packet_rate(struct cm_lap_msg *lap_msg,
+ u8 packet_rate)
+ {
+ u8 top_bits = lap_msg->offset61 & 0xC0;
+
+ lap_msg->offset61 = (packet_rate & 0x3F) | top_bits;
+ }
+
+ /* SL: upper nibble of offset62. */
+ static inline u8 cm_lap_get_sl(struct cm_lap_msg *lap_msg)
+ {
+ u8 byte = lap_msg->offset62;
+
+ return byte >> 4;
+ }
+
+ /* Store @sl in the upper nibble of offset62, keeping the lower nibble. */
+ static inline void cm_lap_set_sl(struct cm_lap_msg *lap_msg, u8 sl)
+ {
+ u8 low_nibble = lap_msg->offset62 & 0x0F;
+
+ lap_msg->offset62 = (sl << 4) | low_nibble;
+ }
+
+ /* Subnet-local flag: bit 3 of offset62. */
+ static inline u8 cm_lap_get_subnet_local(struct cm_lap_msg *lap_msg)
+ {
+ u8 byte = lap_msg->offset62;
+
+ return (byte >> 3) & 0x1;
+ }
+
+ /*
+ * Store the low bit of @subnet_local in bit 3 of offset62.
+ *
+ * The remaining bits (SL in the upper nibble, reserved in bits 2:0) must be
+ * preserved from offset62 itself. Taking them from offset61 would overwrite
+ * the SL with packet-rate bits.
+ */
+ static inline void cm_lap_set_subnet_local(struct cm_lap_msg *lap_msg,
+ u8 subnet_local)
+ {
+ lap_msg->offset62 = ((subnet_local & 0x1) << 3) |
+ (lap_msg->offset62 & 0xF7);
+ }
+ /* Local ACK timeout: upper 5 bits of offset63. */
+ static inline u8 cm_lap_get_local_ack_timeout(struct cm_lap_msg *lap_msg)
+ {
+ u8 byte = lap_msg->offset63;
+
+ return byte >> 3;
+ }
+
+ /* Store @local_ack_timeout in the upper 5 bits of offset63. */
+ static inline void cm_lap_set_local_ack_timeout(struct cm_lap_msg *lap_msg,
+ u8 local_ack_timeout)
+ {
+ u8 low_bits = lap_msg->offset63 & 0x07;
+
+ lap_msg->offset63 = (local_ack_timeout << 3) | low_bits;
+ }
+
+ /*
+ * Packed layout of a CM APR message: MAD header, comm ids, a status byte,
+ * an info area with its length, and private data.
+ */
+ struct cm_apr_msg {
+ struct ib_mad_hdr hdr;
+
+ __be32 local_comm_id;
+ __be32 remote_comm_id;
+
+ u8 info_length;
+ u8 ap_status;
+ __be16 rsvd;
+ u8 info[IB_CM_APR_INFO_LENGTH];
+
+ u8 private_data[IB_CM_APR_PRIVATE_DATA_SIZE];
+ } __attribute__ ((packed));
+
+ /* Packed layout of a CM SIDR REQ message. */
+ struct cm_sidr_req_msg {
+ struct ib_mad_hdr hdr;
+
+ __be32 request_id;
+ __be16 pkey;
+ __be16 rsvd;
+ __be64 service_id;
+
+ u32 private_data[IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE / sizeof(u32)];
+ } __attribute__ ((packed));
+
+ /*
+ * Packed layout of a CM SIDR REP message. The QPN in offset8 is accessed
+ * through cm_sidr_rep_get_qpn()/cm_sidr_rep_set_qpn().
+ */
+ struct cm_sidr_rep_msg {
+ struct ib_mad_hdr hdr;
+
+ __be32 request_id;
+ u8 status;
+ u8 info_length;
+ __be16 rsvd;
+ /* QPN:24, rsvd:8 */
+ __be32 offset8;
+ __be64 service_id;
+ __be32 qkey;
+ u8 info[IB_CM_SIDR_REP_INFO_LENGTH];
+
+ u8 private_data[IB_CM_SIDR_REP_PRIVATE_DATA_SIZE];
+ } __attribute__ ((packed));
+
+ /* QPN: upper 24 bits of offset8, returned in network byte order. */
+ static inline __be32 cm_sidr_rep_get_qpn(struct cm_sidr_rep_msg *sidr_rep_msg)
+ {
+ u32 word = be32_to_cpu(sidr_rep_msg->offset8);
+
+ return cpu_to_be32(word >> 8);
+ }
+
+ /* Store @qpn in the upper 24 bits of offset8, keeping the low byte. */
+ static inline void cm_sidr_rep_set_qpn(struct cm_sidr_rep_msg *sidr_rep_msg,
+ __be32 qpn)
+ {
+ u32 low_byte = be32_to_cpu(sidr_rep_msg->offset8) & 0x000000FF;
+
+ sidr_rep_msg->offset8 = cpu_to_be32((be32_to_cpu(qpn) << 8) | low_byte);
+ }
+
+#endif /* CM_MSGS_H */
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
new file mode 100644
index 000000000..38ffe0981
--- /dev/null
+++ b/drivers/infiniband/core/cma.c
@@ -0,0 +1,3720 @@
+/*
+ * Copyright (c) 2005 Voltaire Inc. All rights reserved.
+ * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved.
+ * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved.
+ * Copyright (c) 2005-2006 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/completion.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/mutex.h>
+#include <linux/random.h>
+#include <linux/idr.h>
+#include <linux/inetdevice.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <net/route.h>
+
+#include <net/tcp.h>
+#include <net/ipv6.h>
+
+#include <rdma/rdma_cm.h>
+#include <rdma/rdma_cm_ib.h>
+#include <rdma/rdma_netlink.h>
+#include <rdma/ib.h>
+#include <rdma/ib_cache.h>
+#include <rdma/ib_cm.h>
+#include <rdma/ib_sa.h>
+#include <rdma/iw_cm.h>
+
+MODULE_AUTHOR("Sean Hefty");
+MODULE_DESCRIPTION("Generic RDMA CM Agent");
+MODULE_LICENSE("Dual BSD/GPL");
+
+#define CMA_CM_RESPONSE_TIMEOUT 20
+#define CMA_MAX_CM_RETRIES 15
+#define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24)
+#define CMA_IBOE_PACKET_LIFETIME 18
+
+static void cma_add_one(struct ib_device *device);
+static void cma_remove_one(struct ib_device *device);
+
+ /* IB client named "cma", with cma_add_one()/cma_remove_one() as callbacks. */
+ static struct ib_client cma_client = {
+ .name = "cma",
+ .add = cma_add_one,
+ .remove = cma_remove_one
+ };
+
+ /* File-scope state of the RDMA CM agent. */
+ static struct ib_sa_client sa_client;
+ static struct rdma_addr_client addr_client;
+ /* List of struct cma_device, linked through cma_device.list. */
+ static LIST_HEAD(dev_list);
+ static LIST_HEAD(listen_any_list);
+ /* Held while walking dev_list and while editing a device's id_list. */
+ static DEFINE_MUTEX(lock);
+ static struct workqueue_struct *cma_wq;
+ /* NOTE(review): presumably one idr per RDMA port space -- confirm against users. */
+ static DEFINE_IDR(tcp_ps);
+ static DEFINE_IDR(udp_ps);
+ static DEFINE_IDR(ipoib_ps);
+ static DEFINE_IDR(ib_ps);
+
+ /* Per ib_device state kept by the CMA. */
+ struct cma_device {
+ struct list_head list; /* entry in dev_list */
+ struct ib_device *device;
+ struct completion comp; /* completed when refcount drops to zero */
+ atomic_t refcount; /* raised once per attached rdma_id_private */
+ struct list_head id_list; /* rdma_id_private.list entries attached here */
+ };
+
+ /*
+ * A bound port within a port-space idr.
+ * NOTE(review): owners presumably chains rdma_id_private.node entries that
+ * share this port -- confirm against the bind code.
+ */
+ struct rdma_bind_list {
+ struct idr *ps;
+ struct hlist_head owners;
+ unsigned short port;
+ };
+
+ /* NOTE(review): presumably bit numbers for rdma_id_private.options -- confirm. */
+ enum {
+ CMA_OPTION_AFONLY,
+ };
+
+/*
+ * Device removal can occur at anytime, so we need extra handling to
+ * serialize notifying the user of device removal with other callbacks.
+ * We do this by disabling removal notification while a callback is in process,
+ * and reporting it after the callback completes.
+ */
+ /*
+ * CMA-private wrapper around the public struct rdma_cm_id, which is
+ * embedded as the first member.
+ */
+ struct rdma_id_private {
+ struct rdma_cm_id id;
+
+ struct rdma_bind_list *bind_list;
+ struct hlist_node node;
+ struct list_head list; /* listen_any_list or cma_device.id_list */
+ struct list_head listen_list; /* per device listens */
+ struct cma_device *cma_dev;
+ struct list_head mc_list;
+
+ int internal_id;
+ enum rdma_cm_state state; /* read and updated under lock */
+ spinlock_t lock;
+ struct mutex qp_mutex;
+
+ struct completion comp;
+ atomic_t refcount;
+ struct mutex handler_mutex;
+
+ int backlog;
+ int timeout_ms;
+ struct ib_sa_query *query;
+ int query_id;
+ union {
+ struct ib_cm_id *ib;
+ struct iw_cm_id *iw;
+ } cm_id;
+
+ u32 seq_num;
+ u32 qkey;
+ u32 qp_num;
+ pid_t owner;
+ u32 options;
+ u8 srq;
+ u8 tos;
+ u8 reuseaddr;
+ u8 afonly;
+ };
+
+ /*
+ * A multicast membership tied to an rdma_id_private. Reference counted
+ * through mcref; release_mc() frees both multicast.ib and this structure.
+ */
+ struct cma_multicast {
+ struct rdma_id_private *id_priv;
+ union {
+ struct ib_sa_multicast *ib;
+ } multicast;
+ struct list_head list;
+ void *context;
+ struct sockaddr_storage addr;
+ struct kref mcref;
+ };
+
+ /*
+ * Deferred work item carrying an event for an id.
+ * NOTE(review): old_state/new_state look like a guarded state transition
+ * applied by the work handler -- confirm against the handler.
+ */
+ struct cma_work {
+ struct work_struct work;
+ struct rdma_id_private *id;
+ enum rdma_cm_state old_state;
+ enum rdma_cm_state new_state;
+ struct rdma_cm_event event;
+ };
+
+ /* Deferred work item carrying an event for an id, with no state transition. */
+ struct cma_ndev_work {
+ struct work_struct work;
+ struct rdma_id_private *id;
+ struct rdma_cm_event event;
+ };
+
+ /* Deferred work item pairing an id with one of its multicast entries. */
+ struct iboe_mcast_work {
+ struct work_struct work;
+ struct rdma_id_private *id;
+ struct cma_multicast *mc;
+ };
+
+ /*
+ * 16-byte address slot: either a full IPv6 address, or an IPv4 address
+ * held in the last 4 bytes after 12 bytes of padding.
+ */
+ union cma_ip_addr {
+ struct in6_addr ip6;
+ struct {
+ __be32 pad[3];
+ __be32 addr;
+ } ip4;
+ };
+
+ /*
+ * CMA header: version, IP version, port and the source and destination
+ * addresses. The IP version occupies the upper nibble of ip_version; see
+ * cma_get_ip_ver()/cma_set_ip_ver().
+ */
+ struct cma_hdr {
+ u8 cma_version;
+ u8 ip_version; /* IP version: 7:4 */
+ __be16 port;
+ union cma_ip_addr src_addr;
+ union cma_ip_addr dst_addr;
+ };
+
+#define CMA_VERSION 0x00
+
+ /* Return non-zero if the id's state equals @comp, sampled under id_priv->lock. */
+ static int cma_comp(struct rdma_id_private *id_priv, enum rdma_cm_state comp)
+ {
+ unsigned long irq_flags;
+ int matches;
+
+ spin_lock_irqsave(&id_priv->lock, irq_flags);
+ matches = (comp == id_priv->state);
+ spin_unlock_irqrestore(&id_priv->lock, irq_flags);
+
+ return matches;
+ }
+
+ /*
+ * Compare-and-exchange on the id's state under id_priv->lock: if the state
+ * equals @comp it becomes @exch. Returns non-zero when the swap happened.
+ */
+ static int cma_comp_exch(struct rdma_id_private *id_priv,
+ enum rdma_cm_state comp, enum rdma_cm_state exch)
+ {
+ unsigned long irq_flags;
+ int swapped;
+
+ spin_lock_irqsave(&id_priv->lock, irq_flags);
+ swapped = (id_priv->state == comp);
+ if (swapped)
+ id_priv->state = exch;
+ spin_unlock_irqrestore(&id_priv->lock, irq_flags);
+
+ return swapped;
+ }
+
+ /* Set the id's state to @exch under id_priv->lock and return the old state. */
+ static enum rdma_cm_state cma_exch(struct rdma_id_private *id_priv,
+ enum rdma_cm_state exch)
+ {
+ enum rdma_cm_state previous;
+ unsigned long irq_flags;
+
+ spin_lock_irqsave(&id_priv->lock, irq_flags);
+ previous = id_priv->state;
+ id_priv->state = exch;
+ spin_unlock_irqrestore(&id_priv->lock, irq_flags);
+
+ return previous;
+ }
+
+ /* IP version: upper nibble of hdr->ip_version. */
+ static inline u8 cma_get_ip_ver(struct cma_hdr *hdr)
+ {
+ u8 byte = hdr->ip_version;
+
+ return byte >> 4;
+ }
+
+ /* Store @ip_ver in the upper nibble of hdr->ip_version, keeping the lower nibble. */
+ static inline void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver)
+ {
+ u8 low_nibble = hdr->ip_version & 0xF;
+
+ hdr->ip_version = (ip_ver << 4) | low_nibble;
+ }
+
+ /*
+ * Bind @id_priv to @cma_dev: take a device reference, record the device
+ * and its transport in the id, and append the id to the device's id_list.
+ */
+ static void cma_attach_to_dev(struct rdma_id_private *id_priv,
+ struct cma_device *cma_dev)
+ {
+ struct ib_device *device = cma_dev->device;
+
+ atomic_inc(&cma_dev->refcount);
+ id_priv->cma_dev = cma_dev;
+ id_priv->id.device = device;
+ id_priv->id.route.addr.dev_addr.transport =
+ rdma_node_get_transport(device->node_type);
+ list_add_tail(&id_priv->list, &cma_dev->id_list);
+ }
+
+ /* Drop one device reference; the last put signals cma_dev->comp. */
+ static inline void cma_deref_dev(struct cma_device *cma_dev)
+ {
+ if (!atomic_dec_and_test(&cma_dev->refcount))
+ return;
+
+ complete(&cma_dev->comp);
+ }
+
+ /* kref release callback: free the multicast entry and its ib member. */
+ static inline void release_mc(struct kref *kref)
+ {
+ struct cma_multicast *entry;
+
+ entry = container_of(kref, struct cma_multicast, mcref);
+ kfree(entry->multicast.ib);
+ kfree(entry);
+ }
+
+ /*
+ * Detach @id_priv from its device under the global lock: unlink it from
+ * the list it is on, drop the device reference and clear the back pointer.
+ */
+ static void cma_release_dev(struct rdma_id_private *id_priv)
+ {
+ struct cma_device *attached;
+
+ mutex_lock(&lock);
+ attached = id_priv->cma_dev;
+ list_del(&id_priv->list);
+ cma_deref_dev(attached);
+ id_priv->cma_dev = NULL;
+ mutex_unlock(&lock);
+ }
+
+ /* Source address of the id, viewed as a generic struct sockaddr. */
+ static inline struct sockaddr *cma_src_addr(struct rdma_id_private *id_priv)
+ {
+ struct rdma_addr *addr = &id_priv->id.route.addr;
+
+ return (struct sockaddr *) &addr->src_addr;
+ }
+
+ /* Destination address of the id, viewed as a generic struct sockaddr. */
+ static inline struct sockaddr *cma_dst_addr(struct rdma_id_private *id_priv)
+ {
+ struct rdma_addr *addr = &id_priv->id.route.addr;
+
+ return (struct sockaddr *) &addr->dst_addr;
+ }
+
+ /* Address family of the id, taken from its source address. */
+ static inline unsigned short cma_family(struct rdma_id_private *id_priv)
+ {
+ struct rdma_addr *addr = &id_priv->id.route.addr;
+
+ return addr->src_addr.ss_family;
+ }
+
+ /*
+ * Settle the id's qkey.
+ *
+ * A qkey that is already set is kept; a different non-zero @qkey is then
+ * rejected with -EINVAL. Otherwise a non-zero @qkey is adopted as is. With
+ * @qkey == 0 a default is chosen by port space: RDMA_UDP_QKEY for UDP and
+ * IB, the qkey of the multicast member record for IPoIB, and nothing for
+ * any other port space.
+ *
+ * Returns 0 on success, -EINVAL on a conflict, or the error from
+ * ib_sa_get_mcmember_rec().
+ */
+ static int cma_set_qkey(struct rdma_id_private *id_priv, u32 qkey)
+ {
+ struct ib_sa_mcmember_rec rec;
+ int ret;
+
+ if (id_priv->qkey)
+ return (qkey && id_priv->qkey != qkey) ? -EINVAL : 0;
+
+ if (qkey) {
+ id_priv->qkey = qkey;
+ return 0;
+ }
+
+ if (id_priv->id.ps == RDMA_PS_UDP || id_priv->id.ps == RDMA_PS_IB) {
+ id_priv->qkey = RDMA_UDP_QKEY;
+ return 0;
+ }
+
+ if (id_priv->id.ps != RDMA_PS_IPOIB)
+ return 0;
+
+ ib_addr_get_mgid(&id_priv->id.route.addr.dev_addr, &rec.mgid);
+ ret = ib_sa_get_mcmember_rec(id_priv->id.device,
+ id_priv->id.port_num, &rec.mgid,
+ &rec);
+ if (!ret)
+ id_priv->qkey = be32_to_cpu(rec.qkey);
+ return ret;
+ }
+
+ /*
+ * Fill @dev_addr from an AF_IB address: InfiniBand device type, source GID
+ * taken from sib_addr and pkey taken from sib_pkey.
+ */
+ static void cma_translate_ib(struct sockaddr_ib *sib, struct rdma_dev_addr *dev_addr)
+ {
+ union ib_gid *src_gid = (union ib_gid *) &sib->sib_addr;
+
+ dev_addr->dev_type = ARPHRD_INFINIBAND;
+ rdma_addr_set_sgid(dev_addr, src_gid);
+ ib_addr_set_pkey(dev_addr, ntohs(sib->sib_pkey));
+ }
+
+ /*
+ * Translate @addr into @dev_addr. AF_IB addresses are handled locally and
+ * always succeed; every other family goes through rdma_translate_ip() and
+ * returns its result.
+ */
+ static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_addr)
+ {
+ if (addr->sa_family == AF_IB) {
+ cma_translate_ib((struct sockaddr_ib *) addr, dev_addr);
+ return 0;
+ }
+
+ return rdma_translate_ip(addr, dev_addr, NULL);
+ }
+
+ /*
+ * Find the device and port whose cached GID table holds the id's source
+ * GID and attach the id to that device.
+ *
+ * @id_priv: id to attach; its dev_addr must already be resolved.
+ * @listen_id_priv: optional listening id; its device/port is tried first
+ * and then skipped during the full scan.
+ *
+ * For ports of an IB-transport device with an Ethernet link layer the GID
+ * derived from the source IP address (iboe_gid) is looked up; otherwise
+ * the GID stored in dev_addr is used. A port only matches when the lookup
+ * reports that same port.
+ *
+ * Returns 0 and sets id_priv->id.port_num on success, -EINVAL when the
+ * IPoIB port space is requested on a non-InfiniBand link, or a negative
+ * error (-ENODEV by default) when nothing matches.
+ */
+ static int cma_acquire_dev(struct rdma_id_private *id_priv,
+ struct rdma_id_private *listen_id_priv)
+ {
+ struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+ struct cma_device *cma_dev;
+ union ib_gid gid, iboe_gid;
+ int ret = -ENODEV;
+ u8 port, found_port;
+ enum rdma_link_layer dev_ll = dev_addr->dev_type == ARPHRD_INFINIBAND ?
+ IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET;
+
+ if (dev_ll != IB_LINK_LAYER_INFINIBAND &&
+ id_priv->id.ps == RDMA_PS_IPOIB)
+ return -EINVAL;
+
+ mutex_lock(&lock);
+ rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr,
+ &iboe_gid);
+
+ memcpy(&gid, dev_addr->src_dev_addr +
+ rdma_addr_gid_offset(dev_addr), sizeof gid);
+ /* Fast path: try the listener's own device and port first. */
+ if (listen_id_priv &&
+ rdma_port_get_link_layer(listen_id_priv->id.device,
+ listen_id_priv->id.port_num) == dev_ll) {
+ cma_dev = listen_id_priv->cma_dev;
+ port = listen_id_priv->id.port_num;
+ if (rdma_node_get_transport(cma_dev->device->node_type) == RDMA_TRANSPORT_IB &&
+ rdma_port_get_link_layer(cma_dev->device, port) == IB_LINK_LAYER_ETHERNET)
+ ret = ib_find_cached_gid(cma_dev->device, &iboe_gid,
+ &found_port, NULL);
+ else
+ ret = ib_find_cached_gid(cma_dev->device, &gid,
+ &found_port, NULL);
+
+ if (!ret && (port == found_port)) {
+ id_priv->id.port_num = found_port;
+ goto out;
+ }
+ }
+ list_for_each_entry(cma_dev, &dev_list, list) {
+ for (port = 1; port <= cma_dev->device->phys_port_cnt; ++port) {
+ /* Already tried above. */
+ if (listen_id_priv &&
+ listen_id_priv->cma_dev == cma_dev &&
+ listen_id_priv->id.port_num == port)
+ continue;
+ if (rdma_port_get_link_layer(cma_dev->device, port) == dev_ll) {
+ if (rdma_node_get_transport(cma_dev->device->node_type) == RDMA_TRANSPORT_IB &&
+ rdma_port_get_link_layer(cma_dev->device, port) == IB_LINK_LAYER_ETHERNET)
+ ret = ib_find_cached_gid(cma_dev->device, &iboe_gid, &found_port, NULL);
+ else
+ ret = ib_find_cached_gid(cma_dev->device, &gid, &found_port, NULL);
+
+ if (!ret && (port == found_port)) {
+ id_priv->id.port_num = found_port;
+ goto out;
+ }
+ }
+ }
+ }
+
+ /*
+ * Nothing matched. ret may still be 0 from a lookup that found the GID
+ * on a different port, and cma_dev is no longer a valid list entry here,
+ * so make sure the attach below is skipped.
+ */
+ if (!ret)
+ ret = -ENODEV;
+
+ out:
+ if (!ret)
+ cma_attach_to_dev(id_priv, cma_dev);
+
+ mutex_unlock(&lock);
+ return ret;
+ }
+
+/*
+ * Select the source IB device and address to reach the destination IB address.
+ *
+ * Every IB-transport device on dev_list is examined, and on each device
+ * every port whose cached P_Key table contains the destination P_Key.  A
+ * cached GID identical to the destination GID is chosen immediately;
+ * otherwise the first cached GID sharing the destination's subnet prefix
+ * is remembered as the fallback.  On success the id is attached to the
+ * chosen device, the chosen GID is written into the source address and the
+ * device address is filled in from that source address.
+ *
+ * Returns 0 on success or -ENODEV when no port qualifies.
+ *
+ * NOTE(review): dev_list is walked here without taking "lock", unlike
+ * cma_acquire_dev(); presumably callers serialize against device removal
+ * some other way - confirm.
+ */
+static int cma_resolve_ib_dev(struct rdma_id_private *id_priv)
+{
+ struct cma_device *cma_dev, *cur_dev;
+ struct sockaddr_ib *addr;
+ union ib_gid gid, sgid, *dgid;
+ u16 pkey, index;
+ u8 p;
+ int i;
+
+ cma_dev = NULL;
+ addr = (struct sockaddr_ib *) cma_dst_addr(id_priv);
+ dgid = (union ib_gid *) &addr->sib_addr;
+ pkey = ntohs(addr->sib_pkey);
+
+ list_for_each_entry(cur_dev, &dev_list, list) {
+ if (rdma_node_get_transport(cur_dev->device->node_type) != RDMA_TRANSPORT_IB)
+ continue;
+
+ for (p = 1; p <= cur_dev->device->phys_port_cnt; ++p) {
+ /* Skip ports that do not carry the destination P_Key. */
+ if (ib_find_cached_pkey(cur_dev->device, p, pkey, &index))
+ continue;
+
+ /* Walk the cached GID table until the lookup reports an error. */
+ for (i = 0; !ib_get_cached_gid(cur_dev->device, p, i, &gid); i++) {
+ /* Exact GID match: stop searching. */
+ if (!memcmp(&gid, dgid, sizeof(gid))) {
+ cma_dev = cur_dev;
+ sgid = gid;
+ id_priv->id.port_num = p;
+ goto found;
+ }
+
+ /* Keep only the first same-subnet GID as the fallback. */
+ if (!cma_dev && (gid.global.subnet_prefix ==
+ dgid->global.subnet_prefix)) {
+ cma_dev = cur_dev;
+ sgid = gid;
+ id_priv->id.port_num = p;
+ }
+ }
+ }
+ }
+
+ if (!cma_dev)
+ return -ENODEV;
+
+found:
+ cma_attach_to_dev(id_priv, cma_dev);
+ /* From here on addr refers to the source address, not the destination. */
+ addr = (struct sockaddr_ib *) cma_src_addr(id_priv);
+ memcpy(&addr->sib_addr, &sgid, sizeof sgid);
+ cma_translate_ib(addr, &id_priv->id.route.addr.dev_addr);
+ return 0;
+}
+
+/*
+ * Drop one reference on @id_priv; when the count reaches zero, signal the
+ * completion that rdma_destroy_id() waits on.
+ */
+static void cma_deref_id(struct rdma_id_private *id_priv)
+{
+	if (!atomic_dec_and_test(&id_priv->refcount))
+		return;
+
+	complete(&id_priv->comp);
+}
+
+/*
+ * Take the handler mutex and verify that @id_priv is in @state.
+ *
+ * Returns 0 with handler_mutex still held when the state matches; the
+ * caller must release it.  Returns -EINVAL with the mutex already released
+ * when the state differs.
+ */
+static int cma_disable_callback(struct rdma_id_private *id_priv,
+				enum rdma_cm_state state)
+{
+	mutex_lock(&id_priv->handler_mutex);
+	if (id_priv->state == state)
+		return 0;
+
+	mutex_unlock(&id_priv->handler_mutex);
+	return -EINVAL;
+}
+
+/*
+ * Allocate and initialise a new RDMA CM identifier.
+ *
+ * @event_handler: callback stored in the new id
+ * @context:       opaque pointer stored in the new id
+ * @ps:            port space of the id
+ * @qp_type:       QP type of the id
+ *
+ * The id starts in RDMA_CM_IDLE with a reference count of one, records the
+ * pid of the calling task and gets a random initial sequence number.
+ *
+ * Returns the embedded rdma_cm_id, or ERR_PTR(-ENOMEM) when the allocation
+ * fails.
+ */
+struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler,
+				  void *context, enum rdma_port_space ps,
+				  enum ib_qp_type qp_type)
+{
+	struct rdma_id_private *priv;
+	struct rdma_cm_id *id;
+
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return ERR_PTR(-ENOMEM);
+
+	/* Caller-visible part of the identifier. */
+	id = &priv->id;
+	id->event_handler = event_handler;
+	id->context = context;
+	id->ps = ps;
+	id->qp_type = qp_type;
+
+	/* Private bookkeeping. */
+	priv->owner = task_pid_nr(current);
+	priv->state = RDMA_CM_IDLE;
+	atomic_set(&priv->refcount, 1);
+	init_completion(&priv->comp);
+
+	spin_lock_init(&priv->lock);
+	mutex_init(&priv->qp_mutex);
+	mutex_init(&priv->handler_mutex);
+
+	INIT_LIST_HEAD(&priv->listen_list);
+	INIT_LIST_HEAD(&priv->mc_list);
+
+	get_random_bytes(&priv->seq_num, sizeof(priv->seq_num));
+
+	return id;
+}
+EXPORT_SYMBOL(rdma_create_id);
+
+/*
+ * Drive a freshly created UD QP through INIT, RTR and RTS.
+ *
+ * The INIT attributes come from rdma_init_qp_attr(); the RTR step changes
+ * only the state and the RTS step additionally sets the send PSN to 0.
+ * Returns 0 on success or the first error encountered.
+ */
+static int cma_init_ud_qp(struct rdma_id_private *id_priv, struct ib_qp *qp)
+{
+	struct ib_qp_attr attr;
+	int mask, ret;
+
+	attr.qp_state = IB_QPS_INIT;
+	ret = rdma_init_qp_attr(&id_priv->id, &attr, &mask);
+	if (!ret)
+		ret = ib_modify_qp(qp, &attr, mask);
+	if (ret)
+		return ret;
+
+	attr.qp_state = IB_QPS_RTR;
+	ret = ib_modify_qp(qp, &attr, IB_QP_STATE);
+	if (ret)
+		return ret;
+
+	attr.sq_psn = 0;
+	attr.qp_state = IB_QPS_RTS;
+	return ib_modify_qp(qp, &attr, IB_QP_STATE | IB_QP_SQ_PSN);
+}
+
+/*
+ * Move a freshly created connected-mode QP to the INIT state using the
+ * attributes computed by rdma_init_qp_attr().  Returns 0 or an error.
+ */
+static int cma_init_conn_qp(struct rdma_id_private *id_priv, struct ib_qp *qp)
+{
+	struct ib_qp_attr attr;
+	int mask, ret;
+
+	attr.qp_state = IB_QPS_INIT;
+	ret = rdma_init_qp_attr(&id_priv->id, &attr, &mask);
+	if (!ret)
+		ret = ib_modify_qp(qp, &attr, mask);
+
+	return ret;
+}
+
+/*
+ * Create a QP on @pd for @id and bring it to its initial state.
+ *
+ * UD ids get a QP taken all the way to RTS; every other QP type is only
+ * moved to INIT.  On success the QP, its number and whether it uses an SRQ
+ * are recorded in the id.
+ *
+ * Returns 0 on success, -EINVAL when @pd belongs to a different device than
+ * @id, the error from ib_create_qp(), or the error from the state
+ * transition (in which case the QP is destroyed again).
+ */
+int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd,
+		   struct ib_qp_init_attr *qp_init_attr)
+{
+	struct rdma_id_private *id_priv;
+	struct ib_qp *qp;
+	int ret;
+
+	if (id->device != pd->device)
+		return -EINVAL;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+
+	qp = ib_create_qp(pd, qp_init_attr);
+	if (IS_ERR(qp))
+		return PTR_ERR(qp);
+
+	ret = (id->qp_type == IB_QPT_UD) ? cma_init_ud_qp(id_priv, qp) :
+					   cma_init_conn_qp(id_priv, qp);
+	if (ret) {
+		ib_destroy_qp(qp);
+		return ret;
+	}
+
+	id->qp = qp;
+	id_priv->qp_num = qp->qp_num;
+	id_priv->srq = (qp->srq != NULL);
+	return 0;
+}
+EXPORT_SYMBOL(rdma_create_qp);
+
+/*
+ * Destroy the QP associated with @id and clear the id's QP pointer, all
+ * under the id's qp_mutex.
+ */
+void rdma_destroy_qp(struct rdma_cm_id *id)
+{
+	struct rdma_id_private *priv =
+		container_of(id, struct rdma_id_private, id);
+
+	mutex_lock(&priv->qp_mutex);
+	ib_destroy_qp(id->qp);
+	id->qp = NULL;
+	mutex_unlock(&priv->qp_mutex);
+}
+EXPORT_SYMBOL(rdma_destroy_qp);
+
+/*
+ * Move the id's QP to INIT (refreshing its attributes) and then to RTR.
+ *
+ * Runs under qp_mutex.  Does nothing and returns 0 when the id has no QP.
+ * When @conn_param is given, its responder_resources value overrides
+ * max_dest_rd_atomic for the RTR transition.  On an IB-transport device
+ * with an Ethernet link layer the source MAC is looked up from the source
+ * GID first.  Returns 0 or the first error encountered.
+ */
+static int cma_modify_qp_rtr(struct rdma_id_private *id_priv,
+ struct rdma_conn_param *conn_param)
+{
+ struct ib_qp_attr qp_attr;
+ int qp_attr_mask, ret;
+ union ib_gid sgid;
+
+ mutex_lock(&id_priv->qp_mutex);
+ if (!id_priv->id.qp) {
+ ret = 0;
+ goto out;
+ }
+
+ /* Need to update QP attributes from default values. */
+ qp_attr.qp_state = IB_QPS_INIT;
+ ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
+ if (ret)
+ goto out;
+
+ ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
+ if (ret)
+ goto out;
+
+ qp_attr.qp_state = IB_QPS_RTR;
+ ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
+ if (ret)
+ goto out;
+
+ /*
+ * NOTE(review): qp_attr is an uninitialised stack object, so
+ * ah_attr.grh.sgid_index is only meaningful if rdma_init_qp_attr()
+ * filled it in for the RTR state - confirm for every path that can
+ * reach this function.
+ */
+ ret = ib_query_gid(id_priv->id.device, id_priv->id.port_num,
+ qp_attr.ah_attr.grh.sgid_index, &sgid);
+ if (ret)
+ goto out;
+
+ if (rdma_node_get_transport(id_priv->cma_dev->device->node_type)
+ == RDMA_TRANSPORT_IB &&
+ rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num)
+ == IB_LINK_LAYER_ETHERNET) {
+ ret = rdma_addr_find_smac_by_sgid(&sgid, qp_attr.smac, NULL);
+
+ if (ret)
+ goto out;
+ }
+ if (conn_param)
+ qp_attr.max_dest_rd_atomic = conn_param->responder_resources;
+ ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
+out:
+ mutex_unlock(&id_priv->qp_mutex);
+ return ret;
+}
+
+/*
+ * Move the id's QP to RTS under qp_mutex.
+ *
+ * Does nothing and returns 0 when the id has no QP.  When @conn_param is
+ * given, its initiator_depth overrides max_rd_atomic.  Returns 0 or the
+ * error from computing or applying the attributes.
+ */
+static int cma_modify_qp_rts(struct rdma_id_private *id_priv,
+			     struct rdma_conn_param *conn_param)
+{
+	struct ib_qp_attr attr;
+	int mask;
+	int ret = 0;
+
+	mutex_lock(&id_priv->qp_mutex);
+	if (id_priv->id.qp) {
+		attr.qp_state = IB_QPS_RTS;
+		ret = rdma_init_qp_attr(&id_priv->id, &attr, &mask);
+		if (!ret) {
+			if (conn_param)
+				attr.max_rd_atomic = conn_param->initiator_depth;
+			ret = ib_modify_qp(id_priv->id.qp, &attr, mask);
+		}
+	}
+	mutex_unlock(&id_priv->qp_mutex);
+
+	return ret;
+}
+
+/*
+ * Put the id's QP into the error state under qp_mutex.  Returns 0 when the
+ * id has no QP, otherwise the result of ib_modify_qp().
+ */
+static int cma_modify_qp_err(struct rdma_id_private *id_priv)
+{
+	struct ib_qp_attr attr;
+	int ret = 0;
+
+	mutex_lock(&id_priv->qp_mutex);
+	if (id_priv->id.qp) {
+		attr.qp_state = IB_QPS_ERR;
+		ret = ib_modify_qp(id_priv->id.qp, &attr, IB_QP_STATE);
+	}
+	mutex_unlock(&id_priv->qp_mutex);
+
+	return ret;
+}
+
+/*
+ * Compute QP attributes for an IB-transport id without going through the
+ * IB CM.
+ *
+ * Sets the P_Key index and port number, then either the Q_Key (UD ids) or
+ * empty access flags (all other QP types), and reports the matching
+ * attribute mask through @qp_attr_mask.  On a non-InfiniBand link layer
+ * the P_Key 0xffff is used instead of the one stored in the device
+ * address.  Returns 0 or the error from the P_Key lookup / Q_Key setup.
+ */
+static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv,
+			       struct ib_qp_attr *qp_attr, int *qp_attr_mask)
+{
+	struct rdma_cm_id *id = &id_priv->id;
+	u16 pkey;
+	int ret;
+
+	if (rdma_port_get_link_layer(id->device, id->port_num) !=
+	    IB_LINK_LAYER_INFINIBAND)
+		pkey = 0xffff;
+	else
+		pkey = ib_addr_get_pkey(&id->route.addr.dev_addr);
+
+	ret = ib_find_cached_pkey(id->device, id->port_num, pkey,
+				  &qp_attr->pkey_index);
+	if (ret)
+		return ret;
+
+	qp_attr->port_num = id->port_num;
+	*qp_attr_mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT;
+
+	if (id->qp_type != IB_QPT_UD) {
+		qp_attr->qp_access_flags = 0;
+		*qp_attr_mask |= IB_QP_ACCESS_FLAGS;
+		return 0;
+	}
+
+	/* UD: make sure a Q_Key has been chosen before reporting it. */
+	ret = cma_set_qkey(id_priv, 0);
+	if (ret)
+		return ret;
+
+	qp_attr->qkey = id_priv->qkey;
+	*qp_attr_mask |= IB_QP_QKEY;
+	return 0;
+}
+
+/*
+ * Fill @qp_attr and @qp_attr_mask for the QP state requested in
+ * qp_attr->qp_state.
+ *
+ * IB transport: uses the IB CM when the id has a CM id and is not UD,
+ * otherwise the local cma_ib_init_qp_attr(); for the RTR state the receive
+ * PSN is set to the id's sequence number.
+ * iWARP transport: uses the iW CM when the id has a CM id, otherwise only
+ * state and empty access flags are reported.
+ * Any other transport yields -ENOSYS.
+ */
+int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr,
+		      int *qp_attr_mask)
+{
+	struct rdma_id_private *priv =
+		container_of(id, struct rdma_id_private, id);
+	int ret;
+
+	switch (rdma_node_get_transport(priv->id.device->node_type)) {
+	case RDMA_TRANSPORT_IB:
+		if (priv->cm_id.ib && priv->id.qp_type != IB_QPT_UD)
+			ret = ib_cm_init_qp_attr(priv->cm_id.ib, qp_attr,
+						 qp_attr_mask);
+		else
+			ret = cma_ib_init_qp_attr(priv, qp_attr, qp_attr_mask);
+
+		if (qp_attr->qp_state == IB_QPS_RTR)
+			qp_attr->rq_psn = priv->seq_num;
+		return ret;
+	case RDMA_TRANSPORT_IWARP:
+		if (priv->cm_id.iw)
+			return iw_cm_init_qp_attr(priv->cm_id.iw, qp_attr,
+						  qp_attr_mask);
+
+		qp_attr->qp_access_flags = 0;
+		*qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS;
+		return 0;
+	default:
+		return -ENOSYS;
+	}
+}
+EXPORT_SYMBOL(rdma_init_qp_attr);
+
+/*
+ * Non-zero when @addr holds the "zero" address of its family (as judged by
+ * ipv4_is_zeronet(), ipv6_addr_any() or ib_addr_any()); 0 for any other
+ * address family.
+ */
+static inline int cma_zero_addr(struct sockaddr *addr)
+{
+	if (addr->sa_family == AF_INET)
+		return ipv4_is_zeronet(((struct sockaddr_in *)addr)->sin_addr.s_addr);
+
+	if (addr->sa_family == AF_INET6)
+		return ipv6_addr_any(&((struct sockaddr_in6 *) addr)->sin6_addr);
+
+	if (addr->sa_family == AF_IB)
+		return ib_addr_any(&((struct sockaddr_ib *) addr)->sib_addr);
+
+	return 0;
+}
+
+/*
+ * Non-zero when @addr is a loopback address of its family (IPv4, IPv6 or
+ * IB); 0 for any other address family.
+ */
+static inline int cma_loopback_addr(struct sockaddr *addr)
+{
+	if (addr->sa_family == AF_INET)
+		return ipv4_is_loopback(((struct sockaddr_in *) addr)->sin_addr.s_addr);
+
+	if (addr->sa_family == AF_INET6)
+		return ipv6_addr_loopback(&((struct sockaddr_in6 *) addr)->sin6_addr);
+
+	if (addr->sa_family == AF_IB)
+		return ib_addr_loopback(&((struct sockaddr_ib *) addr)->sib_addr);
+
+	return 0;
+}
+
+/* Returns 1 when @addr is either a zero or a loopback address, else 0. */
+static inline int cma_any_addr(struct sockaddr *addr)
+{
+	if (cma_zero_addr(addr))
+		return 1;
+
+	return cma_loopback_addr(addr) != 0;
+}
+
+/*
+ * Compare the address parts of two socket addresses.
+ *
+ * Returns -1 when the families differ, 0 when the addresses are equal and
+ * a non-zero value otherwise.  Families other than AF_INET and AF_INET6
+ * are compared as IB addresses.
+ */
+static int cma_addr_cmp(struct sockaddr *src, struct sockaddr *dst)
+{
+	if (src->sa_family != dst->sa_family)
+		return -1;
+
+	if (src->sa_family == AF_INET) {
+		struct sockaddr_in *s4 = (struct sockaddr_in *) src;
+		struct sockaddr_in *d4 = (struct sockaddr_in *) dst;
+
+		return s4->sin_addr.s_addr != d4->sin_addr.s_addr;
+	}
+
+	if (src->sa_family == AF_INET6) {
+		struct sockaddr_in6 *s6 = (struct sockaddr_in6 *) src;
+		struct sockaddr_in6 *d6 = (struct sockaddr_in6 *) dst;
+
+		return ipv6_addr_cmp(&s6->sin6_addr, &d6->sin6_addr);
+	}
+
+	return ib_addr_cmp(&((struct sockaddr_ib *) src)->sib_addr,
+			   &((struct sockaddr_ib *) dst)->sib_addr);
+}
+
+/*
+ * Extract the port, in network byte order, from a socket address.
+ *
+ * For AF_IB the port is the low 16 bits of the service ID after applying
+ * the service ID mask.  Unknown families yield 0.
+ */
+static __be16 cma_port(struct sockaddr *addr)
+{
+	struct sockaddr_ib *sib;
+
+	if (addr->sa_family == AF_INET)
+		return ((struct sockaddr_in *) addr)->sin_port;
+
+	if (addr->sa_family == AF_INET6)
+		return ((struct sockaddr_in6 *) addr)->sin6_port;
+
+	if (addr->sa_family != AF_IB)
+		return 0;
+
+	sib = (struct sockaddr_ib *) addr;
+	return htons((u16) (be64_to_cpu(sib->sib_sid) &
+			    be64_to_cpu(sib->sib_sid_mask)));
+}
+
+/* Returns 1 when @addr carries no port (cma_port() yields 0), else 0. */
+static inline int cma_any_port(struct sockaddr *addr)
+{
+	return cma_port(addr) == 0;
+}
+
+/*
+ * Populate the AF_IB addresses of a new id from its listener.
+ *
+ * The source address always inherits family, service ID and scope ID from
+ * the listener and gets a full service ID mask.  With a path record, P_Key,
+ * flow label and GIDs come from @path and the destination address is
+ * filled in as well; without one, the remaining source fields are copied
+ * from the listener and the destination address is left untouched.
+ */
+static void cma_save_ib_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id,
+			     struct ib_sa_path_rec *path)
+{
+	struct sockaddr_ib *listen_ib, *src, *dst;
+
+	listen_ib = (struct sockaddr_ib *) &listen_id->route.addr.src_addr;
+	src = (struct sockaddr_ib *) &id->route.addr.src_addr;
+
+	src->sib_family = listen_ib->sib_family;
+	src->sib_sid = listen_ib->sib_sid;
+	src->sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL);
+	src->sib_scope_id = listen_ib->sib_scope_id;
+
+	if (!path) {
+		src->sib_pkey = listen_ib->sib_pkey;
+		src->sib_flowinfo = listen_ib->sib_flowinfo;
+		src->sib_addr = listen_ib->sib_addr;
+		return;
+	}
+
+	src->sib_pkey = path->pkey;
+	src->sib_flowinfo = path->flow_label;
+	memcpy(&src->sib_addr, &path->sgid, 16);
+
+	dst = (struct sockaddr_ib *) &id->route.addr.dst_addr;
+	dst->sib_family = listen_ib->sib_family;
+	dst->sib_pkey = path->pkey;
+	dst->sib_flowinfo = path->flow_label;
+	memcpy(&dst->sib_addr, &path->dgid, 16);
+}
+
+/*
+ * Return the port stored in an AF_INET or AF_INET6 sockaddr_storage.
+ * Any other address family triggers BUG().
+ */
+static __be16 ss_get_port(const struct sockaddr_storage *ss)
+{
+	switch (ss->ss_family) {
+	case AF_INET:
+		return ((struct sockaddr_in *)ss)->sin_port;
+	case AF_INET6:
+		return ((struct sockaddr_in6 *)ss)->sin6_port;
+	default:
+		BUG();
+	}
+}
+
+/*
+ * Fill the IPv4 source and destination addresses of a new id from a CMA
+ * header.  The header's destination becomes the local source address
+ * (with the listener's port); the header's source and port become the
+ * destination.
+ */
+static void cma_save_ip4_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id,
+			      struct cma_hdr *hdr)
+{
+	struct sockaddr_in *local = (struct sockaddr_in *) &id->route.addr.src_addr;
+	struct sockaddr_in *remote = (struct sockaddr_in *) &id->route.addr.dst_addr;
+
+	local->sin_family = AF_INET;
+	local->sin_addr.s_addr = hdr->dst_addr.ip4.addr;
+	local->sin_port = ss_get_port(&listen_id->route.addr.src_addr);
+
+	remote->sin_family = AF_INET;
+	remote->sin_addr.s_addr = hdr->src_addr.ip4.addr;
+	remote->sin_port = hdr->port;
+}
+
+/*
+ * IPv6 counterpart of cma_save_ip4_info(): the header's destination
+ * becomes the local source address (with the listener's port); the
+ * header's source and port become the destination.
+ */
+static void cma_save_ip6_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id,
+			      struct cma_hdr *hdr)
+{
+	struct sockaddr_in6 *local = (struct sockaddr_in6 *) &id->route.addr.src_addr;
+	struct sockaddr_in6 *remote = (struct sockaddr_in6 *) &id->route.addr.dst_addr;
+
+	local->sin6_family = AF_INET6;
+	local->sin6_addr = hdr->dst_addr.ip6;
+	local->sin6_port = ss_get_port(&listen_id->route.addr.src_addr);
+
+	remote->sin6_family = AF_INET6;
+	remote->sin6_addr = hdr->src_addr.ip6;
+	remote->sin6_port = hdr->port;
+}
+
+/*
+ * Record the addresses of an incoming request in the new id.
+ *
+ * AF_IB listeners take the information from the path record (REQ) or from
+ * the listener alone (SIDR REQ) and always succeed.  All other listeners
+ * parse the CMA header in the event's private data.
+ *
+ * Returns 0 on success or -EINVAL when the CMA version or IP version in
+ * the header is not supported.
+ */
+static int cma_save_net_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id,
+			     struct ib_cm_event *ib_event)
+{
+	struct cma_hdr *hdr;
+	int ip_ver;
+
+	if (listen_id->route.addr.src_addr.ss_family == AF_IB) {
+		switch (ib_event->event) {
+		case IB_CM_REQ_RECEIVED:
+			cma_save_ib_info(id, listen_id,
+					 ib_event->param.req_rcvd.primary_path);
+			break;
+		case IB_CM_SIDR_REQ_RECEIVED:
+			cma_save_ib_info(id, listen_id, NULL);
+			break;
+		default:
+			break;
+		}
+		return 0;
+	}
+
+	hdr = ib_event->private_data;
+	if (hdr->cma_version != CMA_VERSION)
+		return -EINVAL;
+
+	ip_ver = cma_get_ip_ver(hdr);
+	if (ip_ver == 4)
+		cma_save_ip4_info(id, listen_id, hdr);
+	else if (ip_ver == 6)
+		cma_save_ip6_info(id, listen_id, hdr);
+	else
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Offset of the user's private data inside the CM private data: 0 for
+ * AF_IB ids, otherwise the size of the CMA header.
+ */
+static inline int cma_user_data_offset(struct rdma_id_private *id_priv)
+{
+	if (cma_family(id_priv) == AF_IB)
+		return 0;
+
+	return sizeof(struct cma_hdr);
+}
+
+/*
+ * Cancel an outstanding SA path query, if any.  Only ports with the
+ * InfiniBand link layer can have one; other link layers are ignored.
+ */
+static void cma_cancel_route(struct rdma_id_private *id_priv)
+{
+	if (rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num) !=
+	    IB_LINK_LAYER_INFINIBAND)
+		return;
+
+	if (id_priv->query)
+		ib_sa_cancel_query(id_priv->query_id, id_priv->query);
+}
+
+/*
+ * Tear down all per-device listen ids that were spawned for @id_priv.
+ *
+ * @id_priv is first unlinked from the list it sits on, then each entry of
+ * its listen_list is unlinked and destroyed.  "lock" is dropped around
+ * every rdma_destroy_id() call and re-taken before the list is examined
+ * again.
+ */
+static void cma_cancel_listens(struct rdma_id_private *id_priv)
+{
+ struct rdma_id_private *dev_id_priv;
+
+ /*
+ * Remove from listen_any_list to prevent added devices from spawning
+ * additional listen requests.
+ */
+ mutex_lock(&lock);
+ list_del(&id_priv->list);
+
+ while (!list_empty(&id_priv->listen_list)) {
+ dev_id_priv = list_entry(id_priv->listen_list.next,
+ struct rdma_id_private, listen_list);
+ /* sync with device removal to avoid duplicate destruction */
+ list_del_init(&dev_id_priv->list);
+ list_del(&dev_id_priv->listen_list);
+ /* Destroy without holding "lock"; the entry is already unlinked. */
+ mutex_unlock(&lock);
+
+ rdma_destroy_id(&dev_id_priv->id);
+ mutex_lock(&lock);
+ }
+ mutex_unlock(&lock);
+}
+
+/*
+ * Cancel whatever asynchronous work belongs to @state:
+ * address resolution, route resolution, or - for a listener on an "any"
+ * address that is not bound to a device - the per-device listens.
+ * All other states need no cancellation.
+ */
+static void cma_cancel_operation(struct rdma_id_private *id_priv,
+				 enum rdma_cm_state state)
+{
+	if (state == RDMA_CM_ADDR_QUERY) {
+		rdma_addr_cancel(&id_priv->id.route.addr.dev_addr);
+	} else if (state == RDMA_CM_ROUTE_QUERY) {
+		cma_cancel_route(id_priv);
+	} else if (state == RDMA_CM_LISTEN) {
+		if (cma_any_addr(cma_src_addr(id_priv)) && !id_priv->cma_dev)
+			cma_cancel_listens(id_priv);
+	}
+}
+
+/*
+ * Unbind @id_priv from its port.  Under "lock" the id is removed from the
+ * bind list's owners; when it was the last owner the port is removed from
+ * the port space IDR and the bind list is freed.  Ids that were never
+ * bound are left alone.
+ */
+static void cma_release_port(struct rdma_id_private *id_priv)
+{
+	struct rdma_bind_list *bl = id_priv->bind_list;
+
+	if (bl == NULL)
+		return;
+
+	mutex_lock(&lock);
+	hlist_del(&id_priv->node);
+	if (hlist_empty(&bl->owners)) {
+		/* Last owner gone: release the port itself. */
+		idr_remove(bl->ps, bl->port);
+		kfree(bl);
+	}
+	mutex_unlock(&lock);
+}
+
+/*
+ * Empty the id's multicast list.  Each entry is unlinked and then released
+ * according to the link layer of the id's port: InfiniBand entries free
+ * the SA multicast object and the entry itself, Ethernet entries drop
+ * their kref.  Entries on any other link layer are only unlinked.
+ */
+static void cma_leave_mc_groups(struct rdma_id_private *id_priv)
+{
+	struct list_head *head = &id_priv->mc_list;
+	struct cma_multicast *mc;
+	enum rdma_link_layer ll;
+
+	while (!list_empty(head)) {
+		mc = container_of(head->next, struct cma_multicast, list);
+		list_del(&mc->list);
+
+		ll = rdma_port_get_link_layer(id_priv->cma_dev->device,
+					      id_priv->id.port_num);
+		if (ll == IB_LINK_LAYER_INFINIBAND) {
+			ib_sa_free_multicast(mc->multicast.ib);
+			kfree(mc);
+		} else if (ll == IB_LINK_LAYER_ETHERNET) {
+			kref_put(&mc->mcref, release_mc);
+		}
+	}
+}
+
+/*
+ * Destroy an RDMA CM identifier and free its memory.
+ *
+ * The id is switched to RDMA_CM_DESTROYING, any operation pending in the
+ * previous state is cancelled, and running callbacks are waited for.  If
+ * the id is attached to a device, its transport CM id is destroyed,
+ * multicast groups are left and the device is released.  Finally the port
+ * binding is dropped, the initial reference is released and the function
+ * blocks until the completion signalled by the last reference drop.
+ */
+void rdma_destroy_id(struct rdma_cm_id *id)
+{
+ struct rdma_id_private *id_priv;
+ enum rdma_cm_state state;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ /* state holds what cma_exch() returns; it selects what to cancel. */
+ state = cma_exch(id_priv, RDMA_CM_DESTROYING);
+ cma_cancel_operation(id_priv, state);
+
+ /*
+ * Wait for any active callback to finish. New callbacks will find
+ * the id_priv state set to destroying and abort.
+ */
+ mutex_lock(&id_priv->handler_mutex);
+ mutex_unlock(&id_priv->handler_mutex);
+
+ if (id_priv->cma_dev) {
+ switch (rdma_node_get_transport(id_priv->id.device->node_type)) {
+ case RDMA_TRANSPORT_IB:
+ if (id_priv->cm_id.ib)
+ ib_destroy_cm_id(id_priv->cm_id.ib);
+ break;
+ case RDMA_TRANSPORT_IWARP:
+ if (id_priv->cm_id.iw)
+ iw_destroy_cm_id(id_priv->cm_id.iw);
+ break;
+ default:
+ break;
+ }
+ cma_leave_mc_groups(id_priv);
+ cma_release_dev(id_priv);
+ }
+
+ cma_release_port(id_priv);
+ /* Drop the reference taken at creation and wait for all others. */
+ cma_deref_id(id_priv);
+ wait_for_completion(&id_priv->comp);
+
+ /* For internal ids the context is itself an id whose reference is dropped here. */
+ if (id_priv->internal_id)
+ cma_deref_id(id_priv->id.context);
+
+ kfree(id_priv->id.route.path_rec);
+ kfree(id_priv);
+}
+EXPORT_SYMBOL(rdma_destroy_id);
+
+/*
+ * Complete the active side of a connection after a REP arrived: move the
+ * QP to RTR and RTS and send the RTU.
+ *
+ * On any failure the QP is put into the error state, a consumer-defined
+ * REJ is sent and the error is returned.  Returns 0 on success.
+ */
+static int cma_rep_recv(struct rdma_id_private *id_priv)
+{
+	int ret;
+
+	ret = cma_modify_qp_rtr(id_priv, NULL);
+	if (!ret)
+		ret = cma_modify_qp_rts(id_priv, NULL);
+	if (!ret)
+		ret = ib_send_cm_rtu(id_priv->cm_id.ib, NULL, 0);
+	if (!ret)
+		return 0;
+
+	cma_modify_qp_err(id_priv);
+	ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED,
+		       NULL, 0, NULL, 0);
+	return ret;
+}
+
+/*
+ * Copy the connection parameters of a received REP into an RDMA CM event.
+ * The private data length is always reported as the full REP private data
+ * size.
+ */
+static void cma_set_rep_event_data(struct rdma_cm_event *event,
+				   struct ib_cm_rep_event_param *rep_data,
+				   void *private_data)
+{
+	struct rdma_conn_param *conn = &event->param.conn;
+
+	conn->private_data = private_data;
+	conn->private_data_len = IB_CM_REP_PRIVATE_DATA_SIZE;
+	conn->responder_resources = rep_data->responder_resources;
+	conn->initiator_depth = rep_data->initiator_depth;
+	conn->flow_control = rep_data->flow_control;
+	conn->rnr_retry_count = rep_data->rnr_retry_count;
+	conn->srq = rep_data->srq;
+	conn->qp_num = rep_data->remote_qpn;
+}
+
+/*
+ * IB CM event handler for connected ids: translates IB CM events into
+ * RDMA CM events and delivers them to the user's event handler.
+ *
+ * Events are only processed while the id is in RDMA_CM_CONNECT (or in
+ * RDMA_CM_DISCONNECT for IB_CM_TIMEWAIT_EXIT); otherwise 0 is returned
+ * without calling the user.  A non-zero return from the user's handler
+ * destroys the id here and is passed back to the caller.
+ */
+static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
+{
+ struct rdma_id_private *id_priv = cm_id->context;
+ struct rdma_cm_event event;
+ int ret = 0;
+
+ /*
+ * cma_disable_callback() returns 0 with handler_mutex held, so past
+ * this check the mutex is owned and must be released on every path.
+ */
+ if ((ib_event->event != IB_CM_TIMEWAIT_EXIT &&
+ cma_disable_callback(id_priv, RDMA_CM_CONNECT)) ||
+ (ib_event->event == IB_CM_TIMEWAIT_EXIT &&
+ cma_disable_callback(id_priv, RDMA_CM_DISCONNECT)))
+ return 0;
+
+ memset(&event, 0, sizeof event);
+ switch (ib_event->event) {
+ case IB_CM_REQ_ERROR:
+ case IB_CM_REP_ERROR:
+ event.event = RDMA_CM_EVENT_UNREACHABLE;
+ event.status = -ETIMEDOUT;
+ break;
+ case IB_CM_REP_RECEIVED:
+ /* With a QP the handshake is finished here; without one the user gets the response. */
+ if (id_priv->id.qp) {
+ event.status = cma_rep_recv(id_priv);
+ event.event = event.status ? RDMA_CM_EVENT_CONNECT_ERROR :
+ RDMA_CM_EVENT_ESTABLISHED;
+ } else {
+ event.event = RDMA_CM_EVENT_CONNECT_RESPONSE;
+ }
+ cma_set_rep_event_data(&event, &ib_event->param.rep_rcvd,
+ ib_event->private_data);
+ break;
+ case IB_CM_RTU_RECEIVED:
+ case IB_CM_USER_ESTABLISHED:
+ event.event = RDMA_CM_EVENT_ESTABLISHED;
+ break;
+ case IB_CM_DREQ_ERROR:
+ event.status = -ETIMEDOUT; /* fall through */
+ case IB_CM_DREQ_RECEIVED:
+ case IB_CM_DREP_RECEIVED:
+ /* Report the disconnect only if this call performed the CONNECT -> DISCONNECT change. */
+ if (!cma_comp_exch(id_priv, RDMA_CM_CONNECT,
+ RDMA_CM_DISCONNECT))
+ goto out;
+ event.event = RDMA_CM_EVENT_DISCONNECTED;
+ break;
+ case IB_CM_TIMEWAIT_EXIT:
+ event.event = RDMA_CM_EVENT_TIMEWAIT_EXIT;
+ break;
+ case IB_CM_MRA_RECEIVED:
+ /* ignore event */
+ goto out;
+ case IB_CM_REJ_RECEIVED:
+ cma_modify_qp_err(id_priv);
+ event.status = ib_event->param.rej_rcvd.reason;
+ event.event = RDMA_CM_EVENT_REJECTED;
+ event.param.conn.private_data = ib_event->private_data;
+ event.param.conn.private_data_len = IB_CM_REJ_PRIVATE_DATA_SIZE;
+ break;
+ default:
+ printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d\n",
+ ib_event->event);
+ goto out;
+ }
+
+ ret = id_priv->id.event_handler(&id_priv->id, &event);
+ if (ret) {
+ /* Destroy the CM ID by returning a non-zero value. */
+ /* Clearing cm_id.ib keeps rdma_destroy_id() from destroying the CM id itself. */
+ id_priv->cm_id.ib = NULL;
+ cma_exch(id_priv, RDMA_CM_DESTROYING);
+ mutex_unlock(&id_priv->handler_mutex);
+ rdma_destroy_id(&id_priv->id);
+ return ret;
+ }
+out:
+ mutex_unlock(&id_priv->handler_mutex);
+ return ret;
+}
+
+/*
+ * Create the id for an incoming connection request (REQ).
+ *
+ * The new id inherits handler, context and port space from the listener
+ * and takes its QP type from the request.  Addresses are taken from the
+ * event, the primary (and optional alternate) path record is copied, and
+ * the device address is derived either from the primary path (when the
+ * source address is an "any" address) or by translating the source
+ * address.  The id is returned in the RDMA_CM_CONNECT state.
+ *
+ * Returns NULL on any failure; a partially built id is destroyed.
+ */
+static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id,
+					       struct ib_cm_event *ib_event)
+{
+	struct ib_cm_req_event_param *req = &ib_event->param.req_rcvd;
+	struct rdma_id_private *id_priv;
+	struct rdma_cm_id *id;
+	struct rdma_route *rt;
+
+	id = rdma_create_id(listen_id->event_handler, listen_id->context,
+			    listen_id->ps, req->qp_type);
+	if (IS_ERR(id))
+		return NULL;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	if (cma_save_net_info(id, listen_id, ib_event))
+		goto err;
+
+	rt = &id->route;
+	rt->num_paths = req->alternate_path ? 2 : 1;
+	rt->path_rec = kmalloc(sizeof(*rt->path_rec) * rt->num_paths,
+			       GFP_KERNEL);
+	if (!rt->path_rec)
+		goto err;
+
+	rt->path_rec[0] = *req->primary_path;
+	if (rt->num_paths == 2)
+		rt->path_rec[1] = *req->alternate_path;
+
+	if (!cma_any_addr(cma_src_addr(id_priv))) {
+		if (cma_translate_addr(cma_src_addr(id_priv), &rt->addr.dev_addr))
+			goto err;
+	} else {
+		/* No usable source address: describe the device via the path. */
+		rt->addr.dev_addr.dev_type = ARPHRD_INFINIBAND;
+		rdma_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid);
+		ib_addr_set_pkey(&rt->addr.dev_addr,
+				 be16_to_cpu(rt->path_rec[0].pkey));
+	}
+	rdma_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid);
+
+	id_priv->state = RDMA_CM_CONNECT;
+	return id_priv;
+
+err:
+	rdma_destroy_id(id);
+	return NULL;
+}
+
+/*
+ * Create the id for an incoming SIDR request.
+ *
+ * The new UD id inherits handler, context and port space from the
+ * listener.  Addresses are taken from the event, and unless the source
+ * address is an "any" address it is translated into the device address.
+ * The id is returned in the RDMA_CM_CONNECT state.
+ *
+ * Returns NULL on any failure; a partially built id is destroyed.
+ */
+static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id,
+					      struct ib_cm_event *ib_event)
+{
+	struct rdma_id_private *id_priv;
+	struct rdma_cm_id *id;
+
+	id = rdma_create_id(listen_id->event_handler, listen_id->context,
+			    listen_id->ps, IB_QPT_UD);
+	if (IS_ERR(id))
+		return NULL;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	if (cma_save_net_info(id, listen_id, ib_event))
+		goto err;
+
+	if (!cma_any_addr((struct sockaddr *) &id->route.addr.src_addr) &&
+	    cma_translate_addr(cma_src_addr(id_priv), &id->route.addr.dev_addr))
+		goto err;
+
+	id_priv->state = RDMA_CM_CONNECT;
+	return id_priv;
+
+err:
+	rdma_destroy_id(id);
+	return NULL;
+}
+
+/*
+ * Copy the connection parameters of a received REQ into an RDMA CM event.
+ * The user-visible private data starts @offset bytes into @private_data
+ * and its reported length is reduced by the same amount.
+ */
+static void cma_set_req_event_data(struct rdma_cm_event *event,
+				   struct ib_cm_req_event_param *req_data,
+				   void *private_data, int offset)
+{
+	struct rdma_conn_param *conn = &event->param.conn;
+
+	conn->private_data = private_data + offset;
+	conn->private_data_len = IB_CM_REQ_PRIVATE_DATA_SIZE - offset;
+	conn->responder_resources = req_data->responder_resources;
+	conn->initiator_depth = req_data->initiator_depth;
+	conn->flow_control = req_data->flow_control;
+	conn->retry_count = req_data->retry_count;
+	conn->rnr_retry_count = req_data->rnr_retry_count;
+	conn->srq = req_data->srq;
+	conn->qp_num = req_data->remote_qpn;
+}
+
+/*
+ * Decide whether a request event is acceptable for listener @id.
+ *
+ * Returns 1 when the listener has no QP type set, when a REQ carries the
+ * listener's QP type, or when a SIDR REQ hits a UD listener; 0 otherwise.
+ */
+static int cma_check_req_qp_type(struct rdma_cm_id *id, struct ib_cm_event *ib_event)
+{
+	if (!id->qp_type)
+		return 1;
+
+	if (ib_event->event == IB_CM_REQ_RECEIVED)
+		return ib_event->param.req_rcvd.qp_type == id->qp_type;
+
+	if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED)
+		return id->qp_type == IB_QPT_UD;
+
+	return 0;
+}
+
+/*
+ * IB CM handler installed on listening ids: turns a REQ or SIDR REQ into
+ * a new id and reports RDMA_CM_EVENT_CONNECT_REQUEST on it.
+ *
+ * Returns 0 when the request was handed to the user, -EINVAL when the QP
+ * type does not fit the listener, -ECONNABORTED when the listener is not
+ * in RDMA_CM_LISTEN, -ENOMEM when no new id could be created, or the error
+ * from device acquisition / the user's handler.  On every error after the
+ * new id exists, that id is destroyed here.
+ */
+static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
+{
+ struct rdma_id_private *listen_id, *conn_id;
+ struct rdma_cm_event event;
+ int offset, ret;
+
+ listen_id = cm_id->context;
+ if (!cma_check_req_qp_type(&listen_id->id, ib_event))
+ return -EINVAL;
+
+ /* On success the listener's handler_mutex is held from here on. */
+ if (cma_disable_callback(listen_id, RDMA_CM_LISTEN))
+ return -ECONNABORTED;
+
+ memset(&event, 0, sizeof event);
+ offset = cma_user_data_offset(listen_id);
+ event.event = RDMA_CM_EVENT_CONNECT_REQUEST;
+ if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) {
+ conn_id = cma_new_udp_id(&listen_id->id, ib_event);
+ event.param.ud.private_data = ib_event->private_data + offset;
+ event.param.ud.private_data_len =
+ IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - offset;
+ } else {
+ conn_id = cma_new_conn_id(&listen_id->id, ib_event);
+ cma_set_req_event_data(&event, &ib_event->param.req_rcvd,
+ ib_event->private_data, offset);
+ }
+ /* Both constructors return NULL for every failure, reported as -ENOMEM. */
+ if (!conn_id) {
+ ret = -ENOMEM;
+ goto err1;
+ }
+
+ /* Second handler_mutex (listener's is already held), hence the nesting annotation. */
+ mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING);
+ ret = cma_acquire_dev(conn_id, listen_id);
+ if (ret)
+ goto err2;
+
+ /* Hand the CM id over to the new id; later events go to cma_ib_handler. */
+ conn_id->cm_id.ib = cm_id;
+ cm_id->context = conn_id;
+ cm_id->cm_handler = cma_ib_handler;
+
+ /*
+ * Protect against the user destroying conn_id from another thread
+ * until we're done accessing it.
+ */
+ atomic_inc(&conn_id->refcount);
+ ret = conn_id->id.event_handler(&conn_id->id, &event);
+ if (ret)
+ goto err3;
+ /*
+ * Acquire mutex to prevent user executing rdma_destroy_id()
+ * while we're accessing the cm_id.
+ */
+ mutex_lock(&lock);
+ if (cma_comp(conn_id, RDMA_CM_CONNECT) &&
+ (conn_id->id.qp_type != IB_QPT_UD))
+ ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0);
+ mutex_unlock(&lock);
+ mutex_unlock(&conn_id->handler_mutex);
+ mutex_unlock(&listen_id->handler_mutex);
+ cma_deref_id(conn_id);
+ return 0;
+
+err3:
+ cma_deref_id(conn_id);
+ /* Destroy the CM ID by returning a non-zero value. */
+ conn_id->cm_id.ib = NULL;
+err2:
+ cma_exch(conn_id, RDMA_CM_DESTROYING);
+ mutex_unlock(&conn_id->handler_mutex);
+err1:
+ mutex_unlock(&listen_id->handler_mutex);
+ if (conn_id)
+ rdma_destroy_id(&conn_id->id);
+ return ret;
+}
+
+/*
+ * Return the IB service ID for @addr in big-endian form.
+ *
+ * AF_IB addresses carry the service ID directly.  For every other family
+ * it is built from the id's port space shifted left by 16 bits plus the
+ * port number in host byte order.
+ */
+__be64 rdma_get_service_id(struct rdma_cm_id *id, struct sockaddr *addr)
+{
+	u64 sid;
+
+	if (addr->sa_family == AF_IB)
+		return ((struct sockaddr_ib *) addr)->sib_sid;
+
+	sid = ((u64)id->ps << 16) + be16_to_cpu(cma_port(addr));
+	return cpu_to_be64(sid);
+}
+EXPORT_SYMBOL(rdma_get_service_id);
+
+/*
+ * Build the private-data compare/mask pair used when listening on a
+ * specific IP address.
+ *
+ * @compare is zeroed first.  For AF_INET and AF_INET6 the IP version
+ * nibble is always matched; the destination address is matched as well
+ * unless @addr is an "any" address.  Other families leave @compare zeroed.
+ * @ps is not used.
+ */
+static void cma_set_compare_data(enum rdma_port_space ps, struct sockaddr *addr,
+				 struct ib_cm_compare_data *compare)
+{
+	struct cma_hdr *data, *mask;
+
+	memset(compare, 0, sizeof(*compare));
+	data = (void *) compare->data;
+	mask = (void *) compare->mask;
+
+	if (addr->sa_family == AF_INET) {
+		cma_set_ip_ver(data, 4);
+		cma_set_ip_ver(mask, 0xF);
+		if (cma_any_addr(addr))
+			return;
+
+		data->dst_addr.ip4.addr =
+			((struct sockaddr_in *) addr)->sin_addr.s_addr;
+		mask->dst_addr.ip4.addr = htonl(~0);
+	} else if (addr->sa_family == AF_INET6) {
+		cma_set_ip_ver(data, 6);
+		cma_set_ip_ver(mask, 0xF);
+		if (cma_any_addr(addr))
+			return;
+
+		data->dst_addr.ip6 = ((struct sockaddr_in6 *) addr)->sin6_addr;
+		memset(&mask->dst_addr.ip6, 0xFF, sizeof(mask->dst_addr.ip6));
+	}
+}
+
+/*
+ * iWARP CM event handler for connected ids: translates iW CM events into
+ * RDMA CM events and delivers them to the user's event handler.
+ *
+ * Events are only processed while the id is in RDMA_CM_CONNECT; otherwise
+ * 0 is returned without calling the user.  A non-zero return from the
+ * user's handler destroys the id here and is passed back to the caller.
+ */
+static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event)
+{
+ struct rdma_id_private *id_priv = iw_id->context;
+ struct rdma_cm_event event;
+ int ret = 0;
+ struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr;
+ struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr;
+
+ /* Returns 0 with handler_mutex held; released on every path below. */
+ if (cma_disable_callback(id_priv, RDMA_CM_CONNECT))
+ return 0;
+
+ memset(&event, 0, sizeof event);
+ switch (iw_event->event) {
+ case IW_CM_EVENT_CLOSE:
+ event.event = RDMA_CM_EVENT_DISCONNECTED;
+ break;
+ case IW_CM_EVENT_CONNECT_REPLY:
+ /* Adopt the addresses reported by the iW CM, whatever the status. */
+ memcpy(cma_src_addr(id_priv), laddr,
+ rdma_addr_size(laddr));
+ memcpy(cma_dst_addr(id_priv), raddr,
+ rdma_addr_size(raddr));
+ switch (iw_event->status) {
+ case 0:
+ event.event = RDMA_CM_EVENT_ESTABLISHED;
+ event.param.conn.initiator_depth = iw_event->ird;
+ event.param.conn.responder_resources = iw_event->ord;
+ break;
+ case -ECONNRESET:
+ case -ECONNREFUSED:
+ event.event = RDMA_CM_EVENT_REJECTED;
+ break;
+ case -ETIMEDOUT:
+ event.event = RDMA_CM_EVENT_UNREACHABLE;
+ break;
+ default:
+ event.event = RDMA_CM_EVENT_CONNECT_ERROR;
+ break;
+ }
+ break;
+ case IW_CM_EVENT_ESTABLISHED:
+ event.event = RDMA_CM_EVENT_ESTABLISHED;
+ event.param.conn.initiator_depth = iw_event->ird;
+ event.param.conn.responder_resources = iw_event->ord;
+ break;
+ default:
+ /* Any other iW CM event is treated as a fatal programming error. */
+ BUG_ON(1);
+ }
+
+ /* Status and private data are forwarded for every event type. */
+ event.status = iw_event->status;
+ event.param.conn.private_data = iw_event->private_data;
+ event.param.conn.private_data_len = iw_event->private_data_len;
+ ret = id_priv->id.event_handler(&id_priv->id, &event);
+ if (ret) {
+ /* Destroy the CM ID by returning a non-zero value. */
+ id_priv->cm_id.iw = NULL;
+ cma_exch(id_priv, RDMA_CM_DESTROYING);
+ mutex_unlock(&id_priv->handler_mutex);
+ rdma_destroy_id(&id_priv->id);
+ return ret;
+ }
+
+ mutex_unlock(&id_priv->handler_mutex);
+ return ret;
+}
+
+/*
+ * iW CM handler installed on listening ids: creates a new RC id in the TCP
+ * port space for an incoming connection request and reports
+ * RDMA_CM_EVENT_CONNECT_REQUEST on it.
+ *
+ * Returns 0 when the request was handed to the user, -ECONNABORTED when
+ * the listener is not in RDMA_CM_LISTEN, -ENOMEM when the new id cannot be
+ * created, or the error from address translation, device acquisition,
+ * ib_query_device() or the user's handler.  The new id is destroyed on
+ * every error path after its creation.
+ */
+static int iw_conn_req_handler(struct iw_cm_id *cm_id,
+ struct iw_cm_event *iw_event)
+{
+ struct rdma_cm_id *new_cm_id;
+ struct rdma_id_private *listen_id, *conn_id;
+ struct rdma_cm_event event;
+ int ret;
+ struct ib_device_attr attr;
+ struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr;
+ struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr;
+
+ listen_id = cm_id->context;
+ /* On success the listener's handler_mutex is held until "out". */
+ if (cma_disable_callback(listen_id, RDMA_CM_LISTEN))
+ return -ECONNABORTED;
+
+ /* Create a new RDMA id for the new IW CM ID */
+ new_cm_id = rdma_create_id(listen_id->id.event_handler,
+ listen_id->id.context,
+ RDMA_PS_TCP, IB_QPT_RC);
+ if (IS_ERR(new_cm_id)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ conn_id = container_of(new_cm_id, struct rdma_id_private, id);
+ mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING);
+ conn_id->state = RDMA_CM_CONNECT;
+
+ ret = rdma_translate_ip(laddr, &conn_id->id.route.addr.dev_addr, NULL);
+ if (ret) {
+ mutex_unlock(&conn_id->handler_mutex);
+ rdma_destroy_id(new_cm_id);
+ goto out;
+ }
+
+ ret = cma_acquire_dev(conn_id, listen_id);
+ if (ret) {
+ mutex_unlock(&conn_id->handler_mutex);
+ rdma_destroy_id(new_cm_id);
+ goto out;
+ }
+
+ /* Hand the iW CM id over to the new id; later events go to cma_iw_handler. */
+ conn_id->cm_id.iw = cm_id;
+ cm_id->context = conn_id;
+ cm_id->cm_handler = cma_iw_handler;
+
+ memcpy(cma_src_addr(conn_id), laddr, rdma_addr_size(laddr));
+ memcpy(cma_dst_addr(conn_id), raddr, rdma_addr_size(raddr));
+
+ /*
+ * NOTE(review): attr is not read anywhere in this function; only the
+ * return code of ib_query_device() is used.  Also, unlike the user
+ * rejection path below, this error path leaves conn_id->cm_id.iw set
+ * while calling rdma_destroy_id(), which destroys a non-NULL cm_id.iw -
+ * confirm that the caller does not destroy cm_id again on the non-zero
+ * return.
+ */
+ ret = ib_query_device(conn_id->id.device, &attr);
+ if (ret) {
+ mutex_unlock(&conn_id->handler_mutex);
+ rdma_destroy_id(new_cm_id);
+ goto out;
+ }
+
+ memset(&event, 0, sizeof event);
+ event.event = RDMA_CM_EVENT_CONNECT_REQUEST;
+ event.param.conn.private_data = iw_event->private_data;
+ event.param.conn.private_data_len = iw_event->private_data_len;
+ event.param.conn.initiator_depth = iw_event->ird;
+ event.param.conn.responder_resources = iw_event->ord;
+
+ /*
+ * Protect against the user destroying conn_id from another thread
+ * until we're done accessing it.
+ */
+ atomic_inc(&conn_id->refcount);
+ ret = conn_id->id.event_handler(&conn_id->id, &event);
+ if (ret) {
+ /* User wants to destroy the CM ID */
+ conn_id->cm_id.iw = NULL;
+ cma_exch(conn_id, RDMA_CM_DESTROYING);
+ mutex_unlock(&conn_id->handler_mutex);
+ cma_deref_id(conn_id);
+ rdma_destroy_id(&conn_id->id);
+ goto out;
+ }
+
+ mutex_unlock(&conn_id->handler_mutex);
+ cma_deref_id(conn_id);
+
+out:
+ mutex_unlock(&listen_id->handler_mutex);
+ return ret;
+}
+
+/*
+ * Create an IB CM id for @id_priv and start listening on the service id
+ * derived from its source address.
+ *
+ * A wildcard source address on an id that is not address-family-only listens
+ * without compare data; every other case installs compare data built from
+ * the source address.  On listen failure the CM id is destroyed again and
+ * the pointer in @id_priv is cleared.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int cma_ib_listen(struct rdma_id_private *id_priv)
+{
+	struct ib_cm_compare_data cmp;
+	struct ib_cm_compare_data *match = NULL;
+	struct ib_cm_id *cm_id;
+	struct sockaddr *src;
+	__be64 sid;
+	int err;
+
+	cm_id = ib_create_cm_id(id_priv->id.device, cma_req_handler, id_priv);
+	if (IS_ERR(cm_id))
+		return PTR_ERR(cm_id);
+	id_priv->cm_id.ib = cm_id;
+
+	src = cma_src_addr(id_priv);
+	sid = rdma_get_service_id(&id_priv->id, src);
+	if (!cma_any_addr(src) || id_priv->afonly) {
+		cma_set_compare_data(id_priv->id.ps, src, &cmp);
+		match = &cmp;
+	}
+
+	err = ib_cm_listen(cm_id, sid, 0, match);
+	if (!err)
+		return 0;
+
+	ib_destroy_cm_id(cm_id);
+	id_priv->cm_id.ib = NULL;
+	return err;
+}
+
+/*
+ * Create an iWARP CM id for @id_priv, copy the id's source address into the
+ * CM id's local address and start listening with the given @backlog.
+ *
+ * If the listen call fails the CM id is destroyed and the pointer in
+ * @id_priv is reset to NULL.  Returns 0 on success or a negative errno.
+ */
+static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog)
+{
+	struct iw_cm_id *cm_id;
+	struct sockaddr *src;
+	int err;
+
+	cm_id = iw_create_cm_id(id_priv->id.device, iw_conn_req_handler,
+				id_priv);
+	if (IS_ERR(cm_id))
+		return PTR_ERR(cm_id);
+	id_priv->cm_id.iw = cm_id;
+
+	src = cma_src_addr(id_priv);
+	memcpy(&cm_id->local_addr, src, rdma_addr_size(src));
+
+	err = iw_cm_listen(cm_id, backlog);
+	if (err) {
+		iw_destroy_cm_id(cm_id);
+		id_priv->cm_id.iw = NULL;
+	}
+	return err;
+}
+
+/*
+ * Event handler installed on the per-device listeners created by
+ * cma_listen_on_dev().  The context of such an id is the originating
+ * rdma_id_private; this handler rewrites @id to carry that id's context and
+ * event handler and then forwards @event to that handler.
+ */
+static int cma_listen_handler(struct rdma_cm_id *id,
+			      struct rdma_cm_event *event)
+{
+	struct rdma_id_private *parent = id->context;
+
+	id->event_handler = parent->id.event_handler;
+	id->context = parent->id.context;
+	return parent->id.event_handler(id, event);
+}
+
+/*
+ * Create an internal listener for @id_priv on @cma_dev.
+ *
+ * AF_IB ids are skipped on devices whose transport is not IB.  The new id
+ * inherits the port space, QP type, source address, afonly setting and
+ * backlog of @id_priv, is attached to @cma_dev, linked on the listen list of
+ * @id_priv (which gains a reference) and marked as internal.  A failing
+ * rdma_listen() is only logged.
+ */
+static void cma_listen_on_dev(struct rdma_id_private *id_priv,
+			      struct cma_device *cma_dev)
+{
+	struct rdma_id_private *child;
+	struct rdma_cm_id *new_id;
+	struct sockaddr *src;
+	int err;
+
+	if (cma_family(id_priv) == AF_IB &&
+	    rdma_node_get_transport(cma_dev->device->node_type) != RDMA_TRANSPORT_IB)
+		return;
+
+	new_id = rdma_create_id(cma_listen_handler, id_priv, id_priv->id.ps,
+				id_priv->id.qp_type);
+	if (IS_ERR(new_id))
+		return;
+
+	child = container_of(new_id, struct rdma_id_private, id);
+	child->state = RDMA_CM_ADDR_BOUND;
+
+	src = cma_src_addr(id_priv);
+	memcpy(cma_src_addr(child), src, rdma_addr_size(src));
+
+	cma_attach_to_dev(child, cma_dev);
+	list_add_tail(&child->listen_list, &id_priv->listen_list);
+	atomic_inc(&id_priv->refcount);
+	child->internal_id = 1;
+	child->afonly = id_priv->afonly;
+
+	err = rdma_listen(new_id, id_priv->backlog);
+	if (!err)
+		return;
+
+	printk(KERN_WARNING "RDMA CMA: cma_listen_on_dev, error %d, "
+	       "listening on device %s\n", err, cma_dev->device->name);
+}
+
+/*
+ * Put @id_priv on the global listen_any_list and start a per-device listener
+ * for it on every device currently in dev_list.  Both steps run with the
+ * global lock held.
+ */
+static void cma_listen_on_all(struct rdma_id_private *id_priv)
+{
+	struct cma_device *dev;
+
+	mutex_lock(&lock);
+	list_add_tail(&id_priv->list, &listen_any_list);
+	list_for_each_entry(dev, &dev_list, list) {
+		cma_listen_on_dev(id_priv, dev);
+	}
+	mutex_unlock(&lock);
+}
+
+/*
+ * Record the type of service for @id.  Only the low 8 bits of @tos are kept.
+ */
+void rdma_set_service_type(struct rdma_cm_id *id, int tos)
+{
+	struct rdma_id_private *priv =
+		container_of(id, struct rdma_id_private, id);
+
+	priv->tos = (u8) tos;
+}
+EXPORT_SYMBOL(rdma_set_service_type);
+
+/*
+ * Completion callback for the SA path record query started by
+ * cma_query_ib_route().  @context is the cma_work prepared by the caller.
+ *
+ * On success the returned record is copied into the id's route and the work
+ * item is queued unchanged.  On failure the work item is rewritten to move
+ * the id from ROUTE_QUERY back to ADDR_RESOLVED and to report a route error
+ * carrying @status.
+ */
+static void cma_query_handler(int status, struct ib_sa_path_rec *path_rec,
+			      void *context)
+{
+	struct cma_work *work = context;
+	struct rdma_route *route = &work->id->id.route;
+
+	if (status) {
+		work->old_state = RDMA_CM_ROUTE_QUERY;
+		work->new_state = RDMA_CM_ADDR_RESOLVED;
+		work->event.event = RDMA_CM_EVENT_ROUTE_ERROR;
+		work->event.status = status;
+	} else {
+		route->num_paths = 1;
+		*route->path_rec = *path_rec;
+	}
+
+	queue_work(cma_wq, &work->work);
+}
+
+/*
+ * Issue an SA path record query for @id_priv.
+ *
+ * The request asks for a single reversible path between the id's source and
+ * destination GIDs, matching its P_Key and the service id derived from the
+ * destination address.  Depending on the address family it additionally
+ * constrains the QoS class (AF_INET, taken from the id's tos) or the traffic
+ * class (AF_INET6 / AF_IB, taken from bits 20 and up of the source
+ * address's flow info).
+ *
+ * cma_query_handler() is invoked with @work when the query completes.
+ * Returns 0 if the query was started, otherwise the negative value returned
+ * by ib_sa_path_rec_get().
+ */
+static int cma_query_ib_route(struct rdma_id_private *id_priv, int timeout_ms,
+			      struct cma_work *work)
+{
+	struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+	struct ib_sa_path_rec rec;
+	ib_sa_comp_mask mask;
+	struct sockaddr_in6 *src6;
+	struct sockaddr_ib *src_ib;
+
+	mask = IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID |
+	       IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH |
+	       IB_SA_PATH_REC_REVERSIBLE | IB_SA_PATH_REC_SERVICE_ID;
+
+	memset(&rec, 0, sizeof(rec));
+	rdma_addr_get_sgid(dev_addr, &rec.sgid);
+	rdma_addr_get_dgid(dev_addr, &rec.dgid);
+	rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr));
+	rec.numb_path = 1;
+	rec.reversible = 1;
+	rec.service_id = rdma_get_service_id(&id_priv->id,
+					     cma_dst_addr(id_priv));
+
+	switch (cma_family(id_priv)) {
+	case AF_INET:
+		rec.qos_class = cpu_to_be16((u16) id_priv->tos);
+		mask |= IB_SA_PATH_REC_QOS_CLASS;
+		break;
+	case AF_INET6:
+		src6 = (struct sockaddr_in6 *) cma_src_addr(id_priv);
+		rec.traffic_class = (u8) (be32_to_cpu(src6->sin6_flowinfo) >> 20);
+		mask |= IB_SA_PATH_REC_TRAFFIC_CLASS;
+		break;
+	case AF_IB:
+		src_ib = (struct sockaddr_ib *) cma_src_addr(id_priv);
+		rec.traffic_class = (u8) (be32_to_cpu(src_ib->sib_flowinfo) >> 20);
+		mask |= IB_SA_PATH_REC_TRAFFIC_CLASS;
+		break;
+	}
+
+	id_priv->query_id = ib_sa_path_rec_get(&sa_client, id_priv->id.device,
+					       id_priv->id.port_num, &rec,
+					       mask, timeout_ms,
+					       GFP_KERNEL, cma_query_handler,
+					       work, &id_priv->query);
+	if (id_priv->query_id < 0)
+		return id_priv->query_id;
+
+	return 0;
+}
+
+/*
+ * Workqueue callback for a cma_work item.
+ *
+ * With the id's handler mutex held, the id is moved from the work item's
+ * old state to its new state; only if that transition succeeds is the
+ * stored event delivered to the user's handler.  A non-zero handler return
+ * puts the id into DESTROYING and the id is destroyed once the mutex and
+ * one reference on the id have been dropped.  The work item is always
+ * freed.
+ */
+static void cma_work_handler(struct work_struct *_work)
+{
+	struct cma_work *item = container_of(_work, struct cma_work, work);
+	struct rdma_id_private *id_priv = item->id;
+	int destroy = 0;
+
+	mutex_lock(&id_priv->handler_mutex);
+	if (cma_comp_exch(id_priv, item->old_state, item->new_state) &&
+	    id_priv->id.event_handler(&id_priv->id, &item->event)) {
+		cma_exch(id_priv, RDMA_CM_DESTROYING);
+		destroy = 1;
+	}
+	mutex_unlock(&id_priv->handler_mutex);
+
+	cma_deref_id(id_priv);
+	if (destroy)
+		rdma_destroy_id(&id_priv->id);
+	kfree(item);
+}
+
+/*
+ * Workqueue callback for a cma_ndev_work item.
+ *
+ * Delivers the stored event to the user's handler unless the id is already
+ * in the DESTROYING or DEVICE_REMOVAL state.  A non-zero handler return
+ * puts the id into DESTROYING and the id is destroyed after the handler
+ * mutex and one reference on the id have been dropped.  The work item is
+ * always freed.
+ */
+static void cma_ndev_work_handler(struct work_struct *_work)
+{
+	struct cma_ndev_work *item =
+		container_of(_work, struct cma_ndev_work, work);
+	struct rdma_id_private *id_priv = item->id;
+	int destroy = 0;
+
+	mutex_lock(&id_priv->handler_mutex);
+	if (id_priv->state != RDMA_CM_DESTROYING &&
+	    id_priv->state != RDMA_CM_DEVICE_REMOVAL &&
+	    id_priv->id.event_handler(&id_priv->id, &item->event)) {
+		cma_exch(id_priv, RDMA_CM_DESTROYING);
+		destroy = 1;
+	}
+	mutex_unlock(&id_priv->handler_mutex);
+
+	cma_deref_id(id_priv);
+	if (destroy)
+		rdma_destroy_id(&id_priv->id);
+	kfree(item);
+}
+
+/*
+ * Start route resolution for an InfiniBand link layer.
+ *
+ * Allocates a work item preset to move the id from ROUTE_QUERY to
+ * ROUTE_RESOLVED and report RDMA_CM_EVENT_ROUTE_RESOLVED, allocates room
+ * for one path record and launches the SA query.  If the query cannot be
+ * started both allocations are released and route->path_rec is reset to
+ * NULL.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms)
+{
+	struct rdma_route *route = &id_priv->id.route;
+	struct cma_work *work;
+	int err;
+
+	work = kzalloc(sizeof(*work), GFP_KERNEL);
+	if (!work)
+		return -ENOMEM;
+
+	INIT_WORK(&work->work, cma_work_handler);
+	work->id = id_priv;
+	work->old_state = RDMA_CM_ROUTE_QUERY;
+	work->new_state = RDMA_CM_ROUTE_RESOLVED;
+	work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
+
+	route->path_rec = kmalloc(sizeof(*route->path_rec), GFP_KERNEL);
+	if (!route->path_rec) {
+		kfree(work);
+		return -ENOMEM;
+	}
+
+	err = cma_query_ib_route(id_priv, timeout_ms, work);
+	if (!err)
+		return 0;
+
+	kfree(route->path_rec);
+	route->path_rec = NULL;
+	kfree(work);
+	return err;
+}
+
+/*
+ * rdma_set_ib_paths - install caller-supplied path records on @id
+ * @id:        connection identifier, must be in the ADDR_RESOLVED state
+ * @path_rec:  array of path records to copy
+ * @num_paths: number of entries in @path_rec, must be at least 1
+ *
+ * Moves the id from ADDR_RESOLVED to ROUTE_RESOLVED and stores a private
+ * copy of the given path records in the id's route.  If the copy cannot be
+ * allocated the state is moved back to ADDR_RESOLVED.
+ *
+ * Returns 0 on success, -EINVAL if the arguments are unusable or the id is
+ * not in the ADDR_RESOLVED state, -ENOMEM if the copy cannot be allocated.
+ */
+int rdma_set_ib_paths(struct rdma_cm_id *id,
+		      struct ib_sa_path_rec *path_rec, int num_paths)
+{
+	struct rdma_id_private *id_priv;
+	int ret;
+
+	/*
+	 * Reject empty input before touching the state machine: a zero-length
+	 * kmemdup() returns a non-NULL zero-size pointer, which would leave
+	 * the id "route resolved" with a path_rec that must not be
+	 * dereferenced.
+	 */
+	if (!path_rec || num_paths <= 0)
+		return -EINVAL;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED,
+			   RDMA_CM_ROUTE_RESOLVED))
+		return -EINVAL;
+
+	id->route.path_rec = kmemdup(path_rec, sizeof *path_rec * num_paths,
+				     GFP_KERNEL);
+	if (!id->route.path_rec) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	id->route.num_paths = num_paths;
+	return 0;
+err:
+	/* Undo the state transition so the caller may retry. */
+	cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_ADDR_RESOLVED);
+	return ret;
+}
+EXPORT_SYMBOL(rdma_set_ib_paths);
+
+/*
+ * Route resolution for iWARP: no lookup is performed here.  A work item is
+ * queued that moves the id from ROUTE_QUERY to ROUTE_RESOLVED and reports
+ * RDMA_CM_EVENT_ROUTE_RESOLVED.  @timeout_ms is not used.
+ *
+ * Returns 0, or -ENOMEM if the work item cannot be allocated.
+ */
+static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms)
+{
+	struct cma_work *item = kzalloc(sizeof(*item), GFP_KERNEL);
+
+	if (!item)
+		return -ENOMEM;
+
+	INIT_WORK(&item->work, cma_work_handler);
+	item->id = id_priv;
+	item->old_state = RDMA_CM_ROUTE_QUERY;
+	item->new_state = RDMA_CM_ROUTE_RESOLVED;
+	item->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
+
+	queue_work(cma_wq, &item->work);
+	return 0;
+}
+
+/*
+ * Map an IP type-of-service value to a service level for @ndev.
+ *
+ * The tos is first converted to a priority.  If the underlying device (the
+ * real device when @ndev is a VLAN device) has traffic classes configured,
+ * the priority's traffic class is returned.  Otherwise, for a VLAN device
+ * with VLAN support compiled in, the VLAN priority bits of the egress QoS
+ * mapping are returned.  In all remaining cases the result is 0.
+ */
+static int iboe_tos_to_sl(struct net_device *ndev, int tos)
+{
+	struct net_device *real = ndev;
+	int prio = rt_tos2priority(tos);
+
+	if (ndev->priv_flags & IFF_802_1Q_VLAN)
+		real = vlan_dev_real_dev(ndev);
+
+	if (real->num_tc)
+		return netdev_get_prio_tc_map(real, prio);
+
+#if IS_ENABLED(CONFIG_VLAN_8021Q)
+	if (ndev->priv_flags & IFF_802_1Q_VLAN)
+		return (vlan_dev_get_egress_qos_mask(ndev, prio) &
+			VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
+#endif
+	return 0;
+}
+
+/*
+ * Resolve the route for an id on an Ethernet link layer (IBoE).
+ *
+ * No SA query is issued.  A single path record is synthesised locally from
+ * the bound net device (VLAN id, source MAC, MTU, rate, service level
+ * derived from the id's tos), the destination hardware address stored in
+ * the id's dev_addr and GIDs derived from the id's source and destination
+ * IP addresses.  A work item is then queued that moves the id from
+ * ROUTE_QUERY to ROUTE_RESOLVED and reports RDMA_CM_EVENT_ROUTE_RESOLVED.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, -ENODEV if the id is
+ * not bound to a net device (or it cannot be found), and -EINVAL if the
+ * device MTU does not map to a valid IB MTU.  On every failure the route is
+ * left without a path record (path_rec == NULL and num_paths == 0).
+ */
+static int cma_resolve_iboe_route(struct rdma_id_private *id_priv)
+{
+	struct rdma_route *route = &id_priv->id.route;
+	struct rdma_addr *addr = &route->addr;
+	struct cma_work *work;
+	int ret;
+	struct net_device *ndev = NULL;
+
+	work = kzalloc(sizeof *work, GFP_KERNEL);
+	if (!work)
+		return -ENOMEM;
+
+	work->id = id_priv;
+	INIT_WORK(&work->work, cma_work_handler);
+
+	route->path_rec = kzalloc(sizeof *route->path_rec, GFP_KERNEL);
+	if (!route->path_rec) {
+		ret = -ENOMEM;
+		goto err1;
+	}
+
+	route->num_paths = 1;
+
+	if (addr->dev_addr.bound_dev_if)
+		ndev = dev_get_by_index(&init_net, addr->dev_addr.bound_dev_if);
+	if (!ndev) {
+		ret = -ENODEV;
+		goto err2;
+	}
+
+	route->path_rec->vlan_id = rdma_vlan_dev_vlan_id(ndev);
+	memcpy(route->path_rec->dmac, addr->dev_addr.dst_dev_addr, ETH_ALEN);
+	/*
+	 * Never copy more than a MAC address worth of bytes, whatever
+	 * hardware address length the net device reports; the dmac copy
+	 * above is bounded the same way.
+	 */
+	memcpy(route->path_rec->smac, ndev->dev_addr,
+	       min_t(size_t, ndev->addr_len, ETH_ALEN));
+
+	rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr,
+		    &route->path_rec->sgid);
+	rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.dst_addr,
+		    &route->path_rec->dgid);
+
+	route->path_rec->hop_limit = 1;
+	route->path_rec->reversible = 1;
+	route->path_rec->pkey = cpu_to_be16(0xffff);
+	route->path_rec->mtu_selector = IB_SA_EQ;
+	route->path_rec->sl = iboe_tos_to_sl(ndev, id_priv->tos);
+	route->path_rec->mtu = iboe_get_mtu(ndev->mtu);
+	route->path_rec->rate_selector = IB_SA_EQ;
+	route->path_rec->rate = iboe_get_rate(ndev);
+	/* Last use of ndev: drop the reference taken by dev_get_by_index(). */
+	dev_put(ndev);
+	route->path_rec->packet_life_time_selector = IB_SA_EQ;
+	route->path_rec->packet_life_time = CMA_IBOE_PACKET_LIFETIME;
+	if (!route->path_rec->mtu) {
+		ret = -EINVAL;
+		goto err2;
+	}
+
+	work->old_state = RDMA_CM_ROUTE_QUERY;
+	work->new_state = RDMA_CM_ROUTE_RESOLVED;
+	work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
+	work->event.status = 0;
+
+	queue_work(cma_wq, &work->work);
+
+	return 0;
+
+err2:
+	kfree(route->path_rec);
+	route->path_rec = NULL;
+	/* Keep the path count consistent with the now-missing record. */
+	route->num_paths = 0;
+err1:
+	kfree(work);
+	return ret;
+}
+
+/*
+ * rdma_resolve_route - start route resolution for @id
+ * @id:         connection identifier, must be in the ADDR_RESOLVED state
+ * @timeout_ms: time allowed for the resolution, passed to the
+ *              transport-specific resolver
+ *
+ * Moves the id to ROUTE_QUERY, takes a reference on it and dispatches to
+ * the resolver matching the device transport (and, for IB transport, the
+ * port's link layer).  If the resolver fails, the state is moved back to
+ * ADDR_RESOLVED and the reference is dropped.
+ *
+ * Returns 0 on success, -EINVAL if the id is in the wrong state, -ENOSYS
+ * for an unsupported transport or link layer, or the resolver's error.
+ */
+int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms)
+{
+	struct rdma_id_private *id_priv =
+		container_of(id, struct rdma_id_private, id);
+	int err;
+
+	if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ROUTE_QUERY))
+		return -EINVAL;
+
+	atomic_inc(&id_priv->refcount);
+	switch (rdma_node_get_transport(id->device->node_type)) {
+	case RDMA_TRANSPORT_IB:
+		switch (rdma_port_get_link_layer(id->device, id->port_num)) {
+		case IB_LINK_LAYER_INFINIBAND:
+			err = cma_resolve_ib_route(id_priv, timeout_ms);
+			break;
+		case IB_LINK_LAYER_ETHERNET:
+			err = cma_resolve_iboe_route(id_priv);
+			break;
+		default:
+			err = -ENOSYS;
+			break;
+		}
+		break;
+	case RDMA_TRANSPORT_IWARP:
+		err = cma_resolve_iw_route(id_priv, timeout_ms);
+		break;
+	default:
+		err = -ENOSYS;
+		break;
+	}
+	if (!err)
+		return 0;
+
+	/* Roll back the state change and the reference taken above. */
+	cma_comp_exch(id_priv, RDMA_CM_ROUTE_QUERY, RDMA_CM_ADDR_RESOLVED);
+	cma_deref_id(id_priv);
+	return err;
+}
+EXPORT_SYMBOL(rdma_resolve_route);
+
+/*
+ * Overwrite the address part of @addr with the loopback address of its
+ * family: INADDR_LOOPBACK for AF_INET, ::1 for AF_INET6.  Any other family
+ * is treated as a sockaddr_ib and gets the same 0:0:0:1 word pattern.
+ */
+static void cma_set_loopback(struct sockaddr *addr)
+{
+	if (addr->sa_family == AF_INET) {
+		struct sockaddr_in *sin = (struct sockaddr_in *) addr;
+
+		sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+	} else if (addr->sa_family == AF_INET6) {
+		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) addr;
+
+		ipv6_addr_set(&sin6->sin6_addr, 0, 0, 0, htonl(1));
+	} else {
+		struct sockaddr_ib *sib = (struct sockaddr_ib *) addr;
+
+		ib_addr_set(&sib->sib_addr, 0, 0, 0, htonl(1));
+	}
+}
+
+/*
+ * Bind @id_priv to a local device for loopback use.
+ *
+ * Walks dev_list under the global lock (skipping non-IB transports for
+ * AF_IB ids) and picks the first port that ib_query_port() reports as
+ * active.  If no port is active, port 1 of the first eligible device is
+ * used instead.  The id then receives that port's first cached GID and
+ * P_Key, a device type matching the port's link layer, is attached to the
+ * device, and its source address is set to loopback.
+ *
+ * Returns 0 on success, -ENODEV if there is no eligible device, or the
+ * error from the GID / P_Key cache lookup.
+ */
+static int cma_bind_loopback(struct rdma_id_private *id_priv)
+{
+	struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+	struct cma_device *chosen = NULL;
+	struct cma_device *cur;
+	struct ib_port_attr attr;
+	union ib_gid gid;
+	u16 pkey;
+	int ret;
+	u8 port;
+
+	mutex_lock(&lock);
+	list_for_each_entry(cur, &dev_list, list) {
+		if (cma_family(id_priv) == AF_IB &&
+		    rdma_node_get_transport(cur->device->node_type) != RDMA_TRANSPORT_IB)
+			continue;
+
+		/* Remember the first eligible device as a fallback. */
+		if (!chosen)
+			chosen = cur;
+
+		for (port = 1; port <= cur->device->phys_port_cnt; ++port) {
+			if (ib_query_port(cur->device, port, &attr))
+				continue;
+			if (attr.state != IB_PORT_ACTIVE)
+				continue;
+
+			chosen = cur;
+			goto port_found;
+		}
+	}
+
+	if (!chosen) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	/* No active port anywhere: settle for port 1 of the fallback. */
+	port = 1;
+
+port_found:
+	ret = ib_get_cached_gid(chosen->device, port, 0, &gid);
+	if (ret)
+		goto out;
+
+	ret = ib_get_cached_pkey(chosen->device, port, 0, &pkey);
+	if (ret)
+		goto out;
+
+	if (rdma_port_get_link_layer(chosen->device, port) ==
+	    IB_LINK_LAYER_INFINIBAND)
+		dev_addr->dev_type = ARPHRD_INFINIBAND;
+	else
+		dev_addr->dev_type = ARPHRD_ETHER;
+
+	rdma_addr_set_sgid(dev_addr, &gid);
+	ib_addr_set_pkey(dev_addr, pkey);
+	id_priv->id.port_num = port;
+	cma_attach_to_dev(id_priv, chosen);
+	cma_set_loopback(cma_src_addr(id_priv));
+out:
+	mutex_unlock(&lock);
+	return ret;
+}
+
+/*
+ * Completion callback for rdma_resolve_ip(); @context is the
+ * rdma_id_private that started the resolution.
+ *
+ * Nothing is reported unless the id can be moved from ADDR_QUERY to
+ * ADDR_RESOLVED.  The resolved source address is copied into the id and,
+ * when resolution succeeded and no device is attached yet, a device is
+ * acquired.  Any failure moves the id back to ADDR_BOUND and is reported as
+ * RDMA_CM_EVENT_ADDR_ERROR; success is reported as
+ * RDMA_CM_EVENT_ADDR_RESOLVED.  A non-zero return from the user's handler
+ * destroys the id.  One reference on the id is dropped on every path.
+ */
+static void addr_handler(int status, struct sockaddr *src_addr,
+			 struct rdma_dev_addr *dev_addr, void *context)
+{
+	struct rdma_id_private *id_priv = context;
+	struct rdma_cm_event ev;
+
+	memset(&ev, 0, sizeof(ev));
+
+	mutex_lock(&id_priv->handler_mutex);
+	if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY,
+			   RDMA_CM_ADDR_RESOLVED))
+		goto unlock;
+
+	memcpy(cma_src_addr(id_priv), src_addr, rdma_addr_size(src_addr));
+	if (!status && !id_priv->cma_dev)
+		status = cma_acquire_dev(id_priv, NULL);
+
+	if (!status) {
+		ev.event = RDMA_CM_EVENT_ADDR_RESOLVED;
+	} else {
+		if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED,
+				   RDMA_CM_ADDR_BOUND))
+			goto unlock;
+		ev.event = RDMA_CM_EVENT_ADDR_ERROR;
+		ev.status = status;
+	}
+
+	if (!id_priv->id.event_handler(&id_priv->id, &ev))
+		goto unlock;
+
+	/* The user asked for the id to be destroyed. */
+	cma_exch(id_priv, RDMA_CM_DESTROYING);
+	mutex_unlock(&id_priv->handler_mutex);
+	cma_deref_id(id_priv);
+	rdma_destroy_id(&id_priv->id);
+	return;
+
+unlock:
+	mutex_unlock(&id_priv->handler_mutex);
+	cma_deref_id(id_priv);
+}
+
+/*
+ * Resolve a wildcard destination as loopback.
+ *
+ * Binds the id to a loopback-capable device if none is attached yet, uses
+ * the id's source GID as destination GID as well, and queues a work item
+ * that moves the id from ADDR_QUERY to ADDR_RESOLVED and reports
+ * RDMA_CM_EVENT_ADDR_RESOLVED.
+ *
+ * Returns 0 on success, -ENOMEM, or the error from cma_bind_loopback().
+ */
+static int cma_resolve_loopback(struct rdma_id_private *id_priv)
+{
+	struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+	struct cma_work *item;
+	union ib_gid gid;
+	int err;
+
+	item = kzalloc(sizeof(*item), GFP_KERNEL);
+	if (!item)
+		return -ENOMEM;
+
+	if (!id_priv->cma_dev) {
+		err = cma_bind_loopback(id_priv);
+		if (err) {
+			kfree(item);
+			return err;
+		}
+	}
+
+	rdma_addr_get_sgid(dev_addr, &gid);
+	rdma_addr_set_dgid(dev_addr, &gid);
+
+	INIT_WORK(&item->work, cma_work_handler);
+	item->id = id_priv;
+	item->old_state = RDMA_CM_ADDR_QUERY;
+	item->new_state = RDMA_CM_ADDR_RESOLVED;
+	item->event.event = RDMA_CM_EVENT_ADDR_RESOLVED;
+	queue_work(cma_wq, &item->work);
+	return 0;
+}
+
+/*
+ * Resolve an AF_IB destination address.
+ *
+ * Resolves a local IB device if none is attached yet, takes the destination
+ * GID directly from the id's destination sockaddr_ib, and queues a work item
+ * that moves the id from ADDR_QUERY to ADDR_RESOLVED and reports
+ * RDMA_CM_EVENT_ADDR_RESOLVED.
+ *
+ * Returns 0 on success, -ENOMEM, or the error from cma_resolve_ib_dev().
+ */
+static int cma_resolve_ib_addr(struct rdma_id_private *id_priv)
+{
+	struct sockaddr_ib *dst;
+	struct cma_work *item;
+	int err;
+
+	item = kzalloc(sizeof(*item), GFP_KERNEL);
+	if (!item)
+		return -ENOMEM;
+
+	if (!id_priv->cma_dev) {
+		err = cma_resolve_ib_dev(id_priv);
+		if (err) {
+			kfree(item);
+			return err;
+		}
+	}
+
+	dst = (struct sockaddr_ib *) &id_priv->id.route.addr.dst_addr;
+	rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr,
+			   (union ib_gid *) &dst->sib_addr);
+
+	INIT_WORK(&item->work, cma_work_handler);
+	item->id = id_priv;
+	item->old_state = RDMA_CM_ADDR_QUERY;
+	item->new_state = RDMA_CM_ADDR_RESOLVED;
+	item->event.event = RDMA_CM_EVENT_ADDR_RESOLVED;
+	queue_work(cma_wq, &item->work);
+	return 0;
+}
+
+/*
+ * Bind @id before address resolution.
+ *
+ * When the caller gave no source address (NULL or family 0), the id's own
+ * source address storage is used instead: it takes the family of @dst_addr
+ * and, additionally, the scope id for AF_INET6 or the P_Key for AF_IB.  The
+ * chosen address is then handed to rdma_bind_addr().
+ */
+static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
+			 struct sockaddr *dst_addr)
+{
+	if (src_addr && src_addr->sa_family)
+		return rdma_bind_addr(id, src_addr);
+
+	src_addr = (struct sockaddr *) &id->route.addr.src_addr;
+	src_addr->sa_family = dst_addr->sa_family;
+
+	switch (dst_addr->sa_family) {
+	case AF_INET6:
+		((struct sockaddr_in6 *) src_addr)->sin6_scope_id =
+			((struct sockaddr_in6 *) dst_addr)->sin6_scope_id;
+		break;
+	case AF_IB:
+		((struct sockaddr_ib *) src_addr)->sib_pkey =
+			((struct sockaddr_ib *) dst_addr)->sib_pkey;
+		break;
+	default:
+		break;
+	}
+
+	return rdma_bind_addr(id, src_addr);
+}
+
+/*
+ * rdma_resolve_addr - resolve destination (and optionally source) address
+ * @id:         connection identifier
+ * @src_addr:   source address, may be NULL
+ * @dst_addr:   destination address
+ * @timeout_ms: time allowed for the resolution
+ *
+ * An idle id is bound first.  The id's address family must match that of
+ * @dst_addr and the id must be in the ADDR_BOUND state; it is then moved to
+ * ADDR_QUERY and a reference is taken.  A wildcard destination is resolved
+ * as loopback, an AF_IB destination is resolved locally, everything else
+ * goes through rdma_resolve_ip() with addr_handler() as completion
+ * callback.  If starting the resolution fails the state change and the
+ * reference are undone.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
+		      struct sockaddr *dst_addr, int timeout_ms)
+{
+	struct rdma_id_private *id_priv =
+		container_of(id, struct rdma_id_private, id);
+	int err;
+
+	if (id_priv->state == RDMA_CM_IDLE) {
+		err = cma_bind_addr(id, src_addr, dst_addr);
+		if (err)
+			return err;
+	}
+
+	if (cma_family(id_priv) != dst_addr->sa_family)
+		return -EINVAL;
+
+	if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY))
+		return -EINVAL;
+
+	atomic_inc(&id_priv->refcount);
+	memcpy(cma_dst_addr(id_priv), dst_addr, rdma_addr_size(dst_addr));
+
+	if (cma_any_addr(dst_addr))
+		err = cma_resolve_loopback(id_priv);
+	else if (dst_addr->sa_family == AF_IB)
+		err = cma_resolve_ib_addr(id_priv);
+	else
+		err = rdma_resolve_ip(&addr_client, cma_src_addr(id_priv),
+				      dst_addr, &id->route.addr.dev_addr,
+				      timeout_ms, addr_handler, id_priv);
+	if (!err)
+		return 0;
+
+	/* Undo the state transition and drop the reference taken above. */
+	cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND);
+	cma_deref_id(id_priv);
+	return err;
+}
+EXPORT_SYMBOL(rdma_resolve_addr);
+
+/*
+ * rdma_set_reuseaddr - set the address-reuse option on @id
+ *
+ * Enabling reuse (non-zero @reuse) is accepted in any state; disabling it
+ * is accepted only while the id is idle.  The check and the update are done
+ * under the id's spinlock.
+ *
+ * Returns 0 on success, -EINVAL otherwise.
+ */
+int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse)
+{
+	struct rdma_id_private *id_priv =
+		container_of(id, struct rdma_id_private, id);
+	unsigned long flags;
+	int ret = -EINVAL;
+
+	spin_lock_irqsave(&id_priv->lock, flags);
+	if (reuse || id_priv->state == RDMA_CM_IDLE) {
+		id_priv->reuseaddr = reuse;
+		ret = 0;
+	}
+	spin_unlock_irqrestore(&id_priv->lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(rdma_set_reuseaddr);
+
+/*
+ * rdma_set_afonly - set the address-family-only option on @id
+ *
+ * Accepted only while the id is idle or address-bound.  Besides storing
+ * @afonly, the CMA_OPTION_AFONLY bit is set in the id's options to record
+ * that the value was chosen explicitly.  The check and the update are done
+ * under the id's spinlock.
+ *
+ * Returns 0 on success, -EINVAL if the id is in any other state.
+ */
+int rdma_set_afonly(struct rdma_cm_id *id, int afonly)
+{
+	struct rdma_id_private *id_priv =
+		container_of(id, struct rdma_id_private, id);
+	unsigned long flags;
+	int ret = -EINVAL;
+
+	spin_lock_irqsave(&id_priv->lock, flags);
+	if (id_priv->state == RDMA_CM_IDLE ||
+	    id_priv->state == RDMA_CM_ADDR_BOUND) {
+		id_priv->options |= (1 << CMA_OPTION_AFONLY);
+		id_priv->afonly = afonly;
+		ret = 0;
+	}
+	spin_unlock_irqrestore(&id_priv->lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(rdma_set_afonly);
+
+/*
+ * Attach @id_priv to @bind_list and write the list's port number into the
+ * id's source address.
+ *
+ * For AF_INET / AF_INET6 the port field is set.  For AF_IB the port is
+ * OR-ed into the masked service id and the service id mask is set to all
+ * ones.  The id is then added to the head of the bind list's owners.
+ */
+static void cma_bind_port(struct rdma_bind_list *bind_list,
+			  struct rdma_id_private *id_priv)
+{
+	struct sockaddr *src = cma_src_addr(id_priv);
+	__be16 port = htons(bind_list->port);
+	struct sockaddr_ib *sib;
+	u64 sid, mask;
+
+	if (src->sa_family == AF_INET) {
+		((struct sockaddr_in *) src)->sin_port = port;
+	} else if (src->sa_family == AF_INET6) {
+		((struct sockaddr_in6 *) src)->sin6_port = port;
+	} else if (src->sa_family == AF_IB) {
+		sib = (struct sockaddr_ib *) src;
+		sid = be64_to_cpu(sib->sib_sid);
+		mask = be64_to_cpu(sib->sib_sid_mask);
+		sib->sib_sid = cpu_to_be64((sid & mask) | (u64) ntohs(port));
+		sib->sib_sid_mask = cpu_to_be64(~0ULL);
+	}
+
+	id_priv->bind_list = bind_list;
+	hlist_add_head(&id_priv->node, &bind_list->owners);
+}
+
+/*
+ * Allocate a new bind list for exactly port @snum in port space @ps and
+ * bind @id_priv to it.
+ *
+ * Returns 0 on success, -ENOMEM if the bind list cannot be allocated,
+ * -EADDRNOTAVAIL if the idr reports the slot as unavailable (-ENOSPC), or
+ * any other idr_alloc() error unchanged.
+ */
+static int cma_alloc_port(struct idr *ps, struct rdma_id_private *id_priv,
+			  unsigned short snum)
+{
+	struct rdma_bind_list *bl;
+	int slot;
+
+	bl = kzalloc(sizeof(*bl), GFP_KERNEL);
+	if (!bl)
+		return -ENOMEM;
+
+	slot = idr_alloc(ps, bl, snum, snum + 1, GFP_KERNEL);
+	if (slot < 0) {
+		kfree(bl);
+		return slot == -ENOSPC ? -EADDRNOTAVAIL : slot;
+	}
+
+	bl->ps = ps;
+	bl->port = (unsigned short)slot;
+	cma_bind_port(bl, id_priv);
+	return 0;
+}
+
+/*
+ * Pick a free port from the local port range of init_net and bind
+ * @id_priv to it.
+ *
+ * The search starts at a random port inside the range and walks forward,
+ * wrapping to the low end, until every port in the range has been tried
+ * once.  The port handed out by the previous successful call is skipped.
+ *
+ * Returns 0 on success, -EADDRNOTAVAIL when the range is exhausted, or any
+ * other error from cma_alloc_port().
+ */
+static int cma_alloc_any_port(struct idr *ps, struct rdma_id_private *id_priv)
+{
+	static unsigned int last_used_port;
+	int low, high, remaining;
+	unsigned int rover;
+
+	inet_get_local_port_range(&init_net, &low, &high);
+	remaining = (high - low) + 1;
+	rover = prandom_u32() % remaining + low;
+
+	for (;;) {
+		if (last_used_port != rover &&
+		    !idr_find(ps, (unsigned short) rover)) {
+			int err = cma_alloc_port(ps, id_priv, rover);
+
+			/*
+			 * Remember the port just handed out so that it is not
+			 * reused right after being closed.
+			 */
+			if (!err)
+				last_used_port = rover;
+			if (err != -EADDRNOTAVAIL)
+				return err;
+		}
+
+		if (!--remaining)
+			break;
+
+		rover++;
+		if ((rover < low) || (rover > high))
+			rover = low;
+	}
+
+	return -EADDRNOTAVAIL;
+}
+
+/*
+ * Verify that @id_priv may use the port represented by @bind_list.
+ *
+ * Used both when binding to a specific port and when starting to listen on
+ * a port that is already bound; in the second case @id_priv may itself be
+ * one of the owners and is skipped.
+ *
+ * An existing owner does not conflict if
+ *  - it is not listening and both it and the caller allow address reuse, or
+ *  - both ids are address-family-only and their families differ.
+ * Otherwise a wildcard address on either side yields -EADDRNOTAVAIL and an
+ * identical address yields -EADDRINUSE.  Returns 0 if no owner conflicts.
+ */
+static int cma_check_port(struct rdma_bind_list *bind_list,
+			  struct rdma_id_private *id_priv, uint8_t reuseaddr)
+{
+	struct sockaddr *mine = cma_src_addr(id_priv);
+	struct rdma_id_private *owner;
+	struct sockaddr *theirs;
+
+	hlist_for_each_entry(owner, &bind_list->owners, node) {
+		if (owner == id_priv)
+			continue;
+
+		if (reuseaddr && owner->reuseaddr &&
+		    (owner->state != RDMA_CM_LISTEN))
+			continue;
+
+		theirs = cma_src_addr(owner);
+		if (id_priv->afonly && owner->afonly &&
+		    (mine->sa_family != theirs->sa_family))
+			continue;
+
+		if (cma_any_addr(mine) || cma_any_addr(theirs))
+			return -EADDRNOTAVAIL;
+
+		if (!cma_addr_cmp(mine, theirs))
+			return -EADDRINUSE;
+	}
+
+	return 0;
+}
+
+/*
+ * Bind @id_priv to the specific port found in its source address.
+ *
+ * Ports below PROT_SOCK require CAP_NET_BIND_SERVICE.  If nobody uses the
+ * port yet a new bind list is allocated; otherwise the id joins the
+ * existing bind list provided cma_check_port() finds no conflict.
+ *
+ * Returns 0 on success, -EACCES for a privileged port without the
+ * capability, or the error from the allocation / conflict check.
+ */
+static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv)
+{
+	unsigned short snum = ntohs(cma_port(cma_src_addr(id_priv)));
+	struct rdma_bind_list *bl;
+	int err;
+
+	if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
+		return -EACCES;
+
+	bl = idr_find(ps, snum);
+	if (!bl)
+		return cma_alloc_port(ps, id_priv, snum);
+
+	err = cma_check_port(bl, id_priv, id_priv->reuseaddr);
+	if (!err)
+		cma_bind_port(bl, id_priv);
+	return err;
+}
+
+/*
+ * Re-check the id's bound port before it starts listening.
+ *
+ * With the global lock held, if the bind list has more than one owner the
+ * port is checked again via cma_check_port() with address reuse disabled.
+ * Returns 0 if listening is allowed, otherwise the check's error.
+ */
+static int cma_bind_listen(struct rdma_id_private *id_priv)
+{
+	struct rdma_bind_list *bl = id_priv->bind_list;
+	int err = 0;
+
+	mutex_lock(&lock);
+	/* A second entry on the owners list means the port is shared. */
+	if (bl->owners.first->next)
+		err = cma_check_port(bl, id_priv, 0);
+	mutex_unlock(&lock);
+
+	return err;
+}
+
+/*
+ * Return the port-space idr matching the id's RDMA port space, or NULL for
+ * a port space that has none.
+ */
+static struct idr *cma_select_inet_ps(struct rdma_id_private *id_priv)
+{
+	if (id_priv->id.ps == RDMA_PS_TCP)
+		return &tcp_ps;
+	if (id_priv->id.ps == RDMA_PS_UDP)
+		return &udp_ps;
+	if (id_priv->id.ps == RDMA_PS_IPOIB)
+		return &ipoib_ps;
+	if (id_priv->id.ps == RDMA_PS_IB)
+		return &ib_ps;
+
+	return NULL;
+}
+
+/*
+ * Select the port-space idr for an AF_IB id from the port-space bits of the
+ * service id in its source address.
+ *
+ * The masked service id is compared against the IB, TCP and UDP port-space
+ * prefixes, in that order.  An RDMA_PS_IB id may match any of them; TCP and
+ * UDP ids match only their own prefix.  On a match the service id is
+ * rewritten as "matched prefix | port" and the port-space bits are added to
+ * the service id mask.
+ *
+ * Returns the selected idr, or NULL if nothing matched (in which case the
+ * address is left untouched).
+ */
+static struct idr *cma_select_ib_ps(struct rdma_id_private *id_priv)
+{
+	struct sockaddr_ib *sib;
+	struct idr *ps = NULL;
+	u64 prefix, mask, sid;
+	int ps_is_ib;
+
+	sib = (struct sockaddr_ib *) cma_src_addr(id_priv);
+	mask = be64_to_cpu(sib->sib_sid_mask) & RDMA_IB_IP_PS_MASK;
+	sid = be64_to_cpu(sib->sib_sid) & mask;
+	ps_is_ib = (id_priv->id.ps == RDMA_PS_IB);
+
+	if (ps_is_ib && (sid == (RDMA_IB_IP_PS_IB & mask))) {
+		prefix = RDMA_IB_IP_PS_IB;
+		ps = &ib_ps;
+	} else if ((ps_is_ib || (id_priv->id.ps == RDMA_PS_TCP)) &&
+		   (sid == (RDMA_IB_IP_PS_TCP & mask))) {
+		prefix = RDMA_IB_IP_PS_TCP;
+		ps = &tcp_ps;
+	} else if ((ps_is_ib || (id_priv->id.ps == RDMA_PS_UDP)) &&
+		   (sid == (RDMA_IB_IP_PS_UDP & mask))) {
+		prefix = RDMA_IB_IP_PS_UDP;
+		ps = &udp_ps;
+	}
+
+	if (!ps)
+		return NULL;
+
+	sib->sib_sid = cpu_to_be64(prefix | ntohs(cma_port((struct sockaddr *) sib)));
+	sib->sib_sid_mask = cpu_to_be64(RDMA_IB_IP_PS_MASK |
+					be64_to_cpu(sib->sib_sid_mask));
+	return ps;
+}
+
+/*
+ * Assign a port to @id_priv.
+ *
+ * Selects the port space for the id's address family, then, under the
+ * global lock, either picks any free port (when the source address carries
+ * a wildcard port) or claims the specific port requested.
+ *
+ * Returns 0 on success, -EPROTONOSUPPORT if no port space matches, or the
+ * error from the port allocation.
+ */
+static int cma_get_port(struct rdma_id_private *id_priv)
+{
+	struct idr *ps;
+	int err;
+
+	ps = (cma_family(id_priv) != AF_IB) ? cma_select_inet_ps(id_priv)
+					    : cma_select_ib_ps(id_priv);
+	if (!ps)
+		return -EPROTONOSUPPORT;
+
+	mutex_lock(&lock);
+	err = cma_any_port(cma_src_addr(id_priv)) ?
+	      cma_alloc_any_port(ps, id_priv) :
+	      cma_use_port(ps, id_priv);
+	mutex_unlock(&lock);
+
+	return err;
+}
+
+/*
+ * For an IPv6 link-local @addr, require a scope id and record it as the
+ * bound interface index in @dev_addr.  All other addresses, and kernels
+ * built without IPv6, are accepted without change.
+ *
+ * Returns 0, or -EINVAL for a link-local address with a zero scope id.
+ */
+static int cma_check_linklocal(struct rdma_dev_addr *dev_addr,
+			       struct sockaddr *addr)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) addr;
+
+	if (addr->sa_family != AF_INET6 ||
+	    !(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL))
+		return 0;
+
+	if (!sin6->sin6_scope_id)
+		return -EINVAL;
+
+	dev_addr->bound_dev_if = sin6->sin6_scope_id;
+#endif
+	return 0;
+}
+
+/*
+ * rdma_listen - start listening for incoming connection requests
+ * @id:      connection identifier
+ * @backlog: backlog of pending requests
+ *
+ * An idle id is first bound to its own source address storage with the
+ * family forced to AF_INET.  The id is then moved from ADDR_BOUND to
+ * LISTEN.  If address reuse is enabled the port is re-checked for
+ * conflicts.  An id bound to a device listens through the CM of that
+ * device's transport; an id without a device listens on all devices.
+ *
+ * On failure after the state change the backlog is reset to 0 and the id
+ * is moved back to ADDR_BOUND.  Returns 0 on success or a negative errno
+ * (-EINVAL for a wrong state, -ENOSYS for an unsupported transport).
+ */
+int rdma_listen(struct rdma_cm_id *id, int backlog)
+{
+	struct rdma_id_private *id_priv =
+		container_of(id, struct rdma_id_private, id);
+	int err;
+
+	if (id_priv->state == RDMA_CM_IDLE) {
+		id->route.addr.src_addr.ss_family = AF_INET;
+		err = rdma_bind_addr(id, cma_src_addr(id_priv));
+		if (err)
+			return err;
+	}
+
+	if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_LISTEN))
+		return -EINVAL;
+
+	if (id_priv->reuseaddr) {
+		err = cma_bind_listen(id_priv);
+		if (err)
+			goto undo;
+	}
+
+	id_priv->backlog = backlog;
+	if (!id->device) {
+		cma_listen_on_all(id_priv);
+		return 0;
+	}
+
+	switch (rdma_node_get_transport(id->device->node_type)) {
+	case RDMA_TRANSPORT_IB:
+		err = cma_ib_listen(id_priv);
+		break;
+	case RDMA_TRANSPORT_IWARP:
+		err = cma_iw_listen(id_priv, backlog);
+		break;
+	default:
+		err = -ENOSYS;
+		break;
+	}
+	if (!err)
+		return 0;
+
+undo:
+	id_priv->backlog = 0;
+	cma_comp_exch(id_priv, RDMA_CM_LISTEN, RDMA_CM_ADDR_BOUND);
+	return err;
+}
+EXPORT_SYMBOL(rdma_listen);
+
+/*
+ * rdma_bind_addr - bind @id to a local address
+ * @id:   connection identifier, must be idle
+ * @addr: local address (AF_INET, AF_INET6 or AF_IB)
+ *
+ * Moves the id from IDLE to ADDR_BOUND, validates link-local scope, stores
+ * @addr as the id's source address and, for a non-wildcard address,
+ * translates it and acquires the matching device.  Unless afonly was set
+ * explicitly, it defaults to 1 for AF_INET and to the bindv6only sysctl of
+ * init_net for AF_INET6.  Finally a port is assigned.
+ *
+ * On failure the id returns to IDLE; if the failure is in port assignment
+ * an already attached device is released first.
+ *
+ * Returns 0 on success, -EAFNOSUPPORT for an unsupported family, -EINVAL
+ * if the id is not idle, or the error of the failing step.
+ */
+int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr)
+{
+	struct rdma_id_private *id_priv;
+	int err;
+
+	switch (addr->sa_family) {
+	case AF_INET:
+	case AF_INET6:
+	case AF_IB:
+		break;
+	default:
+		return -EAFNOSUPPORT;
+	}
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	if (!cma_comp_exch(id_priv, RDMA_CM_IDLE, RDMA_CM_ADDR_BOUND))
+		return -EINVAL;
+
+	err = cma_check_linklocal(&id->route.addr.dev_addr, addr);
+	if (err)
+		goto err_state;
+
+	memcpy(cma_src_addr(id_priv), addr, rdma_addr_size(addr));
+	if (!cma_any_addr(addr)) {
+		err = cma_translate_addr(addr, &id->route.addr.dev_addr);
+		if (err)
+			goto err_state;
+
+		err = cma_acquire_dev(id_priv, NULL);
+		if (err)
+			goto err_state;
+	}
+
+	/* Apply the per-family default only if the user never set afonly. */
+	if (!(id_priv->options & (1 << CMA_OPTION_AFONLY))) {
+		if (addr->sa_family == AF_INET)
+			id_priv->afonly = 1;
+#if IS_ENABLED(CONFIG_IPV6)
+		else if (addr->sa_family == AF_INET6)
+			id_priv->afonly = init_net.ipv6.sysctl.bindv6only;
+#endif
+	}
+
+	err = cma_get_port(id_priv);
+	if (!err)
+		return 0;
+
+	if (id_priv->cma_dev)
+		cma_release_dev(id_priv);
+err_state:
+	cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_IDLE);
+	return err;
+}
+EXPORT_SYMBOL(rdma_bind_addr);
+
+/*
+ * Fill in the CMA header at @hdr for the connection described by @id_priv.
+ *
+ * The version field is always written.  For AF_INET and AF_INET6 ids the
+ * IP version, the source and destination addresses and the source port are
+ * written as well; other families get the version field only.
+ *
+ * Always returns 0.
+ */
+static int cma_format_hdr(void *hdr, struct rdma_id_private *id_priv)
+{
+	struct cma_hdr *h = hdr;
+
+	h->cma_version = CMA_VERSION;
+
+	switch (cma_family(id_priv)) {
+	case AF_INET: {
+		struct sockaddr_in *s4, *d4;
+
+		s4 = (struct sockaddr_in *) cma_src_addr(id_priv);
+		d4 = (struct sockaddr_in *) cma_dst_addr(id_priv);
+
+		cma_set_ip_ver(h, 4);
+		h->src_addr.ip4.addr = s4->sin_addr.s_addr;
+		h->dst_addr.ip4.addr = d4->sin_addr.s_addr;
+		h->port = s4->sin_port;
+		break;
+	}
+	case AF_INET6: {
+		struct sockaddr_in6 *s6, *d6;
+
+		s6 = (struct sockaddr_in6 *) cma_src_addr(id_priv);
+		d6 = (struct sockaddr_in6 *) cma_dst_addr(id_priv);
+
+		cma_set_ip_ver(h, 6);
+		h->src_addr.ip6 = s6->sin6_addr;
+		h->dst_addr.ip6 = d6->sin6_addr;
+		h->port = s6->sin6_port;
+		break;
+	}
+	default:
+		break;
+	}
+
+	return 0;
+}
+
+/*
+ * IB CM callback for ids that sent a SIDR request (see
+ * cma_resolve_ib_udp()).
+ *
+ * Translates the CM event into an RDMA CM event and passes it to the
+ * user's handler:
+ *  - a SIDR request error becomes UNREACHABLE with -ETIMEDOUT;
+ *  - a SIDR reply with a non-success status becomes UNREACHABLE carrying
+ *    that status;
+ *  - a successful reply sets the id's Q_Key (ADDR_ERROR if that fails) and
+ *    otherwise becomes ESTABLISHED with the address handle attributes,
+ *    remote QP number and Q_Key filled in.
+ * Unexpected CM events are logged and ignored.
+ *
+ * Nothing is done if cma_disable_callback() refuses the callback for the
+ * CONNECT state.  Presumably a successful cma_disable_callback() leaves
+ * handler_mutex held, since every later exit unlocks it -- confirm against
+ * its definition.  A non-zero return from the user's handler destroys the
+ * id and is passed back to the IB CM.
+ */
+static int cma_sidr_rep_handler(struct ib_cm_id *cm_id,
+				struct ib_cm_event *ib_event)
+{
+	struct ib_cm_sidr_rep_event_param *rep = &ib_event->param.sidr_rep_rcvd;
+	struct rdma_id_private *id_priv = cm_id->context;
+	struct rdma_cm_event ev;
+	int ret = 0;
+
+	if (cma_disable_callback(id_priv, RDMA_CM_CONNECT))
+		return 0;
+
+	memset(&ev, 0, sizeof(ev));
+	switch (ib_event->event) {
+	case IB_CM_SIDR_REQ_ERROR:
+		ev.event = RDMA_CM_EVENT_UNREACHABLE;
+		ev.status = -ETIMEDOUT;
+		break;
+	case IB_CM_SIDR_REP_RECEIVED:
+		ev.param.ud.private_data = ib_event->private_data;
+		ev.param.ud.private_data_len = IB_CM_SIDR_REP_PRIVATE_DATA_SIZE;
+		if (rep->status != IB_SIDR_SUCCESS) {
+			ev.event = RDMA_CM_EVENT_UNREACHABLE;
+			ev.status = rep->status;
+			break;
+		}
+		ret = cma_set_qkey(id_priv, rep->qkey);
+		if (ret) {
+			ev.event = RDMA_CM_EVENT_ADDR_ERROR;
+			ev.status = ret;
+			break;
+		}
+		ib_init_ah_from_path(id_priv->id.device, id_priv->id.port_num,
+				     id_priv->id.route.path_rec,
+				     &ev.param.ud.ah_attr);
+		ev.param.ud.qp_num = rep->qpn;
+		ev.param.ud.qkey = rep->qkey;
+		ev.event = RDMA_CM_EVENT_ESTABLISHED;
+		ev.status = 0;
+		break;
+	default:
+		printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d\n",
+		       ib_event->event);
+		goto unlock;
+	}
+
+	ret = id_priv->id.event_handler(&id_priv->id, &ev);
+	if (!ret)
+		goto unlock;
+
+	/* Destroy the CM ID by returning a non-zero value. */
+	id_priv->cm_id.ib = NULL;
+	cma_exch(id_priv, RDMA_CM_DESTROYING);
+	mutex_unlock(&id_priv->handler_mutex);
+	rdma_destroy_id(&id_priv->id);
+	return ret;
+
+unlock:
+	mutex_unlock(&id_priv->handler_mutex);
+	return ret;
+}
+
+/*
+ * Send a SIDR request for a datagram-style id over IB.
+ *
+ * The request's private data is the user's private data placed at the
+ * offset returned by cma_user_data_offset(), with the CMA header formatted
+ * at the start of the buffer.  A CM id with cma_sidr_rep_handler() as
+ * callback is created, stored in @id_priv and used to send the request
+ * along the id's resolved path.  If sending fails the CM id is destroyed
+ * again.  The temporary private data buffer is always freed.
+ *
+ * Returns 0 on success, -EINVAL if the private data length overflows,
+ * -ENOMEM, or the error from the CM.
+ */
+static int cma_resolve_ib_udp(struct rdma_id_private *id_priv,
+			      struct rdma_conn_param *conn_param)
+{
+	struct ib_cm_sidr_req_param req;
+	struct ib_cm_id *cm_id;
+	void *pdata = NULL;
+	int offset, ret;
+
+	memset(&req, 0, sizeof(req));
+	offset = cma_user_data_offset(id_priv);
+	req.private_data_len = offset + conn_param->private_data_len;
+	/* A sum smaller than one of its terms means the length wrapped. */
+	if (req.private_data_len < conn_param->private_data_len)
+		return -EINVAL;
+
+	if (req.private_data_len) {
+		pdata = kzalloc(req.private_data_len, GFP_ATOMIC);
+		if (!pdata)
+			return -ENOMEM;
+	}
+
+	if (conn_param->private_data && conn_param->private_data_len)
+		memcpy(pdata + offset, conn_param->private_data,
+		       conn_param->private_data_len);
+
+	if (pdata) {
+		ret = cma_format_hdr(pdata, id_priv);
+		if (ret)
+			goto free_pdata;
+		req.private_data = pdata;
+	}
+
+	cm_id = ib_create_cm_id(id_priv->id.device, cma_sidr_rep_handler,
+				id_priv);
+	if (IS_ERR(cm_id)) {
+		ret = PTR_ERR(cm_id);
+		goto free_pdata;
+	}
+	id_priv->cm_id.ib = cm_id;
+
+	req.path = id_priv->id.route.path_rec;
+	req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv));
+	req.timeout_ms = 1 << (CMA_CM_RESPONSE_TIMEOUT - 8);
+	req.max_cm_retries = CMA_MAX_CM_RETRIES;
+
+	ret = ib_send_cm_sidr_req(cm_id, &req);
+	if (ret) {
+		ib_destroy_cm_id(cm_id);
+		id_priv->cm_id.ib = NULL;
+	}
+
+free_pdata:
+	kfree(pdata);
+	return ret;
+}
+
+/*
+ * Send an IB CM connection request for @id_priv.
+ *
+ * The request's private data is the user's private data placed at the
+ * offset returned by cma_user_data_offset(), with the CMA header formatted
+ * at the start of the buffer.  A CM id with cma_ib_handler() as callback is
+ * created and stored in @id_priv.  The request uses the first path record
+ * as primary path and, when exactly two paths are present, the second as
+ * alternate path.  Retry and RNR retry counts are capped at 7.
+ *
+ * If anything fails after the CM id was created, the CM id is destroyed and
+ * the pointer in @id_priv cleared.  The temporary private data buffer is
+ * always freed.
+ *
+ * Returns 0 on success, -EINVAL if the private data length overflows,
+ * -ENOMEM, or the error from the CM.
+ */
+static int cma_connect_ib(struct rdma_id_private *id_priv,
+			  struct rdma_conn_param *conn_param)
+{
+	struct ib_cm_req_param req;
+	struct rdma_route *route;
+	struct ib_cm_id *cm_id;
+	void *pdata = NULL;
+	int offset, ret;
+
+	memset(&req, 0, sizeof(req));
+	offset = cma_user_data_offset(id_priv);
+	req.private_data_len = offset + conn_param->private_data_len;
+	/* A sum smaller than one of its terms means the length wrapped. */
+	if (req.private_data_len < conn_param->private_data_len)
+		return -EINVAL;
+
+	if (req.private_data_len) {
+		pdata = kzalloc(req.private_data_len, GFP_ATOMIC);
+		if (!pdata)
+			return -ENOMEM;
+	}
+
+	if (conn_param->private_data && conn_param->private_data_len)
+		memcpy(pdata + offset, conn_param->private_data,
+		       conn_param->private_data_len);
+
+	cm_id = ib_create_cm_id(id_priv->id.device, cma_ib_handler, id_priv);
+	if (IS_ERR(cm_id)) {
+		ret = PTR_ERR(cm_id);
+		goto free_pdata;
+	}
+	id_priv->cm_id.ib = cm_id;
+
+	route = &id_priv->id.route;
+	if (pdata) {
+		ret = cma_format_hdr(pdata, id_priv);
+		if (ret)
+			goto destroy_cm_id;
+		req.private_data = pdata;
+	}
+
+	req.primary_path = &route->path_rec[0];
+	if (route->num_paths == 2)
+		req.alternate_path = &route->path_rec[1];
+
+	req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv));
+	req.qp_num = id_priv->qp_num;
+	req.qp_type = id_priv->id.qp_type;
+	req.starting_psn = id_priv->seq_num;
+	req.responder_resources = conn_param->responder_resources;
+	req.initiator_depth = conn_param->initiator_depth;
+	req.flow_control = conn_param->flow_control;
+	req.retry_count = min_t(u8, 7, conn_param->retry_count);
+	req.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count);
+	req.remote_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT;
+	req.local_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT;
+	req.max_cm_retries = CMA_MAX_CM_RETRIES;
+	req.srq = id_priv->srq ? 1 : 0;
+
+	ret = ib_send_cm_req(cm_id, &req);
+	if (!ret)
+		goto free_pdata;
+
+destroy_cm_id:
+	ib_destroy_cm_id(cm_id);
+	id_priv->cm_id.ib = NULL;
+free_pdata:
+	kfree(pdata);
+	return ret;
+}
+
+/*
+ * Start an iWARP connection for @id_priv.
+ *
+ * Creates an iw_cm_id (stored in id_priv->cm_id.iw), copies the id's source
+ * and destination addresses into it, calls cma_modify_qp_rtr() and then
+ * iw_cm_connect().  @conn_param may be NULL, in which case zeroed
+ * connection parameters with id_priv->qp_num are used.
+ *
+ * Returns 0-or-error from iw_cm_connect(), or the error of an earlier step.
+ * On any failure after creation the iw_cm_id is destroyed again.
+ */
+static int cma_connect_iw(struct rdma_id_private *id_priv,
+ struct rdma_conn_param *conn_param)
+{
+ struct iw_cm_id *cm_id;
+ int ret;
+ struct iw_cm_conn_param iw_param;
+
+ cm_id = iw_create_cm_id(id_priv->id.device, cma_iw_handler, id_priv);
+ if (IS_ERR(cm_id))
+ return PTR_ERR(cm_id);
+
+ id_priv->cm_id.iw = cm_id;
+
+ memcpy(&cm_id->local_addr, cma_src_addr(id_priv),
+ rdma_addr_size(cma_src_addr(id_priv)));
+ memcpy(&cm_id->remote_addr, cma_dst_addr(id_priv),
+ rdma_addr_size(cma_dst_addr(id_priv)));
+
+ ret = cma_modify_qp_rtr(id_priv, conn_param);
+ if (ret)
+ goto out;
+
+ if (conn_param) {
+ /*
+ * NOTE(review): iw_param is not zeroed on this path; only the five
+ * members assigned here are initialised - confirm the structure
+ * has no others.
+ */
+ iw_param.ord = conn_param->initiator_depth;
+ iw_param.ird = conn_param->responder_resources;
+ iw_param.private_data = conn_param->private_data;
+ iw_param.private_data_len = conn_param->private_data_len;
+ /* Use the id's own QP number when a QP is attached to the id. */
+ iw_param.qpn = id_priv->id.qp ? id_priv->qp_num : conn_param->qp_num;
+ } else {
+ memset(&iw_param, 0, sizeof iw_param);
+ iw_param.qpn = id_priv->qp_num;
+ }
+ ret = iw_cm_connect(cm_id, &iw_param);
+out:
+ if (ret) {
+ iw_destroy_cm_id(cm_id);
+ id_priv->cm_id.iw = NULL;
+ }
+ return ret;
+}
+
+/*
+ * rdma_connect - initiate an active connection request on @id.
+ *
+ * The id must be in RDMA_CM_ROUTE_RESOLVED state; it is moved to
+ * RDMA_CM_CONNECT for the duration of the attempt and moved back if the
+ * transport-specific connect helper fails.  When no QP is attached to the
+ * id, the QP number and SRQ flag are taken from @conn_param.
+ *
+ * Returns 0 on success, -EINVAL if the id is in the wrong state, -ENOSYS
+ * for an unsupported transport, or the error of the connect helper.
+ */
+int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
+{
+	struct rdma_id_private *id_priv =
+		container_of(id, struct rdma_id_private, id);
+	int ret;
+
+	if (!cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_CONNECT))
+		return -EINVAL;
+
+	if (!id->qp) {
+		id_priv->qp_num = conn_param->qp_num;
+		id_priv->srq = conn_param->srq;
+	}
+
+	switch (rdma_node_get_transport(id->device->node_type)) {
+	case RDMA_TRANSPORT_IB:
+		/* UD ids use the SIDR path, everything else a CM REQ. */
+		ret = (id->qp_type == IB_QPT_UD) ?
+			cma_resolve_ib_udp(id_priv, conn_param) :
+			cma_connect_ib(id_priv, conn_param);
+		break;
+	case RDMA_TRANSPORT_IWARP:
+		ret = cma_connect_iw(id_priv, conn_param);
+		break;
+	default:
+		ret = -ENOSYS;
+		break;
+	}
+
+	if (!ret)
+		return 0;
+
+	/* Undo the state transition made above. */
+	cma_comp_exch(id_priv, RDMA_CM_CONNECT, RDMA_CM_ROUTE_RESOLVED);
+	return ret;
+}
+EXPORT_SYMBOL(rdma_connect);
+
+/*
+ * Accept an IB connection: run cma_modify_qp_rtr() and
+ * cma_modify_qp_rts(), then send a CM REP built from @conn_param.
+ *
+ * Returns the first non-zero result of the two QP helpers, otherwise the
+ * result of ib_send_cm_rep().
+ */
+static int cma_accept_ib(struct rdma_id_private *id_priv,
+			 struct rdma_conn_param *conn_param)
+{
+	struct ib_cm_rep_param rep;
+	int err;
+
+	err = cma_modify_qp_rtr(id_priv, conn_param);
+	if (err)
+		return err;
+
+	err = cma_modify_qp_rts(id_priv, conn_param);
+	if (err)
+		return err;
+
+	memset(&rep, 0, sizeof rep);
+
+	/* Values owned by the id. */
+	rep.qp_num = id_priv->qp_num;
+	rep.starting_psn = id_priv->seq_num;
+	rep.srq = id_priv->srq ? 1 : 0;
+	rep.failover_accepted = 0;
+
+	/* Values supplied by the caller; the RNR retry count is capped at 7. */
+	rep.private_data = conn_param->private_data;
+	rep.private_data_len = conn_param->private_data_len;
+	rep.responder_resources = conn_param->responder_resources;
+	rep.initiator_depth = conn_param->initiator_depth;
+	rep.flow_control = conn_param->flow_control;
+	rep.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count);
+
+	return ib_send_cm_rep(id_priv->cm_id.ib, &rep);
+}
+
+/*
+ * Accept an iWARP connection request on @id_priv.
+ *
+ * Calls cma_modify_qp_rtr() and then iw_cm_accept() with parameters taken
+ * from @conn_param.  The QP number comes from the id when a QP is attached
+ * to it, otherwise from @conn_param.
+ *
+ * rdma_accept() forwards its conn_param unchecked on the iWARP path and
+ * that pointer may be NULL; every field below is read from it, so a NULL
+ * @conn_param is rejected with -EINVAL instead of being dereferenced.
+ *
+ * Returns -EINVAL for a NULL @conn_param, the error from
+ * cma_modify_qp_rtr(), or the result of iw_cm_accept().
+ */
+static int cma_accept_iw(struct rdma_id_private *id_priv,
+			 struct rdma_conn_param *conn_param)
+{
+	struct iw_cm_conn_param iw_param;
+	int ret;
+
+	if (!conn_param)
+		return -EINVAL;
+
+	ret = cma_modify_qp_rtr(id_priv, conn_param);
+	if (ret)
+		return ret;
+
+	iw_param.ord = conn_param->initiator_depth;
+	iw_param.ird = conn_param->responder_resources;
+	iw_param.private_data = conn_param->private_data;
+	iw_param.private_data_len = conn_param->private_data_len;
+	if (id_priv->id.qp)
+		iw_param.qpn = id_priv->qp_num;
+	else
+		iw_param.qpn = conn_param->qp_num;
+
+	return iw_cm_accept(id_priv->cm_id.iw, &iw_param);
+}
+
+/*
+ * Send a SIDR reply with the given @status and private data.
+ *
+ * For IB_SIDR_SUCCESS the id's qkey is first set via cma_set_qkey() and the
+ * reply carries the id's QP number and qkey; for any other status those
+ * fields stay zero.
+ *
+ * Returns the error from cma_set_qkey(), or the result of
+ * ib_send_cm_sidr_rep().
+ */
+static int cma_send_sidr_rep(struct rdma_id_private *id_priv,
+			     enum ib_cm_sidr_status status, u32 qkey,
+			     const void *private_data, int private_data_len)
+{
+	struct ib_cm_sidr_rep_param rep;
+
+	memset(&rep, 0, sizeof rep);
+	rep.status = status;
+	rep.private_data = private_data;
+	rep.private_data_len = private_data_len;
+
+	if (status == IB_SIDR_SUCCESS) {
+		int err = cma_set_qkey(id_priv, qkey);
+
+		if (err)
+			return err;
+		rep.qp_num = id_priv->qp_num;
+		rep.qkey = id_priv->qkey;
+	}
+
+	return ib_send_cm_sidr_rep(id_priv->cm_id.ib, &rep);
+}
+
+/*
+ * rdma_accept - accept a connection request on @id.
+ *
+ * Records the calling task's pid as owner, requires the id to be in
+ * RDMA_CM_CONNECT state and dispatches on the device transport.
+ * @conn_param may be NULL: on the IB transport a UD id then replies with an
+ * empty SIDR success, and a connected id calls cma_rep_recv() instead of
+ * cma_accept_ib().
+ *
+ * Returns 0 on success or -EINVAL if the id is in the wrong state.  If the
+ * transport step fails, the QP is moved to the error state, a reject is
+ * sent and the error is returned.
+ */
+int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
+{
+	struct rdma_id_private *id_priv =
+		container_of(id, struct rdma_id_private, id);
+	int ret;
+
+	id_priv->owner = task_pid_nr(current);
+
+	if (!cma_comp(id_priv, RDMA_CM_CONNECT))
+		return -EINVAL;
+
+	/* Without an attached QP the caller describes the QP to use. */
+	if (!id->qp && conn_param) {
+		id_priv->qp_num = conn_param->qp_num;
+		id_priv->srq = conn_param->srq;
+	}
+
+	switch (rdma_node_get_transport(id->device->node_type)) {
+	case RDMA_TRANSPORT_IB:
+		if (id->qp_type != IB_QPT_UD)
+			ret = conn_param ? cma_accept_ib(id_priv, conn_param) :
+					   cma_rep_recv(id_priv);
+		else if (conn_param)
+			ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS,
+						conn_param->qkey,
+						conn_param->private_data,
+						conn_param->private_data_len);
+		else
+			ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS,
+						0, NULL, 0);
+		break;
+	case RDMA_TRANSPORT_IWARP:
+		ret = cma_accept_iw(id_priv, conn_param);
+		break;
+	default:
+		ret = -ENOSYS;
+		break;
+	}
+
+	if (!ret)
+		return 0;
+
+	/* Accept failed: error the QP and tell the peer. */
+	cma_modify_qp_err(id_priv);
+	rdma_reject(id, NULL, 0);
+	return ret;
+}
+EXPORT_SYMBOL(rdma_accept);
+
+/*
+ * rdma_notify - forward an asynchronous @event to the IB CM.
+ *
+ * Only devices whose node type is RDMA_NODE_IB_CA are notified through
+ * ib_cm_notify(); for every other node type this is a successful no-op.
+ *
+ * Returns -EINVAL when the id has no CM id, 0 for non-CA nodes, otherwise
+ * the result of ib_cm_notify().
+ */
+int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event)
+{
+	struct rdma_id_private *id_priv =
+		container_of(id, struct rdma_id_private, id);
+
+	if (!id_priv->cm_id.ib)
+		return -EINVAL;
+
+	if (id->device->node_type != RDMA_NODE_IB_CA)
+		return 0;
+
+	return ib_cm_notify(id_priv->cm_id.ib, event);
+}
+EXPORT_SYMBOL(rdma_notify);
+
+/*
+ * rdma_reject - reject a connection or SIDR request on @id.
+ *
+ * On the IB transport a UD id answers with an IB_SIDR_REJECT reply and any
+ * other QP type with a consumer-defined CM REJ; on iWARP iw_cm_reject() is
+ * used.  @private_data / @private_data_len are passed along unchanged.
+ *
+ * Returns -EINVAL when the id has no CM id, -ENOSYS for an unsupported
+ * transport, otherwise the result of the transport call.
+ */
+int rdma_reject(struct rdma_cm_id *id, const void *private_data,
+		u8 private_data_len)
+{
+	struct rdma_id_private *id_priv =
+		container_of(id, struct rdma_id_private, id);
+
+	if (!id_priv->cm_id.ib)
+		return -EINVAL;
+
+	switch (rdma_node_get_transport(id->device->node_type)) {
+	case RDMA_TRANSPORT_IB:
+		if (id->qp_type == IB_QPT_UD)
+			return cma_send_sidr_rep(id_priv, IB_SIDR_REJECT, 0,
+						 private_data,
+						 private_data_len);
+		return ib_send_cm_rej(id_priv->cm_id.ib,
+				      IB_CM_REJ_CONSUMER_DEFINED, NULL,
+				      0, private_data, private_data_len);
+	case RDMA_TRANSPORT_IWARP:
+		return iw_cm_reject(id_priv->cm_id.iw,
+				    private_data, private_data_len);
+	default:
+		return -ENOSYS;
+	}
+}
+EXPORT_SYMBOL(rdma_reject);
+
+/*
+ * rdma_disconnect - tear down the connection on @id.
+ *
+ * IB: the QP is moved to the error state first; if that succeeds a DREQ is
+ * sent, and if sending the DREQ fails a DREP is sent instead.  The result
+ * of the DREQ/DREP calls is not propagated - only the QP transition
+ * decides the return value.
+ * iWARP: delegates to iw_cm_disconnect().
+ *
+ * Returns -EINVAL when the id has no CM id or the transport is unknown.
+ */
+int rdma_disconnect(struct rdma_cm_id *id)
+{
+	struct rdma_id_private *id_priv =
+		container_of(id, struct rdma_id_private, id);
+	int ret;
+
+	if (!id_priv->cm_id.ib)
+		return -EINVAL;
+
+	switch (rdma_node_get_transport(id->device->node_type)) {
+	case RDMA_TRANSPORT_IB:
+		ret = cma_modify_qp_err(id_priv);
+		/* Initiate or respond to a disconnect. */
+		if (!ret && ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0))
+			ib_send_cm_drep(id_priv->cm_id.ib, NULL, 0);
+		break;
+	case RDMA_TRANSPORT_IWARP:
+		ret = iw_cm_disconnect(id_priv->cm_id.iw, 0);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(rdma_disconnect);
+
+/*
+ * Multicast join completion callback.  It is handed to
+ * ib_sa_join_multicast() and is also called directly by
+ * iboe_mcast_work_handler().
+ *
+ * When @status is 0 the id's qkey is set from the member record and, if a
+ * QP is attached to the id, that QP is attached to the group.  The outcome
+ * is delivered to the id's event handler as RDMA_CM_EVENT_MULTICAST_JOIN or
+ * RDMA_CM_EVENT_MULTICAST_ERROR, with mc->context as private data.  If the
+ * event handler returns non-zero the id is destroyed.
+ *
+ * Always returns 0.
+ */
+static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast)
+{
+ struct rdma_id_private *id_priv;
+ struct cma_multicast *mc = multicast->context;
+ struct rdma_cm_event event;
+ int ret;
+
+ id_priv = mc->id_priv;
+ /*
+ * Proceed only if the id is in ADDR_BOUND or ADDR_RESOLVED state.
+ * NOTE(review): cma_disable_callback() presumably returns non-zero on a
+ * state mismatch and otherwise returns with handler_mutex held (both
+ * exit paths below unlock it) - confirm against its definition.
+ */
+ if (cma_disable_callback(id_priv, RDMA_CM_ADDR_BOUND) &&
+ cma_disable_callback(id_priv, RDMA_CM_ADDR_RESOLVED))
+ return 0;
+
+ if (!status)
+ status = cma_set_qkey(id_priv, be32_to_cpu(multicast->rec.qkey));
+ /* qp_mutex is held around the check-and-attach of id.qp. */
+ mutex_lock(&id_priv->qp_mutex);
+ if (!status && id_priv->id.qp)
+ status = ib_attach_mcast(id_priv->id.qp, &multicast->rec.mgid,
+ be16_to_cpu(multicast->rec.mlid));
+ mutex_unlock(&id_priv->qp_mutex);
+
+ memset(&event, 0, sizeof event);
+ event.status = status;
+ event.param.ud.private_data = mc->context;
+ if (!status) {
+ event.event = RDMA_CM_EVENT_MULTICAST_JOIN;
+ ib_init_ah_from_mcmember(id_priv->id.device,
+ id_priv->id.port_num, &multicast->rec,
+ &event.param.ud.ah_attr);
+ event.param.ud.qp_num = 0xFFFFFF;
+ event.param.ud.qkey = be32_to_cpu(multicast->rec.qkey);
+ } else
+ event.event = RDMA_CM_EVENT_MULTICAST_ERROR;
+
+ ret = id_priv->id.event_handler(&id_priv->id, &event);
+ if (ret) {
+ /* The handler asked for destruction: mark, unlock, then destroy. */
+ cma_exch(id_priv, RDMA_CM_DESTROYING);
+ mutex_unlock(&id_priv->handler_mutex);
+ rdma_destroy_id(&id_priv->id);
+ return 0;
+ }
+
+ mutex_unlock(&id_priv->handler_mutex);
+ return 0;
+}
+
+/*
+ * Derive the multicast GID for @addr and store it in @mgid.
+ *
+ *  - wildcard address:            all-zero MGID
+ *  - IPv6 whose first word, masked with 0xFFF0FFFF, equals 0xFF10A01B:
+ *                                 copied verbatim (SA assigned MGID)
+ *  - AF_IB:                       sib_addr copied verbatim
+ *  - any other IPv6 / IPv4:       mapped with ipv6_ib_mc_map() /
+ *                                 ip_ib_mc_map() using the device broadcast
+ *                                 address; the MGID is taken from byte 4 of
+ *                                 the mapped address.  For RDMA_PS_UDP,
+ *                                 byte 7 is overwritten with 0x01.
+ */
+static void cma_set_mgid(struct rdma_id_private *id_priv,
+			 struct sockaddr *addr, union ib_gid *mgid)
+{
+	struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+	struct sockaddr_in6 *v6 = (struct sockaddr_in6 *) addr;
+	struct sockaddr_in *v4 = (struct sockaddr_in *) addr;
+	unsigned char mc_map[MAX_ADDR_LEN];
+
+	if (cma_any_addr(addr)) {
+		memset(mgid, 0, sizeof *mgid);
+		return;
+	}
+
+	if (addr->sa_family == AF_INET6 &&
+	    (be32_to_cpu(v6->sin6_addr.s6_addr32[0]) & 0xFFF0FFFF) ==
+	    0xFF10A01B) {
+		/* IPv6 address is an SA assigned MGID. */
+		memcpy(mgid, &v6->sin6_addr, sizeof *mgid);
+		return;
+	}
+
+	if (addr->sa_family == AF_IB) {
+		memcpy(mgid, &((struct sockaddr_ib *) addr)->sib_addr,
+		       sizeof *mgid);
+		return;
+	}
+
+	/* Plain IP multicast address: map it onto the link layer. */
+	if (addr->sa_family == AF_INET6)
+		ipv6_ib_mc_map(&v6->sin6_addr, dev_addr->broadcast, mc_map);
+	else
+		ip_ib_mc_map(v4->sin_addr.s_addr, dev_addr->broadcast, mc_map);
+
+	if (id_priv->id.ps == RDMA_PS_UDP)
+		mc_map[7] = 0x01;	/* Use RDMA CM signature */
+	*mgid = *(union ib_gid *) (mc_map + 4);
+}
+
+/*
+ * Join the multicast group for mc->addr on an InfiniBand link layer.
+ *
+ * The member record is seeded from ib_sa_get_mcmember_rec() for the MGID
+ * returned by ib_addr_get_mgid(); MGID, qkey, port GID, pkey and join
+ * state are then overwritten for the requested group before
+ * ib_sa_join_multicast() is called with cma_ib_mc_handler() as completion
+ * callback.  The returned handle is stored in mc->multicast.ib.
+ *
+ * Returns 0, the error from the lookup / cma_set_qkey(), or the error
+ * encoded in the handle returned by ib_sa_join_multicast().
+ */
+static int cma_join_ib_multicast(struct rdma_id_private *id_priv,
+ struct cma_multicast *mc)
+{
+ struct ib_sa_mcmember_rec rec;
+ struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+ ib_sa_comp_mask comp_mask;
+ int ret;
+
+ ib_addr_get_mgid(dev_addr, &rec.mgid);
+ ret = ib_sa_get_mcmember_rec(id_priv->id.device, id_priv->id.port_num,
+ &rec.mgid, &rec);
+ if (ret)
+ return ret;
+
+ ret = cma_set_qkey(id_priv, 0);
+ if (ret)
+ return ret;
+
+ /* Replace the fields that identify the group being joined. */
+ cma_set_mgid(id_priv, (struct sockaddr *) &mc->addr, &rec.mgid);
+ rec.qkey = cpu_to_be32(id_priv->qkey);
+ rdma_addr_get_sgid(dev_addr, &rec.port_gid);
+ rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr));
+ rec.join_state = 1;
+
+ comp_mask = IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID |
+ IB_SA_MCMEMBER_REC_PKEY | IB_SA_MCMEMBER_REC_JOIN_STATE |
+ IB_SA_MCMEMBER_REC_QKEY | IB_SA_MCMEMBER_REC_SL |
+ IB_SA_MCMEMBER_REC_FLOW_LABEL |
+ IB_SA_MCMEMBER_REC_TRAFFIC_CLASS;
+
+ /* The IPoIB port space additionally sets rate, MTU and hop limit bits. */
+ if (id_priv->id.ps == RDMA_PS_IPOIB)
+ comp_mask |= IB_SA_MCMEMBER_REC_RATE |
+ IB_SA_MCMEMBER_REC_RATE_SELECTOR |
+ IB_SA_MCMEMBER_REC_MTU_SELECTOR |
+ IB_SA_MCMEMBER_REC_MTU |
+ IB_SA_MCMEMBER_REC_HOP_LIMIT;
+
+ mc->multicast.ib = ib_sa_join_multicast(&sa_client, id_priv->id.device,
+ id_priv->id.port_num, &rec,
+ comp_mask, GFP_KERNEL,
+ cma_ib_mc_handler, mc);
+ return PTR_ERR_OR_ZERO(mc->multicast.ib);
+}
+
+/*
+ * Work item queued by cma_iboe_join_multicast(): reports a successful join
+ * by calling cma_ib_mc_handler() with status 0, then drops the mc
+ * reference taken when the work was queued and frees the work item.
+ */
+static void iboe_mcast_work_handler(struct work_struct *work)
+{
+	struct iboe_mcast_work *item =
+		container_of(work, struct iboe_mcast_work, work);
+	struct cma_multicast *mc = item->mc;
+	struct ib_sa_multicast *sa_mc = mc->multicast.ib;
+
+	/* cma_ib_mc_handler() finds the cma_multicast through ->context. */
+	sa_mc->context = mc;
+	cma_ib_mc_handler(0, sa_mc);
+
+	kref_put(&mc->mcref, release_mc);
+	kfree(item);
+}
+
+/*
+ * Build the multicast GID for @addr on an Ethernet link layer.
+ *
+ *  - wildcard address: all-zero MGID
+ *  - IPv6:             the IPv6 address itself
+ *  - otherwise (IPv4): ff0e:0000:0000:0000:0000:ffff:<IPv4 address>
+ */
+static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid)
+{
+	struct sockaddr_in6 *v6 = (struct sockaddr_in6 *)addr;
+	struct sockaddr_in *v4 = (struct sockaddr_in *)addr;
+
+	if (cma_any_addr(addr)) {
+		memset(mgid, 0, sizeof *mgid);
+		return;
+	}
+
+	if (addr->sa_family == AF_INET6) {
+		memcpy(mgid, &v6->sin6_addr, sizeof *mgid);
+		return;
+	}
+
+	/* Zero everything, then fill in the non-zero bytes. */
+	memset(mgid, 0, sizeof *mgid);
+	mgid->raw[0] = 0xff;
+	mgid->raw[1] = 0x0e;
+	mgid->raw[10] = 0xff;
+	mgid->raw[11] = 0xff;
+	memcpy(&mgid->raw[12], &v4->sin_addr.s_addr,
+	       sizeof v4->sin_addr.s_addr);
+}
+
+/*
+ * Join a multicast group on an Ethernet link layer.
+ *
+ * No SA query is issued here: a local ib_sa_multicast record is allocated
+ * and filled in (MGID, pkey, qkey for RDMA_PS_UDP, rate/MTU from the bound
+ * net device, port GID from the source address), and a work item is queued
+ * whose handler reports the join through cma_ib_mc_handler().
+ *
+ * Returns 0 once the work is queued, -EINVAL for a zero address or a zero
+ * MTU result, -ENOMEM on allocation failure, -ENODEV when no bound net
+ * device can be found.
+ */
+static int cma_iboe_join_multicast(struct rdma_id_private *id_priv,
+ struct cma_multicast *mc)
+{
+ struct iboe_mcast_work *work;
+ struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+ int err;
+ struct sockaddr *addr = (struct sockaddr *)&mc->addr;
+ struct net_device *ndev = NULL;
+
+ if (cma_zero_addr((struct sockaddr *)&mc->addr))
+ return -EINVAL;
+
+ work = kzalloc(sizeof *work, GFP_KERNEL);
+ if (!work)
+ return -ENOMEM;
+
+ mc->multicast.ib = kzalloc(sizeof(struct ib_sa_multicast), GFP_KERNEL);
+ if (!mc->multicast.ib) {
+ err = -ENOMEM;
+ goto out1;
+ }
+
+ cma_iboe_set_mgid(addr, &mc->multicast.ib->rec.mgid);
+
+ mc->multicast.ib->rec.pkey = cpu_to_be16(0xffff);
+ /* Other port spaces keep the zero qkey left by kzalloc(). */
+ if (id_priv->id.ps == RDMA_PS_UDP)
+ mc->multicast.ib->rec.qkey = cpu_to_be32(RDMA_UDP_QKEY);
+
+ /* ndev stays NULL when the id is not bound to an interface. */
+ if (dev_addr->bound_dev_if)
+ ndev = dev_get_by_index(&init_net, dev_addr->bound_dev_if);
+ if (!ndev) {
+ err = -ENODEV;
+ goto out2;
+ }
+ mc->multicast.ib->rec.rate = iboe_get_rate(ndev);
+ mc->multicast.ib->rec.hop_limit = 1;
+ mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->mtu);
+ /* The device is only needed for rate/MTU; release it before checking. */
+ dev_put(ndev);
+ if (!mc->multicast.ib->rec.mtu) {
+ err = -EINVAL;
+ goto out2;
+ }
+ rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr,
+ &mc->multicast.ib->rec.port_gid);
+ work->id = id_priv;
+ work->mc = mc;
+ INIT_WORK(&work->work, iboe_mcast_work_handler);
+ /* Reference for the queued work; iboe_mcast_work_handler() drops it. */
+ kref_get(&mc->mcref);
+ queue_work(cma_wq, &work->work);
+
+ return 0;
+
+out2:
+ kfree(mc->multicast.ib);
+out1:
+ kfree(work);
+ return err;
+}
+
+/*
+ * rdma_join_multicast - join the multicast group identified by @addr.
+ *
+ * @id:      id to join on; it must be bound to a device and be in
+ *           RDMA_CM_ADDR_BOUND or RDMA_CM_ADDR_RESOLVED state.
+ * @addr:    multicast address; rdma_addr_size(@addr) bytes are copied.
+ * @context: opaque value stored in the membership and handed back as
+ *           private data in the multicast join/error event.
+ *
+ * The membership is put on the id's mc_list before the link-layer specific
+ * join is started and is removed and freed again if that join fails.
+ *
+ * Returns 0 on success, -EINVAL for an id without a device, in the wrong
+ * state or on an unknown link layer, -ENOMEM on allocation failure,
+ * -ENOSYS for a non-IB transport, or the error of the join helper.
+ */
+int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
+			void *context)
+{
+	struct rdma_id_private *id_priv;
+	struct cma_multicast *mc;
+	int ret;
+
+	/*
+	 * The transport dispatch below dereferences id->device, so an id
+	 * with no device attached must be rejected here.
+	 */
+	if (!id->device)
+		return -EINVAL;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	if (!cma_comp(id_priv, RDMA_CM_ADDR_BOUND) &&
+	    !cma_comp(id_priv, RDMA_CM_ADDR_RESOLVED))
+		return -EINVAL;
+
+	mc = kmalloc(sizeof *mc, GFP_KERNEL);
+	if (!mc)
+		return -ENOMEM;
+
+	memcpy(&mc->addr, addr, rdma_addr_size(addr));
+	mc->context = context;
+	mc->id_priv = id_priv;
+
+	/*
+	 * Use the _irq variant like the removal below and in
+	 * rdma_leave_multicast(), so the lock is taken the same way
+	 * everywhere.  This context may sleep (GFP_KERNEL above), so
+	 * interrupts are enabled on entry.
+	 */
+	spin_lock_irq(&id_priv->lock);
+	list_add(&mc->list, &id_priv->mc_list);
+	spin_unlock_irq(&id_priv->lock);
+
+	switch (rdma_node_get_transport(id->device->node_type)) {
+	case RDMA_TRANSPORT_IB:
+		switch (rdma_port_get_link_layer(id->device, id->port_num)) {
+		case IB_LINK_LAYER_INFINIBAND:
+			ret = cma_join_ib_multicast(id_priv, mc);
+			break;
+		case IB_LINK_LAYER_ETHERNET:
+			/* Only the Ethernet path reference-counts the mc. */
+			kref_init(&mc->mcref);
+			ret = cma_iboe_join_multicast(id_priv, mc);
+			break;
+		default:
+			ret = -EINVAL;
+			break;
+		}
+		break;
+	default:
+		ret = -ENOSYS;
+		break;
+	}
+
+	if (ret) {
+		spin_lock_irq(&id_priv->lock);
+		list_del(&mc->list);
+		spin_unlock_irq(&id_priv->lock);
+		kfree(mc);
+	}
+	return ret;
+}
+EXPORT_SYMBOL(rdma_join_multicast);
+
+/*
+ * rdma_leave_multicast - leave the multicast group identified by @addr.
+ *
+ * Looks up the first membership on the id's mc_list whose stored address
+ * matches @addr byte for byte (over rdma_addr_size(@addr) bytes), unlinks
+ * it, detaches the id's QP from the group if a QP is attached, and
+ * releases the membership according to the port's link layer.  Does
+ * nothing if no membership matches.
+ */
+void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr)
+{
+ struct rdma_id_private *id_priv;
+ struct cma_multicast *mc;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ spin_lock_irq(&id_priv->lock);
+ list_for_each_entry(mc, &id_priv->mc_list, list) {
+ if (!memcmp(&mc->addr, addr, rdma_addr_size(addr))) {
+ /*
+ * Unlink under the lock, then drop it before the detach/free
+ * calls.  The walk is not resumed: this branch returns.
+ */
+ list_del(&mc->list);
+ spin_unlock_irq(&id_priv->lock);
+
+ if (id->qp)
+ ib_detach_mcast(id->qp,
+ &mc->multicast.ib->rec.mgid,
+ be16_to_cpu(mc->multicast.ib->rec.mlid));
+ if (rdma_node_get_transport(id_priv->cma_dev->device->node_type) == RDMA_TRANSPORT_IB) {
+ switch (rdma_port_get_link_layer(id->device, id->port_num)) {
+ case IB_LINK_LAYER_INFINIBAND:
+ /* SA-backed membership: free the SA handle, then the mc. */
+ ib_sa_free_multicast(mc->multicast.ib);
+ kfree(mc);
+ break;
+ case IB_LINK_LAYER_ETHERNET:
+ /* Reference counted; release_mc() runs on the last put. */
+ kref_put(&mc->mcref, release_mc);
+ break;
+ default:
+ break;
+ }
+ }
+ return;
+ }
+ }
+ spin_unlock_irq(&id_priv->lock);
+}
+EXPORT_SYMBOL(rdma_leave_multicast);
+
+/*
+ * Queue an RDMA_CM_EVENT_ADDR_CHANGE work item for @id_priv if the id is
+ * bound to @ndev and the hardware address recorded in the id differs from
+ * the device's current one.
+ *
+ * Returns 0 when nothing had to be done or the work was queued, -ENOMEM if
+ * the work item could not be allocated.
+ */
+static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id_priv)
+{
+	struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+	struct cma_ndev_work *item;
+
+	/* Not our device, or the address is unchanged: nothing to report. */
+	if (dev_addr->bound_dev_if != ndev->ifindex ||
+	    !memcmp(dev_addr->src_dev_addr, ndev->dev_addr, ndev->addr_len))
+		return 0;
+
+	printk(KERN_INFO "RDMA CM addr change for ndev %s used by id %p\n",
+	       ndev->name, &id_priv->id);
+
+	item = kzalloc(sizeof *item, GFP_KERNEL);
+	if (!item)
+		return -ENOMEM;
+
+	INIT_WORK(&item->work, cma_ndev_work_handler);
+	item->id = id_priv;
+	item->event.event = RDMA_CM_EVENT_ADDR_CHANGE;
+	/* Reference taken on the id for the queued work item. */
+	atomic_inc(&id_priv->refcount);
+	queue_work(cma_wq, &item->work);
+
+	return 0;
+}
+
+/*
+ * Netdevice notifier callback.  Only NETDEV_BONDING_FAILOVER events for
+ * bonding master devices in init_net are handled; for those, every id on
+ * every known cma device is passed to cma_netdev_change().
+ *
+ * Returns NOTIFY_DONE for ignored events, otherwise the result of the last
+ * cma_netdev_change() call (the walk stops at the first non-zero result).
+ */
+static int cma_netdev_callback(struct notifier_block *self, unsigned long event,
+			       void *ptr)
+{
+	struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
+	struct rdma_id_private *id_priv;
+	struct cma_device *cma_dev;
+	int ret = NOTIFY_DONE;
+
+	if (dev_net(ndev) != &init_net ||
+	    event != NETDEV_BONDING_FAILOVER ||
+	    !(ndev->flags & IFF_MASTER) ||
+	    !(ndev->priv_flags & IFF_BONDING))
+		return NOTIFY_DONE;
+
+	mutex_lock(&lock);
+	list_for_each_entry(cma_dev, &dev_list, list) {
+		list_for_each_entry(id_priv, &cma_dev->id_list, list) {
+			ret = cma_netdev_change(ndev, id_priv);
+			if (ret)
+				goto unlock;
+		}
+	}
+
+unlock:
+	mutex_unlock(&lock);
+	return ret;
+}
+
+/* Netdevice notifier; registered in cma_init(), removed in cma_cleanup(). */
+static struct notifier_block cma_nb = {
+ .notifier_call = cma_netdev_callback
+};
+
+/*
+ * IB client "add" callback: allocate the per-device cma_device, publish it
+ * as client data of @device, append it to dev_list and start listening on
+ * it for every id on listen_any_list.  Silently does nothing if the
+ * allocation fails.
+ */
+static void cma_add_one(struct ib_device *device)
+{
+	struct rdma_id_private *listener;
+	struct cma_device *cma_dev;
+
+	cma_dev = kmalloc(sizeof *cma_dev, GFP_KERNEL);
+	if (!cma_dev)
+		return;
+
+	/* kmalloc() does not zero: initialise every member used later. */
+	cma_dev->device = device;
+	INIT_LIST_HEAD(&cma_dev->id_list);
+	atomic_set(&cma_dev->refcount, 1);
+	init_completion(&cma_dev->comp);
+	ib_set_client_data(device, &cma_client, cma_dev);
+
+	mutex_lock(&lock);
+	list_add_tail(&cma_dev->list, &dev_list);
+	list_for_each_entry(listener, &listen_any_list, list)
+		cma_listen_on_dev(listener, cma_dev);
+	mutex_unlock(&lock);
+}
+
+/*
+ * Tell the owner of @id_priv that its device is going away.
+ *
+ * The id is switched to RDMA_CM_DEVICE_REMOVAL, any operation pending in
+ * its previous state is cancelled, and an RDMA_CM_EVENT_DEVICE_REMOVAL
+ * event is delivered to the id's event handler under handler_mutex.
+ *
+ * Returns the event handler's return value, or 0 when no event was
+ * delivered (the id was already being destroyed, or its state changed
+ * again before the event could be sent).  The caller destroys the id when
+ * the result is non-zero.
+ */
+static int cma_remove_id_dev(struct rdma_id_private *id_priv)
+{
+ struct rdma_cm_event event;
+ enum rdma_cm_state state;
+ int ret = 0;
+
+ /* Record that we want to remove the device */
+ state = cma_exch(id_priv, RDMA_CM_DEVICE_REMOVAL);
+ if (state == RDMA_CM_DESTROYING)
+ return 0;
+
+ /* @state is the state the id was in before the exchange above. */
+ cma_cancel_operation(id_priv, state);
+ mutex_lock(&id_priv->handler_mutex);
+
+ /* Check for destruction from another callback. */
+ if (!cma_comp(id_priv, RDMA_CM_DEVICE_REMOVAL))
+ goto out;
+
+ memset(&event, 0, sizeof event);
+ event.event = RDMA_CM_EVENT_DEVICE_REMOVAL;
+ ret = id_priv->id.event_handler(&id_priv->id, &event);
+out:
+ mutex_unlock(&id_priv->handler_mutex);
+ return ret;
+}
+
+/*
+ * Detach every id from @cma_dev and wait until the device is unused.
+ *
+ * Ids are taken off the device's id_list one at a time.  Internal ids are
+ * destroyed directly; for all others cma_remove_id_dev() notifies the
+ * owner, and the id is destroyed here only if that returns non-zero.
+ * Finally the reference on @cma_dev is dropped and the function blocks on
+ * cma_dev->comp.
+ */
+static void cma_process_remove(struct cma_device *cma_dev)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ mutex_lock(&lock);
+ while (!list_empty(&cma_dev->id_list)) {
+ id_priv = list_entry(cma_dev->id_list.next,
+ struct rdma_id_private, list);
+
+ list_del(&id_priv->listen_list);
+ list_del_init(&id_priv->list);
+ /* Hold a reference on the id while the global lock is dropped. */
+ atomic_inc(&id_priv->refcount);
+ mutex_unlock(&lock);
+
+ /* Called without the lock: this runs the id's event handler. */
+ ret = id_priv->internal_id ? 1 : cma_remove_id_dev(id_priv);
+ cma_deref_id(id_priv);
+ if (ret)
+ rdma_destroy_id(&id_priv->id);
+
+ mutex_lock(&lock);
+ }
+ mutex_unlock(&lock);
+
+ /*
+ * Drop the reference on the device and wait on its completion -
+ * presumably signalled once the last reference is gone (TODO confirm
+ * in cma_deref_dev()).
+ */
+ cma_deref_dev(cma_dev);
+ wait_for_completion(&cma_dev->comp);
+}
+
+/*
+ * IB client "remove" callback: unlink the cma_device belonging to @device
+ * from dev_list, tear down all of its ids and free it.  Does nothing when
+ * no cma_device is registered as client data for @device.
+ */
+static void cma_remove_one(struct ib_device *device)
+{
+	struct cma_device *cma_dev = ib_get_client_data(device, &cma_client);
+
+	if (!cma_dev)
+		return;
+
+	mutex_lock(&lock);
+	list_del(&cma_dev->list);
+	mutex_unlock(&lock);
+
+	/* Returns only after waiting on the device's completion. */
+	cma_process_remove(cma_dev);
+	kfree(cma_dev);
+}
+
+/*
+ * Netlink dump callback for RDMA_NL_RDMA_CM_ID_STATS.
+ *
+ * Emits one message per id, walking every device on dev_list and every id
+ * on the device's id_list.  cb->args[0] / cb->args[1] hold the device and
+ * id index at which to resume, so the dump can continue in a later call
+ * once @skb is full.
+ *
+ * Returns skb->len.
+ */
+static int cma_get_id_stats(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct nlmsghdr *nlh;
+ struct rdma_cm_id_stats *id_stats;
+ struct rdma_id_private *id_priv;
+ struct rdma_cm_id *id = NULL;
+ struct cma_device *cma_dev;
+ int i_dev = 0, i_id = 0;
+
+ /*
+ * We export all of the IDs as a sequence of messages. Each
+ * ID gets its own netlink message.
+ */
+ mutex_lock(&lock);
+
+ list_for_each_entry(cma_dev, &dev_list, list) {
+ /* Skip devices that were fully dumped by an earlier call. */
+ if (i_dev < cb->args[0]) {
+ i_dev++;
+ continue;
+ }
+
+ i_id = 0;
+ list_for_each_entry(id_priv, &cma_dev->id_list, list) {
+ /* Skip ids of this device that were already dumped. */
+ if (i_id < cb->args[1]) {
+ i_id++;
+ continue;
+ }
+
+ id_stats = ibnl_put_msg(skb, &nlh, cb->nlh->nlmsg_seq,
+ sizeof *id_stats, RDMA_NL_RDMA_CM,
+ RDMA_NL_RDMA_CM_ID_STATS,
+ NLM_F_MULTI);
+ /* NULL result: stop here and resume from this id next time. */
+ if (!id_stats)
+ goto out;
+
+ memset(id_stats, 0, sizeof *id_stats);
+ id = &id_priv->id;
+ id_stats->node_type = id->route.addr.dev_addr.dev_type;
+ id_stats->port_num = id->port_num;
+ id_stats->bound_dev_if =
+ id->route.addr.dev_addr.bound_dev_if;
+
+ if (ibnl_put_attr(skb, nlh,
+ rdma_addr_size(cma_src_addr(id_priv)),
+ cma_src_addr(id_priv),
+ RDMA_NL_RDMA_CM_ATTR_SRC_ADDR))
+ goto out;
+ /*
+ * NOTE(review): the length passed for the destination attribute
+ * is computed from the *source* address - confirm this is
+ * intended rather than rdma_addr_size(cma_dst_addr(id_priv)).
+ */
+ if (ibnl_put_attr(skb, nlh,
+ rdma_addr_size(cma_src_addr(id_priv)),
+ cma_dst_addr(id_priv),
+ RDMA_NL_RDMA_CM_ATTR_DST_ADDR))
+ goto out;
+
+ id_stats->pid = id_priv->owner;
+ id_stats->port_space = id->ps;
+ id_stats->cm_state = id_priv->state;
+ id_stats->qp_num = id_priv->qp_num;
+ id_stats->qp_type = id->qp_type;
+
+ i_id++;
+ }
+
+ /* The id offset applies only to the first device visited. */
+ cb->args[1] = 0;
+ i_dev++;
+ }
+
+out:
+ mutex_unlock(&lock);
+ /* Remember where to resume on the next invocation. */
+ cb->args[0] = i_dev;
+ cb->args[1] = i_id;
+
+ return skb->len;
+}
+
+/* Netlink operation table handed to ibnl_add_client() in cma_init(). */
+static const struct ibnl_client_cbs cma_cb_table[] = {
+ [RDMA_NL_RDMA_CM_ID_STATS] = { .dump = cma_get_id_stats,
+ .module = THIS_MODULE },
+};
+
+/*
+ * Module init: create the "rdma_cm" workqueue, register the SA client,
+ * the address client, the netdevice notifier, the IB client and the
+ * netlink callbacks.
+ *
+ * A failure to add the netlink client is only logged.  Returns 0 on
+ * success, -ENOMEM if the workqueue cannot be created, or the error from
+ * ib_register_client() after undoing the earlier registrations.
+ */
+static int __init cma_init(void)
+{
+	int ret;
+
+	cma_wq = create_singlethread_workqueue("rdma_cm");
+	if (!cma_wq)
+		return -ENOMEM;
+
+	ib_sa_register_client(&sa_client);
+	rdma_addr_register_client(&addr_client);
+	register_netdevice_notifier(&cma_nb);
+
+	ret = ib_register_client(&cma_client);
+	if (!ret) {
+		if (ibnl_add_client(RDMA_NL_RDMA_CM, RDMA_NL_RDMA_CM_NUM_OPS,
+				    cma_cb_table))
+			printk(KERN_WARNING "RDMA CMA: failed to add netlink callback\n");
+		return 0;
+	}
+
+	/* Unwind in reverse order of registration. */
+	unregister_netdevice_notifier(&cma_nb);
+	rdma_addr_unregister_client(&addr_client);
+	ib_sa_unregister_client(&sa_client);
+	destroy_workqueue(cma_wq);
+	return ret;
+}
+
+/*
+ * Module exit: undo everything cma_init() registered, in reverse order,
+ * then destroy the workqueue and the four port-space IDRs.
+ */
+static void __exit cma_cleanup(void)
+{
+ ibnl_remove_client(RDMA_NL_RDMA_CM);
+ ib_unregister_client(&cma_client);
+ unregister_netdevice_notifier(&cma_nb);
+ rdma_addr_unregister_client(&addr_client);
+ ib_sa_unregister_client(&sa_client);
+ destroy_workqueue(cma_wq);
+ idr_destroy(&tcp_ps);
+ idr_destroy(&udp_ps);
+ idr_destroy(&ipoib_ps);
+ idr_destroy(&ib_ps);
+}
+
+/* Module entry and exit points. */
+module_init(cma_init);
+module_exit(cma_cleanup);
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
new file mode 100644
index 000000000..87d1936f5
--- /dev/null
+++ b/drivers/infiniband/core/core_priv.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _CORE_PRIV_H
+#define _CORE_PRIV_H
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
+#include <rdma/ib_verbs.h>
+
+/*
+ * sysfs registration for a device.  @port_callback receives the device, a
+ * u8 and a kobject - presumably the port number and that port's kobject;
+ * confirm against the implementation.
+ */
+int ib_device_register_sysfs(struct ib_device *device,
+ int (*port_callback)(struct ib_device *,
+ u8, struct kobject *));
+void ib_device_unregister_sysfs(struct ib_device *device);
+
+/* Setup/cleanup pair for the sysfs support. */
+int ib_sysfs_setup(void);
+void ib_sysfs_cleanup(void);
+
+/* Setup/cleanup pair for the cache. */
+int ib_cache_setup(void);
+void ib_cache_cleanup(void);
+
+/*
+ * Takes a QP, its attributes and a pointer to the attribute mask; judging
+ * by the name it resolves Ethernet L2 attributes and may update the mask
+ * (TODO confirm against the implementation).
+ */
+int ib_resolve_eth_l2_attrs(struct ib_qp *qp,
+ struct ib_qp_attr *qp_attr, int *qp_attr_mask);
+#endif /* _CORE_PRIV_H */
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
new file mode 100644
index 000000000..18c1ece76
--- /dev/null
+++ b/drivers/infiniband/core/device.c
@@ -0,0 +1,785 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <rdma/rdma_netlink.h>
+
+#include "core_priv.h"
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("core kernel InfiniBand API");
+MODULE_LICENSE("Dual BSD/GPL");
+
+/*
+ * Per-(device, client) context record, created by add_client_context()
+ * and linked on the device's client_data_list. @data starts out NULL
+ * and holds the opaque pointer stored by ib_set_client_data() and
+ * returned by ib_get_client_data().
+ */
+struct ib_client_data {
+ struct list_head list;
+ struct ib_client *client;
+ void * data;
+};
+
+/*
+ * Workqueue named "infiniband"; allocated in ib_core_init() and
+ * destroyed in ib_core_cleanup(). Exported GPL-only.
+ */
+struct workqueue_struct *ib_wq;
+EXPORT_SYMBOL_GPL(ib_wq);
+
+/* Registered devices, linked through ib_device.core_list. */
+static LIST_HEAD(device_list);
+/* Registered clients, linked through ib_client.list. */
+static LIST_HEAD(client_list);
+
+/*
+ * device_mutex protects access to both device_list and client_list.
+ * There's no real point to using multiple locks or something fancier
+ * like an rwsem: we always access both lists, and we're always
+ * modifying one list or the other list. In any case this is not a
+ * hot path so there's no point in trying to optimize.
+ */
+static DEFINE_MUTEX(device_mutex);
+
+/*
+ * ib_device_check_mandatory - verify that a driver filled in every
+ * method listed in the mandatory table
+ * @device: device about to be registered
+ *
+ * Each table entry records the offset of a function pointer inside
+ * struct ib_device together with its name. The first pointer found to
+ * be NULL is reported with a warning.
+ *
+ * Returns 0 when all listed methods are set, -EINVAL otherwise.
+ */
+static int ib_device_check_mandatory(struct ib_device *device)
+{
+#define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device, x), #x }
+	static const struct {
+		size_t offset;
+		const char *name;	/* points at a string literal */
+	} mandatory_table[] = {
+		IB_MANDATORY_FUNC(query_device),
+		IB_MANDATORY_FUNC(query_port),
+		IB_MANDATORY_FUNC(query_pkey),
+		IB_MANDATORY_FUNC(query_gid),
+		IB_MANDATORY_FUNC(alloc_pd),
+		IB_MANDATORY_FUNC(dealloc_pd),
+		IB_MANDATORY_FUNC(create_ah),
+		IB_MANDATORY_FUNC(destroy_ah),
+		IB_MANDATORY_FUNC(create_qp),
+		IB_MANDATORY_FUNC(modify_qp),
+		IB_MANDATORY_FUNC(destroy_qp),
+		IB_MANDATORY_FUNC(post_send),
+		IB_MANDATORY_FUNC(post_recv),
+		IB_MANDATORY_FUNC(create_cq),
+		IB_MANDATORY_FUNC(destroy_cq),
+		IB_MANDATORY_FUNC(poll_cq),
+		IB_MANDATORY_FUNC(req_notify_cq),
+		IB_MANDATORY_FUNC(get_dma_mr),
+		IB_MANDATORY_FUNC(dereg_mr)
+	};
+#undef IB_MANDATORY_FUNC	/* only meaningful for the table above */
+	size_t i;
+
+	for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) {
+		/* Read the function pointer stored at the recorded offset. */
+		if (!*(void **) ((void *) device + mandatory_table[i].offset)) {
+			printk(KERN_WARNING "Device %s is missing mandatory function %s\n",
+			       device->name, mandatory_table[i].name);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Look up a registered device by name.
+ *
+ * Walks device_list and returns the first entry whose name equals @name
+ * within the first IB_DEVICE_NAME_MAX characters, or NULL if there is
+ * no such entry.
+ */
+static struct ib_device *__ib_device_get_by_name(const char *name)
+{
+	struct ib_device *cur;
+
+	list_for_each_entry(cur, &device_list, core_list) {
+		if (strncmp(name, cur->name, IB_DEVICE_NAME_MAX) == 0)
+			return cur;
+	}
+
+	return NULL;
+}
+
+
+/*
+ * alloc_name - pick the lowest unused index for a templated device name
+ * @name: on entry a format template taking one int (ib_register_device()
+ * only calls this when the name contains '%'); on success it is
+ * overwritten with the formatted name.
+ *
+ * A zeroed page is used as a bitmap of indices already taken by
+ * registered devices whose names were produced from the same template.
+ *
+ * Returns 0 on success, -ENOMEM if the bitmap page cannot be allocated,
+ * or -ENFILE if the generated name is already registered.
+ */
+static int alloc_name(char *name)
+{
+ unsigned long *inuse;
+ char buf[IB_DEVICE_NAME_MAX];
+ struct ib_device *device;
+ int i;
+
+ inuse = (unsigned long *) get_zeroed_page(GFP_KERNEL);
+ if (!inuse)
+ return -ENOMEM;
+
+ list_for_each_entry(device, &device_list, core_list) {
+ /* Try to parse an index out of this device's name. */
+ if (!sscanf(device->name, name, &i))
+ continue;
+ /* Ignore indices that do not fit in the one-page bitmap. */
+ if (i < 0 || i >= PAGE_SIZE * 8)
+ continue;
+ /* Only count it if re-formatting reproduces the name exactly. */
+ snprintf(buf, sizeof buf, name, i);
+ if (!strncmp(buf, device->name, IB_DEVICE_NAME_MAX))
+ set_bit(i, inuse);
+ }
+
+ i = find_first_zero_bit(inuse, PAGE_SIZE * 8);
+ free_page((unsigned long) inuse);
+ snprintf(buf, sizeof buf, name, i);
+
+ /* Final check that the chosen name is not already registered. */
+ if (__ib_device_get_by_name(buf))
+ return -ENFILE;
+
+ strlcpy(name, buf, IB_DEVICE_NAME_MAX);
+ return 0;
+}
+
+/* First valid port number of @device: 0 for an IB switch, 1 otherwise. */
+static int start_port(struct ib_device *device)
+{
+	if (device->node_type == RDMA_NODE_IB_SWITCH)
+		return 0;
+
+	return 1;
+}
+
+
+/*
+ * Last valid port number of @device: 0 for an IB switch, otherwise the
+ * device's phys_port_cnt.
+ */
+static int end_port(struct ib_device *device)
+{
+	if (device->node_type == RDMA_NODE_IB_SWITCH)
+		return 0;
+
+	return device->phys_port_cnt;
+}
+
+/**
+ * ib_alloc_device - allocate an IB device struct
+ * @size: number of bytes to allocate; must be at least
+ *	sizeof(struct ib_device) and may include driver-private data
+ *	that follows it
+ *
+ * Low-level drivers obtain their &struct ib_device from this function.
+ * The memory is zero-filled. Anything allocated here must be released
+ * with ib_dealloc_device().
+ *
+ * Returns the new structure, or NULL if the allocation fails.
+ */
+struct ib_device *ib_alloc_device(size_t size)
+{
+	struct ib_device *device;
+
+	BUG_ON(size < sizeof(struct ib_device));
+
+	device = kzalloc(size, GFP_KERNEL);
+
+	return device;
+}
+EXPORT_SYMBOL(ib_alloc_device);
+
+/**
+ * ib_dealloc_device - free an IB device struct
+ * @device: structure obtained from ib_alloc_device()
+ *
+ * A device that is still in the IB_DEV_UNINITIALIZED state is freed
+ * directly. Otherwise it must be in the IB_DEV_UNREGISTERED state
+ * (anything else is a BUG) and the reference on its embedded kobject
+ * is dropped.
+ */
+void ib_dealloc_device(struct ib_device *device)
+{
+	if (device->reg_state != IB_DEV_UNINITIALIZED) {
+		BUG_ON(device->reg_state != IB_DEV_UNREGISTERED);
+		kobject_put(&device->dev.kobj);
+		return;
+	}
+
+	kfree(device);
+}
+EXPORT_SYMBOL(ib_dealloc_device);
+
+/*
+ * Allocate the (device, client) context record and link it on the
+ * device's client_data_list under client_data_lock. The stored data
+ * pointer starts out NULL.
+ *
+ * Returns 0 on success or -ENOMEM if the record cannot be allocated.
+ */
+static int add_client_context(struct ib_device *device, struct ib_client *client)
+{
+	struct ib_client_data *ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+	unsigned long irq_flags;
+
+	if (ctx == NULL) {
+		printk(KERN_WARNING "Couldn't allocate client context for %s/%s\n",
+		       device->name, client->name);
+		return -ENOMEM;
+	}
+
+	ctx->data = NULL;
+	ctx->client = client;
+
+	spin_lock_irqsave(&device->client_data_lock, irq_flags);
+	list_add(&ctx->list, &device->client_data_list);
+	spin_unlock_irqrestore(&device->client_data_lock, irq_flags);
+
+	return 0;
+}
+
+/*
+ * Allocate device->pkey_tbl_len[] and device->gid_tbl_len[] (one slot
+ * per port, indexed by port number minus start_port()) and fill them
+ * from ib_query_port().
+ *
+ * Returns 0 on success. On failure both arrays are freed and -ENOMEM
+ * or the error from ib_query_port() is returned.
+ */
+static int read_port_table_lengths(struct ib_device *device)
+{
+	struct ib_port_attr *attr;
+	int first_port = start_port(device);
+	int nports = end_port(device) - first_port + 1;
+	int status;
+	u8 idx;
+
+	attr = kmalloc(sizeof(*attr), GFP_KERNEL);
+	if (!attr)
+		return -ENOMEM;
+
+	device->pkey_tbl_len = kmalloc(sizeof(*device->pkey_tbl_len) * nports,
+				       GFP_KERNEL);
+	device->gid_tbl_len = kmalloc(sizeof(*device->gid_tbl_len) * nports,
+				      GFP_KERNEL);
+	if (!device->pkey_tbl_len || !device->gid_tbl_len) {
+		status = -ENOMEM;
+		goto free_tables;
+	}
+
+	for (idx = 0; idx < nports; ++idx) {
+		status = ib_query_port(device, idx + first_port, attr);
+		if (status)
+			goto free_tables;
+
+		device->pkey_tbl_len[idx] = attr->pkey_tbl_len;
+		device->gid_tbl_len[idx] = attr->gid_tbl_len;
+	}
+
+	kfree(attr);
+	return 0;
+
+free_tables:
+	kfree(device->gid_tbl_len);
+	kfree(device->pkey_tbl_len);
+	kfree(attr);
+	return status;
+}
+
+/**
+ * ib_register_device - Register an IB device with IB core
+ * @device:Device to register
+ * @port_callback:Passed unchanged to ib_device_register_sysfs().
+ * NOTE(review): presumably invoked per port while the sysfs entries
+ * are created -- confirm in sysfs.c.
+ *
+ * Low-level drivers use ib_register_device() to register their
+ * devices with the IB core. All registered clients will receive a
+ * callback for each device that is added. @device must be allocated
+ * with ib_alloc_device().
+ *
+ * The whole sequence runs under device_mutex. Returns 0 on success or
+ * a negative errno; on failure the device is not added to device_list.
+ */
+int ib_register_device(struct ib_device *device,
+ int (*port_callback)(struct ib_device *,
+ u8, struct kobject *))
+{
+ int ret;
+
+ mutex_lock(&device_mutex);
+
+ /* A '%' in the name marks a template; replace it with a unique name. */
+ if (strchr(device->name, '%')) {
+ ret = alloc_name(device->name);
+ if (ret)
+ goto out;
+ }
+
+ if (ib_device_check_mandatory(device)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ INIT_LIST_HEAD(&device->event_handler_list);
+ INIT_LIST_HEAD(&device->client_data_list);
+ spin_lock_init(&device->event_handler_lock);
+ spin_lock_init(&device->client_data_lock);
+
+ ret = read_port_table_lengths(device);
+ if (ret) {
+ printk(KERN_WARNING "Couldn't create table lengths cache for device %s\n",
+ device->name);
+ goto out;
+ }
+
+ ret = ib_device_register_sysfs(device, port_callback);
+ if (ret) {
+ printk(KERN_WARNING "Couldn't register device %s with driver model\n",
+ device->name);
+ /* Undo read_port_table_lengths(). */
+ kfree(device->gid_tbl_len);
+ kfree(device->pkey_tbl_len);
+ goto out;
+ }
+
+ list_add_tail(&device->core_list, &device_list);
+
+ device->reg_state = IB_DEV_REGISTERED;
+
+ /*
+ * Notify every client that has an add method; a client whose
+ * context record cannot be allocated is skipped for this device.
+ */
+ {
+ struct ib_client *client;
+
+ list_for_each_entry(client, &client_list, list)
+ if (client->add && !add_client_context(device, client))
+ client->add(device);
+ }
+
+ out:
+ mutex_unlock(&device_mutex);
+ return ret;
+}
+EXPORT_SYMBOL(ib_register_device);
+
+/**
+ * ib_unregister_device - Unregister an IB device
+ * @device:Device to unregister
+ *
+ * Unregister an IB device. Every client with a remove method receives
+ * a remove callback, in reverse order of client registration. The
+ * device is then taken off the device list, its per-port table-length
+ * arrays are freed, its sysfs entries are removed and all client
+ * context records are released. The device ends up in the
+ * IB_DEV_UNREGISTERED state.
+ */
+void ib_unregister_device(struct ib_device *device)
+{
+	struct ib_client *client;
+	struct ib_client_data *context, *tmp;
+	unsigned long flags;
+
+	mutex_lock(&device_mutex);
+
+	list_for_each_entry_reverse(client, &client_list, list)
+		if (client->remove)
+			client->remove(device);
+
+	list_del(&device->core_list);
+
+	kfree(device->gid_tbl_len);
+	kfree(device->pkey_tbl_len);
+
+	mutex_unlock(&device_mutex);
+
+	ib_device_unregister_sysfs(device);
+
+	spin_lock_irqsave(&device->client_data_lock, flags);
+	list_for_each_entry_safe(context, tmp, &device->client_data_list, list) {
+		/*
+		 * Unlink before freeing, as ib_unregister_client() does, so
+		 * the list never holds pointers to freed records.
+		 */
+		list_del(&context->list);
+		kfree(context);
+	}
+	spin_unlock_irqrestore(&device->client_data_lock, flags);
+
+	device->reg_state = IB_DEV_UNREGISTERED;
+}
+EXPORT_SYMBOL(ib_unregister_device);
+
+/**
+ * ib_register_client - Register an IB client
+ * @client:Client to register
+ *
+ * Upper level users of the IB drivers call this to be told about IB
+ * device addition and removal. Add callbacks are delivered in client
+ * registration order and remove callbacks in the reverse order. A
+ * client that registers after devices already exist receives an add
+ * callback for each of them from within this call.
+ *
+ * Always returns 0.
+ */
+int ib_register_client(struct ib_client *client)
+{
+	struct ib_device *dev;
+
+	mutex_lock(&device_mutex);
+
+	list_add_tail(&client->list, &client_list);
+
+	list_for_each_entry(dev, &device_list, core_list) {
+		if (!client->add)
+			continue;
+		/* No context record means no add callback for this device. */
+		if (add_client_context(dev, client))
+			continue;
+		client->add(dev);
+	}
+
+	mutex_unlock(&device_mutex);
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_register_client);
+
+/**
+ * ib_unregister_client - Unregister an IB client
+ * @client:Client to unregister
+ *
+ * Removes a client registration. For every device still registered the
+ * client's remove method (if any) is called and the client's context
+ * record on that device is unlinked and freed. Finally the client is
+ * taken off the client list.
+ */
+void ib_unregister_client(struct ib_client *client)
+{
+	struct ib_client_data *ctx, *next;
+	struct ib_device *dev;
+	unsigned long flags;
+
+	mutex_lock(&device_mutex);
+
+	list_for_each_entry(dev, &device_list, core_list) {
+		if (client->remove)
+			client->remove(dev);
+
+		spin_lock_irqsave(&dev->client_data_lock, flags);
+		list_for_each_entry_safe(ctx, next, &dev->client_data_list, list) {
+			if (ctx->client != client)
+				continue;
+			list_del(&ctx->list);
+			kfree(ctx);
+		}
+		spin_unlock_irqrestore(&dev->client_data_lock, flags);
+	}
+	list_del(&client->list);
+
+	mutex_unlock(&device_mutex);
+}
+EXPORT_SYMBOL(ib_unregister_client);
+
+/**
+ * ib_get_client_data - Get IB client context
+ * @device:Device to get context for
+ * @client:Client to get context for
+ *
+ * Returns the pointer last stored with ib_set_client_data() for this
+ * (device, client) pair, or NULL when no context record exists.
+ */
+void *ib_get_client_data(struct ib_device *device, struct ib_client *client)
+{
+	struct ib_client_data *ctx;
+	unsigned long flags;
+	void *data = NULL;
+
+	spin_lock_irqsave(&device->client_data_lock, flags);
+	list_for_each_entry(ctx, &device->client_data_list, list) {
+		if (ctx->client != client)
+			continue;
+		data = ctx->data;
+		break;
+	}
+	spin_unlock_irqrestore(&device->client_data_lock, flags);
+
+	return data;
+}
+EXPORT_SYMBOL(ib_get_client_data);
+
+/**
+ * ib_set_client_data - Set IB client context
+ * @device:Device to set context for
+ * @client:Client to set context for
+ * @data:Context to set
+ *
+ * Stores @data in the context record of this (device, client) pair so
+ * that ib_get_client_data() can return it later. If no record exists a
+ * warning is logged and nothing is stored.
+ */
+void ib_set_client_data(struct ib_device *device, struct ib_client *client,
+			void *data)
+{
+	struct ib_client_data *ctx;
+	unsigned long flags;
+	int stored = 0;
+
+	spin_lock_irqsave(&device->client_data_lock, flags);
+
+	list_for_each_entry(ctx, &device->client_data_list, list) {
+		if (ctx->client == client) {
+			ctx->data = data;
+			stored = 1;
+			break;
+		}
+	}
+
+	/* Warn while still holding the lock. */
+	if (!stored)
+		printk(KERN_WARNING "No client context found for %s/%s\n",
+		       device->name, client->name);
+
+	spin_unlock_irqrestore(&device->client_data_lock, flags);
+}
+EXPORT_SYMBOL(ib_set_client_data);
+
+/**
+ * ib_register_event_handler - Register an IB event handler
+ * @event_handler:Handler to register
+ *
+ * Appends @event_handler to the handler list of its device. The
+ * handler is then called back when asynchronous IB events occur (as
+ * defined in chapter 11 of the InfiniBand Architecture Specification).
+ * This callback may occur in interrupt context.
+ *
+ * Always returns 0.
+ */
+int ib_register_event_handler(struct ib_event_handler *event_handler)
+{
+	struct ib_device *dev = event_handler->device;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dev->event_handler_lock, flags);
+	list_add_tail(&event_handler->list, &dev->event_handler_list);
+	spin_unlock_irqrestore(&dev->event_handler_lock, flags);
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_register_event_handler);
+
+/**
+ * ib_unregister_event_handler - Unregister an event handler
+ * @event_handler:Handler to unregister
+ *
+ * Removes a handler previously added with ib_register_event_handler()
+ * from its device's handler list.
+ *
+ * Always returns 0.
+ */
+int ib_unregister_event_handler(struct ib_event_handler *event_handler)
+{
+	struct ib_device *dev = event_handler->device;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dev->event_handler_lock, flags);
+	list_del(&event_handler->list);
+	spin_unlock_irqrestore(&dev->event_handler_lock, flags);
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_unregister_event_handler);
+
+/**
+ * ib_dispatch_event - Dispatch an asynchronous event
+ * @event:Event to dispatch
+ *
+ * Low-level drivers must call ib_dispatch_event() when an asynchronous
+ * event occurs. Every handler registered on the event's device is
+ * invoked in list order, with the device's event_handler_lock held and
+ * interrupts disabled.
+ */
+void ib_dispatch_event(struct ib_event *event)
+{
+	struct ib_device *dev = event->device;
+	struct ib_event_handler *cur;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dev->event_handler_lock, flags);
+
+	list_for_each_entry(cur, &dev->event_handler_list, list) {
+		cur->handler(cur, event);
+	}
+
+	spin_unlock_irqrestore(&dev->event_handler_lock, flags);
+}
+EXPORT_SYMBOL(ib_dispatch_event);
+
+/**
+ * ib_query_device - Query IB device attributes
+ * @device:Device to query
+ * @device_attr:Device attributes
+ *
+ * Forwards to the driver's query_device method, which reports the
+ * attributes through @device_attr, and returns that method's result.
+ */
+int ib_query_device(struct ib_device *device,
+		    struct ib_device_attr *device_attr)
+{
+	int status;
+
+	status = device->query_device(device, device_attr);
+
+	return status;
+}
+EXPORT_SYMBOL(ib_query_device);
+
+/**
+ * ib_query_port - Query IB port attributes
+ * @device:Device to query
+ * @port_num:Port number to query
+ * @port_attr:Port attributes
+ *
+ * Returns -EINVAL if @port_num lies outside start_port()..end_port()
+ * for @device. Otherwise forwards to the driver's query_port method,
+ * which reports the attributes through @port_attr, and returns that
+ * method's result.
+ */
+int ib_query_port(struct ib_device *device,
+		  u8 port_num,
+		  struct ib_port_attr *port_attr)
+{
+	if (port_num < start_port(device))
+		return -EINVAL;
+	if (port_num > end_port(device))
+		return -EINVAL;
+
+	return device->query_port(device, port_num, port_attr);
+}
+EXPORT_SYMBOL(ib_query_port);
+
+/**
+ * ib_query_gid - Get GID table entry
+ * @device:Device to query
+ * @port_num:Port number to query
+ * @index:GID table index to query
+ * @gid:Returned GID
+ *
+ * Forwards to the driver's query_gid method and returns its result.
+ * No argument checking is done here.
+ */
+int ib_query_gid(struct ib_device *device,
+		 u8 port_num, int index, union ib_gid *gid)
+{
+	int status;
+
+	status = device->query_gid(device, port_num, index, gid);
+
+	return status;
+}
+EXPORT_SYMBOL(ib_query_gid);
+
+/**
+ * ib_query_pkey - Get P_Key table entry
+ * @device:Device to query
+ * @port_num:Port number to query
+ * @index:P_Key table index to query
+ * @pkey:Returned P_Key
+ *
+ * Forwards to the driver's query_pkey method and returns its result.
+ * No argument checking is done here.
+ */
+int ib_query_pkey(struct ib_device *device,
+		  u8 port_num, u16 index, u16 *pkey)
+{
+	int status;
+
+	status = device->query_pkey(device, port_num, index, pkey);
+
+	return status;
+}
+EXPORT_SYMBOL(ib_query_pkey);
+
+/**
+ * ib_modify_device - Change IB device attributes
+ * @device:Device to modify
+ * @device_modify_mask:Mask of attributes to change
+ * @device_modify:New attribute values
+ *
+ * Forwards to the driver's modify_device method and returns its
+ * result. Returns -ENOSYS if the driver does not provide that method.
+ */
+int ib_modify_device(struct ib_device *device,
+		     int device_modify_mask,
+		     struct ib_device_modify *device_modify)
+{
+	if (device->modify_device)
+		return device->modify_device(device, device_modify_mask,
+					     device_modify);
+
+	return -ENOSYS;
+}
+EXPORT_SYMBOL(ib_modify_device);
+
+/**
+ * ib_modify_port - Modifies the attributes for the specified port.
+ * @device: The device to modify.
+ * @port_num: The number of the port to modify.
+ * @port_modify_mask: Mask used to specify which attributes of the port
+ *   to change.
+ * @port_modify: New attribute values for the port.
+ *
+ * Returns -ENOSYS if the driver has no modify_port method, -EINVAL if
+ * @port_num lies outside start_port()..end_port(), and otherwise the
+ * result of the driver's modify_port method.
+ */
+int ib_modify_port(struct ib_device *device,
+		   u8 port_num, int port_modify_mask,
+		   struct ib_port_modify *port_modify)
+{
+	if (!device->modify_port)
+		return -ENOSYS;
+
+	if (port_num >= start_port(device) && port_num <= end_port(device))
+		return device->modify_port(device, port_num,
+					   port_modify_mask, port_modify);
+
+	return -EINVAL;
+}
+EXPORT_SYMBOL(ib_modify_port);
+
+/**
+ * ib_find_gid - Returns the port number and GID table index where
+ *   a specified GID value occurs.
+ * @device: The device to query.
+ * @gid: The GID value to search for.
+ * @port_num: The port number of the device where the GID value was found.
+ * @index: The index into the GID table where the GID was found.  This
+ *   parameter may be NULL.
+ *
+ * Scans every port from start_port() to end_port() and, for each, the
+ * table entries counted by the device's gid_tbl_len[] array.
+ *
+ * Returns 0 on the first match, the error from ib_query_gid() if a
+ * lookup fails, or -ENOENT if the GID is not found.
+ */
+int ib_find_gid(struct ib_device *device, union ib_gid *gid,
+		u8 *port_num, u16 *index)
+{
+	union ib_gid entry;
+	int port, slot, err;
+
+	for (port = start_port(device); port <= end_port(device); ++port) {
+		for (slot = 0;
+		     slot < device->gid_tbl_len[port - start_port(device)];
+		     ++slot) {
+			err = ib_query_gid(device, port, slot, &entry);
+			if (err)
+				return err;
+
+			if (memcmp(&entry, gid, sizeof(*gid)) != 0)
+				continue;
+
+			*port_num = port;
+			if (index)
+				*index = slot;
+			return 0;
+		}
+	}
+
+	return -ENOENT;
+}
+EXPORT_SYMBOL(ib_find_gid);
+
+/**
+ * ib_find_pkey - Returns the PKey table index where a specified
+ *   PKey value occurs.
+ * @device: The device to query.
+ * @port_num: The port number of the device to search for the PKey.
+ * @pkey: The PKey value to search for.
+ * @index: The index into the PKey table where the PKey was found.
+ *
+ * Entries are compared on the low 15 bits only. An entry with bit 15
+ * set (full member) is preferred; if none exists the first matching
+ * entry without it (limited member) is returned.
+ *
+ * Returns 0 on success, -EINVAL if @port_num is not a valid port of
+ * @device, the error from ib_query_pkey() if a lookup fails, or
+ * -ENOENT if no entry matches.
+ */
+int ib_find_pkey(struct ib_device *device,
+		 u8 port_num, u16 pkey, u16 *index)
+{
+	int ret, i;
+	u16 tmp_pkey;
+	int partial_ix = -1;
+
+	/*
+	 * pkey_tbl_len[] only has slots for start_port()..end_port();
+	 * reject anything else instead of indexing out of bounds.
+	 */
+	if (port_num < start_port(device) || port_num > end_port(device))
+		return -EINVAL;
+
+	for (i = 0; i < device->pkey_tbl_len[port_num - start_port(device)]; ++i) {
+		ret = ib_query_pkey(device, port_num, i, &tmp_pkey);
+		if (ret)
+			return ret;
+		if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) {
+			/* if there is full-member pkey take it.*/
+			if (tmp_pkey & 0x8000) {
+				*index = i;
+				return 0;
+			}
+			/* remember only the first limited-member match */
+			if (partial_ix < 0)
+				partial_ix = i;
+		}
+	}
+
+	/*no full-member, if exists take the limited*/
+	if (partial_ix >= 0) {
+		*index = partial_ix;
+		return 0;
+	}
+	return -ENOENT;
+}
+EXPORT_SYMBOL(ib_find_pkey);
+
+/*
+ * Module init: create the "infiniband" workqueue, then bring up sysfs,
+ * the netlink interface and the P_Key/GID cache, in that order. If a
+ * step fails, the steps already completed are undone in reverse order
+ * and that step's error is returned.
+ */
+static int __init ib_core_init(void)
+{
+ int ret;
+
+ ib_wq = alloc_workqueue("infiniband", 0, 0);
+ if (!ib_wq)
+ return -ENOMEM;
+
+ ret = ib_sysfs_setup();
+ if (ret) {
+ printk(KERN_WARNING "Couldn't create InfiniBand device class\n");
+ goto err;
+ }
+
+ ret = ibnl_init();
+ if (ret) {
+ printk(KERN_WARNING "Couldn't init IB netlink interface\n");
+ goto err_sysfs;
+ }
+
+ ret = ib_cache_setup();
+ if (ret) {
+ printk(KERN_WARNING "Couldn't set up InfiniBand P_Key/GID cache\n");
+ goto err_nl;
+ }
+
+ return 0;
+
+ /* Unwind labels fall through, undoing the steps in reverse order. */
+err_nl:
+ ibnl_cleanup();
+
+err_sysfs:
+ ib_sysfs_cleanup();
+
+err:
+ destroy_workqueue(ib_wq);
+ return ret;
+}
+
+/*
+ * Module exit: tear down everything ib_core_init() set up, in the
+ * reverse order of its creation.
+ */
+static void __exit ib_core_cleanup(void)
+{
+ ib_cache_cleanup();
+ ibnl_cleanup();
+ ib_sysfs_cleanup();
+ /* Make sure that any pending umem accounting work is done. */
+ destroy_workqueue(ib_wq);
+}
+
+module_init(ib_core_init);
+module_exit(ib_core_cleanup);
diff --git a/drivers/infiniband/core/fmr_pool.c b/drivers/infiniband/core/fmr_pool.c
new file mode 100644
index 000000000..9f5ad7cc3
--- /dev/null
+++ b/drivers/infiniband/core/fmr_pool.c
@@ -0,0 +1,544 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/spinlock.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/jhash.h>
+#include <linux/kthread.h>
+
+#include <rdma/ib_fmr_pool.h>
+
+#include "core_priv.h"
+
+#define PFX "fmr_pool: "
+
+enum {
+ /* Remap limit used when the device reports max_map_per_fmr == 0. */
+ IB_FMR_MAX_REMAPS = 32,
+
+ /* The FMR cache has 1 << IB_FMR_HASH_BITS buckets. */
+ IB_FMR_HASH_BITS = 8,
+ IB_FMR_HASH_SIZE = 1 << IB_FMR_HASH_BITS,
+ /* Same value as the mask applied in ib_fmr_hash(). */
+ IB_FMR_HASH_MASK = IB_FMR_HASH_SIZE - 1
+};
+
+/*
+ * If an FMR is not in use, then the list member will point to either
+ * its pool's free_list (if the FMR can be mapped again; that is,
+ * remap_count < pool->max_remaps) or its pool's dirty_list (if the
+ * FMR needs to be unmapped before being remapped). In either of
+ * these cases it is a bug if the ref_count is not 0. In other words,
+ * if ref_count is > 0, then the list member must not be linked into
+ * either free_list or dirty_list.
+ *
+ * The cache_node member is used to link the FMR into a cache bucket
+ * (if caching is enabled). This is independent of the reference
+ * count of the FMR. When a valid FMR is released, its ref_count is
+ * decremented, and if ref_count reaches 0, the FMR is placed in
+ * either free_list or dirty_list as appropriate. However, it is not
+ * removed from the cache and may be "revived" if a call to
+ * ib_fmr_pool_map_phys() occurs before the FMR is remapped. In
+ * this case we just increment the ref_count and remove the FMR from
+ * free_list/dirty_list.
+ *
+ * Before we remap an FMR from free_list, we remove it from the cache
+ * (to prevent another user from obtaining a stale FMR). When an FMR
+ * is released, we add it to the tail of the free list, so that our
+ * cache eviction policy is "least recently used."
+ *
+ * All manipulation of ref_count, list and cache_node is protected by
+ * pool_lock to maintain consistency.
+ */
+
+struct ib_fmr_pool {
+ /* Protects ref_count, list and cache_node of every FMR in the pool. */
+ spinlock_t pool_lock;
+
+ /* Number of FMRs successfully created for this pool. */
+ int pool_size;
+ /* Largest page list accepted by ib_fmr_pool_map_phys(). */
+ int max_pages;
+ /* An FMR whose remap_count reaches this goes to dirty_list on release. */
+ int max_remaps;
+ /* A flush is requested once dirty_len reaches this value. */
+ int dirty_watermark;
+ /* Incremented per FMR put on dirty_list by ib_fmr_pool_unmap(); reset by a batch release. */
+ int dirty_len;
+ struct list_head free_list;
+ struct list_head dirty_list;
+ /* IB_FMR_HASH_SIZE hash buckets, or NULL when caching is disabled. */
+ struct hlist_head *cache_bucket;
+
+ /* Optional callback run by the cleanup thread after each batch release. */
+ void (*flush_function)(struct ib_fmr_pool *pool,
+ void * arg);
+ void *flush_arg;
+
+ /* Cleanup thread running ib_fmr_cleanup_thread(). */
+ struct task_struct *thread;
+
+ /* Flush requests issued vs. flushes completed; compared by signed difference. */
+ atomic_t req_ser;
+ atomic_t flush_ser;
+
+ /* Waited on by ib_flush_fmr_pool() until its request has been served. */
+ wait_queue_head_t force_wait;
+};
+
+/*
+ * Map the first page address of a mapping to a cache bucket index in
+ * the range 0..IB_FMR_HASH_SIZE-1 by hashing its two 32-bit halves.
+ */
+static inline u32 ib_fmr_hash(u64 first_page)
+{
+	u32 lo = (u32) first_page;
+	u32 hi = (u32) (first_page >> 32);
+
+	return jhash_2words(lo, hi, 0) & IB_FMR_HASH_MASK;
+}
+
+/*
+ * Search the pool's cache for an FMR with the same I/O virtual address,
+ * page count and page list. Returns the FMR, or NULL when caching is
+ * disabled or nothing matches.
+ *
+ * Caller must hold pool_lock.
+ */
+static inline struct ib_pool_fmr *ib_fmr_cache_lookup(struct ib_fmr_pool *pool,
+						      u64 *page_list,
+						      int page_list_len,
+						      u64 io_virtual_address)
+{
+	struct hlist_head *head;
+	struct ib_pool_fmr *cand;
+
+	if (!pool->cache_bucket)
+		return NULL;
+
+	/* Buckets are keyed on the first page of the list. */
+	head = pool->cache_bucket + ib_fmr_hash(page_list[0]);
+
+	hlist_for_each_entry(cand, head, cache_node) {
+		if (cand->io_virtual_address != io_virtual_address)
+			continue;
+		if (cand->page_list_len != page_list_len)
+			continue;
+		if (memcmp(page_list, cand->page_list,
+			   page_list_len * sizeof(*page_list)) != 0)
+			continue;
+
+		return cand;
+	}
+
+	return NULL;
+}
+
+/*
+ * Unmap every FMR on the pool's dirty list in one ib_unmap_fmr() call
+ * and put them back on the free list.
+ *
+ * Under pool_lock each dirty FMR is dropped from the cache, its
+ * remap_count is reset and its underlying ib_fmr is collected on a
+ * private list; the dirty list is then emptied. The unmap itself runs
+ * without the lock held. A failing unmap is only logged.
+ */
+static void ib_fmr_batch_release(struct ib_fmr_pool *pool)
+{
+	int ret;
+	struct ib_pool_fmr *fmr;
+	LIST_HEAD(unmap_list);
+	LIST_HEAD(fmr_list);
+
+	spin_lock_irq(&pool->pool_lock);
+
+	list_for_each_entry(fmr, &pool->dirty_list, list) {
+		hlist_del_init(&fmr->cache_node);
+		fmr->remap_count = 0;
+		list_add_tail(&fmr->fmr->list, &fmr_list);
+
+#ifdef DEBUG
+		if (fmr->ref_count != 0) {
+			/* %p: fmr is a pointer, not an unsigned int */
+			printk(KERN_WARNING PFX "Unmapping FMR %p with ref count %d\n",
+			       fmr, fmr->ref_count);
+		}
+#endif
+	}
+
+	list_splice_init(&pool->dirty_list, &unmap_list);
+	pool->dirty_len = 0;
+
+	spin_unlock_irq(&pool->pool_lock);
+
+	if (list_empty(&unmap_list)) {
+		return;
+	}
+
+	ret = ib_unmap_fmr(&fmr_list);
+	if (ret)
+		printk(KERN_WARNING PFX "ib_unmap_fmr returned %d\n", ret);
+
+	spin_lock_irq(&pool->pool_lock);
+	list_splice(&unmap_list, &pool->free_list);
+	spin_unlock_irq(&pool->pool_lock);
+}
+
+/*
+ * Body of the pool's cleanup kthread.
+ *
+ * Whenever flush_ser is behind req_ser (signed difference < 0) it runs
+ * a batch release, increments flush_ser, wakes waiters on force_wait
+ * and calls the pool's flush_function if one is set. It then sleeps
+ * until woken, and exits with 0 once kthread_should_stop() is true.
+ */
+static int ib_fmr_cleanup_thread(void *pool_ptr)
+{
+ struct ib_fmr_pool *pool = pool_ptr;
+
+ do {
+ if (atomic_read(&pool->flush_ser) - atomic_read(&pool->req_ser) < 0) {
+ ib_fmr_batch_release(pool);
+
+ atomic_inc(&pool->flush_ser);
+ wake_up_interruptible(&pool->force_wait);
+
+ if (pool->flush_function)
+ pool->flush_function(pool, pool->flush_arg);
+ }
+
+ /*
+ * The task state is set before the condition is re-tested so
+ * that a wake_up_process() issued in between is not missed.
+ */
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (atomic_read(&pool->flush_ser) - atomic_read(&pool->req_ser) >= 0 &&
+ !kthread_should_stop())
+ schedule();
+ __set_current_state(TASK_RUNNING);
+ } while (!kthread_should_stop());
+
+ return 0;
+}
+
+/**
+ * ib_create_fmr_pool - Create an FMR pool
+ * @pd:Protection domain for FMRs
+ * @params:FMR pool parameters
+ *
+ * Create a pool of FMRs. Return value is pointer to new pool or
+ * an ERR_PTR()-encoded error code if creation failed: -EINVAL when
+ * @params is NULL, -ENOSYS when the device lacks any of the FMR
+ * methods, -ENOMEM on allocation failure, or the error returned by
+ * ib_query_device() or kthread_run().
+ */
+struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd,
+ struct ib_fmr_pool_param *params)
+{
+ struct ib_device *device;
+ struct ib_fmr_pool *pool;
+ struct ib_device_attr *attr;
+ int i;
+ int ret;
+ int max_remaps;
+
+ if (!params)
+ return ERR_PTR(-EINVAL);
+
+ device = pd->device;
+ if (!device->alloc_fmr || !device->dealloc_fmr ||
+ !device->map_phys_fmr || !device->unmap_fmr) {
+ printk(KERN_INFO PFX "Device %s does not support FMRs\n",
+ device->name);
+ return ERR_PTR(-ENOSYS);
+ }
+
+ attr = kmalloc(sizeof *attr, GFP_KERNEL);
+ if (!attr) {
+ printk(KERN_WARNING PFX "couldn't allocate device attr struct\n");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ ret = ib_query_device(device, attr);
+ if (ret) {
+ printk(KERN_WARNING PFX "couldn't query device: %d\n", ret);
+ kfree(attr);
+ return ERR_PTR(ret);
+ }
+
+ /* A device reporting 0 gets the default remap limit. */
+ if (!attr->max_map_per_fmr)
+ max_remaps = IB_FMR_MAX_REMAPS;
+ else
+ max_remaps = attr->max_map_per_fmr;
+
+ kfree(attr);
+
+ pool = kmalloc(sizeof *pool, GFP_KERNEL);
+ if (!pool) {
+ printk(KERN_WARNING PFX "couldn't allocate pool struct\n");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ pool->cache_bucket = NULL;
+
+ pool->flush_function = params->flush_function;
+ pool->flush_arg = params->flush_arg;
+
+ INIT_LIST_HEAD(&pool->free_list);
+ INIT_LIST_HEAD(&pool->dirty_list);
+
+ /* The hash table is only allocated when the caller asks for caching. */
+ if (params->cache) {
+ pool->cache_bucket =
+ kmalloc(IB_FMR_HASH_SIZE * sizeof *pool->cache_bucket,
+ GFP_KERNEL);
+ if (!pool->cache_bucket) {
+ printk(KERN_WARNING PFX "Failed to allocate cache in pool\n");
+ ret = -ENOMEM;
+ goto out_free_pool;
+ }
+
+ for (i = 0; i < IB_FMR_HASH_SIZE; ++i)
+ INIT_HLIST_HEAD(pool->cache_bucket + i);
+ }
+
+ pool->pool_size = 0;
+ pool->max_pages = params->max_pages_per_fmr;
+ pool->max_remaps = max_remaps;
+ pool->dirty_watermark = params->dirty_watermark;
+ pool->dirty_len = 0;
+ spin_lock_init(&pool->pool_lock);
+ atomic_set(&pool->req_ser, 0);
+ atomic_set(&pool->flush_ser, 0);
+ init_waitqueue_head(&pool->force_wait);
+
+ /*
+ * The thread is started before the FMRs are created; the out_fail
+ * path relies on it because ib_destroy_fmr_pool() stops it.
+ */
+ pool->thread = kthread_run(ib_fmr_cleanup_thread,
+ pool,
+ "ib_fmr(%s)",
+ device->name);
+ if (IS_ERR(pool->thread)) {
+ printk(KERN_WARNING PFX "couldn't start cleanup thread\n");
+ ret = PTR_ERR(pool->thread);
+ goto out_free_pool;
+ }
+
+ {
+ struct ib_pool_fmr *fmr;
+ struct ib_fmr_attr fmr_attr = {
+ .max_pages = params->max_pages_per_fmr,
+ .max_maps = pool->max_remaps,
+ .page_shift = params->page_shift
+ };
+ int bytes_per_fmr = sizeof *fmr;
+
+ /* With caching, each FMR carries room for a copy of its page list. */
+ if (pool->cache_bucket)
+ bytes_per_fmr += params->max_pages_per_fmr * sizeof (u64);
+
+ for (i = 0; i < params->pool_size; ++i) {
+ fmr = kmalloc(bytes_per_fmr, GFP_KERNEL);
+ if (!fmr) {
+ printk(KERN_WARNING PFX "failed to allocate fmr "
+ "struct for FMR %d\n", i);
+ goto out_fail;
+ }
+
+ fmr->pool = pool;
+ fmr->remap_count = 0;
+ fmr->ref_count = 0;
+ INIT_HLIST_NODE(&fmr->cache_node);
+
+ fmr->fmr = ib_alloc_fmr(pd, params->access, &fmr_attr);
+ if (IS_ERR(fmr->fmr)) {
+ printk(KERN_WARNING PFX "fmr_create failed "
+ "for FMR %d\n", i);
+ kfree(fmr);
+ goto out_fail;
+ }
+
+ list_add_tail(&fmr->list, &pool->free_list);
+ ++pool->pool_size;
+ }
+ }
+
+ return pool;
+
+ out_free_pool:
+ kfree(pool->cache_bucket);
+ kfree(pool);
+
+ return ERR_PTR(ret);
+
+ /*
+ * NOTE(review): this path always reports -ENOMEM, so the error
+ * code returned by a failing ib_alloc_fmr() is not propagated.
+ */
+ out_fail:
+ ib_destroy_fmr_pool(pool);
+
+ return ERR_PTR(-ENOMEM);
+}
+EXPORT_SYMBOL(ib_create_fmr_pool);
+
+/**
+ * ib_destroy_fmr_pool - Free FMR pool
+ * @pool:FMR pool to free
+ *
+ * Stops the cleanup thread, releases the dirty list one last time and
+ * then deallocates every FMR on the free list, unmapping first those
+ * that are still mapped. A warning is logged if fewer FMRs were freed
+ * than the pool created. Finally the cache and the pool are freed.
+ */
+void ib_destroy_fmr_pool(struct ib_fmr_pool *pool)
+{
+	struct ib_pool_fmr *entry, *next;
+	LIST_HEAD(single);
+	int released = 0;
+
+	kthread_stop(pool->thread);
+	ib_fmr_batch_release(pool);
+
+	list_for_each_entry_safe(entry, next, &pool->free_list, list) {
+		if (entry->remap_count) {
+			/* Unmap this one FMR through a one-element list. */
+			INIT_LIST_HEAD(&single);
+			list_add_tail(&entry->fmr->list, &single);
+			ib_unmap_fmr(&single);
+		}
+		ib_dealloc_fmr(entry->fmr);
+		list_del(&entry->list);
+		kfree(entry);
+		++released;
+	}
+
+	if (released < pool->pool_size)
+		printk(KERN_WARNING PFX "pool still has %d regions registered\n",
+		       pool->pool_size - released);
+
+	kfree(pool->cache_bucket);
+	kfree(pool);
+}
+EXPORT_SYMBOL(ib_destroy_fmr_pool);
+
+/**
+ * ib_flush_fmr_pool - Invalidate all unmapped FMRs
+ * @pool:FMR pool to flush
+ *
+ * Ensure that all unmapped FMRs are fully invalidated. Issues a flush
+ * request to the cleanup thread and waits until it has been served.
+ *
+ * Returns 0 on success or -EINTR if the wait was interrupted.
+ */
+int ib_flush_fmr_pool(struct ib_fmr_pool *pool)
+{
+	struct ib_pool_fmr *cur, *nxt;
+	int target;
+	int interrupted;
+
+	/*
+	 * FMRs on free_list may have been mapped before without having
+	 * hit the remap limit. Hand those to dirty_list so the cleanup
+	 * thread unmaps them as well.
+	 */
+	spin_lock_irq(&pool->pool_lock);
+	list_for_each_entry_safe(cur, nxt, &pool->free_list, list) {
+		if (cur->remap_count <= 0)
+			continue;
+		list_move(&cur->list, &pool->dirty_list);
+	}
+	spin_unlock_irq(&pool->pool_lock);
+
+	target = atomic_inc_return(&pool->req_ser);
+	wake_up_process(pool->thread);
+
+	interrupted = wait_event_interruptible(pool->force_wait,
+			atomic_read(&pool->flush_ser) - target >= 0);
+
+	return interrupted ? -EINTR : 0;
+}
+EXPORT_SYMBOL(ib_flush_fmr_pool);
+
+/**
+ * ib_fmr_pool_map_phys - Map an FMR from an FMR pool
+ * @pool_handle:FMR pool to allocate FMR from
+ * @page_list:List of pages to map
+ * @list_len:Number of pages in @page_list
+ * @io_virtual_address:I/O virtual address for new FMR
+ *
+ * Map an FMR from an FMR pool. If the pool has a cache and an FMR with
+ * the same address and page list is found there, that FMR is reused and
+ * its reference count incremented instead of mapping a new one.
+ *
+ * Returns the FMR on success, or an ERR_PTR(): -EINVAL if @list_len is
+ * below 1 or above the pool's max_pages, -EAGAIN if the free list is
+ * empty, or the error returned by ib_map_phys_fmr().
+ */
+struct ib_pool_fmr *ib_fmr_pool_map_phys(struct ib_fmr_pool *pool_handle,
+ u64 *page_list,
+ int list_len,
+ u64 io_virtual_address)
+{
+ struct ib_fmr_pool *pool = pool_handle;
+ struct ib_pool_fmr *fmr;
+ unsigned long flags;
+ int result;
+
+ if (list_len < 1 || list_len > pool->max_pages)
+ return ERR_PTR(-EINVAL);
+
+ spin_lock_irqsave(&pool->pool_lock, flags);
+ fmr = ib_fmr_cache_lookup(pool,
+ page_list,
+ list_len,
+ io_virtual_address);
+ if (fmr) {
+ /* found in cache */
+ ++fmr->ref_count;
+ /* It was idle, so take it off free_list/dirty_list. */
+ if (fmr->ref_count == 1) {
+ list_del(&fmr->list);
+ }
+
+ spin_unlock_irqrestore(&pool->pool_lock, flags);
+
+ return fmr;
+ }
+
+ if (list_empty(&pool->free_list)) {
+ spin_unlock_irqrestore(&pool->pool_lock, flags);
+ return ERR_PTR(-EAGAIN);
+ }
+
+ /* Take the head of free_list and drop its stale cache entry. */
+ fmr = list_entry(pool->free_list.next, struct ib_pool_fmr, list);
+ list_del(&fmr->list);
+ hlist_del_init(&fmr->cache_node);
+ spin_unlock_irqrestore(&pool->pool_lock, flags);
+
+ /* The mapping call itself is made without pool_lock held. */
+ result = ib_map_phys_fmr(fmr->fmr, page_list, list_len,
+ io_virtual_address);
+
+ if (result) {
+ /* Mapping failed: put the FMR back at the head of free_list. */
+ spin_lock_irqsave(&pool->pool_lock, flags);
+ list_add(&fmr->list, &pool->free_list);
+ spin_unlock_irqrestore(&pool->pool_lock, flags);
+
+ printk(KERN_WARNING PFX "fmr_map returns %d\n", result);
+
+ return ERR_PTR(result);
+ }
+
+ ++fmr->remap_count;
+ fmr->ref_count = 1;
+
+ /* With caching enabled, record the mapping so later calls can reuse it. */
+ if (pool->cache_bucket) {
+ fmr->io_virtual_address = io_virtual_address;
+ fmr->page_list_len = list_len;
+ memcpy(fmr->page_list, page_list, list_len * sizeof(*page_list));
+
+ spin_lock_irqsave(&pool->pool_lock, flags);
+ hlist_add_head(&fmr->cache_node,
+ pool->cache_bucket + ib_fmr_hash(fmr->page_list[0]));
+ spin_unlock_irqrestore(&pool->pool_lock, flags);
+ }
+
+ return fmr;
+}
+EXPORT_SYMBOL(ib_fmr_pool_map_phys);
+
+/**
+ * ib_fmr_pool_unmap - Unmap FMR
+ * @fmr:FMR to unmap
+ *
+ * Drops one reference on @fmr. When the last reference goes away the
+ * FMR is queued on the pool's free list, or on the dirty list if it
+ * has reached the pool's remap limit; once the dirty list is as long
+ * as the dirty watermark, the cleanup thread is asked to flush. The
+ * FMR mapping may remain valid until the FMR is reused (or until
+ * ib_flush_fmr_pool() is called).
+ *
+ * Always returns 0.
+ */
+int ib_fmr_pool_unmap(struct ib_pool_fmr *fmr)
+{
+	struct ib_fmr_pool *pool = fmr->pool;
+	unsigned long flags;
+
+	spin_lock_irqsave(&pool->pool_lock, flags);
+
+	if (--fmr->ref_count == 0) {
+		if (fmr->remap_count >= pool->max_remaps) {
+			list_add_tail(&fmr->list, &pool->dirty_list);
+			pool->dirty_len++;
+			if (pool->dirty_len >= pool->dirty_watermark) {
+				atomic_inc(&pool->req_ser);
+				wake_up_process(pool->thread);
+			}
+		} else {
+			list_add_tail(&fmr->list, &pool->free_list);
+		}
+	}
+
+#ifdef DEBUG
+	if (fmr->ref_count < 0)
+		printk(KERN_WARNING PFX "FMR %p has ref count %d < 0\n",
+		       fmr, fmr->ref_count);
+#endif
+
+	spin_unlock_irqrestore(&pool->pool_lock, flags);
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_fmr_pool_unmap);
diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c
new file mode 100644
index 000000000..ff9163dc1
--- /dev/null
+++ b/drivers/infiniband/core/iwcm.c
@@ -0,0 +1,1069 @@
+/*
+ * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004, 2005 Voltaire Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ * Copyright (c) 2005 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/dma-mapping.h>
+#include <linux/err.h>
+#include <linux/idr.h>
+#include <linux/interrupt.h>
+#include <linux/rbtree.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
+#include <linux/completion.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/sysctl.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_addr.h>
+
+#include "iwcm.h"
+
+MODULE_AUTHOR("Tom Tucker");
+MODULE_DESCRIPTION("iWARP CM");
+MODULE_LICENSE("Dual BSD/GPL");
+
+static struct workqueue_struct *iwcm_wq;
+/*
+ * Container for one provider event queued against a cm_id.  Elements are
+ * preallocated per cm_id by alloc_work_entries() and recycled through the
+ * cm_id's work_free_list.
+ */
+struct iwcm_work {
+ struct work_struct work; /* initialised with cm_work_handler() when queued */
+ struct iwcm_id_private *cm_id; /* cm_id this element belongs to */
+ struct list_head list; /* link on cm_id->work_list while pending */
+ struct iw_cm_event event; /* copy of the event passed in by the provider */
+ struct list_head free_list; /* link on cm_id->work_free_list while unused */
+};
+
+/* Backlog substituted by iw_cm_listen() when the caller passes backlog 0. */
+static unsigned int default_backlog = 256;
+
+/* Handle returned by register_net_sysctl(); released in iw_cm_cleanup(). */
+static struct ctl_table_header *iwcm_ctl_table_hdr;
+/* sysctl table registered under "net/iw_cm" by iw_cm_init(). */
+static struct ctl_table iwcm_ctl_table[] = {
+ {
+ .procname = "default_backlog",
+ .data = &default_backlog,
+ .maxlen = sizeof(default_backlog),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ { } /* empty entry terminates the table */
+};
+
+/*
+ * The following services provide a mechanism for pre-allocating iwcm_work
+ * elements. The design pre-allocates them based on the cm_id type:
+ * LISTENING IDS: Get enough elements preallocated to handle the
+ * listen backlog.
+ * ACTIVE IDS: 4: CONNECT_REPLY, ESTABLISHED, DISCONNECT, CLOSE
+ * PASSIVE IDS: 3: ESTABLISHED, DISCONNECT, CLOSE
+ *
+ * Allocating them in connect and listen avoids having to deal
+ * with allocation failures on the event upcall from the provider (which
+ * is called in the interrupt context).
+ *
+ * One exception is when creating the cm_id for incoming connection requests.
+ * There are two cases:
+ * 1) in the event upcall, cm_event_handler(), for a listening cm_id. If
+ * the backlog is exceeded, then no more connection request events will
+ * be processed. cm_event_handler() returns -ENOMEM in this case. It's up
+ * to the provider to reject the connection request.
+ * 2) in the connection request workqueue handler, cm_conn_req_handler().
+ * If work elements cannot be allocated for the new connect request cm_id,
+ * then IWCM will call the provider reject method. This is ok since
+ * cm_conn_req_handler() runs in the workqueue thread context.
+ */
+
+/*
+ * Take the first preallocated work element off the cm_id's free list.
+ * Returns NULL when the free list is empty.  The visible caller,
+ * cm_event_handler(), invokes this with cm_id_priv->lock held.
+ */
+static struct iwcm_work *get_work(struct iwcm_id_private *cm_id_priv)
+{
+	struct list_head *pool = &cm_id_priv->work_free_list;
+	struct iwcm_work *entry;
+
+	if (list_empty(pool))
+		return NULL;
+
+	entry = list_entry(pool->next, struct iwcm_work, free_list);
+	list_del_init(&entry->free_list);
+	return entry;
+}
+
+/*
+ * Return a work element to the free list of the cm_id recorded in
+ * work->cm_id.
+ */
+static void put_work(struct iwcm_work *work)
+{
+	struct iwcm_id_private *owner = work->cm_id;
+
+	list_add(&work->free_list, &owner->work_free_list);
+}
+
+/*
+ * Free every work element currently sitting on the cm_id's free list.
+ *
+ * Each element is unlinked before it is freed so that work_free_list is
+ * left empty and valid afterwards.  This matters because the function can
+ * run more than once for the same cm_id: alloc_work_entries() calls it on
+ * allocation failure and free_cm_id() calls it again at teardown.  Without
+ * the unlink the second walk would touch, and free again, memory that has
+ * already been released.
+ */
+static void dealloc_work_entries(struct iwcm_id_private *cm_id_priv)
+{
+	struct list_head *e, *tmp;
+
+	list_for_each_safe(e, tmp, &cm_id_priv->work_free_list) {
+		list_del(e);
+		kfree(list_entry(e, struct iwcm_work, free_list));
+	}
+}
+
+/*
+ * Preallocate @count work elements for the cm_id and place them on its
+ * free list.  The free list must be empty on entry (enforced by BUG_ON).
+ *
+ * Returns 0 on success.  If an allocation fails, the elements queued so
+ * far are released through dealloc_work_entries() and -ENOMEM is returned.
+ * Allocations use GFP_KERNEL.
+ */
+static int alloc_work_entries(struct iwcm_id_private *cm_id_priv, int count)
+{
+	struct iwcm_work *entry;
+
+	BUG_ON(!list_empty(&cm_id_priv->work_free_list));
+
+	for (; count; count--) {
+		entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+		if (!entry) {
+			dealloc_work_entries(cm_id_priv);
+			return -ENOMEM;
+		}
+
+		entry->cm_id = cm_id_priv;
+		INIT_LIST_HEAD(&entry->list);
+		put_work(entry);
+	}
+
+	return 0;
+}
+
+/*
+ * Duplicate the private data attached to an incoming event so that the
+ * low level driver doesn't have to keep it around, and repoint
+ * event->private_data at the copy.  The copy is made with GFP_ATOMIC.
+ *
+ * Returns 0 on success or -ENOMEM if the duplicate could not be
+ * allocated, in which case the event is left untouched.
+ */
+static int copy_private_data(struct iw_cm_event *event)
+{
+	void *copy = kmemdup(event->private_data, event->private_data_len,
+			     GFP_ATOMIC);
+
+	if (copy == NULL)
+		return -ENOMEM;
+
+	event->private_data = copy;
+	return 0;
+}
+
+/*
+ * Release the memory owned by a cm_id: first the work elements left on
+ * its free list, then the iwcm_id_private structure itself.
+ */
+static void free_cm_id(struct iwcm_id_private *cm_id_priv)
+{
+ dealloc_work_entries(cm_id_priv);
+ kfree(cm_id_priv);
+}
+
+/*
+ * Release a reference on cm_id.  When the last reference is dropped the
+ * destroy_comp completion is signalled, which lets a thread waiting in
+ * iw_destroy_cm_id() proceed.
+ *
+ * Returns 1 if this call dropped the last reference, 0 otherwise.
+ */
+static int iwcm_deref_id(struct iwcm_id_private *cm_id_priv)
+{
+	BUG_ON(atomic_read(&cm_id_priv->refcount)==0);
+
+	if (!atomic_dec_and_test(&cm_id_priv->refcount))
+		return 0;
+
+	/* No event may still be queued once the last reference is gone. */
+	BUG_ON(!list_empty(&cm_id_priv->work_list));
+	complete(&cm_id_priv->destroy_comp);
+	return 1;
+}
+
+/*
+ * Take an additional reference on the cm_id.  Installed as the id's
+ * add_ref callback by iw_create_cm_id().
+ */
+static void add_ref(struct iw_cm_id *cm_id)
+{
+	struct iwcm_id_private *priv =
+		container_of(cm_id, struct iwcm_id_private, id);
+
+	atomic_inc(&priv->refcount);
+}
+
+/*
+ * Drop a reference on the cm_id.  Installed as the id's rem_ref callback
+ * by iw_create_cm_id().  If this was the last reference and the
+ * IWCM_F_CALLBACK_DESTROY flag was set, the cm_id is freed here.
+ */
+static void rem_ref(struct iw_cm_id *cm_id)
+{
+	struct iwcm_id_private *priv =
+		container_of(cm_id, struct iwcm_id_private, id);
+	int destroy_pending;
+
+	/*
+	 * Sample the flag before dropping the reference in case the cm_id
+	 * gets freed on another thread.
+	 */
+	destroy_pending = test_bit(IWCM_F_CALLBACK_DESTROY, &priv->flags);
+	if (!iwcm_deref_id(priv) || !destroy_pending)
+		return;
+
+	BUG_ON(!list_empty(&priv->work_list));
+	free_cm_id(priv);
+}
+
+static int cm_event_handler(struct iw_cm_id *cm_id, struct iw_cm_event *event);
+
+/*
+ * Allocate and initialise a new cm_id bound to @device.
+ *
+ * The id starts in the IDLE state with a reference count of 1, empty
+ * work lists, and the IWCM's own event_handler/add_ref/rem_ref callbacks
+ * installed.  @cm_handler and @context are stored for later event
+ * delivery.
+ *
+ * Returns the embedded iw_cm_id, or ERR_PTR(-ENOMEM) if the allocation
+ * fails.
+ */
+struct iw_cm_id *iw_create_cm_id(struct ib_device *device,
+				 iw_cm_handler cm_handler,
+				 void *context)
+{
+	struct iwcm_id_private *priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+
+	if (!priv)
+		return ERR_PTR(-ENOMEM);
+
+	priv->state = IW_CM_STATE_IDLE;
+
+	/* Caller-supplied identity. */
+	priv->id.device = device;
+	priv->id.cm_handler = cm_handler;
+	priv->id.context = context;
+
+	/* Callbacks provided by the IWCM itself. */
+	priv->id.event_handler = cm_event_handler;
+	priv->id.add_ref = add_ref;
+	priv->id.rem_ref = rem_ref;
+
+	spin_lock_init(&priv->lock);
+	atomic_set(&priv->refcount, 1);
+	init_waitqueue_head(&priv->connect_wait);
+	init_completion(&priv->destroy_comp);
+	INIT_LIST_HEAD(&priv->work_list);
+	INIT_LIST_HEAD(&priv->work_free_list);
+
+	return &priv->id;
+}
+EXPORT_SYMBOL(iw_create_cm_id);
+
+
+/*
+ * Ask the provider to move @qp to the ERR state.  Only the qp_state
+ * attribute is set and only IB_QP_STATE is passed in the mask.
+ *
+ * Returns -EINVAL if @qp is NULL, otherwise the result of ib_modify_qp().
+ */
+static int iwcm_modify_qp_err(struct ib_qp *qp)
+{
+	struct ib_qp_attr attr;
+
+	if (qp == NULL)
+		return -EINVAL;
+
+	attr.qp_state = IB_QPS_ERR;
+	return ib_modify_qp(qp, &attr, IB_QP_STATE);
+}
+
+/*
+ * Ask the provider to move @qp to the SQD state.  This is really the
+ * RDMAC CLOSING state; it is most similar to the IB SQD QP state.
+ *
+ * @qp must not be NULL (enforced by BUG_ON).  Returns the result of
+ * ib_modify_qp().
+ */
+static int iwcm_modify_qp_sqd(struct ib_qp *qp)
+{
+	struct ib_qp_attr attr;
+
+	BUG_ON(!qp);
+
+	attr.qp_state = IB_QPS_SQD;
+	return ib_modify_qp(qp, &attr, IB_QP_STATE);
+}
+
+/*
+ * CM_ID <-- CLOSING
+ *
+ * Block if a passive or active connection is currently being processed. Then
+ * process the event as follows:
+ * - If we are ESTABLISHED, move to CLOSING and modify the QP state
+ * based on the abrupt flag
+ * - If the connection is already in the CLOSING or IDLE state, the peer is
+ * disconnecting concurrently with us and we've already seen the
+ * DISCONNECT event -- ignore the request and return 0
+ * - Disconnect on a listening endpoint returns -EINVAL
+ */
+int iw_cm_disconnect(struct iw_cm_id *cm_id, int abrupt)
+{
+ struct iwcm_id_private *cm_id_priv;
+ unsigned long flags;
+ int ret = 0;
+ /* Stays NULL unless the state switch below selects a QP to modify */
+ struct ib_qp *qp = NULL;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ /* Wait if we're currently in a connect or accept downcall */
+ wait_event(cm_id_priv->connect_wait,
+ !test_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags));
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ switch (cm_id_priv->state) {
+ case IW_CM_STATE_ESTABLISHED:
+ cm_id_priv->state = IW_CM_STATE_CLOSING;
+
+ /* QP could be <nul> for user-mode client */
+ if (cm_id_priv->qp)
+ qp = cm_id_priv->qp;
+ else
+ ret = -EINVAL;
+ break;
+ case IW_CM_STATE_LISTEN:
+ ret = -EINVAL;
+ break;
+ case IW_CM_STATE_CLOSING:
+ /* remote peer closed first */
+ case IW_CM_STATE_IDLE:
+ /* accept or connect returned !0 */
+ break;
+ case IW_CM_STATE_CONN_RECV:
+ /*
+ * App called disconnect before/without calling accept after
+ * connect_request event delivered.
+ */
+ break;
+ case IW_CM_STATE_CONN_SENT:
+ /* Can only get here if wait above fails */
+ default:
+ BUG();
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ /* The QP transition is issued after the cm_id lock has been dropped */
+ if (qp) {
+ if (abrupt)
+ ret = iwcm_modify_qp_err(qp);
+ else
+ ret = iwcm_modify_qp_sqd(qp);
+
+ /*
+ * If both sides are disconnecting the QP could
+ * already be in ERR or SQD states
+ */
+ /* ... so the modify result is deliberately discarded here */
+ ret = 0;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(iw_cm_disconnect);
+
+/*
+ * CM_ID <-- DESTROYING
+ *
+ * Clean up all resources associated with the connection and release
+ * the initial reference taken by iw_create_cm_id.
+ */
+static void destroy_cm_id(struct iw_cm_id *cm_id)
+{
+ struct iwcm_id_private *cm_id_priv;
+ unsigned long flags;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ /*
+ * Wait if we're currently in a connect or accept downcall. A
+ * listening endpoint should never block here.
+ */
+ wait_event(cm_id_priv->connect_wait,
+ !test_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags));
+
+ /*
+ * Every valid state moves to DESTROYING. Where a provider or QP
+ * call is needed, the lock is dropped around that call and retaken
+ * afterwards so the common tail below runs with the lock held.
+ */
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ switch (cm_id_priv->state) {
+ case IW_CM_STATE_LISTEN:
+ cm_id_priv->state = IW_CM_STATE_DESTROYING;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ /* destroy the listening endpoint */
+ cm_id->device->iwcm->destroy_listen(cm_id);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ break;
+ case IW_CM_STATE_ESTABLISHED:
+ cm_id_priv->state = IW_CM_STATE_DESTROYING;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ /* Abrupt close of the connection */
+ (void)iwcm_modify_qp_err(cm_id_priv->qp);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ break;
+ case IW_CM_STATE_IDLE:
+ case IW_CM_STATE_CLOSING:
+ cm_id_priv->state = IW_CM_STATE_DESTROYING;
+ break;
+ case IW_CM_STATE_CONN_RECV:
+ /*
+ * App called destroy before/without calling accept after
+ * receiving connection request event notification or
+ * returned non zero from the event callback function.
+ * In either case, must tell the provider to reject.
+ */
+ cm_id_priv->state = IW_CM_STATE_DESTROYING;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ cm_id->device->iwcm->reject(cm_id, NULL, 0);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ break;
+ case IW_CM_STATE_CONN_SENT:
+ case IW_CM_STATE_DESTROYING:
+ default:
+ BUG();
+ break;
+ }
+ /* Give back the QP reference held by the cm_id, if any (lock held) */
+ if (cm_id_priv->qp) {
+ cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp);
+ cm_id_priv->qp = NULL;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ /*
+ * Drop one cm_id reference. The "was it the last one" result is
+ * ignored here; callers decide separately who frees the cm_id.
+ */
+ (void)iwcm_deref_id(cm_id_priv);
+}
+
+/*
+ * Destroy a cm_id from the application thread; this function cannot be
+ * called by the event thread.  It tears the id down via destroy_cm_id(),
+ * sleeps until all references on the cm_id have been released
+ * (destroy_comp is completed), and then frees the cm_id object.
+ *
+ * Calling this on an id that already has IWCM_F_CALLBACK_DESTROY set is
+ * a bug (BUG_ON).
+ */
+void iw_destroy_cm_id(struct iw_cm_id *cm_id)
+{
+	struct iwcm_id_private *priv =
+		container_of(cm_id, struct iwcm_id_private, id);
+
+	BUG_ON(test_bit(IWCM_F_CALLBACK_DESTROY, &priv->flags));
+
+	destroy_cm_id(cm_id);
+	wait_for_completion(&priv->destroy_comp);
+	free_cm_id(priv);
+}
+EXPORT_SYMBOL(iw_destroy_cm_id);
+
+/*
+ * CM_ID <-- LISTEN
+ *
+ * Start listening for connect requests. Generates one CONNECT_REQUEST
+ * event for each inbound connect request.
+ *
+ * @backlog is the number of work elements preallocated for inbound
+ * connect requests; a value that is zero or negative selects
+ * default_backlog (a negative count must never reach
+ * alloc_work_entries(), whose loop only stops at zero).
+ *
+ * Returns 0 on success, -ENOMEM if the work elements cannot be allocated,
+ * -EINVAL if the cm_id is not IDLE, or the provider's create_listen()
+ * error, in which case the cm_id is returned to the IDLE state.
+ */
+int iw_cm_listen(struct iw_cm_id *cm_id, int backlog)
+{
+	struct iwcm_id_private *cm_id_priv;
+	unsigned long flags;
+	int ret;
+
+	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+
+	if (backlog <= 0)
+		backlog = default_backlog;
+
+	ret = alloc_work_entries(cm_id_priv, backlog);
+	if (ret)
+		return ret;
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	switch (cm_id_priv->state) {
+	case IW_CM_STATE_IDLE:
+		cm_id_priv->state = IW_CM_STATE_LISTEN;
+		/* The provider call is made with the lock dropped. */
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		ret = cm_id->device->iwcm->create_listen(cm_id, backlog);
+		spin_lock_irqsave(&cm_id_priv->lock, flags);
+		/* Roll the state back under the lock, like every other writer. */
+		if (ret)
+			cm_id_priv->state = IW_CM_STATE_IDLE;
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(iw_cm_listen);
+
+/*
+ * CM_ID <-- IDLE
+ *
+ * Rejects an inbound connection request. No events are generated.
+ *
+ * The cm_id must be in the CONN_RECV state, otherwise -EINVAL is
+ * returned and the provider is not called.  IWCM_F_CONNECT_WAIT is set
+ * for the duration of the call and waiters on connect_wait are woken on
+ * every exit path.  On the normal path the provider's reject() result
+ * is returned.
+ */
+int iw_cm_reject(struct iw_cm_id *cm_id,
+		 const void *private_data,
+		 u8 private_data_len)
+{
+	struct iwcm_id_private *priv =
+		container_of(cm_id, struct iwcm_id_private, id);
+	unsigned long flags;
+	int ret = -EINVAL;
+
+	set_bit(IWCM_F_CONNECT_WAIT, &priv->flags);
+
+	spin_lock_irqsave(&priv->lock, flags);
+	if (priv->state == IW_CM_STATE_CONN_RECV) {
+		priv->state = IW_CM_STATE_IDLE;
+		spin_unlock_irqrestore(&priv->lock, flags);
+
+		ret = cm_id->device->iwcm->reject(cm_id, private_data,
+						  private_data_len);
+	} else {
+		spin_unlock_irqrestore(&priv->lock, flags);
+	}
+
+	clear_bit(IWCM_F_CONNECT_WAIT, &priv->flags);
+	wake_up_all(&priv->connect_wait);
+
+	return ret;
+}
+EXPORT_SYMBOL(iw_cm_reject);
+
+/*
+ * CM_ID <-- ESTABLISHED
+ *
+ * Accepts an inbound connection request and generates an ESTABLISHED
+ * event. Callers of iw_cm_disconnect and iw_destroy_cm_id will block
+ * until the ESTABLISHED event is received from the provider.
+ *
+ * Returns 0 on success, -EINVAL if the cm_id is not in the CONN_RECV
+ * state or iw_param->qpn does not resolve to a QP, or the provider's
+ * accept() error.  On any failure IWCM_F_CONNECT_WAIT is cleared and
+ * waiters are woken; on success the bit stays set until the ESTABLISHED
+ * event is processed.
+ */
+int iw_cm_accept(struct iw_cm_id *cm_id,
+		 struct iw_cm_conn_param *iw_param)
+{
+	struct iwcm_id_private *cm_id_priv;
+	struct ib_qp *qp;
+	unsigned long flags;
+	int ret;
+
+	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+	set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	if (cm_id_priv->state != IW_CM_STATE_CONN_RECV) {
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+		wake_up_all(&cm_id_priv->connect_wait);
+		return -EINVAL;
+	}
+	/* Get the ib_qp given the QPN */
+	qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn);
+	if (!qp) {
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+		wake_up_all(&cm_id_priv->connect_wait);
+		return -EINVAL;
+	}
+	cm_id->device->iwcm->add_ref(qp);
+	cm_id_priv->qp = qp;
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+	ret = cm_id->device->iwcm->accept(cm_id, iw_param);
+	if (ret) {
+		/*
+		 * An error on accept precludes provider events.  Check and
+		 * reset the state under the lock, and detach the QP there
+		 * too, but hand the reference back to the provider only
+		 * after the lock is released so that rem_ref() is not
+		 * invoked with a spinlock held and interrupts disabled.
+		 */
+		spin_lock_irqsave(&cm_id_priv->lock, flags);
+		BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_RECV);
+		cm_id_priv->state = IW_CM_STATE_IDLE;
+		qp = cm_id_priv->qp;
+		cm_id_priv->qp = NULL;
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+		if (qp)
+			cm_id->device->iwcm->rem_ref(qp);
+
+		clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+		wake_up_all(&cm_id_priv->connect_wait);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(iw_cm_accept);
+
+/*
+ * Active Side: CM_ID <-- CONN_SENT
+ *
+ * If successful, results in the generation of a CONNECT_REPLY
+ * event. iw_cm_disconnect and iw_cm_destroy will block until the
+ * CONNECT_REPLY event is received from the provider.
+ *
+ * Four work elements are preallocated for the events an active id can
+ * receive.  Returns 0 on success, -ENOMEM if that allocation fails,
+ * -EINVAL if the cm_id is not IDLE or iw_param->qpn does not resolve to
+ * a QP, or the provider's connect() error (the cm_id is then back in the
+ * IDLE state with no QP attached).  On any failure after the bit was set,
+ * IWCM_F_CONNECT_WAIT is cleared and waiters are woken.
+ */
+int iw_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
+{
+	struct iwcm_id_private *cm_id_priv;
+	int ret;
+	unsigned long flags;
+	struct ib_qp *qp;
+
+	cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+
+	ret = alloc_work_entries(cm_id_priv, 4);
+	if (ret)
+		return ret;
+
+	set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+
+	if (cm_id_priv->state != IW_CM_STATE_IDLE) {
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+		wake_up_all(&cm_id_priv->connect_wait);
+		return -EINVAL;
+	}
+
+	/* Get the ib_qp given the QPN */
+	qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn);
+	if (!qp) {
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+		clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+		wake_up_all(&cm_id_priv->connect_wait);
+		return -EINVAL;
+	}
+	cm_id->device->iwcm->add_ref(qp);
+	cm_id_priv->qp = qp;
+	cm_id_priv->state = IW_CM_STATE_CONN_SENT;
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+	ret = cm_id->device->iwcm->connect(cm_id, iw_param);
+	if (ret) {
+		/*
+		 * Undo the state change and detach the QP under the lock,
+		 * but hand the reference back to the provider only after
+		 * the lock is released so that rem_ref() is not invoked
+		 * with a spinlock held and interrupts disabled.
+		 */
+		spin_lock_irqsave(&cm_id_priv->lock, flags);
+		BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_SENT);
+		cm_id_priv->state = IW_CM_STATE_IDLE;
+		qp = cm_id_priv->qp;
+		cm_id_priv->qp = NULL;
+		spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+		if (qp)
+			cm_id->device->iwcm->rem_ref(qp);
+
+		clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+		wake_up_all(&cm_id_priv->connect_wait);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(iw_cm_connect);
+
+/*
+ * Passive Side: new CM_ID <-- CONN_RECV
+ *
+ * Handles an inbound connect request. The function creates a new
+ * iw_cm_id to represent the new connection and inherits the client
+ * callback function and other attributes from the listening parent.
+ *
+ * The work item contains a pointer to the listen_cm_id and the event. The
+ * listen_cm_id contains the client cm_handler, context and
+ * device. These are copied when the device is cloned. The event
+ * contains the new four tuple.
+ *
+ * An error on the child should not affect the parent, so this
+ * function does not return a value.
+ */
+static void cm_conn_req_handler(struct iwcm_id_private *listen_id_priv,
+ struct iw_cm_event *iw_event)
+{
+ unsigned long flags;
+ struct iw_cm_id *cm_id;
+ struct iwcm_id_private *cm_id_priv;
+ int ret;
+
+ /*
+ * The provider should never generate a connection request
+ * event with a bad status.
+ */
+ BUG_ON(iw_event->status);
+
+ /* The child inherits device, handler and context from the listener */
+ cm_id = iw_create_cm_id(listen_id_priv->id.device,
+ listen_id_priv->id.cm_handler,
+ listen_id_priv->id.context);
+ /* If the cm_id could not be created, ignore the request */
+ if (IS_ERR(cm_id))
+ goto out;
+
+ /* Provider handle and addresses come from the event */
+ cm_id->provider_data = iw_event->provider_data;
+ cm_id->local_addr = iw_event->local_addr;
+ cm_id->remote_addr = iw_event->remote_addr;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ cm_id_priv->state = IW_CM_STATE_CONN_RECV;
+
+ /*
+ * We could be destroying the listening id. If so, ignore this
+ * upcall.
+ */
+ spin_lock_irqsave(&listen_id_priv->lock, flags);
+ if (listen_id_priv->state != IW_CM_STATE_LISTEN) {
+ spin_unlock_irqrestore(&listen_id_priv->lock, flags);
+ iw_cm_reject(cm_id, NULL, 0);
+ iw_destroy_cm_id(cm_id);
+ goto out;
+ }
+ spin_unlock_irqrestore(&listen_id_priv->lock, flags);
+
+ /* Preallocate the work elements for the new passive-side cm_id */
+ ret = alloc_work_entries(cm_id_priv, 3);
+ if (ret) {
+ iw_cm_reject(cm_id, NULL, 0);
+ iw_destroy_cm_id(cm_id);
+ goto out;
+ }
+
+ /* Call the client CM handler */
+ ret = cm_id->cm_handler(cm_id, iw_event);
+ if (ret) {
+ /*
+ * A non-zero return from the handler asks for the new cm_id to
+ * be destroyed: reject, flag it for callback destruction, tear
+ * it down, and free it here if no reference remains.
+ */
+ iw_cm_reject(cm_id, NULL, 0);
+ set_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags);
+ destroy_cm_id(cm_id);
+ if (atomic_read(&cm_id_priv->refcount)==0)
+ free_cm_id(cm_id_priv);
+ }
+
+out:
+ /* Every path releases the private data attached to the event */
+ if (iw_event->private_data_len)
+ kfree(iw_event->private_data);
+}
+
+/*
+ * Passive Side: CM_ID <-- ESTABLISHED
+ *
+ * The provider generated an ESTABLISHED event which means that
+ * the MPA negotiation has completed successfully and we are now in MPA
+ * FPDU mode.
+ *
+ * This event can only be received in the CONN_RECV state. If the
+ * remote peer closed, the ESTABLISHED event would be received followed
+ * by the CLOSE event. If the app closes, it will block until we wake
+ * it up after processing this event.
+ *
+ * Returns the value returned by the client's cm_handler.
+ */
+static int cm_conn_est_handler(struct iwcm_id_private *cm_id_priv,
+ struct iw_cm_event *iw_event)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+
+ /*
+ * We clear the CONNECT_WAIT bit here to allow the callback
+ * function to call iw_cm_disconnect. Calling iw_destroy_cm_id
+ * from a callback handler is not allowed.
+ */
+ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+ BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_RECV);
+ cm_id_priv->state = IW_CM_STATE_ESTABLISHED;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ /* The client handler runs without the cm_id lock held */
+ ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event);
+ /* Waiters on connect_wait are woken only after the handler returns */
+ wake_up_all(&cm_id_priv->connect_wait);
+
+ return ret;
+}
+
+/*
+ * Active Side: CM_ID <-- ESTABLISHED
+ *
+ * The app has called connect and is waiting for the established event to
+ * post its requests to the server. This event will wake up anyone
+ * blocked in iw_cm_disconnect or iw_destroy_id.
+ *
+ * Returns the value returned by the client's cm_handler.
+ */
+static int cm_conn_rep_handler(struct iwcm_id_private *cm_id_priv,
+ struct iw_cm_event *iw_event)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ /*
+ * Clear the connect wait bit so a callback function calling
+ * iw_cm_disconnect will not wait and deadlock this thread
+ */
+ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+ BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_SENT);
+ if (iw_event->status == 0) {
+ /* Success: record the addresses reported by the provider */
+ cm_id_priv->id.local_addr = iw_event->local_addr;
+ cm_id_priv->id.remote_addr = iw_event->remote_addr;
+ cm_id_priv->state = IW_CM_STATE_ESTABLISHED;
+ } else {
+ /* REJECTED or RESET */
+ /* Give back the QP reference and return to IDLE */
+ cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp);
+ cm_id_priv->qp = NULL;
+ cm_id_priv->state = IW_CM_STATE_IDLE;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ /* The client handler runs without the cm_id lock held */
+ ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event);
+
+ /* Release the private data attached to the event, if any */
+ if (iw_event->private_data_len)
+ kfree(iw_event->private_data);
+
+ /* Wake up waiters on connect complete */
+ wake_up_all(&cm_id_priv->connect_wait);
+
+ return ret;
+}
+
+/*
+ * CM_ID <-- CLOSING
+ *
+ * Handle a DISCONNECT event: if the cm_id is in the ESTABLISHED state,
+ * move it to CLOSING.  In any other state the event changes nothing.
+ * @iw_event is not examined.
+ */
+static void cm_disconnect_handler(struct iwcm_id_private *cm_id_priv,
+				  struct iw_cm_event *iw_event)
+{
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&cm_id_priv->lock, irq_flags);
+
+	if (cm_id_priv->state == IW_CM_STATE_ESTABLISHED)
+		cm_id_priv->state = IW_CM_STATE_CLOSING;
+
+	spin_unlock_irqrestore(&cm_id_priv->lock, irq_flags);
+}
+
+/*
+ * CM_ID <-- IDLE
+ *
+ * If in the ESTABLISHED or CLOSING states, the QP will have been
+ * moved by the provider to the ERR state. Disassociate the CM_ID from
+ * the QP, move to IDLE, and remove the 'connected' reference.
+ *
+ * If in some other state, the cm_id was destroyed asynchronously.
+ * This is the last reference that will result in waking up
+ * the app thread blocked in iw_destroy_cm_id.
+ *
+ * Returns the client cm_handler's return value when the handler is
+ * called (ESTABLISHED/CLOSING), otherwise 0.
+ */
+static int cm_close_handler(struct iwcm_id_private *cm_id_priv,
+ struct iw_cm_event *iw_event)
+{
+ unsigned long flags;
+ int ret = 0;
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+
+ /* Give back the QP reference held by the cm_id, if any */
+ if (cm_id_priv->qp) {
+ cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp);
+ cm_id_priv->qp = NULL;
+ }
+ switch (cm_id_priv->state) {
+ case IW_CM_STATE_ESTABLISHED:
+ case IW_CM_STATE_CLOSING:
+ cm_id_priv->state = IW_CM_STATE_IDLE;
+ /* The client handler is called with the lock dropped */
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ break;
+ case IW_CM_STATE_DESTROYING:
+ /* Already being destroyed: the client is not notified */
+ break;
+ default:
+ BUG();
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ return ret;
+}
+
+/*
+ * Dispatch one dequeued event to the handler for its event type.
+ *
+ * Returns the handler's result for CONNECT_REPLY, ESTABLISHED and CLOSE
+ * events, and 0 for CONNECT_REQUEST and DISCONNECT, whose handlers do
+ * not return a value.  An unknown event type is a bug (BUG()).
+ */
+static int process_event(struct iwcm_id_private *cm_id_priv,
+			 struct iw_cm_event *iw_event)
+{
+	switch (iw_event->event) {
+	case IW_CM_EVENT_CONNECT_REQUEST:
+		cm_conn_req_handler(cm_id_priv, iw_event);
+		return 0;
+	case IW_CM_EVENT_CONNECT_REPLY:
+		return cm_conn_rep_handler(cm_id_priv, iw_event);
+	case IW_CM_EVENT_ESTABLISHED:
+		return cm_conn_est_handler(cm_id_priv, iw_event);
+	case IW_CM_EVENT_DISCONNECT:
+		cm_disconnect_handler(cm_id_priv, iw_event);
+		return 0;
+	case IW_CM_EVENT_CLOSE:
+		return cm_close_handler(cm_id_priv, iw_event);
+	default:
+		BUG();
+	}
+
+	return 0;
+}
+
+/*
+ * Process events on the work_list for the cm_id. If the callback
+ * function requests that the cm_id be deleted, a flag is set in the
+ * cm_id flags to indicate that when the last reference is
+ * removed, the cm_id is to be destroyed. This is necessary to
+ * distinguish between an object that will be destroyed by the app
+ * thread asleep on the destroy_comp list vs. an object destroyed
+ * here synchronously when the last reference is removed.
+ */
+static void cm_work_handler(struct work_struct *_work)
+{
+ struct iwcm_work *work = container_of(_work, struct iwcm_work, work);
+ struct iw_cm_event levent;
+ struct iwcm_id_private *cm_id_priv = work->cm_id;
+ unsigned long flags;
+ int empty;
+ int ret = 0;
+ int destroy_id;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ empty = list_empty(&cm_id_priv->work_list);
+ while (!empty) {
+ /* Dequeue the oldest pending event for this cm_id */
+ work = list_entry(cm_id_priv->work_list.next,
+ struct iwcm_work, list);
+ list_del_init(&work->list);
+ empty = list_empty(&cm_id_priv->work_list);
+ /*
+ * Copy the event to the stack so the work element can go back
+ * on the free list before the event is processed.
+ */
+ levent = work->event;
+ put_work(work);
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ /* Handlers run without the cm_id lock held */
+ ret = process_event(cm_id_priv, &levent);
+ if (ret) {
+ /* Non-zero result: destroy the cm_id from this context */
+ set_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags);
+ destroy_cm_id(&cm_id_priv->id);
+ }
+ BUG_ON(atomic_read(&cm_id_priv->refcount)==0);
+ /* Sample the flag before dropping the reference below */
+ destroy_id = test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags);
+ /* Drop the reference taken when the event was queued */
+ if (iwcm_deref_id(cm_id_priv)) {
+ if (destroy_id) {
+ BUG_ON(!list_empty(&cm_id_priv->work_list));
+ free_cm_id(cm_id_priv);
+ }
+ /* Last reference is gone: cm_id_priv must not be touched */
+ return;
+ }
+ /* 'empty' was sampled under the lock before it was dropped */
+ if (empty)
+ return;
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+}
+
+/*
+ * This function is called on interrupt context. Schedule events on
+ * the iwcm_wq thread to allow callback functions to downcall into
+ * the CM and/or block. Events are queued to a per-CM_ID
+ * work_list. If this is the first event on the work_list, the work
+ * element is also queued on the iwcm_wq thread.
+ *
+ * Each event holds a reference on the cm_id. Until the last posted
+ * event has been delivered and processed, the cm_id cannot be
+ * deleted.
+ *
+ * For CONNECT_REQUEST and CONNECT_REPLY events that carry private data,
+ * the data is duplicated so the queued event owns its own copy.
+ *
+ * Returns:
+ * 0 - the event was handled.
+ * -ENOMEM - the event was not handled due to lack of resources.
+ */
+static int cm_event_handler(struct iw_cm_id *cm_id,
+			    struct iw_cm_event *iw_event)
+{
+	struct iwcm_id_private *cm_id_priv =
+		container_of(cm_id, struct iwcm_id_private, id);
+	struct iwcm_work *work;
+	unsigned long flags;
+	int needs_copy;
+	int was_idle;
+	int ret = 0;
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+
+	work = get_work(cm_id_priv);
+	if (!work) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	INIT_WORK(&work->work, cm_work_handler);
+	work->cm_id = cm_id_priv;
+	work->event = *iw_event;
+
+	needs_copy = (work->event.event == IW_CM_EVENT_CONNECT_REQUEST ||
+		      work->event.event == IW_CM_EVENT_CONNECT_REPLY) &&
+		     work->event.private_data_len;
+	if (needs_copy) {
+		ret = copy_private_data(&work->event);
+		if (ret) {
+			/* Recycle the element; the event is dropped. */
+			put_work(work);
+			goto out;
+		}
+	}
+
+	atomic_inc(&cm_id_priv->refcount);
+
+	/* Only the first pending event kicks the workqueue. */
+	was_idle = list_empty(&cm_id_priv->work_list);
+	list_add_tail(&work->list, &cm_id_priv->work_list);
+	if (was_idle)
+		queue_work(iwcm_wq, &work->work);
+out:
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+	return ret;
+}
+
+/*
+ * Fill in the QP attributes for the INIT/RTR transition.
+ *
+ * In the IDLE, CONN_SENT, CONN_RECV and ESTABLISHED states the mask is
+ * set to IB_QP_STATE | IB_QP_ACCESS_FLAGS, remote read and write access
+ * is requested, and 0 is returned.  In any other state nothing is
+ * written and -EINVAL is returned.  The state is examined under the
+ * cm_id lock.
+ */
+static int iwcm_init_qp_init_attr(struct iwcm_id_private *cm_id_priv,
+				  struct ib_qp_attr *qp_attr,
+				  int *qp_attr_mask)
+{
+	unsigned long flags;
+	int ret = -EINVAL;
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	switch (cm_id_priv->state) {
+	case IW_CM_STATE_IDLE:
+	case IW_CM_STATE_CONN_SENT:
+	case IW_CM_STATE_CONN_RECV:
+	case IW_CM_STATE_ESTABLISHED:
+		*qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS;
+		qp_attr->qp_access_flags = IB_ACCESS_REMOTE_WRITE|
+					   IB_ACCESS_REMOTE_READ;
+		ret = 0;
+		break;
+	default:
+		break;
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+	return ret;
+}
+
+/*
+ * Fill in the QP attributes for the RTS transition.
+ *
+ * In the IDLE, CONN_SENT, CONN_RECV and ESTABLISHED states the mask is
+ * cleared to 0 and 0 is returned.  In any other state nothing is written
+ * and -EINVAL is returned.  @qp_attr itself is not modified.  The state
+ * is examined under the cm_id lock.
+ */
+static int iwcm_init_qp_rts_attr(struct iwcm_id_private *cm_id_priv,
+				 struct ib_qp_attr *qp_attr,
+				 int *qp_attr_mask)
+{
+	unsigned long flags;
+	int ret = -EINVAL;
+
+	spin_lock_irqsave(&cm_id_priv->lock, flags);
+	switch (cm_id_priv->state) {
+	case IW_CM_STATE_IDLE:
+	case IW_CM_STATE_CONN_SENT:
+	case IW_CM_STATE_CONN_RECV:
+	case IW_CM_STATE_ESTABLISHED:
+		*qp_attr_mask = 0;
+		ret = 0;
+		break;
+	default:
+		break;
+	}
+	spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+	return ret;
+}
+
+/*
+ * Initialise QP attributes for the transition named by
+ * qp_attr->qp_state.
+ *
+ * INIT and RTR are handled by iwcm_init_qp_init_attr(), RTS by
+ * iwcm_init_qp_rts_attr(); the helper's result is returned.  Any other
+ * requested state yields -EINVAL.
+ */
+int iw_cm_init_qp_attr(struct iw_cm_id *cm_id,
+		       struct ib_qp_attr *qp_attr,
+		       int *qp_attr_mask)
+{
+	struct iwcm_id_private *priv =
+		container_of(cm_id, struct iwcm_id_private, id);
+
+	switch (qp_attr->qp_state) {
+	case IB_QPS_INIT:
+	case IB_QPS_RTR:
+		return iwcm_init_qp_init_attr(priv, qp_attr, qp_attr_mask);
+	case IB_QPS_RTS:
+		return iwcm_init_qp_rts_attr(priv, qp_attr, qp_attr_mask);
+	default:
+		return -EINVAL;
+	}
+}
+EXPORT_SYMBOL(iw_cm_init_qp_attr);
+
+/*
+ * Module init: create the single-threaded "iw_cm_wq" workqueue and
+ * register the "net/iw_cm" sysctl table.
+ *
+ * Returns 0 on success or -ENOMEM if either step fails; the workqueue is
+ * destroyed again when the sysctl registration fails.
+ */
+static int __init iw_cm_init(void)
+{
+	iwcm_wq = create_singlethread_workqueue("iw_cm_wq");
+	if (!iwcm_wq)
+		return -ENOMEM;
+
+	iwcm_ctl_table_hdr = register_net_sysctl(&init_net, "net/iw_cm",
+						 iwcm_ctl_table);
+	if (iwcm_ctl_table_hdr)
+		return 0;
+
+	pr_err("iw_cm: couldn't register sysctl paths\n");
+	destroy_workqueue(iwcm_wq);
+	return -ENOMEM;
+}
+
+/*
+ * Module exit: undo iw_cm_init() by unregistering the sysctl table and
+ * destroying the workqueue.
+ */
+static void __exit iw_cm_cleanup(void)
+{
+ unregister_net_sysctl_table(iwcm_ctl_table_hdr);
+ destroy_workqueue(iwcm_wq);
+}
+
+module_init(iw_cm_init);
+module_exit(iw_cm_cleanup);
diff --git a/drivers/infiniband/core/iwcm.h b/drivers/infiniband/core/iwcm.h
new file mode 100644
index 000000000..3f6cc8256
--- /dev/null
+++ b/drivers/infiniband/core/iwcm.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2005 Network Appliance, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef IWCM_H
+#define IWCM_H
+
+/* Connection states of an iWARP CM identifier (see iwcm_id_private.state) */
+enum iw_cm_state {
+ IW_CM_STATE_IDLE, /* unbound, inactive */
+ IW_CM_STATE_LISTEN, /* listen waiting for connect */
+ IW_CM_STATE_CONN_RECV, /* inbound waiting for user accept */
+ IW_CM_STATE_CONN_SENT, /* outbound waiting for peer accept */
+ IW_CM_STATE_ESTABLISHED, /* established */
+ IW_CM_STATE_CLOSING, /* disconnect */
+ IW_CM_STATE_DESTROYING /* object being deleted */
+};
+
+/*
+ * Private per-identifier state of the iWARP connection manager.  The public
+ * struct iw_cm_id is embedded as the 'id' member, so the core code gets back
+ * to this structure with container_of().
+ */
+struct iwcm_id_private {
+ /* public handle embedded in the private object */
+ struct iw_cm_id id;
+ /* current connection state; iwcm.c reads it with 'lock' held */
+ enum iw_cm_state state;
+ /* presumably holds the IWCM_F_* flags defined below - TODO confirm */
+ unsigned long flags;
+ struct ib_qp *qp;
+ struct completion destroy_comp;
+ wait_queue_head_t connect_wait;
+ struct list_head work_list;
+ /* spinlock taken (irqsave) around accesses to 'state' */
+ spinlock_t lock;
+ atomic_t refcount;
+ struct list_head work_free_list;
+};
+
+/*
+ * NOTE(review): presumably bit numbers used with iwcm_id_private.flags -
+ * confirm against the users in iwcm.c.
+ */
+#define IWCM_F_CALLBACK_DESTROY 1
+#define IWCM_F_CONNECT_WAIT 2
+
+#endif /* IWCM_H */
diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c
new file mode 100644
index 000000000..e6ffa2e66
--- /dev/null
+++ b/drivers/infiniband/core/iwpm_msg.c
@@ -0,0 +1,756 @@
+/*
+ * Copyright (c) 2014 Intel Corporation. All rights reserved.
+ * Copyright (c) 2014 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "iwpm_util.h"
+
+/*
+ * Library name and version sent in the register pid request and compared
+ * against the values reported by the user space port mapper.
+ */
+static const char iwpm_ulib_name[IWPM_ULIBNAME_SIZE] = "iWarpPortMapperUser";
+static int iwpm_ulib_version = 3;
+/* netlink pid of the user space port mapper; iwpm_valid_pid() requires > 0 */
+static int iwpm_user_pid = IWPM_PID_UNDEFINED;
+/*
+ * nlmsg_seq of the last message accepted from the port mapper; it is echoed
+ * back in the *_SEQ attribute of the next outgoing message.
+ */
+static atomic_t echo_nlmsg_seq;
+
+/*
+ * iwpm_valid_pid - Report whether a usable port mapper pid is recorded
+ *
+ * Returns 1 when iwpm_user_pid is positive and 0 otherwise.
+ */
+int iwpm_valid_pid(void)
+{
+	if (iwpm_user_pid > 0)
+		return 1;
+
+	return 0;
+}
+EXPORT_SYMBOL(iwpm_valid_pid);
+
+/*
+ * iwpm_register_pid - Send a netlink query to user space
+ *                     for the iwarp port mapper pid
+ * @pm_msg: device data (dev_name, if_name) sent in the request; it is also
+ *          attached to the pending request so that the response handler
+ *          can compare the reported device name against it
+ * @nl_client: index of the netlink client issuing the request
+ *
+ * nlmsg attributes:
+ *	[IWPM_NLA_REG_PID_SEQ]
+ *	[IWPM_NLA_REG_IF_NAME]
+ *	[IWPM_NLA_REG_IBDEV_NAME]
+ *	[IWPM_NLA_REG_ULIB_NAME]
+ *
+ * Returns 0 if the client is already registered, otherwise the result of
+ * waiting for the response, or a negative errno if the request could not be
+ * built (-ENOMEM) or sent.
+ */
+int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client)
+{
+	struct sk_buff *skb = NULL;
+	struct iwpm_nlmsg_request *nlmsg_request = NULL;
+	struct nlmsghdr *nlh;
+	u32 msg_seq;
+	const char *err_str = "";
+	int ret = -EINVAL;
+
+	if (!iwpm_valid_client(nl_client)) {
+		err_str = "Invalid port mapper client";
+		goto pid_query_error;
+	}
+	if (iwpm_registered_client(nl_client))
+		return 0;
+
+	/*
+	 * Failing to create the message or the request is reported as
+	 * -ENOMEM, as iwpm_add_and_query_mapping() and
+	 * iwpm_remove_mapping() do.
+	 */
+	ret = -ENOMEM;
+	skb = iwpm_create_nlmsg(RDMA_NL_IWPM_REG_PID, &nlh, nl_client);
+	if (!skb) {
+		err_str = "Unable to create a nlmsg";
+		goto pid_query_error;
+	}
+	nlh->nlmsg_seq = iwpm_get_nlmsg_seq();
+	nlmsg_request = iwpm_get_nlmsg_request(nlh->nlmsg_seq, nl_client, GFP_KERNEL);
+	if (!nlmsg_request) {
+		err_str = "Unable to allocate netlink request";
+		goto pid_query_error;
+	}
+	msg_seq = atomic_read(&echo_nlmsg_seq);
+
+	/* fill in the pid request message */
+	err_str = "Unable to put attribute of the nlmsg";
+	ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, IWPM_NLA_REG_PID_SEQ);
+	if (ret)
+		goto pid_query_error;
+	ret = ibnl_put_attr(skb, nlh, IWPM_IFNAME_SIZE,
+			    pm_msg->if_name, IWPM_NLA_REG_IF_NAME);
+	if (ret)
+		goto pid_query_error;
+	ret = ibnl_put_attr(skb, nlh, IWPM_DEVNAME_SIZE,
+			    pm_msg->dev_name, IWPM_NLA_REG_IBDEV_NAME);
+	if (ret)
+		goto pid_query_error;
+	ret = ibnl_put_attr(skb, nlh, IWPM_ULIBNAME_SIZE,
+			    (char *)iwpm_ulib_name, IWPM_NLA_REG_ULIB_NAME);
+	if (ret)
+		goto pid_query_error;
+
+	/*
+	 * Attach the request buffer before the message leaves: the response
+	 * handler dereferences req_buffer as soon as it finds the request,
+	 * which may happen before the send call returns.
+	 */
+	nlmsg_request->req_buffer = pm_msg;
+
+	pr_debug("%s: Multicasting a nlmsg (dev = %s ifname = %s iwpm = %s)\n",
+		 __func__, pm_msg->dev_name, pm_msg->if_name, iwpm_ulib_name);
+
+	ret = ibnl_multicast(skb, nlh, RDMA_NL_GROUP_IWPM, GFP_KERNEL);
+	if (ret) {
+		skb = NULL; /* skb is freed in the netlink send-op handling */
+		iwpm_set_registered(nl_client, 1);
+		iwpm_user_pid = IWPM_PID_UNAVAILABLE;
+		err_str = "Unable to send a nlmsg";
+		goto pid_query_error;
+	}
+	ret = iwpm_wait_complete_req(nlmsg_request);
+	return ret;
+pid_query_error:
+	pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client);
+	if (skb)
+		dev_kfree_skb(skb);
+	if (nlmsg_request)
+		iwpm_free_nlmsg_request(&nlmsg_request->kref);
+	return ret;
+}
+EXPORT_SYMBOL(iwpm_register_pid);
+
+/*
+ * iwpm_add_mapping - Send a netlink add mapping message
+ *                    to the port mapper
+ * @pm_msg: request data; loc_addr is sent, and the structure is attached to
+ *          the pending request so the response handler can fill in
+ *          mapped_loc_addr
+ * @nl_client: index of the netlink client issuing the request
+ *
+ * nlmsg attributes:
+ *	[IWPM_NLA_MANAGE_MAPPING_SEQ]
+ *	[IWPM_NLA_MANAGE_ADDR]
+ *
+ * Returns 0 without sending anything when no valid port mapper pid is
+ * known, otherwise the result of waiting for the response, or a negative
+ * errno if the request could not be built (-ENOMEM) or sent.
+ */
+int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
+{
+	struct sk_buff *skb = NULL;
+	struct iwpm_nlmsg_request *nlmsg_request = NULL;
+	struct nlmsghdr *nlh;
+	u32 msg_seq;
+	const char *err_str = "";
+	int ret = -EINVAL;
+
+	if (!iwpm_valid_client(nl_client)) {
+		err_str = "Invalid port mapper client";
+		goto add_mapping_error;
+	}
+	if (!iwpm_registered_client(nl_client)) {
+		err_str = "Unregistered port mapper client";
+		goto add_mapping_error;
+	}
+	if (!iwpm_valid_pid())
+		return 0;
+
+	/*
+	 * Failing to create the message or the request is reported as
+	 * -ENOMEM, as iwpm_add_and_query_mapping() and
+	 * iwpm_remove_mapping() do.
+	 */
+	ret = -ENOMEM;
+	skb = iwpm_create_nlmsg(RDMA_NL_IWPM_ADD_MAPPING, &nlh, nl_client);
+	if (!skb) {
+		err_str = "Unable to create a nlmsg";
+		goto add_mapping_error;
+	}
+	nlh->nlmsg_seq = iwpm_get_nlmsg_seq();
+	nlmsg_request = iwpm_get_nlmsg_request(nlh->nlmsg_seq, nl_client, GFP_KERNEL);
+	if (!nlmsg_request) {
+		err_str = "Unable to allocate netlink request";
+		goto add_mapping_error;
+	}
+	msg_seq = atomic_read(&echo_nlmsg_seq);
+	/* fill in the add mapping message */
+	err_str = "Unable to put attribute of the nlmsg";
+	ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq,
+			    IWPM_NLA_MANAGE_MAPPING_SEQ);
+	if (ret)
+		goto add_mapping_error;
+	ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage),
+			    &pm_msg->loc_addr, IWPM_NLA_MANAGE_ADDR);
+	if (ret)
+		goto add_mapping_error;
+	/* attach the buffer before sending; the response handler reads it */
+	nlmsg_request->req_buffer = pm_msg;
+
+	ret = ibnl_unicast(skb, nlh, iwpm_user_pid);
+	if (ret) {
+		skb = NULL; /* skb is freed in the netlink send-op handling */
+		iwpm_user_pid = IWPM_PID_UNDEFINED;
+		err_str = "Unable to send a nlmsg";
+		goto add_mapping_error;
+	}
+	ret = iwpm_wait_complete_req(nlmsg_request);
+	return ret;
+add_mapping_error:
+	pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client);
+	if (skb)
+		dev_kfree_skb(skb);
+	if (nlmsg_request)
+		iwpm_free_nlmsg_request(&nlmsg_request->kref);
+	return ret;
+}
+EXPORT_SYMBOL(iwpm_add_mapping);
+
+/*
+ * iwpm_add_and_query_mapping - Send a netlink add and query
+ * mapping message to the port mapper
+ * @pm_msg: request data; loc_addr and rem_addr are sent, and the structure
+ * is attached to the pending request for the response handler
+ * @nl_client: index of the netlink client issuing the request
+ *
+ * nlmsg attributes:
+ * [IWPM_NLA_QUERY_MAPPING_SEQ]
+ * [IWPM_NLA_QUERY_LOCAL_ADDR]
+ * [IWPM_NLA_QUERY_REMOTE_ADDR]
+ *
+ * Returns 0 without sending anything when no valid port mapper pid is
+ * known, otherwise the result of iwpm_wait_complete_req(), or a negative
+ * errno if the request could not be built or sent.
+ */
+int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
+{
+ struct sk_buff *skb = NULL;
+ struct iwpm_nlmsg_request *nlmsg_request = NULL;
+ struct nlmsghdr *nlh;
+ u32 msg_seq;
+ const char *err_str = "";
+ int ret = -EINVAL;
+
+ if (!iwpm_valid_client(nl_client)) {
+ err_str = "Invalid port mapper client";
+ goto query_mapping_error;
+ }
+ if (!iwpm_registered_client(nl_client)) {
+ err_str = "Unregistered port mapper client";
+ goto query_mapping_error;
+ }
+ if (!iwpm_valid_pid())
+ return 0;
+ /* message and request creation failures below are reported as -ENOMEM */
+ ret = -ENOMEM;
+ skb = iwpm_create_nlmsg(RDMA_NL_IWPM_QUERY_MAPPING, &nlh, nl_client);
+ if (!skb) {
+ err_str = "Unable to create a nlmsg";
+ goto query_mapping_error;
+ }
+ nlh->nlmsg_seq = iwpm_get_nlmsg_seq();
+ nlmsg_request = iwpm_get_nlmsg_request(nlh->nlmsg_seq,
+ nl_client, GFP_KERNEL);
+ if (!nlmsg_request) {
+ err_str = "Unable to allocate netlink request";
+ goto query_mapping_error;
+ }
+ msg_seq = atomic_read(&echo_nlmsg_seq);
+
+ /* fill in the query message */
+ err_str = "Unable to put attribute of the nlmsg";
+ ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq,
+ IWPM_NLA_QUERY_MAPPING_SEQ);
+ if (ret)
+ goto query_mapping_error;
+ ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage),
+ &pm_msg->loc_addr, IWPM_NLA_QUERY_LOCAL_ADDR);
+ if (ret)
+ goto query_mapping_error;
+ ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage),
+ &pm_msg->rem_addr, IWPM_NLA_QUERY_REMOTE_ADDR);
+ if (ret)
+ goto query_mapping_error;
+ /* attach the buffer before sending; the response handler reads it */
+ nlmsg_request->req_buffer = pm_msg;
+
+ /*
+ * NOTE(review): unlike iwpm_add_mapping() and iwpm_remove_mapping(), a
+ * send failure here does not reset iwpm_user_pid - confirm this is
+ * intentional.
+ */
+ ret = ibnl_unicast(skb, nlh, iwpm_user_pid);
+ if (ret) {
+ skb = NULL; /* skb is freed in the netlink send-op handling */
+ err_str = "Unable to send a nlmsg";
+ goto query_mapping_error;
+ }
+ ret = iwpm_wait_complete_req(nlmsg_request);
+ return ret;
+query_mapping_error:
+ pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client);
+ if (skb)
+ dev_kfree_skb(skb);
+ if (nlmsg_request)
+ iwpm_free_nlmsg_request(&nlmsg_request->kref);
+ return ret;
+}
+EXPORT_SYMBOL(iwpm_add_and_query_mapping);
+
+/*
+ * iwpm_remove_mapping - Send a netlink remove mapping message
+ *                       to the port mapper
+ * @local_addr: local address whose mapping is to be removed
+ * @nl_client: index of the netlink client issuing the request
+ *
+ * nlmsg attributes:
+ *	[IWPM_NLA_MANAGE_MAPPING_SEQ]
+ *	[IWPM_NLA_MANAGE_ADDR]
+ *
+ * No response is awaited.  Returns 0 once the message has been sent, or
+ * when no valid port mapper pid is known; a negative errno otherwise.
+ */
+int iwpm_remove_mapping(struct sockaddr_storage *local_addr, u8 nl_client)
+{
+	struct sk_buff *msg = NULL;
+	struct nlmsghdr *hdr;
+	const char *reason = "";
+	u32 echo_seq;
+	int rc = -EINVAL;
+
+	if (!iwpm_valid_client(nl_client)) {
+		reason = "Invalid port mapper client";
+		goto remove_mapping_fail;
+	}
+	if (!iwpm_registered_client(nl_client)) {
+		reason = "Unregistered port mapper client";
+		goto remove_mapping_fail;
+	}
+	if (!iwpm_valid_pid())
+		return 0;
+
+	msg = iwpm_create_nlmsg(RDMA_NL_IWPM_REMOVE_MAPPING, &hdr, nl_client);
+	if (msg == NULL) {
+		rc = -ENOMEM;
+		reason = "Unable to create a nlmsg";
+		goto remove_mapping_fail;
+	}
+	echo_seq = atomic_read(&echo_nlmsg_seq);
+	hdr->nlmsg_seq = iwpm_get_nlmsg_seq();
+
+	/* any failure from here to the send is an attribute failure */
+	reason = "Unable to put attribute of the nlmsg";
+	rc = ibnl_put_attr(msg, hdr, sizeof(echo_seq), &echo_seq,
+			   IWPM_NLA_MANAGE_MAPPING_SEQ);
+	if (rc != 0)
+		goto remove_mapping_fail;
+	rc = ibnl_put_attr(msg, hdr, sizeof(*local_addr),
+			   local_addr, IWPM_NLA_MANAGE_ADDR);
+	if (rc != 0)
+		goto remove_mapping_fail;
+
+	rc = ibnl_unicast(msg, hdr, iwpm_user_pid);
+	if (rc != 0) {
+		/* skb is freed in the netlink send-op handling */
+		msg = NULL;
+		iwpm_user_pid = IWPM_PID_UNDEFINED;
+		reason = "Unable to send a nlmsg";
+		goto remove_mapping_fail;
+	}
+	iwpm_print_sockaddr(local_addr, "remove_mapping: Local sockaddr:");
+	return 0;
+
+remove_mapping_fail:
+	pr_info("%s: %s (client = %d)\n", __func__, reason, nl_client);
+	if (msg != NULL)
+		dev_kfree_skb_any(msg);
+	return rc;
+}
+EXPORT_SYMBOL(iwpm_remove_mapping);
+
+/*
+ * netlink attribute policy for the received response to register pid request.
+ * The two name attributes are NLA_STRING, limited to one byte less than the
+ * corresponding name buffer size.
+ */
+static const struct nla_policy resp_reg_policy[IWPM_NLA_RREG_PID_MAX] = {
+ [IWPM_NLA_RREG_PID_SEQ] = { .type = NLA_U32 },
+ [IWPM_NLA_RREG_IBDEV_NAME] = { .type = NLA_STRING,
+ .len = IWPM_DEVNAME_SIZE - 1 },
+ [IWPM_NLA_RREG_ULIB_NAME] = { .type = NLA_STRING,
+ .len = IWPM_ULIBNAME_SIZE - 1 },
+ [IWPM_NLA_RREG_ULIB_VER] = { .type = NLA_U16 },
+ [IWPM_NLA_RREG_PID_ERR] = { .type = NLA_U16 }
+};
+
+/*
+ * iwpm_register_pid_cb - Process a port mapper response to
+ *                        iwpm_register_pid()
+ * @skb: received message buffer (not used directly)
+ * @cb: netlink callback data; cb->nlh is the received message header
+ *
+ * Looks up the pending request by the echoed sequence number, checks the
+ * reported device name, library name and version, and on a match records
+ * the sender's pid as the port mapper pid and marks the client registered.
+ * The waiter is woken in both cases; a mismatch is reported to it through
+ * err_code.
+ *
+ * Returns 0 when a matching request was found, -EINVAL if the message
+ * cannot be parsed or no request matches.
+ */
+int iwpm_register_pid_cb(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct iwpm_nlmsg_request *nlmsg_request = NULL;
+	struct nlattr *nltb[IWPM_NLA_RREG_PID_MAX];
+	struct iwpm_dev_data *pm_msg;
+	char dev_name[IWPM_DEVNAME_SIZE];
+	char iwpm_name[IWPM_ULIBNAME_SIZE];
+	u32 msg_seq;
+	u8 nl_client;
+	u16 iwpm_version;
+	const char *msg_type = "Register Pid response";
+
+	if (iwpm_parse_nlmsg(cb, IWPM_NLA_RREG_PID_MAX,
+			     resp_reg_policy, nltb, msg_type))
+		return -EINVAL;
+
+	msg_seq = nla_get_u32(nltb[IWPM_NLA_RREG_PID_SEQ]);
+	nlmsg_request = iwpm_find_nlmsg_request(msg_seq);
+	if (!nlmsg_request) {
+		pr_info("%s: Could not find a matching request (seq = %u)\n",
+			__func__, msg_seq);
+		return -EINVAL;
+	}
+	pm_msg = nlmsg_request->req_buffer;
+	nl_client = nlmsg_request->nl_client;
+	/*
+	 * NLA_STRING payloads are not guaranteed to be NUL-terminated, so
+	 * copy them into bounded, terminated buffers before handing them to
+	 * strcmp() and "%s".  The policy caps each payload at one byte less
+	 * than its buffer, so nothing is truncated.
+	 */
+	nla_strlcpy(dev_name, nltb[IWPM_NLA_RREG_IBDEV_NAME],
+		    sizeof(dev_name));
+	nla_strlcpy(iwpm_name, nltb[IWPM_NLA_RREG_ULIB_NAME],
+		    sizeof(iwpm_name));
+	iwpm_version = nla_get_u16(nltb[IWPM_NLA_RREG_ULIB_VER]);
+
+	/* check device name, ulib name and version */
+	if (strcmp(pm_msg->dev_name, dev_name) ||
+	    strcmp(iwpm_ulib_name, iwpm_name) ||
+	    iwpm_version != iwpm_ulib_version) {
+
+		pr_info("%s: Incorrect info (dev = %s name = %s version = %d)\n",
+			__func__, dev_name, iwpm_name, iwpm_version);
+		nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR;
+		goto register_pid_response_exit;
+	}
+	iwpm_user_pid = cb->nlh->nlmsg_pid;
+	atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
+	pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n",
+		 __func__, iwpm_user_pid);
+	if (iwpm_valid_client(nl_client))
+		iwpm_set_registered(nl_client, 1);
+register_pid_response_exit:
+	nlmsg_request->request_done = 1;
+	/* always for found nlmsg_request */
+	kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request);
+	barrier();
+	wake_up(&nlmsg_request->waitq);
+	return 0;
+}
+EXPORT_SYMBOL(iwpm_register_pid_cb);
+
+/*
+ * netlink attribute policy for the received response to add mapping request.
+ * The address attributes carry a struct sockaddr_storage.
+ */
+static const struct nla_policy resp_add_policy[IWPM_NLA_RMANAGE_MAPPING_MAX] = {
+ [IWPM_NLA_MANAGE_MAPPING_SEQ] = { .type = NLA_U32 },
+ [IWPM_NLA_MANAGE_ADDR] = { .len = sizeof(struct sockaddr_storage) },
+ [IWPM_NLA_MANAGE_MAPPED_LOC_ADDR] = { .len = sizeof(struct sockaddr_storage) },
+ [IWPM_NLA_RMANAGE_MAPPING_ERR] = { .type = NLA_U16 }
+};
+
+/*
+ * iwpm_add_mapping_cb - Process a port mapper response to
+ * iwpm_add_mapping()
+ * @skb: received message buffer (not used directly)
+ * @cb: netlink callback data; cb->nlh is the received message header
+ *
+ * Looks up the pending request by the echoed sequence number and, if the
+ * reported local address matches the requested one and the mapped address
+ * has the same family, copies the mapped address into the request buffer.
+ * A mismatch is reported to the waiter through err_code.
+ *
+ * Returns 0 when a matching request was found, -EINVAL if the message
+ * cannot be parsed or no request matches.
+ */
+int iwpm_add_mapping_cb(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct iwpm_sa_data *pm_msg;
+ struct iwpm_nlmsg_request *nlmsg_request = NULL;
+ struct nlattr *nltb[IWPM_NLA_RMANAGE_MAPPING_MAX];
+ struct sockaddr_storage *local_sockaddr;
+ struct sockaddr_storage *mapped_sockaddr;
+ const char *msg_type;
+ u32 msg_seq;
+
+ msg_type = "Add Mapping response";
+ if (iwpm_parse_nlmsg(cb, IWPM_NLA_RMANAGE_MAPPING_MAX,
+ resp_add_policy, nltb, msg_type))
+ return -EINVAL;
+
+ /* remember the sender's sequence number for the next outgoing message */
+ atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
+
+ msg_seq = nla_get_u32(nltb[IWPM_NLA_MANAGE_MAPPING_SEQ]);
+ nlmsg_request = iwpm_find_nlmsg_request(msg_seq);
+ if (!nlmsg_request) {
+ pr_info("%s: Could not find a matching request (seq = %u)\n",
+ __func__, msg_seq);
+ return -EINVAL;
+ }
+ pm_msg = nlmsg_request->req_buffer;
+ local_sockaddr = (struct sockaddr_storage *)
+ nla_data(nltb[IWPM_NLA_MANAGE_ADDR]);
+ mapped_sockaddr = (struct sockaddr_storage *)
+ nla_data(nltb[IWPM_NLA_MANAGE_MAPPED_LOC_ADDR]);
+
+ /* a non-zero compare result is treated as an address mismatch */
+ if (iwpm_compare_sockaddr(local_sockaddr, &pm_msg->loc_addr)) {
+ nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR;
+ goto add_mapping_response_exit;
+ }
+ if (mapped_sockaddr->ss_family != local_sockaddr->ss_family) {
+ pr_info("%s: Sockaddr family doesn't match the requested one\n",
+ __func__);
+ nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR;
+ goto add_mapping_response_exit;
+ }
+ memcpy(&pm_msg->mapped_loc_addr, mapped_sockaddr,
+ sizeof(*mapped_sockaddr));
+ iwpm_print_sockaddr(&pm_msg->loc_addr,
+ "add_mapping: Local sockaddr:");
+ iwpm_print_sockaddr(&pm_msg->mapped_loc_addr,
+ "add_mapping: Mapped local sockaddr:");
+
+add_mapping_response_exit:
+ nlmsg_request->request_done = 1;
+ /* always for found request */
+ /*
+ * NOTE(review): waitq is accessed after this handler's reference has
+ * been dropped; presumably another reference keeps the request alive
+ * here - confirm against iwpm_util.c.
+ */
+ kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request);
+ barrier();
+ wake_up(&nlmsg_request->waitq);
+ return 0;
+}
+EXPORT_SYMBOL(iwpm_add_mapping_cb);
+
+/*
+ * netlink attribute policy for the response to add and query mapping request
+ * and response with remote address info (used by both
+ * iwpm_add_and_query_mapping_cb() and iwpm_remote_info_cb())
+ */
+static const struct nla_policy resp_query_policy[IWPM_NLA_RQUERY_MAPPING_MAX] = {
+ [IWPM_NLA_QUERY_MAPPING_SEQ] = { .type = NLA_U32 },
+ [IWPM_NLA_QUERY_LOCAL_ADDR] = { .len = sizeof(struct sockaddr_storage) },
+ [IWPM_NLA_QUERY_REMOTE_ADDR] = { .len = sizeof(struct sockaddr_storage) },
+ [IWPM_NLA_RQUERY_MAPPED_LOC_ADDR] = { .len = sizeof(struct sockaddr_storage) },
+ [IWPM_NLA_RQUERY_MAPPED_REM_ADDR] = { .len = sizeof(struct sockaddr_storage) },
+ [IWPM_NLA_RQUERY_MAPPING_ERR] = { .type = NLA_U16 }
+};
+
+/*
+ * iwpm_add_and_query_mapping_cb - Process a port mapper response to
+ * iwpm_add_and_query_mapping()
+ * @skb: received message buffer (not used directly)
+ * @cb: netlink callback data; cb->nlh is the received message header
+ *
+ * Looks up the pending request by the echoed sequence number, validates the
+ * reported local/remote addresses against the request and copies the mapped
+ * local and remote addresses into the request buffer.  Failures are
+ * reported to the waiter through err_code.
+ *
+ * Returns 0 when a matching request was found, -EINVAL if the message
+ * cannot be parsed or no request matches.
+ */
+int iwpm_add_and_query_mapping_cb(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct iwpm_sa_data *pm_msg;
+ struct iwpm_nlmsg_request *nlmsg_request = NULL;
+ struct nlattr *nltb[IWPM_NLA_RQUERY_MAPPING_MAX];
+ struct sockaddr_storage *local_sockaddr, *remote_sockaddr;
+ struct sockaddr_storage *mapped_loc_sockaddr, *mapped_rem_sockaddr;
+ const char *msg_type;
+ u32 msg_seq;
+ u16 err_code;
+
+ msg_type = "Query Mapping response";
+ if (iwpm_parse_nlmsg(cb, IWPM_NLA_RQUERY_MAPPING_MAX,
+ resp_query_policy, nltb, msg_type))
+ return -EINVAL;
+ /* remember the sender's sequence number for the next outgoing message */
+ atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
+
+ msg_seq = nla_get_u32(nltb[IWPM_NLA_QUERY_MAPPING_SEQ]);
+ nlmsg_request = iwpm_find_nlmsg_request(msg_seq);
+ if (!nlmsg_request) {
+ pr_info("%s: Could not find a matching request (seq = %u)\n",
+ __func__, msg_seq);
+ return -EINVAL;
+ }
+ pm_msg = nlmsg_request->req_buffer;
+ local_sockaddr = (struct sockaddr_storage *)
+ nla_data(nltb[IWPM_NLA_QUERY_LOCAL_ADDR]);
+ remote_sockaddr = (struct sockaddr_storage *)
+ nla_data(nltb[IWPM_NLA_QUERY_REMOTE_ADDR]);
+ mapped_loc_sockaddr = (struct sockaddr_storage *)
+ nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_LOC_ADDR]);
+ mapped_rem_sockaddr = (struct sockaddr_storage *)
+ nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_REM_ADDR]);
+
+ /*
+ * A reject is recorded in err_code but processing continues: the checks
+ * below may overwrite err_code, and if they pass the mapped addresses
+ * are still copied into the request buffer.
+ */
+ err_code = nla_get_u16(nltb[IWPM_NLA_RQUERY_MAPPING_ERR]);
+ if (err_code == IWPM_REMOTE_QUERY_REJECT) {
+ pr_info("%s: Received a Reject (pid = %u, echo seq = %u)\n",
+ __func__, cb->nlh->nlmsg_pid, msg_seq);
+ nlmsg_request->err_code = IWPM_REMOTE_QUERY_REJECT;
+ }
+ /* note: the message below is also printed for a remote address mismatch */
+ if (iwpm_compare_sockaddr(local_sockaddr, &pm_msg->loc_addr) ||
+ iwpm_compare_sockaddr(remote_sockaddr, &pm_msg->rem_addr)) {
+ pr_info("%s: Incorrect local sockaddr\n", __func__);
+ nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR;
+ goto query_mapping_response_exit;
+ }
+ if (mapped_loc_sockaddr->ss_family != local_sockaddr->ss_family ||
+ mapped_rem_sockaddr->ss_family != remote_sockaddr->ss_family) {
+ pr_info("%s: Sockaddr family doesn't match the requested one\n",
+ __func__);
+ nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR;
+ goto query_mapping_response_exit;
+ }
+ memcpy(&pm_msg->mapped_loc_addr, mapped_loc_sockaddr,
+ sizeof(*mapped_loc_sockaddr));
+ memcpy(&pm_msg->mapped_rem_addr, mapped_rem_sockaddr,
+ sizeof(*mapped_rem_sockaddr));
+
+ iwpm_print_sockaddr(&pm_msg->loc_addr,
+ "query_mapping: Local sockaddr:");
+ iwpm_print_sockaddr(&pm_msg->mapped_loc_addr,
+ "query_mapping: Mapped local sockaddr:");
+ iwpm_print_sockaddr(&pm_msg->rem_addr,
+ "query_mapping: Remote sockaddr:");
+ iwpm_print_sockaddr(&pm_msg->mapped_rem_addr,
+ "query_mapping: Mapped remote sockaddr:");
+query_mapping_response_exit:
+ nlmsg_request->request_done = 1;
+ /* always for found request */
+ /*
+ * NOTE(review): waitq is accessed after this handler's reference has
+ * been dropped; presumably another reference keeps the request alive
+ * here - confirm against iwpm_util.c.
+ */
+ kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request);
+ barrier();
+ wake_up(&nlmsg_request->waitq);
+ return 0;
+}
+EXPORT_SYMBOL(iwpm_add_and_query_mapping_cb);
+
+/*
+ * iwpm_remote_info_cb - Process a port mapper message, containing
+ *                       the remote connecting peer address info
+ * @skb: received message buffer (not used directly)
+ * @cb: netlink callback data; cb->nlh is the received message header
+ *
+ * Validates the message, then stores the mapped local, remote and mapped
+ * remote addresses in a newly allocated iwpm_remote_info record that is
+ * handed to iwpm_add_remote_info().
+ *
+ * Returns 0 once the record has been added, -EINVAL for an unparsable
+ * message, an invalid client or mismatching address families, and -ENOMEM
+ * if the record cannot be allocated.
+ */
+int iwpm_remote_info_cb(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct nlattr *nltb[IWPM_NLA_RQUERY_MAPPING_MAX];
+	struct sockaddr_storage *local_sockaddr, *remote_sockaddr;
+	struct sockaddr_storage *mapped_loc_sockaddr, *mapped_rem_sockaddr;
+	struct iwpm_remote_info *rem_info;
+	const char *msg_type;
+	u8 nl_client;
+
+	msg_type = "Remote Mapping info";
+	if (iwpm_parse_nlmsg(cb, IWPM_NLA_RQUERY_MAPPING_MAX,
+			     resp_query_policy, nltb, msg_type))
+		return -EINVAL;
+
+	nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type);
+	if (!iwpm_valid_client(nl_client)) {
+		pr_info("%s: Invalid port mapper client = %d\n",
+			__func__, nl_client);
+		return -EINVAL;
+	}
+	atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
+
+	local_sockaddr = (struct sockaddr_storage *)
+			nla_data(nltb[IWPM_NLA_QUERY_LOCAL_ADDR]);
+	remote_sockaddr = (struct sockaddr_storage *)
+			nla_data(nltb[IWPM_NLA_QUERY_REMOTE_ADDR]);
+	mapped_loc_sockaddr = (struct sockaddr_storage *)
+			nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_LOC_ADDR]);
+	mapped_rem_sockaddr = (struct sockaddr_storage *)
+			nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_REM_ADDR]);
+
+	if (mapped_loc_sockaddr->ss_family != local_sockaddr->ss_family ||
+	    mapped_rem_sockaddr->ss_family != remote_sockaddr->ss_family) {
+		pr_info("%s: Sockaddr family doesn't match the requested one\n",
+			__func__);
+		return -EINVAL;
+	}
+	rem_info = kzalloc(sizeof(struct iwpm_remote_info), GFP_ATOMIC);
+	if (!rem_info) {
+		pr_err("%s: Unable to allocate a remote info\n", __func__);
+		return -ENOMEM;
+	}
+	memcpy(&rem_info->mapped_loc_sockaddr, mapped_loc_sockaddr,
+	       sizeof(struct sockaddr_storage));
+	memcpy(&rem_info->remote_sockaddr, remote_sockaddr,
+	       sizeof(struct sockaddr_storage));
+	memcpy(&rem_info->mapped_rem_sockaddr, mapped_rem_sockaddr,
+	       sizeof(struct sockaddr_storage));
+	rem_info->nl_client = nl_client;
+
+	iwpm_add_remote_info(rem_info);
+
+	iwpm_print_sockaddr(local_sockaddr,
+			"remote_info: Local sockaddr:");
+	iwpm_print_sockaddr(mapped_loc_sockaddr,
+			"remote_info: Mapped local sockaddr:");
+	iwpm_print_sockaddr(remote_sockaddr,
+			"remote_info: Remote sockaddr:");
+	iwpm_print_sockaddr(mapped_rem_sockaddr,
+			"remote_info: Mapped remote sockaddr:");
+	/* the record was stored: report success, not the initial -EINVAL */
+	return 0;
+}
+EXPORT_SYMBOL(iwpm_remote_info_cb);
+
+/*
+ * netlink attribute policy for the received request for mapping info.
+ * The library name is NLA_STRING, limited to IWPM_ULIBNAME_SIZE - 1 bytes.
+ */
+static const struct nla_policy resp_mapinfo_policy[IWPM_NLA_MAPINFO_REQ_MAX] = {
+ [IWPM_NLA_MAPINFO_ULIB_NAME] = { .type = NLA_STRING,
+ .len = IWPM_ULIBNAME_SIZE - 1 },
+ [IWPM_NLA_MAPINFO_ULIB_VER] = { .type = NLA_U16 }
+};
+
+/*
+ * iwpm_mapping_info_cb - Process a port mapper request for mapping info
+ * @skb: received message buffer (not used directly)
+ * @cb: netlink callback data; cb->nlh is the received message header
+ *
+ * Checks the reported library name and version, clears the registered flag
+ * of the client and, when mapping info is available, sends it to the pid
+ * the request came from.
+ *
+ * Returns -EINVAL for an unparsable message, a name/version mismatch or an
+ * invalid client, 0 when there is no mapping info to send, and otherwise
+ * the result of iwpm_send_mapinfo().
+ */
+int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct nlattr *nltb[IWPM_NLA_MAPINFO_REQ_MAX];
+	const char *msg_type = "Mapping Info response";
+	int iwpm_pid;
+	u8 nl_client;
+	char iwpm_name[IWPM_ULIBNAME_SIZE];
+	u16 iwpm_version;
+	int ret = -EINVAL;
+
+	if (iwpm_parse_nlmsg(cb, IWPM_NLA_MAPINFO_REQ_MAX,
+			     resp_mapinfo_policy, nltb, msg_type)) {
+		pr_info("%s: Unable to parse nlmsg\n", __func__);
+		return ret;
+	}
+	/*
+	 * An NLA_STRING payload is not guaranteed to be NUL-terminated, so
+	 * copy it into a bounded, terminated buffer before handing it to
+	 * strcmp() and "%s".  The policy caps the payload at
+	 * IWPM_ULIBNAME_SIZE - 1 bytes, so nothing is truncated.
+	 */
+	nla_strlcpy(iwpm_name, nltb[IWPM_NLA_MAPINFO_ULIB_NAME],
+		    sizeof(iwpm_name));
+	iwpm_version = nla_get_u16(nltb[IWPM_NLA_MAPINFO_ULIB_VER]);
+	if (strcmp(iwpm_ulib_name, iwpm_name) ||
+	    iwpm_version != iwpm_ulib_version) {
+		pr_info("%s: Invalid port mapper name = %s version = %d\n",
+			__func__, iwpm_name, iwpm_version);
+		return ret;
+	}
+	nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type);
+	if (!iwpm_valid_client(nl_client)) {
+		pr_info("%s: Invalid port mapper client = %d\n",
+			__func__, nl_client);
+		return ret;
+	}
+	iwpm_set_registered(nl_client, 0);
+	atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
+	if (!iwpm_mapinfo_available())
+		return 0;
+	iwpm_pid = cb->nlh->nlmsg_pid;
+	pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n",
+		 __func__, iwpm_pid);
+	ret = iwpm_send_mapinfo(nl_client, iwpm_pid);
+	return ret;
+}
+EXPORT_SYMBOL(iwpm_mapping_info_cb);
+
+/* netlink attribute policy for the received mapping info ack (all u32) */
+static const struct nla_policy ack_mapinfo_policy[IWPM_NLA_MAPINFO_NUM_MAX] = {
+ [IWPM_NLA_MAPINFO_SEQ] = { .type = NLA_U32 },
+ [IWPM_NLA_MAPINFO_SEND_NUM] = { .type = NLA_U32 },
+ [IWPM_NLA_MAPINFO_ACK_NUM] = { .type = NLA_U32 }
+};
+
+/*
+ * iwpm_ack_mapping_info_cb - Process a port mapper ack for
+ *                            the provided mapping info records
+ * @skb: received message buffer (not used directly)
+ * @cb: netlink callback data; cb->nlh is the received message header
+ *
+ * Logs a message when the acked record count differs from the sent count
+ * and records the sender's sequence number.  Returns -EINVAL only if the
+ * message cannot be parsed, 0 otherwise.
+ */
+int iwpm_ack_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct nlattr *attrs[IWPM_NLA_MAPINFO_NUM_MAX];
+	const char *msg_type = "Mapping Info Ack";
+	u32 sent, acked;
+
+	if (iwpm_parse_nlmsg(cb, IWPM_NLA_MAPINFO_NUM_MAX,
+			     ack_mapinfo_policy, attrs, msg_type))
+		return -EINVAL;
+
+	sent = nla_get_u32(attrs[IWPM_NLA_MAPINFO_SEND_NUM]);
+	acked = nla_get_u32(attrs[IWPM_NLA_MAPINFO_ACK_NUM]);
+	if (acked != sent) {
+		/* a count mismatch is only reported, not treated as an error */
+		pr_info("%s: Invalid mapinfo number (sent = %u ack-ed = %u)\n",
+			__func__, sent, acked);
+	}
+	atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
+
+	return 0;
+}
+EXPORT_SYMBOL(iwpm_ack_mapping_info_cb);
+
+/*
+ * netlink attribute policy for the received port mapper error message:
+ * the sequence number of the failed request and a 16-bit error code
+ */
+static const struct nla_policy map_error_policy[IWPM_NLA_ERR_MAX] = {
+ [IWPM_NLA_ERR_SEQ] = { .type = NLA_U32 },
+ [IWPM_NLA_ERR_CODE] = { .type = NLA_U16 },
+};
+
+/*
+ * iwpm_mapping_error_cb - Process a port mapper error message
+ * @skb: received message buffer (not used directly)
+ * @cb: netlink callback data; cb->nlh is the received message header
+ *
+ * Logs the error and, if a pending request matches the reported sequence
+ * number, stores the error code in it, marks it done and wakes the waiter.
+ *
+ * Returns -EINVAL if the message cannot be parsed, 0 otherwise (including
+ * when no matching request exists).
+ */
+int iwpm_mapping_error_cb(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct iwpm_nlmsg_request *nlmsg_request = NULL;
+ int nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type);
+ struct nlattr *nltb[IWPM_NLA_ERR_MAX];
+ u32 msg_seq;
+ u16 err_code;
+ const char *msg_type = "Mapping Error Msg";
+
+ if (iwpm_parse_nlmsg(cb, IWPM_NLA_ERR_MAX,
+ map_error_policy, nltb, msg_type))
+ return -EINVAL;
+
+ msg_seq = nla_get_u32(nltb[IWPM_NLA_ERR_SEQ]);
+ err_code = nla_get_u16(nltb[IWPM_NLA_ERR_CODE]);
+ pr_info("%s: Received msg seq = %u err code = %u client = %d\n",
+ __func__, msg_seq, err_code, nl_client);
+ /* look for nlmsg_request */
+ nlmsg_request = iwpm_find_nlmsg_request(msg_seq);
+ if (!nlmsg_request) {
+ /* not all errors have associated requests */
+ pr_debug("Could not find matching req (seq = %u)\n", msg_seq);
+ return 0;
+ }
+ /* the echo sequence is only updated when a matching request exists */
+ atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
+ nlmsg_request->err_code = err_code;
+ nlmsg_request->request_done = 1;
+ /* always for found request */
+ /*
+ * NOTE(review): waitq is accessed after this handler's reference has
+ * been dropped; presumably another reference keeps the request alive
+ * here - confirm against iwpm_util.c.
+ */
+ kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request);
+ barrier();
+ wake_up(&nlmsg_request->waitq);
+ return 0;
+}
+EXPORT_SYMBOL(iwpm_mapping_error_cb);
diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c
new file mode 100644
index 000000000..a626795bf
--- /dev/null
+++ b/drivers/infiniband/core/iwpm_util.c
@@ -0,0 +1,749 @@
+/*
+ * Copyright (c) 2014 Chelsio, Inc. All rights reserved.
+ * Copyright (c) 2014 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "iwpm_util.h"
+
+#define IWPM_MAPINFO_HASH_SIZE 512
+#define IWPM_MAPINFO_HASH_MASK (IWPM_MAPINFO_HASH_SIZE - 1)
+#define IWPM_REMINFO_HASH_SIZE 64
+#define IWPM_REMINFO_HASH_MASK (IWPM_REMINFO_HASH_SIZE - 1)
+
+static LIST_HEAD(iwpm_nlmsg_req_list);
+static DEFINE_SPINLOCK(iwpm_nlmsg_req_lock);
+
+static struct hlist_head *iwpm_hash_bucket;
+static DEFINE_SPINLOCK(iwpm_mapinfo_lock);
+
+static struct hlist_head *iwpm_reminfo_bucket;
+static DEFINE_SPINLOCK(iwpm_reminfo_lock);
+
+static DEFINE_MUTEX(iwpm_admin_lock);
+static struct iwpm_admin_data iwpm_admin;
+
+/**
+ * iwpm_init - Register a port mapper client and set up the shared tables
+ * @nl_client: The index of the netlink client
+ *
+ * When the reference count is zero the mapinfo and reminfo hash tables
+ * are allocated. Every successful call takes a reference on them and
+ * marks the client as valid.
+ *
+ * Returns 0 on success, -EINVAL if the client is already valid, or
+ * -ENOMEM if a hash table cannot be allocated.
+ */
+int iwpm_init(u8 nl_client)
+{
+	int ret = 0;
+
+	if (iwpm_valid_client(nl_client))
+		return -EINVAL;
+	mutex_lock(&iwpm_admin_lock);
+	if (atomic_read(&iwpm_admin.refcount) == 0) {
+		iwpm_hash_bucket = kcalloc(IWPM_MAPINFO_HASH_SIZE,
+					   sizeof(struct hlist_head),
+					   GFP_KERNEL);
+		if (!iwpm_hash_bucket) {
+			ret = -ENOMEM;
+			pr_err("%s Unable to create mapinfo hash table\n", __func__);
+			goto init_exit;
+		}
+		iwpm_reminfo_bucket = kcalloc(IWPM_REMINFO_HASH_SIZE,
+					      sizeof(struct hlist_head),
+					      GFP_KERNEL);
+		if (!iwpm_reminfo_bucket) {
+			/*
+			 * Reset the pointer as well: the mapinfo helpers test
+			 * iwpm_hash_bucket to decide whether the table exists
+			 * and must never see the freed allocation.
+			 */
+			kfree(iwpm_hash_bucket);
+			iwpm_hash_bucket = NULL;
+			ret = -ENOMEM;
+			pr_err("%s Unable to create reminfo hash table\n", __func__);
+			goto init_exit;
+		}
+	}
+	atomic_inc(&iwpm_admin.refcount);
+init_exit:
+	mutex_unlock(&iwpm_admin_lock);
+	if (!ret) {
+		iwpm_set_valid(nl_client, 1);
+		pr_debug("%s: Mapinfo and reminfo tables are created\n",
+				__func__);
+	}
+	return ret;
+}
+EXPORT_SYMBOL(iwpm_init);
+
+static void free_hash_bucket(void);
+static void free_reminfo_bucket(void);
+
+/**
+ * iwpm_exit - Unregister a port mapper client
+ * @nl_client: The index of the netlink client
+ *
+ * Drops the client's reference on the shared tables; the tables are
+ * destroyed when the reference count reaches zero. The client is then
+ * marked as not valid.
+ *
+ * Returns 0 on success, or -EINVAL if the client is not valid or the
+ * reference count is already zero.
+ */
+int iwpm_exit(u8 nl_client)
+{
+	int last_user;
+
+	if (!iwpm_valid_client(nl_client))
+		return -EINVAL;
+
+	mutex_lock(&iwpm_admin_lock);
+	if (!atomic_read(&iwpm_admin.refcount)) {
+		mutex_unlock(&iwpm_admin_lock);
+		pr_err("%s Incorrect usage - negative refcount\n", __func__);
+		return -EINVAL;
+	}
+	last_user = atomic_dec_and_test(&iwpm_admin.refcount);
+	if (last_user) {
+		free_hash_bucket();
+		free_reminfo_bucket();
+		pr_debug("%s: Resources are destroyed\n", __func__);
+	}
+	mutex_unlock(&iwpm_admin_lock);
+
+	iwpm_set_valid(nl_client, 0);
+	return 0;
+}
+EXPORT_SYMBOL(iwpm_exit);
+
+static struct hlist_head *get_mapinfo_hash_bucket(struct sockaddr_storage *,
+ struct sockaddr_storage *);
+
+/**
+ * iwpm_create_mapinfo - Store a local/mapped address pair in the mapinfo table
+ * @local_sockaddr: Local address, copied into the new entry
+ * @mapped_sockaddr: Mapped address, copied into the new entry
+ * @nl_client: The index of the netlink client owning the mapping
+ *
+ * Returns 0 if the entry was added, -ENOMEM if it could not be allocated,
+ * or -EINVAL if the client is not valid, the table does not exist or no
+ * bucket could be selected for the address pair.
+ */
+int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr,
+			struct sockaddr_storage *mapped_sockaddr,
+			u8 nl_client)
+{
+	struct hlist_head *hash_bucket_head;
+	struct iwpm_mapping_info *map_info;
+	unsigned long flags;
+	int ret = -EINVAL;
+
+	if (!iwpm_valid_client(nl_client))
+		return ret;
+	map_info = kzalloc(sizeof(*map_info), GFP_KERNEL);
+	if (!map_info) {
+		pr_err("%s: Unable to allocate a mapping info\n", __func__);
+		return -ENOMEM;
+	}
+	memcpy(&map_info->local_sockaddr, local_sockaddr,
+	       sizeof(struct sockaddr_storage));
+	memcpy(&map_info->mapped_sockaddr, mapped_sockaddr,
+	       sizeof(struct sockaddr_storage));
+	map_info->nl_client = nl_client;
+
+	spin_lock_irqsave(&iwpm_mapinfo_lock, flags);
+	if (iwpm_hash_bucket) {
+		hash_bucket_head = get_mapinfo_hash_bucket(
+					&map_info->local_sockaddr,
+					&map_info->mapped_sockaddr);
+		if (hash_bucket_head) {
+			hlist_add_head(&map_info->hlist_node, hash_bucket_head);
+			ret = 0;
+		}
+	}
+	spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags);
+
+	/* the entry was never linked into the table, so it must be freed here */
+	if (ret)
+		kfree(map_info);
+	return ret;
+}
+EXPORT_SYMBOL(iwpm_create_mapinfo);
+
+/**
+ * iwpm_remove_mapinfo - Remove one mapping from the mapinfo table
+ * @local_sockaddr: Local address, used together with the mapped address
+ *                  to select the hash bucket
+ * @mapped_local_addr: Mapped address identifying the entry to remove
+ *
+ * The first entry in the bucket whose mapped address compares equal is
+ * unlinked and freed.
+ *
+ * Returns 0 if an entry was removed, otherwise -EINVAL.
+ */
+int iwpm_remove_mapinfo(struct sockaddr_storage *local_sockaddr,
+			struct sockaddr_storage *mapped_local_addr)
+{
+	struct iwpm_mapping_info *entry = NULL;
+	struct hlist_head *bucket;
+	struct hlist_node *next;
+	unsigned long flags;
+	int ret = -EINVAL;
+
+	spin_lock_irqsave(&iwpm_mapinfo_lock, flags);
+	if (!iwpm_hash_bucket)
+		goto remove_mapinfo_exit;
+
+	bucket = get_mapinfo_hash_bucket(local_sockaddr, mapped_local_addr);
+	if (!bucket)
+		goto remove_mapinfo_exit;
+
+	hlist_for_each_entry_safe(entry, next, bucket, hlist_node) {
+		/* iwpm_compare_sockaddr() returns non-zero for a mismatch */
+		if (iwpm_compare_sockaddr(&entry->mapped_sockaddr,
+					  mapped_local_addr))
+			continue;
+
+		hlist_del_init(&entry->hlist_node);
+		kfree(entry);
+		ret = 0;
+		break;
+	}
+remove_mapinfo_exit:
+	spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(iwpm_remove_mapinfo);
+
+/* Free every mapping info entry and then the mapinfo hash table itself */
+static void free_hash_bucket(void)
+{
+	struct iwpm_mapping_info *entry;
+	struct hlist_node *next;
+	unsigned long flags;
+	int bucket;
+
+	spin_lock_irqsave(&iwpm_mapinfo_lock, flags);
+	/* unlink and free the entries of each bucket in turn */
+	for (bucket = 0; bucket < IWPM_MAPINFO_HASH_SIZE; bucket++) {
+		hlist_for_each_entry_safe(entry, next,
+					  &iwpm_hash_bucket[bucket],
+					  hlist_node) {
+			hlist_del_init(&entry->hlist_node);
+			kfree(entry);
+		}
+	}
+	/* release the table and clear the pointer other functions test */
+	kfree(iwpm_hash_bucket);
+	iwpm_hash_bucket = NULL;
+	spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags);
+}
+
+/* Free every remote info entry and then the reminfo hash table itself */
+static void free_reminfo_bucket(void)
+{
+	struct iwpm_remote_info *entry;
+	struct hlist_node *next;
+	unsigned long flags;
+	int bucket;
+
+	spin_lock_irqsave(&iwpm_reminfo_lock, flags);
+	/* unlink and free the entries of each bucket in turn */
+	for (bucket = 0; bucket < IWPM_REMINFO_HASH_SIZE; bucket++) {
+		hlist_for_each_entry_safe(entry, next,
+					  &iwpm_reminfo_bucket[bucket],
+					  hlist_node) {
+			hlist_del_init(&entry->hlist_node);
+			kfree(entry);
+		}
+	}
+	/* release the table and clear the pointer other functions test */
+	kfree(iwpm_reminfo_bucket);
+	iwpm_reminfo_bucket = NULL;
+	spin_unlock_irqrestore(&iwpm_reminfo_lock, flags);
+}
+
+static struct hlist_head *get_reminfo_hash_bucket(struct sockaddr_storage *,
+ struct sockaddr_storage *);
+
+/*
+ * Link @rem_info into the reminfo hash table, keyed by its mapped local
+ * and mapped remote addresses. Nothing is added if the table does not
+ * exist or no bucket can be selected.
+ */
+void iwpm_add_remote_info(struct iwpm_remote_info *rem_info)
+{
+	struct hlist_head *bucket;
+	unsigned long flags;
+
+	spin_lock_irqsave(&iwpm_reminfo_lock, flags);
+	if (!iwpm_reminfo_bucket)
+		goto out_unlock;
+
+	bucket = get_reminfo_hash_bucket(&rem_info->mapped_loc_sockaddr,
+					 &rem_info->mapped_rem_sockaddr);
+	if (bucket)
+		hlist_add_head(&rem_info->hlist_node, bucket);
+out_unlock:
+	spin_unlock_irqrestore(&iwpm_reminfo_lock, flags);
+}
+
+/**
+ * iwpm_get_remote_info - Fetch and consume the stored remote address
+ * @mapped_loc_addr: Mapped local address of the connection
+ * @mapped_rem_addr: Mapped remote address of the connection
+ * @remote_addr: Output, receives the stored remote address
+ * @nl_client: The index of the netlink client
+ *
+ * The first entry matching both mapped addresses is copied to
+ * @remote_addr, then removed from the table and freed.
+ *
+ * Returns 0 if an entry was found, otherwise -EINVAL.
+ */
+int iwpm_get_remote_info(struct sockaddr_storage *mapped_loc_addr,
+			 struct sockaddr_storage *mapped_rem_addr,
+			 struct sockaddr_storage *remote_addr,
+			 u8 nl_client)
+{
+	struct iwpm_remote_info *entry = NULL;
+	struct hlist_head *bucket;
+	struct hlist_node *next;
+	unsigned long flags;
+	int ret = -EINVAL;
+
+	if (!iwpm_valid_client(nl_client)) {
+		pr_info("%s: Invalid client = %d\n", __func__, nl_client);
+		return ret;
+	}
+
+	spin_lock_irqsave(&iwpm_reminfo_lock, flags);
+	if (!iwpm_reminfo_bucket)
+		goto get_remote_info_exit;
+
+	bucket = get_reminfo_hash_bucket(mapped_loc_addr, mapped_rem_addr);
+	if (!bucket)
+		goto get_remote_info_exit;
+
+	hlist_for_each_entry_safe(entry, next, bucket, hlist_node) {
+		/* both mapped addresses have to match */
+		if (iwpm_compare_sockaddr(&entry->mapped_loc_sockaddr,
+					  mapped_loc_addr) ||
+		    iwpm_compare_sockaddr(&entry->mapped_rem_sockaddr,
+					  mapped_rem_addr))
+			continue;
+
+		memcpy(remote_addr, &entry->remote_sockaddr,
+		       sizeof(struct sockaddr_storage));
+		iwpm_print_sockaddr(remote_addr,
+				"get_remote_info: Remote sockaddr:");
+
+		/* the entry is handed out only once */
+		hlist_del_init(&entry->hlist_node);
+		kfree(entry);
+		ret = 0;
+		break;
+	}
+get_remote_info_exit:
+	spin_unlock_irqrestore(&iwpm_reminfo_lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(iwpm_get_remote_info);
+
+/*
+ * iwpm_get_nlmsg_request - Allocate a request and add it to the pending list
+ * @nlmsg_seq: Sequence number of the netlink message
+ * @nl_client: The index of the netlink client
+ * @gfp: Allocation flags for the request
+ *
+ * The request starts with two references (kref_init() plus kref_get()).
+ *
+ * Returns the new request, or NULL if the allocation fails.
+ */
+struct iwpm_nlmsg_request *iwpm_get_nlmsg_request(__u32 nlmsg_seq,
+					u8 nl_client, gfp_t gfp)
+{
+	struct iwpm_nlmsg_request *nlmsg_request;
+	unsigned long flags;
+
+	nlmsg_request = kzalloc(sizeof(*nlmsg_request), gfp);
+	if (!nlmsg_request) {
+		pr_err("%s Unable to allocate a nlmsg_request\n", __func__);
+		return NULL;
+	}
+
+	/*
+	 * Initialize the request completely before it becomes visible on
+	 * the list: iwpm_find_nlmsg_request() compares nlmsg_seq and takes
+	 * a kref on anything it finds there, and response callbacks call
+	 * wake_up() on the wait queue of a found request.
+	 */
+	kref_init(&nlmsg_request->kref);
+	kref_get(&nlmsg_request->kref);
+	init_waitqueue_head(&nlmsg_request->waitq);
+	nlmsg_request->nlmsg_seq = nlmsg_seq;
+	nlmsg_request->nl_client = nl_client;
+	nlmsg_request->request_done = 0;
+	nlmsg_request->err_code = 0;
+
+	spin_lock_irqsave(&iwpm_nlmsg_req_lock, flags);
+	list_add_tail(&nlmsg_request->inprocess_list, &iwpm_nlmsg_req_list);
+	spin_unlock_irqrestore(&iwpm_nlmsg_req_lock, flags);
+
+	return nlmsg_request;
+}
+
+/*
+ * kref release function: unlink the request embedding @kref from the
+ * pending list and free it.
+ */
+void iwpm_free_nlmsg_request(struct kref *kref)
+{
+	struct iwpm_nlmsg_request *req =
+		container_of(kref, struct iwpm_nlmsg_request, kref);
+	unsigned long flags;
+
+	spin_lock_irqsave(&iwpm_nlmsg_req_lock, flags);
+	list_del_init(&req->inprocess_list);
+	spin_unlock_irqrestore(&iwpm_nlmsg_req_lock, flags);
+
+	/* a request freed before it completed is only reported, not an error */
+	if (req->request_done == 0)
+		pr_debug("%s Freeing incomplete nlmsg request (seq = %u).\n",
+			__func__, req->nlmsg_seq);
+	kfree(req);
+}
+
+/*
+ * Look up the pending request with sequence number @echo_seq. A found
+ * request is returned with an extra reference that the caller must drop.
+ * Returns NULL if there is no such request.
+ */
+struct iwpm_nlmsg_request *iwpm_find_nlmsg_request(__u32 echo_seq)
+{
+	struct iwpm_nlmsg_request *cur;
+	struct iwpm_nlmsg_request *match = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&iwpm_nlmsg_req_lock, flags);
+	list_for_each_entry(cur, &iwpm_nlmsg_req_list, inprocess_list) {
+		if (cur->nlmsg_seq != echo_seq)
+			continue;
+		/* take the reference while the list lock is still held */
+		kref_get(&cur->kref);
+		match = cur;
+		break;
+	}
+	spin_unlock_irqrestore(&iwpm_nlmsg_req_lock, flags);
+	return match;
+}
+
+/*
+ * Wait up to IWPM_NL_TIMEOUT for @nlmsg_request to be marked done, then
+ * drop one reference on it.
+ *
+ * Returns the request's error code if it completed, or -EINVAL on timeout.
+ */
+int iwpm_wait_complete_req(struct iwpm_nlmsg_request *nlmsg_request)
+{
+	int ret;
+
+	init_waitqueue_head(&nlmsg_request->waitq);
+	ret = wait_event_timeout(nlmsg_request->waitq,
+			(nlmsg_request->request_done != 0), IWPM_NL_TIMEOUT);
+	if (ret) {
+		/* completed in time: report what the responder recorded */
+		ret = nlmsg_request->err_code;
+	} else {
+		ret = -EINVAL;
+		pr_info("%s: Timeout %d sec for netlink request (seq = %u)\n",
+			__func__, (IWPM_NL_TIMEOUT/HZ), nlmsg_request->nlmsg_seq);
+	}
+	kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request);
+	return ret;
+}
+
+/* Atomically increment the shared sequence counter and return the new value */
+int iwpm_get_nlmsg_seq(void)
+{
+	int next_seq = atomic_inc_return(&iwpm_admin.nlmsg_seq);
+
+	return next_seq;
+}
+
+/*
+ * Return the valid flag stored for @nl_client, or 0 if the index is
+ * outside the client table.
+ */
+int iwpm_valid_client(u8 nl_client)
+{
+	if (nl_client < RDMA_NL_NUM_CLIENTS)
+		return iwpm_admin.client_list[nl_client];
+	return 0;
+}
+
+/*
+ * Store @valid as the valid flag of @nl_client; an index outside the
+ * client table is ignored.
+ */
+void iwpm_set_valid(u8 nl_client, int valid)
+{
+	if (nl_client < RDMA_NL_NUM_CLIENTS)
+		iwpm_admin.client_list[nl_client] = valid;
+}
+
+/*
+ * Return the registered flag stored for @nl_client. Callers are expected
+ * to pass a valid client; an index outside the client table yields 0,
+ * matching iwpm_valid_client(), instead of reading past reg_list.
+ */
+int iwpm_registered_client(u8 nl_client)
+{
+	if (nl_client >= RDMA_NL_NUM_CLIENTS)
+		return 0;
+	return iwpm_admin.reg_list[nl_client];
+}
+
+/*
+ * Store @reg as the registered flag of @nl_client. Callers are expected
+ * to pass a valid client; an index outside the client table is ignored,
+ * matching iwpm_set_valid(), instead of writing past reg_list.
+ */
+void iwpm_set_registered(u8 nl_client, int reg)
+{
+	if (nl_client >= RDMA_NL_NUM_CLIENTS)
+		return;
+	iwpm_admin.reg_list[nl_client] = reg;
+}
+
+/*
+ * Compare the address family, IP address and port of two socket
+ * addresses. Returns 0 if all of them are equal, otherwise 1. A family
+ * other than AF_INET/AF_INET6 is logged and treated as a mismatch.
+ */
+int iwpm_compare_sockaddr(struct sockaddr_storage *a_sockaddr,
+				struct sockaddr_storage *b_sockaddr)
+{
+	if (a_sockaddr->ss_family != b_sockaddr->ss_family)
+		return 1;
+
+	switch (a_sockaddr->ss_family) {
+	case AF_INET: {
+		struct sockaddr_in *a4 = (struct sockaddr_in *)a_sockaddr;
+		struct sockaddr_in *b4 = (struct sockaddr_in *)b_sockaddr;
+
+		if (memcmp(&a4->sin_addr, &b4->sin_addr,
+			   sizeof(struct in_addr)))
+			return 1;
+		return a4->sin_port != b4->sin_port;
+	}
+	case AF_INET6: {
+		struct sockaddr_in6 *a6 = (struct sockaddr_in6 *)a_sockaddr;
+		struct sockaddr_in6 *b6 = (struct sockaddr_in6 *)b_sockaddr;
+
+		if (memcmp(&a6->sin6_addr, &b6->sin6_addr,
+			   sizeof(struct in6_addr)))
+			return 1;
+		return a6->sin6_port != b6->sin6_port;
+	}
+	default:
+		pr_err("%s: Invalid sockaddr family\n", __func__);
+		return 1;
+	}
+}
+
+/*
+ * Allocate an skb of NLMSG_GOODSIZE and put a request message header for
+ * @nl_op / @nl_client into it. @nlh receives the header address.
+ * Returns the skb, or NULL if the allocation or the header setup fails.
+ */
+struct sk_buff *iwpm_create_nlmsg(u32 nl_op, struct nlmsghdr **nlh,
+						int nl_client)
+{
+	struct sk_buff *skb = dev_alloc_skb(NLMSG_GOODSIZE);
+
+	if (!skb) {
+		pr_err("%s Unable to allocate skb\n", __func__);
+		return NULL;
+	}
+	if (ibnl_put_msg(skb, nlh, 0, 0, nl_client, nl_op, NLM_F_REQUEST))
+		return skb;
+
+	/* the header could not be added, so the skb is of no use */
+	pr_warn("%s: Unable to put the nlmsg header\n", __func__);
+	dev_kfree_skb(skb);
+	return NULL;
+}
+
+/*
+ * Validate the message in @cb against @nlmsg_policy, parse its attributes
+ * into @nltb and check that none of the expected attributes is missing.
+ * @policy_max is the size of the attribute table, so policy_max - 1 is
+ * passed to the netlink helpers as the highest attribute type.
+ *
+ * Returns 0 on success; otherwise logs a warning naming @msg_type and
+ * returns the error of the step that failed.
+ */
+int iwpm_parse_nlmsg(struct netlink_callback *cb, int policy_max,
+				const struct nla_policy *nlmsg_policy,
+				struct nlattr *nltb[], const char *msg_type)
+{
+	const int hdr_len = 0;
+	const int max_type = policy_max - 1;
+	const char *reason;
+	int ret;
+
+	ret = nlmsg_validate(cb->nlh, hdr_len, max_type, nlmsg_policy);
+	if (ret) {
+		reason = "Invalid attribute";
+		goto parse_nlmsg_error;
+	}
+	ret = nlmsg_parse(cb->nlh, hdr_len, nltb, max_type, nlmsg_policy);
+	if (ret) {
+		reason = "Unable to parse the nlmsg";
+		goto parse_nlmsg_error;
+	}
+	ret = iwpm_validate_nlmsg_attr(nltb, policy_max);
+	if (!ret)
+		return 0;
+	reason = "Invalid NULL attribute";
+parse_nlmsg_error:
+	pr_warn("%s: %s (msg type %s ret = %d)\n",
+			__func__, reason, msg_type, ret);
+	return ret;
+}
+
+/*
+ * Emit a debug message with @msg followed by the IP address and port of
+ * @sockaddr. Families other than AF_INET and AF_INET6 print nothing.
+ */
+void iwpm_print_sockaddr(struct sockaddr_storage *sockaddr, char *msg)
+{
+	if (sockaddr->ss_family == AF_INET) {
+		struct sockaddr_in *v4 = (struct sockaddr_in *)sockaddr;
+
+		pr_debug("%s IPV4 %pI4: %u(0x%04X)\n",
+			msg, &v4->sin_addr,
+			ntohs(v4->sin_port),
+			ntohs(v4->sin_port));
+	} else if (sockaddr->ss_family == AF_INET6) {
+		struct sockaddr_in6 *v6 = (struct sockaddr_in6 *)sockaddr;
+
+		pr_debug("%s IPV6 %pI6: %u(0x%04X)\n",
+			msg, &v6->sin6_addr,
+			ntohs(v6->sin6_port),
+			ntohs(v6->sin6_port));
+	}
+}
+
+/* Hash an IPv6 socket address: the address is hashed first, then mixed with the port */
+static u32 iwpm_ipv6_jhash(struct sockaddr_in6 *ipv6_sockaddr)
+{
+	u32 addr_hash;
+
+	addr_hash = jhash(&ipv6_sockaddr->sin6_addr, sizeof(struct in6_addr), 0);
+	return jhash_2words(addr_hash, (__force u32) ipv6_sockaddr->sin6_port, 0);
+}
+
+/* Hash an IPv4 socket address: the address is hashed first, then mixed with the port */
+static u32 iwpm_ipv4_jhash(struct sockaddr_in *ipv4_sockaddr)
+{
+	u32 addr_hash;
+
+	addr_hash = jhash(&ipv4_sockaddr->sin_addr, sizeof(struct in_addr), 0);
+	return jhash_2words(addr_hash, (__force u32) ipv4_sockaddr->sin_port, 0);
+}
+
+/*
+ * Compute the combined hash of two socket addresses into @hash. The
+ * family of @a_sockaddr selects the hash function used for both.
+ * Returns 0 on success or -EINVAL for an unsupported family.
+ */
+static int get_hash_bucket(struct sockaddr_storage *a_sockaddr,
+				struct sockaddr_storage *b_sockaddr, u32 *hash)
+{
+	u32 first, second;
+
+	switch (a_sockaddr->ss_family) {
+	case AF_INET:
+		first = iwpm_ipv4_jhash((struct sockaddr_in *) a_sockaddr);
+		second = iwpm_ipv4_jhash((struct sockaddr_in *) b_sockaddr);
+		break;
+	case AF_INET6:
+		first = iwpm_ipv6_jhash((struct sockaddr_in6 *) a_sockaddr);
+		second = iwpm_ipv6_jhash((struct sockaddr_in6 *) b_sockaddr);
+		break;
+	default:
+		pr_err("%s: Invalid sockaddr family\n", __func__);
+		return -EINVAL;
+	}
+
+	/* equal hashes (if port mapper isn't available) are used unmixed */
+	*hash = (first == second) ? first : jhash_2words(first, second, 0);
+	return 0;
+}
+
+/*
+ * Select the mapinfo bucket for a local/mapped address pair.
+ * Returns NULL if the pair cannot be hashed.
+ */
+static struct hlist_head *get_mapinfo_hash_bucket(struct sockaddr_storage
+					*local_sockaddr, struct sockaddr_storage
+					*mapped_sockaddr)
+{
+	u32 hash;
+
+	if (get_hash_bucket(local_sockaddr, mapped_sockaddr, &hash))
+		return NULL;
+	return iwpm_hash_bucket + (hash & IWPM_MAPINFO_HASH_MASK);
+}
+
+/*
+ * Select the reminfo bucket for a mapped local/mapped remote address
+ * pair. Returns NULL if the pair cannot be hashed.
+ */
+static struct hlist_head *get_reminfo_hash_bucket(struct sockaddr_storage
+					*mapped_loc_sockaddr, struct sockaddr_storage
+					*mapped_rem_sockaddr)
+{
+	u32 hash;
+
+	if (get_hash_bucket(mapped_loc_sockaddr, mapped_rem_sockaddr, &hash))
+		return NULL;
+	return iwpm_reminfo_bucket + (hash & IWPM_REMINFO_HASH_MASK);
+}
+
+/*
+ * send_mapinfo_num - Send the number of mapping records to the port mapper
+ * @mapping_num: Value placed in the IWPM_NLA_MAPINFO_SEND_NUM attribute
+ * @nl_client: The index of the netlink client
+ * @iwpm_pid: Destination passed to ibnl_unicast()
+ *
+ * Builds a RDMA_NL_IWPM_MAPINFO_NUM message and unicasts it.
+ * Returns 0 on success; on failure an info message is logged and the
+ * error of the failing step (or -EINVAL if no message could be created)
+ * is returned.
+ */
+static int send_mapinfo_num(u32 mapping_num, u8 nl_client, int iwpm_pid)
+{
+ struct sk_buff *skb = NULL;
+ struct nlmsghdr *nlh;
+ u32 msg_seq;
+ const char *err_str = "";
+ int ret = -EINVAL;
+
+ skb = iwpm_create_nlmsg(RDMA_NL_IWPM_MAPINFO_NUM, &nlh, nl_client);
+ if (!skb) {
+ err_str = "Unable to create a nlmsg";
+ goto mapinfo_num_error;
+ }
+ nlh->nlmsg_seq = iwpm_get_nlmsg_seq();
+ /* the sequence attribute of this message is always sent as 0 */
+ msg_seq = 0;
+ /* set up front: it applies to both ibnl_put_attr() calls below */
+ err_str = "Unable to put attribute of mapinfo number nlmsg";
+ ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, IWPM_NLA_MAPINFO_SEQ);
+ if (ret)
+ goto mapinfo_num_error;
+ ret = ibnl_put_attr(skb, nlh, sizeof(u32),
+ &mapping_num, IWPM_NLA_MAPINFO_SEND_NUM);
+ if (ret)
+ goto mapinfo_num_error;
+ ret = ibnl_unicast(skb, nlh, iwpm_pid);
+ if (ret) {
+ /*
+  * Clearing skb keeps the error path below from freeing it;
+  * presumably ibnl_unicast() has already consumed it - confirm.
+  */
+ skb = NULL;
+ err_str = "Unable to send a nlmsg";
+ goto mapinfo_num_error;
+ }
+ pr_debug("%s: Sent mapping number = %d\n", __func__, mapping_num);
+ return 0;
+mapinfo_num_error:
+ pr_info("%s: %s\n", __func__, err_str);
+ if (skb)
+ dev_kfree_skb(skb);
+ return ret;
+}
+
+/*
+ * send_nlmsg_done - Append an NLMSG_DONE header to @skb and send it
+ * @skb: Buffer holding the multipart message; may be NULL
+ * @nl_client: The index of the netlink client
+ * @iwpm_pid: Destination passed to ibnl_unicast()
+ *
+ * Returns 0 on success or when @skb is NULL, -ENOMEM if the header does
+ * not fit, otherwise the error returned by ibnl_unicast().
+ */
+static int send_nlmsg_done(struct sk_buff *skb, u8 nl_client, int iwpm_pid)
+{
+	struct nlmsghdr *nlh = NULL;
+	int ret = 0;
+
+	if (!skb)
+		return ret;
+	if (!(ibnl_put_msg(skb, &nlh, 0, 0, nl_client,
+				RDMA_NL_IWPM_MAPINFO, NLM_F_MULTI))) {
+		pr_warn("%s Unable to put NLMSG_DONE\n", __func__);
+		/*
+		 * Both call sites drop their pointer to the skb after this
+		 * call, so it has to be freed here or it is leaked.
+		 */
+		dev_kfree_skb(skb);
+		return -ENOMEM;
+	}
+	nlh->nlmsg_type = NLMSG_DONE;
+	ret = ibnl_unicast(skb, (struct nlmsghdr *)skb->data, iwpm_pid);
+	if (ret)
+		pr_warn("%s Unable to send a nlmsg\n", __func__);
+	return ret;
+}
+
+/*
+ * iwpm_send_mapinfo - Send a client's mapping records to the port mapper
+ * @nl_client: The index of the netlink client
+ * @iwpm_pid: Destination passed to ibnl_unicast()
+ *
+ * Every mapping owned by @nl_client is added to a multipart message. When
+ * an skb is nearly full it is terminated and sent, and a new one is
+ * started, up to IWPM_MAPINFO_SKB_COUNT skbs. Finally the number of
+ * records is sent with send_mapinfo_num().
+ *
+ * Returns the result of send_mapinfo_num() (0 on success), -EINVAL if the
+ * mapinfo table does not exist or holds no mapping for the client,
+ * -ENOMEM on allocation or message space failures, or the error of a
+ * failed send.
+ */
+int iwpm_send_mapinfo(u8 nl_client, int iwpm_pid)
+{
+	struct iwpm_mapping_info *map_info;
+	struct sk_buff *skb = NULL;
+	struct nlmsghdr *nlh;
+	int skb_num = 0, mapping_num = 0;
+	int i = 0, nlmsg_bytes = 0;
+	unsigned long flags;
+	const char *err_str = "";
+	int ret;
+
+	skb = dev_alloc_skb(NLMSG_GOODSIZE);
+	if (!skb) {
+		ret = -ENOMEM;
+		err_str = "Unable to allocate skb";
+		goto send_mapping_info_exit;
+	}
+	skb_num++;
+	spin_lock_irqsave(&iwpm_mapinfo_lock, flags);
+	/*
+	 * Result for the case where the loop below never assigns ret, i.e.
+	 * no mapping belongs to this client. Without it ret would be read
+	 * uninitialized at send_mapping_info_exit.
+	 */
+	ret = -EINVAL;
+	err_str = "No mapping info found for the client";
+	if (!iwpm_hash_bucket) {
+		err_str = "Mapinfo hash table is not available";
+		goto send_mapping_info_unlock;
+	}
+	for (i = 0; i < IWPM_MAPINFO_HASH_SIZE; i++) {
+		hlist_for_each_entry(map_info, &iwpm_hash_bucket[i],
+				     hlist_node) {
+			if (map_info->nl_client != nl_client)
+				continue;
+			nlh = NULL;
+			if (!(ibnl_put_msg(skb, &nlh, 0, 0, nl_client,
+					RDMA_NL_IWPM_MAPINFO, NLM_F_MULTI))) {
+				ret = -ENOMEM;
+				err_str = "Unable to put the nlmsg header";
+				goto send_mapping_info_unlock;
+			}
+			err_str = "Unable to put attribute of the nlmsg";
+			ret = ibnl_put_attr(skb, nlh,
+					sizeof(struct sockaddr_storage),
+					&map_info->local_sockaddr,
+					IWPM_NLA_MAPINFO_LOCAL_ADDR);
+			if (ret)
+				goto send_mapping_info_unlock;
+
+			ret = ibnl_put_attr(skb, nlh,
+					sizeof(struct sockaddr_storage),
+					&map_info->mapped_sockaddr,
+					IWPM_NLA_MAPINFO_MAPPED_ADDR);
+			if (ret)
+				goto send_mapping_info_unlock;
+
+			iwpm_print_sockaddr(&map_info->local_sockaddr,
+				"send_mapping_info: Local sockaddr:");
+			iwpm_print_sockaddr(&map_info->mapped_sockaddr,
+				"send_mapping_info: Mapped local sockaddr:");
+			mapping_num++;
+			nlmsg_bytes += nlh->nlmsg_len;
+
+			/* check if all mappings can fit in one skb */
+			if (NLMSG_GOODSIZE - nlmsg_bytes < nlh->nlmsg_len * 2) {
+				/* and leave room for NLMSG_DONE */
+				nlmsg_bytes = 0;
+				skb_num++;
+				/*
+				 * NOTE(review): the lock is dropped in the
+				 * middle of the bucket walk; map_info is used
+				 * again after re-locking - confirm the list
+				 * cannot change in between.
+				 */
+				spin_unlock_irqrestore(&iwpm_mapinfo_lock,
+						       flags);
+				/* send the skb */
+				ret = send_nlmsg_done(skb, nl_client, iwpm_pid);
+				skb = NULL;
+				if (ret) {
+					err_str = "Unable to send map info";
+					goto send_mapping_info_exit;
+				}
+				if (skb_num == IWPM_MAPINFO_SKB_COUNT) {
+					ret = -ENOMEM;
+					err_str = "Insufficient skbs for map info";
+					goto send_mapping_info_exit;
+				}
+				skb = dev_alloc_skb(NLMSG_GOODSIZE);
+				if (!skb) {
+					ret = -ENOMEM;
+					err_str = "Unable to allocate skb";
+					goto send_mapping_info_exit;
+				}
+				spin_lock_irqsave(&iwpm_mapinfo_lock, flags);
+			}
+		}
+	}
+send_mapping_info_unlock:
+	spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags);
+send_mapping_info_exit:
+	if (ret) {
+		pr_warn("%s: %s (ret = %d)\n", __func__, err_str, ret);
+		if (skb)
+			dev_kfree_skb(skb);
+		return ret;
+	}
+	send_nlmsg_done(skb, nl_client, iwpm_pid);
+	return send_mapinfo_num(mapping_num, nl_client, iwpm_pid);
+}
+
+/*
+ * Return 1 if the mapinfo table exists and at least one of its buckets
+ * holds an entry, otherwise 0.
+ */
+int iwpm_mapinfo_available(void)
+{
+	unsigned long flags;
+	int found = 0;
+	int idx;
+
+	spin_lock_irqsave(&iwpm_mapinfo_lock, flags);
+	if (iwpm_hash_bucket) {
+		/* stop at the first non-empty bucket */
+		for (idx = 0; idx < IWPM_MAPINFO_HASH_SIZE && !found; idx++)
+			found = !hlist_empty(&iwpm_hash_bucket[idx]);
+	}
+	spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags);
+	return found;
+}
diff --git a/drivers/infiniband/core/iwpm_util.h b/drivers/infiniband/core/iwpm_util.h
new file mode 100644
index 000000000..ee2d9ff09
--- /dev/null
+++ b/drivers/infiniband/core/iwpm_util.h
@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2014 Intel Corporation. All rights reserved.
+ * Copyright (c) 2014 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _IWPM_UTIL_H
+#define _IWPM_UTIL_H
+
+#include <linux/module.h>
+#include <linux/io.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/delay.h>
+#include <linux/workqueue.h>
+#include <linux/mutex.h>
+#include <linux/jhash.h>
+#include <linux/kref.h>
+#include <net/netlink.h>
+#include <linux/errno.h>
+#include <rdma/iw_portmap.h>
+#include <rdma/rdma_netlink.h>
+
+
+#define IWPM_NL_RETRANS 3
+#define IWPM_NL_TIMEOUT (10*HZ)
+#define IWPM_MAPINFO_SKB_COUNT 20
+
+#define IWPM_PID_UNDEFINED -1
+#define IWPM_PID_UNAVAILABLE -2
+
+/* A netlink request sent to the port mapper that is awaiting its response */
+struct iwpm_nlmsg_request {
+ struct list_head inprocess_list; /* entry in the list of pending requests */
+ __u32 nlmsg_seq; /* sequence number used to match the response */
+ void *req_buffer; /* NOTE(review): not referenced in iwpm_util.c - confirm use */
+ u8 nl_client; /* index of the netlink client that issued the request */
+ u8 request_done; /* non-zero once a response has been processed */
+ u16 err_code; /* result reported to the waiter when the request is done */
+ wait_queue_head_t waitq; /* the requester waits here for request_done */
+ struct kref kref; /* reference count; released via iwpm_free_nlmsg_request() */
+};
+
+/* One entry of the mapinfo hash table: a local address and its mapping */
+struct iwpm_mapping_info {
+ struct hlist_node hlist_node; /* link within a mapinfo hash bucket */
+ struct sockaddr_storage local_sockaddr; /* local address */
+ struct sockaddr_storage mapped_sockaddr; /* mapped local address */
+ u8 nl_client; /* index of the netlink client owning the mapping */
+};
+
+/* One entry of the reminfo hash table: remote address info of a peer */
+struct iwpm_remote_info {
+ struct hlist_node hlist_node; /* link within a reminfo hash bucket */
+ struct sockaddr_storage remote_sockaddr; /* address returned by iwpm_get_remote_info() */
+ struct sockaddr_storage mapped_loc_sockaddr; /* mapped local address (lookup key) */
+ struct sockaddr_storage mapped_rem_sockaddr; /* mapped remote address (lookup key) */
+ u8 nl_client; /* index of the netlink client */
+};
+
+/* Global bookkeeping shared by all port mapper clients */
+struct iwpm_admin_data {
+ atomic_t refcount; /* incremented by iwpm_init(), decremented by iwpm_exit() */
+ atomic_t nlmsg_seq; /* counter behind iwpm_get_nlmsg_seq() */
+ int client_list[RDMA_NL_NUM_CLIENTS]; /* per-client valid flag, see iwpm_set_valid() */
+ int reg_list[RDMA_NL_NUM_CLIENTS]; /* per-client registered flag, see iwpm_set_registered() */
+};
+
+/**
+ * iwpm_get_nlmsg_request - Allocate and initialize netlink message request
+ * @nlmsg_seq: Sequence number of the netlink message
+ * @nl_client: The index of the netlink client
+ * @gfp: Indicates how the memory for the request should be allocated
+ *
+ * Returns the newly allocated netlink request object if successful,
+ * otherwise returns NULL
+ */
+struct iwpm_nlmsg_request *iwpm_get_nlmsg_request(__u32 nlmsg_seq,
+ u8 nl_client, gfp_t gfp);
+
+/**
+ * iwpm_free_nlmsg_request - Deallocate netlink message request
+ * @kref: Holds reference of netlink message request
+ */
+void iwpm_free_nlmsg_request(struct kref *kref);
+
+/**
+ * iwpm_find_nlmsg_request - Find netlink message request in the request list
+ * @echo_seq: Sequence number of the netlink request to find
+ *
+ * Returns the found netlink message request,
+ * if not found, returns NULL
+ */
+struct iwpm_nlmsg_request *iwpm_find_nlmsg_request(__u32 echo_seq);
+
+/**
+ * iwpm_wait_complete_req - Block while servicing the netlink request
+ * @nlmsg_request: Netlink message request to service
+ *
+ * Wakes up, after the request is completed or expired
+ * Returns 0 if the request is complete without error
+ */
+int iwpm_wait_complete_req(struct iwpm_nlmsg_request *nlmsg_request);
+
+/**
+ * iwpm_get_nlmsg_seq - Get the sequence number for a netlink
+ * message to send to the port mapper
+ *
+ * Returns the sequence number for the netlink message.
+ */
+int iwpm_get_nlmsg_seq(void);
+
+/**
+ * iwpm_add_remote_info - Add remote address info of the connecting peer
+ * to the remote info hash table
+ * @reminfo: The remote info to be added
+ */
+void iwpm_add_remote_info(struct iwpm_remote_info *reminfo);
+
+/**
+ * iwpm_valid_client - Check if the port mapper client is valid
+ * @nl_client: The index of the netlink client
+ *
+ * Valid clients need to call iwpm_init() before using
+ * the port mapper
+ */
+int iwpm_valid_client(u8 nl_client);
+
+/**
+ * iwpm_set_valid - Set the port mapper client to valid or not
+ * @nl_client: The index of the netlink client
+ * @valid: 1 if valid or 0 if invalid
+ */
+void iwpm_set_valid(u8 nl_client, int valid);
+
+/**
+ * iwpm_registered_client - Check if the port mapper client is registered
+ * @nl_client: The index of the netlink client
+ *
+ * Call iwpm_register_pid() to register a client
+ */
+int iwpm_registered_client(u8 nl_client);
+
+/**
+ * iwpm_set_registered - Set the port mapper client to registered or not
+ * @nl_client: The index of the netlink client
+ * @reg: 1 if registered or 0 if not
+ */
+void iwpm_set_registered(u8 nl_client, int reg);
+
+/**
+ * iwpm_send_mapinfo - Send local and mapped IPv4/IPv6 address info of
+ * a client to the user space port mapper
+ * @nl_client: The index of the netlink client
+ * @iwpm_pid: The pid of the user space port mapper
+ *
+ * Returns 0 if successful, otherwise an error code
+ */
+int iwpm_send_mapinfo(u8 nl_client, int iwpm_pid);
+
+/**
+ * iwpm_mapinfo_available - Check if any mapping info record is available
+ * in the hash table
+ *
+ * Returns 1 if mapping information is available, otherwise returns 0
+ */
+int iwpm_mapinfo_available(void);
+
+/**
+ * iwpm_compare_sockaddr - Compare two sockaddr storage structs
+ *
+ * Returns 0 if they are holding the same ip/tcp address info,
+ * otherwise returns 1
+ */
+int iwpm_compare_sockaddr(struct sockaddr_storage *a_sockaddr,
+ struct sockaddr_storage *b_sockaddr);
+
+/**
+ * iwpm_validate_nlmsg_attr - Check for NULL netlink attributes
+ * @nltb: Holds address of each netlink message attributes
+ * @nla_count: Number of netlink message attributes
+ *
+ * Returns error if any of the nla_count attributes is NULL
+ */
+static inline int iwpm_validate_nlmsg_attr(struct nlattr *nltb[],
+					   int nla_count)
+{
+	int idx = 1;
+
+	/* the scan starts at index 1; slot 0 is never examined */
+	while (idx < nla_count) {
+		if (nltb[idx] == NULL)
+			return -EINVAL;
+		idx++;
+	}
+	return 0;
+}
+
+/**
+ * iwpm_create_nlmsg - Allocate skb and form a netlink message
+ * @nl_op: Netlink message opcode
+ * @nlh: Holds address of the netlink message header in skb
+ * @nl_client: The index of the netlink client
+ *
+ * Returns the newly allocated skb, or NULL if the tailroom of the skb
+ * is insufficient to store the message header and payload
+ */
+struct sk_buff *iwpm_create_nlmsg(u32 nl_op, struct nlmsghdr **nlh,
+ int nl_client);
+
+/**
+ * iwpm_parse_nlmsg - Validate and parse the received netlink message
+ * @cb: Netlink callback structure
+ * @policy_max: Maximum attribute type to be expected
+ * @nlmsg_policy: Validation policy
+ * @nltb: Array to store policy_max parsed elements
+ * @msg_type: Type of netlink message
+ *
+ * Returns 0 on success or a negative error code
+ */
+int iwpm_parse_nlmsg(struct netlink_callback *cb, int policy_max,
+ const struct nla_policy *nlmsg_policy,
+ struct nlattr *nltb[], const char *msg_type);
+
+/**
+ * iwpm_print_sockaddr - Print IPv4/IPv6 address and TCP port
+ * @sockaddr: Socket address to print
+ * @msg: Message to print
+ */
+void iwpm_print_sockaddr(struct sockaddr_storage *sockaddr, char *msg);
+#endif
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
new file mode 100644
index 000000000..74c30f4c5
--- /dev/null
+++ b/drivers/infiniband/core/mad.c
@@ -0,0 +1,3176 @@
+/*
+ * Copyright (c) 2004-2007 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2009 HNR Consulting. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/dma-mapping.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <rdma/ib_cache.h>
+
+#include "mad_priv.h"
+#include "mad_rmpp.h"
+#include "smi.h"
+#include "agent.h"
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("kernel IB MAD API");
+MODULE_AUTHOR("Hal Rosenstock");
+MODULE_AUTHOR("Sean Hefty");
+
+/*
+ * Send/receive queue depths, defaulting to IB_MAD_QP_SEND_SIZE and
+ * IB_MAD_QP_RECV_SIZE.  They are exposed below as the module parameters
+ * send_queue_size and recv_queue_size with mode 0444.
+ */
+static int mad_sendq_size = IB_MAD_QP_SEND_SIZE;
+static int mad_recvq_size = IB_MAD_QP_RECV_SIZE;
+
+module_param_named(send_queue_size, mad_sendq_size, int, 0444);
+MODULE_PARM_DESC(send_queue_size, "Size of send queue in number of work requests");
+module_param_named(recv_queue_size, mad_recvq_size, int, 0444);
+MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of work requests");
+
+/* Slab cache from which struct ib_mad_private buffers are allocated */
+static struct kmem_cache *ib_mad_cache;
+
+/* List of per-port MAD state, searched by __ib_get_mad_port() */
+static struct list_head ib_mad_port_list;
+/* Pre-incremented to produce the hi_tid of each newly registered agent */
+static u32 ib_mad_client_id = 0;
+
+/* Port list lock */
+static DEFINE_SPINLOCK(ib_mad_port_list_lock);
+
+/* Forward declarations */
+static int method_in_use(struct ib_mad_mgmt_method_table **method,
+ struct ib_mad_reg_req *mad_reg_req);
+static void remove_mad_reg_req(struct ib_mad_agent_private *priv);
+static struct ib_mad_agent_private *find_mad_agent(
+ struct ib_mad_port_private *port_priv,
+ struct ib_mad *mad);
+static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
+ struct ib_mad_private *mad);
+static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv);
+static void timeout_sends(struct work_struct *work);
+static void local_completions(struct work_struct *work);
+static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req,
+ struct ib_mad_agent_private *agent_priv,
+ u8 mgmt_class);
+static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req,
+ struct ib_mad_agent_private *agent_priv);
+
+/*
+ * Look up the ib_mad_port_private that belongs to @device / @port_num.
+ * Returns NULL when the port list holds no matching entry.
+ * The caller must hold ib_mad_port_list_lock.
+ */
+static inline struct ib_mad_port_private *
+__ib_get_mad_port(struct ib_device *device, int port_num)
+{
+	struct ib_mad_port_private *port;
+
+	list_for_each_entry(port, &ib_mad_port_list, port_list) {
+		if (port->device != device)
+			continue;
+		if (port->port_num != port_num)
+			continue;
+		return port;
+	}
+	return NULL;
+}
+
+/*
+ * Locked variant of __ib_get_mad_port(): takes ib_mad_port_list_lock
+ * around the lookup and returns the matching port structure or NULL.
+ */
+static inline struct ib_mad_port_private *
+ib_get_mad_port(struct ib_device *device, int port_num)
+{
+	struct ib_mad_port_private *found;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&ib_mad_port_list_lock, irq_flags);
+	found = __ib_get_mad_port(device, port_num);
+	spin_unlock_irqrestore(&ib_mad_port_list_lock, irq_flags);
+	return found;
+}
+
+/* Map IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE onto 0; other classes pass through. */
+static inline u8 convert_mgmt_class(u8 mgmt_class)
+{
+	if (mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+		return 0;
+	return mgmt_class;
+}
+
+/*
+ * Translate a special QP type into its index: 0 for IB_QPT_SMI,
+ * 1 for IB_QPT_GSI.  Any other type yields -1.
+ */
+static int get_spl_qp_index(enum ib_qp_type qp_type)
+{
+	if (qp_type == IB_QPT_SMI)
+		return 0;
+	if (qp_type == IB_QPT_GSI)
+		return 1;
+	return -1;
+}
+
+/* Offset of @mgmt_class from IB_MGMT_CLASS_VENDOR_RANGE2_START. */
+static int vendor_class_index(u8 mgmt_class)
+{
+	int index = mgmt_class;
+
+	index -= IB_MGMT_CLASS_VENDOR_RANGE2_START;
+	return index;
+}
+
+/*
+ * Returns 1 when @mgmt_class lies within
+ * [IB_MGMT_CLASS_VENDOR_RANGE2_START, IB_MGMT_CLASS_VENDOR_RANGE2_END],
+ * 0 otherwise.
+ */
+static int is_vendor_class(u8 mgmt_class)
+{
+	return mgmt_class >= IB_MGMT_CLASS_VENDOR_RANGE2_START &&
+	       mgmt_class <= IB_MGMT_CLASS_VENDOR_RANGE2_END;
+}
+
+/* Returns 1 if any of the three OUI bytes is non-zero, 0 otherwise. */
+static int is_vendor_oui(char *oui)
+{
+	int i;
+
+	for (i = 0; i < 3; i++)
+		if (oui[i])
+			return 1;
+	return 0;
+}
+
+/*
+ * Check whether the methods requested in @mad_reg_req collide with an
+ * existing registration for the same OUI in @vendor_class.
+ *
+ * Only the first OUI slot that both matches and has a method table is
+ * consulted.  Returns 1 on a collision, 0 otherwise.
+ */
+static int is_vendor_method_in_use(
+		struct ib_mad_mgmt_vendor_class *vendor_class,
+		struct ib_mad_reg_req *mad_reg_req)
+{
+	struct ib_mad_mgmt_method_table *table;
+	int slot;
+
+	for (slot = 0; slot < MAX_MGMT_OUI; slot++) {
+		if (memcmp(vendor_class->oui[slot], mad_reg_req->oui, 3))
+			continue;
+
+		table = vendor_class->method_table[slot];
+		if (!table)
+			continue;
+
+		/* First populated match decides the outcome. */
+		return method_in_use(&table, mad_reg_req) ? 1 : 0;
+	}
+	return 0;
+}
+
+/*
+ * ib_response_mad - Tell whether a MAD is a response
+ *
+ * Returns 1 when the method has IB_MGMT_METHOD_RESP set, when the method
+ * is IB_MGMT_METHOD_TRAP_REPRESS, or when the class is IB_MGMT_CLASS_BM
+ * and the attribute modifier has IB_BM_ATTR_MOD_RESP set; 0 otherwise.
+ */
+int ib_response_mad(struct ib_mad *mad)
+{
+	if (mad->mad_hdr.method & IB_MGMT_METHOD_RESP)
+		return 1;
+	if (mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS)
+		return 1;
+	if (mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_BM)
+		return 0;
+	return (mad->mad_hdr.attr_mod & IB_BM_ATTR_MOD_RESP) ? 1 : 0;
+}
+EXPORT_SYMBOL(ib_response_mad);
+
+/*
+ * ib_register_mad_agent - Register to send/receive MADs
+ *
+ * Validates the request, allocates a private agent bound to the special
+ * QP selected by @qp_type on @device/@port_num and, when @mad_reg_req is
+ * supplied, records the class/method registration after checking that it
+ * does not overlap an existing one.
+ *
+ * Returns the new agent on success or an ERR_PTR(): -EINVAL for bad
+ * parameters or an overlapping registration, -ENODEV when no MAD port
+ * matches, -EPROTONOSUPPORT when the port has no such QP, -ENOMEM when
+ * an allocation or the DMA MR setup fails, or the error returned by
+ * add_nonoui_reg_req()/add_oui_reg_req().
+ */
+struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
+ u8 port_num,
+ enum ib_qp_type qp_type,
+ struct ib_mad_reg_req *mad_reg_req,
+ u8 rmpp_version,
+ ib_mad_send_handler send_handler,
+ ib_mad_recv_handler recv_handler,
+ void *context,
+ u32 registration_flags)
+{
+ struct ib_mad_port_private *port_priv;
+ /* Default error for every validation failure below */
+ struct ib_mad_agent *ret = ERR_PTR(-EINVAL);
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_reg_req *reg_req = NULL;
+ struct ib_mad_mgmt_class_table *class;
+ struct ib_mad_mgmt_vendor_class_table *vendor;
+ struct ib_mad_mgmt_vendor_class *vendor_class;
+ struct ib_mad_mgmt_method_table *method;
+ int ret2, qpn;
+ unsigned long flags;
+ u8 mgmt_class, vclass;
+
+ /* Validate parameters */
+ qpn = get_spl_qp_index(qp_type);
+ if (qpn == -1) {
+ dev_notice(&device->dev,
+ "ib_register_mad_agent: invalid QP Type %d\n",
+ qp_type);
+ goto error1;
+ }
+
+ if (rmpp_version && rmpp_version != IB_MGMT_RMPP_VERSION) {
+ dev_notice(&device->dev,
+ "ib_register_mad_agent: invalid RMPP Version %u\n",
+ rmpp_version);
+ goto error1;
+ }
+
+ /* Validate MAD registration request if supplied */
+ if (mad_reg_req) {
+ if (mad_reg_req->mgmt_class_version >= MAX_MGMT_VERSION) {
+ dev_notice(&device->dev,
+ "ib_register_mad_agent: invalid Class Version %u\n",
+ mad_reg_req->mgmt_class_version);
+ goto error1;
+ }
+ if (!recv_handler) {
+ dev_notice(&device->dev,
+ "ib_register_mad_agent: no recv_handler\n");
+ goto error1;
+ }
+ if (mad_reg_req->mgmt_class >= MAX_MGMT_CLASS) {
+ /*
+ * IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE is the only
+ * one in this range currently allowed
+ */
+ if (mad_reg_req->mgmt_class !=
+ IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+ dev_notice(&device->dev,
+ "ib_register_mad_agent: Invalid Mgmt Class 0x%x\n",
+ mad_reg_req->mgmt_class);
+ goto error1;
+ }
+ } else if (mad_reg_req->mgmt_class == 0) {
+ /*
+ * Class 0 is reserved in IBA and is used for
+ * aliasing of IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE
+ */
+ dev_notice(&device->dev,
+ "ib_register_mad_agent: Invalid Mgmt Class 0\n");
+ goto error1;
+ } else if (is_vendor_class(mad_reg_req->mgmt_class)) {
+ /*
+ * If class is in "new" vendor range,
+ * ensure supplied OUI is not zero
+ */
+ if (!is_vendor_oui(mad_reg_req->oui)) {
+ dev_notice(&device->dev,
+ "ib_register_mad_agent: No OUI specified for class 0x%x\n",
+ mad_reg_req->mgmt_class);
+ goto error1;
+ }
+ }
+ /* Make sure class supplied is consistent with RMPP */
+ if (!ib_is_mad_class_rmpp(mad_reg_req->mgmt_class)) {
+ if (rmpp_version) {
+ dev_notice(&device->dev,
+ "ib_register_mad_agent: RMPP version for non-RMPP class 0x%x\n",
+ mad_reg_req->mgmt_class);
+ goto error1;
+ }
+ }
+
+ /* Make sure class supplied is consistent with QP type */
+ if (qp_type == IB_QPT_SMI) {
+ if ((mad_reg_req->mgmt_class !=
+ IB_MGMT_CLASS_SUBN_LID_ROUTED) &&
+ (mad_reg_req->mgmt_class !=
+ IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) {
+ dev_notice(&device->dev,
+ "ib_register_mad_agent: Invalid SM QP type: class 0x%x\n",
+ mad_reg_req->mgmt_class);
+ goto error1;
+ }
+ } else {
+ if ((mad_reg_req->mgmt_class ==
+ IB_MGMT_CLASS_SUBN_LID_ROUTED) ||
+ (mad_reg_req->mgmt_class ==
+ IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) {
+ dev_notice(&device->dev,
+ "ib_register_mad_agent: Invalid GS QP type: class 0x%x\n",
+ mad_reg_req->mgmt_class);
+ goto error1;
+ }
+ }
+ } else {
+ /* No registration request supplied */
+ if (!send_handler)
+ goto error1;
+ if (registration_flags & IB_MAD_USER_RMPP)
+ goto error1;
+ }
+
+ /* Validate device and port */
+ port_priv = ib_get_mad_port(device, port_num);
+ if (!port_priv) {
+ dev_notice(&device->dev, "ib_register_mad_agent: Invalid port\n");
+ ret = ERR_PTR(-ENODEV);
+ goto error1;
+ }
+
+ /* Verify the QP requested is supported. For example, Ethernet devices
+ * will not have QP0 */
+ if (!port_priv->qp_info[qpn].qp) {
+ dev_notice(&device->dev,
+ "ib_register_mad_agent: QP %d not supported\n", qpn);
+ ret = ERR_PTR(-EPROTONOSUPPORT);
+ goto error1;
+ }
+
+ /* Allocate structures */
+ mad_agent_priv = kzalloc(sizeof *mad_agent_priv, GFP_KERNEL);
+ if (!mad_agent_priv) {
+ ret = ERR_PTR(-ENOMEM);
+ goto error1;
+ }
+
+ mad_agent_priv->agent.mr = ib_get_dma_mr(port_priv->qp_info[qpn].qp->pd,
+ IB_ACCESS_LOCAL_WRITE);
+ if (IS_ERR(mad_agent_priv->agent.mr)) {
+ ret = ERR_PTR(-ENOMEM);
+ goto error2;
+ }
+
+ /* Keep a private copy so the caller's request need not outlive the call */
+ if (mad_reg_req) {
+ reg_req = kmemdup(mad_reg_req, sizeof *reg_req, GFP_KERNEL);
+ if (!reg_req) {
+ ret = ERR_PTR(-ENOMEM);
+ goto error3;
+ }
+ }
+
+ /* Now, fill in the various structures */
+ mad_agent_priv->qp_info = &port_priv->qp_info[qpn];
+ mad_agent_priv->reg_req = reg_req;
+ mad_agent_priv->agent.rmpp_version = rmpp_version;
+ mad_agent_priv->agent.device = device;
+ mad_agent_priv->agent.recv_handler = recv_handler;
+ mad_agent_priv->agent.send_handler = send_handler;
+ mad_agent_priv->agent.context = context;
+ mad_agent_priv->agent.qp = port_priv->qp_info[qpn].qp;
+ mad_agent_priv->agent.port_num = port_num;
+ mad_agent_priv->agent.flags = registration_flags;
+ spin_lock_init(&mad_agent_priv->lock);
+ INIT_LIST_HEAD(&mad_agent_priv->send_list);
+ INIT_LIST_HEAD(&mad_agent_priv->wait_list);
+ INIT_LIST_HEAD(&mad_agent_priv->done_list);
+ INIT_LIST_HEAD(&mad_agent_priv->rmpp_list);
+ INIT_DELAYED_WORK(&mad_agent_priv->timed_work, timeout_sends);
+ INIT_LIST_HEAD(&mad_agent_priv->local_list);
+ INIT_WORK(&mad_agent_priv->local_work, local_completions);
+ /* Registration reference; dropped again in unregister_mad_agent() */
+ atomic_set(&mad_agent_priv->refcount, 1);
+ init_completion(&mad_agent_priv->comp);
+
+ spin_lock_irqsave(&port_priv->reg_lock, flags);
+ /*
+ * ib_unregister_mad_agent() treats a zero hi_tid as "snoop agent";
+ * the pre-increment yields a non-zero value here (barring u32 wrap).
+ */
+ mad_agent_priv->agent.hi_tid = ++ib_mad_client_id;
+
+ /*
+ * Make sure MAD registration (if supplied)
+ * is non overlapping with any existing ones
+ */
+ if (mad_reg_req) {
+ mgmt_class = convert_mgmt_class(mad_reg_req->mgmt_class);
+ if (!is_vendor_class(mgmt_class)) {
+ class = port_priv->version[mad_reg_req->
+ mgmt_class_version].class;
+ if (class) {
+ method = class->method_table[mgmt_class];
+ if (method) {
+ /* ret still holds -EINVAL on this path */
+ if (method_in_use(&method,
+ mad_reg_req))
+ goto error4;
+ }
+ }
+ ret2 = add_nonoui_reg_req(mad_reg_req, mad_agent_priv,
+ mgmt_class);
+ } else {
+ /* "New" vendor class range */
+ vendor = port_priv->version[mad_reg_req->
+ mgmt_class_version].vendor;
+ if (vendor) {
+ vclass = vendor_class_index(mgmt_class);
+ vendor_class = vendor->vendor_class[vclass];
+ if (vendor_class) {
+ if (is_vendor_method_in_use(
+ vendor_class,
+ mad_reg_req))
+ goto error4;
+ }
+ }
+ ret2 = add_oui_reg_req(mad_reg_req, mad_agent_priv);
+ }
+ if (ret2) {
+ ret = ERR_PTR(ret2);
+ goto error4;
+ }
+ }
+
+ /* Add mad agent into port's agent list */
+ list_add_tail(&mad_agent_priv->agent_list, &port_priv->agent_list);
+ spin_unlock_irqrestore(&port_priv->reg_lock, flags);
+
+ return &mad_agent_priv->agent;
+
+ /* Error unwinding: each label undoes one setup step, newest first */
+error4:
+ spin_unlock_irqrestore(&port_priv->reg_lock, flags);
+ kfree(reg_req);
+error3:
+ ib_dereg_mr(mad_agent_priv->agent.mr);
+error2:
+ kfree(mad_agent_priv);
+error1:
+ return ret;
+}
+EXPORT_SYMBOL(ib_register_mad_agent);
+
+/*
+ * Non-zero when @mad_snoop_flags asks for send completions to be snooped.
+ * Only IB_MAD_SNOOP_SEND_COMPLETIONS is tested; the checks for
+ * IB_MAD_SNOOP_POSTED_SENDS, IB_MAD_SNOOP_RMPP_SENDS and
+ * IB_MAD_SNOOP_RMPP_SEND_COMPLETIONS are currently disabled.
+ */
+static inline int is_snooping_sends(int mad_snoop_flags)
+{
+	return mad_snoop_flags & IB_MAD_SNOOP_SEND_COMPLETIONS;
+}
+
+/*
+ * Non-zero when @mad_snoop_flags asks for received MADs to be snooped.
+ * Only IB_MAD_SNOOP_RECVS is tested; the check for
+ * IB_MAD_SNOOP_RMPP_RECVS is currently disabled.
+ */
+static inline int is_snooping_recvs(int mad_snoop_flags)
+{
+	return mad_snoop_flags & IB_MAD_SNOOP_RECVS;
+}
+
+/*
+ * Insert @mad_snoop_priv into the snoop table of @qp_info.
+ *
+ * The first empty slot is reused; if there is none the table is grown by
+ * one entry.  The whole operation runs under qp_info->snoop_lock, hence
+ * the GFP_ATOMIC allocation.
+ *
+ * Returns the slot index on success or -ENOMEM if the table cannot grow.
+ */
+static int register_snoop_agent(struct ib_mad_qp_info *qp_info,
+				struct ib_mad_snoop_private *mad_snoop_priv)
+{
+	struct ib_mad_snoop_private **grown;
+	unsigned long irq_flags;
+	int slot = 0;
+
+	spin_lock_irqsave(&qp_info->snoop_lock, irq_flags);
+
+	/* Find the first unused entry, if any. */
+	while (slot < qp_info->snoop_table_size && qp_info->snoop_table[slot])
+		slot++;
+
+	if (slot == qp_info->snoop_table_size) {
+		/* Table is full: extend it by a single entry. */
+		grown = krealloc(qp_info->snoop_table,
+				 sizeof mad_snoop_priv *
+				 (qp_info->snoop_table_size + 1),
+				 GFP_ATOMIC);
+		if (!grown) {
+			slot = -ENOMEM;
+			goto unlock;
+		}
+
+		qp_info->snoop_table = grown;
+		qp_info->snoop_table_size++;
+	}
+
+	qp_info->snoop_table[slot] = mad_snoop_priv;
+	atomic_inc(&qp_info->snoop_count);
+unlock:
+	spin_unlock_irqrestore(&qp_info->snoop_lock, irq_flags);
+	return slot;
+}
+
+/*
+ * ib_register_mad_snoop - Register a snooping agent on a special QP
+ * @device: Device to snoop on
+ * @port_num: Port of @device
+ * @qp_type: IB_QPT_SMI or IB_QPT_GSI
+ * @mad_snoop_flags: Which events to snoop
+ * @snoop_handler: Called for snooped sends; required if sends are snooped
+ * @recv_handler: Called for snooped receives; required if receives are
+ *	snooped
+ * @context: Stored in the returned agent
+ *
+ * The agent's hi_tid stays zero, which is how ib_unregister_mad_agent()
+ * recognises a snoop agent.
+ *
+ * Returns the new agent, or ERR_PTR(-EINVAL) for bad parameters,
+ * ERR_PTR(-ENODEV) when no MAD port matches, or ERR_PTR(-ENOMEM) when
+ * the agent or its snoop table slot cannot be allocated.
+ */
+struct ib_mad_agent *ib_register_mad_snoop(struct ib_device *device,
+					   u8 port_num,
+					   enum ib_qp_type qp_type,
+					   int mad_snoop_flags,
+					   ib_mad_snoop_handler snoop_handler,
+					   ib_mad_recv_handler recv_handler,
+					   void *context)
+{
+	struct ib_mad_port_private *port_priv;
+	struct ib_mad_agent *ret;
+	struct ib_mad_snoop_private *mad_snoop_priv;
+	int qpn;
+
+	/* Validate parameters */
+	if ((is_snooping_sends(mad_snoop_flags) && !snoop_handler) ||
+	    (is_snooping_recvs(mad_snoop_flags) && !recv_handler)) {
+		ret = ERR_PTR(-EINVAL);
+		goto error1;
+	}
+	qpn = get_spl_qp_index(qp_type);
+	if (qpn == -1) {
+		ret = ERR_PTR(-EINVAL);
+		goto error1;
+	}
+	port_priv = ib_get_mad_port(device, port_num);
+	if (!port_priv) {
+		ret = ERR_PTR(-ENODEV);
+		goto error1;
+	}
+	/* Allocate structures */
+	mad_snoop_priv = kzalloc(sizeof *mad_snoop_priv, GFP_KERNEL);
+	if (!mad_snoop_priv) {
+		ret = ERR_PTR(-ENOMEM);
+		goto error1;
+	}
+
+	/* Now, fill in the various structures */
+	mad_snoop_priv->qp_info = &port_priv->qp_info[qpn];
+	mad_snoop_priv->agent.device = device;
+	mad_snoop_priv->agent.recv_handler = recv_handler;
+	mad_snoop_priv->agent.snoop_handler = snoop_handler;
+	mad_snoop_priv->agent.context = context;
+	mad_snoop_priv->agent.qp = port_priv->qp_info[qpn].qp;
+	mad_snoop_priv->agent.port_num = port_num;
+	mad_snoop_priv->mad_snoop_flags = mad_snoop_flags;
+	init_completion(&mad_snoop_priv->comp);
+
+	/*
+	 * Take the registration reference BEFORE the agent becomes visible
+	 * in the snoop table.  snoop_send()/snoop_recv() do an inc/dec pair
+	 * on the refcount of every table entry; if they ran against a
+	 * still-zero count, the dec would hit zero and complete ->comp
+	 * early, letting unregister_mad_snoop() free the agent while a
+	 * handler may still be running.
+	 */
+	atomic_set(&mad_snoop_priv->refcount, 1);
+
+	mad_snoop_priv->snoop_index = register_snoop_agent(
+						&port_priv->qp_info[qpn],
+						mad_snoop_priv);
+	if (mad_snoop_priv->snoop_index < 0) {
+		ret = ERR_PTR(mad_snoop_priv->snoop_index);
+		goto error2;
+	}
+
+	return &mad_snoop_priv->agent;
+
+error2:
+	kfree(mad_snoop_priv);
+error1:
+	return ret;
+}
+EXPORT_SYMBOL(ib_register_mad_snoop);
+
+/* Drop one reference on the agent; the final put signals its completion. */
+static inline void deref_mad_agent(struct ib_mad_agent_private *mad_agent_priv)
+{
+	if (!atomic_dec_and_test(&mad_agent_priv->refcount))
+		return;
+	complete(&mad_agent_priv->comp);
+}
+
+/* Drop one reference on the snoop agent; the final put signals its completion. */
+static inline void deref_snoop_agent(struct ib_mad_snoop_private *mad_snoop_priv)
+{
+	if (!atomic_dec_and_test(&mad_snoop_priv->refcount))
+		return;
+	complete(&mad_snoop_priv->comp);
+}
+
+/*
+ * Tear down a regular (non-snoop) MAD agent: cancel its outstanding
+ * sends and timeout work, remove its registration and list entry under
+ * the port's reg_lock, flush the port workqueue, cancel RMPP receives,
+ * then drop the registration reference and wait until every other
+ * reference is gone before freeing the agent's resources.
+ */
+static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv)
+{
+ struct ib_mad_port_private *port_priv;
+ unsigned long flags;
+
+ /* Note that we could still be handling received MADs */
+
+ /*
+ * Canceling all sends results in dropping received response
+ * MADs, preventing us from queuing additional work
+ */
+ cancel_mads(mad_agent_priv);
+ port_priv = mad_agent_priv->qp_info->port_priv;
+ cancel_delayed_work(&mad_agent_priv->timed_work);
+
+ spin_lock_irqsave(&port_priv->reg_lock, flags);
+ remove_mad_reg_req(mad_agent_priv);
+ list_del(&mad_agent_priv->agent_list);
+ spin_unlock_irqrestore(&port_priv->reg_lock, flags);
+
+ flush_workqueue(port_priv->wq);
+ ib_cancel_rmpp_recvs(mad_agent_priv);
+
+ /*
+ * Drop the reference taken at registration; ->comp is completed by
+ * whichever deref_mad_agent() call brings the count to zero.
+ */
+ deref_mad_agent(mad_agent_priv);
+ wait_for_completion(&mad_agent_priv->comp);
+
+ kfree(mad_agent_priv->reg_req);
+ ib_dereg_mr(mad_agent_priv->agent.mr);
+ kfree(mad_agent_priv);
+}
+
+/*
+ * Tear down a snoop agent: clear its snoop table slot under snoop_lock
+ * so that no new snoop callback can pick it up, drop one reference, and
+ * wait for the completion signalled by the last deref_snoop_agent()
+ * before freeing it.
+ */
+static void unregister_mad_snoop(struct ib_mad_snoop_private *mad_snoop_priv)
+{
+ struct ib_mad_qp_info *qp_info;
+ unsigned long flags;
+
+ qp_info = mad_snoop_priv->qp_info;
+ spin_lock_irqsave(&qp_info->snoop_lock, flags);
+ /* The slot is left in place (NULL) for reuse by register_snoop_agent() */
+ qp_info->snoop_table[mad_snoop_priv->snoop_index] = NULL;
+ atomic_dec(&qp_info->snoop_count);
+ spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+
+ deref_snoop_agent(mad_snoop_priv);
+ wait_for_completion(&mad_snoop_priv->comp);
+
+ kfree(mad_snoop_priv);
+}
+
+/*
+ * ib_unregister_mad_agent - Unregisters a client from using MAD services
+ *
+ * Dispatches on hi_tid: a zero value identifies a snoop-only agent, any
+ * other value a regular agent.  Always returns 0.
+ */
+int ib_unregister_mad_agent(struct ib_mad_agent *mad_agent)
+{
+	/* If the TID is zero, the agent can only snoop. */
+	if (!mad_agent->hi_tid) {
+		unregister_mad_snoop(container_of(mad_agent,
+						  struct ib_mad_snoop_private,
+						  agent));
+		return 0;
+	}
+
+	unregister_mad_agent(container_of(mad_agent,
+					  struct ib_mad_agent_private,
+					  agent));
+	return 0;
+}
+EXPORT_SYMBOL(ib_unregister_mad_agent);
+
+/*
+ * Unlink @mad_list from the queue it is on and decrement that queue's
+ * count, under the queue lock.  BUGs if the entry has no queue.
+ */
+static void dequeue_mad(struct ib_mad_list_head *mad_list)
+{
+	struct ib_mad_queue *queue = mad_list->mad_queue;
+	unsigned long irq_flags;
+
+	BUG_ON(!queue);
+
+	spin_lock_irqsave(&queue->lock, irq_flags);
+	list_del(&mad_list->list);
+	queue->count--;
+	spin_unlock_irqrestore(&queue->lock, irq_flags);
+}
+
+/*
+ * Call the snoop_handler of every snoop agent registered on @qp_info
+ * whose flags intersect @mad_snoop_flags, passing @send_buf and
+ * @mad_send_wc.
+ */
+static void snoop_send(struct ib_mad_qp_info *qp_info,
+ struct ib_mad_send_buf *send_buf,
+ struct ib_mad_send_wc *mad_send_wc,
+ int mad_snoop_flags)
+{
+ struct ib_mad_snoop_private *mad_snoop_priv;
+ unsigned long flags;
+ int i;
+
+ spin_lock_irqsave(&qp_info->snoop_lock, flags);
+ for (i = 0; i < qp_info->snoop_table_size; i++) {
+ mad_snoop_priv = qp_info->snoop_table[i];
+ if (!mad_snoop_priv ||
+ !(mad_snoop_priv->mad_snoop_flags & mad_snoop_flags))
+ continue;
+
+ /*
+ * Hold a reference while the lock is dropped so the agent
+ * cannot be freed during the callback.
+ */
+ atomic_inc(&mad_snoop_priv->refcount);
+ spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+ mad_snoop_priv->agent.snoop_handler(&mad_snoop_priv->agent,
+ send_buf, mad_send_wc);
+ deref_snoop_agent(mad_snoop_priv);
+ spin_lock_irqsave(&qp_info->snoop_lock, flags);
+ }
+ spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+}
+
+/*
+ * Call the recv_handler of every snoop agent registered on @qp_info
+ * whose flags intersect @mad_snoop_flags, passing @mad_recv_wc.
+ */
+static void snoop_recv(struct ib_mad_qp_info *qp_info,
+ struct ib_mad_recv_wc *mad_recv_wc,
+ int mad_snoop_flags)
+{
+ struct ib_mad_snoop_private *mad_snoop_priv;
+ unsigned long flags;
+ int i;
+
+ spin_lock_irqsave(&qp_info->snoop_lock, flags);
+ for (i = 0; i < qp_info->snoop_table_size; i++) {
+ mad_snoop_priv = qp_info->snoop_table[i];
+ if (!mad_snoop_priv ||
+ !(mad_snoop_priv->mad_snoop_flags & mad_snoop_flags))
+ continue;
+
+ /*
+ * Hold a reference while the lock is dropped so the agent
+ * cannot be freed during the callback.
+ */
+ atomic_inc(&mad_snoop_priv->refcount);
+ spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+ mad_snoop_priv->agent.recv_handler(&mad_snoop_priv->agent,
+ mad_recv_wc);
+ deref_snoop_agent(mad_snoop_priv);
+ spin_lock_irqsave(&qp_info->snoop_lock, flags);
+ }
+ spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+}
+
+/*
+ * Fill @wc with a synthetic successful receive completion for an SMP:
+ * source QP is IB_QP0, the length covers one MAD plus a GRH, and the
+ * remaining fields come from the arguments or are zero.
+ */
+static void build_smp_wc(struct ib_qp *qp,
+			 u64 wr_id, u16 slid, u16 pkey_index, u8 port_num,
+			 struct ib_wc *wc)
+{
+	memset(wc, 0, sizeof *wc);
+
+	/* Completion outcome */
+	wc->status = IB_WC_SUCCESS;
+	wc->opcode = IB_WC_RECV;
+	wc->byte_len = sizeof(struct ib_mad) + sizeof(struct ib_grh);
+
+	/* Values handed in by the caller */
+	wc->wr_id = wr_id;
+	wc->qp = qp;
+	wc->port_num = port_num;
+	wc->pkey_index = pkey_index;
+	wc->slid = slid;
+
+	/* Fixed values */
+	wc->src_qp = IB_QP0;
+	wc->sl = 0;
+	wc->dlid_path_bits = 0;
+}
+
+/*
+ * Decide whether an outgoing SMP must go out on the wire or can be
+ * processed locally; in the local case hand it to the device's
+ * process_mad() and queue a local completion on the agent.
+ *
+ * Return 0 if SMP is to be sent
+ * Return 1 if SMP was consumed locally (whether or not solicited)
+ * Return < 0 if error
+ */
+static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
+ struct ib_mad_send_wr_private *mad_send_wr)
+{
+ int ret = 0;
+ struct ib_smp *smp = mad_send_wr->send_buf.mad;
+ unsigned long flags;
+ struct ib_mad_local_private *local;
+ struct ib_mad_private *mad_priv;
+ struct ib_mad_port_private *port_priv;
+ struct ib_mad_agent_private *recv_mad_agent = NULL;
+ struct ib_device *device = mad_agent_priv->agent.device;
+ u8 port_num;
+ struct ib_wc mad_wc;
+ struct ib_send_wr *send_wr = &mad_send_wr->send_wr;
+
+ /* On a switch, a directed-route SMP uses the port named in the WR */
+ if (device->node_type == RDMA_NODE_IB_SWITCH &&
+ smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+ port_num = send_wr->wr.ud.port_num;
+ else
+ port_num = mad_agent_priv->agent.port_num;
+
+ /*
+ * Directed route handling starts if the initial LID routed part of
+ * a request or the ending LID routed part of a response is empty.
+ * If we are at the start of the LID routed part, don't update the
+ * hop_ptr or hop_cnt. See section 14.2.2, Vol 1 IB spec.
+ */
+ if ((ib_get_smp_direction(smp) ? smp->dr_dlid : smp->dr_slid) ==
+ IB_LID_PERMISSIVE &&
+ smi_handle_dr_smp_send(smp, device->node_type, port_num) ==
+ IB_SMI_DISCARD) {
+ ret = -EINVAL;
+ dev_err(&device->dev, "Invalid directed route\n");
+ goto out;
+ }
+
+ /* Check to post send on QP or process locally */
+ if (smi_check_local_smp(smp, device) == IB_SMI_DISCARD &&
+ smi_check_local_returning_smp(smp, device) == IB_SMI_DISCARD)
+ goto out;
+
+ /* From here on the SMP is handled locally; ret is still 0 above */
+ local = kmalloc(sizeof *local, GFP_ATOMIC);
+ if (!local) {
+ ret = -ENOMEM;
+ dev_err(&device->dev, "No memory for ib_mad_local_private\n");
+ goto out;
+ }
+ local->mad_priv = NULL;
+ local->recv_mad_agent = NULL;
+ mad_priv = kmem_cache_alloc(ib_mad_cache, GFP_ATOMIC);
+ if (!mad_priv) {
+ ret = -ENOMEM;
+ dev_err(&device->dev, "No memory for local response MAD\n");
+ kfree(local);
+ goto out;
+ }
+
+ build_smp_wc(mad_agent_priv->agent.qp,
+ send_wr->wr_id, be16_to_cpu(smp->dr_slid),
+ send_wr->wr.ud.pkey_index,
+ send_wr->wr.ud.port_num, &mad_wc);
+
+ /* No GRH for DR SMP */
+ ret = device->process_mad(device, 0, port_num, &mad_wc, NULL,
+ (struct ib_mad *)smp,
+ (struct ib_mad *)&mad_priv->mad);
+ switch (ret)
+ {
+ case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY:
+ /* Deliver the reply to the sender only if it can receive it */
+ if (ib_response_mad(&mad_priv->mad.mad) &&
+ mad_agent_priv->agent.recv_handler) {
+ local->mad_priv = mad_priv;
+ local->recv_mad_agent = mad_agent_priv;
+ /*
+ * Reference MAD agent until receive
+ * side of local completion handled
+ */
+ atomic_inc(&mad_agent_priv->refcount);
+ } else
+ kmem_cache_free(ib_mad_cache, mad_priv);
+ break;
+ case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED:
+ kmem_cache_free(ib_mad_cache, mad_priv);
+ break;
+ case IB_MAD_RESULT_SUCCESS:
+ /* Treat like an incoming receive MAD */
+ port_priv = ib_get_mad_port(mad_agent_priv->agent.device,
+ mad_agent_priv->agent.port_num);
+ if (port_priv) {
+ memcpy(&mad_priv->mad.mad, smp, sizeof(struct ib_mad));
+ recv_mad_agent = find_mad_agent(port_priv,
+ &mad_priv->mad.mad);
+ }
+ if (!port_priv || !recv_mad_agent) {
+ /*
+ * No receiving agent so drop packet and
+ * generate send completion.
+ */
+ kmem_cache_free(ib_mad_cache, mad_priv);
+ break;
+ }
+ /*
+ * NOTE(review): no refcount is taken here for recv_mad_agent,
+ * unlike the REPLY case above - presumably find_mad_agent()
+ * returns an already referenced agent; confirm.
+ */
+ local->mad_priv = mad_priv;
+ local->recv_mad_agent = recv_mad_agent;
+ break;
+ default:
+ kmem_cache_free(ib_mad_cache, mad_priv);
+ kfree(local);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ local->mad_send_wr = mad_send_wr;
+ /* Reference MAD agent until send side of local completion handled */
+ atomic_inc(&mad_agent_priv->refcount);
+ /* Queue local completion to local list */
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ list_add_tail(&local->completion_list, &mad_agent_priv->local_list);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ queue_work(mad_agent_priv->qp_info->port_priv->wq,
+ &mad_agent_priv->local_work);
+ ret = 1;
+out:
+ return ret;
+}
+
+/*
+ * Number of padding bytes needed to extend @data_len to a whole number
+ * of segments, where a segment is sizeof(struct ib_mad) - @hdr_len bytes.
+ * When @data_len or the segment size is zero, the segment size itself is
+ * returned.
+ */
+static int get_pad_size(int hdr_len, int data_len)
+{
+	int seg_size = sizeof(struct ib_mad) - hdr_len;
+	int rem;
+
+	if (!data_len || !seg_size)
+		return seg_size;
+
+	rem = data_len % seg_size;
+	return rem ? seg_size - rem : 0;
+}
+
+/* Unlink and free every segment on the send work request's rmpp_list. */
+static void free_send_rmpp_list(struct ib_mad_send_wr_private *mad_send_wr)
+{
+	struct ib_rmpp_segment *seg, *next;
+
+	list_for_each_entry_safe(seg, next, &mad_send_wr->rmpp_list, list) {
+		list_del(&seg->list);
+		kfree(seg);
+	}
+}
+
+/*
+ * alloc_send_rmpp_list - Allocate the data segments of an RMPP send
+ * @send_wr: Send work request whose rmpp_list is to be populated
+ * @gfp_mask: Allocation flags for the segments
+ *
+ * Allocates enough segments of sizeof(struct ib_mad) - hdr_len bytes to
+ * hold data_len plus padding, numbers them from 1, zeroes the padding at
+ * the end of the last segment, initialises the RMPP header and points
+ * cur_seg/last_ack_seg at the first segment.
+ *
+ * Returns 0 on success, -EINVAL if the header leaves no room for data or
+ * no segment is required, or -ENOMEM if a segment cannot be allocated
+ * (segments allocated so far are freed).
+ */
+static int alloc_send_rmpp_list(struct ib_mad_send_wr_private *send_wr,
+				gfp_t gfp_mask)
+{
+	struct ib_mad_send_buf *send_buf = &send_wr->send_buf;
+	struct ib_rmpp_mad *rmpp_mad = send_buf->mad;
+	struct ib_rmpp_segment *seg = NULL;
+	int left, seg_size, pad;
+
+	send_buf->seg_size = sizeof (struct ib_mad) - send_buf->hdr_len;
+	seg_size = send_buf->seg_size;
+	pad = send_wr->pad;
+
+	/*
+	 * A header that fills (or exceeds) the whole MAD leaves no room for
+	 * data: the loop below would never make progress and would keep
+	 * allocating until memory is exhausted.
+	 */
+	if (seg_size <= 0)
+		return -EINVAL;
+
+	/* Allocate data segments. */
+	for (left = send_buf->data_len + pad; left > 0; left -= seg_size) {
+		seg = kmalloc(sizeof (*seg) + seg_size, gfp_mask);
+		if (!seg) {
+			dev_err(&send_buf->mad_agent->device->dev,
+				"alloc_send_rmpp_segs: RMPP mem alloc failed for len %zd, gfp %#x\n",
+				sizeof (*seg) + seg_size, gfp_mask);
+			free_send_rmpp_list(send_wr);
+			return -ENOMEM;
+		}
+		seg->num = ++send_buf->seg_count;
+		list_add_tail(&seg->list, &send_wr->rmpp_list);
+	}
+
+	/*
+	 * Without at least one segment there is nothing to pad and cur_seg
+	 * below would be derived from the bare list head.
+	 */
+	if (!seg)
+		return -EINVAL;
+
+	/* Zero any padding */
+	if (pad)
+		memset(seg->data + seg_size - pad, 0, pad);
+
+	rmpp_mad->rmpp_hdr.rmpp_version = send_wr->mad_agent_priv->
+					  agent.rmpp_version;
+	rmpp_mad->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_DATA;
+	ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE);
+
+	send_wr->cur_seg = container_of(send_wr->rmpp_list.next,
+					struct ib_rmpp_segment, list);
+	send_wr->last_ack_seg = send_wr->cur_seg;
+	return 0;
+}
+
+/*
+ * Returns 1 when @agent has an RMPP version set and its flags do not
+ * include IB_MAD_USER_RMPP, 0 otherwise.
+ */
+int ib_mad_kernel_rmpp_agent(struct ib_mad_agent *agent)
+{
+	if (!agent->rmpp_version)
+		return 0;
+	return !(agent->flags & IB_MAD_USER_RMPP);
+}
+EXPORT_SYMBOL(ib_mad_kernel_rmpp_agent);
+
+/*
+ * ib_create_send_mad - Allocate and initialise a send buffer for an agent
+ *
+ * One allocation holds the MAD (only its header when @rmpp_active)
+ * followed by the private send work request; RMPP data segments are
+ * allocated separately by alloc_send_rmpp_list().  On success a
+ * reference on the agent is taken, which ib_free_send_mad() drops.
+ *
+ * Returns the send buffer, or ERR_PTR(-EINVAL) when the message size or
+ * @rmpp_active is inconsistent with the agent's RMPP mode,
+ * ERR_PTR(-ENOMEM) on allocation failure, or the error from
+ * alloc_send_rmpp_list().
+ */
+struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent,
+ u32 remote_qpn, u16 pkey_index,
+ int rmpp_active,
+ int hdr_len, int data_len,
+ gfp_t gfp_mask)
+{
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_send_wr_private *mad_send_wr;
+ int pad, message_size, ret, size;
+ void *buf;
+
+ mad_agent_priv = container_of(mad_agent, struct ib_mad_agent_private,
+ agent);
+ pad = get_pad_size(hdr_len, data_len);
+ message_size = hdr_len + data_len + pad;
+
+ /*
+ * Kernel-RMPP agents may exceed one MAD only with rmpp_active set;
+ * all other agents may neither set rmpp_active nor exceed one MAD.
+ */
+ if (ib_mad_kernel_rmpp_agent(mad_agent)) {
+ if (!rmpp_active && message_size > sizeof(struct ib_mad))
+ return ERR_PTR(-EINVAL);
+ } else
+ if (rmpp_active || message_size > sizeof(struct ib_mad))
+ return ERR_PTR(-EINVAL);
+
+ size = rmpp_active ? hdr_len : sizeof(struct ib_mad);
+ buf = kzalloc(sizeof *mad_send_wr + size, gfp_mask);
+ if (!buf)
+ return ERR_PTR(-ENOMEM);
+
+ /* The work request lives directly behind the MAD data in buf */
+ mad_send_wr = buf + size;
+ INIT_LIST_HEAD(&mad_send_wr->rmpp_list);
+ mad_send_wr->send_buf.mad = buf;
+ mad_send_wr->send_buf.hdr_len = hdr_len;
+ mad_send_wr->send_buf.data_len = data_len;
+ mad_send_wr->pad = pad;
+
+ mad_send_wr->mad_agent_priv = mad_agent_priv;
+ /* Two gather entries: [0] header, [1] rest of the MAD */
+ mad_send_wr->sg_list[0].length = hdr_len;
+ mad_send_wr->sg_list[0].lkey = mad_agent->mr->lkey;
+ mad_send_wr->sg_list[1].length = sizeof(struct ib_mad) - hdr_len;
+ mad_send_wr->sg_list[1].lkey = mad_agent->mr->lkey;
+
+ mad_send_wr->send_wr.wr_id = (unsigned long) mad_send_wr;
+ mad_send_wr->send_wr.sg_list = mad_send_wr->sg_list;
+ mad_send_wr->send_wr.num_sge = 2;
+ mad_send_wr->send_wr.opcode = IB_WR_SEND;
+ mad_send_wr->send_wr.send_flags = IB_SEND_SIGNALED;
+ mad_send_wr->send_wr.wr.ud.remote_qpn = remote_qpn;
+ mad_send_wr->send_wr.wr.ud.remote_qkey = IB_QP_SET_QKEY;
+ mad_send_wr->send_wr.wr.ud.pkey_index = pkey_index;
+
+ if (rmpp_active) {
+ ret = alloc_send_rmpp_list(mad_send_wr, gfp_mask);
+ if (ret) {
+ kfree(buf);
+ return ERR_PTR(ret);
+ }
+ }
+
+ mad_send_wr->send_buf.mad_agent = mad_agent;
+ atomic_inc(&mad_agent_priv->refcount);
+ return &mad_send_wr->send_buf;
+}
+EXPORT_SYMBOL(ib_create_send_mad);
+
+/*
+ * ib_get_mad_data_offset - Header length used for a management class
+ *
+ * Returns IB_MGMT_SA_HDR for IB_MGMT_CLASS_SUBN_ADM, IB_MGMT_DEVICE_HDR
+ * for the device management, device administration and BIS classes,
+ * IB_MGMT_VENDOR_HDR for classes in vendor range 2, and IB_MGMT_MAD_HDR
+ * for everything else.
+ */
+int ib_get_mad_data_offset(u8 mgmt_class)
+{
+	switch (mgmt_class) {
+	case IB_MGMT_CLASS_SUBN_ADM:
+		return IB_MGMT_SA_HDR;
+	case IB_MGMT_CLASS_DEVICE_MGMT:
+	case IB_MGMT_CLASS_DEVICE_ADM:
+	case IB_MGMT_CLASS_BIS:
+		return IB_MGMT_DEVICE_HDR;
+	default:
+		break;
+	}
+
+	if (mgmt_class >= IB_MGMT_CLASS_VENDOR_RANGE2_START &&
+	    mgmt_class <= IB_MGMT_CLASS_VENDOR_RANGE2_END)
+		return IB_MGMT_VENDOR_HDR;
+
+	return IB_MGMT_MAD_HDR;
+}
+EXPORT_SYMBOL(ib_get_mad_data_offset);
+
+/*
+ * ib_is_mad_class_rmpp - Tell whether a management class is treated as
+ * an RMPP class
+ *
+ * Returns 1 for IB_MGMT_CLASS_SUBN_ADM, the device management, device
+ * administration and BIS classes, and any class in vendor range 2;
+ * 0 otherwise.
+ */
+int ib_is_mad_class_rmpp(u8 mgmt_class)
+{
+	switch (mgmt_class) {
+	case IB_MGMT_CLASS_SUBN_ADM:
+	case IB_MGMT_CLASS_DEVICE_MGMT:
+	case IB_MGMT_CLASS_DEVICE_ADM:
+	case IB_MGMT_CLASS_BIS:
+		return 1;
+	default:
+		break;
+	}
+
+	return mgmt_class >= IB_MGMT_CLASS_VENDOR_RANGE2_START &&
+	       mgmt_class <= IB_MGMT_CLASS_VENDOR_RANGE2_END;
+}
+EXPORT_SYMBOL(ib_is_mad_class_rmpp);
+
+/*
+ * ib_get_rmpp_segment - Return the data area of RMPP segment @seg_num
+ *
+ * cur_seg acts as a cursor: the search starts at the current segment
+ * and walks forward or backward depending on how @seg_num compares with
+ * the cursor's number, leaving the cursor on the segment found.
+ *
+ * NOTE(review): if no segment carries @seg_num the loop runs to
+ * completion and cur_seg is left on whatever entry the iterator ended
+ * at - callers presumably only pass existing segment numbers; confirm.
+ */
+void *ib_get_rmpp_segment(struct ib_mad_send_buf *send_buf, int seg_num)
+{
+ struct ib_mad_send_wr_private *mad_send_wr;
+ struct list_head *list;
+
+ mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private,
+ send_buf);
+ /* Iterate relative to the current segment, not the list head */
+ list = &mad_send_wr->cur_seg->list;
+
+ if (mad_send_wr->cur_seg->num < seg_num) {
+ list_for_each_entry(mad_send_wr->cur_seg, list, list)
+ if (mad_send_wr->cur_seg->num == seg_num)
+ break;
+ } else if (mad_send_wr->cur_seg->num > seg_num) {
+ list_for_each_entry_reverse(mad_send_wr->cur_seg, list, list)
+ if (mad_send_wr->cur_seg->num == seg_num)
+ break;
+ }
+ return mad_send_wr->cur_seg->data;
+}
+EXPORT_SYMBOL(ib_get_rmpp_segment);
+
+/*
+ * Return the payload to transmit for a send work request: the data of
+ * segment seg_num when the buffer has RMPP segments, otherwise the bytes
+ * that follow the header inside the MAD buffer.
+ */
+static inline void *ib_get_payload(struct ib_mad_send_wr_private *mad_send_wr)
+{
+	struct ib_mad_send_buf *send_buf = &mad_send_wr->send_buf;
+
+	if (!send_buf->seg_count)
+		return send_buf->mad + send_buf->hdr_len;
+
+	return ib_get_rmpp_segment(send_buf, mad_send_wr->seg_num);
+}
+
+/*
+ * ib_free_send_mad - Release a send buffer from ib_create_send_mad()
+ *
+ * Frees the RMPP segments and the MAD allocation, then drops the agent
+ * reference that ib_create_send_mad() took.
+ */
+void ib_free_send_mad(struct ib_mad_send_buf *send_buf)
+{
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_send_wr_private *mad_send_wr;
+
+ /* Fetch the agent first: send_buf itself is freed below */
+ mad_agent_priv = container_of(send_buf->mad_agent,
+ struct ib_mad_agent_private, agent);
+ mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private,
+ send_buf);
+
+ free_send_rmpp_list(mad_send_wr);
+ /*
+ * ib_create_send_mad() places the work request (and thus send_buf)
+ * inside the same allocation as the MAD, so this frees both.
+ */
+ kfree(send_buf->mad);
+ deref_mad_agent(mad_agent_priv);
+}
+EXPORT_SYMBOL(ib_free_send_mad);
+
+/*
+ * ib_send_mad - DMA-map a send work request and post or queue it
+ *
+ * Maps the header and payload for DMA, then either posts the work
+ * request to the QP or, when the send queue already holds max_active
+ * entries, parks it on the overflow list.  The queue count is increased
+ * in both cases.
+ *
+ * Returns 0 on success, -ENOMEM if a DMA mapping fails, or the error
+ * from ib_post_send(); on failure both mappings are released.
+ */
+int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ struct ib_mad_qp_info *qp_info;
+ struct list_head *list;
+ struct ib_send_wr *bad_send_wr;
+ struct ib_mad_agent *mad_agent;
+ struct ib_sge *sge;
+ unsigned long flags;
+ int ret;
+
+ /* Set WR ID to find mad_send_wr upon completion */
+ qp_info = mad_send_wr->mad_agent_priv->qp_info;
+ mad_send_wr->send_wr.wr_id = (unsigned long)&mad_send_wr->mad_list;
+ mad_send_wr->mad_list.mad_queue = &qp_info->send_queue;
+
+ mad_agent = mad_send_wr->send_buf.mad_agent;
+ sge = mad_send_wr->sg_list;
+ /* sge[0] covers the MAD header */
+ sge[0].addr = ib_dma_map_single(mad_agent->device,
+ mad_send_wr->send_buf.mad,
+ sge[0].length,
+ DMA_TO_DEVICE);
+ if (unlikely(ib_dma_mapping_error(mad_agent->device, sge[0].addr)))
+ return -ENOMEM;
+
+ mad_send_wr->header_mapping = sge[0].addr;
+
+ /* sge[1] covers the payload returned by ib_get_payload() */
+ sge[1].addr = ib_dma_map_single(mad_agent->device,
+ ib_get_payload(mad_send_wr),
+ sge[1].length,
+ DMA_TO_DEVICE);
+ if (unlikely(ib_dma_mapping_error(mad_agent->device, sge[1].addr))) {
+ ib_dma_unmap_single(mad_agent->device,
+ mad_send_wr->header_mapping,
+ sge[0].length, DMA_TO_DEVICE);
+ return -ENOMEM;
+ }
+ mad_send_wr->payload_mapping = sge[1].addr;
+
+ spin_lock_irqsave(&qp_info->send_queue.lock, flags);
+ if (qp_info->send_queue.count < qp_info->send_queue.max_active) {
+ ret = ib_post_send(mad_agent->qp, &mad_send_wr->send_wr,
+ &bad_send_wr);
+ list = &qp_info->send_queue.list;
+ } else {
+ /* Queue is full: defer the post by using the overflow list */
+ ret = 0;
+ list = &qp_info->overflow_list;
+ }
+
+ if (!ret) {
+ qp_info->send_queue.count++;
+ list_add_tail(&mad_send_wr->mad_list.list, list);
+ }
+ spin_unlock_irqrestore(&qp_info->send_queue.lock, flags);
+ if (ret) {
+ ib_dma_unmap_single(mad_agent->device,
+ mad_send_wr->header_mapping,
+ sge[0].length, DMA_TO_DEVICE);
+ ib_dma_unmap_single(mad_agent->device,
+ mad_send_wr->payload_mapping,
+ sge[1].length, DMA_TO_DEVICE);
+ }
+ return ret;
+}
+
+/*
+ * ib_post_send_mad - Posts MAD(s) to the send queue of the QP associated
+ * with the registered client
+ */
+int ib_post_send_mad(struct ib_mad_send_buf *send_buf,
+ struct ib_mad_send_buf **bad_send_buf)
+{
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_send_buf *next_send_buf;
+ struct ib_mad_send_wr_private *mad_send_wr;
+ unsigned long flags;
+ int ret = -EINVAL;
+
+ /* Walk list of send WRs and post each on send list */
+ for (; send_buf; send_buf = next_send_buf) {
+
+ mad_send_wr = container_of(send_buf,
+ struct ib_mad_send_wr_private,
+ send_buf);
+ mad_agent_priv = mad_send_wr->mad_agent_priv;
+
+ if (!send_buf->mad_agent->send_handler ||
+ (send_buf->timeout_ms &&
+ !send_buf->mad_agent->recv_handler)) {
+ ret = -EINVAL;
+ goto error;
+ }
+
+ if (!ib_is_mad_class_rmpp(((struct ib_mad_hdr *) send_buf->mad)->mgmt_class)) {
+ if (mad_agent_priv->agent.rmpp_version) {
+ ret = -EINVAL;
+ goto error;
+ }
+ }
+
+ /*
+ * Save pointer to next work request to post in case the
+ * current one completes, and the user modifies the work
+ * request associated with the completion
+ */
+ next_send_buf = send_buf->next;
+ mad_send_wr->send_wr.wr.ud.ah = send_buf->ah;
+
+ if (((struct ib_mad_hdr *) send_buf->mad)->mgmt_class ==
+ IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+ ret = handle_outgoing_dr_smp(mad_agent_priv,
+ mad_send_wr);
+ if (ret < 0) /* error */
+ goto error;
+ else if (ret == 1) /* locally consumed */
+ continue;
+ }
+
+ mad_send_wr->tid = ((struct ib_mad_hdr *) send_buf->mad)->tid;
+ /* Timeout will be updated after send completes */
+ mad_send_wr->timeout = msecs_to_jiffies(send_buf->timeout_ms);
+ mad_send_wr->max_retries = send_buf->retries;
+ mad_send_wr->retries_left = send_buf->retries;
+ send_buf->retries = 0;
+ /* Reference for work request to QP + response */
+ mad_send_wr->refcount = 1 + (mad_send_wr->timeout > 0);
+ mad_send_wr->status = IB_WC_SUCCESS;
+
+ /* Reference MAD agent until send completes */
+ atomic_inc(&mad_agent_priv->refcount);
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ list_add_tail(&mad_send_wr->agent_list,
+ &mad_agent_priv->send_list);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+ if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) {
+ ret = ib_send_rmpp_mad(mad_send_wr);
+ if (ret >= 0 && ret != IB_RMPP_RESULT_CONSUMED)
+ ret = ib_send_mad(mad_send_wr);
+ } else
+ ret = ib_send_mad(mad_send_wr);
+ if (ret < 0) {
+ /* Fail send request */
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ list_del(&mad_send_wr->agent_list);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ atomic_dec(&mad_agent_priv->refcount);
+ goto error;
+ }
+ }
+ return 0;
+error:
+ if (bad_send_buf)
+ *bad_send_buf = send_buf;
+ return ret;
+}
+EXPORT_SYMBOL(ib_post_send_mad);
+
+/*
+ * ib_free_recv_mad - Returns data buffers used to receive
+ * a MAD to the access layer
+ */
+void ib_free_recv_mad(struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_mad_recv_buf *mad_recv_buf, *temp_recv_buf;
+ struct ib_mad_private_header *mad_priv_hdr;
+ struct ib_mad_private *priv;
+ struct list_head free_list;
+
+ INIT_LIST_HEAD(&free_list);
+ list_splice_init(&mad_recv_wc->rmpp_list, &free_list);
+
+ list_for_each_entry_safe(mad_recv_buf, temp_recv_buf,
+ &free_list, list) {
+ mad_recv_wc = container_of(mad_recv_buf, struct ib_mad_recv_wc,
+ recv_buf);
+ mad_priv_hdr = container_of(mad_recv_wc,
+ struct ib_mad_private_header,
+ recv_wc);
+ priv = container_of(mad_priv_hdr, struct ib_mad_private,
+ header);
+ kmem_cache_free(ib_mad_cache, priv);
+ }
+}
+EXPORT_SYMBOL(ib_free_recv_mad);
+
+ /*
+  * ib_redirect_mad_qp - placeholder; always fails with -EINVAL
+  *
+  * None of the arguments are examined.
+  */
+ struct ib_mad_agent *ib_redirect_mad_qp(struct ib_qp *qp,
+ 					u8 rmpp_version,
+ 					ib_mad_send_handler send_handler,
+ 					ib_mad_recv_handler recv_handler,
+ 					void *context)
+ {
+ 	/* XXX: for now */
+ 	return ERR_PTR(-EINVAL);
+ }
+ EXPORT_SYMBOL(ib_redirect_mad_qp);
+
+ /*
+  * ib_process_mad_wc - stub: logs that it is unimplemented and reports
+  * success (0) without looking at the work completion
+  */
+ int ib_process_mad_wc(struct ib_mad_agent *mad_agent,
+ 		      struct ib_wc *wc)
+ {
+ 	int status = 0;
+
+ 	dev_err(&mad_agent->device->dev,
+ 		"ib_process_mad_wc() not implemented yet\n");
+
+ 	return status;
+ }
+ EXPORT_SYMBOL(ib_process_mad_wc);
+
+ /*
+  * method_in_use - check a registration request against a method table
+  *
+  * Returns -EINVAL (after logging the method number) as soon as one method
+  * set in mad_reg_req->method_mask already has an agent in *method, and 0
+  * when none of them do.
+  */
+ static int method_in_use(struct ib_mad_mgmt_method_table **method,
+ 			 struct ib_mad_reg_req *mad_reg_req)
+ {
+ 	int bit;
+
+ 	for_each_set_bit(bit, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS) {
+ 		if (!(*method)->agent[bit])
+ 			continue;
+
+ 		pr_err("Method %d already in use\n", bit);
+ 		return -EINVAL;
+ 	}
+
+ 	return 0;
+ }
+
+ /*
+  * allocate_method_table - allocate a zeroed management method table
+  *
+  * Stores the new table in *method. Returns 0 on success; on allocation
+  * failure *method is NULL, an error is logged and -ENOMEM is returned.
+  */
+ static int allocate_method_table(struct ib_mad_mgmt_method_table **method)
+ {
+ 	struct ib_mad_mgmt_method_table *table;
+
+ 	table = kzalloc(sizeof(*table), GFP_ATOMIC);
+ 	*method = table;
+ 	if (table)
+ 		return 0;
+
+ 	pr_err("No memory for ib_mad_mgmt_method_table\n");
+ 	return -ENOMEM;
+ }
+
+/*
+ * Check to see if there are any methods still in use
+ */
+static int check_method_table(struct ib_mad_mgmt_method_table *method)
+{
+ int i;
+
+ for (i = 0; i < IB_MGMT_MAX_METHODS; i++)
+ if (method->agent[i])
+ return 1;
+ return 0;
+}
+
+/*
+ * Check to see if there are any method tables for this class still in use
+ */
+static int check_class_table(struct ib_mad_mgmt_class_table *class)
+{
+ int i;
+
+ for (i = 0; i < MAX_MGMT_CLASS; i++)
+ if (class->method_table[i])
+ return 1;
+ return 0;
+}
+
+ /*
+  * Report whether any OUI slot of this vendor class still has a method
+  * table: 1 if so, 0 if every slot is empty.
+  */
+ static int check_vendor_class(struct ib_mad_mgmt_vendor_class *vendor_class)
+ {
+ 	int slot;
+
+ 	for (slot = 0; slot < MAX_MGMT_OUI; slot++) {
+ 		if (vendor_class->method_table[slot] != NULL)
+ 			return 1;
+ 	}
+
+ 	return 0;
+ }
+
+ /*
+  * find_vendor_oui - look up a 3-byte OUI in a vendor class
+  *
+  * Returns the index of the first slot whose OUI equals the one given,
+  * or -1 if no slot matches.
+  */
+ static int find_vendor_oui(struct ib_mad_mgmt_vendor_class *vendor_class,
+ 			   char *oui)
+ {
+ 	int slot;
+
+ 	for (slot = 0; slot < MAX_MGMT_OUI; slot++) {
+ 		if (memcmp(vendor_class->oui[slot], oui, 3) == 0)
+ 			return slot;
+ 	}
+
+ 	return -1;
+ }
+
+ /*
+  * Report whether the vendor table still references any vendor class:
+  * 1 if at least one slot is populated, 0 otherwise.
+  */
+ static int check_vendor_table(struct ib_mad_mgmt_vendor_class_table *vendor)
+ {
+ 	int slot;
+
+ 	for (slot = 0; slot < MAX_MGMT_VENDOR_RANGE2; slot++) {
+ 		if (vendor->vendor_class[slot] != NULL)
+ 			return 1;
+ 	}
+
+ 	return 0;
+ }
+
+ /*
+  * remove_methods_mad_agent - clear every method slot owned by an agent
+  *
+  * Slots that point at other agents are left untouched.
+  */
+ static void remove_methods_mad_agent(struct ib_mad_mgmt_method_table *method,
+ 				     struct ib_mad_agent_private *agent)
+ {
+ 	int slot;
+
+ 	for (slot = 0; slot < IB_MGMT_MAX_METHODS; slot++) {
+ 		if (method->agent[slot] != agent)
+ 			continue;
+
+ 		method->agent[slot] = NULL;
+ 	}
+ }
+
+ static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req,
+ struct ib_mad_agent_private *agent_priv,
+ u8 mgmt_class)
+ { /*
+ * Record agent_priv as the handler for every method set in
+ * mad_reg_req->method_mask, for the given management class and the class
+ * version named in the request. The class table and the per-class method
+ * table are allocated on first use.
+ *
+ * Returns 0 on success, -ENOMEM if a table cannot be allocated, or
+ * -EINVAL if any requested method already has an agent.
+ */
+ struct ib_mad_port_private *port_priv;
+ struct ib_mad_mgmt_class_table **class;
+ struct ib_mad_mgmt_method_table **method;
+ int i, ret;
+
+ port_priv = agent_priv->qp_info->port_priv;
+ class = &port_priv->version[mad_reg_req->mgmt_class_version].class;
+ if (!*class) {
+ /* Allocate management class table for "new" class version */
+ *class = kzalloc(sizeof **class, GFP_ATOMIC);
+ if (!*class) {
+ dev_err(&agent_priv->agent.device->dev,
+ "No memory for ib_mad_mgmt_class_table\n");
+ ret = -ENOMEM;
+ goto error1;
+ }
+
+ /* Allocate method table for this management class */
+ method = &(*class)->method_table[mgmt_class];
+ if ((ret = allocate_method_table(method)))
+ goto error2;
+ } else {
+ method = &(*class)->method_table[mgmt_class];
+ if (!*method) {
+ /* Allocate method table for this management class */
+ if ((ret = allocate_method_table(method)))
+ goto error1;
+ }
+ }
+
+ /* Now, make sure methods are not already in use */
+ if (method_in_use(method, mad_reg_req))
+ goto error3;
+
+ /* Finally, add in methods being registered */
+ for_each_set_bit(i, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS)
+ (*method)->agent[i] = agent_priv;
+
+ return 0;
+
+ error3:
+ /* Remove any methods for this mad agent */
+ remove_methods_mad_agent(*method, agent_priv);
+ /* Now, check to see if there are any methods in use */
+ if (!check_method_table(*method)) {
+ /* If not, release management method table */
+ kfree(*method);
+ *method = NULL;
+ }
+ ret = -EINVAL;
+ goto error1;
+ error2: /* reached only when the class table was allocated by this call */
+ kfree(*class);
+ *class = NULL;
+ error1:
+ return ret;
+ }
+
+ static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req,
+ struct ib_mad_agent_private *agent_priv)
+ { /*
+ * Vendor-class counterpart of add_nonoui_reg_req(): record agent_priv
+ * as the handler for every method set in mad_reg_req->method_mask, keyed
+ * by class version, vendor class index and the 3-byte OUI in the request.
+ * The vendor table, the vendor class and the per-OUI method table are
+ * allocated on first use; an existing OUI slot is reused when it matches.
+ *
+ * Returns 0 on success, -EINVAL if a requested method is already taken,
+ * and -ENOMEM on allocation failure or when no OUI slot is free.
+ */
+ struct ib_mad_port_private *port_priv;
+ struct ib_mad_mgmt_vendor_class_table **vendor_table;
+ struct ib_mad_mgmt_vendor_class_table *vendor = NULL;
+ struct ib_mad_mgmt_vendor_class *vendor_class = NULL;
+ struct ib_mad_mgmt_method_table **method;
+ int i, ret = -ENOMEM;
+ u8 vclass;
+
+ /* "New" vendor (with OUI) class */
+ vclass = vendor_class_index(mad_reg_req->mgmt_class);
+ port_priv = agent_priv->qp_info->port_priv;
+ vendor_table = &port_priv->version[
+ mad_reg_req->mgmt_class_version].vendor;
+ if (!*vendor_table) {
+ /* Allocate mgmt vendor class table for "new" class version */
+ vendor = kzalloc(sizeof *vendor, GFP_ATOMIC);
+ if (!vendor) {
+ dev_err(&agent_priv->agent.device->dev,
+ "No memory for ib_mad_mgmt_vendor_class_table\n");
+ goto error1;
+ }
+
+ *vendor_table = vendor;
+ }
+ if (!(*vendor_table)->vendor_class[vclass]) {
+ /* Allocate table for this management vendor class */
+ vendor_class = kzalloc(sizeof *vendor_class, GFP_ATOMIC);
+ if (!vendor_class) {
+ dev_err(&agent_priv->agent.device->dev,
+ "No memory for ib_mad_mgmt_vendor_class\n");
+ goto error2;
+ }
+
+ (*vendor_table)->vendor_class[vclass] = vendor_class;
+ }
+ for (i = 0; i < MAX_MGMT_OUI; i++) {
+ /* Is there matching OUI for this vendor class ? */
+ if (!memcmp((*vendor_table)->vendor_class[vclass]->oui[i],
+ mad_reg_req->oui, 3)) {
+ method = &(*vendor_table)->vendor_class[
+ vclass]->method_table[i];
+ BUG_ON(!*method);
+ goto check_in_use;
+ }
+ }
+ for (i = 0; i < MAX_MGMT_OUI; i++) {
+ /* OUI slot available ? */
+ if (!is_vendor_oui((*vendor_table)->vendor_class[
+ vclass]->oui[i])) {
+ method = &(*vendor_table)->vendor_class[
+ vclass]->method_table[i];
+ BUG_ON(*method);
+ /* Allocate method table for this OUI */
+ if ((ret = allocate_method_table(method)))
+ goto error3;
+ memcpy((*vendor_table)->vendor_class[vclass]->oui[i],
+ mad_reg_req->oui, 3);
+ goto check_in_use;
+ }
+ }
+ dev_err(&agent_priv->agent.device->dev, "All OUI slots in use\n");
+ goto error3; /* ret still holds its initial -ENOMEM here */
+
+ check_in_use:
+ /* Now, make sure methods are not already in use */
+ if (method_in_use(method, mad_reg_req))
+ goto error4;
+
+ /* Finally, add in methods being registered */
+ for_each_set_bit(i, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS)
+ (*method)->agent[i] = agent_priv;
+
+ return 0;
+
+ error4:
+ /* Remove any methods for this mad agent */
+ remove_methods_mad_agent(*method, agent_priv);
+ /* Now, check to see if there are any methods in use */
+ if (!check_method_table(*method)) {
+ /* If not, release management method table */
+ kfree(*method);
+ *method = NULL;
+ }
+ ret = -EINVAL;
+ error3:
+ if (vendor_class) { /* non-NULL only if allocated by this call */
+ (*vendor_table)->vendor_class[vclass] = NULL;
+ kfree(vendor_class);
+ }
+ error2:
+ if (vendor) { /* non-NULL only if allocated by this call */
+ *vendor_table = NULL;
+ kfree(vendor);
+ }
+ error1:
+ return ret;
+ }
+
+ static void remove_mad_reg_req(struct ib_mad_agent_private *agent_priv)
+ { /*
+ * Undo the registration made for agent_priv: clear its entries from the
+ * class method table and, for vendor classes, from the method table of
+ * the matching OUI. Each table that becomes empty is freed and its slot
+ * in the parent table cleared. Does nothing if the agent was registered
+ * without a reg_req.
+ */
+ struct ib_mad_port_private *port_priv;
+ struct ib_mad_mgmt_class_table *class;
+ struct ib_mad_mgmt_method_table *method;
+ struct ib_mad_mgmt_vendor_class_table *vendor;
+ struct ib_mad_mgmt_vendor_class *vendor_class;
+ int index;
+ u8 mgmt_class;
+
+ /*
+ * Was MAD registration request supplied
+ * with original registration ?
+ */
+ if (!agent_priv->reg_req) {
+ goto out;
+ }
+
+ port_priv = agent_priv->qp_info->port_priv;
+ mgmt_class = convert_mgmt_class(agent_priv->reg_req->mgmt_class);
+ class = port_priv->version[
+ agent_priv->reg_req->mgmt_class_version].class;
+ if (!class)
+ goto vendor_check;
+
+ method = class->method_table[mgmt_class];
+ if (method) {
+ /* Remove any methods for this mad agent */
+ remove_methods_mad_agent(method, agent_priv);
+ /* Now, check to see if there are any methods still in use */
+ if (!check_method_table(method)) {
+ /* If not, release management method table */
+ kfree(method);
+ class->method_table[mgmt_class] = NULL;
+ /* Any management classes left ? */
+ if (!check_class_table(class)) {
+ /* If not, release management class table */
+ kfree(class);
+ port_priv->version[
+ agent_priv->reg_req->
+ mgmt_class_version].class = NULL;
+ }
+ }
+ }
+
+ vendor_check:
+ if (!is_vendor_class(mgmt_class))
+ goto out;
+
+ /* normalize mgmt_class to vendor range 2 */
+ mgmt_class = vendor_class_index(agent_priv->reg_req->mgmt_class);
+ vendor = port_priv->version[
+ agent_priv->reg_req->mgmt_class_version].vendor;
+
+ if (!vendor)
+ goto out;
+
+ vendor_class = vendor->vendor_class[mgmt_class];
+ if (vendor_class) {
+ index = find_vendor_oui(vendor_class, agent_priv->reg_req->oui);
+ if (index < 0)
+ goto out;
+ method = vendor_class->method_table[index];
+ if (method) {
+ /* Remove any methods for this mad agent */
+ remove_methods_mad_agent(method, agent_priv);
+ /*
+ * Now, check to see if there are
+ * any methods still in use
+ */
+ if (!check_method_table(method)) {
+ /* If not, release management method table */
+ kfree(method);
+ vendor_class->method_table[index] = NULL;
+ memset(vendor_class->oui[index], 0, 3); /* mark the OUI slot free again */
+ /* Any OUIs left ? */
+ if (!check_vendor_class(vendor_class)) {
+ /* If not, release vendor class table */
+ kfree(vendor_class);
+ vendor->vendor_class[mgmt_class] = NULL;
+ /* Any other vendor classes left ? */
+ if (!check_vendor_table(vendor)) {
+ kfree(vendor);
+ port_priv->version[
+ agent_priv->reg_req->
+ mgmt_class_version].
+ vendor = NULL;
+ }
+ }
+ }
+ }
+ }
+
+ out:
+ return;
+ }
+
+ /*
+  * find_mad_agent - pick the registered agent that should receive a MAD
+  * @port_priv: port the MAD arrived on
+  * @mad: the received MAD
+  *
+  * Responses are routed by the high 32 bits of the transaction ID, matched
+  * against each agent's hi_tid. Everything else is routed through the
+  * per-version class/method tables and, for vendor classes, the OUI.
+  *
+  * The lookup runs under port_priv->reg_lock. On success the agent's
+  * refcount has been incremented before the lock is dropped. Returns NULL
+  * when nothing matches or when the matching agent has no recv_handler.
+  */
+ static struct ib_mad_agent_private *
+ find_mad_agent(struct ib_mad_port_private *port_priv,
+ 	       struct ib_mad *mad)
+ {
+ 	struct ib_mad_agent_private *mad_agent = NULL;
+ 	unsigned long flags;
+
+ 	spin_lock_irqsave(&port_priv->reg_lock, flags);
+ 	if (ib_response_mad(mad)) {
+ 		u32 hi_tid;
+ 		struct ib_mad_agent_private *entry;
+
+ 		/*
+ 		 * Routing is based on high 32 bits of transaction ID
+ 		 * of MAD.
+ 		 */
+ 		hi_tid = be64_to_cpu(mad->mad_hdr.tid) >> 32;
+ 		list_for_each_entry(entry, &port_priv->agent_list, agent_list) {
+ 			if (entry->agent.hi_tid == hi_tid) {
+ 				mad_agent = entry;
+ 				break;
+ 			}
+ 		}
+ 	} else {
+ 		struct ib_mad_mgmt_class_table *class;
+ 		struct ib_mad_mgmt_method_table *method;
+ 		struct ib_mad_mgmt_vendor_class_table *vendor;
+ 		struct ib_mad_mgmt_vendor_class *vendor_class;
+ 		struct ib_vendor_mad *vendor_mad;
+ 		int index;
+
+ 		/*
+ 		 * Routing is based on version, class, and method
+ 		 * For "newer" vendor MADs, also based on OUI
+ 		 */
+ 		if (mad->mad_hdr.class_version >= MAX_MGMT_VERSION)
+ 			goto out;
+ 		if (!is_vendor_class(mad->mad_hdr.mgmt_class)) {
+ 			class = port_priv->version[
+ 					mad->mad_hdr.class_version].class;
+ 			if (!class)
+ 				goto out;
+ 			/*
+ 			 * method_table[] is indexed by management class and
+ 			 * is walked up to MAX_MGMT_CLASS elsewhere in this
+ 			 * file (check_class_table), so the index must be
+ 			 * checked against that bound rather than against
+ 			 * the number of methods.
+ 			 */
+ 			if (convert_mgmt_class(mad->mad_hdr.mgmt_class) >=
+ 			    MAX_MGMT_CLASS)
+ 				goto out;
+ 			method = class->method_table[convert_mgmt_class(
+ 							mad->mad_hdr.mgmt_class)];
+ 			if (method)
+ 				mad_agent = method->agent[mad->mad_hdr.method &
+ 							  ~IB_MGMT_METHOD_RESP];
+ 		} else {
+ 			vendor = port_priv->version[
+ 					mad->mad_hdr.class_version].vendor;
+ 			if (!vendor)
+ 				goto out;
+ 			vendor_class = vendor->vendor_class[vendor_class_index(
+ 						mad->mad_hdr.mgmt_class)];
+ 			if (!vendor_class)
+ 				goto out;
+ 			/* Find matching OUI */
+ 			vendor_mad = (struct ib_vendor_mad *)mad;
+ 			index = find_vendor_oui(vendor_class, vendor_mad->oui);
+ 			if (index == -1)
+ 				goto out;
+ 			method = vendor_class->method_table[index];
+ 			if (method) {
+ 				mad_agent = method->agent[mad->mad_hdr.method &
+ 							  ~IB_MGMT_METHOD_RESP];
+ 			}
+ 		}
+ 	}
+
+ 	if (mad_agent) {
+ 		/* Only hand out (and reference) agents that can receive */
+ 		if (mad_agent->agent.recv_handler)
+ 			atomic_inc(&mad_agent->refcount);
+ 		else {
+ 			dev_notice(&port_priv->device->dev,
+ 				   "No receive handler for client %p on port %d\n",
+ 				   &mad_agent->agent, port_priv->port_num);
+ 			mad_agent = NULL;
+ 		}
+ 	}
+ out:
+ 	spin_unlock_irqrestore(&port_priv->reg_lock, flags);
+
+ 	return mad_agent;
+ }
+
+ /*
+  * validate_mad - sanity-check a received MAD against the QP it came in on
+  *
+  * Returns 1 if the MAD is acceptable and 0 otherwise. A MAD is rejected
+  * (and logged) when its base version is not IB_MGMT_BASE_VERSION. The two
+  * subnet-management classes are accepted only on QP0; every other class is
+  * accepted only on a QP other than QP0.
+  */
+ static int validate_mad(struct ib_mad *mad, u32 qp_num)
+ {
+ 	/* Make sure MAD base version is understood */
+ 	if (mad->mad_hdr.base_version != IB_MGMT_BASE_VERSION) {
+ 		pr_err("MAD received with unsupported base version %d\n",
+ 		       mad->mad_hdr.base_version);
+ 		return 0;
+ 	}
+
+ 	/* SMI packets belong on QP0 only */
+ 	if (mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+ 	    mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+ 		return (qp_num == 0) ? 1 : 0;
+
+ 	/* GSI packets must not arrive on QP0 */
+ 	return (qp_num != 0) ? 1 : 0;
+ }
+
+ /*
+  * is_data_mad - does this MAD count as data for the given agent?
+  *
+  * Returns 1 when the agent has no RMPP version, is not a kernel-RMPP
+  * agent, or the MAD's RMPP header lacks the ACTIVE flag. Otherwise the
+  * answer is whether the RMPP type is IB_MGMT_RMPP_TYPE_DATA.
+  */
+ static int is_data_mad(struct ib_mad_agent_private *mad_agent_priv,
+ 		       struct ib_mad_hdr *mad_hdr)
+ {
+ 	struct ib_rmpp_mad *rmpp = (struct ib_rmpp_mad *)mad_hdr;
+
+ 	if (!mad_agent_priv->agent.rmpp_version)
+ 		return 1;
+
+ 	if (!ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent))
+ 		return 1;
+
+ 	if (!(ib_get_rmpp_flags(&rmpp->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE))
+ 		return 1;
+
+ 	return rmpp->rmpp_hdr.rmpp_type == IB_MGMT_RMPP_TYPE_DATA;
+ }
+
+ /*
+  * rcv_has_same_class - non-zero when the sent MAD and the received MAD
+  * carry the same management class
+  */
+ static inline int rcv_has_same_class(struct ib_mad_send_wr_private *wr,
+ 				     struct ib_mad_recv_wc *rwc)
+ {
+ 	struct ib_mad *sent = (struct ib_mad *)(wr->send_buf.mad);
+
+ 	return sent->mad_hdr.mgmt_class ==
+ 	       rwc->recv_buf.mad->mad_hdr.mgmt_class;
+ }
+
+ /*
+  * rcv_has_same_gid - do a send and a receive refer to the same peer?
+  *
+  * Returns 0 whenever the two cannot be shown to match: both are requests
+  * or both are responses, the send's address handle cannot be queried, only
+  * one side carries a GRH, or a cached LMC/GID lookup fails.
+  *
+  * For a sent request answered by a received response, the local side is
+  * compared: path bits under the port LMC without a GRH, or the cached
+  * source GID against the received destination GID with one. In the
+  * opposite direction the remote side is compared: DLID against SLID, or
+  * destination GID against the received source GID.
+  */
+ static inline int rcv_has_same_gid(struct ib_mad_agent_private *mad_agent_priv,
+ 				   struct ib_mad_send_wr_private *wr,
+ 				   struct ib_mad_recv_wc *rwc )
+ {
+ 	struct ib_device *device = mad_agent_priv->agent.device;
+ 	u8 port_num = mad_agent_priv->agent.port_num;
+ 	struct ib_ah_attr attr;
+ 	union ib_gid sgid;
+ 	u8 send_resp, rcv_resp;
+ 	u8 lmc;
+ 	int send_has_grh;
+
+ 	send_resp = ib_response_mad((struct ib_mad *)wr->send_buf.mad);
+ 	rcv_resp = ib_response_mad(rwc->recv_buf.mad);
+
+ 	/* both requests, or both responses. GIDs different */
+ 	if (send_resp == rcv_resp)
+ 		return 0;
+
+ 	/* Assume not equal, to avoid false positives. */
+ 	if (ib_query_ah(wr->send_buf.ah, &attr))
+ 		return 0;
+
+ 	/* one has GID, other does not. Assume different */
+ 	send_has_grh = !!(attr.ah_flags & IB_AH_GRH);
+ 	if (send_has_grh != !!(rwc->wc->wc_flags & IB_WC_GRH))
+ 		return 0;
+
+ 	if (!send_resp && rcv_resp) {
+ 		/* is request/response. */
+ 		if (send_has_grh) {
+ 			if (ib_get_cached_gid(device, port_num,
+ 					      attr.grh.sgid_index, &sgid))
+ 				return 0;
+ 			return !memcmp(sgid.raw, rwc->recv_buf.grh->dgid.raw,
+ 				       16);
+ 		}
+
+ 		if (ib_get_cached_lmc(device, port_num, &lmc))
+ 			return 0;
+ 		return (!lmc || !((attr.src_path_bits ^
+ 				   rwc->wc->dlid_path_bits) &
+ 				  ((1 << lmc) - 1)));
+ 	}
+
+ 	if (send_has_grh)
+ 		return !memcmp(attr.grh.dgid.raw, rwc->recv_buf.grh->sgid.raw,
+ 			       16);
+
+ 	return attr.dlid == rwc->wc->slid;
+ }
+
+ /* Non-zero when the management class is the directed-route SM class. */
+ static inline int is_direct(u8 class)
+ {
+ 	return IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE == class;
+ }
+
+ /*
+  * ib_find_send_mad - find the outstanding send that a received MAD answers
+  *
+  * The agent's wait_list is searched first, then its send_list (the
+  * response can arrive before the send completion). A candidate must have
+  * the same transaction ID and management class and, unless the received
+  * MAD is directed-route, must pass rcv_has_same_gid(). Entries on
+  * send_list must additionally be data MADs with a non-zero timeout.
+  *
+  * Returns the first matching request, or NULL if there is none or if the
+  * first match no longer has status IB_WC_SUCCESS.
+  */
+ struct ib_mad_send_wr_private*
+ ib_find_send_mad(struct ib_mad_agent_private *mad_agent_priv,
+ 		 struct ib_mad_recv_wc *wc)
+ {
+ 	struct ib_mad_send_wr_private *wr;
+ 	struct ib_mad *mad = (struct ib_mad *)wc->recv_buf.mad;
+
+ 	list_for_each_entry(wr, &mad_agent_priv->wait_list, agent_list) {
+ 		if (wr->tid != mad->mad_hdr.tid)
+ 			continue;
+ 		if (!rcv_has_same_class(wr, wc))
+ 			continue;
+ 		/*
+ 		 * Don't check GID for direct routed MADs.
+ 		 * These might have permissive LIDs.
+ 		 */
+ 		if (!is_direct(wc->recv_buf.mad->mad_hdr.mgmt_class) &&
+ 		    !rcv_has_same_gid(mad_agent_priv, wr, wc))
+ 			continue;
+
+ 		return (wr->status == IB_WC_SUCCESS) ? wr : NULL;
+ 	}
+
+ 	/*
+ 	 * It's possible to receive the response before we've
+ 	 * been notified that the send has completed
+ 	 */
+ 	list_for_each_entry(wr, &mad_agent_priv->send_list, agent_list) {
+ 		if (!is_data_mad(mad_agent_priv, wr->send_buf.mad))
+ 			continue;
+ 		if (wr->tid != mad->mad_hdr.tid)
+ 			continue;
+ 		if (!wr->timeout)
+ 			continue;
+ 		if (!rcv_has_same_class(wr, wc))
+ 			continue;
+ 		/*
+ 		 * Don't check GID for direct routed MADs.
+ 		 * These might have permissive LIDs.
+ 		 */
+ 		if (!is_direct(wc->recv_buf.mad->mad_hdr.mgmt_class) &&
+ 		    !rcv_has_same_gid(mad_agent_priv, wr, wc))
+ 			continue;
+
+ 		/* Verify request has not been canceled */
+ 		return (wr->status == IB_WC_SUCCESS) ? wr : NULL;
+ 	}
+
+ 	return NULL;
+ }
+
+ /*
+  * ib_mark_mad_done - stop waiting for a response on this send
+  *
+  * Clears the timeout and, when exactly one reference is left, moves the
+  * request to the tail of the agent's done_list.
+  */
+ void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr)
+ {
+ 	mad_send_wr->timeout = 0;
+
+ 	if (mad_send_wr->refcount != 1)
+ 		return;
+
+ 	list_move_tail(&mad_send_wr->agent_list,
+ 		       &mad_send_wr->mad_agent_priv->done_list);
+ }
+
+ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
+ struct ib_mad_recv_wc *mad_recv_wc)
+ { /*
+ * Deliver a received MAD to its agent. For kernel-RMPP agents the MAD
+ * first goes through ib_process_rmpp_recv_wc(); if that returns NULL only
+ * the agent reference is dropped. A response is paired with its
+ * outstanding send via ib_find_send_mad(), and the recv handler is called
+ * before the send is completed. Every path releases one reference on
+ * mad_agent_priv.
+ */
+ struct ib_mad_send_wr_private *mad_send_wr;
+ struct ib_mad_send_wc mad_send_wc;
+ unsigned long flags;
+
+ INIT_LIST_HEAD(&mad_recv_wc->rmpp_list);
+ list_add(&mad_recv_wc->recv_buf.list, &mad_recv_wc->rmpp_list);
+ if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) {
+ mad_recv_wc = ib_process_rmpp_recv_wc(mad_agent_priv,
+ mad_recv_wc);
+ if (!mad_recv_wc) {
+ deref_mad_agent(mad_agent_priv);
+ return;
+ }
+ }
+
+ /* Complete corresponding request */
+ if (ib_response_mad(mad_recv_wc->recv_buf.mad)) {
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ mad_send_wr = ib_find_send_mad(mad_agent_priv, mad_recv_wc);
+ if (!mad_send_wr) {
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ if (!ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)
+ && ib_is_mad_class_rmpp(mad_recv_wc->recv_buf.mad->mad_hdr.mgmt_class)
+ && (ib_get_rmpp_flags(&((struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad)->rmpp_hdr)
+ & IB_MGMT_RMPP_FLAG_ACTIVE)) {
+ /* user rmpp is in effect
+ * and this is an active RMPP MAD
+ */
+ mad_recv_wc->wc->wr_id = 0;
+ mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent,
+ mad_recv_wc);
+ atomic_dec(&mad_agent_priv->refcount); /* NOTE(review): plain atomic_dec, unlike deref_mad_agent() on other paths — confirm intent */
+ } else {
+ /* not user rmpp, revert to normal behavior and
+ * drop the mad */
+ ib_free_recv_mad(mad_recv_wc);
+ deref_mad_agent(mad_agent_priv);
+ return;
+ }
+ } else {
+ ib_mark_mad_done(mad_send_wr);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+ /* Defined behavior is to complete response before request */
+ mad_recv_wc->wc->wr_id = (unsigned long) &mad_send_wr->send_buf;
+ mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent,
+ mad_recv_wc);
+ atomic_dec(&mad_agent_priv->refcount);
+
+ mad_send_wc.status = IB_WC_SUCCESS;
+ mad_send_wc.vendor_err = 0;
+ mad_send_wc.send_buf = &mad_send_wr->send_buf;
+ ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc);
+ }
+ } else { /* not a response: hand straight to the agent */
+ mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent,
+ mad_recv_wc);
+ deref_mad_agent(mad_agent_priv);
+ }
+ }
+
+ /*
+  * generate_unmatched_resp - build an error reply for a MAD nobody claimed
+  *
+  * Only Get and Set requests are answered; for anything else this returns
+  * false and leaves @response untouched. Otherwise @response becomes a copy
+  * of @recv with its internal pointers re-aimed at itself, the method set
+  * to GetResp and the status set to "unsupported method/attribute" (plus
+  * the direction bit for directed-route SMPs), and true is returned.
+  */
+ static bool generate_unmatched_resp(struct ib_mad_private *recv,
+ 				    struct ib_mad_private *response)
+ {
+ 	struct ib_mad_hdr *resp_hdr = &response->mad.mad.mad_hdr;
+
+ 	if (recv->mad.mad.mad_hdr.method != IB_MGMT_METHOD_GET &&
+ 	    recv->mad.mad.mad_hdr.method != IB_MGMT_METHOD_SET)
+ 		return false;
+
+ 	memcpy(response, recv, sizeof *response);
+
+ 	/* The copy still points into recv; make it self-contained */
+ 	response->header.recv_wc.wc = &response->header.wc;
+ 	response->header.recv_wc.recv_buf.mad = &response->mad.mad;
+ 	response->header.recv_wc.recv_buf.grh = &response->grh;
+
+ 	resp_hdr->method = IB_MGMT_METHOD_GET_RESP;
+ 	resp_hdr->status =
+ 		cpu_to_be16(IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD_ATTRIB);
+ 	if (recv->mad.mad.mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+ 		resp_hdr->status |= IB_SMP_DIRECTION;
+
+ 	return true;
+ }
+ static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv,
+ struct ib_wc *wc)
+ { /*
+ * Handle one receive completion: recover the ib_mad_private buffer from
+ * wc->wr_id, unmap it, validate the MAD, run directed-route SMP handling,
+ * offer the MAD to the driver's process_mad hook and finally to the
+ * matching registered agent. Before returning, one buffer is always given
+ * to ib_mad_post_receive_mads(): the spare "response" buffer if it was
+ * allocated, otherwise recv itself.
+ */
+ struct ib_mad_qp_info *qp_info;
+ struct ib_mad_private_header *mad_priv_hdr;
+ struct ib_mad_private *recv, *response = NULL;
+ struct ib_mad_list_head *mad_list;
+ struct ib_mad_agent_private *mad_agent;
+ int port_num;
+ int ret = IB_MAD_RESULT_SUCCESS;
+
+ mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id;
+ qp_info = mad_list->mad_queue->qp_info;
+ dequeue_mad(mad_list);
+
+ mad_priv_hdr = container_of(mad_list, struct ib_mad_private_header,
+ mad_list);
+ recv = container_of(mad_priv_hdr, struct ib_mad_private, header);
+ ib_dma_unmap_single(port_priv->device,
+ recv->header.mapping,
+ sizeof(struct ib_mad_private) -
+ sizeof(struct ib_mad_private_header),
+ DMA_FROM_DEVICE);
+
+ /* Setup MAD receive work completion from "normal" work completion */
+ recv->header.wc = *wc;
+ recv->header.recv_wc.wc = &recv->header.wc;
+ recv->header.recv_wc.mad_len = sizeof(struct ib_mad);
+ recv->header.recv_wc.recv_buf.mad = &recv->mad.mad;
+ recv->header.recv_wc.recv_buf.grh = &recv->grh;
+
+ if (atomic_read(&qp_info->snoop_count))
+ snoop_recv(qp_info, &recv->header.recv_wc, IB_MAD_SNOOP_RECVS);
+
+ /* Validate MAD */
+ if (!validate_mad(&recv->mad.mad, qp_info->qp->qp_num))
+ goto out;
+
+ response = kmem_cache_alloc(ib_mad_cache, GFP_KERNEL); /* may sleep — presumably process context; confirm */
+ if (!response) {
+ dev_err(&port_priv->device->dev,
+ "ib_mad_recv_done_handler no memory for response buffer\n");
+ goto out;
+ }
+
+ if (port_priv->device->node_type == RDMA_NODE_IB_SWITCH)
+ port_num = wc->port_num;
+ else
+ port_num = port_priv->port_num;
+
+ if (recv->mad.mad.mad_hdr.mgmt_class ==
+ IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+ enum smi_forward_action retsmi;
+
+ if (smi_handle_dr_smp_recv(&recv->mad.smp,
+ port_priv->device->node_type,
+ port_num,
+ port_priv->device->phys_port_cnt) ==
+ IB_SMI_DISCARD)
+ goto out;
+
+ retsmi = smi_check_forward_dr_smp(&recv->mad.smp);
+ if (retsmi == IB_SMI_LOCAL)
+ goto local;
+
+ if (retsmi == IB_SMI_SEND) { /* don't forward */
+ if (smi_handle_dr_smp_send(&recv->mad.smp,
+ port_priv->device->node_type,
+ port_num) == IB_SMI_DISCARD)
+ goto out;
+
+ if (smi_check_local_smp(&recv->mad.smp, port_priv->device) == IB_SMI_DISCARD)
+ goto out;
+ } else if (port_priv->device->node_type == RDMA_NODE_IB_SWITCH) {
+ /* forward case for switches */
+ memcpy(response, recv, sizeof(*response));
+ response->header.recv_wc.wc = &response->header.wc;
+ response->header.recv_wc.recv_buf.mad = &response->mad.mad;
+ response->header.recv_wc.recv_buf.grh = &response->grh;
+
+ agent_send_response(&response->mad.mad,
+ &response->grh, wc,
+ port_priv->device,
+ smi_get_fwd_port(&recv->mad.smp),
+ qp_info->qp->qp_num);
+
+ goto out;
+ }
+ }
+
+ local:
+ /* Give driver "right of first refusal" on incoming MAD */
+ if (port_priv->device->process_mad) {
+ ret = port_priv->device->process_mad(port_priv->device, 0,
+ port_priv->port_num,
+ wc, &recv->grh,
+ &recv->mad.mad,
+ &response->mad.mad);
+ if (ret & IB_MAD_RESULT_SUCCESS) {
+ if (ret & IB_MAD_RESULT_CONSUMED)
+ goto out;
+ if (ret & IB_MAD_RESULT_REPLY) {
+ agent_send_response(&response->mad.mad,
+ &recv->grh, wc,
+ port_priv->device,
+ port_num,
+ qp_info->qp->qp_num);
+ goto out;
+ }
+ }
+ }
+
+ mad_agent = find_mad_agent(port_priv, &recv->mad.mad);
+ if (mad_agent) {
+ ib_mad_complete_recv(mad_agent, &recv->header.recv_wc);
+ /*
+ * recv is freed up in error cases in ib_mad_complete_recv
+ * or via recv_handler in ib_mad_complete_recv()
+ */
+ recv = NULL;
+ } else if ((ret & IB_MAD_RESULT_SUCCESS) &&
+ generate_unmatched_resp(recv, response)) {
+ agent_send_response(&response->mad.mad, &recv->grh, wc,
+ port_priv->device, port_num, qp_info->qp->qp_num);
+ }
+
+ out:
+ /* Post another receive request for this QP */
+ if (response) {
+ ib_mad_post_receive_mads(qp_info, response);
+ if (recv) /* still ours: it was not handed to an agent */
+ kmem_cache_free(ib_mad_cache, recv);
+ } else
+ ib_mad_post_receive_mads(qp_info, recv);
+ }
+
+ /*
+  * adjust_timeout - re-arm the agent's timeout work after wait_list changed
+  *
+  * With an empty wait_list the delayed work is cancelled. Otherwise, if the
+  * first waiting request expires before the agent's recorded timeout, that
+  * expiry becomes the new agent timeout and the work is rescheduled for it
+  * (at least one jiffy from now).
+  */
+ static void adjust_timeout(struct ib_mad_agent_private *mad_agent_priv)
+ {
+ 	struct ib_mad_send_wr_private *first;
+ 	unsigned long delay;
+
+ 	if (list_empty(&mad_agent_priv->wait_list)) {
+ 		cancel_delayed_work(&mad_agent_priv->timed_work);
+ 		return;
+ 	}
+
+ 	first = list_entry(mad_agent_priv->wait_list.next,
+ 			   struct ib_mad_send_wr_private,
+ 			   agent_list);
+
+ 	/* Nothing to do unless the head of the list expires sooner */
+ 	if (!time_after(mad_agent_priv->timeout, first->timeout))
+ 		return;
+
+ 	mad_agent_priv->timeout = first->timeout;
+ 	delay = first->timeout - jiffies;
+ 	if ((long)delay <= 0)
+ 		delay = 1;
+ 	mod_delayed_work(mad_agent_priv->qp_info->port_priv->wq,
+ 			 &mad_agent_priv->timed_work, delay);
+ }
+
+ static void wait_for_response(struct ib_mad_send_wr_private *mad_send_wr)
+ { /*
+ * Move a send from its current agent list onto wait_list, which is kept
+ * ordered by expiry time. mad_send_wr->timeout is converted in place from
+ * a relative delay to an absolute jiffies value. A zero delay places the
+ * entry at the head of wait_list.
+ * NOTE(review): no locking is done here — presumably the caller holds
+ * mad_agent_priv->lock; confirm.
+ */
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_send_wr_private *temp_mad_send_wr;
+ struct list_head *list_item;
+ unsigned long delay;
+
+ mad_agent_priv = mad_send_wr->mad_agent_priv;
+ list_del(&mad_send_wr->agent_list);
+
+ delay = mad_send_wr->timeout;
+ mad_send_wr->timeout += jiffies;
+
+ if (delay) {
+ list_for_each_prev(list_item, &mad_agent_priv->wait_list) { /* scan from the tail for the first earlier expiry */
+ temp_mad_send_wr = list_entry(list_item,
+ struct ib_mad_send_wr_private,
+ agent_list);
+ if (time_after(mad_send_wr->timeout,
+ temp_mad_send_wr->timeout))
+ break;
+ }
+ }
+ else
+ list_item = &mad_agent_priv->wait_list;
+ list_add(&mad_send_wr->agent_list, list_item); /* list_item is the list head if no earlier entry was found */
+
+ /* Reschedule a work item if we have a shorter timeout */
+ if (mad_agent_priv->wait_list.next == &mad_send_wr->agent_list)
+ mod_delayed_work(mad_agent_priv->qp_info->port_priv->wq,
+ &mad_agent_priv->timed_work, delay);
+ }
+
+ /*
+  * ib_reset_mad_timeout - restart the response wait with a new timeout
+  *
+  * Stores timeout_ms (converted to jiffies) as the request's timeout and
+  * re-queues the request through wait_for_response().
+  */
+ void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr,
+ 			  int timeout_ms)
+ {
+ 	unsigned long timeout_jiffies = msecs_to_jiffies(timeout_ms);
+
+ 	mad_send_wr->timeout = timeout_jiffies;
+ 	wait_for_response(mad_send_wr);
+ }
+
+/*
+ * Process a send work completion
+ */
+void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr,
+ struct ib_mad_send_wc *mad_send_wc)
+{
+ struct ib_mad_agent_private *mad_agent_priv;
+ unsigned long flags;
+ int ret;
+
+ mad_agent_priv = mad_send_wr->mad_agent_priv;
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) {
+ ret = ib_process_rmpp_send_wc(mad_send_wr, mad_send_wc);
+ if (ret == IB_RMPP_RESULT_CONSUMED)
+ goto done;
+ } else
+ ret = IB_RMPP_RESULT_UNHANDLED;
+
+ if (mad_send_wc->status != IB_WC_SUCCESS &&
+ mad_send_wr->status == IB_WC_SUCCESS) {
+ mad_send_wr->status = mad_send_wc->status;
+ mad_send_wr->refcount -= (mad_send_wr->timeout > 0);
+ }
+
+ if (--mad_send_wr->refcount > 0) {
+ if (mad_send_wr->refcount == 1 && mad_send_wr->timeout &&
+ mad_send_wr->status == IB_WC_SUCCESS) {
+ wait_for_response(mad_send_wr);
+ }
+ goto done;
+ }
+
+ /* Remove send from MAD agent and notify client of completion */
+ list_del(&mad_send_wr->agent_list);
+ adjust_timeout(mad_agent_priv);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+ if (mad_send_wr->status != IB_WC_SUCCESS )
+ mad_send_wc->status = mad_send_wr->status;
+ if (ret == IB_RMPP_RESULT_INTERNAL)
+ ib_rmpp_send_handler(mad_send_wc);
+ else
+ mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
+ mad_send_wc);
+
+ /* Release reference on agent taken when sending */
+ deref_mad_agent(mad_agent_priv);
+ return;
+done:
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+}
+
+ static void ib_mad_send_done_handler(struct ib_mad_port_private *port_priv,
+ struct ib_wc *wc)
+ { /*
+ * Handle one send completion: unmap the buffers, unlink the request from
+ * its queue, promote the first overflow entry when the queue count was
+ * above max_active, and complete the request. The promoted entry is then
+ * posted; if that post fails it is itself completed with
+ * IB_WC_LOC_QP_OP_ERR by looping back to retry.
+ */
+ struct ib_mad_send_wr_private *mad_send_wr, *queued_send_wr;
+ struct ib_mad_list_head *mad_list;
+ struct ib_mad_qp_info *qp_info;
+ struct ib_mad_queue *send_queue;
+ struct ib_send_wr *bad_send_wr;
+ struct ib_mad_send_wc mad_send_wc;
+ unsigned long flags;
+ int ret;
+
+ mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id;
+ mad_send_wr = container_of(mad_list, struct ib_mad_send_wr_private,
+ mad_list);
+ send_queue = mad_list->mad_queue;
+ qp_info = send_queue->qp_info;
+
+ retry:
+ ib_dma_unmap_single(mad_send_wr->send_buf.mad_agent->device,
+ mad_send_wr->header_mapping,
+ mad_send_wr->sg_list[0].length, DMA_TO_DEVICE);
+ ib_dma_unmap_single(mad_send_wr->send_buf.mad_agent->device,
+ mad_send_wr->payload_mapping,
+ mad_send_wr->sg_list[1].length, DMA_TO_DEVICE);
+ queued_send_wr = NULL;
+ spin_lock_irqsave(&send_queue->lock, flags);
+ list_del(&mad_list->list);
+
+ /* Move queued send to the send queue */
+ if (send_queue->count-- > send_queue->max_active) { /* presumably implies overflow_list is non-empty — confirm */
+ mad_list = container_of(qp_info->overflow_list.next,
+ struct ib_mad_list_head, list);
+ queued_send_wr = container_of(mad_list,
+ struct ib_mad_send_wr_private,
+ mad_list);
+ list_move_tail(&mad_list->list, &send_queue->list);
+ }
+ spin_unlock_irqrestore(&send_queue->lock, flags);
+
+ mad_send_wc.send_buf = &mad_send_wr->send_buf;
+ mad_send_wc.status = wc->status;
+ mad_send_wc.vendor_err = wc->vendor_err;
+ if (atomic_read(&qp_info->snoop_count))
+ snoop_send(qp_info, &mad_send_wr->send_buf, &mad_send_wc,
+ IB_MAD_SNOOP_SEND_COMPLETIONS);
+ ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc);
+
+ if (queued_send_wr) {
+ ret = ib_post_send(qp_info->qp, &queued_send_wr->send_wr,
+ &bad_send_wr);
+ if (ret) {
+ dev_err(&port_priv->device->dev,
+ "ib_post_send failed: %d\n", ret);
+ mad_send_wr = queued_send_wr; /* mad_list already points at this entry */
+ wc->status = IB_WC_LOC_QP_OP_ERR;
+ goto retry;
+ }
+ }
+ }
+
+/*
+ * Set the retry flag on every work request currently linked on the QP's
+ * send queue, under the send queue lock.
+ */
+static void mark_sends_for_retry(struct ib_mad_qp_info *qp_info)
+{
+	struct ib_mad_queue *send_queue = &qp_info->send_queue;
+	struct ib_mad_list_head *entry;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&send_queue->lock, irq_flags);
+	list_for_each_entry(entry, &send_queue->list, list)
+		container_of(entry, struct ib_mad_send_wr_private,
+			     mad_list)->retry = 1;
+	spin_unlock_irqrestore(&send_queue->lock, irq_flags);
+}
+
+/*
+ * Handle a work completion whose status is not IB_WC_SUCCESS.
+ *
+ * Receive errors are ignored here: they indicate that the QP has entered
+ * the error state, and the error handling/shutdown code cleans up.
+ *
+ * Send errors transition the QP to SQE.  For the offending send the QP is
+ * moved back to RTS, the sends still queued are marked for retry and the
+ * offending send is failed.  The flushed sends that follow
+ * (IB_WC_WR_FLUSH_ERR) are reposted when marked for retry and failed
+ * otherwise.
+ */
+static void mad_error_handler(struct ib_mad_port_private *port_priv,
+			      struct ib_wc *wc)
+{
+	struct ib_mad_send_wr_private *mad_send_wr;
+	struct ib_mad_list_head *mad_list;
+	struct ib_mad_qp_info *qp_info;
+	struct ib_send_wr *bad_send_wr;
+	struct ib_qp_attr *attr;
+	int ret;
+
+	/* Determine if failure was a send or receive */
+	mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id;
+	qp_info = mad_list->mad_queue->qp_info;
+	if (mad_list->mad_queue == &qp_info->recv_queue)
+		return;
+
+	mad_send_wr = container_of(mad_list, struct ib_mad_send_wr_private,
+				   mad_list);
+
+	if (wc->status != IB_WC_WR_FLUSH_ERR) {
+		/* Transition QP to RTS and fail offending send */
+		attr = kmalloc(sizeof *attr, GFP_KERNEL);
+		if (attr) {
+			attr->qp_state = IB_QPS_RTS;
+			attr->cur_qp_state = IB_QPS_SQE;
+			ret = ib_modify_qp(qp_info->qp, attr,
+					   IB_QP_STATE | IB_QP_CUR_STATE);
+			kfree(attr);
+			if (ret)
+				dev_err(&port_priv->device->dev,
+					"mad_error_handler - ib_modify_qp to RTS : %d\n",
+					ret);
+			else
+				mark_sends_for_retry(qp_info);
+		}
+		ib_mad_send_done_handler(port_priv, wc);
+		return;
+	}
+
+	if (mad_send_wr->retry) {
+		/* Repost the flushed send; only a failed repost is completed */
+		mad_send_wr->retry = 0;
+		ret = ib_post_send(qp_info->qp, &mad_send_wr->send_wr,
+				   &bad_send_wr);
+		if (!ret)
+			return;
+	}
+	ib_mad_send_done_handler(port_priv, wc);
+}
+
+/*
+ * IB MAD completion callback (work item).
+ *
+ * Re-arms CQ notification, then drains the port's CQ one entry at a time,
+ * dispatching successful completions by opcode and everything else to
+ * mad_error_handler().
+ */
+static void ib_mad_completion_handler(struct work_struct *work)
+{
+	struct ib_mad_port_private *port_priv =
+		container_of(work, struct ib_mad_port_private, work);
+	struct ib_wc wc;
+
+	ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP);
+
+	while (ib_poll_cq(port_priv->cq, 1, &wc) == 1) {
+		if (wc.status != IB_WC_SUCCESS) {
+			mad_error_handler(port_priv, &wc);
+			continue;
+		}
+
+		if (wc.opcode == IB_WC_SEND)
+			ib_mad_send_done_handler(port_priv, &wc);
+		else if (wc.opcode == IB_WC_RECV)
+			ib_mad_recv_done_handler(port_priv, &wc);
+		else
+			BUG_ON(1);	/* no other opcode is expected on a MAD CQ */
+	}
+}
+
+/*
+ * Cancel all outstanding sends of an agent.
+ *
+ * Entries on the send list that have not failed yet are marked
+ * IB_WC_WR_FLUSH_ERR (dropping one reference when a timeout was set).
+ * Entries on the wait list are detached under the lock and then reported
+ * to the client's send_handler with IB_WC_WR_FLUSH_ERR, each one releasing
+ * one agent reference.
+ */
+static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv)
+{
+	struct ib_mad_send_wr_private *wr, *next_wr;
+	struct ib_mad_send_wc flush_wc;
+	struct list_head cancel_list;
+	unsigned long irq_flags;
+
+	INIT_LIST_HEAD(&cancel_list);
+
+	spin_lock_irqsave(&mad_agent_priv->lock, irq_flags);
+	list_for_each_entry(wr, &mad_agent_priv->send_list, agent_list) {
+		if (wr->status != IB_WC_SUCCESS)
+			continue;
+		wr->status = IB_WC_WR_FLUSH_ERR;
+		if (wr->timeout > 0)
+			wr->refcount--;
+	}
+
+	/* Empty wait list to prevent receives from finding a request */
+	list_splice_init(&mad_agent_priv->wait_list, &cancel_list);
+	spin_unlock_irqrestore(&mad_agent_priv->lock, irq_flags);
+
+	/* Report all cancelled requests */
+	flush_wc.status = IB_WC_WR_FLUSH_ERR;
+	flush_wc.vendor_err = 0;
+
+	list_for_each_entry_safe(wr, next_wr, &cancel_list, agent_list) {
+		flush_wc.send_buf = &wr->send_buf;
+		list_del(&wr->agent_list);
+		mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
+						   &flush_wc);
+		atomic_dec(&mad_agent_priv->refcount);
+	}
+}
+
+/*
+ * Look up the send work request that embeds @send_buf.
+ *
+ * The wait list is searched first; on the send list only entries whose MAD
+ * passes is_data_mad() are considered.  Returns NULL when nothing matches.
+ * The callers in this file hold mad_agent_priv->lock.
+ */
+static struct ib_mad_send_wr_private*
+find_send_wr(struct ib_mad_agent_private *mad_agent_priv,
+	     struct ib_mad_send_buf *send_buf)
+{
+	struct ib_mad_send_wr_private *wr;
+
+	list_for_each_entry(wr, &mad_agent_priv->wait_list, agent_list)
+		if (&wr->send_buf == send_buf)
+			return wr;
+
+	list_for_each_entry(wr, &mad_agent_priv->send_list, agent_list) {
+		if (!is_data_mad(mad_agent_priv, wr->send_buf.mad))
+			continue;
+		if (&wr->send_buf == send_buf)
+			return wr;
+	}
+
+	return NULL;
+}
+
+/*
+ * Change the timeout of an outstanding send, or cancel it.
+ *
+ * @mad_agent: agent that issued the send
+ * @send_buf: send buffer identifying the request
+ * @timeout_ms: new timeout in milliseconds; 0 marks the request as
+ *              flushed (IB_WC_WR_FLUSH_ERR)
+ *
+ * Returns 0 on success, or -EINVAL when the request cannot be found or has
+ * already recorded a failure status.
+ */
+int ib_modify_mad(struct ib_mad_agent *mad_agent,
+		  struct ib_mad_send_buf *send_buf, u32 timeout_ms)
+{
+	struct ib_mad_agent_private *priv =
+		container_of(mad_agent, struct ib_mad_agent_private, agent);
+	struct ib_mad_send_wr_private *wr;
+	unsigned long irq_flags;
+	int active;
+	int ret = -EINVAL;
+
+	spin_lock_irqsave(&priv->lock, irq_flags);
+	wr = find_send_wr(priv, send_buf);
+	if (!wr || wr->status != IB_WC_SUCCESS)
+		goto unlock;
+
+	/* Sampled before a cancel below adjusts the reference count */
+	active = (!wr->timeout || wr->refcount > 1);
+
+	if (timeout_ms == 0) {
+		wr->status = IB_WC_WR_FLUSH_ERR;
+		if (wr->timeout > 0)
+			wr->refcount--;
+	}
+
+	wr->send_buf.timeout_ms = timeout_ms;
+	if (active)
+		wr->timeout = msecs_to_jiffies(timeout_ms);
+	else
+		ib_reset_mad_timeout(wr, timeout_ms);
+	ret = 0;
+
+unlock:
+	spin_unlock_irqrestore(&priv->lock, irq_flags);
+	return ret;
+}
+EXPORT_SYMBOL(ib_modify_mad);
+
+void ib_cancel_mad(struct ib_mad_agent *mad_agent, /* cancel an outstanding send of @mad_agent */
+ struct ib_mad_send_buf *send_buf) /* buffer identifying the send to cancel */
+{
+ ib_modify_mad(mad_agent, send_buf, 0); /* timeout 0 == cancel; the -EINVAL "not found" result is dropped */
+}
+EXPORT_SYMBOL(ib_cancel_mad);
+
+/*
+ * Work item that completes locally handled MADs queued on the agent's
+ * local_list.
+ *
+ * For each entry the response (if any) is delivered to the receiving agent
+ * first, then the send completion is reported to the sending agent.  The
+ * agent lock is dropped while the handlers run and re-taken before the next
+ * list entry is examined.
+ */
+static void local_completions(struct work_struct *work)
+{
+	struct ib_mad_agent_private *mad_agent_priv;
+	struct ib_mad_agent_private *recv_mad_agent;
+	struct ib_mad_local_private *local;
+	struct ib_mad_private *resp;
+	struct ib_mad_send_wc mad_send_wc;
+	struct ib_wc wc;
+	unsigned long flags;
+	int free_mad;
+
+	mad_agent_priv =
+		container_of(work, struct ib_mad_agent_private, local_work);
+
+	spin_lock_irqsave(&mad_agent_priv->lock, flags);
+	while (!list_empty(&mad_agent_priv->local_list)) {
+		local = list_entry(mad_agent_priv->local_list.next,
+				   struct ib_mad_local_private,
+				   completion_list);
+		list_del(&local->completion_list);
+		spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+		free_mad = 0;
+		resp = local->mad_priv;
+		if (resp) {
+			recv_mad_agent = local->recv_mad_agent;
+			if (!recv_mad_agent) {
+				dev_err(&mad_agent_priv->agent.device->dev,
+					"No receive MAD agent for local completion\n");
+				/* Nobody takes the response; free it below */
+				free_mad = 1;
+			} else {
+				/*
+				 * Defined behavior is to complete response
+				 * before request
+				 */
+				build_smp_wc(recv_mad_agent->agent.qp,
+					     (unsigned long) local->mad_send_wr,
+					     be16_to_cpu(IB_LID_PERMISSIVE),
+					     0, recv_mad_agent->agent.port_num,
+					     &wc);
+
+				resp->header.recv_wc.wc = &wc;
+				resp->header.recv_wc.mad_len =
+					sizeof(struct ib_mad);
+				INIT_LIST_HEAD(&resp->header.recv_wc.rmpp_list);
+				list_add(&resp->header.recv_wc.recv_buf.list,
+					 &resp->header.recv_wc.rmpp_list);
+				resp->header.recv_wc.recv_buf.grh = NULL;
+				resp->header.recv_wc.recv_buf.mad =
+					&resp->mad.mad;
+				if (atomic_read(&recv_mad_agent->qp_info->snoop_count))
+					snoop_recv(recv_mad_agent->qp_info,
+						   &resp->header.recv_wc,
+						   IB_MAD_SNOOP_RECVS);
+				recv_mad_agent->agent.recv_handler(
+						&recv_mad_agent->agent,
+						&resp->header.recv_wc);
+				spin_lock_irqsave(&recv_mad_agent->lock, flags);
+				atomic_dec(&recv_mad_agent->refcount);
+				spin_unlock_irqrestore(&recv_mad_agent->lock,
+						       flags);
+			}
+		}
+
+		/* Complete send */
+		mad_send_wc.status = IB_WC_SUCCESS;
+		mad_send_wc.vendor_err = 0;
+		mad_send_wc.send_buf = &local->mad_send_wr->send_buf;
+		if (atomic_read(&mad_agent_priv->qp_info->snoop_count))
+			snoop_send(mad_agent_priv->qp_info,
+				   &local->mad_send_wr->send_buf,
+				   &mad_send_wc, IB_MAD_SNOOP_SEND_COMPLETIONS);
+		mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
+						   &mad_send_wc);
+
+		spin_lock_irqsave(&mad_agent_priv->lock, flags);
+		atomic_dec(&mad_agent_priv->refcount);
+		if (free_mad)
+			kmem_cache_free(ib_mad_cache, local->mad_priv);
+		kfree(local);
+	}
+	spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+}
+
+/*
+ * Resend a request whose response timed out.
+ *
+ * Consumes one retry, re-arms the timeout from send_buf.timeout_ms and
+ * sends the MAD again (through the RMPP layer for kernel RMPP agents).  On
+ * success the request gains a reference and is appended to the agent's
+ * send list.
+ *
+ * Returns 0 on success, -ETIMEDOUT when no retries are left, -ECOMM when
+ * the RMPP layer gives an unexpected result, or the error returned by
+ * ib_send_mad().
+ */
+static int retry_send(struct ib_mad_send_wr_private *mad_send_wr)
+{
+	struct ib_mad_agent_private *agent_priv;
+	int rmpp_result;
+	int ret;
+
+	if (!mad_send_wr->retries_left)
+		return -ETIMEDOUT;
+
+	mad_send_wr->retries_left--;
+	mad_send_wr->send_buf.retries++;
+
+	mad_send_wr->timeout =
+		msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms);
+
+	if (!ib_mad_kernel_rmpp_agent(&mad_send_wr->mad_agent_priv->agent)) {
+		ret = ib_send_mad(mad_send_wr);
+	} else {
+		rmpp_result = ib_retry_rmpp(mad_send_wr);
+		if (rmpp_result == IB_RMPP_RESULT_UNHANDLED)
+			ret = ib_send_mad(mad_send_wr);
+		else if (rmpp_result == IB_RMPP_RESULT_CONSUMED)
+			ret = 0;
+		else
+			ret = -ECOMM;
+	}
+	if (ret)
+		return ret;
+
+	agent_priv = mad_send_wr->mad_agent_priv;
+	mad_send_wr->refcount++;
+	list_add_tail(&mad_send_wr->agent_list, &agent_priv->send_list);
+	return 0;
+}
+
+/*
+ * Delayed work that expires requests on the agent's wait list.
+ *
+ * Entries are taken from the head of the list.  If the head has not expired
+ * yet, the work is re-queued for the remaining time and processing stops.
+ * An expired entry is retried when it has not failed; if it cannot be
+ * retried it is reported to the client, with IB_WC_RESP_TIMEOUT_ERR or its
+ * recorded failure status, and one agent reference is released.
+ */
+static void timeout_sends(struct work_struct *work)
+{
+	struct ib_mad_agent_private *agent_priv;
+	struct ib_mad_send_wr_private *wr;
+	struct ib_mad_send_wc timeout_wc;
+	unsigned long irq_flags, delay;
+
+	agent_priv = container_of(work, struct ib_mad_agent_private,
+				  timed_work.work);
+	timeout_wc.vendor_err = 0;
+
+	spin_lock_irqsave(&agent_priv->lock, irq_flags);
+	while (!list_empty(&agent_priv->wait_list)) {
+		wr = list_entry(agent_priv->wait_list.next,
+				struct ib_mad_send_wr_private, agent_list);
+
+		if (time_after(wr->timeout, jiffies)) {
+			/* Head not due yet: come back when it is */
+			delay = wr->timeout - jiffies;
+			if ((long)delay <= 0)
+				delay = 1;
+			queue_delayed_work(agent_priv->qp_info->port_priv->wq,
+					   &agent_priv->timed_work, delay);
+			break;
+		}
+
+		list_del(&wr->agent_list);
+		if (wr->status == IB_WC_SUCCESS && !retry_send(wr))
+			continue;
+
+		/* Report the failure without holding the agent lock */
+		spin_unlock_irqrestore(&agent_priv->lock, irq_flags);
+
+		timeout_wc.status = (wr->status == IB_WC_SUCCESS) ?
+				    IB_WC_RESP_TIMEOUT_ERR : wr->status;
+		timeout_wc.send_buf = &wr->send_buf;
+		agent_priv->agent.send_handler(&agent_priv->agent,
+					       &timeout_wc);
+
+		atomic_dec(&agent_priv->refcount);
+		spin_lock_irqsave(&agent_priv->lock, irq_flags);
+	}
+	spin_unlock_irqrestore(&agent_priv->lock, irq_flags);
+}
+
+/*
+ * CQ completion callback: defer the actual CQ processing to the port's
+ * workqueue.  Nothing is queued once the port has been unlinked from the
+ * port list; the check is made under ib_mad_port_list_lock.
+ */
+static void ib_mad_thread_completion_handler(struct ib_cq *cq, void *arg)
+{
+	struct ib_mad_port_private *port_priv = cq->cq_context;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&ib_mad_port_list_lock, irq_flags);
+	if (list_empty(&port_priv->port_list))
+		goto unlock;
+	queue_work(port_priv->wq, &port_priv->work);
+unlock:
+	spin_unlock_irqrestore(&ib_mad_port_list_lock, irq_flags);
+}
+
+/*
+ * Allocate receive MADs and post receive WRs for them.
+ *
+ * @qp_info: QP whose receive queue is replenished
+ * @mad: optional buffer to post first instead of allocating a new one.
+ *       Ownership passes to this function: the buffer is either posted or
+ *       freed on failure.
+ *
+ * Buffers are posted one at a time until the receive queue count reaches
+ * max_active or an error occurs.
+ *
+ * Returns 0 on success, -ENOMEM when allocation or DMA mapping fails, or
+ * the error returned by ib_post_recv().
+ */
+static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
+				    struct ib_mad_private *mad)
+{
+	unsigned long flags;
+	int post, ret;
+	struct ib_mad_private *mad_priv;
+	struct ib_sge sg_list;
+	struct ib_recv_wr recv_wr, *bad_recv_wr;
+	struct ib_mad_queue *recv_queue = &qp_info->recv_queue;
+
+	/* Initialize common scatter list fields */
+	sg_list.length = sizeof *mad_priv - sizeof mad_priv->header;
+	sg_list.lkey = (*qp_info->port_priv->mr).lkey;
+
+	/* Initialize common receive WR fields */
+	recv_wr.next = NULL;
+	recv_wr.sg_list = &sg_list;
+	recv_wr.num_sge = 1;
+
+	do {
+		/* Allocate and map receive buffer */
+		if (mad) {
+			mad_priv = mad;
+			mad = NULL;
+		} else {
+			mad_priv = kmem_cache_alloc(ib_mad_cache, GFP_KERNEL);
+			if (!mad_priv) {
+				dev_err(&qp_info->port_priv->device->dev,
+					"No memory for receive buffer\n");
+				ret = -ENOMEM;
+				break;
+			}
+		}
+		/* Only the part after the private header is visible to the HCA */
+		sg_list.addr = ib_dma_map_single(qp_info->port_priv->device,
+						 &mad_priv->grh,
+						 sizeof *mad_priv -
+						   sizeof mad_priv->header,
+						 DMA_FROM_DEVICE);
+		if (unlikely(ib_dma_mapping_error(qp_info->port_priv->device,
+						  sg_list.addr))) {
+			/*
+			 * The buffer is not on any list yet, so it would be
+			 * lost if we did not release it here.
+			 */
+			kmem_cache_free(ib_mad_cache, mad_priv);
+			ret = -ENOMEM;
+			break;
+		}
+		mad_priv->header.mapping = sg_list.addr;
+		recv_wr.wr_id = (unsigned long)&mad_priv->header.mad_list;
+		mad_priv->header.mad_list.mad_queue = recv_queue;
+
+		/* Post receive WR */
+		spin_lock_irqsave(&recv_queue->lock, flags);
+		post = (++recv_queue->count < recv_queue->max_active);
+		list_add_tail(&mad_priv->header.mad_list.list, &recv_queue->list);
+		spin_unlock_irqrestore(&recv_queue->lock, flags);
+		ret = ib_post_recv(qp_info->qp, &recv_wr, &bad_recv_wr);
+		if (ret) {
+			/* Undo the accounting, unmap and release the buffer */
+			spin_lock_irqsave(&recv_queue->lock, flags);
+			list_del(&mad_priv->header.mad_list.list);
+			recv_queue->count--;
+			spin_unlock_irqrestore(&recv_queue->lock, flags);
+			ib_dma_unmap_single(qp_info->port_priv->device,
+					    mad_priv->header.mapping,
+					    sizeof *mad_priv -
+					      sizeof mad_priv->header,
+					    DMA_FROM_DEVICE);
+			kmem_cache_free(ib_mad_cache, mad_priv);
+			dev_err(&qp_info->port_priv->device->dev,
+				"ib_post_recv failed: %d\n", ret);
+			break;
+		}
+	} while (post);
+
+	return ret;
+}
+
+/*
+ * Return all the posted receive MADs: every buffer still linked on the
+ * QP's receive queue is unmapped and given back to ib_mad_cache.  Does
+ * nothing when the QP was never created.
+ */
+static void cleanup_recv_queue(struct ib_mad_qp_info *qp_info)
+{
+	struct list_head *posted = &qp_info->recv_queue.list;
+	struct ib_mad_list_head *mad_list;
+	struct ib_mad_private *recv;
+
+	if (!qp_info->qp)
+		return;
+
+	while (!list_empty(posted)) {
+		mad_list = list_entry(posted->next,
+				      struct ib_mad_list_head, list);
+		/* mad_list sits inside the header at the front of the buffer */
+		recv = container_of(mad_list, struct ib_mad_private,
+				    header.mad_list);
+
+		/* Remove from posted receive MAD list */
+		list_del(&mad_list->list);
+
+		ib_dma_unmap_single(qp_info->port_priv->device,
+				    recv->header.mapping,
+				    sizeof *recv - sizeof recv->header,
+				    DMA_FROM_DEVICE);
+		kmem_cache_free(ib_mad_cache, recv);
+	}
+
+	qp_info->recv_queue.count = 0;
+}
+
+/*
+ * Move one MAD QP through INIT and RTR to RTS.  @attr is scratch space
+ * owned by the caller; @index is only used in the error messages and to
+ * select the QP.
+ */
+static int ib_mad_qp_to_rts(struct ib_mad_port_private *port_priv, int index,
+			    struct ib_qp_attr *attr, u16 pkey_index)
+{
+	struct ib_qp *qp = port_priv->qp_info[index].qp;
+	int err;
+
+	/*
+	 * PKey index for QP1 is irrelevant but
+	 * one is needed for the Reset to Init transition
+	 */
+	attr->qp_state = IB_QPS_INIT;
+	attr->pkey_index = pkey_index;
+	attr->qkey = (qp->qp_num == 0) ? 0 : IB_QP1_QKEY;
+	err = ib_modify_qp(qp, attr,
+			   IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_QKEY);
+	if (err) {
+		dev_err(&port_priv->device->dev,
+			"Couldn't change QP%d state to INIT: %d\n",
+			index, err);
+		return err;
+	}
+
+	attr->qp_state = IB_QPS_RTR;
+	err = ib_modify_qp(qp, attr, IB_QP_STATE);
+	if (err) {
+		dev_err(&port_priv->device->dev,
+			"Couldn't change QP%d state to RTR: %d\n",
+			index, err);
+		return err;
+	}
+
+	attr->qp_state = IB_QPS_RTS;
+	attr->sq_psn = IB_MAD_SEND_Q_PSN;
+	err = ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_SQ_PSN);
+	if (err)
+		dev_err(&port_priv->device->dev,
+			"Couldn't change QP%d state to RTS: %d\n",
+			index, err);
+	return err;
+}
+
+/*
+ * Start the port: bring every existing MAD QP to RTS, arm the CQ and fill
+ * the receive queues.  Returns 0 or a negative errno.
+ */
+static int ib_mad_port_start(struct ib_mad_port_private *port_priv)
+{
+	struct ib_qp_attr *attr;
+	u16 pkey_index;
+	int ret, i;
+
+	attr = kmalloc(sizeof *attr, GFP_KERNEL);
+	if (!attr) {
+		dev_err(&port_priv->device->dev,
+			"Couldn't kmalloc ib_qp_attr\n");
+		return -ENOMEM;
+	}
+
+	/* Fall back to index 0 when the default full PKey is not present */
+	ret = ib_find_pkey(port_priv->device, port_priv->port_num,
+			   IB_DEFAULT_PKEY_FULL, &pkey_index);
+	if (ret)
+		pkey_index = 0;
+
+	for (i = 0; i < IB_MAD_QPS_CORE; i++) {
+		if (!port_priv->qp_info[i].qp)
+			continue;
+
+		ret = ib_mad_qp_to_rts(port_priv, i, attr, pkey_index);
+		if (ret)
+			goto out;
+	}
+
+	ret = ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP);
+	if (ret) {
+		dev_err(&port_priv->device->dev,
+			"Failed to request completion notification: %d\n",
+			ret);
+		goto out;
+	}
+
+	for (i = 0; i < IB_MAD_QPS_CORE; i++) {
+		if (!port_priv->qp_info[i].qp)
+			continue;
+
+		ret = ib_mad_post_receive_mads(&port_priv->qp_info[i], NULL);
+		if (ret) {
+			dev_err(&port_priv->device->dev,
+				"Couldn't post receive WRs\n");
+			goto out;
+		}
+	}
+out:
+	kfree(attr);
+	return ret;
+}
+
+/*
+ * Asynchronous event callback for the MAD QPs; the event is only logged.
+ */
+static void qp_event_handler(struct ib_event *event, void *qp_context)
+{
+	struct ib_mad_qp_info *qp_info = qp_context;
+	struct ib_device *device = qp_info->port_priv->device;
+
+	/* It's worse than that! He's dead, Jim! */
+	dev_err(&device->dev, "Fatal error (%d) on MAD QP (%d)\n",
+		event->event, qp_info->qp->qp_num);
+}
+
+/*
+ * Put a send or receive queue into its empty initial state and tie it to
+ * the owning QP.  max_active is left for create_mad_qp() to set.
+ */
+static void init_mad_queue(struct ib_mad_qp_info *qp_info,
+			   struct ib_mad_queue *mad_queue)
+{
+	spin_lock_init(&mad_queue->lock);
+	INIT_LIST_HEAD(&mad_queue->list);
+	mad_queue->count = 0;
+	mad_queue->qp_info = qp_info;
+}
+
+/*
+ * Initialize the software state of one MAD QP slot (queues, overflow list
+ * and snoop table).  The QP itself is created later by create_mad_qp().
+ */
+static void init_mad_qp(struct ib_mad_port_private *port_priv,
+			struct ib_mad_qp_info *qp_info)
+{
+	qp_info->port_priv = port_priv;
+
+	init_mad_queue(qp_info, &qp_info->send_queue);
+	init_mad_queue(qp_info, &qp_info->recv_queue);
+	INIT_LIST_HEAD(&qp_info->overflow_list);
+
+	/* No snoop agents registered yet */
+	spin_lock_init(&qp_info->snoop_lock);
+	atomic_set(&qp_info->snoop_count, 0);
+	qp_info->snoop_table_size = 0;
+	qp_info->snoop_table = NULL;
+}
+
+/*
+ * Create the special QP of type @qp_type for a port and record the queue
+ * limits.
+ *
+ * Send and receive completions both go to the port's CQ.  On failure
+ * qp_info->qp is reset to NULL, so the teardown helpers, which test
+ * qp_info->qp against NULL, never see an error pointer.
+ *
+ * Returns 0 on success or the negative errno from ib_create_qp().
+ */
+static int create_mad_qp(struct ib_mad_qp_info *qp_info,
+			 enum ib_qp_type qp_type)
+{
+	struct ib_qp_init_attr qp_init_attr;
+	int ret;
+
+	memset(&qp_init_attr, 0, sizeof qp_init_attr);
+	qp_init_attr.send_cq = qp_info->port_priv->cq;
+	qp_init_attr.recv_cq = qp_info->port_priv->cq;
+	qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR;
+	qp_init_attr.cap.max_send_wr = mad_sendq_size;
+	qp_init_attr.cap.max_recv_wr = mad_recvq_size;
+	qp_init_attr.cap.max_send_sge = IB_MAD_SEND_REQ_MAX_SG;
+	qp_init_attr.cap.max_recv_sge = IB_MAD_RECV_REQ_MAX_SG;
+	qp_init_attr.qp_type = qp_type;
+	qp_init_attr.port_num = qp_info->port_priv->port_num;
+	qp_init_attr.qp_context = qp_info;
+	qp_init_attr.event_handler = qp_event_handler;
+	qp_info->qp = ib_create_qp(qp_info->port_priv->pd, &qp_init_attr);
+	if (IS_ERR(qp_info->qp)) {
+		ret = PTR_ERR(qp_info->qp);
+		/* Do not leave an ERR_PTR behind where NULL means "no QP" */
+		qp_info->qp = NULL;
+		dev_err(&qp_info->port_priv->device->dev,
+			"Couldn't create ib_mad QP%d\n",
+			get_spl_qp_index(qp_type));
+		return ret;
+	}
+	/* Use minimum queue sizes unless the CQ is resized */
+	qp_info->send_queue.max_active = mad_sendq_size;
+	qp_info->recv_queue.max_active = mad_recvq_size;
+	return 0;
+}
+
+/*
+ * Destroy a MAD QP and free its snoop table.  Does nothing for a slot
+ * whose QP was never created.
+ */
+static void destroy_mad_qp(struct ib_mad_qp_info *qp_info)
+{
+	if (qp_info->qp) {
+		ib_destroy_qp(qp_info->qp);
+		kfree(qp_info->snoop_table);
+	}
+}
+
+/*
+ * Open the port.
+ *
+ * Allocates the per-port state and creates the CQ, PD, DMA MR, the GSI QP
+ * (plus the SMI QP on InfiniBand link layers) and the completion
+ * workqueue, links the port onto ib_mad_port_list and starts it.
+ * Everything is unwound on failure.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int ib_mad_port_open(struct ib_device *device,
+			    int port_num)
+{
+	struct ib_mad_port_private *port_priv;
+	char name[sizeof "ib_mad123"];
+	unsigned long flags;
+	int has_smi;
+	int cq_size;
+	int ret;
+
+	/* Create new device info */
+	port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL);
+	if (!port_priv) {
+		dev_err(&device->dev, "No memory for ib_mad_port_private\n");
+		return -ENOMEM;
+	}
+
+	port_priv->device = device;
+	port_priv->port_num = port_num;
+	spin_lock_init(&port_priv->reg_lock);
+	INIT_LIST_HEAD(&port_priv->agent_list);
+	init_mad_qp(port_priv, &port_priv->qp_info[0]);
+	init_mad_qp(port_priv, &port_priv->qp_info[1]);
+
+	/* One CQ serves every QP of the port: size it for both when SMI exists */
+	cq_size = mad_sendq_size + mad_recvq_size;
+	has_smi = rdma_port_get_link_layer(device, port_num) ==
+		  IB_LINK_LAYER_INFINIBAND;
+	if (has_smi)
+		cq_size *= 2;
+
+	port_priv->cq = ib_create_cq(port_priv->device,
+				     ib_mad_thread_completion_handler,
+				     NULL, port_priv, cq_size, 0);
+	if (IS_ERR(port_priv->cq)) {
+		dev_err(&device->dev, "Couldn't create ib_mad CQ\n");
+		ret = PTR_ERR(port_priv->cq);
+		goto err_free_port;
+	}
+
+	port_priv->pd = ib_alloc_pd(device);
+	if (IS_ERR(port_priv->pd)) {
+		dev_err(&device->dev, "Couldn't create ib_mad PD\n");
+		ret = PTR_ERR(port_priv->pd);
+		goto err_destroy_cq;
+	}
+
+	port_priv->mr = ib_get_dma_mr(port_priv->pd, IB_ACCESS_LOCAL_WRITE);
+	if (IS_ERR(port_priv->mr)) {
+		dev_err(&device->dev, "Couldn't get ib_mad DMA MR\n");
+		ret = PTR_ERR(port_priv->mr);
+		goto err_dealloc_pd;
+	}
+
+	if (has_smi) {
+		ret = create_mad_qp(&port_priv->qp_info[0], IB_QPT_SMI);
+		if (ret)
+			goto err_dereg_mr;
+	}
+	ret = create_mad_qp(&port_priv->qp_info[1], IB_QPT_GSI);
+	if (ret)
+		goto err_destroy_smi_qp;
+
+	snprintf(name, sizeof name, "ib_mad%d", port_num);
+	port_priv->wq = create_singlethread_workqueue(name);
+	if (!port_priv->wq) {
+		ret = -ENOMEM;
+		goto err_destroy_gsi_qp;
+	}
+	INIT_WORK(&port_priv->work, ib_mad_completion_handler);
+
+	spin_lock_irqsave(&ib_mad_port_list_lock, flags);
+	list_add_tail(&port_priv->port_list, &ib_mad_port_list);
+	spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+
+	ret = ib_mad_port_start(port_priv);
+	if (ret) {
+		dev_err(&device->dev, "Couldn't start port\n");
+		goto err_unlist_port;
+	}
+
+	return 0;
+
+err_unlist_port:
+	spin_lock_irqsave(&ib_mad_port_list_lock, flags);
+	list_del_init(&port_priv->port_list);
+	spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+
+	destroy_workqueue(port_priv->wq);
+err_destroy_gsi_qp:
+	destroy_mad_qp(&port_priv->qp_info[1]);
+err_destroy_smi_qp:
+	destroy_mad_qp(&port_priv->qp_info[0]);
+err_dereg_mr:
+	ib_dereg_mr(port_priv->mr);
+err_dealloc_pd:
+	ib_dealloc_pd(port_priv->pd);
+err_destroy_cq:
+	ib_destroy_cq(port_priv->cq);
+	cleanup_recv_queue(&port_priv->qp_info[1]);
+	cleanup_recv_queue(&port_priv->qp_info[0]);
+err_free_port:
+	kfree(port_priv);
+
+	return ret;
+}
+
+/*
+ * Close the port.
+ *
+ * Unlinks the port from ib_mad_port_list, then releases its workqueue,
+ * QPs, MR, PD and CQ, frees the receive buffers still posted and finally
+ * the port structure itself.
+ *
+ * Returns 0 on success or -ENODEV when the port is not open.
+ */
+static int ib_mad_port_close(struct ib_device *device, int port_num)
+{
+	struct ib_mad_port_private *port_priv;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&ib_mad_port_list_lock, irq_flags);
+	port_priv = __ib_get_mad_port(device, port_num);
+	if (port_priv)
+		list_del_init(&port_priv->port_list);
+	spin_unlock_irqrestore(&ib_mad_port_list_lock, irq_flags);
+
+	if (!port_priv) {
+		dev_err(&device->dev, "Port %d not found\n", port_num);
+		return -ENODEV;
+	}
+
+	destroy_workqueue(port_priv->wq);
+	destroy_mad_qp(&port_priv->qp_info[1]);
+	destroy_mad_qp(&port_priv->qp_info[0]);
+	ib_dereg_mr(port_priv->mr);
+	ib_dealloc_pd(port_priv->pd);
+	ib_destroy_cq(port_priv->cq);
+	cleanup_recv_queue(&port_priv->qp_info[1]);
+	cleanup_recv_queue(&port_priv->qp_info[0]);
+	/* XXX: Handle deallocation of MAD registration tables */
+
+	kfree(port_priv);
+
+	return 0;
+}
+
+/*
+ * IB client "add" callback: open the MAD port and the agent port for every
+ * port of an IB transport device (port 0 only on a switch, ports
+ * 1..phys_port_cnt otherwise).  If any port fails, the ports opened so far
+ * are closed again.
+ */
+static void ib_mad_init_device(struct ib_device *device)
+{
+	int first, last, port;
+
+	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+		return;
+
+	if (device->node_type == RDMA_NODE_IB_SWITCH) {
+		first = 0;
+		last = 0;
+	} else {
+		first = 1;
+		last = device->phys_port_cnt;
+	}
+
+	for (port = first; port <= last; port++) {
+		if (ib_mad_port_open(device, port)) {
+			dev_err(&device->dev, "Couldn't open port %d\n", port);
+			break;
+		}
+		if (ib_agent_port_open(device, port)) {
+			dev_err(&device->dev,
+				"Couldn't open port %d for agents\n", port);
+			/* The MAD half of this port is open; undo it */
+			if (ib_mad_port_close(device, port))
+				dev_err(&device->dev,
+					"Couldn't close port %d\n", port);
+			break;
+		}
+	}
+
+	/* Every port came up */
+	if (port > last)
+		return;
+
+	/* Unwind the ports that were fully opened before the failure */
+	for (port--; port >= first; port--) {
+		if (ib_agent_port_close(device, port))
+			dev_err(&device->dev,
+				"Couldn't close port %d for agents\n", port);
+		if (ib_mad_port_close(device, port))
+			dev_err(&device->dev, "Couldn't close port %d\n", port);
+	}
+}
+
+/*
+ * IB client "remove" callback: close the agent port and the MAD port for
+ * every port of an IB transport device.  Failures are logged and the
+ * remaining ports are still processed.
+ */
+static void ib_mad_remove_device(struct ib_device *device)
+{
+	int first_port, num_ports, port;
+
+	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+		return;
+
+	if (device->node_type == RDMA_NODE_IB_SWITCH) {
+		first_port = 0;
+		num_ports = 1;
+	} else {
+		first_port = 1;
+		num_ports = device->phys_port_cnt;
+	}
+
+	for (port = first_port; port < first_port + num_ports; port++) {
+		if (ib_agent_port_close(device, port))
+			dev_err(&device->dev,
+				"Couldn't close port %d for agents\n",
+				port);
+		if (ib_mad_port_close(device, port))
+			dev_err(&device->dev, "Couldn't close port %d\n",
+				port);
+	}
+}
+
+static struct ib_client mad_client = { /* client descriptor passed to ib_register_client()/ib_unregister_client() */
+ .name = "mad",
+ .add = ib_mad_init_device, /* opens the MAD and agent ports of a device */
+ .remove = ib_mad_remove_device /* closes those ports again */
+};
+
+/*
+ * Module init: clamp the queue size parameters, create the slab cache for
+ * MAD buffers and register as an IB client.
+ *
+ * Returns 0 on success, -ENOMEM when the cache cannot be created or
+ * -EINVAL when client registration fails.
+ */
+static int __init ib_mad_init_module(void)
+{
+	/* Keep both sizes within [IB_MAD_QP_MIN_SIZE, IB_MAD_QP_MAX_SIZE] */
+	mad_recvq_size = min(mad_recvq_size, IB_MAD_QP_MAX_SIZE);
+	mad_recvq_size = max(mad_recvq_size, IB_MAD_QP_MIN_SIZE);
+
+	mad_sendq_size = min(mad_sendq_size, IB_MAD_QP_MAX_SIZE);
+	mad_sendq_size = max(mad_sendq_size, IB_MAD_QP_MIN_SIZE);
+
+	ib_mad_cache = kmem_cache_create("ib_mad",
+					 sizeof(struct ib_mad_private),
+					 0,
+					 SLAB_HWCACHE_ALIGN,
+					 NULL);
+	if (!ib_mad_cache) {
+		pr_err("Couldn't create ib_mad cache\n");
+		return -ENOMEM;
+	}
+
+	INIT_LIST_HEAD(&ib_mad_port_list);
+
+	if (ib_register_client(&mad_client)) {
+		pr_err("Couldn't register ib_mad client\n");
+		kmem_cache_destroy(ib_mad_cache);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static void __exit ib_mad_cleanup_module(void) /* module exit: undo ib_mad_init_module() in reverse order */
+{
+ ib_unregister_client(&mad_client); /* drop the IB client registration first */
+ kmem_cache_destroy(ib_mad_cache); /* then release the MAD buffer cache */
+}
+
+module_init(ib_mad_init_module);
+module_exit(ib_mad_cleanup_module);
diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h
new file mode 100644
index 000000000..d1a0b0ee9
--- /dev/null
+++ b/drivers/infiniband/core/mad_priv.h
@@ -0,0 +1,227 @@
+/*
+ * Copyright (c) 2004, 2005, Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2009 HNR Consulting. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __IB_MAD_PRIV_H__
+#define __IB_MAD_PRIV_H__
+
+#include <linux/completion.h>
+#include <linux/err.h>
+#include <linux/workqueue.h>
+#include <rdma/ib_mad.h>
+#include <rdma/ib_smi.h>
+
+#define IB_MAD_QPS_CORE 2 /* Always QP0 and QP1 as a minimum */
+
+/* QP and CQ parameters */
+#define IB_MAD_QP_SEND_SIZE 128
+#define IB_MAD_QP_RECV_SIZE 512
+#define IB_MAD_QP_MIN_SIZE 64
+#define IB_MAD_QP_MAX_SIZE 8192
+#define IB_MAD_SEND_REQ_MAX_SG 2
+#define IB_MAD_RECV_REQ_MAX_SG 1
+
+#define IB_MAD_SEND_Q_PSN 0
+
+/* Registration table sizes */
+#define MAX_MGMT_CLASS 80
+#define MAX_MGMT_VERSION 8
+#define MAX_MGMT_OUI 8
+#define MAX_MGMT_VENDOR_RANGE2 (IB_MGMT_CLASS_VENDOR_RANGE2_END - \
+ IB_MGMT_CLASS_VENDOR_RANGE2_START + 1)
+
+struct ib_mad_list_head { /* per-work-request cookie; its address is what travels in wr_id */
+ struct list_head list; /* linkage on an ib_mad_queue list or on a QP's overflow_list */
+ struct ib_mad_queue *mad_queue; /* queue the request belongs to; tells sends from receives */
+};
+
+struct ib_mad_private_header { /* host-side bookkeeping at the front of struct ib_mad_private */
+ struct ib_mad_list_head mad_list; /* receive-queue linkage; &mad_list is used as the recv wr_id */
+ struct ib_mad_recv_wc recv_wc; /* completion descriptor handed to an agent's recv_handler */
+ struct ib_wc wc;
+ u64 mapping; /* DMA address of the mapped buffer, kept for the later unmap */
+} __attribute__ ((packed));
+
+struct ib_mad_private { /* one MAD buffer, allocated from ib_mad_cache */
+ struct ib_mad_private_header header; /* not part of the receive DMA mapping */
+ struct ib_grh grh; /* the receive DMA mapping starts here */
+ union { /* alternative views of the same MAD bytes */
+ struct ib_mad mad;
+ struct ib_rmpp_mad rmpp_mad;
+ struct ib_smp smp;
+ } mad;
+} __attribute__ ((packed));
+
+/*
+ * One segment of an RMPP send, kept on a list (presumably the rmpp_list of
+ * struct ib_mad_send_wr_private - confirm against mad.c).  The payload
+ * follows the header directly in the same allocation.
+ */
+struct ib_rmpp_segment {
+	struct list_head list;
+	u32 num;
+	u8 data[];	/* C99 flexible array member; sizeof() excludes it */
+};
+
+struct ib_mad_agent_private { /* core-private state wrapped around a client's struct ib_mad_agent */
+ struct list_head agent_list;
+ struct ib_mad_agent agent; /* public handle; container_of() on it recovers this structure */
+ struct ib_mad_reg_req *reg_req;
+ struct ib_mad_qp_info *qp_info; /* QP this agent sends and receives on */
+
+ spinlock_t lock; /* held while send_list, wait_list and local_list are walked or changed */
+ struct list_head send_list; /* sends handed to the QP; retry_send() re-adds entries here */
+ struct list_head wait_list; /* requests waiting for a response; expired by timeout_sends() */
+ struct list_head done_list;
+ struct delayed_work timed_work; /* runs timeout_sends() */
+ unsigned long timeout;
+ struct list_head local_list; /* struct ib_mad_local_private entries awaiting local_completions() */
+ struct work_struct local_work; /* runs local_completions() */
+ struct list_head rmpp_list;
+
+ atomic_t refcount; /* decremented once for each completion reported to the client */
+ struct completion comp;
+};
+
+struct ib_mad_snoop_private { /* element type of ib_mad_qp_info.snoop_table */
+ struct ib_mad_agent agent;
+ struct ib_mad_qp_info *qp_info;
+ int snoop_index; /* presumably the slot in snoop_table - confirm against mad.c */
+ int mad_snoop_flags;
+ atomic_t refcount;
+ struct completion comp;
+};
+
+struct ib_mad_send_wr_private { /* core-private state of one send request */
+ struct ib_mad_list_head mad_list; /* send-queue/overflow linkage; its address is the send wr_id */
+ struct list_head agent_list; /* linkage on the owning agent's send_list or wait_list */
+ struct ib_mad_agent_private *mad_agent_priv; /* agent that issued the send */
+ struct ib_mad_send_buf send_buf; /* client-visible buffer; identifies the request in ib_modify_mad() */
+ u64 header_mapping; /* DMA address for sg_list[0]; unmapped at send completion */
+ u64 payload_mapping; /* DMA address for sg_list[1]; unmapped at send completion */
+ struct ib_send_wr send_wr; /* work request passed to ib_post_send() */
+ struct ib_sge sg_list[IB_MAD_SEND_REQ_MAX_SG];
+ __be64 tid;
+ unsigned long timeout; /* in jiffies; 0 means no timeout is set */
+ int max_retries;
+ int retries_left; /* decremented by retry_send(); 0 ends retrying with -ETIMEDOUT */
+ int retry; /* set by mark_sends_for_retry(): repost this send if it is flushed */
+ int refcount;
+ enum ib_wc_status status; /* IB_WC_SUCCESS until a failure or cancel is recorded */
+
+ /* RMPP control */
+ struct list_head rmpp_list;
+ struct ib_rmpp_segment *last_ack_seg;
+ struct ib_rmpp_segment *cur_seg;
+ int last_ack;
+ int seg_num;
+ int newwin;
+ int pad;
+};
+
+struct ib_mad_local_private { /* one locally completed MAD, processed by local_completions() */
+ struct list_head completion_list; /* linkage on the sending agent's local_list */
+ struct ib_mad_private *mad_priv; /* response buffer to deliver; may be NULL */
+ struct ib_mad_agent_private *recv_mad_agent; /* agent that receives the response; may be NULL */
+ struct ib_mad_send_wr_private *mad_send_wr; /* the send whose completion is reported */
+};
+
+struct ib_mad_mgmt_method_table { /* registration table: agent pointers, IB_MGMT_MAX_METHODS slots */
+ struct ib_mad_agent_private *agent[IB_MGMT_MAX_METHODS]; /* presumably indexed by method - confirm in mad.c */
+};
+
+struct ib_mad_mgmt_class_table { /* registration table: method tables, MAX_MGMT_CLASS slots */
+ struct ib_mad_mgmt_method_table *method_table[MAX_MGMT_CLASS]; /* presumably indexed by management class - confirm in mad.c */
+};
+
+struct ib_mad_mgmt_vendor_class { /* registration table for one vendor class: up to MAX_MGMT_OUI OUIs */
+ u8 oui[MAX_MGMT_OUI][3]; /* 3-byte OUI for each slot */
+ struct ib_mad_mgmt_method_table *method_table[MAX_MGMT_OUI]; /* same slot count as oui[]; presumably parallel - confirm in mad.c */
+};
+
+struct ib_mad_mgmt_vendor_class_table { /* one slot per class in the vendor "range 2" span */
+ struct ib_mad_mgmt_vendor_class *vendor_class[MAX_MGMT_VENDOR_RANGE2];
+};
+
+struct ib_mad_mgmt_version_table { /* element of ib_mad_port_private.version[] */
+ struct ib_mad_mgmt_class_table *class; /* class registrations */
+ struct ib_mad_mgmt_vendor_class_table *vendor; /* vendor-class registrations */
+};
+
+struct ib_mad_queue { /* software view of a QP's send or receive queue */
+ spinlock_t lock; /* taken around changes to list and count */
+ struct list_head list; /* work requests on the queue, linked via ib_mad_list_head.list */
+ int count; /* number of requests accounted to this queue */
+ int max_active; /* limit compared against count; set from mad_sendq_size/mad_recvq_size */
+ struct ib_mad_qp_info *qp_info; /* owning QP */
+};
+
+struct ib_mad_qp_info { /* per-QP state; a port has IB_MAD_QPS_CORE of these */
+ struct ib_mad_port_private *port_priv; /* owning port */
+ struct ib_qp *qp; /* NULL when this QP was not created */
+ struct ib_mad_queue send_queue;
+ struct ib_mad_queue recv_queue;
+ struct list_head overflow_list; /* sends waiting for room; head is promoted when a send completes */
+ spinlock_t snoop_lock;
+ struct ib_mad_snoop_private **snoop_table; /* freed in destroy_mad_qp() */
+ int snoop_table_size;
+ atomic_t snoop_count; /* non-zero makes completions go through snoop_send()/snoop_recv() */
+};
+
+struct ib_mad_port_private { /* per-port MAD state, created in ib_mad_port_open() */
+ struct list_head port_list; /* linkage on ib_mad_port_list */
+ struct ib_device *device;
+ int port_num;
+ struct ib_cq *cq; /* single CQ used for sends and receives of both QPs */
+ struct ib_pd *pd;
+ struct ib_mr *mr; /* DMA MR; its lkey is used for receive buffers */
+
+ spinlock_t reg_lock;
+ struct ib_mad_mgmt_version_table version[MAX_MGMT_VERSION];
+ struct list_head agent_list;
+ struct workqueue_struct *wq; /* single-threaded workqueue named "ib_mad<port>" */
+ struct work_struct work; /* runs ib_mad_completion_handler() */
+ struct ib_mad_qp_info qp_info[IB_MAD_QPS_CORE]; /* [0] is the SMI QP, [1] the GSI QP */
+};
+
+int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr);
+
+struct ib_mad_send_wr_private *
+ib_find_send_mad(struct ib_mad_agent_private *mad_agent_priv,
+ struct ib_mad_recv_wc *mad_recv_wc);
+
+void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr,
+ struct ib_mad_send_wc *mad_send_wc);
+
+void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr);
+
+void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr,
+ int timeout_ms);
+
+#endif /* __IB_MAD_PRIV_H__ */
diff --git a/drivers/infiniband/core/mad_rmpp.c b/drivers/infiniband/core/mad_rmpp.c
new file mode 100644
index 000000000..f37878c9c
--- /dev/null
+++ b/drivers/infiniband/core/mad_rmpp.c
@@ -0,0 +1,953 @@
+/*
+ * Copyright (c) 2005 Intel Inc. All rights reserved.
+ * Copyright (c) 2005-2006 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/slab.h>
+
+#include "mad_priv.h"
+#include "mad_rmpp.h"
+
+/* Lifecycle of an RMPP receive context (struct mad_rmpp_recv). */
+enum rmpp_state {
+ RMPP_STATE_ACTIVE, /* initial state, set by create_rmpp_recv() */
+ RMPP_STATE_TIMEOUT, /* set by recv_timeout_handler() if still ACTIVE when it runs */
+ RMPP_STATE_COMPLETE, /* set once the segment carrying the LAST flag is reached */
+ RMPP_STATE_CANCELING /* set by ib_cancel_rmpp_recvs() on every listed context */
+};
+
+/* Reassembly context for one incoming RMPP transfer; linked on agent->rmpp_list. */
+struct mad_rmpp_recv {
+ struct ib_mad_agent_private *agent;
+ struct list_head list; /* entry in agent->rmpp_list */
+ struct delayed_work timeout_work; /* runs recv_timeout_handler() */
+ struct delayed_work cleanup_work; /* runs recv_cleanup_handler() */
+ struct completion comp; /* completed when refcount drops to zero */
+ enum rmpp_state state;
+ spinlock_t lock; /* taken around the window/segment bookkeeping below */
+ atomic_t refcount;
+
+ struct ib_ah *ah; /* created from the first segment's work completion */
+ struct ib_mad_recv_wc *rmpp_wc; /* first segment; later ones are chained on its rmpp_list */
+ struct ib_mad_recv_buf *cur_seg_buf; /* last in-order segment received */
+ int last_ack; /* seg_num placed in the most recent ACK, see format_ack() */
+ int seg_num; /* number of the last in-order segment */
+ int newwin; /* window value advertised in ACKs */
+ int repwin; /* window taken from a DS ACK; read by init_newwin() */
+
+ /* Transfer identity; compared field by field in find_rmpp_recv() */
+ __be64 tid;
+ u32 src_qp;
+ u16 slid;
+ u8 mgmt_class;
+ u8 class_version;
+ u8 method;
+};
+
+/*
+ * Drop one reference on @rmpp_recv. The holder that releases the last
+ * reference signals ->comp, which destroy_rmpp_recv() is waiting on.
+ */
+static inline void deref_rmpp_recv(struct mad_rmpp_recv *rmpp_recv)
+{
+	if (!atomic_dec_and_test(&rmpp_recv->refcount))
+		return;
+
+	complete(&rmpp_recv->comp);
+}
+
+/*
+ * Final teardown of @rmpp_recv: release the caller's reference, block until
+ * all other references are gone (deref_rmpp_recv() completes ->comp at
+ * zero), then destroy the address handle and free the context.
+ * Sleeps in wait_for_completion(), so it must not be called under a spinlock.
+ */
+static void destroy_rmpp_recv(struct mad_rmpp_recv *rmpp_recv)
+{
+ deref_rmpp_recv(rmpp_recv);
+ wait_for_completion(&rmpp_recv->comp);
+ ib_destroy_ah(rmpp_recv->ah);
+ kfree(rmpp_recv);
+}
+
+/*
+ * Tear down every RMPP receive context still linked on @agent->rmpp_list.
+ *
+ * Step 1 (under the agent lock): free the reassembly buffer of each context
+ * that is not COMPLETE and mark all contexts CANCELING, so that the work
+ * handlers return early (recv_cleanup_handler() tests for CANCELING,
+ * recv_timeout_handler() for anything other than ACTIVE).
+ * Step 2: cancel pending delayed work and flush the port workqueue so no
+ * handler is still running.
+ * Step 3: unlink and destroy each context.
+ *
+ * NOTE(review): steps 2 and 3 walk the list without the agent lock; this
+ * appears to rely on step 1 having stopped the handlers from touching the list.
+ */
+void ib_cancel_rmpp_recvs(struct ib_mad_agent_private *agent)
+{
+ struct mad_rmpp_recv *rmpp_recv, *temp_rmpp_recv;
+ unsigned long flags;
+
+ spin_lock_irqsave(&agent->lock, flags);
+ list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) {
+ if (rmpp_recv->state != RMPP_STATE_COMPLETE)
+ ib_free_recv_mad(rmpp_recv->rmpp_wc);
+ rmpp_recv->state = RMPP_STATE_CANCELING;
+ }
+ spin_unlock_irqrestore(&agent->lock, flags);
+
+ list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) {
+ cancel_delayed_work(&rmpp_recv->timeout_work);
+ cancel_delayed_work(&rmpp_recv->cleanup_work);
+ }
+
+ flush_workqueue(agent->qp_info->port_priv->wq);
+
+ /* _safe variant: each entry is unlinked and freed inside the loop */
+ list_for_each_entry_safe(rmpp_recv, temp_rmpp_recv,
+ &agent->rmpp_list, list) {
+ list_del(&rmpp_recv->list);
+ destroy_rmpp_recv(rmpp_recv);
+ }
+}
+
+/*
+ * Fill @msg with an RMPP ACK for @rmpp_recv. The first msg->hdr_len bytes
+ * are copied from the received MAD @data, the response bit of the method is
+ * toggled and the RMPP type is set to ACK with the ACTIVE flag. The segment
+ * number and new window are sampled under rmpp_recv->lock, and last_ack is
+ * updated to the acknowledged segment at the same time.
+ */
+static void format_ack(struct ib_mad_send_buf *msg,
+		       struct ib_rmpp_mad *data,
+		       struct mad_rmpp_recv *rmpp_recv)
+{
+	struct ib_rmpp_mad *reply = msg->mad;
+	struct ib_rmpp_hdr *reply_hdr = &reply->rmpp_hdr;
+	unsigned long irq_flags;
+
+	memcpy(reply, &data->mad_hdr, msg->hdr_len);
+
+	reply->mad_hdr.method ^= IB_MGMT_METHOD_RESP;
+	reply_hdr->rmpp_type = IB_MGMT_RMPP_TYPE_ACK;
+	ib_set_rmpp_flags(reply_hdr, IB_MGMT_RMPP_FLAG_ACTIVE);
+
+	spin_lock_irqsave(&rmpp_recv->lock, irq_flags);
+	rmpp_recv->last_ack = rmpp_recv->seg_num;
+	reply_hdr->seg_num = cpu_to_be32(rmpp_recv->seg_num);
+	reply_hdr->paylen_newwin = cpu_to_be32(rmpp_recv->newwin);
+	spin_unlock_irqrestore(&rmpp_recv->lock, irq_flags);
+}
+
+/*
+ * Best-effort ACK of the transfer tracked by @rmpp_recv, addressed using the
+ * source QP and pkey index of @recv_wc. Allocation failure is silently
+ * ignored; if posting fails the send buffer is freed. The ACK reuses
+ * rmpp_recv->ah, so the address handle stays owned by the receive context.
+ */
+static void ack_recv(struct mad_rmpp_recv *rmpp_recv,
+		     struct ib_mad_recv_wc *recv_wc)
+{
+	struct ib_mad_agent *mad_agent = &rmpp_recv->agent->agent;
+	struct ib_mad_send_buf *ack_buf;
+	int data_offset;
+
+	data_offset = ib_get_mad_data_offset(recv_wc->recv_buf.mad->mad_hdr.mgmt_class);
+	ack_buf = ib_create_send_mad(mad_agent, recv_wc->wc->src_qp,
+				     recv_wc->wc->pkey_index, 1, data_offset,
+				     0, GFP_KERNEL);
+	if (IS_ERR(ack_buf))
+		return;
+
+	format_ack(ack_buf, (struct ib_rmpp_mad *) recv_wc->recv_buf.mad, rmpp_recv);
+	ack_buf->ah = rmpp_recv->ah;
+	if (ib_post_send_mad(ack_buf, NULL))
+		ib_free_send_mad(ack_buf);
+}
+
+/*
+ * Allocate a header-only send buffer for replying to @recv_wc, together with
+ * a fresh address handle built from its work completion.
+ *
+ * Returns the buffer, or an ERR_PTR value if either the address handle or
+ * the buffer could not be created. On success the handle is stored both in
+ * msg->ah and msg->context[0]; ib_rmpp_send_handler() uses that equality to
+ * know it has to destroy the handle.
+ */
+static struct ib_mad_send_buf *alloc_response_msg(struct ib_mad_agent *agent,
+						  struct ib_mad_recv_wc *recv_wc)
+{
+	struct ib_mad_send_buf *reply;
+	struct ib_ah *reply_ah;
+	int data_offset;
+
+	reply_ah = ib_create_ah_from_wc(agent->qp->pd, recv_wc->wc,
+					recv_wc->recv_buf.grh, agent->port_num);
+	if (IS_ERR(reply_ah))
+		return (void *) reply_ah;
+
+	data_offset = ib_get_mad_data_offset(recv_wc->recv_buf.mad->mad_hdr.mgmt_class);
+	reply = ib_create_send_mad(agent, recv_wc->wc->src_qp,
+				   recv_wc->wc->pkey_index, 1,
+				   data_offset, 0, GFP_KERNEL);
+	if (IS_ERR(reply)) {
+		ib_destroy_ah(reply_ah);
+		return reply;
+	}
+
+	reply->ah = reply_ah;
+	reply->context[0] = reply_ah;
+	return reply;
+}
+
+/*
+ * Answer the received ACK @recv_wc with an ACK of our own (segment 0,
+ * window 1). The header is copied from the received MAD, so the RMPP type
+ * stays whatever the peer sent; only the method response bit, the ACTIVE
+ * flag, seg_num and paylen_newwin are rewritten. Best effort: allocation
+ * failure is ignored, and a failed post releases both the address handle
+ * and the buffer.
+ */
+static void ack_ds_ack(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *recv_wc)
+{
+ struct ib_mad_send_buf *msg;
+ struct ib_rmpp_mad *rmpp_mad;
+ int ret;
+
+ msg = alloc_response_msg(&agent->agent, recv_wc);
+ if (IS_ERR(msg))
+ return;
+
+ rmpp_mad = msg->mad;
+ memcpy(rmpp_mad, recv_wc->recv_buf.mad, msg->hdr_len);
+
+ rmpp_mad->mad_hdr.method ^= IB_MGMT_METHOD_RESP;
+ ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE);
+ rmpp_mad->rmpp_hdr.seg_num = 0;
+ rmpp_mad->rmpp_hdr.paylen_newwin = cpu_to_be32(1);
+
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret) {
+ /* never posted, so ib_rmpp_send_handler() will not run for it */
+ ib_destroy_ah(msg->ah);
+ ib_free_send_mad(msg);
+ }
+}
+
+/*
+ * Send-completion cleanup for MADs generated by the RMPP layer. Buffers from
+ * alloc_response_msg() carry their private address handle in context[0];
+ * when that matches ->ah the handle is destroyed here. The send buffer is
+ * always freed.
+ */
+void ib_rmpp_send_handler(struct ib_mad_send_wc *mad_send_wc)
+{
+	struct ib_mad_send_buf *done_buf = mad_send_wc->send_buf;
+
+	if (done_buf->context[0] == done_buf->ah)
+		ib_destroy_ah(done_buf->ah);
+
+	ib_free_send_mad(done_buf);
+}
+
+/*
+ * Send an RMPP ABORT carrying @rmpp_status in reply to @recv_wc. The header
+ * is copied from the received MAD and then rewritten: method response bit
+ * toggled, RMPP version/type/flags/status set, segment number and
+ * paylen_newwin cleared. Best effort: allocation failure is ignored, and a
+ * failed post releases both the address handle and the buffer.
+ */
+static void nack_recv(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *recv_wc, u8 rmpp_status)
+{
+ struct ib_mad_send_buf *msg;
+ struct ib_rmpp_mad *rmpp_mad;
+ int ret;
+
+ msg = alloc_response_msg(&agent->agent, recv_wc);
+ if (IS_ERR(msg))
+ return;
+
+ rmpp_mad = msg->mad;
+ memcpy(rmpp_mad, recv_wc->recv_buf.mad, msg->hdr_len);
+
+ rmpp_mad->mad_hdr.method ^= IB_MGMT_METHOD_RESP;
+ rmpp_mad->rmpp_hdr.rmpp_version = IB_MGMT_RMPP_VERSION;
+ rmpp_mad->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_ABORT;
+ ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE);
+ rmpp_mad->rmpp_hdr.rmpp_status = rmpp_status;
+ rmpp_mad->rmpp_hdr.seg_num = 0;
+ rmpp_mad->rmpp_hdr.paylen_newwin = 0;
+
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret) {
+ /* never posted, so ib_rmpp_send_handler() will not run for it */
+ ib_destroy_ah(msg->ah);
+ ib_free_send_mad(msg);
+ }
+}
+
+/*
+ * Delayed-work handler for timeout_work. If the context is still ACTIVE it
+ * is marked TIMEOUT and unlinked under the agent lock, an ABORT with status
+ * T2L is sent to the peer, and the context and the partially reassembled
+ * message are freed. Any other state means someone else owns the teardown,
+ * so the handler just returns.
+ */
+static void recv_timeout_handler(struct work_struct *work)
+{
+ struct mad_rmpp_recv *rmpp_recv =
+ container_of(work, struct mad_rmpp_recv, timeout_work.work);
+ struct ib_mad_recv_wc *rmpp_wc;
+ unsigned long flags;
+
+ spin_lock_irqsave(&rmpp_recv->agent->lock, flags);
+ if (rmpp_recv->state != RMPP_STATE_ACTIVE) {
+ spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags);
+ return;
+ }
+ rmpp_recv->state = RMPP_STATE_TIMEOUT;
+ list_del(&rmpp_recv->list);
+ spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags);
+
+ /* save the pointer: rmpp_recv is freed by destroy_rmpp_recv() below */
+ rmpp_wc = rmpp_recv->rmpp_wc;
+ nack_recv(rmpp_recv->agent, rmpp_wc, IB_MGMT_RMPP_STATUS_T2L);
+ destroy_rmpp_recv(rmpp_recv);
+ ib_free_recv_mad(rmpp_wc);
+}
+
+/*
+ * Delayed-work handler for cleanup_work (queued by complete_rmpp()).
+ * Unlinks and destroys the context unless ib_cancel_rmpp_recvs() has marked
+ * it CANCELING, in which case the canceller performs the teardown. The
+ * reassembled message itself is not freed here.
+ */
+static void recv_cleanup_handler(struct work_struct *work)
+{
+ struct mad_rmpp_recv *rmpp_recv =
+ container_of(work, struct mad_rmpp_recv, cleanup_work.work);
+ unsigned long flags;
+
+ spin_lock_irqsave(&rmpp_recv->agent->lock, flags);
+ if (rmpp_recv->state == RMPP_STATE_CANCELING) {
+ spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags);
+ return;
+ }
+ list_del(&rmpp_recv->list);
+ spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags);
+ destroy_rmpp_recv(rmpp_recv);
+}
+
+/*
+ * Allocate and initialise a receive context for the transfer whose first
+ * segment is @mad_recv_wc. The context starts ACTIVE with one reference,
+ * seg_num/newwin/repwin at 1 and last_ack at 0, and records the transfer
+ * identity (tid, source QP, SLID, class, class version, method) later
+ * matched by find_rmpp_recv(). The context is not linked on any list here.
+ *
+ * Returns the new context, or NULL if the allocation or the address-handle
+ * creation fails. May sleep (GFP_KERNEL).
+ */
+static struct mad_rmpp_recv *
+create_rmpp_recv(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct mad_rmpp_recv *rmpp_recv;
+ struct ib_mad_hdr *mad_hdr;
+
+ rmpp_recv = kmalloc(sizeof *rmpp_recv, GFP_KERNEL);
+ if (!rmpp_recv)
+ return NULL;
+
+ rmpp_recv->ah = ib_create_ah_from_wc(agent->agent.qp->pd,
+ mad_recv_wc->wc,
+ mad_recv_wc->recv_buf.grh,
+ agent->agent.port_num);
+ if (IS_ERR(rmpp_recv->ah))
+ goto error;
+
+ rmpp_recv->agent = agent;
+ init_completion(&rmpp_recv->comp);
+ INIT_DELAYED_WORK(&rmpp_recv->timeout_work, recv_timeout_handler);
+ INIT_DELAYED_WORK(&rmpp_recv->cleanup_work, recv_cleanup_handler);
+ spin_lock_init(&rmpp_recv->lock);
+ rmpp_recv->state = RMPP_STATE_ACTIVE;
+ atomic_set(&rmpp_recv->refcount, 1);
+
+ /* the first segment doubles as the head of the reassembled message */
+ rmpp_recv->rmpp_wc = mad_recv_wc;
+ rmpp_recv->cur_seg_buf = &mad_recv_wc->recv_buf;
+ rmpp_recv->newwin = 1;
+ rmpp_recv->seg_num = 1;
+ rmpp_recv->last_ack = 0;
+ rmpp_recv->repwin = 1;
+
+ mad_hdr = &mad_recv_wc->recv_buf.mad->mad_hdr;
+ rmpp_recv->tid = mad_hdr->tid;
+ rmpp_recv->src_qp = mad_recv_wc->wc->src_qp;
+ rmpp_recv->slid = mad_recv_wc->wc->slid;
+ rmpp_recv->mgmt_class = mad_hdr->mgmt_class;
+ rmpp_recv->class_version = mad_hdr->class_version;
+ rmpp_recv->method = mad_hdr->method;
+ return rmpp_recv;
+
+error: kfree(rmpp_recv);
+ return NULL;
+}
+
+/*
+ * Search @agent->rmpp_list for the receive context that @mad_recv_wc belongs
+ * to. A context matches when transaction ID, source QP, source LID,
+ * management class, class version and method are all equal.
+ *
+ * Returns the matching context or NULL. Takes no lock and no reference;
+ * callers such as acquire_rmpp_recv() and start_rmpp() hold agent->lock
+ * around the call.
+ */
+static struct mad_rmpp_recv *
+find_rmpp_recv(struct ib_mad_agent_private *agent,
+	       struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_mad_hdr *hdr = &mad_recv_wc->recv_buf.mad->mad_hdr;
+	struct mad_rmpp_recv *cur;
+
+	list_for_each_entry(cur, &agent->rmpp_list, list) {
+		if (cur->tid != hdr->tid)
+			continue;
+		if (cur->src_qp != mad_recv_wc->wc->src_qp)
+			continue;
+		if (cur->slid != mad_recv_wc->wc->slid)
+			continue;
+		if (cur->mgmt_class != hdr->mgmt_class)
+			continue;
+		if (cur->class_version != hdr->class_version)
+			continue;
+		if (cur->method != hdr->method)
+			continue;
+
+		return cur;
+	}
+
+	return NULL;
+}
+
+/*
+ * Look up the receive context matching @mad_recv_wc and take a reference on
+ * it, both under the agent lock. Returns NULL if there is no match; on
+ * success the caller must balance with deref_rmpp_recv().
+ */
+static struct mad_rmpp_recv *
+acquire_rmpp_recv(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct mad_rmpp_recv *rmpp_recv;
+ unsigned long flags;
+
+ spin_lock_irqsave(&agent->lock, flags);
+ rmpp_recv = find_rmpp_recv(agent, mad_recv_wc);
+ if (rmpp_recv)
+ atomic_inc(&rmpp_recv->refcount);
+ spin_unlock_irqrestore(&agent->lock, flags);
+ return rmpp_recv;
+}
+
+/*
+ * Link @rmpp_recv at the tail of @agent->rmpp_list unless a context for the
+ * same transfer is already present.
+ *
+ * Returns NULL when the context was inserted, or the already-listed context
+ * (leaving the list untouched) when a duplicate exists. Takes no lock
+ * itself; start_rmpp() calls it with agent->lock held.
+ */
+static struct mad_rmpp_recv *
+insert_rmpp_recv(struct ib_mad_agent_private *agent,
+		 struct mad_rmpp_recv *rmpp_recv)
+{
+	struct mad_rmpp_recv *existing;
+
+	existing = find_rmpp_recv(agent, rmpp_recv->rmpp_wc);
+	if (existing)
+		return existing;
+
+	list_add_tail(&rmpp_recv->list, &agent->rmpp_list);
+	return NULL;
+}
+
+/* Non-zero iff the RMPP header of segment @seg has the LAST flag set. */
+static inline int get_last_flag(struct ib_mad_recv_buf *seg)
+{
+	struct ib_rmpp_hdr *hdr = &((struct ib_rmpp_mad *) seg->mad)->rmpp_hdr;
+
+	return ib_get_rmpp_flags(hdr) & IB_MGMT_RMPP_FLAG_LAST;
+}
+
+/* Segment number of @seg, converted from big-endian to CPU byte order. */
+static inline int get_seg_num(struct ib_mad_recv_buf *seg)
+{
+	struct ib_rmpp_hdr *hdr = &((struct ib_rmpp_mad *) seg->mad)->rmpp_hdr;
+
+	return be32_to_cpu(hdr->seg_num);
+}
+
+/*
+ * Return the segment that follows @seg on the list headed by @rmpp_list, or
+ * NULL if @seg is the last entry.
+ */
+static inline struct ib_mad_recv_buf * get_next_seg(struct list_head *rmpp_list,
+						    struct ib_mad_recv_buf *seg)
+{
+	struct list_head *following = seg->list.next;
+
+	if (following == rmpp_list)
+		return NULL;
+
+	return container_of(following, struct ib_mad_recv_buf, list);
+}
+
+/*
+ * Receive-window increment for @agent: one eighth of the receive queue's
+ * max_active, but never less than one segment.
+ */
+static inline int window_size(struct ib_mad_agent_private *agent)
+{
+	int eighth = agent->qp_info->recv_queue.max_active >> 3;
+
+	return max(eighth, 1);
+}
+
+/*
+ * Find where segment number @seg_num belongs in the segment list
+ * @rmpp_list. The list is scanned from the tail; the first entry with a
+ * smaller segment number is the one the new segment must be inserted after.
+ *
+ * Returns that entry, or NULL if @seg_num is already present (duplicate) or
+ * no entry with a smaller number exists.
+ */
+static struct ib_mad_recv_buf * find_seg_location(struct list_head *rmpp_list,
+						  int seg_num)
+{
+	struct ib_mad_recv_buf *cur;
+
+	list_for_each_entry_reverse(cur, rmpp_list, list) {
+		int cur_num = get_seg_num(cur);
+
+		if (cur_num < seg_num)
+			return cur;
+		if (cur_num == seg_num)
+			return NULL;
+	}
+
+	return NULL;
+}
+
+/*
+ * Starting at @new_buf, advance rmpp_recv->seg_num and ->cur_seg_buf across
+ * every consecutively numbered segment already on the list, stopping at the
+ * first gap or at the end of the list.
+ */
+static void update_seg_num(struct mad_rmpp_recv *rmpp_recv,
+			   struct ib_mad_recv_buf *new_buf)
+{
+	struct list_head *segments = &rmpp_recv->rmpp_wc->rmpp_list;
+	struct ib_mad_recv_buf *seg;
+
+	for (seg = new_buf; seg; seg = get_next_seg(segments, seg)) {
+		if (get_seg_num(seg) != rmpp_recv->seg_num + 1)
+			break;
+
+		rmpp_recv->cur_seg_buf = seg;
+		rmpp_recv->seg_num++;
+	}
+}
+
+/*
+ * Total length of the message reassembled in @rmpp_recv: the class header,
+ * plus seg_num data areas, minus the padding in the current segment. The
+ * padding is IB_MGMT_RMPP_DATA minus that segment's paylen_newwin value and
+ * is treated as zero when it falls outside 0..IB_MGMT_RMPP_DATA.
+ */
+static inline int get_mad_len(struct mad_rmpp_recv *rmpp_recv)
+{
+	struct ib_rmpp_mad *cur_mad =
+		(struct ib_rmpp_mad *)rmpp_recv->cur_seg_buf->mad;
+	int hdr_size = ib_get_mad_data_offset(cur_mad->mad_hdr.mgmt_class);
+	int data_size = sizeof(struct ib_rmpp_mad) - hdr_size;
+	int pad = IB_MGMT_RMPP_DATA - be32_to_cpu(cur_mad->rmpp_hdr.paylen_newwin);
+
+	if (pad < 0 || pad > IB_MGMT_RMPP_DATA)
+		pad = 0;
+
+	return hdr_size + rmpp_recv->seg_num * data_size - pad;
+}
+
+/*
+ * Finish a fully received transfer: send the final ACK, cancel the receive
+ * timeout (only armed for multi-segment transfers, hence the seg_num > 1
+ * test), store the total message length in the reassembled work completion
+ * and schedule cleanup_work to destroy the context 10 seconds later.
+ *
+ * Returns the reassembled message. The caller has already set the state to
+ * COMPLETE.
+ */
+static struct ib_mad_recv_wc * complete_rmpp(struct mad_rmpp_recv *rmpp_recv)
+{
+ struct ib_mad_recv_wc *rmpp_wc;
+
+ ack_recv(rmpp_recv, rmpp_recv->rmpp_wc);
+ if (rmpp_recv->seg_num > 1)
+ cancel_delayed_work(&rmpp_recv->timeout_work);
+
+ rmpp_wc = rmpp_recv->rmpp_wc;
+ rmpp_wc->mad_len = get_mad_len(rmpp_recv);
+ /* 10 seconds until we can find the packet lifetime */
+ queue_delayed_work(rmpp_recv->agent->qp_info->port_priv->wq,
+ &rmpp_recv->cleanup_work, msecs_to_jiffies(10000));
+ return rmpp_wc;
+}
+
+/*
+ * Handle a DATA segment other than the first one (also used for a duplicate
+ * first segment, see start_rmpp()).
+ *
+ * Returns the fully reassembled message once the segment carrying the LAST
+ * flag has been reached in order, otherwise NULL. Ownership of
+ * @mad_recv_wc: it is either linked into the reassembly list or freed on
+ * one of the drop paths.
+ *
+ * Drop ladder: drop3 releases rmpp_recv->lock, drop2 drops the reference
+ * taken by acquire_rmpp_recv(), drop1 frees the segment.
+ */
+static struct ib_mad_recv_wc *
+continue_rmpp(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct mad_rmpp_recv *rmpp_recv;
+ struct ib_mad_recv_buf *prev_buf;
+ struct ib_mad_recv_wc *done_wc;
+ int seg_num;
+ unsigned long flags;
+
+ rmpp_recv = acquire_rmpp_recv(agent, mad_recv_wc);
+ if (!rmpp_recv)
+ goto drop1;
+
+ seg_num = get_seg_num(&mad_recv_wc->recv_buf);
+
+ spin_lock_irqsave(&rmpp_recv->lock, flags);
+ /* timed-out transfer, or segment beyond the advertised window: drop silently */
+ if ((rmpp_recv->state == RMPP_STATE_TIMEOUT) ||
+ (seg_num > rmpp_recv->newwin))
+ goto drop3;
+
+ /* already acknowledged or transfer finished: re-send the ACK and drop */
+ if ((seg_num <= rmpp_recv->last_ack) ||
+ (rmpp_recv->state == RMPP_STATE_COMPLETE)) {
+ spin_unlock_irqrestore(&rmpp_recv->lock, flags);
+ ack_recv(rmpp_recv, mad_recv_wc);
+ goto drop2;
+ }
+
+ /* NULL means a duplicate of a segment that is already queued */
+ prev_buf = find_seg_location(&rmpp_recv->rmpp_wc->rmpp_list, seg_num);
+ if (!prev_buf)
+ goto drop3;
+
+ done_wc = NULL;
+ list_add(&mad_recv_wc->recv_buf.list, &prev_buf->list);
+ /* only a segment that directly follows the in-order tail can advance it */
+ if (rmpp_recv->cur_seg_buf == prev_buf) {
+ update_seg_num(rmpp_recv, &mad_recv_wc->recv_buf);
+ if (get_last_flag(rmpp_recv->cur_seg_buf)) {
+ rmpp_recv->state = RMPP_STATE_COMPLETE;
+ spin_unlock_irqrestore(&rmpp_recv->lock, flags);
+ done_wc = complete_rmpp(rmpp_recv);
+ goto out;
+ } else if (rmpp_recv->seg_num == rmpp_recv->newwin) {
+ /* window exhausted: open a new one and tell the sender */
+ rmpp_recv->newwin += window_size(agent);
+ spin_unlock_irqrestore(&rmpp_recv->lock, flags);
+ ack_recv(rmpp_recv, mad_recv_wc);
+ goto out;
+ }
+ }
+ spin_unlock_irqrestore(&rmpp_recv->lock, flags);
+out:
+ deref_rmpp_recv(rmpp_recv);
+ return done_wc;
+
+drop3: spin_unlock_irqrestore(&rmpp_recv->lock, flags);
+drop2: deref_rmpp_recv(rmpp_recv);
+drop1: ib_free_recv_mad(mad_recv_wc);
+ return NULL;
+}
+
+/*
+ * Handle the first DATA segment of a transfer.
+ *
+ * Creates a receive context and inserts it on the agent's list. If a
+ * context for the same transfer already exists, the new one is destroyed
+ * and the segment is handed to continue_rmpp() as a duplicate. A
+ * single-segment transfer (LAST flag set) completes immediately and
+ * @mad_recv_wc is returned; otherwise the 40 second receive timeout is
+ * armed, the window is opened, an ACK is sent and NULL is returned.
+ * If the context cannot be created the segment is freed and NULL returned.
+ */
+static struct ib_mad_recv_wc *
+start_rmpp(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct mad_rmpp_recv *rmpp_recv;
+ unsigned long flags;
+
+ rmpp_recv = create_rmpp_recv(agent, mad_recv_wc);
+ if (!rmpp_recv) {
+ ib_free_recv_mad(mad_recv_wc);
+ return NULL;
+ }
+
+ spin_lock_irqsave(&agent->lock, flags);
+ if (insert_rmpp_recv(agent, rmpp_recv)) {
+ spin_unlock_irqrestore(&agent->lock, flags);
+ /* duplicate first MAD */
+ destroy_rmpp_recv(rmpp_recv);
+ return continue_rmpp(agent, mad_recv_wc);
+ }
+ /* working reference for the rest of this function, dropped at the end */
+ atomic_inc(&rmpp_recv->refcount);
+
+ if (get_last_flag(&mad_recv_wc->recv_buf)) {
+ rmpp_recv->state = RMPP_STATE_COMPLETE;
+ spin_unlock_irqrestore(&agent->lock, flags);
+ /* return value unused: it is mad_recv_wc, which is returned below */
+ complete_rmpp(rmpp_recv);
+ } else {
+ spin_unlock_irqrestore(&agent->lock, flags);
+ /* 40 seconds until we can find the packet lifetimes */
+ queue_delayed_work(agent->qp_info->port_priv->wq,
+ &rmpp_recv->timeout_work,
+ msecs_to_jiffies(40000));
+ rmpp_recv->newwin += window_size(agent);
+ ack_recv(rmpp_recv, mad_recv_wc);
+ mad_recv_wc = NULL;
+ }
+ deref_rmpp_recv(rmpp_recv);
+ return mad_recv_wc;
+}
+
+/*
+ * Advance @mad_send_wr to its next segment, fill in the RMPP header for it
+ * and send it.
+ *
+ * paylen_newwin is the total payload length on the first segment, the
+ * payload length of that segment on the last one, and 0 in between. For a
+ * single-segment send both branches run and the last-segment value wins.
+ * If send_buf.timeout_ms is 0 or above 2000 the per-segment ACK timeout is
+ * set to 2 seconds; otherwise mad_send_wr->timeout is left untouched.
+ *
+ * Returns the result of ib_send_mad().
+ */
+static int send_next_seg(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+ int timeout;
+ u32 paylen = 0;
+
+ rmpp_mad = mad_send_wr->send_buf.mad;
+ ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE);
+ rmpp_mad->rmpp_hdr.seg_num = cpu_to_be32(++mad_send_wr->seg_num);
+
+ if (mad_send_wr->seg_num == 1) {
+ rmpp_mad->rmpp_hdr.rmpp_rtime_flags |= IB_MGMT_RMPP_FLAG_FIRST;
+ paylen = mad_send_wr->send_buf.seg_count * IB_MGMT_RMPP_DATA -
+ mad_send_wr->pad;
+ }
+
+ if (mad_send_wr->seg_num == mad_send_wr->send_buf.seg_count) {
+ rmpp_mad->rmpp_hdr.rmpp_rtime_flags |= IB_MGMT_RMPP_FLAG_LAST;
+ paylen = IB_MGMT_RMPP_DATA - mad_send_wr->pad;
+ }
+ rmpp_mad->rmpp_hdr.paylen_newwin = cpu_to_be32(paylen);
+
+ /* 2 seconds for an ACK until we can find the packet lifetime */
+ timeout = mad_send_wr->send_buf.timeout_ms;
+ if (!timeout || timeout > 2000)
+ mad_send_wr->timeout = msecs_to_jiffies(2000);
+
+ return ib_send_mad(mad_send_wr);
+}
+
+/*
+ * Abort the outstanding send that @mad_recv_wc refers to, if there is one
+ * and it is not already finished. The send is marked done under the agent
+ * lock and then completed, outside the lock, with IB_WC_REM_ABORT_ERR and
+ * @rmpp_status as the vendor error. Does nothing for an unmatched receive.
+ */
+static void abort_send(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc, u8 rmpp_status)
+{
+ struct ib_mad_send_wr_private *mad_send_wr;
+ struct ib_mad_send_wc wc;
+ unsigned long flags;
+
+ spin_lock_irqsave(&agent->lock, flags);
+ mad_send_wr = ib_find_send_mad(agent, mad_recv_wc);
+ if (!mad_send_wr)
+ goto out; /* Unmatched send */
+
+ if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) ||
+ (!mad_send_wr->timeout) || (mad_send_wr->status != IB_WC_SUCCESS))
+ goto out; /* Send is already done */
+
+ ib_mark_mad_done(mad_send_wr);
+ spin_unlock_irqrestore(&agent->lock, flags);
+
+ wc.status = IB_WC_REM_ABORT_ERR;
+ wc.vendor_err = rmpp_status;
+ wc.send_buf = &mad_send_wr->send_buf;
+ ib_mad_complete_send_wr(mad_send_wr, &wc);
+ return;
+out:
+ spin_unlock_irqrestore(&agent->lock, flags);
+}
+
+/*
+ * Record @seg_num as the last acknowledged segment of @wr and move
+ * wr->last_ack_seg forward to the segment whose ->num equals @seg_num.
+ *
+ * The walk uses the list node of the current last_ack_seg as the iteration
+ * head and wr->last_ack_seg itself as the cursor, so it visits the entries
+ * following the current one. NOTE(review): if no entry matches, the cursor
+ * ends up back at the entry containing the starting node; the caller in
+ * process_rmpp_ack() only gets here with last_ack < seg_num <= seg_count.
+ */
+static inline void adjust_last_ack(struct ib_mad_send_wr_private *wr,
+ int seg_num)
+{
+ struct list_head *list;
+
+ wr->last_ack = seg_num;
+ list = &wr->last_ack_seg->list;
+ list_for_each_entry(wr->last_ack_seg, list, list)
+ if (wr->last_ack_seg->num == seg_num)
+ break;
+}
+
+/*
+ * Record the window announced by an ACK that matches no outstanding send:
+ * if a COMPLETE receive context exists for the same transfer, its repwin
+ * is set to @newwin (later picked up by init_newwin()). The caller,
+ * process_rmpp_ack(), holds agent->lock.
+ */
+static void process_ds_ack(struct ib_mad_agent_private *agent,
+			   struct ib_mad_recv_wc *mad_recv_wc, int newwin)
+{
+	struct mad_rmpp_recv *match = find_rmpp_recv(agent, mad_recv_wc);
+
+	if (!match || match->state != RMPP_STATE_COMPLETE)
+		return;
+
+	match->repwin = newwin;
+}
+
+/*
+ * Handle a received RMPP ACK.
+ *
+ * Validation failures (non-zero status, window smaller than the acked
+ * segment, segment number beyond what was sent or beyond the window) abort
+ * the matching send and answer with an ABORT. Otherwise the ACK is matched
+ * against the outstanding send and, depending on its progress:
+ *  - no matching send: an ACK for segment 0 updates the reply window of a
+ *    completed receive (process_ds_ack()), anything else is ignored;
+ *  - all segments already acked and the send still waiting: the ACK is a
+ *    repeat and is answered via ack_ds_ack();
+ *  - send already finished, or stale ACK: ignored;
+ *  - otherwise last_ack/newwin are advanced and either the send is
+ *    completed, its response timeout is started, or the next segment is sent.
+ *
+ * The received MAD is not freed here; the caller does that.
+ */
+static void process_rmpp_ack(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_mad_send_wr_private *mad_send_wr;
+ struct ib_rmpp_mad *rmpp_mad;
+ unsigned long flags;
+ int seg_num, newwin, ret;
+
+ rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad;
+ if (rmpp_mad->rmpp_hdr.rmpp_status) {
+ abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+ nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+ return;
+ }
+
+ seg_num = be32_to_cpu(rmpp_mad->rmpp_hdr.seg_num);
+ newwin = be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin);
+ if (newwin < seg_num) {
+ abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_W2S);
+ nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_W2S);
+ return;
+ }
+
+ spin_lock_irqsave(&agent->lock, flags);
+ mad_send_wr = ib_find_send_mad(agent, mad_recv_wc);
+ if (!mad_send_wr) {
+ if (!seg_num)
+ process_ds_ack(agent, mad_recv_wc, newwin);
+ goto out; /* Unmatched or DS RMPP ACK */
+ }
+
+ if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) &&
+ (mad_send_wr->timeout)) {
+ spin_unlock_irqrestore(&agent->lock, flags);
+ ack_ds_ack(agent, mad_recv_wc);
+ return; /* Repeated ACK for DS RMPP transaction */
+ }
+
+ if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) ||
+ (!mad_send_wr->timeout) || (mad_send_wr->status != IB_WC_SUCCESS))
+ goto out; /* Send is already done */
+
+ if (seg_num > mad_send_wr->send_buf.seg_count ||
+ seg_num > mad_send_wr->newwin) {
+ /* abort_send() takes the agent lock itself, so drop it first */
+ spin_unlock_irqrestore(&agent->lock, flags);
+ abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_S2B);
+ nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_S2B);
+ return;
+ }
+
+ if (newwin < mad_send_wr->newwin || seg_num < mad_send_wr->last_ack)
+ goto out; /* Old ACK */
+
+ if (seg_num > mad_send_wr->last_ack) {
+ /* forward progress: restart the retry budget */
+ adjust_last_ack(mad_send_wr, seg_num);
+ mad_send_wr->retries_left = mad_send_wr->max_retries;
+ }
+ mad_send_wr->newwin = newwin;
+ if (mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) {
+ /* If no response is expected, the ACK completes the send */
+ if (!mad_send_wr->send_buf.timeout_ms) {
+ struct ib_mad_send_wc wc;
+
+ ib_mark_mad_done(mad_send_wr);
+ spin_unlock_irqrestore(&agent->lock, flags);
+
+ wc.status = IB_WC_SUCCESS;
+ wc.vendor_err = 0;
+ wc.send_buf = &mad_send_wr->send_buf;
+ ib_mad_complete_send_wr(mad_send_wr, &wc);
+ return;
+ }
+ if (mad_send_wr->refcount == 1)
+ ib_reset_mad_timeout(mad_send_wr,
+ mad_send_wr->send_buf.timeout_ms);
+ spin_unlock_irqrestore(&agent->lock, flags);
+ ack_ds_ack(agent, mad_recv_wc);
+ return;
+ } else if (mad_send_wr->refcount == 1 &&
+ mad_send_wr->seg_num < mad_send_wr->newwin &&
+ mad_send_wr->seg_num < mad_send_wr->send_buf.seg_count) {
+ /* Send failure will just result in a timeout/retry */
+ ret = send_next_seg(mad_send_wr);
+ if (ret)
+ goto out;
+
+ mad_send_wr->refcount++;
+ list_move_tail(&mad_send_wr->agent_list,
+ &mad_send_wr->mad_agent_priv->send_list);
+ }
+out:
+ spin_unlock_irqrestore(&agent->lock, flags);
+}
+
+/*
+ * Dispatch a received RMPP DATA segment.
+ *
+ * A segment with a non-zero status is rejected with BAD_STATUS. Segment
+ * number 1 must carry the FIRST flag and every other segment must not;
+ * a mismatch is rejected with BAD_SEG. Valid first segments go to
+ * start_rmpp(), all others to continue_rmpp(), whose result is returned.
+ * On rejection an ABORT is sent, the segment is freed and NULL is returned.
+ */
+static struct ib_mad_recv_wc *
+process_rmpp_data(struct ib_mad_agent_private *agent,
+		  struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_rmpp_hdr *hdr =
+		&((struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad)->rmpp_hdr;
+	int is_first_seg, has_first_flag;
+	u8 reason;
+
+	if (hdr->rmpp_status) {
+		reason = IB_MGMT_RMPP_STATUS_BAD_STATUS;
+	} else {
+		is_first_seg = (hdr->seg_num == cpu_to_be32(1));
+		has_first_flag = !!(ib_get_rmpp_flags(hdr) &
+				    IB_MGMT_RMPP_FLAG_FIRST);
+
+		if (is_first_seg && has_first_flag)
+			return start_rmpp(agent, mad_recv_wc);
+		if (!is_first_seg && !has_first_flag)
+			return continue_rmpp(agent, mad_recv_wc);
+
+		reason = IB_MGMT_RMPP_STATUS_BAD_SEG;
+	}
+
+	nack_recv(agent, mad_recv_wc, reason);
+	ib_free_recv_mad(mad_recv_wc);
+	return NULL;
+}
+
+/*
+ * Handle a received RMPP STOP. The only accepted status is
+ * IB_MGMT_RMPP_STATUS_RESX, which aborts the matching send with that
+ * status. Any other status aborts the send with BAD_STATUS and answers the
+ * peer with an ABORT.
+ */
+static void process_rmpp_stop(struct ib_mad_agent_private *agent,
+			      struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_rmpp_mad *stop_mad =
+		(struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad;
+	u8 status = stop_mad->rmpp_hdr.rmpp_status;
+
+	if (status == IB_MGMT_RMPP_STATUS_RESX) {
+		abort_send(agent, mad_recv_wc, status);
+		return;
+	}
+
+	abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+	nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+}
+
+/*
+ * Handle a received RMPP ABORT. A status inside
+ * [IB_MGMT_RMPP_STATUS_ABORT_MIN, IB_MGMT_RMPP_STATUS_ABORT_MAX] aborts the
+ * matching send with that status. A status outside the range aborts the
+ * send with BAD_STATUS and answers the peer with an ABORT.
+ */
+static void process_rmpp_abort(struct ib_mad_agent_private *agent,
+			       struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_rmpp_mad *abort_mad =
+		(struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad;
+	u8 status = abort_mad->rmpp_hdr.rmpp_status;
+
+	if (status >= IB_MGMT_RMPP_STATUS_ABORT_MIN &&
+	    status <= IB_MGMT_RMPP_STATUS_ABORT_MAX) {
+		abort_send(agent, mad_recv_wc, status);
+		return;
+	}
+
+	abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+	nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+}
+
+/*
+ * Entry point of the RMPP layer for a received MAD.
+ *
+ * Returns @mad_recv_wc untouched when the RMPP ACTIVE flag is not set. For
+ * DATA segments, returns whatever process_rmpp_data() yields: the
+ * reassembled message once complete, NULL otherwise. Every other case
+ * (ACK, STOP, ABORT, unknown type, unsupported RMPP version) is consumed
+ * here: the MAD is freed and NULL is returned. An unsupported version or
+ * unknown type additionally aborts the matching send and answers with an
+ * ABORT.
+ */
+struct ib_mad_recv_wc *
+ib_process_rmpp_recv_wc(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+
+ rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad;
+ if (!(rmpp_mad->rmpp_hdr.rmpp_rtime_flags & IB_MGMT_RMPP_FLAG_ACTIVE))
+ return mad_recv_wc;
+
+ if (rmpp_mad->rmpp_hdr.rmpp_version != IB_MGMT_RMPP_VERSION) {
+ abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_UNV);
+ nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_UNV);
+ goto out;
+ }
+
+ switch (rmpp_mad->rmpp_hdr.rmpp_type) {
+ case IB_MGMT_RMPP_TYPE_DATA:
+ /* the data path manages the lifetime of mad_recv_wc itself */
+ return process_rmpp_data(agent, mad_recv_wc);
+ case IB_MGMT_RMPP_TYPE_ACK:
+ process_rmpp_ack(agent, mad_recv_wc);
+ break;
+ case IB_MGMT_RMPP_TYPE_STOP:
+ process_rmpp_stop(agent, mad_recv_wc);
+ break;
+ case IB_MGMT_RMPP_TYPE_ABORT:
+ process_rmpp_abort(agent, mad_recv_wc);
+ break;
+ default:
+ abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BADT);
+ nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BADT);
+ break;
+ }
+out:
+ ib_free_recv_mad(mad_recv_wc);
+ return NULL;
+}
+
+/*
+ * Choose the initial send window for @mad_send_wr.
+ *
+ * Sends whose method lacks the response bit start with a window of 1. For
+ * a response, the agent's receive contexts are searched for the request it
+ * answers (same tid, class and class version, request method without the
+ * response bit) whose source LID equals the destination LID of the send's
+ * address handle; the first such context supplies its repwin. Falls back
+ * to 1 when nothing matches.
+ *
+ * NOTE(review): ib_query_ah() is called with the agent spinlock held and
+ * interrupts disabled - confirm it cannot sleep for the drivers in use.
+ */
+static int init_newwin(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ struct ib_mad_agent_private *agent = mad_send_wr->mad_agent_priv;
+ struct ib_mad_hdr *mad_hdr = mad_send_wr->send_buf.mad;
+ struct mad_rmpp_recv *rmpp_recv;
+ struct ib_ah_attr ah_attr;
+ unsigned long flags;
+ int newwin = 1;
+
+ if (!(mad_hdr->method & IB_MGMT_METHOD_RESP))
+ goto out;
+
+ spin_lock_irqsave(&agent->lock, flags);
+ list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) {
+ if (rmpp_recv->tid != mad_hdr->tid ||
+ rmpp_recv->mgmt_class != mad_hdr->mgmt_class ||
+ rmpp_recv->class_version != mad_hdr->class_version ||
+ (rmpp_recv->method & IB_MGMT_METHOD_RESP))
+ continue;
+
+ /* ah_attr is only read below when the query succeeded */
+ if (ib_query_ah(mad_send_wr->send_buf.ah, &ah_attr))
+ continue;
+
+ if (rmpp_recv->slid == ah_attr.dlid) {
+ newwin = rmpp_recv->repwin;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&agent->lock, flags);
+out:
+ return newwin;
+}
+
+/*
+ * Start sending @mad_send_wr through the RMPP layer.
+ *
+ * Returns:
+ *  IB_RMPP_RESULT_UNHANDLED - the RMPP ACTIVE flag is not set;
+ *  IB_RMPP_RESULT_INTERNAL  - the MAD is not of type DATA (seg_num is set to 1);
+ *  IB_RMPP_RESULT_CONSUMED  - the first segment was sent;
+ *  otherwise the non-zero value returned by send_next_seg().
+ */
+int ib_send_rmpp_mad(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+ int ret;
+
+ rmpp_mad = mad_send_wr->send_buf.mad;
+ if (!(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) &
+ IB_MGMT_RMPP_FLAG_ACTIVE))
+ return IB_RMPP_RESULT_UNHANDLED;
+
+ if (rmpp_mad->rmpp_hdr.rmpp_type != IB_MGMT_RMPP_TYPE_DATA) {
+ mad_send_wr->seg_num = 1;
+ return IB_RMPP_RESULT_INTERNAL;
+ }
+
+ mad_send_wr->newwin = init_newwin(mad_send_wr);
+
+ /* We need to wait for the final ACK even if there isn't a response */
+ mad_send_wr->refcount += (mad_send_wr->timeout == 0);
+ ret = send_next_seg(mad_send_wr);
+ if (!ret)
+ return IB_RMPP_RESULT_CONSUMED;
+ return ret;
+}
+
+/*
+ * RMPP handling of a send completion for @mad_send_wr.
+ *
+ * Returns:
+ *  IB_RMPP_RESULT_UNHANDLED - the RMPP ACTIVE flag is not set;
+ *  IB_RMPP_RESULT_INTERNAL  - the MAD is not of type DATA;
+ *  IB_RMPP_RESULT_CONSUMED  - the next segment was sent from here;
+ *  IB_RMPP_RESULT_PROCESSED - every other case (error or cancel, response
+ *                             already received, all segments acked, or
+ *                             waiting for an ACK).
+ * If sending the next segment fails, mad_send_wc->status is overwritten
+ * with IB_WC_GENERAL_ERR before returning PROCESSED.
+ */
+int ib_process_rmpp_send_wc(struct ib_mad_send_wr_private *mad_send_wr,
+ struct ib_mad_send_wc *mad_send_wc)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+ int ret;
+
+ rmpp_mad = mad_send_wr->send_buf.mad;
+ if (!(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) &
+ IB_MGMT_RMPP_FLAG_ACTIVE))
+ return IB_RMPP_RESULT_UNHANDLED; /* RMPP not active */
+
+ if (rmpp_mad->rmpp_hdr.rmpp_type != IB_MGMT_RMPP_TYPE_DATA)
+ return IB_RMPP_RESULT_INTERNAL; /* ACK, STOP, or ABORT */
+
+ if (mad_send_wc->status != IB_WC_SUCCESS ||
+ mad_send_wr->status != IB_WC_SUCCESS)
+ return IB_RMPP_RESULT_PROCESSED; /* Canceled or send error */
+
+ if (!mad_send_wr->timeout)
+ return IB_RMPP_RESULT_PROCESSED; /* Response received */
+
+ if (mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) {
+ /* all segments acked: switch to the caller's response timeout */
+ mad_send_wr->timeout =
+ msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms);
+ return IB_RMPP_RESULT_PROCESSED; /* Send done */
+ }
+
+ if (mad_send_wr->seg_num == mad_send_wr->newwin ||
+ mad_send_wr->seg_num == mad_send_wr->send_buf.seg_count)
+ return IB_RMPP_RESULT_PROCESSED; /* Wait for ACK */
+
+ ret = send_next_seg(mad_send_wr);
+ if (ret) {
+ mad_send_wc->status = IB_WC_GENERAL_ERR;
+ return IB_RMPP_RESULT_PROCESSED;
+ }
+ return IB_RMPP_RESULT_CONSUMED;
+}
+
+/*
+ * Retry an RMPP send after a timeout by rewinding to the last acknowledged
+ * segment and sending the one after it.
+ *
+ * Returns:
+ *  IB_RMPP_RESULT_UNHANDLED - the RMPP ACTIVE flag is not set;
+ *  IB_RMPP_RESULT_PROCESSED - every segment is already acknowledged, or
+ *                             the resend failed;
+ *  IB_RMPP_RESULT_CONSUMED  - the segment was resent.
+ */
+int ib_retry_rmpp(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+ int ret;
+
+ rmpp_mad = mad_send_wr->send_buf.mad;
+ if (!(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) &
+ IB_MGMT_RMPP_FLAG_ACTIVE))
+ return IB_RMPP_RESULT_UNHANDLED; /* RMPP not active */
+
+ if (mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count)
+ return IB_RMPP_RESULT_PROCESSED;
+
+ /* send_next_seg() pre-increments seg_num, so this resends last_ack + 1 */
+ mad_send_wr->seg_num = mad_send_wr->last_ack;
+ mad_send_wr->cur_seg = mad_send_wr->last_ack_seg;
+
+ ret = send_next_seg(mad_send_wr);
+ if (ret)
+ return IB_RMPP_RESULT_PROCESSED;
+
+ return IB_RMPP_RESULT_CONSUMED;
+}
diff --git a/drivers/infiniband/core/mad_rmpp.h b/drivers/infiniband/core/mad_rmpp.h
new file mode 100644
index 000000000..3d336bff1
--- /dev/null
+++ b/drivers/infiniband/core/mad_rmpp.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2005 Intel Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __MAD_RMPP_H__
+#define __MAD_RMPP_H__
+
+/*
+ * Result codes returned by ib_send_rmpp_mad(), ib_process_rmpp_send_wc()
+ * and ib_retry_rmpp() in mad_rmpp.c.
+ */
+enum {
+ IB_RMPP_RESULT_PROCESSED, /* examined by the RMPP layer; no new segment was sent */
+ IB_RMPP_RESULT_CONSUMED, /* the RMPP layer sent the next segment itself */
+ IB_RMPP_RESULT_INTERNAL, /* RMPP MAD that is not of type DATA */
+ IB_RMPP_RESULT_UNHANDLED /* RMPP ACTIVE flag not set on the MAD */
+};
+
+/*
+ * Starts an RMPP send. Returns an IB_RMPP_RESULT_* code, or the non-zero
+ * value from the underlying send if the first segment could not be sent.
+ */
+int ib_send_rmpp_mad(struct ib_mad_send_wr_private *mad_send_wr);
+
+/*
+ * Feeds a received MAD through the RMPP layer. Returns the MAD to deliver
+ * (the input itself if RMPP is not active, or a reassembled message), or
+ * NULL if the MAD was consumed.
+ */
+struct ib_mad_recv_wc *
+ib_process_rmpp_recv_wc(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc);
+
+/* Handles a send completion for an RMPP send; returns an IB_RMPP_RESULT_* code. */
+int ib_process_rmpp_send_wc(struct ib_mad_send_wr_private *mad_send_wr,
+ struct ib_mad_send_wc *mad_send_wc);
+
+/*
+ * Releases a send buffer; also destroys its address handle when
+ * context[0] equals ->ah.
+ */
+void ib_rmpp_send_handler(struct ib_mad_send_wc *mad_send_wc);
+
+/* Cancels and destroys every RMPP receive context of @agent; may sleep. */
+void ib_cancel_rmpp_recvs(struct ib_mad_agent_private *agent);
+
+/* Resends from the last acknowledged segment; returns an IB_RMPP_RESULT_* code. */
+int ib_retry_rmpp(struct ib_mad_send_wr_private *mad_send_wr);
+
+#endif /* __MAD_RMPP_H__ */
diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c
new file mode 100644
index 000000000..fa17b552f
--- /dev/null
+++ b/drivers/infiniband/core/multicast.c
@@ -0,0 +1,903 @@
+/*
+ * Copyright (c) 2006 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/completion.h>
+#include <linux/dma-mapping.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/bitops.h>
+#include <linux/random.h>
+
+#include <rdma/ib_cache.h>
+#include "sa.h"
+
+/* Per-device callbacks; their definitions appear near the end of this file. */
+static void mcast_add_one(struct ib_device *device);
+static void mcast_remove_one(struct ib_device *device);
+
+/* Client descriptor passed to ib_register_client() in mcast_init(). */
+static struct ib_client mcast_client = {
+ .name = "ib_multicast",
+ .add = mcast_add_one,
+ .remove = mcast_remove_one
+};
+
+/* SA client used for every MCMemberRecord query issued by this file. */
+static struct ib_sa_client sa_client;
+/* Workqueue created in mcast_init(); group work items are queued on it. */
+static struct workqueue_struct *mcast_wq;
+/*
+ * Never written in this file, so it keeps its all-zero static value; it is
+ * only compared against MGIDs to recognise the all-zero MGID.
+ */
+static union ib_gid mgid0;
+
+struct mcast_device;
+
+/* Per-port multicast state; one entry per port in mcast_device.port[]. */
+struct mcast_port {
+ struct mcast_device *dev; /* back pointer, set in mcast_add_one() */
+ spinlock_t lock; /* held around lookups/updates of @table */
+ struct rb_root table; /* mcast_group nodes ordered by MGID bytes */
+ atomic_t refcount; /* starts at 1; +1 per group inserted in @table */
+ struct completion comp; /* completed when @refcount drops to zero */
+ u8 port_num; /* start_port + index within the device */
+};
+
+/*
+ * Per-device multicast state. mcast_add_one() allocates it together with
+ * one trailing struct mcast_port per physical port of the device.
+ */
+struct mcast_device {
+	struct ib_device	*device;	/* device this state belongs to */
+	struct ib_event_handler	event_handler;	/* dispatches to mcast_event_handler() */
+	int			start_port;	/* 0 for a switch, otherwise 1 */
+	int			end_port;	/* 0 for a switch, otherwise phys_port_cnt */
+	/*
+	 * C99 flexible array member (was the GNU zero-length form "port[0]");
+	 * sizeof(struct mcast_device) is unchanged, so the allocation size
+	 * computed in mcast_add_one() stays the same.
+	 */
+	struct mcast_port	port[];
+};
+
+/* State of one mcast_member (one caller's join request). */
+enum mcast_state {
+ MCAST_JOINING, /* initial state, set in ib_sa_join_multicast() */
+ MCAST_MEMBER, /* set by join_group() once the join is satisfied */
+ MCAST_ERROR, /* set by process_group_error() when the member is dropped */
+};
+
+/* State of a mcast_group, stored in mcast_group.state. */
+enum mcast_group_state {
+ MCAST_IDLE, /* no work in flight; queue_join()/free switch to BUSY */
+ MCAST_BUSY, /* mcast_work_handler() is, or will be, processing the group */
+ MCAST_GROUP_ERROR, /* port error, LID/SM change or client reregister seen */
+ MCAST_PKEY_EVENT /* P_Key table change seen */
+};
+
+/* Sentinel stored in mcast_group.pkey_index until join_handler() sets one. */
+enum {
+ MCAST_INVALID_PKEY_INDEX = 0xFFFF
+};
+
+struct mcast_member;
+
+/* One multicast group on one port, shared by all local members of it. */
+struct mcast_group {
+ struct ib_sa_mcmember_rec rec; /* group record; replaced by the SA reply in join_handler() */
+ struct rb_node node; /* linkage in port->table */
+ struct mcast_port *port; /* owning port */
+ spinlock_t lock; /* held around list and state updates below */
+ struct work_struct work; /* runs mcast_work_handler() */
+ struct list_head pending_list; /* members queued by queue_join() */
+ struct list_head active_list; /* members moved here by join_group() */
+ struct mcast_member *last_join; /* member passed to the latest send_join() */
+ int members[3]; /* member count per join-state bit 0..2 */
+ atomic_t refcount; /* dropped via release_group() */
+ enum mcast_group_state state;
+ struct ib_sa_query *query; /* filled in by ib_sa_mcmember_rec_query() */
+ int query_id; /* non-negative return of ib_sa_mcmember_rec_query() */
+ u16 pkey_index; /* MCAST_INVALID_PKEY_INDEX until join_handler() sets it */
+ u8 leave_state; /* join-state bits of the latest send_leave() */
+ int retries; /* leave retries left; initialised to 3 in acquire_group() */
+};
+
+/* One caller's membership in a group; created by ib_sa_join_multicast(). */
+struct mcast_member {
+ struct ib_sa_multicast multicast; /* handle returned to the caller */
+ struct ib_sa_client *client; /* reference taken at join, put at free */
+ struct mcast_group *group; /* group obtained from acquire_group() */
+ struct list_head list; /* on group->pending_list or ->active_list */
+ enum mcast_state state;
+ atomic_t refcount; /* starts at 1; raised around callbacks */
+ struct completion comp; /* completed when @refcount drops to zero */
+};
+
+/* Completion callbacks handed to ib_sa_mcmember_rec_query(); defined below. */
+static void join_handler(int status, struct ib_sa_mcmember_rec *rec,
+ void *context);
+static void leave_handler(int status, struct ib_sa_mcmember_rec *rec,
+ void *context);
+
+/*
+ * Search @port's rb-tree for the group whose MGID is byte-for-byte equal to
+ * @mgid. Returns the matching group, or NULL when no such key is present.
+ * No locking is done here; both callers in this file hold port->lock.
+ */
+static struct mcast_group *mcast_find(struct mcast_port *port,
+				      union ib_gid *mgid)
+{
+	struct rb_node *cur = port->table.rb_node;
+
+	while (cur) {
+		struct mcast_group *candidate =
+			rb_entry(cur, struct mcast_group, node);
+		int diff = memcmp(mgid->raw, candidate->rec.mgid.raw,
+				  sizeof *mgid);
+
+		if (diff == 0)
+			return candidate;
+
+		/* smaller keys live in the left subtree */
+		cur = (diff < 0) ? cur->rb_left : cur->rb_right;
+	}
+	return NULL;
+}
+
+/*
+ * Link @group into @port's rb-tree, ordered by the raw MGID bytes.
+ *
+ * When a group with the same MGID is already present and @allow_duplicates
+ * is zero, nothing is inserted and the existing group is returned. With
+ * @allow_duplicates set, the new node is placed to the left of its equal.
+ * Returns NULL when @group was inserted.
+ */
+static struct mcast_group *mcast_insert(struct mcast_port *port,
+					struct mcast_group *group,
+					int allow_duplicates)
+{
+	struct rb_node **slot = &port->table.rb_node;
+	struct rb_node *above = NULL;
+
+	while (*slot) {
+		struct mcast_group *existing;
+		int diff;
+
+		above = *slot;
+		existing = rb_entry(above, struct mcast_group, node);
+		diff = memcmp(group->rec.mgid.raw, existing->rec.mgid.raw,
+			      sizeof group->rec.mgid);
+
+		if (diff > 0) {
+			slot = &above->rb_right;
+			continue;
+		}
+
+		/* equal key and duplicates not permitted: report the clash */
+		if (diff == 0 && !allow_duplicates)
+			return existing;
+
+		/* smaller key, or a permitted duplicate, descends left */
+		slot = &above->rb_left;
+	}
+
+	rb_link_node(&group->node, above, slot);
+	rb_insert_color(&group->node, &port->table);
+	return NULL;
+}
+
+/* Drop one port reference; the last put signals port->comp. */
+static void deref_port(struct mcast_port *port)
+{
+	if (!atomic_dec_and_test(&port->refcount))
+		return;
+
+	complete(&port->comp);
+}
+
+/*
+ * Drop one reference on @group. The decrement happens under port->lock;
+ * the final put unlinks the group from the port's tree, frees it and then
+ * gives back the port reference taken when the group was inserted.
+ */
+static void release_group(struct mcast_group *group)
+{
+	struct mcast_port *owner = group->port;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&owner->lock, irq_flags);
+	if (!atomic_dec_and_test(&group->refcount)) {
+		spin_unlock_irqrestore(&owner->lock, irq_flags);
+		return;
+	}
+
+	rb_erase(&group->node, &owner->table);
+	spin_unlock_irqrestore(&owner->lock, irq_flags);
+	kfree(group);
+	deref_port(owner);
+}
+
+/* Drop one member reference; the last put signals member->comp. */
+static void deref_member(struct mcast_member *member)
+{
+	if (!atomic_dec_and_test(&member->refcount))
+		return;
+
+	complete(&member->comp);
+}
+
+/*
+ * Append @member to its group's pending list. If the group is idle, mark
+ * it busy, take a group reference for the work item and queue the work.
+ */
+static void queue_join(struct mcast_member *member)
+{
+	struct mcast_group *grp = member->group;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&grp->lock, irq_flags);
+	list_add_tail(&member->list, &grp->pending_list);
+	if (grp->state != MCAST_IDLE)
+		goto unlock;
+
+	grp->state = MCAST_BUSY;
+	atomic_inc(&grp->refcount);
+	queue_work(mcast_wq, &grp->work);
+unlock:
+	spin_unlock_irqrestore(&grp->lock, irq_flags);
+}
+
+/*
+ * A multicast group has three types of members: full member, non member, and
+ * send only member. We keep one counter per type, indexed by the bit position
+ * of that type in the join state. Add @inc to the counter of every type whose
+ * bit is set in @join_state.
+ */
+static void adjust_membership(struct mcast_group *group, u8 join_state, int inc)
+{
+	int bit;
+
+	for (bit = 0; bit < 3; bit++) {
+		if ((join_state >> bit) & 0x1)
+			group->members[bit] += inc;
+	}
+}
+
+/*
+ * Work out which join states should be left: those whose member counter is
+ * zero while the corresponding bit is still set in the group's recorded
+ * join state. Returns that set of bits (zero when nothing needs leaving).
+ */
+static u8 get_leave_state(struct mcast_group *group)
+{
+	u8 unused_bits = 0;
+	int bit = 3;
+
+	while (bit--) {
+		if (group->members[bit] == 0)
+			unused_bits |= (0x1 << bit);
+	}
+
+	return unused_bits & group->rec.join_state;
+}
+
+/*
+ * Test one selector/value pair of a record comparison.
+ *
+ * The test only applies when both @selector_mask and @value_mask are set in
+ * @comp_mask; otherwise 0 is returned. For IB_SA_GT, IB_SA_LT and IB_SA_EQ
+ * the result is non-zero when @src_value is not greater than, not less
+ * than, or not equal to @dst_value respectively. Any other selector
+ * yields 0.
+ */
+static int check_selector(ib_sa_comp_mask comp_mask,
+			  ib_sa_comp_mask selector_mask,
+			  ib_sa_comp_mask value_mask,
+			  u8 selector, u8 src_value, u8 dst_value)
+{
+	if ((comp_mask & selector_mask) && (comp_mask & value_mask)) {
+		switch (selector) {
+		case IB_SA_GT:
+			return src_value <= dst_value;
+		case IB_SA_LT:
+			return src_value >= dst_value;
+		case IB_SA_EQ:
+			return src_value != dst_value;
+		default:
+			break;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Compare the fields of @src and @dst that are selected by @comp_mask.
+ * Returns 0 when every selected field is compatible and -EINVAL on the
+ * first mismatch. The MGID is expected to match already; join_state is
+ * checked by the caller and proxy_join is ignored.
+ */
+static int cmp_rec(struct ib_sa_mcmember_rec *src,
+		   struct ib_sa_mcmember_rec *dst, ib_sa_comp_mask comp_mask)
+{
+	if ((comp_mask & IB_SA_MCMEMBER_REC_PORT_GID) &&
+	    memcmp(&src->port_gid, &dst->port_gid, sizeof src->port_gid) != 0)
+		goto mismatch;
+
+	if ((comp_mask & IB_SA_MCMEMBER_REC_QKEY) && src->qkey != dst->qkey)
+		goto mismatch;
+
+	if ((comp_mask & IB_SA_MCMEMBER_REC_MLID) && src->mlid != dst->mlid)
+		goto mismatch;
+
+	/* the selector comes from @dst, the request being checked */
+	if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_MTU_SELECTOR,
+			   IB_SA_MCMEMBER_REC_MTU, dst->mtu_selector,
+			   src->mtu, dst->mtu))
+		goto mismatch;
+
+	if ((comp_mask & IB_SA_MCMEMBER_REC_TRAFFIC_CLASS) &&
+	    src->traffic_class != dst->traffic_class)
+		goto mismatch;
+
+	if ((comp_mask & IB_SA_MCMEMBER_REC_PKEY) && src->pkey != dst->pkey)
+		goto mismatch;
+
+	if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_RATE_SELECTOR,
+			   IB_SA_MCMEMBER_REC_RATE, dst->rate_selector,
+			   src->rate, dst->rate))
+		goto mismatch;
+
+	if (check_selector(comp_mask,
+			   IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR,
+			   IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME,
+			   dst->packet_life_time_selector,
+			   src->packet_life_time, dst->packet_life_time))
+		goto mismatch;
+
+	if ((comp_mask & IB_SA_MCMEMBER_REC_SL) && src->sl != dst->sl)
+		goto mismatch;
+
+	if ((comp_mask & IB_SA_MCMEMBER_REC_FLOW_LABEL) &&
+	    src->flow_label != dst->flow_label)
+		goto mismatch;
+
+	if ((comp_mask & IB_SA_MCMEMBER_REC_HOP_LIMIT) &&
+	    src->hop_limit != dst->hop_limit)
+		goto mismatch;
+
+	if ((comp_mask & IB_SA_MCMEMBER_REC_SCOPE) && src->scope != dst->scope)
+		goto mismatch;
+
+	return 0;
+
+mismatch:
+	return -EINVAL;
+}
+
+/*
+ * Issue an SA MCMemberRecord SET query for @member's record, with
+ * join_handler() as completion callback and @group as its context.
+ * @member is remembered in group->last_join before the query is sent.
+ *
+ * Returns 0 once the query has been issued (its id is kept in
+ * group->query_id), or the negative value returned by the SA layer.
+ * The literal 3000 is the query's timeout argument (presumably
+ * milliseconds - confirm against ib_sa_mcmember_rec_query()).
+ */
+static int send_join(struct mcast_group *group, struct mcast_member *member)
+{
+	struct mcast_port *port = group->port;
+	int id;
+
+	group->last_join = member;
+	id = ib_sa_mcmember_rec_query(&sa_client, port->dev->device,
+				      port->port_num, IB_MGMT_METHOD_SET,
+				      &member->multicast.rec,
+				      member->multicast.comp_mask,
+				      3000, GFP_KERNEL, join_handler, group,
+				      &group->query);
+	if (id < 0)
+		return id;
+
+	group->query_id = id;
+	return 0;
+}
+
+/*
+ * Issue an SA MCMemberRecord DELETE query for the join states in
+ * @leave_state, using a copy of the group record with join_state replaced.
+ * @leave_state is saved in group->leave_state so leave_handler() can retry.
+ *
+ * Returns 0 once the query has been issued (its id is kept in
+ * group->query_id), or the negative value returned by the SA layer.
+ */
+static int send_leave(struct mcast_group *group, u8 leave_state)
+{
+	struct mcast_port *port = group->port;
+	struct ib_sa_mcmember_rec rec = group->rec;
+	int id;
+
+	rec.join_state = leave_state;
+	group->leave_state = leave_state;
+
+	id = ib_sa_mcmember_rec_query(&sa_client, port->dev->device,
+				      port->port_num, IB_SA_METHOD_DELETE, &rec,
+				      IB_SA_MCMEMBER_REC_MGID |
+				      IB_SA_MCMEMBER_REC_PORT_GID |
+				      IB_SA_MCMEMBER_REC_JOIN_STATE,
+				      3000, GFP_KERNEL, leave_handler,
+				      group, &group->query);
+	if (id < 0)
+		return id;
+
+	group->query_id = id;
+	return 0;
+}
+
+/*
+ * Record @member as an active member of @group for @join_state: bump the
+ * per-type counters, merge the bits into the group's join state, hand the
+ * member a copy of the group record (carrying only its own join state) and
+ * move it to the active list. The caller in this file holds group->lock.
+ */
+static void join_group(struct mcast_group *group, struct mcast_member *member,
+		       u8 join_state)
+{
+	struct ib_sa_multicast *mc = &member->multicast;
+
+	member->state = MCAST_MEMBER;
+	adjust_membership(group, join_state, 1);
+
+	group->rec.join_state |= join_state;
+	mc->rec = group->rec;
+	mc->rec.join_state = join_state;
+
+	list_move(&member->list, &group->active_list);
+}
+
+/*
+ * Take @member off whatever group list it is on and report @status through
+ * its callback. Returns the callback's return value.
+ */
+static int fail_join(struct mcast_group *group, struct mcast_member *member,
+		     int status)
+{
+	struct ib_sa_multicast *mc = &member->multicast;
+
+	spin_lock_irq(&group->lock);
+	list_del_init(&member->list);
+	spin_unlock_irq(&group->lock);
+
+	return mc->callback(status, mc);
+}
+
+/*
+ * Handle a group whose state was moved away from MCAST_BUSY by
+ * mcast_groups_event().
+ *
+ * For MCAST_PKEY_EVENT the group's P_Key is looked up again; when the lookup
+ * succeeds and yields the stored pkey_index, no member is disturbed.
+ * Otherwise every active member is removed from the group, marked
+ * MCAST_ERROR and notified with -ENETRESET, and the group's join state is
+ * cleared. The group is always left in MCAST_BUSY.
+ */
+static void process_group_error(struct mcast_group *group)
+{
+ struct mcast_member *member;
+ int ret = 0;
+ u16 pkey_index;
+
+ /* pkey_index is only written here, and only read below under the same test */
+ if (group->state == MCAST_PKEY_EVENT)
+ ret = ib_find_pkey(group->port->dev->device,
+ group->port->port_num,
+ be16_to_cpu(group->rec.pkey), &pkey_index);
+
+ spin_lock_irq(&group->lock);
+ if (group->state == MCAST_PKEY_EVENT && !ret &&
+ group->pkey_index == pkey_index)
+ goto out;
+
+ while (!list_empty(&group->active_list)) {
+ member = list_entry(group->active_list.next,
+ struct mcast_member, list);
+ /* hold the member across the callback, which runs unlocked */
+ atomic_inc(&member->refcount);
+ list_del_init(&member->list);
+ adjust_membership(group, member->multicast.rec.join_state, -1);
+ member->state = MCAST_ERROR;
+ spin_unlock_irq(&group->lock);
+
+ ret = member->multicast.callback(-ENETRESET,
+ &member->multicast);
+ deref_member(member);
+ /* a non-zero callback return asks us to free the member */
+ if (ret)
+ ib_sa_free_multicast(&member->multicast);
+ spin_lock_irq(&group->lock);
+ }
+
+ group->rec.join_state = 0;
+out:
+ group->state = MCAST_BUSY;
+ spin_unlock_irq(&group->lock);
+}
+
+/*
+ * Process a group's pending work. Runs as the group's work item and is also
+ * called directly from join_handler() and leave_handler().
+ *
+ * While there are pending members (or the group is not MCAST_BUSY):
+ *  - a non-BUSY state is resolved first by process_group_error();
+ *  - a request whose join-state bits the group already holds is answered
+ *    locally after comparing records;
+ *  - any other request is sent to the SA, and this function returns so
+ *    that join_handler() can continue once the reply arrives.
+ * When nothing is pending, join states with no remaining members are left
+ * via send_leave(); if there is nothing to leave the group goes idle and
+ * one group reference is released.
+ */
+static void mcast_work_handler(struct work_struct *work)
+{
+ struct mcast_group *group;
+ struct mcast_member *member;
+ struct ib_sa_multicast *multicast;
+ int status, ret;
+ u8 join_state;
+
+ group = container_of(work, typeof(*group), work);
+retest:
+ spin_lock_irq(&group->lock);
+ while (!list_empty(&group->pending_list) ||
+ (group->state != MCAST_BUSY)) {
+
+ if (group->state != MCAST_BUSY) {
+ spin_unlock_irq(&group->lock);
+ process_group_error(group);
+ goto retest;
+ }
+
+ member = list_entry(group->pending_list.next,
+ struct mcast_member, list);
+ multicast = &member->multicast;
+ join_state = multicast->rec.join_state;
+ /* keep the member alive while its callback runs unlocked */
+ atomic_inc(&member->refcount);
+
+ /* all requested bits are already joined: no SA round trip needed */
+ if (join_state == (group->rec.join_state & join_state)) {
+ status = cmp_rec(&group->rec, &multicast->rec,
+ multicast->comp_mask);
+ if (!status)
+ join_group(group, member, join_state);
+ else
+ list_del_init(&member->list);
+ spin_unlock_irq(&group->lock);
+ ret = multicast->callback(status, multicast);
+ } else {
+ spin_unlock_irq(&group->lock);
+ status = send_join(group, member);
+ if (!status) {
+ /* query issued; join_handler() re-enters this function */
+ deref_member(member);
+ return;
+ }
+ ret = fail_join(group, member, status);
+ }
+
+ deref_member(member);
+ /* a non-zero callback return asks us to free the member */
+ if (ret)
+ ib_sa_free_multicast(&member->multicast);
+ spin_lock_irq(&group->lock);
+ }
+
+ join_state = get_leave_state(group);
+ if (join_state) {
+ group->rec.join_state &= ~join_state;
+ spin_unlock_irq(&group->lock);
+ /* on success leave_handler() continues; on failure re-evaluate now */
+ if (send_leave(group, join_state))
+ goto retest;
+ } else {
+ group->state = MCAST_IDLE;
+ spin_unlock_irq(&group->lock);
+ release_group(group);
+ }
+}
+
+/*
+ * Fail a join request if it is still active - at the head of the pending queue.
+ *
+ * "Still active" means the head of the pending list is the member recorded
+ * in group->last_join by send_join(). In that case the member is unlinked
+ * and its callback is invoked with @status; a non-zero callback return
+ * frees the member. Otherwise nothing is done.
+ * NOTE(review): the head entry is computed without a list_empty() check, so
+ * with an empty list @member is not a real member; it is only compared with
+ * last_join, never dereferenced, in that case.
+ */
+static void process_join_error(struct mcast_group *group, int status)
+{
+ struct mcast_member *member;
+ int ret;
+
+ spin_lock_irq(&group->lock);
+ member = list_entry(group->pending_list.next,
+ struct mcast_member, list);
+ if (group->last_join == member) {
+ /* hold the member across the callback, which runs unlocked */
+ atomic_inc(&member->refcount);
+ list_del_init(&member->list);
+ spin_unlock_irq(&group->lock);
+ ret = member->multicast.callback(status, &member->multicast);
+ deref_member(member);
+ if (ret)
+ ib_sa_free_multicast(&member->multicast);
+ } else
+ spin_unlock_irq(&group->lock);
+}
+
+/*
+ * Completion callback for the query issued by send_join(); @context is the
+ * group.
+ *
+ * On error the outstanding join is failed. On success the P_Key index for
+ * the returned record is looked up and stored if the group is busy and has
+ * no index yet, the returned record replaces group->rec, and the group is
+ * re-inserted into the port's tree when the SA returned a different MGID.
+ * In both cases processing then continues in mcast_work_handler().
+ */
+static void join_handler(int status, struct ib_sa_mcmember_rec *rec,
+ void *context)
+{
+ struct mcast_group *group = context;
+ u16 pkey_index = MCAST_INVALID_PKEY_INDEX;
+
+ if (status)
+ process_join_error(group, status);
+ else {
+ int mgids_changed, is_mgid0;
+ /* return value ignored: on failure pkey_index keeps the invalid sentinel */
+ ib_find_pkey(group->port->dev->device, group->port->port_num,
+ be16_to_cpu(rec->pkey), &pkey_index);
+
+ spin_lock_irq(&group->port->lock);
+ if (group->state == MCAST_BUSY &&
+ group->pkey_index == MCAST_INVALID_PKEY_INDEX)
+ group->pkey_index = pkey_index;
+ mgids_changed = memcmp(&rec->mgid, &group->rec.mgid,
+ sizeof(group->rec.mgid));
+ group->rec = *rec;
+ if (mgids_changed) {
+ /* the tree is keyed by MGID, so re-insert under the new key */
+ rb_erase(&group->node, &group->port->table);
+ is_mgid0 = !memcmp(&mgid0, &group->rec.mgid,
+ sizeof(mgid0));
+ mcast_insert(group->port, group, is_mgid0);
+ }
+ spin_unlock_irq(&group->port->lock);
+ }
+ mcast_work_handler(&group->work);
+}
+
+/*
+ * Completion callback for the query issued by send_leave(); @context is the
+ * group. A failed leave is re-sent while retries remain and the re-send can
+ * be issued, consuming one retry. In every other case processing continues
+ * in mcast_work_handler(). @rec is not used.
+ */
+static void leave_handler(int status, struct ib_sa_mcmember_rec *rec,
+			  void *context)
+{
+	struct mcast_group *group = context;
+
+	if (!status || group->retries <= 0 ||
+	    send_leave(group, group->leave_state)) {
+		mcast_work_handler(&group->work);
+		return;
+	}
+
+	group->retries--;
+}
+
+/*
+ * Find or create the group for @mgid on @port and return it with one
+ * additional group reference. Returns NULL if a new group cannot be
+ * allocated.
+ *
+ * For the all-zero MGID the lookup is skipped and a new group is always
+ * created and inserted with duplicates allowed. A newly inserted group
+ * takes a reference on the port.
+ */
+static struct mcast_group *acquire_group(struct mcast_port *port,
+ union ib_gid *mgid, gfp_t gfp_mask)
+{
+ struct mcast_group *group, *cur_group;
+ unsigned long flags;
+ int is_mgid0;
+
+ is_mgid0 = !memcmp(&mgid0, mgid, sizeof mgid0);
+ if (!is_mgid0) {
+ spin_lock_irqsave(&port->lock, flags);
+ group = mcast_find(port, mgid);
+ /* jumps to "found" with port->lock still held */
+ if (group)
+ goto found;
+ spin_unlock_irqrestore(&port->lock, flags);
+ }
+
+ group = kzalloc(sizeof *group, gfp_mask);
+ if (!group)
+ return NULL;
+
+ group->retries = 3;
+ group->port = port;
+ group->rec.mgid = *mgid;
+ group->pkey_index = MCAST_INVALID_PKEY_INDEX;
+ INIT_LIST_HEAD(&group->pending_list);
+ INIT_LIST_HEAD(&group->active_list);
+ INIT_WORK(&group->work, mcast_work_handler);
+ spin_lock_init(&group->lock);
+
+ spin_lock_irqsave(&port->lock, flags);
+ /* the lock was dropped for the allocation, so a group may have appeared */
+ cur_group = mcast_insert(port, group, is_mgid0);
+ if (cur_group) {
+ kfree(group);
+ group = cur_group;
+ } else
+ atomic_inc(&port->refcount);
+found:
+ atomic_inc(&group->refcount);
+ spin_unlock_irqrestore(&port->lock, flags);
+ return group;
+}
+
+/*
+ * Start joining the multicast group described by @rec on @port_num of
+ * @device. @callback is invoked with the outcome; @context is stored in
+ * the returned structure.
+ *
+ * All join requests to a single group are serialized, which keeps things
+ * simple: otherwise two users could join the same group at the same time
+ * with different configurations, or one could leave while a join is in
+ * progress, and locking around error recovery would become difficult.
+ *
+ * Returns the multicast handle, ERR_PTR(-ENODEV) when this client has no
+ * data for @device, or ERR_PTR(-ENOMEM) when an allocation fails.
+ * NOTE(review): @port_num is used to index dev->port[] without a range
+ * check in this function - confirm callers pass a valid port.
+ */
+struct ib_sa_multicast *
+ib_sa_join_multicast(struct ib_sa_client *client,
+		     struct ib_device *device, u8 port_num,
+		     struct ib_sa_mcmember_rec *rec,
+		     ib_sa_comp_mask comp_mask, gfp_t gfp_mask,
+		     int (*callback)(int status,
+				     struct ib_sa_multicast *multicast),
+		     void *context)
+{
+	struct mcast_device *dev;
+	struct mcast_member *member;
+	struct ib_sa_multicast *handle;
+
+	dev = ib_get_client_data(device, &mcast_client);
+	if (!dev)
+		return ERR_PTR(-ENODEV);
+
+	member = kmalloc(sizeof *member, gfp_mask);
+	if (!member)
+		return ERR_PTR(-ENOMEM);
+
+	ib_sa_client_get(client);
+	member->client = client;
+	member->multicast.rec = *rec;
+	member->multicast.comp_mask = comp_mask;
+	member->multicast.callback = callback;
+	member->multicast.context = context;
+	init_completion(&member->comp);
+	atomic_set(&member->refcount, 1);
+	member->state = MCAST_JOINING;
+
+	member->group = acquire_group(&dev->port[port_num - dev->start_port],
+				      &rec->mgid, gfp_mask);
+	if (member->group) {
+		/*
+		 * The user receives the multicast structure in the callback
+		 * and may free it before we return from this routine, so
+		 * the pointer to return is saved before any callback can be
+		 * queued.
+		 */
+		handle = &member->multicast;
+		queue_join(member);
+		return handle;
+	}
+
+	/* no group could be obtained: undo the setup above */
+	ib_sa_client_put(client);
+	kfree(member);
+	return ERR_PTR(-ENOMEM);
+}
+EXPORT_SYMBOL(ib_sa_join_multicast);
+
+/*
+ * Release a membership obtained from ib_sa_join_multicast().
+ *
+ * The member is removed from its group (dropping its join-state counts if
+ * it had become a member). If the group is idle the work item is queued so
+ * that unused join states can be left; otherwise the member's group
+ * reference is released here. The function then waits until no other
+ * holder references the member before freeing it.
+ */
+void ib_sa_free_multicast(struct ib_sa_multicast *multicast)
+{
+ struct mcast_member *member;
+ struct mcast_group *group;
+
+ member = container_of(multicast, struct mcast_member, multicast);
+ group = member->group;
+
+ spin_lock_irq(&group->lock);
+ if (member->state == MCAST_MEMBER)
+ adjust_membership(group, multicast->rec.join_state, -1);
+
+ list_del_init(&member->list);
+
+ if (group->state == MCAST_IDLE) {
+ group->state = MCAST_BUSY;
+ spin_unlock_irq(&group->lock);
+ /* Continue to hold reference on group until callback */
+ queue_work(mcast_wq, &group->work);
+ } else {
+ spin_unlock_irq(&group->lock);
+ release_group(group);
+ }
+
+ /* drop our own reference, then wait for any in-flight callback users */
+ deref_member(member);
+ wait_for_completion(&member->comp);
+ ib_sa_client_put(member->client);
+ kfree(member);
+}
+EXPORT_SYMBOL(ib_sa_free_multicast);
+
+/*
+ * Copy the locally tracked record of the group identified by @mgid on
+ * @port_num of @device into @rec.
+ *
+ * Returns 0 on success, -ENODEV when this client has no data for @device,
+ * or -EADDRNOTAVAIL when no group with that MGID is tracked on the port.
+ */
+int ib_sa_get_mcmember_rec(struct ib_device *device, u8 port_num,
+			   union ib_gid *mgid, struct ib_sa_mcmember_rec *rec)
+{
+	struct mcast_device *dev = ib_get_client_data(device, &mcast_client);
+	struct mcast_port *port;
+	struct mcast_group *found;
+	unsigned long irq_flags;
+	int ret = -EADDRNOTAVAIL;
+
+	if (!dev)
+		return -ENODEV;
+
+	port = &dev->port[port_num - dev->start_port];
+
+	spin_lock_irqsave(&port->lock, irq_flags);
+	found = mcast_find(port, mgid);
+	if (found) {
+		*rec = found->rec;
+		ret = 0;
+	}
+	spin_unlock_irqrestore(&port->lock, irq_flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(ib_sa_get_mcmember_rec);
+
+/*
+ * Fill @ah_attr from the multicast member record @rec.
+ *
+ * The source GID index is found by looking rec->port_gid up in the cached
+ * GID table of @device; if that lookup fails its error is returned and
+ * @ah_attr is left untouched. Otherwise @ah_attr is zeroed, filled in with
+ * a GRH addressed to the group's MGID, and 0 is returned.
+ */
+int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num,
+			     struct ib_sa_mcmember_rec *rec,
+			     struct ib_ah_attr *ah_attr)
+{
+	u16 sgid_idx;
+	u8 found_port;	/* filled by the lookup, not used afterwards */
+	int err;
+
+	err = ib_find_cached_gid(device, &rec->port_gid, &found_port,
+				 &sgid_idx);
+	if (err)
+		return err;
+
+	memset(ah_attr, 0, sizeof *ah_attr);
+
+	/* link-level addressing */
+	ah_attr->port_num = port_num;
+	ah_attr->dlid = be16_to_cpu(rec->mlid);
+	ah_attr->sl = rec->sl;
+	ah_attr->static_rate = rec->rate;
+
+	/* global route header */
+	ah_attr->ah_flags = IB_AH_GRH;
+	ah_attr->grh.dgid = rec->mgid;
+	ah_attr->grh.sgid_index = (u8) sgid_idx;
+	ah_attr->grh.flow_label = be32_to_cpu(rec->flow_label);
+	ah_attr->grh.hop_limit = rec->hop_limit;
+	ah_attr->grh.traffic_class = rec->traffic_class;
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_init_ah_from_mcmember);
+
+/*
+ * Apply @state to every group on @port. Idle groups additionally get a
+ * group reference and have their work item queued so the new state is
+ * processed. A group already in MCAST_GROUP_ERROR keeps that state.
+ */
+static void mcast_groups_event(struct mcast_port *port,
+ enum mcast_group_state state)
+{
+ struct mcast_group *group;
+ struct rb_node *node;
+ unsigned long flags;
+
+ spin_lock_irqsave(&port->lock, flags);
+ for (node = rb_first(&port->table); node; node = rb_next(node)) {
+ group = rb_entry(node, struct mcast_group, node);
+ /* interrupts are already off via port->lock, so a plain lock is used */
+ spin_lock(&group->lock);
+ if (group->state == MCAST_IDLE) {
+ atomic_inc(&group->refcount);
+ queue_work(mcast_wq, &group->work);
+ }
+ if (group->state != MCAST_GROUP_ERROR)
+ group->state = state;
+ spin_unlock(&group->lock);
+ }
+ spin_unlock_irqrestore(&port->lock, flags);
+}
+
+/*
+ * Device event callback. Events for ports whose link layer is not
+ * InfiniBand are ignored. Port error, LID change, SM change and client
+ * reregister events put the port's groups into MCAST_GROUP_ERROR; a P_Key
+ * change puts them into MCAST_PKEY_EVENT. All other events are ignored.
+ */
+static void mcast_event_handler(struct ib_event_handler *handler,
+				struct ib_event *event)
+{
+	struct mcast_device *dev =
+		container_of(handler, struct mcast_device, event_handler);
+	enum mcast_group_state new_state;
+
+	if (rdma_port_get_link_layer(dev->device, event->element.port_num) !=
+	    IB_LINK_LAYER_INFINIBAND)
+		return;
+
+	switch (event->event) {
+	case IB_EVENT_PORT_ERR:
+	case IB_EVENT_LID_CHANGE:
+	case IB_EVENT_SM_CHANGE:
+	case IB_EVENT_CLIENT_REREGISTER:
+		new_state = MCAST_GROUP_ERROR;
+		break;
+	case IB_EVENT_PKEY_CHANGE:
+		new_state = MCAST_PKEY_EVENT;
+		break;
+	default:
+		return;
+	}
+
+	mcast_groups_event(&dev->port[event->element.port_num -
+				      dev->start_port], new_state);
+}
+
+/*
+ * Client "add" callback: set up multicast state for @device.
+ *
+ * Devices whose transport is not IB are skipped. Only ports with an
+ * InfiniBand link layer are initialised; if there is none the state is
+ * freed again. Otherwise the state is attached to the device as client
+ * data and the event handler is registered.
+ * NOTE(review): entries of dev->port[] for non-InfiniBand ports are left
+ * uninitialised (the memory comes from kmalloc()).
+ */
+static void mcast_add_one(struct ib_device *device)
+{
+	struct mcast_device *dev;
+	struct mcast_port *port;
+	int idx, nports;
+	int ib_ports = 0;
+
+	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+		return;
+
+	dev = kmalloc(sizeof *dev + device->phys_port_cnt * sizeof *port,
+		      GFP_KERNEL);
+	if (!dev)
+		return;
+
+	if (device->node_type == RDMA_NODE_IB_SWITCH) {
+		dev->start_port = 0;
+		dev->end_port = 0;
+	} else {
+		dev->start_port = 1;
+		dev->end_port = device->phys_port_cnt;
+	}
+
+	nports = dev->end_port - dev->start_port + 1;
+	for (idx = 0; idx < nports; idx++) {
+		if (rdma_port_get_link_layer(device, dev->start_port + idx) ==
+		    IB_LINK_LAYER_INFINIBAND) {
+			port = &dev->port[idx];
+			port->dev = dev;
+			port->port_num = dev->start_port + idx;
+			spin_lock_init(&port->lock);
+			port->table = RB_ROOT;
+			init_completion(&port->comp);
+			/* initial reference, dropped in mcast_remove_one() */
+			atomic_set(&port->refcount, 1);
+			ib_ports++;
+		}
+	}
+
+	if (ib_ports == 0) {
+		kfree(dev);
+		return;
+	}
+
+	dev->device = device;
+	ib_set_client_data(device, &mcast_client, dev);
+
+	INIT_IB_EVENT_HANDLER(&dev->event_handler, device, mcast_event_handler);
+	ib_register_event_handler(&dev->event_handler);
+}
+
+/*
+ * Client "remove" callback: tear down the state created by
+ * mcast_add_one(). The event handler is unregistered and the workqueue
+ * flushed first; then, for every InfiniBand port, the initial port
+ * reference is dropped and we wait until all groups have released theirs.
+ */
+static void mcast_remove_one(struct ib_device *device)
+{
+	struct mcast_device *dev = ib_get_client_data(device, &mcast_client);
+	struct mcast_port *port;
+	int idx;
+
+	if (!dev)
+		return;
+
+	ib_unregister_event_handler(&dev->event_handler);
+	flush_workqueue(mcast_wq);
+
+	for (idx = 0; idx <= dev->end_port - dev->start_port; idx++) {
+		if (rdma_port_get_link_layer(device, dev->start_port + idx) !=
+		    IB_LINK_LAYER_INFINIBAND)
+			continue;
+
+		port = &dev->port[idx];
+		deref_port(port);
+		wait_for_completion(&port->comp);
+	}
+
+	kfree(dev);
+}
+
+/*
+ * Module-level setup: create the "ib_mcast" single-threaded workqueue,
+ * register the SA client and register the IB client.
+ *
+ * Returns 0 on success, -ENOMEM if the workqueue cannot be created, or the
+ * error from ib_register_client() after undoing the earlier steps.
+ */
+int mcast_init(void)
+{
+	int err;
+
+	mcast_wq = create_singlethread_workqueue("ib_mcast");
+	if (!mcast_wq)
+		return -ENOMEM;
+
+	ib_sa_register_client(&sa_client);
+
+	err = ib_register_client(&mcast_client);
+	if (!err)
+		return 0;
+
+	/* registration failed: unwind in reverse order */
+	ib_sa_unregister_client(&sa_client);
+	destroy_workqueue(mcast_wq);
+	return err;
+}
+
+/* Undo mcast_init(): unregister both clients, then destroy the workqueue. */
+void mcast_cleanup(void)
+{
+ ib_unregister_client(&mcast_client);
+ ib_sa_unregister_client(&sa_client);
+ destroy_workqueue(mcast_wq);
+}
diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c
new file mode 100644
index 000000000..23dd5a5c7
--- /dev/null
+++ b/drivers/infiniband/core/netlink.c
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2010 Voltaire Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__
+
+#include <linux/export.h>
+#include <net/netlink.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <rdma/rdma_netlink.h>
+
+/* One registered netlink client, linked on client_list. */
+struct ibnl_client {
+ struct list_head list; /* linkage on client_list */
+ int index; /* client id, matched against RDMA_NL_GET_CLIENT() */
+ int nops; /* number of entries in @cb_table */
+ const struct ibnl_client_cbs *cb_table; /* per-op callbacks, indexed by op */
+};
+
+/* Held around every client_list access and around message reception. */
+static DEFINE_MUTEX(ibnl_mutex);
+/* Kernel netlink socket created in ibnl_init(). */
+static struct sock *nls;
+static LIST_HEAD(client_list);
+
+/*
+ * Register a netlink client with id @index and the @nops callbacks in
+ * @cb_table. The table is referenced, not copied.
+ *
+ * Returns 0 on success, -ENOMEM if the bookkeeping entry cannot be
+ * allocated, or -EINVAL if a client with the same @index already exists.
+ */
+int ibnl_add_client(int index, int nops,
+		    const struct ibnl_client_cbs cb_table[])
+{
+	struct ibnl_client *entry;
+	struct ibnl_client *cur;
+	int ret = 0;
+
+	entry = kmalloc(sizeof *entry, GFP_KERNEL);
+	if (!entry)
+		return -ENOMEM;
+
+	entry->index = index;
+	entry->nops = nops;
+	entry->cb_table = cb_table;
+
+	mutex_lock(&ibnl_mutex);
+
+	list_for_each_entry(cur, &client_list, list) {
+		if (cur->index != index)
+			continue;
+
+		pr_warn("Client for %d already exists\n", index);
+		ret = -EINVAL;
+		break;
+	}
+
+	if (!ret)
+		list_add_tail(&entry->list, &client_list);
+
+	mutex_unlock(&ibnl_mutex);
+
+	/* a rejected entry was never linked, so it is freed here */
+	if (ret)
+		kfree(entry);
+
+	return ret;
+}
+EXPORT_SYMBOL(ibnl_add_client);
+
+/*
+ * Unregister the netlink client with id @index.
+ * Returns 0 when the client was found and removed, -EINVAL otherwise.
+ */
+int ibnl_remove_client(int index)
+{
+	struct ibnl_client *cur;
+	struct ibnl_client *victim = NULL;
+
+	mutex_lock(&ibnl_mutex);
+	list_for_each_entry(cur, &client_list, list) {
+		if (cur->index == index) {
+			victim = cur;
+			break;
+		}
+	}
+
+	if (victim)
+		list_del(&(victim->list));
+	else
+		pr_warn("Can't remove callback for client idx %d. Not found\n", index);
+	mutex_unlock(&ibnl_mutex);
+
+	if (!victim)
+		return -EINVAL;
+
+	kfree(victim);
+	return 0;
+}
+EXPORT_SYMBOL(ibnl_remove_client);
+
+/*
+ * Append a netlink message header for (@client, @op) with @len bytes of
+ * payload room to @skb and store the header pointer in *@nlh.
+ *
+ * Returns a pointer to the message payload, or NULL if the header could
+ * not be added, in which case @skb is trimmed back to its previous tail.
+ */
+void *ibnl_put_msg(struct sk_buff *skb, struct nlmsghdr **nlh, int seq,
+		   int len, int client, int op, int flags)
+{
+	unsigned char *tail_before = skb_tail_pointer(skb);
+
+	*nlh = nlmsg_put(skb, 0, seq, RDMA_NL_GET_TYPE(client, op),
+			 len, flags);
+	if (*nlh) {
+		/* message length is whatever nlmsg_put() added to the skb */
+		(*nlh)->nlmsg_len = skb_tail_pointer(skb) - tail_before;
+		return nlmsg_data(*nlh);
+	}
+
+	nlmsg_trim(skb, tail_before);
+	return NULL;
+}
+EXPORT_SYMBOL(ibnl_put_msg);
+
+/*
+ * Append a netlink attribute of @type carrying @len bytes from @data to
+ * @skb and grow @nlh->nlmsg_len by the number of bytes added.
+ *
+ * Returns 0 on success or -EMSGSIZE if the attribute does not fit.
+ * NOTE(review): on failure the skb is trimmed to
+ * (previous tail - nlmsg_len), i.e. further back than the previous tail;
+ * this looks like it discards the whole message so far - confirm intent.
+ */
+int ibnl_put_attr(struct sk_buff *skb, struct nlmsghdr *nlh,
+		  int len, void *data, int type)
+{
+	unsigned char *tail_before = skb_tail_pointer(skb);
+
+	if (nla_put(skb, type, len, data)) {
+		nlmsg_trim(skb, tail_before - nlh->nlmsg_len);
+		return -EMSGSIZE;
+	}
+
+	nlh->nlmsg_len += skb_tail_pointer(skb) - tail_before;
+	return 0;
+}
+EXPORT_SYMBOL(ibnl_put_attr);
+
+/*
+ * Dispatch one received netlink message. The message type encodes a client
+ * index and an op; the matching client's dump callback for that op is
+ * started via netlink_dump_start().
+ *
+ * Returns the result of netlink_dump_start(), or -EINVAL when no client
+ * matches, the op is out of range, or the op has no dump callback.
+ */
+static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+	struct ibnl_client *client;
+	int type = nlh->nlmsg_type;
+	int index = RDMA_NL_GET_CLIENT(type);
+	int op = RDMA_NL_GET_OP(type);
+
+	list_for_each_entry(client, &client_list, list) {
+		if (client->index != index)
+			continue;
+
+		if (op < 0 || op >= client->nops ||
+		    !client->cb_table[op].dump)
+			return -EINVAL;
+
+		{
+			struct netlink_dump_control c = {
+				.dump = client->cb_table[op].dump,
+				.module = client->cb_table[op].module,
+			};
+			return netlink_dump_start(nls, skb, nlh, &c);
+		}
+	}
+
+	pr_info("Index %d wasn't found in client list\n", index);
+	return -EINVAL;
+}
+
+/*
+ * Input callback of the netlink socket (installed in ibnl_init()). Holds
+ * ibnl_mutex while the messages in @skb are handed to ibnl_rcv_msg(), which
+ * walks client_list.
+ */
+static void ibnl_rcv(struct sk_buff *skb)
+{
+ mutex_lock(&ibnl_mutex);
+ netlink_rcv_skb(skb, &ibnl_rcv_msg);
+ mutex_unlock(&ibnl_mutex);
+}
+
+/*
+ * Send @skb to the netlink peer @pid over the RDMA netlink socket and
+ * return nlmsg_unicast()'s result. @nlh is not used.
+ */
+int ibnl_unicast(struct sk_buff *skb, struct nlmsghdr *nlh,
+ __u32 pid)
+{
+ return nlmsg_unicast(nls, skb, pid);
+}
+EXPORT_SYMBOL(ibnl_unicast);
+
+/*
+ * Send @skb to multicast @group over the RDMA netlink socket, allocating
+ * with @flags, and return nlmsg_multicast()'s result. @nlh is not used.
+ */
+int ibnl_multicast(struct sk_buff *skb, struct nlmsghdr *nlh,
+ unsigned int group, gfp_t flags)
+{
+ return nlmsg_multicast(nls, skb, 0, group, flags);
+}
+EXPORT_SYMBOL(ibnl_multicast);
+
+/*
+ * Create the NETLINK_RDMA kernel socket in the initial network namespace,
+ * with ibnl_rcv() as its input handler.
+ * Returns 0 on success or -ENOMEM if the socket cannot be created.
+ */
+int __init ibnl_init(void)
+{
+	struct netlink_kernel_cfg cfg = {
+		.input	= ibnl_rcv,
+	};
+
+	nls = netlink_kernel_create(&init_net, NETLINK_RDMA, &cfg);
+	if (nls)
+		return 0;
+
+	pr_warn("Failed to create netlink socket\n");
+	return -ENOMEM;
+}
+
+/*
+ * Free every client still registered on client_list, then release the
+ * netlink socket created by ibnl_init().
+ */
+void ibnl_cleanup(void)
+{
+	struct ibnl_client *cur;
+
+	mutex_lock(&ibnl_mutex);
+	while (!list_empty(&client_list)) {
+		cur = list_first_entry(&client_list, struct ibnl_client, list);
+		list_del(&(cur->list));
+		kfree(cur);
+	}
+	mutex_unlock(&ibnl_mutex);
+
+	netlink_kernel_release(nls);
+}
diff --git a/drivers/infiniband/core/packer.c b/drivers/infiniband/core/packer.c
new file mode 100644
index 000000000..1b65986c0
--- /dev/null
+++ b/drivers/infiniband/core/packer.c
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/export.h>
+#include <linux/string.h>
+
+#include <rdma/ib_pack.h>
+
+/*
+ * Fetch the big-endian integer of @size bytes stored @offset bytes into
+ * @structure and return it in CPU byte order.  Only sizes of 1, 2, 4 and
+ * 8 bytes are supported; for anything else a warning is logged and 0 is
+ * returned.
+ */
+static u64 value_read(int offset, int size, void *structure)
+{
+	void *field = structure + offset;
+
+	if (size == 1)
+		return *(u8 *) field;
+	if (size == 2)
+		return be16_to_cpup((__be16 *) field);
+	if (size == 4)
+		return be32_to_cpup((__be32 *) field);
+	if (size == 8)
+		return be64_to_cpup((__be64 *) field);
+
+	printk(KERN_WARNING "Field size %d bits not handled\n", size * 8);
+	return 0;
+}
+
+/**
+ * ib_pack - Pack a structure into a buffer
+ * @desc:Array of structure field descriptions
+ * @desc_len:Number of entries in @desc
+ * @structure:Structure to pack from
+ * @buf:Buffer to pack into
+ *
+ * ib_pack() packs a list of structure fields into a buffer,
+ * controlled by the array of fields in @desc.  Entries whose
+ * struct_size_bytes is zero have no source field; the corresponding
+ * bits in @buf are written as zero.
+ */
+void ib_pack(const struct ib_field *desc,
+ int desc_len,
+ void *structure,
+ void *buf)
+{
+ int i;
+
+ for (i = 0; i < desc_len; ++i) {
+ if (desc[i].size_bits <= 32) {
+ /* Field fits in one 32-bit big-endian word: read-modify-write it. */
+ int shift;
+ u32 val;
+ __be32 mask;
+ __be32 *addr;
+
+ /* offset_bits counts from the most significant bit of the word. */
+ shift = 32 - desc[i].offset_bits - desc[i].size_bits;
+ if (desc[i].struct_size_bytes)
+ val = value_read(desc[i].struct_offset_bytes,
+ desc[i].struct_size_bytes,
+ structure) << shift;
+ else
+ val = 0;
+
+ /* 1ull keeps the shift well-defined when size_bits is 32. */
+ mask = cpu_to_be32(((1ull << desc[i].size_bits) - 1) << shift);
+ addr = (__be32 *) buf + desc[i].offset_words;
+ *addr = (*addr & ~mask) | (cpu_to_be32(val) & mask);
+ } else if (desc[i].size_bits <= 64) {
+ /*
+ * 33..64-bit field: handled as one 64-bit big-endian quantity
+ * that starts at 32-bit word offset_words.
+ */
+ int shift;
+ u64 val;
+ __be64 mask;
+ __be64 *addr;
+
+ shift = 64 - desc[i].offset_bits - desc[i].size_bits;
+ if (desc[i].struct_size_bytes)
+ val = value_read(desc[i].struct_offset_bytes,
+ desc[i].struct_size_bytes,
+ structure) << shift;
+ else
+ val = 0;
+
+ mask = cpu_to_be64((~0ull >> (64 - desc[i].size_bits)) << shift);
+ addr = (__be64 *) ((__be32 *) buf + desc[i].offset_words);
+ *addr = (*addr & ~mask) | (cpu_to_be64(val) & mask);
+ } else {
+ /*
+ * Wider fields are copied bytewise.  A field that is not
+ * byte-aligned only triggers a warning; the copy below still
+ * runs with the offsets and size rounded down to whole bytes.
+ */
+ if (desc[i].offset_bits % 8 ||
+ desc[i].size_bits % 8) {
+ printk(KERN_WARNING "Structure field %s of size %d "
+ "bits is not byte-aligned\n",
+ desc[i].field_name, desc[i].size_bits);
+ }
+
+ if (desc[i].struct_size_bytes)
+ memcpy(buf + desc[i].offset_words * 4 +
+ desc[i].offset_bits / 8,
+ structure + desc[i].struct_offset_bytes,
+ desc[i].size_bits / 8);
+ else
+ memset(buf + desc[i].offset_words * 4 +
+ desc[i].offset_bits / 8,
+ 0,
+ desc[i].size_bits / 8);
+ }
+ }
+}
+EXPORT_SYMBOL(ib_pack);
+
+/*
+ * Store @val as a big-endian integer of @size bytes at @offset bytes into
+ * @structure.  Only sizes of 1, 2, 4 and 8 bytes are supported; for
+ * anything else a warning is logged and nothing is written.
+ */
+static void value_write(int offset, int size, u64 val, void *structure)
+{
+	void *field = structure + offset;
+
+	switch (size) {
+	case 1:
+		*(u8 *) field = val;
+		break;
+	case 2:
+		*(__be16 *) field = cpu_to_be16(val);
+		break;
+	case 4:
+		*(__be32 *) field = cpu_to_be32(val);
+		break;
+	case 8:
+		*(__be64 *) field = cpu_to_be64(val);
+		break;
+	default:
+		printk(KERN_WARNING "Field size %d bits not handled\n", size * 8);
+	}
+}
+
+/**
+ * ib_unpack - Unpack a buffer into a structure
+ * @desc:Array of structure field descriptions
+ * @desc_len:Number of entries in @desc
+ * @buf:Buffer to unpack from
+ * @structure:Structure to unpack into
+ *
+ * ib_unpack() unpacks a list of structure fields from a buffer,
+ * controlled by the array of fields in @desc.  Entries whose
+ * struct_size_bytes is zero have no destination in @structure and
+ * are skipped.
+ */
+void ib_unpack(const struct ib_field *desc,
+	       int desc_len,
+	       void *buf,
+	       void *structure)
+{
+	int i;
+
+	for (i = 0; i < desc_len; ++i) {
+		if (!desc[i].struct_size_bytes)
+			continue;
+
+		if (desc[i].size_bits <= 32) {
+			/* Field lives inside one 32-bit big-endian word. */
+			int shift;
+			u32 val;
+			u32 mask;
+			__be32 *addr;
+
+			/* offset_bits counts from the most significant bit. */
+			shift = 32 - desc[i].offset_bits - desc[i].size_bits;
+			mask = ((1ull << desc[i].size_bits) - 1) << shift;
+			addr = (__be32 *) buf + desc[i].offset_words;
+			val = (be32_to_cpup(addr) & mask) >> shift;
+			value_write(desc[i].struct_offset_bytes,
+				    desc[i].struct_size_bytes,
+				    val,
+				    structure);
+		} else if (desc[i].size_bits <= 64) {
+			int shift;
+			u64 val;
+			u64 mask;
+			__be64 *addr;
+
+			shift = 64 - desc[i].offset_bits - desc[i].size_bits;
+			mask = (~0ull >> (64 - desc[i].size_bits)) << shift;
+			/*
+			 * offset_words counts 32-bit words (as in ib_pack()
+			 * and in the byte-copy path below), so step in __be32
+			 * units before treating the location as a 64-bit
+			 * quantity.  Indexing a __be64 pointer directly would
+			 * double the byte offset for any non-zero offset_words.
+			 */
+			addr = (__be64 *) ((__be32 *) buf + desc[i].offset_words);
+			val = (be64_to_cpup(addr) & mask) >> shift;
+			value_write(desc[i].struct_offset_bytes,
+				    desc[i].struct_size_bytes,
+				    val,
+				    structure);
+		} else {
+			/*
+			 * Wider fields are copied bytewise.  A field that is
+			 * not byte-aligned only triggers a warning; the copy
+			 * still runs with offset and size rounded down to
+			 * whole bytes.
+			 */
+			if (desc[i].offset_bits % 8 ||
+			    desc[i].size_bits % 8) {
+				printk(KERN_WARNING "Structure field %s of size %d "
+				       "bits is not byte-aligned\n",
+				       desc[i].field_name, desc[i].size_bits);
+			}
+
+			memcpy(structure + desc[i].struct_offset_bytes,
+			       buf + desc[i].offset_words * 4 +
+			       desc[i].offset_bits / 8,
+			       desc[i].size_bits / 8);
+		}
+	}
+}
+EXPORT_SYMBOL(ib_unpack);
diff --git a/drivers/infiniband/core/sa.h b/drivers/infiniband/core/sa.h
new file mode 100644
index 000000000..b1d4bbf4c
--- /dev/null
+++ b/drivers/infiniband/core/sa.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2006 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef SA_H
+#define SA_H
+
+#include <rdma/ib_sa.h>
+
+/* Take a reference on @client by incrementing its users count. */
+static inline void ib_sa_client_get(struct ib_sa_client *client)
+{
+ atomic_inc(&client->users);
+}
+
+/*
+ * Drop a reference on @client.  When the users count reaches zero the
+ * client's completion is signalled.
+ */
+static inline void ib_sa_client_put(struct ib_sa_client *client)
+{
+ if (atomic_dec_and_test(&client->users))
+ complete(&client->comp);
+}
+
+int ib_sa_mcmember_rec_query(struct ib_sa_client *client,
+ struct ib_device *device, u8 port_num,
+ u8 method,
+ struct ib_sa_mcmember_rec *rec,
+ ib_sa_comp_mask comp_mask,
+ int timeout_ms, gfp_t gfp_mask,
+ void (*callback)(int status,
+ struct ib_sa_mcmember_rec *resp,
+ void *context),
+ void *context,
+ struct ib_sa_query **sa_query);
+
+int mcast_init(void);
+void mcast_cleanup(void);
+
+#endif /* SA_H */
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
new file mode 100644
index 000000000..c38f030f0
--- /dev/null
+++ b/drivers/infiniband/core/sa_query.c
@@ -0,0 +1,1280 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2006 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/err.h>
+#include <linux/random.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/dma-mapping.h>
+#include <linux/kref.h>
+#include <linux/idr.h>
+#include <linux/workqueue.h>
+#include <uapi/linux/if_ether.h>
+#include <rdma/ib_pack.h>
+#include <rdma/ib_cache.h>
+#include "sa.h"
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("InfiniBand subnet administration query support");
+MODULE_LICENSE("Dual BSD/GPL");
+
+/* Reference-counted address handle used to send MADs to the subnet manager. */
+struct ib_sa_sm_ah {
+ struct ib_ah *ah; /* AH created towards the port's sm_lid/sm_sl */
+ struct kref ref; /* released through free_sm_ah() */
+ u16 pkey_index; /* index found for IB_DEFAULT_PKEY_FULL, else 0 */
+ u8 src_path_mask; /* (1 << lmc) - 1 for the port */
+};
+
+/* Per-port SA state. */
+struct ib_sa_port {
+ struct ib_mad_agent *agent; /* MAD agent used to send queries */
+ struct ib_sa_sm_ah *sm_ah; /* current SM AH or NULL; guarded by ah_lock */
+ struct work_struct update_task; /* work item handled by update_sm_ah() */
+ spinlock_t ah_lock; /* protects sm_ah */
+ u8 port_num;
+};
+
+/*
+ * Per-device SA state.  port[] holds one entry per port and is indexed
+ * by (port_num - start_port).
+ */
+struct ib_sa_device {
+	int start_port, end_port;
+	struct ib_event_handler event_handler;
+	struct ib_sa_port port[];	/* C99 flexible array member */
+};
+
+/*
+ * Common part of every SA query; embedded as the sa_query member of the
+ * attribute-specific query structures below.
+ */
+struct ib_sa_query {
+ /* Attribute-specific response handler; may be NULL. */
+ void (*callback)(struct ib_sa_query *, int, struct ib_sa_mad *);
+ /* Frees the containing attribute-specific query structure. */
+ void (*release)(struct ib_sa_query *);
+ struct ib_sa_client *client;
+ struct ib_sa_port *port;
+ struct ib_mad_send_buf *mad_buf;
+ struct ib_sa_sm_ah *sm_ah; /* reference taken in alloc_mad() */
+ int id; /* id allocated from query_idr in send_mad() */
+};
+
+/*
+ * Attribute-specific query wrappers.  Each stores the caller's typed
+ * callback and context next to the generic ib_sa_query; the generic
+ * handlers get back to the wrapper with container_of() on sa_query.
+ */
+struct ib_sa_service_query {
+ void (*callback)(int, struct ib_sa_service_rec *, void *);
+ void *context;
+ struct ib_sa_query sa_query;
+};
+
+struct ib_sa_path_query {
+ void (*callback)(int, struct ib_sa_path_rec *, void *);
+ void *context;
+ struct ib_sa_query sa_query;
+};
+
+struct ib_sa_guidinfo_query {
+ void (*callback)(int, struct ib_sa_guidinfo_rec *, void *);
+ void *context;
+ struct ib_sa_query sa_query;
+};
+
+struct ib_sa_mcmember_query {
+ void (*callback)(int, struct ib_sa_mcmember_rec *, void *);
+ void *context;
+ struct ib_sa_query sa_query;
+};
+
+static void ib_sa_add_one(struct ib_device *device);
+static void ib_sa_remove_one(struct ib_device *device);
+
+/* IB client descriptor; also the key used with ib_get_client_data(). */
+static struct ib_client sa_client = {
+ .name = "sa",
+ .add = ib_sa_add_one,
+ .remove = ib_sa_remove_one
+};
+
+/* idr_lock protects query_idr, which maps query ids to struct ib_sa_query. */
+static DEFINE_SPINLOCK(idr_lock);
+static DEFINE_IDR(query_idr);
+
+/* tid_lock protects tid, the low 32 bits of the next MAD transaction id. */
+static DEFINE_SPINLOCK(tid_lock);
+static u32 tid;
+
+/*
+ * Fill in the structure-side members of an ib_field entry (offset, size
+ * and printable name) for member @field of struct ib_sa_path_rec.
+ */
+#define PATH_REC_FIELD(field) \
+ .struct_offset_bytes = offsetof(struct ib_sa_path_rec, field), \
+ .struct_size_bytes = sizeof ((struct ib_sa_path_rec *) 0)->field, \
+ .field_name = "sa_path_rec:" #field
+
+/*
+ * Packed layout of a path record as consumed by ib_pack()/ib_unpack():
+ * offset_words is in 32-bit words from the start of the attribute and
+ * offset_bits counts from the most significant bit of that word.
+ */
+static const struct ib_field path_rec_table[] = {
+ { PATH_REC_FIELD(service_id),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 64 },
+ { PATH_REC_FIELD(dgid),
+ .offset_words = 2,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { PATH_REC_FIELD(sgid),
+ .offset_words = 6,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { PATH_REC_FIELD(dlid),
+ .offset_words = 10,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { PATH_REC_FIELD(slid),
+ .offset_words = 10,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { PATH_REC_FIELD(raw_traffic),
+ .offset_words = 11,
+ .offset_bits = 0,
+ .size_bits = 1 },
+ { RESERVED,
+ .offset_words = 11,
+ .offset_bits = 1,
+ .size_bits = 3 },
+ { PATH_REC_FIELD(flow_label),
+ .offset_words = 11,
+ .offset_bits = 4,
+ .size_bits = 20 },
+ { PATH_REC_FIELD(hop_limit),
+ .offset_words = 11,
+ .offset_bits = 24,
+ .size_bits = 8 },
+ { PATH_REC_FIELD(traffic_class),
+ .offset_words = 12,
+ .offset_bits = 0,
+ .size_bits = 8 },
+ { PATH_REC_FIELD(reversible),
+ .offset_words = 12,
+ .offset_bits = 8,
+ .size_bits = 1 },
+ { PATH_REC_FIELD(numb_path),
+ .offset_words = 12,
+ .offset_bits = 9,
+ .size_bits = 7 },
+ { PATH_REC_FIELD(pkey),
+ .offset_words = 12,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { PATH_REC_FIELD(qos_class),
+ .offset_words = 13,
+ .offset_bits = 0,
+ .size_bits = 12 },
+ { PATH_REC_FIELD(sl),
+ .offset_words = 13,
+ .offset_bits = 12,
+ .size_bits = 4 },
+ { PATH_REC_FIELD(mtu_selector),
+ .offset_words = 13,
+ .offset_bits = 16,
+ .size_bits = 2 },
+ { PATH_REC_FIELD(mtu),
+ .offset_words = 13,
+ .offset_bits = 18,
+ .size_bits = 6 },
+ { PATH_REC_FIELD(rate_selector),
+ .offset_words = 13,
+ .offset_bits = 24,
+ .size_bits = 2 },
+ { PATH_REC_FIELD(rate),
+ .offset_words = 13,
+ .offset_bits = 26,
+ .size_bits = 6 },
+ { PATH_REC_FIELD(packet_life_time_selector),
+ .offset_words = 14,
+ .offset_bits = 0,
+ .size_bits = 2 },
+ { PATH_REC_FIELD(packet_life_time),
+ .offset_words = 14,
+ .offset_bits = 2,
+ .size_bits = 6 },
+ { PATH_REC_FIELD(preference),
+ .offset_words = 14,
+ .offset_bits = 8,
+ .size_bits = 8 },
+ { RESERVED,
+ .offset_words = 14,
+ .offset_bits = 16,
+ .size_bits = 48 },
+};
+
+/*
+ * Fill in the structure-side members of an ib_field entry (offset, size
+ * and printable name) for member @field of struct ib_sa_mcmember_rec.
+ */
+#define MCMEMBER_REC_FIELD(field) \
+ .struct_offset_bytes = offsetof(struct ib_sa_mcmember_rec, field), \
+ .struct_size_bytes = sizeof ((struct ib_sa_mcmember_rec *) 0)->field, \
+ .field_name = "sa_mcmember_rec:" #field
+
+/*
+ * Packed layout of a multicast member record as consumed by
+ * ib_pack()/ib_unpack(): offset_words is in 32-bit words and offset_bits
+ * counts from the most significant bit of that word.
+ */
+static const struct ib_field mcmember_rec_table[] = {
+ { MCMEMBER_REC_FIELD(mgid),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { MCMEMBER_REC_FIELD(port_gid),
+ .offset_words = 4,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { MCMEMBER_REC_FIELD(qkey),
+ .offset_words = 8,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { MCMEMBER_REC_FIELD(mlid),
+ .offset_words = 9,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { MCMEMBER_REC_FIELD(mtu_selector),
+ .offset_words = 9,
+ .offset_bits = 16,
+ .size_bits = 2 },
+ { MCMEMBER_REC_FIELD(mtu),
+ .offset_words = 9,
+ .offset_bits = 18,
+ .size_bits = 6 },
+ { MCMEMBER_REC_FIELD(traffic_class),
+ .offset_words = 9,
+ .offset_bits = 24,
+ .size_bits = 8 },
+ { MCMEMBER_REC_FIELD(pkey),
+ .offset_words = 10,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { MCMEMBER_REC_FIELD(rate_selector),
+ .offset_words = 10,
+ .offset_bits = 16,
+ .size_bits = 2 },
+ { MCMEMBER_REC_FIELD(rate),
+ .offset_words = 10,
+ .offset_bits = 18,
+ .size_bits = 6 },
+ { MCMEMBER_REC_FIELD(packet_life_time_selector),
+ .offset_words = 10,
+ .offset_bits = 24,
+ .size_bits = 2 },
+ { MCMEMBER_REC_FIELD(packet_life_time),
+ .offset_words = 10,
+ .offset_bits = 26,
+ .size_bits = 6 },
+ { MCMEMBER_REC_FIELD(sl),
+ .offset_words = 11,
+ .offset_bits = 0,
+ .size_bits = 4 },
+ { MCMEMBER_REC_FIELD(flow_label),
+ .offset_words = 11,
+ .offset_bits = 4,
+ .size_bits = 20 },
+ { MCMEMBER_REC_FIELD(hop_limit),
+ .offset_words = 11,
+ .offset_bits = 24,
+ .size_bits = 8 },
+ { MCMEMBER_REC_FIELD(scope),
+ .offset_words = 12,
+ .offset_bits = 0,
+ .size_bits = 4 },
+ { MCMEMBER_REC_FIELD(join_state),
+ .offset_words = 12,
+ .offset_bits = 4,
+ .size_bits = 4 },
+ { MCMEMBER_REC_FIELD(proxy_join),
+ .offset_words = 12,
+ .offset_bits = 8,
+ .size_bits = 1 },
+ { RESERVED,
+ .offset_words = 12,
+ .offset_bits = 9,
+ .size_bits = 23 },
+};
+
+/*
+ * Fill in the structure-side members of an ib_field entry (offset, size
+ * and printable name) for member @field of struct ib_sa_service_rec.
+ */
+#define SERVICE_REC_FIELD(field) \
+ .struct_offset_bytes = offsetof(struct ib_sa_service_rec, field), \
+ .struct_size_bytes = sizeof ((struct ib_sa_service_rec *) 0)->field, \
+ .field_name = "sa_service_rec:" #field
+
+/*
+ * Packed layout of a service record as consumed by ib_pack()/ib_unpack().
+ * The entries wider than 64 bits are byte-copied by those helpers with no
+ * byte-order conversion.
+ */
+static const struct ib_field service_rec_table[] = {
+ { SERVICE_REC_FIELD(id),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 64 },
+ { SERVICE_REC_FIELD(gid),
+ .offset_words = 2,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { SERVICE_REC_FIELD(pkey),
+ .offset_words = 6,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { SERVICE_REC_FIELD(lease),
+ .offset_words = 7,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { SERVICE_REC_FIELD(key),
+ .offset_words = 8,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { SERVICE_REC_FIELD(name),
+ .offset_words = 12,
+ .offset_bits = 0,
+ .size_bits = 64*8 },
+ { SERVICE_REC_FIELD(data8),
+ .offset_words = 28,
+ .offset_bits = 0,
+ .size_bits = 16*8 },
+ { SERVICE_REC_FIELD(data16),
+ .offset_words = 32,
+ .offset_bits = 0,
+ .size_bits = 8*16 },
+ { SERVICE_REC_FIELD(data32),
+ .offset_words = 36,
+ .offset_bits = 0,
+ .size_bits = 4*32 },
+ { SERVICE_REC_FIELD(data64),
+ .offset_words = 40,
+ .offset_bits = 0,
+ .size_bits = 2*64 },
+};
+
+/*
+ * Fill in the structure-side members of an ib_field entry (offset, size
+ * and printable name) for member @field of struct ib_sa_guidinfo_rec.
+ */
+#define GUIDINFO_REC_FIELD(field) \
+ .struct_offset_bytes = offsetof(struct ib_sa_guidinfo_rec, field), \
+ .struct_size_bytes = sizeof((struct ib_sa_guidinfo_rec *) 0)->field, \
+ .field_name = "sa_guidinfo_rec:" #field
+
+/* Packed layout of a GUIDInfo record as consumed by ib_pack()/ib_unpack(). */
+static const struct ib_field guidinfo_rec_table[] = {
+ { GUIDINFO_REC_FIELD(lid),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { GUIDINFO_REC_FIELD(block_num),
+ .offset_words = 0,
+ .offset_bits = 16,
+ .size_bits = 8 },
+ { GUIDINFO_REC_FIELD(res1),
+ .offset_words = 0,
+ .offset_bits = 24,
+ .size_bits = 8 },
+ { GUIDINFO_REC_FIELD(res2),
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { GUIDINFO_REC_FIELD(guid_info_list),
+ .offset_words = 2,
+ .offset_bits = 0,
+ .size_bits = 512 },
+};
+
+/*
+ * kref release function for struct ib_sa_sm_ah: destroys the address
+ * handle and frees the wrapper.
+ */
+static void free_sm_ah(struct kref *kref)
+{
+ struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref);
+
+ ib_destroy_ah(sm_ah->ah);
+ kfree(sm_ah);
+}
+
+/*
+ * Work handler that rebuilds a port's SM address handle.  It queries the
+ * port, builds a new ib_sa_sm_ah (source path mask from the LMC, index of
+ * the default full-member P_Key, AH towards sm_lid/sm_sl) and installs it
+ * in port->sm_ah under ah_lock, dropping the reference on the previous
+ * one.  If the port query, the allocation or the AH creation fails, a
+ * warning is logged and port->sm_ah is left as it was.
+ */
+static void update_sm_ah(struct work_struct *work)
+{
+ struct ib_sa_port *port =
+ container_of(work, struct ib_sa_port, update_task);
+ struct ib_sa_sm_ah *new_ah;
+ struct ib_port_attr port_attr;
+ struct ib_ah_attr ah_attr;
+
+ if (ib_query_port(port->agent->device, port->port_num, &port_attr)) {
+ printk(KERN_WARNING "Couldn't query port\n");
+ return;
+ }
+
+ new_ah = kmalloc(sizeof *new_ah, GFP_KERNEL);
+ if (!new_ah) {
+ printk(KERN_WARNING "Couldn't allocate new SM AH\n");
+ return;
+ }
+
+ kref_init(&new_ah->ref);
+ new_ah->src_path_mask = (1 << port_attr.lmc) - 1;
+
+ /* A failed P_Key lookup is only logged; pkey_index then stays 0. */
+ new_ah->pkey_index = 0;
+ if (ib_find_pkey(port->agent->device, port->port_num,
+ IB_DEFAULT_PKEY_FULL, &new_ah->pkey_index))
+ printk(KERN_ERR "Couldn't find index for default PKey\n");
+
+ memset(&ah_attr, 0, sizeof ah_attr);
+ ah_attr.dlid = port_attr.sm_lid;
+ ah_attr.sl = port_attr.sm_sl;
+ ah_attr.port_num = port->port_num;
+
+ new_ah->ah = ib_create_ah(port->agent->qp->pd, &ah_attr);
+ if (IS_ERR(new_ah->ah)) {
+ printk(KERN_WARNING "Couldn't create new SM AH\n");
+ kfree(new_ah);
+ return;
+ }
+
+ /* Swap in the new AH; queries holding the old one keep their own ref. */
+ spin_lock_irq(&port->ah_lock);
+ if (port->sm_ah)
+ kref_put(&port->sm_ah->ref, free_sm_ah);
+ port->sm_ah = new_ah;
+ spin_unlock_irq(&port->ah_lock);
+
+}
+
+/*
+ * Device event handler.  For the events that can invalidate the cached SM
+ * address handle, and only on InfiniBand link-layer ports, drop the port's
+ * current sm_ah under ah_lock and queue update_task to build a new one.
+ */
+static void ib_sa_event(struct ib_event_handler *handler, struct ib_event *event)
+{
+	struct ib_sa_device *sa_dev;
+	struct ib_sa_port *port;
+	unsigned long flags;
+
+	switch (event->event) {
+	case IB_EVENT_PORT_ERR:
+	case IB_EVENT_PORT_ACTIVE:
+	case IB_EVENT_LID_CHANGE:
+	case IB_EVENT_PKEY_CHANGE:
+	case IB_EVENT_SM_CHANGE:
+	case IB_EVENT_CLIENT_REREGISTER:
+		break;
+	default:
+		return;
+	}
+
+	sa_dev = container_of(handler, struct ib_sa_device, event_handler);
+	port = &sa_dev->port[event->element.port_num - sa_dev->start_port];
+
+	if (rdma_port_get_link_layer(handler->device, port->port_num) != IB_LINK_LAYER_INFINIBAND)
+		return;
+
+	spin_lock_irqsave(&port->ah_lock, flags);
+	if (port->sm_ah)
+		kref_put(&port->sm_ah->ref, free_sm_ah);
+	port->sm_ah = NULL;
+	spin_unlock_irqrestore(&port->ah_lock, flags);
+
+	queue_work(ib_wq, &port->update_task);
+}
+
+/*
+ * Initialise @client for use with the SA query functions: the users count
+ * starts at 1 and the completion is reset.
+ */
+void ib_sa_register_client(struct ib_sa_client *client)
+{
+ atomic_set(&client->users, 1);
+ init_completion(&client->comp);
+}
+EXPORT_SYMBOL(ib_sa_register_client);
+
+/*
+ * Drop the reference taken at registration and block until the client's
+ * completion is signalled, i.e. until every remaining reference has been
+ * put.
+ */
+void ib_sa_unregister_client(struct ib_sa_client *client)
+{
+ ib_sa_client_put(client);
+ wait_for_completion(&client->comp);
+}
+EXPORT_SYMBOL(ib_sa_unregister_client);
+
+/**
+ * ib_sa_cancel_query - try to cancel an SA query
+ * @id:ID of query to cancel
+ * @query:query pointer to cancel
+ *
+ * Try to cancel an SA query. If the id and query don't match up or
+ * the query has already completed, nothing is done. Otherwise the
+ * query is canceled and will complete with a status of -EINTR.
+ */
+void ib_sa_cancel_query(int id, struct ib_sa_query *query)
+{
+ unsigned long flags;
+ struct ib_mad_agent *agent;
+ struct ib_mad_send_buf *mad_buf;
+
+ /*
+ * @query is only dereferenced while idr_lock is held and after the
+ * idr lookup confirmed that @id still maps to it.
+ */
+ spin_lock_irqsave(&idr_lock, flags);
+ if (idr_find(&query_idr, id) != query) {
+ spin_unlock_irqrestore(&idr_lock, flags);
+ return;
+ }
+ agent = query->port->agent;
+ mad_buf = query->mad_buf;
+ spin_unlock_irqrestore(&idr_lock, flags);
+
+ /* The cancel itself is issued outside the lock, on the saved pointers. */
+ ib_cancel_mad(agent, mad_buf);
+}
+EXPORT_SYMBOL(ib_sa_cancel_query);
+
+/*
+ * Return the source path mask recorded in the SM address handle of port
+ * @port_num on @device.  Falls back to 0x7f when the device has no SA
+ * client data or the port currently has no SM address handle.
+ */
+static u8 get_src_path_mask(struct ib_device *device, u8 port_num)
+{
+	struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+	struct ib_sa_port *port;
+	unsigned long flags;
+	u8 mask = 0x7f;
+
+	if (!sa_dev)
+		return mask;
+
+	port = &sa_dev->port[port_num - sa_dev->start_port];
+
+	/* sm_ah may be replaced or cleared concurrently; read it under the lock. */
+	spin_lock_irqsave(&port->ah_lock, flags);
+	if (port->sm_ah)
+		mask = port->sm_ah->src_path_mask;
+	spin_unlock_irqrestore(&port->ah_lock, flags);
+
+	return mask;
+}
+
+/*
+ * Fill @ah_attr from path record @rec for use on @port_num of @device.
+ * A GRH is set up when the path's hop limit is greater than 1 or when the
+ * port's link layer is Ethernet; in the Ethernet case the destination MAC
+ * and VLAN id are copied from @rec as well.  Returns 0 on success or the
+ * error returned by ib_find_cached_gid().
+ */
+int ib_init_ah_from_path(struct ib_device *device, u8 port_num,
+ struct ib_sa_path_rec *rec, struct ib_ah_attr *ah_attr)
+{
+ int ret;
+ u16 gid_index;
+ int force_grh;
+
+ memset(ah_attr, 0, sizeof *ah_attr);
+ ah_attr->dlid = be16_to_cpu(rec->dlid);
+ ah_attr->sl = rec->sl;
+ ah_attr->src_path_bits = be16_to_cpu(rec->slid) &
+ get_src_path_mask(device, port_num);
+ ah_attr->port_num = port_num;
+ ah_attr->static_rate = rec->rate;
+
+ force_grh = rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_ETHERNET;
+
+ if (rec->hop_limit > 1 || force_grh) {
+ ah_attr->ah_flags = IB_AH_GRH;
+ ah_attr->grh.dgid = rec->dgid;
+
+ /*
+ * NOTE(review): port_num is passed by pointer here, so the lookup
+ * may overwrite the local copy; ah_attr->port_num was already set
+ * above - confirm this is intended.
+ */
+ ret = ib_find_cached_gid(device, &rec->sgid, &port_num,
+ &gid_index);
+ if (ret)
+ return ret;
+
+ ah_attr->grh.sgid_index = gid_index;
+ ah_attr->grh.flow_label = be32_to_cpu(rec->flow_label);
+ ah_attr->grh.hop_limit = rec->hop_limit;
+ ah_attr->grh.traffic_class = rec->traffic_class;
+ }
+ if (force_grh) {
+ memcpy(ah_attr->dmac, rec->dmac, ETH_ALEN);
+ ah_attr->vlan_id = rec->vlan_id;
+ } else {
+ ah_attr->vlan_id = 0xffff;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(ib_init_ah_from_path);
+
+/*
+ * Prepare @query for sending: take a reference on the port's current SM
+ * address handle and allocate a send MAD that uses it.  query->port must
+ * already be set.  Returns 0 on success, -EAGAIN if the port has no SM
+ * address handle at the moment, or -ENOMEM if the MAD buffer could not be
+ * created (the AH reference is dropped again in that case).
+ */
+static int alloc_mad(struct ib_sa_query *query, gfp_t gfp_mask)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&query->port->ah_lock, flags);
+ if (!query->port->sm_ah) {
+ spin_unlock_irqrestore(&query->port->ah_lock, flags);
+ return -EAGAIN;
+ }
+ /* Pin the AH so it survives a concurrent replacement of port->sm_ah. */
+ kref_get(&query->port->sm_ah->ref);
+ query->sm_ah = query->port->sm_ah;
+ spin_unlock_irqrestore(&query->port->ah_lock, flags);
+
+ query->mad_buf = ib_create_send_mad(query->port->agent, 1,
+ query->sm_ah->pkey_index,
+ 0, IB_MGMT_SA_HDR, IB_MGMT_SA_DATA,
+ gfp_mask);
+ if (IS_ERR(query->mad_buf)) {
+ kref_put(&query->sm_ah->ref, free_sm_ah);
+ return -ENOMEM;
+ }
+
+ query->mad_buf->ah = query->sm_ah->ah;
+
+ return 0;
+}
+
+/*
+ * Undo alloc_mad(): free the send MAD and drop the query's reference on
+ * the SM address handle.
+ */
+static void free_mad(struct ib_sa_query *query)
+{
+ ib_free_send_mad(query->mad_buf);
+ kref_put(&query->sm_ah->ref, free_sm_ah);
+}
+
+/*
+ * Zero @mad and fill in the common SA MAD header fields.  The transaction
+ * id combines @agent's hi_tid in the upper 32 bits with the file-wide tid
+ * counter, which is post-incremented under tid_lock, in the lower 32 bits.
+ */
+static void init_mad(struct ib_sa_mad *mad, struct ib_mad_agent *agent)
+{
+ unsigned long flags;
+
+ memset(mad, 0, sizeof *mad);
+
+ mad->mad_hdr.base_version = IB_MGMT_BASE_VERSION;
+ mad->mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM;
+ mad->mad_hdr.class_version = IB_SA_CLASS_VERSION;
+
+ spin_lock_irqsave(&tid_lock, flags);
+ mad->mad_hdr.tid =
+ cpu_to_be64(((u64) agent->hi_tid) << 32 | tid++);
+ spin_unlock_irqrestore(&tid_lock, flags);
+}
+
+/*
+ * Register @query in query_idr and post its MAD.  Returns the allocated
+ * query id (>= 0) on success, or a negative error code from idr_alloc()
+ * or ib_post_send_mad().  On a post failure the id is removed from the
+ * idr again; freeing the query is left to the caller.
+ */
+static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask)
+{
+ bool preload = !!(gfp_mask & __GFP_WAIT);
+ unsigned long flags;
+ int ret, id;
+
+ /*
+ * The id is allocated with GFP_NOWAIT while idr_lock is held; when the
+ * caller's gfp_mask allows waiting, idr memory is preloaded first.
+ */
+ if (preload)
+ idr_preload(gfp_mask);
+ spin_lock_irqsave(&idr_lock, flags);
+
+ id = idr_alloc(&query_idr, query, 0, 0, GFP_NOWAIT);
+
+ spin_unlock_irqrestore(&idr_lock, flags);
+ if (preload)
+ idr_preload_end();
+ if (id < 0)
+ return id;
+
+ query->mad_buf->timeout_ms = timeout_ms;
+ query->mad_buf->context[0] = query;
+ query->id = id;
+
+ ret = ib_post_send_mad(query->mad_buf, NULL);
+ if (ret) {
+ spin_lock_irqsave(&idr_lock, flags);
+ idr_remove(&query_idr, id);
+ spin_unlock_irqrestore(&idr_lock, flags);
+ }
+
+ /*
+ * It's not safe to dereference query any more, because the
+ * send may already have completed and freed the query in
+ * another context.
+ */
+ return ret ? ret : id;
+}
+
+/*
+ * Unpack the packed path record at @attribute into @rec using
+ * path_rec_table.  Only the members listed in that table are written.
+ */
+void ib_sa_unpack_path(void *attribute, struct ib_sa_path_rec *rec)
+{
+ ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), attribute, rec);
+}
+EXPORT_SYMBOL(ib_sa_unpack_path);
+
+/* Pack @rec into the packed path record at @attribute using path_rec_table. */
+void ib_sa_pack_path(struct ib_sa_path_rec *rec, void *attribute)
+{
+ ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table), rec, attribute);
+}
+EXPORT_SYMBOL(ib_sa_pack_path);
+
+/*
+ * Generic-query completion hook for path record queries.  With a response
+ * MAD, the record is unpacked into a stack copy, the Ethernet-only members
+ * (vlan_id, dmac, smac) are reset, and the copy is handed to the caller's
+ * callback.  Without a MAD the callback gets a NULL record.
+ */
+static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query,
+				    int status,
+				    struct ib_sa_mad *mad)
+{
+	struct ib_sa_path_query *path_query =
+		container_of(sa_query, struct ib_sa_path_query, sa_query);
+	struct ib_sa_path_rec rec;
+
+	if (!mad) {
+		path_query->callback(status, NULL, path_query->context);
+		return;
+	}
+
+	ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table),
+		  mad->data, &rec);
+	rec.vlan_id = 0xffff;
+	memset(rec.dmac, 0, ETH_ALEN);
+	memset(rec.smac, 0, ETH_ALEN);
+	path_query->callback(status, &rec, path_query->context);
+}
+
+/* Free the ib_sa_path_query that embeds @sa_query. */
+static void ib_sa_path_rec_release(struct ib_sa_query *sa_query)
+{
+	struct ib_sa_path_query *path_query =
+		container_of(sa_query, struct ib_sa_path_query, sa_query);
+
+	kfree(path_query);
+}
+
+/**
+ * ib_sa_path_rec_get - Start a Path get query
+ * @client:SA client
+ * @device:device to send query on
+ * @port_num: port number to send query on
+ * @rec:Path Record to send in query
+ * @comp_mask:component mask to send in query
+ * @timeout_ms:time to wait for response
+ * @gfp_mask:GFP mask to use for internal allocations
+ * @callback:function called when query completes, times out or is
+ * canceled
+ * @context:opaque user context passed to callback
+ * @sa_query:query context, used to cancel query
+ *
+ * Send a Path Record Get query to the SA to look up a path. The
+ * callback function will be called when the query completes (or
+ * fails); status is 0 for a successful response, -EINTR if the query
+ * is canceled, -ETIMEDOUT if the query timed out, or -EIO if an error
+ * occurred sending the query. The resp parameter of the callback is
+ * only valid if status is 0.
+ *
+ * If the return value of ib_sa_path_rec_get() is negative, it is an
+ * error code. Otherwise it is a query ID that can be used to cancel
+ * the query.
+ */
+int ib_sa_path_rec_get(struct ib_sa_client *client,
+ struct ib_device *device, u8 port_num,
+ struct ib_sa_path_rec *rec,
+ ib_sa_comp_mask comp_mask,
+ int timeout_ms, gfp_t gfp_mask,
+ void (*callback)(int status,
+ struct ib_sa_path_rec *resp,
+ void *context),
+ void *context,
+ struct ib_sa_query **sa_query)
+{
+ struct ib_sa_path_query *query;
+ struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+ struct ib_sa_port *port;
+ struct ib_mad_agent *agent;
+ struct ib_sa_mad *mad;
+ int ret;
+
+ if (!sa_dev)
+ return -ENODEV;
+
+ port = &sa_dev->port[port_num - sa_dev->start_port];
+ agent = port->agent;
+
+ query = kmalloc(sizeof *query, gfp_mask);
+ if (!query)
+ return -ENOMEM;
+
+ /* alloc_mad() reads sa_query.port, so it must be set first. */
+ query->sa_query.port = port;
+ ret = alloc_mad(&query->sa_query, gfp_mask);
+ if (ret)
+ goto err1;
+
+ /* The query holds a client reference; the err2 path drops it again. */
+ ib_sa_client_get(client);
+ query->sa_query.client = client;
+ query->callback = callback;
+ query->context = context;
+
+ mad = query->sa_query.mad_buf->mad;
+ init_mad(mad, agent);
+
+ /* Without a user callback no generic response handler is installed. */
+ query->sa_query.callback = callback ? ib_sa_path_rec_callback : NULL;
+ query->sa_query.release = ib_sa_path_rec_release;
+ mad->mad_hdr.method = IB_MGMT_METHOD_GET;
+ mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_PATH_REC);
+ mad->sa_hdr.comp_mask = comp_mask;
+
+ ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table), rec, mad->data);
+
+ /* Publish the handle before posting; it is reset to NULL on failure. */
+ *sa_query = &query->sa_query;
+
+ ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
+ if (ret < 0)
+ goto err2;
+
+ return ret;
+
+err2:
+ *sa_query = NULL;
+ ib_sa_client_put(query->sa_query.client);
+ free_mad(&query->sa_query);
+
+err1:
+ kfree(query);
+ return ret;
+}
+EXPORT_SYMBOL(ib_sa_path_rec_get);
+
+/*
+ * Generic-query completion hook for service record queries.  With a
+ * response MAD, the record is unpacked into a stack copy and handed to the
+ * caller's callback; without one the callback gets a NULL record.
+ */
+static void ib_sa_service_rec_callback(struct ib_sa_query *sa_query,
+				       int status,
+				       struct ib_sa_mad *mad)
+{
+	struct ib_sa_service_query *svc_query =
+		container_of(sa_query, struct ib_sa_service_query, sa_query);
+	struct ib_sa_service_rec rec;
+
+	if (!mad) {
+		svc_query->callback(status, NULL, svc_query->context);
+		return;
+	}
+
+	ib_unpack(service_rec_table, ARRAY_SIZE(service_rec_table),
+		  mad->data, &rec);
+	svc_query->callback(status, &rec, svc_query->context);
+}
+
+/* Free the ib_sa_service_query that embeds @sa_query. */
+static void ib_sa_service_rec_release(struct ib_sa_query *sa_query)
+{
+	struct ib_sa_service_query *svc_query =
+		container_of(sa_query, struct ib_sa_service_query, sa_query);
+
+	kfree(svc_query);
+}
+
+/**
+ * ib_sa_service_rec_query - Start Service Record operation
+ * @client:SA client
+ * @device:device to send request on
+ * @port_num: port number to send request on
+ * @method:SA method - should be get, set, or delete
+ * @rec:Service Record to send in request
+ * @comp_mask:component mask to send in request
+ * @timeout_ms:time to wait for response
+ * @gfp_mask:GFP mask to use for internal allocations
+ * @callback:function called when request completes, times out or is
+ * canceled
+ * @context:opaque user context passed to callback
+ * @sa_query:request context, used to cancel request
+ *
+ * Send a Service Record set/get/delete to the SA to register,
+ * unregister or query a service record.
+ * The callback function will be called when the request completes (or
+ * fails); status is 0 for a successful response, -EINTR if the query
+ * is canceled, -ETIMEDOUT if the query timed out, or -EIO if an error
+ * occurred sending the query. The resp parameter of the callback is
+ * only valid if status is 0.
+ *
+ * If the return value of ib_sa_service_rec_query() is negative, it is an
+ * error code. Otherwise it is a request ID that can be used to cancel
+ * the query.
+ */
+int ib_sa_service_rec_query(struct ib_sa_client *client,
+ struct ib_device *device, u8 port_num, u8 method,
+ struct ib_sa_service_rec *rec,
+ ib_sa_comp_mask comp_mask,
+ int timeout_ms, gfp_t gfp_mask,
+ void (*callback)(int status,
+ struct ib_sa_service_rec *resp,
+ void *context),
+ void *context,
+ struct ib_sa_query **sa_query)
+{
+ struct ib_sa_service_query *query;
+ struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+ struct ib_sa_port *port;
+ struct ib_mad_agent *agent;
+ struct ib_sa_mad *mad;
+ int ret;
+
+ if (!sa_dev)
+ return -ENODEV;
+
+ port = &sa_dev->port[port_num - sa_dev->start_port];
+ agent = port->agent;
+
+ /* Only get, set and delete are accepted for service records. */
+ if (method != IB_MGMT_METHOD_GET &&
+ method != IB_MGMT_METHOD_SET &&
+ method != IB_SA_METHOD_DELETE)
+ return -EINVAL;
+
+ query = kmalloc(sizeof *query, gfp_mask);
+ if (!query)
+ return -ENOMEM;
+
+ /* alloc_mad() reads sa_query.port, so it must be set first. */
+ query->sa_query.port = port;
+ ret = alloc_mad(&query->sa_query, gfp_mask);
+ if (ret)
+ goto err1;
+
+ /* The query holds a client reference; the err2 path drops it again. */
+ ib_sa_client_get(client);
+ query->sa_query.client = client;
+ query->callback = callback;
+ query->context = context;
+
+ mad = query->sa_query.mad_buf->mad;
+ init_mad(mad, agent);
+
+ /* Without a user callback no generic response handler is installed. */
+ query->sa_query.callback = callback ? ib_sa_service_rec_callback : NULL;
+ query->sa_query.release = ib_sa_service_rec_release;
+ mad->mad_hdr.method = method;
+ mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_SERVICE_REC);
+ mad->sa_hdr.comp_mask = comp_mask;
+
+ ib_pack(service_rec_table, ARRAY_SIZE(service_rec_table),
+ rec, mad->data);
+
+ /* Publish the handle before posting; it is reset to NULL on failure. */
+ *sa_query = &query->sa_query;
+
+ ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
+ if (ret < 0)
+ goto err2;
+
+ return ret;
+
+err2:
+ *sa_query = NULL;
+ ib_sa_client_put(query->sa_query.client);
+ free_mad(&query->sa_query);
+
+err1:
+ kfree(query);
+ return ret;
+}
+EXPORT_SYMBOL(ib_sa_service_rec_query);
+
+/*
+ * Completion trampoline for MCMemberRecord queries.  When a response MAD
+ * is supplied, its payload is unpacked into an on-stack record that is
+ * handed to the user's callback; otherwise the callback gets a NULL record.
+ */
+static void ib_sa_mcmember_rec_callback(struct ib_sa_query *sa_query,
+					int status,
+					struct ib_sa_mad *mad)
+{
+	struct ib_sa_mcmember_query *mc_query;
+	struct ib_sa_mcmember_rec unpacked;
+
+	mc_query = container_of(sa_query, struct ib_sa_mcmember_query,
+				sa_query);
+
+	if (!mad) {
+		mc_query->callback(status, NULL, mc_query->context);
+		return;
+	}
+
+	ib_unpack(mcmember_rec_table, ARRAY_SIZE(mcmember_rec_table),
+		  mad->data, &unpacked);
+	mc_query->callback(status, &unpacked, mc_query->context);
+}
+
+/*
+ * Release hook for MCMemberRecord queries: free the ib_sa_mcmember_query
+ * that embeds @sa_query.
+ */
+static void ib_sa_mcmember_rec_release(struct ib_sa_query *sa_query)
+{
+	struct ib_sa_mcmember_query *mc_query;
+
+	mc_query = container_of(sa_query, struct ib_sa_mcmember_query,
+				sa_query);
+	kfree(mc_query);
+}
+
+/**
+ * ib_sa_mcmember_rec_query - Start an MCMemberRecord operation
+ * @client: SA client
+ * @device: device to send request on
+ * @port_num: port number to send request on
+ * @method: SA method placed in the MAD header (not validated here)
+ * @rec: MCMemberRecord to pack into the request
+ * @comp_mask: component mask to send in request
+ * @timeout_ms: passed through to send_mad()
+ * @gfp_mask: GFP mask to use for internal allocations
+ * @callback: completion callback; may be NULL, in which case no
+ *	completion trampoline is installed
+ * @context: opaque user context passed to callback
+ * @sa_query: set to the request context on success, NULL if the send fails
+ *
+ * Returns -ENODEV when @device has no SA client data, -ENOMEM when the
+ * query cannot be allocated, the error from alloc_mad() or send_mad() when
+ * one of those fails, and otherwise the non-negative value returned by
+ * send_mad().
+ */
+int ib_sa_mcmember_rec_query(struct ib_sa_client *client,
+			     struct ib_device *device, u8 port_num,
+			     u8 method,
+			     struct ib_sa_mcmember_rec *rec,
+			     ib_sa_comp_mask comp_mask,
+			     int timeout_ms, gfp_t gfp_mask,
+			     void (*callback)(int status,
+					      struct ib_sa_mcmember_rec *resp,
+					      void *context),
+			     void *context,
+			     struct ib_sa_query **sa_query)
+{
+	struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+	struct ib_sa_mcmember_query *mc_query;
+	struct ib_mad_agent *mad_agent;
+	struct ib_sa_query *base;
+	struct ib_sa_port *sa_port;
+	struct ib_sa_mad *sa_mad;
+	int rc;
+
+	if (!sa_dev)
+		return -ENODEV;
+
+	/* Per-port state is indexed relative to the device's first port. */
+	sa_port = &sa_dev->port[port_num - sa_dev->start_port];
+	mad_agent = sa_port->agent;
+
+	mc_query = kmalloc(sizeof *mc_query, gfp_mask);
+	if (!mc_query)
+		return -ENOMEM;
+	base = &mc_query->sa_query;
+
+	base->port = sa_port;
+	rc = alloc_mad(base, gfp_mask);
+	if (rc)
+		goto free_query;
+
+	ib_sa_client_get(client);
+	base->client = client;
+	mc_query->callback = callback;
+	mc_query->context = context;
+
+	sa_mad = base->mad_buf->mad;
+	init_mad(sa_mad, mad_agent);
+
+	if (callback)
+		base->callback = ib_sa_mcmember_rec_callback;
+	else
+		base->callback = NULL;
+	base->release = ib_sa_mcmember_rec_release;
+
+	sa_mad->mad_hdr.method = method;
+	sa_mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_MC_MEMBER_REC);
+	sa_mad->sa_hdr.comp_mask = comp_mask;
+
+	ib_pack(mcmember_rec_table, ARRAY_SIZE(mcmember_rec_table),
+		rec, sa_mad->data);
+
+	/* Publish the request context before sending. */
+	*sa_query = base;
+
+	rc = send_mad(base, timeout_ms, gfp_mask);
+	if (rc >= 0)
+		return rc;
+
+	/* Send failed: withdraw the context and undo the setup above. */
+	*sa_query = NULL;
+	ib_sa_client_put(base->client);
+	free_mad(base);
+
+free_query:
+	kfree(mc_query);
+	return rc;
+}
+
+/* Support GuidInfoRecord */
+/*
+ * Completion trampoline for GuidInfoRecord queries: unpack the response MAD
+ * (when there is one) and forward status, record and context to the user's
+ * callback.  Without a MAD the record argument is NULL.
+ */
+static void ib_sa_guidinfo_rec_callback(struct ib_sa_query *sa_query,
+					int status,
+					struct ib_sa_mad *mad)
+{
+	struct ib_sa_guidinfo_query *guid_query =
+		container_of(sa_query, struct ib_sa_guidinfo_query, sa_query);
+	struct ib_sa_guidinfo_rec unpacked;
+	struct ib_sa_guidinfo_rec *resp = NULL;
+
+	if (mad) {
+		ib_unpack(guidinfo_rec_table, ARRAY_SIZE(guidinfo_rec_table),
+			  mad->data, &unpacked);
+		resp = &unpacked;
+	}
+
+	guid_query->callback(status, resp, guid_query->context);
+}
+
+/*
+ * Release hook for GuidInfoRecord queries: free the ib_sa_guidinfo_query
+ * that embeds @sa_query.
+ */
+static void ib_sa_guidinfo_rec_release(struct ib_sa_query *sa_query)
+{
+	struct ib_sa_guidinfo_query *guid_query;
+
+	guid_query = container_of(sa_query, struct ib_sa_guidinfo_query,
+				  sa_query);
+	kfree(guid_query);
+}
+
+/**
+ * ib_sa_guid_info_rec_query - Start a GuidInfoRecord operation
+ * @client: SA client
+ * @device: device to send request on
+ * @port_num: port number to send request on
+ * @rec: GuidInfoRecord to pack into the request
+ * @comp_mask: component mask to send in request
+ * @method: SA method; must be get, set or delete
+ * @timeout_ms: passed through to send_mad()
+ * @gfp_mask: GFP mask to use for internal allocations
+ * @callback: completion callback; may be NULL
+ * @context: opaque user context passed to callback
+ * @sa_query: set to the request context on success, NULL if the send fails
+ *
+ * Note that, unlike the other record query helpers in this file, @comp_mask
+ * comes before @method in the parameter list.
+ *
+ * Returns -ENODEV when @device has no SA client data, -EINVAL for an
+ * unsupported @method, -ENOMEM when the query cannot be allocated, the error
+ * from alloc_mad() or send_mad() when one of those fails, and otherwise the
+ * non-negative value returned by send_mad().
+ */
+int ib_sa_guid_info_rec_query(struct ib_sa_client *client,
+ struct ib_device *device, u8 port_num,
+ struct ib_sa_guidinfo_rec *rec,
+ ib_sa_comp_mask comp_mask, u8 method,
+ int timeout_ms, gfp_t gfp_mask,
+ void (*callback)(int status,
+ struct ib_sa_guidinfo_rec *resp,
+ void *context),
+ void *context,
+ struct ib_sa_query **sa_query)
+{
+ struct ib_sa_guidinfo_query *query;
+ struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+ struct ib_sa_port *port;
+ struct ib_mad_agent *agent;
+ struct ib_sa_mad *mad;
+ int ret;
+
+ /* No SA client data was registered for this device. */
+ if (!sa_dev)
+ return -ENODEV;
+
+ if (method != IB_MGMT_METHOD_GET &&
+ method != IB_MGMT_METHOD_SET &&
+ method != IB_SA_METHOD_DELETE) {
+ return -EINVAL;
+ }
+
+ /* Per-port state is indexed relative to the device's first port. */
+ port = &sa_dev->port[port_num - sa_dev->start_port];
+ agent = port->agent;
+
+ query = kmalloc(sizeof *query, gfp_mask);
+ if (!query)
+ return -ENOMEM;
+
+ query->sa_query.port = port;
+ ret = alloc_mad(&query->sa_query, gfp_mask);
+ if (ret)
+ goto err1;
+
+ /* Client reference is dropped again in err2 if the send fails. */
+ ib_sa_client_get(client);
+ query->sa_query.client = client;
+ query->callback = callback;
+ query->context = context;
+
+ mad = query->sa_query.mad_buf->mad;
+ init_mad(mad, agent);
+
+ /* Only install the unpacking trampoline when the caller wants a callback. */
+ query->sa_query.callback = callback ? ib_sa_guidinfo_rec_callback : NULL;
+ query->sa_query.release = ib_sa_guidinfo_rec_release;
+
+ mad->mad_hdr.method = method;
+ mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_GUID_INFO_REC);
+ mad->sa_hdr.comp_mask = comp_mask;
+
+ ib_pack(guidinfo_rec_table, ARRAY_SIZE(guidinfo_rec_table), rec,
+ mad->data);
+
+ /* Publish the request context before sending; reset to NULL on failure. */
+ *sa_query = &query->sa_query;
+
+ ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
+ if (ret < 0)
+ goto err2;
+
+ return ret;
+
+err2:
+ /* Undo everything acquired after alloc_mad() succeeded. */
+ *sa_query = NULL;
+ ib_sa_client_put(query->sa_query.client);
+ free_mad(&query->sa_query);
+
+err1:
+ kfree(query);
+ return ret;
+}
+EXPORT_SYMBOL(ib_sa_guid_info_rec_query);
+
+/*
+ * MAD agent send-completion handler.  The owning query is taken from
+ * context[0] of the send buffer.  For any completion status other than
+ * IB_WC_SUCCESS the query callback (if set) is invoked with an errno and no
+ * MAD; on success nothing is reported here because the response was already
+ * delivered through the receive path.  In every case the query is then
+ * removed from the IDR and torn down.
+ */
+static void send_handler(struct ib_mad_agent *agent,
+			 struct ib_mad_send_wc *mad_send_wc)
+{
+	struct ib_sa_query *query = mad_send_wc->send_buf->context[0];
+	unsigned long flags;
+
+	if (query->callback && mad_send_wc->status != IB_WC_SUCCESS) {
+		int err;
+
+		switch (mad_send_wc->status) {
+		case IB_WC_RESP_TIMEOUT_ERR:
+			err = -ETIMEDOUT;
+			break;
+		case IB_WC_WR_FLUSH_ERR:
+			err = -EINTR;
+			break;
+		default:
+			err = -EIO;
+			break;
+		}
+
+		query->callback(query, err, NULL);
+	}
+
+	spin_lock_irqsave(&idr_lock, flags);
+	idr_remove(&query_idr, query->id);
+	spin_unlock_irqrestore(&idr_lock, flags);
+
+	/* Teardown order: MAD buffer, client reference, then the query itself. */
+	free_mad(query);
+	ib_sa_client_put(query->client);
+	query->release(query);
+}
+
+/*
+ * MAD agent receive handler.  The work completion's wr_id carries the
+ * pointer to the send buffer of the matching request, whose context[0] is
+ * the owning query.  The query callback (if set) receives 0 or -EINVAL
+ * together with the response MAD when the completion succeeded (depending
+ * on whether the MAD header status is zero), and -EIO with no MAD
+ * otherwise.  The receive MAD is always freed before returning.
+ */
+static void recv_handler(struct ib_mad_agent *mad_agent,
+			 struct ib_mad_recv_wc *mad_recv_wc)
+{
+	struct ib_mad_send_buf *send_buf =
+		(void *) (unsigned long) mad_recv_wc->wc->wr_id;
+	struct ib_sa_query *query = send_buf->context[0];
+
+	if (query->callback) {
+		struct ib_sa_mad *resp = NULL;
+		int status = -EIO;
+
+		if (mad_recv_wc->wc->status == IB_WC_SUCCESS) {
+			resp = (struct ib_sa_mad *) mad_recv_wc->recv_buf.mad;
+			status = mad_recv_wc->recv_buf.mad->mad_hdr.status ?
+				 -EINVAL : 0;
+		}
+
+		query->callback(query, status, resp);
+	}
+
+	ib_free_recv_mad(mad_recv_wc);
+}
+
+/*
+ * Client "add" hook for the SA client.
+ *
+ * For devices using the IB transport, allocate one ib_sa_device holding an
+ * ib_sa_port per port (port 0 only for switches, ports 1..phys_port_cnt
+ * otherwise), register a GSI MAD agent on every port whose link layer is
+ * InfiniBand, publish the structure as this client's per-device data,
+ * register the event handler and finally prime the SM address handle of
+ * each InfiniBand port.
+ *
+ * On failure everything is undone and the device is left without SA client
+ * data, so the query helpers return -ENODEV for it.
+ */
+static void ib_sa_add_one(struct ib_device *device)
+{
+	struct ib_sa_device *sa_dev;
+	int s, e, i;
+
+	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+		return;
+
+	if (device->node_type == RDMA_NODE_IB_SWITCH)
+		s = e = 0;
+	else {
+		s = 1;
+		e = device->phys_port_cnt;
+	}
+
+	sa_dev = kzalloc(sizeof *sa_dev +
+			 (e - s + 1) * sizeof (struct ib_sa_port),
+			 GFP_KERNEL);
+	if (!sa_dev)
+		return;
+
+	sa_dev->start_port = s;
+	sa_dev->end_port   = e;
+
+	for (i = 0; i <= e - s; ++i) {
+		spin_lock_init(&sa_dev->port[i].ah_lock);
+		if (rdma_port_get_link_layer(device, i + 1) != IB_LINK_LAYER_INFINIBAND)
+			continue;
+
+		sa_dev->port[i].sm_ah    = NULL;
+		sa_dev->port[i].port_num = i + s;
+
+		sa_dev->port[i].agent =
+			ib_register_mad_agent(device, i + s, IB_QPT_GSI,
+					      NULL, 0, send_handler,
+					      recv_handler, sa_dev, 0);
+		if (IS_ERR(sa_dev->port[i].agent))
+			goto err;
+
+		INIT_WORK(&sa_dev->port[i].update_task, update_sm_ah);
+	}
+
+	ib_set_client_data(device, &sa_client, sa_dev);
+
+	/*
+	 * We register our event handler after everything is set up,
+	 * and then update our cached info after the event handler is
+	 * registered to avoid any problems if a port changes state
+	 * during our initialization.
+	 */
+
+	INIT_IB_EVENT_HANDLER(&sa_dev->event_handler, device, ib_sa_event);
+	if (ib_register_event_handler(&sa_dev->event_handler)) {
+		/*
+		 * sa_dev has already been published as client data and is
+		 * about to be freed: withdraw the pointer first so that
+		 * neither the query helpers nor ib_sa_remove_one() can pick
+		 * up freed memory later.
+		 */
+		ib_set_client_data(device, &sa_client, NULL);
+		goto err;
+	}
+
+	for (i = 0; i <= e - s; ++i)
+		if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND)
+			update_sm_ah(&sa_dev->port[i].update_task);
+
+	return;
+
+err:
+	/*
+	 * i is the index of the port whose agent registration failed, or one
+	 * past the last port when the event handler registration failed;
+	 * unregister the agents of all earlier InfiniBand ports.
+	 */
+	while (--i >= 0)
+		if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND)
+			ib_unregister_mad_agent(sa_dev->port[i].agent);
+
+	kfree(sa_dev);
+}
+
+/*
+ * Client "remove" hook: undo ib_sa_add_one() for @device.  Does nothing if
+ * no SA client data is attached.  The event handler is unregistered and
+ * ib_wq flushed before the per-port MAD agents and SM address handle
+ * references are dropped.
+ */
+static void ib_sa_remove_one(struct ib_device *device)
+{
+	struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+	int idx, last;
+
+	if (!sa_dev)
+		return;
+
+	ib_unregister_event_handler(&sa_dev->event_handler);
+
+	flush_workqueue(ib_wq);
+
+	last = sa_dev->end_port - sa_dev->start_port;
+	for (idx = 0; idx <= last; ++idx) {
+		struct ib_sa_port *sa_port = &sa_dev->port[idx];
+
+		if (rdma_port_get_link_layer(device, idx + 1) != IB_LINK_LAYER_INFINIBAND)
+			continue;
+
+		ib_unregister_mad_agent(sa_port->agent);
+		if (sa_port->sm_ah)
+			kref_put(&sa_port->sm_ah->ref, free_sm_ah);
+	}
+
+	kfree(sa_dev);
+}
+
+/*
+ * Module init: seed the transaction ID with random bytes, register the SA
+ * client and initialise multicast handling.  If multicast initialisation
+ * fails the client registration is rolled back.  Returns 0 or the error
+ * from the failing step.
+ */
+static int __init ib_sa_init(void)
+{
+	int err;
+
+	get_random_bytes(&tid, sizeof tid);
+
+	err = ib_register_client(&sa_client);
+	if (err) {
+		printk(KERN_ERR "Couldn't register ib_sa client\n");
+		return err;
+	}
+
+	err = mcast_init();
+	if (err) {
+		printk(KERN_ERR "Couldn't initialize multicast handling\n");
+		ib_unregister_client(&sa_client);
+		return err;
+	}
+
+	return 0;
+}
+
+/*
+ * Module exit: tear down in the reverse order of ib_sa_init() (multicast
+ * handling first, then the SA client) and finally destroy the query IDR.
+ */
+static void __exit ib_sa_cleanup(void)
+{
+ mcast_cleanup();
+ ib_unregister_client(&sa_client);
+ idr_destroy(&query_idr);
+}
+
+/* Hook the init/cleanup routines into module load and unload. */
+module_init(ib_sa_init);
+module_exit(ib_sa_cleanup);
diff --git a/drivers/infiniband/core/smi.c b/drivers/infiniband/core/smi.c
new file mode 100644
index 000000000..5855e4405
--- /dev/null
+++ b/drivers/infiniband/core/smi.c
@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2004, 2005 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2004, 2005 Infinicon Corporation. All rights reserved.
+ * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004, 2005 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004-2007 Voltaire Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <rdma/ib_smi.h>
+#include "smi.h"
+
+/*
+ * Fixup a directed route SMP for sending
+ *
+ * Validates hop_cnt/hop_ptr against the rules cited inline (C14-6, C14-9,
+ * C14-13) and advances smp->hop_ptr where the matching rule calls for it:
+ * incremented on the outgoing path, decremented on the returning path.
+ *
+ * Return IB_SMI_DISCARD if the SMP should be discarded, IB_SMI_HANDLE
+ * otherwise.
+ */
+enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp,
+ u8 node_type, int port_num)
+{
+ u8 hop_ptr, hop_cnt;
+
+ /* Snapshot the fields; the checks below use these original values. */
+ hop_ptr = smp->hop_ptr;
+ hop_cnt = smp->hop_cnt;
+
+ /* See section 14.2.2.2, Vol 1 IB spec */
+ /* C14-6 -- valid hop_cnt values are from 0 to 63 */
+ if (hop_cnt >= IB_SMP_MAX_PATH_HOPS)
+ return IB_SMI_DISCARD;
+
+ /* Direction bit clear: outgoing SMP, routed via initial_path. */
+ if (!ib_get_smp_direction(smp)) {
+ /* C14-9:1 */
+ if (hop_cnt && hop_ptr == 0) {
+ smp->hop_ptr++;
+ return (smp->initial_path[smp->hop_ptr] ==
+ port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-9:2 */
+ if (hop_ptr && hop_ptr < hop_cnt) {
+ if (node_type != RDMA_NODE_IB_SWITCH)
+ return IB_SMI_DISCARD;
+
+ /* smp->return_path set when received */
+ smp->hop_ptr++;
+ return (smp->initial_path[smp->hop_ptr] ==
+ port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-9:3 -- We're at the end of the DR segment of path */
+ if (hop_ptr == hop_cnt) {
+ /* smp->return_path set when received */
+ smp->hop_ptr++;
+ return (node_type == RDMA_NODE_IB_SWITCH ||
+ smp->dr_dlid == IB_LID_PERMISSIVE ?
+ IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */
+ /* C14-9:5 -- Fail unreasonable hop pointer */
+ return (hop_ptr == hop_cnt + 1 ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+
+ } else {
+ /* Direction bit set: returning SMP, routed via return_path. */
+ /* C14-13:1 */
+ if (hop_cnt && hop_ptr == hop_cnt + 1) {
+ smp->hop_ptr--;
+ return (smp->return_path[smp->hop_ptr] ==
+ port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-13:2 */
+ if (2 <= hop_ptr && hop_ptr <= hop_cnt) {
+ if (node_type != RDMA_NODE_IB_SWITCH)
+ return IB_SMI_DISCARD;
+
+ smp->hop_ptr--;
+ return (smp->return_path[smp->hop_ptr] ==
+ port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-13:3 -- at the end of the DR segment of path */
+ if (hop_ptr == 1) {
+ smp->hop_ptr--;
+ /* C14-13:3 -- SMPs destined for SM shouldn't be here */
+ return (node_type == RDMA_NODE_IB_SWITCH ||
+ smp->dr_slid == IB_LID_PERMISSIVE ?
+ IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-13:4 -- hop_ptr = 0 -> should have gone to SM */
+ if (hop_ptr == 0)
+ return IB_SMI_HANDLE;
+
+ /* C14-13:5 -- Check for unreasonable hop pointer */
+ return IB_SMI_DISCARD;
+ }
+}
+
+/*
+ * Adjust information for a received SMP
+ *
+ * Validates hop_cnt/hop_ptr against the rules cited inline (C14-6, C14-9,
+ * C14-13).  On the outgoing path the receiving port is recorded in
+ * smp->return_path; forwarding ports taken from the path arrays are checked
+ * against @phys_port_cnt.
+ *
+ * Return IB_SMI_DISCARD if the SMP should be dropped, IB_SMI_HANDLE
+ * otherwise.
+ */
+enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, u8 node_type,
+ int port_num, int phys_port_cnt)
+{
+ u8 hop_ptr, hop_cnt;
+
+ /* Snapshot the fields; the checks below use these original values. */
+ hop_ptr = smp->hop_ptr;
+ hop_cnt = smp->hop_cnt;
+
+ /* See section 14.2.2.2, Vol 1 IB spec */
+ /* C14-6 -- valid hop_cnt values are from 0 to 63 */
+ if (hop_cnt >= IB_SMP_MAX_PATH_HOPS)
+ return IB_SMI_DISCARD;
+
+ /* Direction bit clear: outgoing SMP, routed via initial_path. */
+ if (!ib_get_smp_direction(smp)) {
+ /* C14-9:1 -- sender should have incremented hop_ptr */
+ if (hop_cnt && hop_ptr == 0)
+ return IB_SMI_DISCARD;
+
+ /* C14-9:2 -- intermediate hop */
+ if (hop_ptr && hop_ptr < hop_cnt) {
+ if (node_type != RDMA_NODE_IB_SWITCH)
+ return IB_SMI_DISCARD;
+
+ smp->return_path[hop_ptr] = port_num;
+ /* smp->hop_ptr updated when sending */
+ return (smp->initial_path[hop_ptr+1] <= phys_port_cnt ?
+ IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-9:3 -- We're at the end of the DR segment of path */
+ if (hop_ptr == hop_cnt) {
+ if (hop_cnt)
+ smp->return_path[hop_ptr] = port_num;
+ /* smp->hop_ptr updated when sending */
+
+ return (node_type == RDMA_NODE_IB_SWITCH ||
+ smp->dr_dlid == IB_LID_PERMISSIVE ?
+ IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */
+ /* C14-9:5 -- fail unreasonable hop pointer */
+ return (hop_ptr == hop_cnt + 1 ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+
+ } else {
+ /* Direction bit set: returning SMP, routed via return_path. */
+
+ /* C14-13:1 */
+ if (hop_cnt && hop_ptr == hop_cnt + 1) {
+ smp->hop_ptr--;
+ return (smp->return_path[smp->hop_ptr] ==
+ port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-13:2 */
+ if (2 <= hop_ptr && hop_ptr <= hop_cnt) {
+ if (node_type != RDMA_NODE_IB_SWITCH)
+ return IB_SMI_DISCARD;
+
+ /* smp->hop_ptr updated when sending */
+ return (smp->return_path[hop_ptr-1] <= phys_port_cnt ?
+ IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-13:3 -- We're at the end of the DR segment of path */
+ if (hop_ptr == 1) {
+ if (smp->dr_slid == IB_LID_PERMISSIVE) {
+ /* giving SMP to SM - update hop_ptr */
+ smp->hop_ptr--;
+ return IB_SMI_HANDLE;
+ }
+ /* smp->hop_ptr updated when sending */
+ return (node_type == RDMA_NODE_IB_SWITCH ?
+ IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-13:4 -- hop_ptr = 0 -> give to SM */
+ /* C14-13:5 -- Check for unreasonable hop pointer */
+ return (hop_ptr == 0 ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+}
+
+/*
+ * Classify a received directed route SMP by its hop pointer, hop count and
+ * direction: IB_SMI_FORWARD for an intermediate hop, IB_SMI_SEND when it
+ * must go back out through the send queue, IB_SMI_LOCAL in all other cases.
+ */
+enum smi_forward_action smi_check_forward_dr_smp(struct ib_smp *smp)
+{
+	const u8 ptr = smp->hop_ptr;
+	const u8 cnt = smp->hop_cnt;
+
+	if (ib_get_smp_direction(smp)) {
+		/* Returning SMP */
+
+		/* C14-13:2 -- intermediate hop */
+		if (ptr >= 2 && ptr <= cnt)
+			return IB_SMI_FORWARD;
+
+		/* C14-13:3 -- at the end of the DR segment of path */
+		if (ptr == 1 && smp->dr_slid != IB_LID_PERMISSIVE)
+			return IB_SMI_SEND;
+
+		return IB_SMI_LOCAL;
+	}
+
+	/* Outgoing SMP */
+
+	/* C14-9:2 -- intermediate hop */
+	if (ptr && ptr < cnt)
+		return IB_SMI_FORWARD;
+
+	/* C14-9:3 -- at the end of the DR segment of path */
+	if (ptr == cnt) {
+		if (smp->dr_dlid == IB_LID_PERMISSIVE)
+			return IB_SMI_SEND;
+		return IB_SMI_LOCAL;
+	}
+
+	/* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */
+	if (ptr == cnt + 1)
+		return IB_SMI_SEND;
+
+	return IB_SMI_LOCAL;
+}
+
+/*
+ * Look up the port an SMP is to be forwarded through: the initial_path
+ * entry after the current hop for an outgoing SMP, the return_path entry
+ * before the current hop for a returning one.  No range check is done on
+ * smp->hop_ptr here.
+ */
+int smi_get_fwd_port(struct ib_smp *smp)
+{
+	if (ib_get_smp_direction(smp))
+		return smp->return_path[smp->hop_ptr-1];
+
+	return smp->initial_path[smp->hop_ptr+1];
+}
diff --git a/drivers/infiniband/core/smi.h b/drivers/infiniband/core/smi.h
new file mode 100644
index 000000000..aff96bac4
--- /dev/null
+++ b/drivers/infiniband/core/smi.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation. All rights reserved.
+ * Copyright (c) 2004 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004-2007 Voltaire Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef __SMI_H_
+#define __SMI_H_
+
+#include <rdma/ib_smi.h>
+
+/*
+ * Verdict of the directed route SMP checks: IB_SMI_DISCARD (value 0) means
+ * drop the SMP, IB_SMI_HANDLE means keep processing it.
+ */
+enum smi_action {
+ IB_SMI_DISCARD,
+ IB_SMI_HANDLE
+};
+
+/* Disposition of a received directed route SMP, see smi_check_forward_dr_smp(). */
+enum smi_forward_action {
+ IB_SMI_LOCAL, /* SMP should be completed up the stack */
+ IB_SMI_SEND, /* received DR SMP should be forwarded to the send queue */
+ IB_SMI_FORWARD /* SMP should be forwarded (for switches only) */
+};
+
+/* Directed route SMP helpers implemented in smi.c. */
+
+/* Validate/adjust a received DR SMP; IB_SMI_DISCARD means drop it. */
+enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, u8 node_type,
+ int port_num, int phys_port_cnt);
+/* Forwarding port from initial_path (outgoing) or return_path (returning). */
+int smi_get_fwd_port(struct ib_smp *smp);
+/* Decide whether a received DR SMP is completed locally, sent or forwarded. */
+extern enum smi_forward_action smi_check_forward_dr_smp(struct ib_smp *smp);
+/* Fix up a DR SMP for sending; IB_SMI_DISCARD means drop it. */
+extern enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp,
+ u8 node_type, int port_num);
+
+/*
+ * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM
+ * via process_mad, i.e. the device provides process_mad, the SMP is an
+ * outgoing one and its hop pointer equals hop count + 1.  Anything else
+ * yields IB_SMI_DISCARD.
+ */
+static inline enum smi_action smi_check_local_smp(struct ib_smp *smp,
+						  struct ib_device *device)
+{
+	if (!device->process_mad)
+		return IB_SMI_DISCARD;
+
+	if (ib_get_smp_direction(smp))
+		return IB_SMI_DISCARD;
+
+	/* C14-9:3 -- We're at the end of the DR segment of path */
+	/* C14-9:4 -- Hop Pointer = Hop Count + 1 -> give to SMA/SM */
+	if (smp->hop_ptr != smp->hop_cnt + 1)
+		return IB_SMI_DISCARD;
+
+	return IB_SMI_HANDLE;
+}
+
+/*
+ * Return IB_SMI_HANDLE if the returning SMP should be handled by the local
+ * SM via process_mad, i.e. the device provides process_mad, the direction
+ * bit is set and the hop pointer has reached 0.  Anything else yields
+ * IB_SMI_DISCARD.
+ */
+static inline enum smi_action smi_check_local_returning_smp(struct ib_smp *smp,
+						   struct ib_device *device)
+{
+	/* C14-13:3 -- We're at the end of the DR segment of path */
+	/* C14-13:4 -- Hop Pointer == 0 -> give to SM */
+	if (device->process_mad &&
+	    ib_get_smp_direction(smp) &&
+	    smp->hop_ptr == 0)
+		return IB_SMI_HANDLE;
+
+	return IB_SMI_DISCARD;
+}
+
+#endif /* __SMI_H_ */
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
new file mode 100644
index 000000000..cbd0383f6
--- /dev/null
+++ b/drivers/infiniband/core/sysfs.c
@@ -0,0 +1,922 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "core_priv.h"
+
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+
+#include <rdma/ib_mad.h>
+
+/*
+ * Per-port sysfs object.  The show routines in this file recover it from
+ * the embedded kobject and use ibdev/port_num to query the device.
+ */
+struct ib_port {
+ /* embedded kobject; container_of() maps it back to the ib_port */
+ struct kobject kobj;
+ /* device this port belongs to */
+ struct ib_device *ibdev;
+ /* presumably the per-port GID table attributes -- confirm at the setup code */
+ struct attribute_group gid_group;
+ /* presumably the per-port P_Key table attributes -- confirm at the setup code */
+ struct attribute_group pkey_group;
+ /* port number passed to ib_query_port()/ib_query_gid()/ib_query_pkey() */
+ u8 port_num;
+};
+
+/*
+ * sysfs attribute bound to an ib_port.  port_attr_show() dispatches reads
+ * to ->show and returns -EIO when it is NULL; ->store is not referenced by
+ * the sysfs_ops visible here.
+ */
+struct port_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct ib_port *, struct port_attribute *, char *buf);
+ ssize_t (*store)(struct ib_port *, struct port_attribute *,
+ const char *buf, size_t count);
+};
+
+/* Define a port_attribute called port_attr_<name> with explicit mode/show/store. */
+#define PORT_ATTR(_name, _mode, _show, _store) \
+struct port_attribute port_attr_##_name = __ATTR(_name, _mode, _show, _store)
+
+/*
+ * Define a port_attribute called port_attr_<name> through __ATTR_RO(); the
+ * matching show routines in this file are named <name>_show.
+ */
+#define PORT_ATTR_RO(_name) \
+struct port_attribute port_attr_##_name = __ATTR_RO(_name)
+
+/*
+ * port_attribute carrying an integer payload.  show_port_gid() and
+ * show_port_pkey() use index as the table index to query; PORT_PMA_ATTR
+ * packs a counter number, bit width and bit offset into it instead.
+ */
+struct port_table_attribute {
+ struct port_attribute attr;
+ /* presumably backing storage for the attribute name -- confirm at the allocation site */
+ char name[8];
+ int index;
+};
+
+/*
+ * sysfs ->show entry point for port kobjects: map the generic kobject and
+ * attribute back to their ib_port / port_attribute containers and call the
+ * attribute's own show routine.  Attributes without one read as -EIO.
+ */
+static ssize_t port_attr_show(struct kobject *kobj,
+			      struct attribute *attr, char *buf)
+{
+	struct ib_port *port = container_of(kobj, struct ib_port, kobj);
+	struct port_attribute *pattr =
+		container_of(attr, struct port_attribute, attr);
+
+	if (pattr->show)
+		return pattr->show(port, pattr, buf);
+
+	return -EIO;
+}
+
+/* sysfs operations for port attributes; only .show is provided. */
+static const struct sysfs_ops port_sysfs_ops = {
+ .show = port_attr_show
+};
+
+/*
+ * "state" attribute: print the port's logical state as "<number>: <name>",
+ * using "UNKNOWN" for values outside the name table.  Errors from
+ * ib_query_port() are returned unchanged.
+ */
+static ssize_t state_show(struct ib_port *p, struct port_attribute *unused,
+			  char *buf)
+{
+	static const char *state_name[] = {
+		[IB_PORT_NOP]		= "NOP",
+		[IB_PORT_DOWN]		= "DOWN",
+		[IB_PORT_INIT]		= "INIT",
+		[IB_PORT_ARMED]		= "ARMED",
+		[IB_PORT_ACTIVE]	= "ACTIVE",
+		[IB_PORT_ACTIVE_DEFER]	= "ACTIVE_DEFER"
+	};
+	struct ib_port_attr port_attr;
+	const char *name = "UNKNOWN";
+	ssize_t err;
+
+	err = ib_query_port(p->ibdev, p->port_num, &port_attr);
+	if (err)
+		return err;
+
+	if (port_attr.state >= 0 && port_attr.state < ARRAY_SIZE(state_name))
+		name = state_name[port_attr.state];
+
+	return sprintf(buf, "%d: %s\n", port_attr.state, name);
+}
+
+static ssize_t lid_show(struct ib_port *p, struct port_attribute *unused,
+ char *buf)
+{
+ struct ib_port_attr attr;
+ ssize_t ret;
+
+ ret = ib_query_port(p->ibdev, p->port_num, &attr);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, "0x%x\n", attr.lid);
+}
+
+/* "lid_mask_count" attribute: print the port's LMC in decimal. */
+static ssize_t lid_mask_count_show(struct ib_port *p,
+				   struct port_attribute *unused,
+				   char *buf)
+{
+	struct ib_port_attr port_attr;
+	ssize_t len;
+
+	len = ib_query_port(p->ibdev, p->port_num, &port_attr);
+	if (!len)
+		len = sprintf(buf, "%d\n", port_attr.lmc);
+
+	return len;
+}
+
+/* "sm_lid" attribute: print the sm_lid field of the port attributes in hex. */
+static ssize_t sm_lid_show(struct ib_port *p, struct port_attribute *unused,
+			   char *buf)
+{
+	struct ib_port_attr port_attr;
+	ssize_t len;
+
+	len = ib_query_port(p->ibdev, p->port_num, &port_attr);
+	if (!len)
+		len = sprintf(buf, "0x%x\n", port_attr.sm_lid);
+
+	return len;
+}
+
+/* "sm_sl" attribute: print the sm_sl field of the port attributes in decimal. */
+static ssize_t sm_sl_show(struct ib_port *p, struct port_attribute *unused,
+			  char *buf)
+{
+	struct ib_port_attr port_attr;
+	ssize_t len;
+
+	len = ib_query_port(p->ibdev, p->port_num, &port_attr);
+	if (!len)
+		len = sprintf(buf, "%d\n", port_attr.sm_sl);
+
+	return len;
+}
+
+/* "cap_mask" attribute: print port_cap_flags as a zero-padded 8-digit hex value. */
+static ssize_t cap_mask_show(struct ib_port *p, struct port_attribute *unused,
+			     char *buf)
+{
+	struct ib_port_attr port_attr;
+	ssize_t len;
+
+	len = ib_query_port(p->ibdev, p->port_num, &port_attr);
+	if (!len)
+		len = sprintf(buf, "0x%08x\n", port_attr.port_cap_flags);
+
+	return len;
+}
+
+/*
+ * "rate" attribute: print the link rate as "<Gb/sec> (<width>X<speed>)".
+ * The per-lane rate is kept in tenths of Gb/sec and multiplied by the lane
+ * count; unrecognised speeds are treated as SDR.  Returns -EINVAL when the
+ * product is negative, and passes ib_query_port() errors through.
+ */
+static ssize_t rate_show(struct ib_port *p, struct port_attribute *unused,
+			 char *buf)
+{
+	struct ib_port_attr port_attr;
+	const char *speed_tag = "";
+	int deci_gbps = 25;	/* in deci-Gb/sec; SDR unless overridden below */
+	int lanes;
+	ssize_t err;
+
+	err = ib_query_port(p->ibdev, p->port_num, &port_attr);
+	if (err)
+		return err;
+
+	switch (port_attr.active_speed) {
+	case IB_SPEED_DDR:
+		speed_tag = " DDR";
+		deci_gbps = 50;
+		break;
+	case IB_SPEED_QDR:
+		speed_tag = " QDR";
+		deci_gbps = 100;
+		break;
+	case IB_SPEED_FDR10:
+		speed_tag = " FDR10";
+		deci_gbps = 100;
+		break;
+	case IB_SPEED_FDR:
+		speed_tag = " FDR";
+		deci_gbps = 140;
+		break;
+	case IB_SPEED_EDR:
+		speed_tag = " EDR";
+		deci_gbps = 250;
+		break;
+	case IB_SPEED_SDR:
+	default:		/* default to SDR for invalid rates */
+		break;
+	}
+
+	lanes = ib_width_enum_to_int(port_attr.active_width);
+	deci_gbps *= lanes;
+	if (deci_gbps < 0)
+		return -EINVAL;
+
+	return sprintf(buf, "%d%s Gb/sec (%dX%s)\n",
+		       deci_gbps / 10, deci_gbps % 10 ? ".5" : "",
+		       lanes, speed_tag);
+}
+
+/*
+ * "phys_state" attribute: print the port's physical state as
+ * "<number>: <name>"; values other than 1..7 are shown as "<unknown>".
+ */
+static ssize_t phys_state_show(struct ib_port *p, struct port_attribute *unused,
+			       char *buf)
+{
+	struct ib_port_attr port_attr;
+	ssize_t len;
+
+	len = ib_query_port(p->ibdev, p->port_num, &port_attr);
+	if (len)
+		return len;
+
+	switch (port_attr.phys_state) {
+	case 1:
+		len = sprintf(buf, "1: Sleep\n");
+		break;
+	case 2:
+		len = sprintf(buf, "2: Polling\n");
+		break;
+	case 3:
+		len = sprintf(buf, "3: Disabled\n");
+		break;
+	case 4:
+		len = sprintf(buf, "4: PortConfigurationTraining\n");
+		break;
+	case 5:
+		len = sprintf(buf, "5: LinkUp\n");
+		break;
+	case 6:
+		len = sprintf(buf, "6: LinkErrorRecovery\n");
+		break;
+	case 7:
+		len = sprintf(buf, "7: Phy Test\n");
+		break;
+	default:
+		len = sprintf(buf, "%d: <unknown>\n", port_attr.phys_state);
+		break;
+	}
+
+	return len;
+}
+
+/*
+ * "link_layer" attribute: print "InfiniBand", "Ethernet" or "Unknown"
+ * according to rdma_port_get_link_layer() for this port.
+ */
+static ssize_t link_layer_show(struct ib_port *p, struct port_attribute *unused,
+			       char *buf)
+{
+	const char *layer_name;
+
+	switch (rdma_port_get_link_layer(p->ibdev, p->port_num)) {
+	case IB_LINK_LAYER_INFINIBAND:
+		layer_name = "InfiniBand";
+		break;
+	case IB_LINK_LAYER_ETHERNET:
+		layer_name = "Ethernet";
+		break;
+	default:
+		layer_name = "Unknown";
+		break;
+	}
+
+	return sprintf(buf, "%s\n", layer_name);
+}
+
+/* Read-only port attributes, one per <name>_show routine above. */
+static PORT_ATTR_RO(state);
+static PORT_ATTR_RO(lid);
+static PORT_ATTR_RO(lid_mask_count);
+static PORT_ATTR_RO(sm_lid);
+static PORT_ATTR_RO(sm_sl);
+static PORT_ATTR_RO(cap_mask);
+static PORT_ATTR_RO(rate);
+static PORT_ATTR_RO(phys_state);
+static PORT_ATTR_RO(link_layer);
+
+/* NULL-terminated list of the attributes defined above. */
+static struct attribute *port_default_attrs[] = {
+ &port_attr_state.attr,
+ &port_attr_lid.attr,
+ &port_attr_lid_mask_count.attr,
+ &port_attr_sm_lid.attr,
+ &port_attr_sm_sl.attr,
+ &port_attr_cap_mask.attr,
+ &port_attr_rate.attr,
+ &port_attr_phys_state.attr,
+ &port_attr_link_layer.attr,
+ NULL
+};
+
+/*
+ * Show routine for a GID table entry: query the GID at the attribute's
+ * index and print its raw bytes with the %pI6 format.  ib_query_gid()
+ * errors are returned unchanged.
+ */
+static ssize_t show_port_gid(struct ib_port *p, struct port_attribute *attr,
+			     char *buf)
+{
+	struct port_table_attribute *entry =
+		container_of(attr, struct port_table_attribute, attr);
+	union ib_gid gid;
+	ssize_t len;
+
+	len = ib_query_gid(p->ibdev, p->port_num, entry->index, &gid);
+	if (!len)
+		len = sprintf(buf, "%pI6\n", gid.raw);
+
+	return len;
+}
+
+/*
+ * Show routine for a P_Key table entry: query the P_Key at the attribute's
+ * index and print it as a zero-padded 4-digit hex value.  ib_query_pkey()
+ * errors are returned unchanged.
+ */
+static ssize_t show_port_pkey(struct ib_port *p, struct port_attribute *attr,
+			      char *buf)
+{
+	struct port_table_attribute *entry =
+		container_of(attr, struct port_table_attribute, attr);
+	u16 pkey;
+	ssize_t len;
+
+	len = ib_query_pkey(p->ibdev, p->port_num, entry->index, &pkey);
+	if (!len)
+		len = sprintf(buf, "0x%04x\n", pkey);
+
+	return len;
+}
+
+/*
+ * Define a read-only (S_IRUGO) PMA counter attribute port_pma_attr_<name>
+ * served by show_pma_counter().  The index packs the bit offset into bits
+ * 0-15, the bit width into bits 16-23 and the counter number into bits 24
+ * and up; show_pma_counter() decodes the offset and width.
+ */
+#define PORT_PMA_ATTR(_name, _counter, _width, _offset) \
+struct port_table_attribute port_pma_attr_##_name = { \
+ .attr = __ATTR(_name, S_IRUGO, show_pma_counter, NULL), \
+ .index = (_offset) | ((_width) << 16) | ((_counter) << 24) \
+}
+
+/*
+ * Show routine for the PMA counter attributes.
+ *
+ * Builds a performance management Get(PortCounters) MAD for this port,
+ * passes it to the device's process_mad hook and prints the field selected
+ * by the bit offset / bit width packed into the attribute's index.
+ *
+ * Prints "N/A (no PMA)" when the device has no process_mad hook.  Returns
+ * -ENOMEM if a MAD buffer cannot be allocated, -EINVAL unless process_mad
+ * reports both SUCCESS and REPLY, and 0 (no output) for an unsupported
+ * width.
+ */
+static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr,
+				char *buf)
+{
+	struct port_table_attribute *tab_attr =
+		container_of(attr, struct port_table_attribute, attr);
+	const int bit_offset = tab_attr->index & 0xffff;
+	const int bit_width = (tab_attr->index >> 16) & 0xff;
+	const int wanted = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+	struct ib_mad *request;
+	struct ib_mad *reply;
+	u8 *field;
+	ssize_t len;
+	int mad_ret;
+
+	if (!p->ibdev->process_mad)
+		return sprintf(buf, "N/A (no PMA)\n");
+
+	request = kzalloc(sizeof *request, GFP_KERNEL);
+	reply = kmalloc(sizeof *reply, GFP_KERNEL);
+	if (!request || !reply) {
+		len = -ENOMEM;
+		goto free_mads;
+	}
+
+	request->mad_hdr.base_version  = 1;
+	request->mad_hdr.mgmt_class    = IB_MGMT_CLASS_PERF_MGMT;
+	request->mad_hdr.class_version = 1;
+	request->mad_hdr.method        = IB_MGMT_METHOD_GET;
+	request->mad_hdr.attr_id       = cpu_to_be16(0x12); /* PortCounters */
+
+	request->data[41] = p->port_num;	/* PortSelect field */
+
+	mad_ret = p->ibdev->process_mad(p->ibdev, IB_MAD_IGNORE_MKEY,
+					p->port_num, NULL, NULL,
+					request, reply);
+	if ((mad_ret & wanted) != wanted) {
+		len = -EINVAL;
+		goto free_mads;
+	}
+
+	/* First byte of the requested field inside the reply payload. */
+	field = reply->data + 40 + bit_offset / 8;
+
+	switch (bit_width) {
+	case 4:
+		len = sprintf(buf, "%u\n",
+			      (*field >> (4 - (bit_offset % 8))) & 0xf);
+		break;
+	case 8:
+		len = sprintf(buf, "%u\n", *field);
+		break;
+	case 16:
+		len = sprintf(buf, "%u\n", be16_to_cpup((__be16 *)field));
+		break;
+	case 32:
+		len = sprintf(buf, "%u\n", be32_to_cpup((__be32 *)field));
+		break;
+	default:
+		len = 0;
+		break;
+	}
+
+free_mads:
+	kfree(request);
+	kfree(reply);
+
+	return len;
+}
+
+static PORT_PMA_ATTR(symbol_error , 0, 16, 32); /* arguments: sysfs name, counter number, width in bits, bit offset */
+static PORT_PMA_ATTR(link_error_recovery , 1, 8, 48);
+static PORT_PMA_ATTR(link_downed , 2, 8, 56);
+static PORT_PMA_ATTR(port_rcv_errors , 3, 16, 64);
+static PORT_PMA_ATTR(port_rcv_remote_physical_errors, 4, 16, 80);
+static PORT_PMA_ATTR(port_rcv_switch_relay_errors , 5, 16, 96);
+static PORT_PMA_ATTR(port_xmit_discards , 6, 16, 112);
+static PORT_PMA_ATTR(port_xmit_constraint_errors , 7, 8, 128);
+static PORT_PMA_ATTR(port_rcv_constraint_errors , 8, 8, 136);
+static PORT_PMA_ATTR(local_link_integrity_errors , 9, 4, 152); /* high nibble of its byte */
+static PORT_PMA_ATTR(excessive_buffer_overrun_errors, 10, 4, 156); /* low nibble of the same byte */
+static PORT_PMA_ATTR(VL15_dropped , 11, 16, 176);
+static PORT_PMA_ATTR(port_xmit_data , 12, 32, 192);
+static PORT_PMA_ATTR(port_rcv_data , 13, 32, 224);
+static PORT_PMA_ATTR(port_xmit_packets , 14, 32, 256);
+static PORT_PMA_ATTR(port_rcv_packets , 15, 32, 288);
+
+static struct attribute *pma_attrs[] = { /* members of pma_group, one per PORT_PMA_ATTR above */
+ &port_pma_attr_symbol_error.attr.attr,
+ &port_pma_attr_link_error_recovery.attr.attr,
+ &port_pma_attr_link_downed.attr.attr,
+ &port_pma_attr_port_rcv_errors.attr.attr,
+ &port_pma_attr_port_rcv_remote_physical_errors.attr.attr,
+ &port_pma_attr_port_rcv_switch_relay_errors.attr.attr,
+ &port_pma_attr_port_xmit_discards.attr.attr,
+ &port_pma_attr_port_xmit_constraint_errors.attr.attr,
+ &port_pma_attr_port_rcv_constraint_errors.attr.attr,
+ &port_pma_attr_local_link_integrity_errors.attr.attr,
+ &port_pma_attr_excessive_buffer_overrun_errors.attr.attr,
+ &port_pma_attr_VL15_dropped.attr.attr,
+ &port_pma_attr_port_xmit_data.attr.attr,
+ &port_pma_attr_port_rcv_data.attr.attr,
+ &port_pma_attr_port_xmit_packets.attr.attr,
+ &port_pma_attr_port_rcv_packets.attr.attr,
+ NULL /* list terminator */
+};
+
+static struct attribute_group pma_group = { /* created on each port kobject by add_port() */
+ .name = "counters",
+ .attrs = pma_attrs
+};
+
+/* Free a NULL-terminated attribute table and every entry it points to. */
+static void ib_port_free_attr_table(struct attribute **attrs)
+{
+	struct attribute **cur;
+
+	if (!attrs)
+		return;
+
+	for (cur = attrs; *cur; ++cur)
+		kfree(*cur);
+
+	kfree(attrs);
+}
+
+/*
+ * kobject release callback for a port: frees the dynamically built "gids"
+ * and "pkeys" attribute tables (when present) and then the port itself.
+ */
+static void ib_port_release(struct kobject *kobj)
+{
+	struct ib_port *port = container_of(kobj, struct ib_port, kobj);
+
+	ib_port_free_attr_table(port->gid_group.attrs);
+	ib_port_free_attr_table(port->pkey_group.attrs);
+
+	kfree(port);
+}
+
+static struct kobj_type port_type = { /* kobject type passed to kobject_init_and_add() in add_port() */
+ .release = ib_port_release,
+ .sysfs_ops = &port_sysfs_ops,
+ .default_attrs = port_default_attrs
+};
+
+/* Class release callback: frees the ib_device that embeds @device. */
+static void ib_device_release(struct device *device)
+{
+	kfree(container_of(device, struct ib_device, dev));
+}
+
+/*
+ * Class uevent callback: adds NAME=<device name> to the event environment.
+ * Returns 0 on success or -ENOMEM if the variable could not be added.
+ */
+static int ib_device_uevent(struct device *device,
+			    struct kobj_uevent_env *env)
+{
+	struct ib_device *ibdev = container_of(device, struct ib_device, dev);
+
+	/*
+	 * It would be nice to pass the node GUID with the event...
+	 */
+	return add_uevent_var(env, "NAME=%s", ibdev->name) ? -ENOMEM : 0;
+}
+
+/*
+ * Build a NULL-terminated table of @len read-only attributes named "0",
+ * "1", ... "<len - 1>", each using @show and carrying its position as its
+ * index.
+ *
+ * Returns the table, or NULL on allocation failure (nothing is leaked).
+ */
+static struct attribute **
+alloc_group_attrs(ssize_t (*show)(struct ib_port *,
+				  struct port_attribute *, char *buf),
+		  int len)
+{
+	struct attribute **table;
+	struct port_table_attribute *entry;
+	int n;
+
+	/* The extra zeroed slot is the list terminator. */
+	table = kcalloc(1 + len, sizeof(struct attribute *), GFP_KERNEL);
+	if (!table)
+		return NULL;
+
+	for (n = 0; n < len; n++) {
+		entry = kzalloc(sizeof(struct port_table_attribute),
+				GFP_KERNEL);
+		if (!entry)
+			goto unwind;
+
+		/* Refuse names that would not fit rather than truncate. */
+		if (snprintf(entry->name, sizeof(entry->name),
+			     "%d", n) >= sizeof(entry->name)) {
+			kfree(entry);
+			goto unwind;
+		}
+
+		entry->attr.attr.name = entry->name;
+		entry->attr.attr.mode = S_IRUGO;
+		entry->attr.show      = show;
+		entry->index          = n;
+		sysfs_attr_init(&entry->attr.attr);
+
+		table[n] = &entry->attr.attr;
+	}
+
+	return table;
+
+unwind:
+	/* Entries [0, n) were fully set up; release them in reverse. */
+	for (n--; n >= 0; n--)
+		kfree(table[n]);
+	kfree(table);
+	return NULL;
+}
+
+/*
+ * Create the sysfs directory for port @port_num of @device under
+ * device->ports_parent, populated with the "counters", "gids" and "pkeys"
+ * groups, then give the driver a chance to add its own entries through
+ * @port_callback (may be NULL).
+ *
+ * On success the port is linked on device->port_list and a KOBJ_ADD uevent
+ * is sent.  Returns 0 or a negative errno; on failure everything created
+ * here is torn down again.
+ */
+static int add_port(struct ib_device *device, int port_num,
+		    int (*port_callback)(struct ib_device *,
+					 u8, struct kobject *))
+{
+	struct ib_port *p;
+	struct ib_port_attr attr;
+	int i;
+	int ret;
+
+	ret = ib_query_port(device, port_num, &attr);
+	if (ret)
+		return ret;
+
+	p = kzalloc(sizeof *p, GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+
+	p->ibdev = device;
+	p->port_num = port_num;
+
+	ret = kobject_init_and_add(&p->kobj, &port_type,
+				   device->ports_parent,
+				   "%d", port_num);
+	/*
+	 * Even when kobject_init_and_add() fails the kobject is initialised
+	 * and may own an allocated name, so it must be dropped with
+	 * kobject_put() rather than kfree()d.  ib_port_release() then frees
+	 * p; both attribute tables are still NULL at this point.
+	 */
+	if (ret)
+		goto err_put;
+
+	ret = sysfs_create_group(&p->kobj, &pma_group);
+	if (ret)
+		goto err_put;
+
+	p->gid_group.name = "gids";
+	p->gid_group.attrs = alloc_group_attrs(show_port_gid, attr.gid_tbl_len);
+	if (!p->gid_group.attrs) {
+		ret = -ENOMEM;
+		goto err_remove_pma;
+	}
+
+	ret = sysfs_create_group(&p->kobj, &p->gid_group);
+	if (ret)
+		goto err_free_gid;
+
+	p->pkey_group.name = "pkeys";
+	p->pkey_group.attrs = alloc_group_attrs(show_port_pkey,
+						attr.pkey_tbl_len);
+	if (!p->pkey_group.attrs) {
+		ret = -ENOMEM;
+		goto err_remove_gid;
+	}
+
+	ret = sysfs_create_group(&p->kobj, &p->pkey_group);
+	if (ret)
+		goto err_free_pkey;
+
+	if (port_callback) {
+		ret = port_callback(device, port_num, &p->kobj);
+		if (ret)
+			goto err_remove_pkey;
+	}
+
+	list_add_tail(&p->kobj.entry, &device->port_list);
+
+	kobject_uevent(&p->kobj, KOBJ_ADD);
+	return 0;
+
+err_remove_pkey:
+	sysfs_remove_group(&p->kobj, &p->pkey_group);
+
+err_free_pkey:
+	for (i = 0; i < attr.pkey_tbl_len; ++i)
+		kfree(p->pkey_group.attrs[i]);
+
+	kfree(p->pkey_group.attrs);
+	/* Cleared so ib_port_release() does not free the table again. */
+	p->pkey_group.attrs = NULL;
+
+err_remove_gid:
+	sysfs_remove_group(&p->kobj, &p->gid_group);
+
+err_free_gid:
+	for (i = 0; i < attr.gid_tbl_len; ++i)
+		kfree(p->gid_group.attrs[i]);
+
+	kfree(p->gid_group.attrs);
+	/* Cleared so ib_port_release() does not free the table again. */
+	p->gid_group.attrs = NULL;
+
+err_remove_pma:
+	sysfs_remove_group(&p->kobj, &pma_group);
+
+err_put:
+	kobject_put(&p->kobj);
+	return ret;
+}
+
+/*
+ * sysfs show handler for "node_type": prints the numeric node type
+ * followed by a short label, or "<unknown>" for unrecognised values.
+ */
+static ssize_t show_node_type(struct device *device,
+			      struct device_attribute *attr, char *buf)
+{
+	struct ib_device *ibdev = container_of(device, struct ib_device, dev);
+	ssize_t len;
+
+	switch (ibdev->node_type) {
+	case RDMA_NODE_IB_CA:
+		len = sprintf(buf, "%d: CA\n", ibdev->node_type);
+		break;
+	case RDMA_NODE_RNIC:
+		len = sprintf(buf, "%d: RNIC\n", ibdev->node_type);
+		break;
+	case RDMA_NODE_USNIC:
+		len = sprintf(buf, "%d: usNIC\n", ibdev->node_type);
+		break;
+	case RDMA_NODE_USNIC_UDP:
+		len = sprintf(buf, "%d: usNIC UDP\n", ibdev->node_type);
+		break;
+	case RDMA_NODE_IB_SWITCH:
+		len = sprintf(buf, "%d: switch\n", ibdev->node_type);
+		break;
+	case RDMA_NODE_IB_ROUTER:
+		len = sprintf(buf, "%d: router\n", ibdev->node_type);
+		break;
+	default:
+		len = sprintf(buf, "%d: <unknown>\n", ibdev->node_type);
+		break;
+	}
+
+	return len;
+}
+
+/*
+ * sysfs show handler for "sys_image_guid": queries the device and prints
+ * the GUID as four colon-separated 16-bit hex groups.  Returns the error
+ * from ib_query_device() if the query fails.
+ */
+static ssize_t show_sys_image_guid(struct device *device,
+				   struct device_attribute *dev_attr, char *buf)
+{
+	struct ib_device *ibdev = container_of(device, struct ib_device, dev);
+	struct ib_device_attr attr;
+	__be16 *word;
+	ssize_t ret;
+
+	ret = ib_query_device(ibdev, &attr);
+	if (ret)
+		return ret;
+
+	/* View the GUID as four big-endian 16-bit words. */
+	word = (__be16 *) &attr.sys_image_guid;
+
+	return sprintf(buf, "%04x:%04x:%04x:%04x\n",
+		       be16_to_cpu(word[0]), be16_to_cpu(word[1]),
+		       be16_to_cpu(word[2]), be16_to_cpu(word[3]));
+}
+
+/*
+ * sysfs show handler for "node_guid": prints the device's node GUID as
+ * four colon-separated 16-bit hex groups.
+ */
+static ssize_t show_node_guid(struct device *device,
+			      struct device_attribute *attr, char *buf)
+{
+	struct ib_device *ibdev = container_of(device, struct ib_device, dev);
+	/* View the GUID as four big-endian 16-bit words. */
+	__be16 *word = (__be16 *) &ibdev->node_guid;
+
+	return sprintf(buf, "%04x:%04x:%04x:%04x\n",
+		       be16_to_cpu(word[0]), be16_to_cpu(word[1]),
+		       be16_to_cpu(word[2]), be16_to_cpu(word[3]));
+}
+
+/*
+ * sysfs show handler for "node_desc": prints at most 64 characters of the
+ * device's node description followed by a newline.
+ */
+static ssize_t show_node_desc(struct device *device,
+			      struct device_attribute *attr, char *buf)
+{
+	struct ib_device *ibdev = container_of(device, struct ib_device, dev);
+	ssize_t len = sprintf(buf, "%.64s\n", ibdev->node_desc);
+
+	return len;
+}
+
+/*
+ * sysfs store handler for "node_desc": copies up to 64 bytes of @buf into
+ * a zeroed modify request and applies it to the device.
+ *
+ * Returns @count on success, -EIO if the device has no modify_device hook,
+ * or the error from ib_modify_device().
+ */
+static ssize_t set_node_desc(struct device *device,
+			     struct device_attribute *attr,
+			     const char *buf, size_t count)
+{
+	struct ib_device *ibdev = container_of(device, struct ib_device, dev);
+	struct ib_device_modify mod = {};
+	int copy_len;
+	int ret;
+
+	if (!ibdev->modify_device)
+		return -EIO;
+
+	copy_len = min_t(int, count, 64);
+	memcpy(mod.node_desc, buf, copy_len);
+
+	ret = ib_modify_device(ibdev, IB_DEVICE_MODIFY_NODE_DESC, &mod);
+
+	return ret ? ret : count;
+}
+
+static DEVICE_ATTR(node_type, S_IRUGO, show_node_type, NULL);
+static DEVICE_ATTR(sys_image_guid, S_IRUGO, show_sys_image_guid, NULL);
+static DEVICE_ATTR(node_guid, S_IRUGO, show_node_guid, NULL);
+static DEVICE_ATTR(node_desc, S_IRUGO | S_IWUSR, show_node_desc, set_node_desc); /* the only writable attribute here */
+
+static struct device_attribute *ib_class_attributes[] = { /* created and removed one by one in the register/unregister functions */
+ &dev_attr_node_type,
+ &dev_attr_sys_image_guid,
+ &dev_attr_node_guid,
+ &dev_attr_node_desc
+};
+
+static struct class ib_class = { /* registered by ib_sysfs_setup(), unregistered by ib_sysfs_cleanup() */
+ .name = "infiniband",
+ .dev_release = ib_device_release,
+ .dev_uevent = ib_device_uevent,
+};
+
+/*
+ * Show a given attribute in the statistics group.
+ *
+ * @offset is the index of the wanted counter when the stats union is viewed
+ * as an array of u64.  Returns the number of bytes written to @buf, or the
+ * error reported by the device's get_protocol_stats() hook.
+ */
+static ssize_t show_protocol_stat(const struct device *device,
+				  struct device_attribute *attr, char *buf,
+				  unsigned offset)
+{
+	struct ib_device *ibdev = container_of(device, struct ib_device, dev);
+	union rdma_protocol_stats stats;
+	u64 *slot = (u64 *) &stats;
+	ssize_t ret;
+
+	ret = ibdev->get_protocol_stats(ibdev, &stats);
+	if (ret)
+		return ret;
+
+	return sprintf(buf, "%llu\n", (unsigned long long) slot[offset]);
+}
+
+/*
+ * Generate a read-only iWARP statistics attribute: defines show_<name>(),
+ * which passes the u64 slot index of <name> within struct iw_protocol_stats
+ * to show_protocol_stat(), together with the matching dev_attr_<name>.
+ */
+#define IW_STATS_ENTRY(name) \
+static ssize_t show_##name(struct device *device, \
+ struct device_attribute *attr, char *buf) \
+{ \
+ return show_protocol_stat(device, attr, buf, \
+ offsetof(struct iw_protocol_stats, name) / \
+ sizeof (u64)); \
+} \
+static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
+
+IW_STATS_ENTRY(ipInReceives); /* each entry names a member of struct iw_protocol_stats */
+IW_STATS_ENTRY(ipInHdrErrors);
+IW_STATS_ENTRY(ipInTooBigErrors);
+IW_STATS_ENTRY(ipInNoRoutes);
+IW_STATS_ENTRY(ipInAddrErrors);
+IW_STATS_ENTRY(ipInUnknownProtos);
+IW_STATS_ENTRY(ipInTruncatedPkts);
+IW_STATS_ENTRY(ipInDiscards);
+IW_STATS_ENTRY(ipInDelivers);
+IW_STATS_ENTRY(ipOutForwDatagrams);
+IW_STATS_ENTRY(ipOutRequests);
+IW_STATS_ENTRY(ipOutDiscards);
+IW_STATS_ENTRY(ipOutNoRoutes);
+IW_STATS_ENTRY(ipReasmTimeout);
+IW_STATS_ENTRY(ipReasmReqds);
+IW_STATS_ENTRY(ipReasmOKs);
+IW_STATS_ENTRY(ipReasmFails);
+IW_STATS_ENTRY(ipFragOKs);
+IW_STATS_ENTRY(ipFragFails);
+IW_STATS_ENTRY(ipFragCreates);
+IW_STATS_ENTRY(ipInMcastPkts);
+IW_STATS_ENTRY(ipOutMcastPkts);
+IW_STATS_ENTRY(ipInBcastPkts);
+IW_STATS_ENTRY(ipOutBcastPkts);
+IW_STATS_ENTRY(tcpRtoAlgorithm);
+IW_STATS_ENTRY(tcpRtoMin);
+IW_STATS_ENTRY(tcpRtoMax);
+IW_STATS_ENTRY(tcpMaxConn);
+IW_STATS_ENTRY(tcpActiveOpens);
+IW_STATS_ENTRY(tcpPassiveOpens);
+IW_STATS_ENTRY(tcpAttemptFails);
+IW_STATS_ENTRY(tcpEstabResets);
+IW_STATS_ENTRY(tcpCurrEstab);
+IW_STATS_ENTRY(tcpInSegs);
+IW_STATS_ENTRY(tcpOutSegs);
+IW_STATS_ENTRY(tcpRetransSegs);
+IW_STATS_ENTRY(tcpInErrs);
+IW_STATS_ENTRY(tcpOutRsts);
+
+static struct attribute *iw_proto_stats_attrs[] = { /* members of iw_stats_group, one per IW_STATS_ENTRY */
+ &dev_attr_ipInReceives.attr,
+ &dev_attr_ipInHdrErrors.attr,
+ &dev_attr_ipInTooBigErrors.attr,
+ &dev_attr_ipInNoRoutes.attr,
+ &dev_attr_ipInAddrErrors.attr,
+ &dev_attr_ipInUnknownProtos.attr,
+ &dev_attr_ipInTruncatedPkts.attr,
+ &dev_attr_ipInDiscards.attr,
+ &dev_attr_ipInDelivers.attr,
+ &dev_attr_ipOutForwDatagrams.attr,
+ &dev_attr_ipOutRequests.attr,
+ &dev_attr_ipOutDiscards.attr,
+ &dev_attr_ipOutNoRoutes.attr,
+ &dev_attr_ipReasmTimeout.attr,
+ &dev_attr_ipReasmReqds.attr,
+ &dev_attr_ipReasmOKs.attr,
+ &dev_attr_ipReasmFails.attr,
+ &dev_attr_ipFragOKs.attr,
+ &dev_attr_ipFragFails.attr,
+ &dev_attr_ipFragCreates.attr,
+ &dev_attr_ipInMcastPkts.attr,
+ &dev_attr_ipOutMcastPkts.attr,
+ &dev_attr_ipInBcastPkts.attr,
+ &dev_attr_ipOutBcastPkts.attr,
+ &dev_attr_tcpRtoAlgorithm.attr,
+ &dev_attr_tcpRtoMin.attr,
+ &dev_attr_tcpRtoMax.attr,
+ &dev_attr_tcpMaxConn.attr,
+ &dev_attr_tcpActiveOpens.attr,
+ &dev_attr_tcpPassiveOpens.attr,
+ &dev_attr_tcpAttemptFails.attr,
+ &dev_attr_tcpEstabResets.attr,
+ &dev_attr_tcpCurrEstab.attr,
+ &dev_attr_tcpInSegs.attr,
+ &dev_attr_tcpOutSegs.attr,
+ &dev_attr_tcpRetransSegs.attr,
+ &dev_attr_tcpInErrs.attr,
+ &dev_attr_tcpOutRsts.attr,
+ NULL /* list terminator */
+};
+
+static struct attribute_group iw_stats_group = { /* only created for RNIC devices that provide get_protocol_stats */
+ .name = "proto_stats",
+ .attrs = iw_proto_stats_attrs,
+};
+
+/*
+ * Tear down every port on device->port_list: unlink it, remove its three
+ * attribute groups and drop the port kobject, then drop the "ports" parent
+ * kobject as well.
+ */
+static void free_port_list_attributes(struct ib_device *device)
+{
+	struct kobject *cur, *next;
+	struct ib_port *port;
+
+	list_for_each_entry_safe(cur, next, &device->port_list, entry) {
+		port = container_of(cur, struct ib_port, kobj);
+
+		list_del(&cur->entry);
+		sysfs_remove_group(cur, &pma_group);
+		sysfs_remove_group(cur, &port->pkey_group);
+		sysfs_remove_group(cur, &port->gid_group);
+		kobject_put(cur);
+	}
+
+	kobject_put(device->ports_parent);
+}
+
+/*
+ * Register @device with the "infiniband" class and build its sysfs tree:
+ * the class attributes, a "ports" directory with one entry per port (only
+ * port 0 for a switch, otherwise ports 1..phys_port_cnt) and, for RNICs
+ * that implement get_protocol_stats, the "proto_stats" group.
+ *
+ * @port_callback is forwarded to add_port() for every port.
+ * Returns 0 on success or a negative errno after undoing partial setup.
+ */
+int ib_device_register_sysfs(struct ib_device *device,
+			     int (*port_callback)(struct ib_device *,
+						  u8, struct kobject *))
+{
+	struct device *class_dev = &device->dev;
+	int first_port;
+	int last_port;
+	int ret;
+	int i;
+
+	class_dev->class = &ib_class;
+	class_dev->parent = device->dma_device;
+	dev_set_name(class_dev, "%s", device->name);
+	dev_set_drvdata(class_dev, device);
+
+	INIT_LIST_HEAD(&device->port_list);
+
+	ret = device_register(class_dev);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) {
+		ret = device_create_file(class_dev, ib_class_attributes[i]);
+		if (ret)
+			goto err_unregister;
+	}
+
+	device->ports_parent = kobject_create_and_add("ports",
+						      &class_dev->kobj);
+	if (!device->ports_parent) {
+		ret = -ENOMEM;
+		goto err_put;
+	}
+
+	/* A switch exposes only port 0; everything else numbers from 1. */
+	if (device->node_type == RDMA_NODE_IB_SWITCH) {
+		first_port = 0;
+		last_port = 0;
+	} else {
+		first_port = 1;
+		last_port = device->phys_port_cnt;
+	}
+
+	for (i = first_port; i <= last_port; ++i) {
+		ret = add_port(device, i, port_callback);
+		if (ret)
+			goto err_put;
+	}
+
+	if (device->node_type == RDMA_NODE_RNIC && device->get_protocol_stats) {
+		ret = sysfs_create_group(&class_dev->kobj, &iw_stats_group);
+		if (ret)
+			goto err_put;
+	}
+
+	return 0;
+
+err_put:
+	free_port_list_attributes(device);
+
+err_unregister:
+	device_unregister(class_dev);
+
+	return ret;
+}
+
+/*
+ * Undo ib_device_register_sysfs(): remove the iWARP stats group (when it
+ * was created), all port entries and the class attributes, then unregister
+ * the class device.  An extra reference on the device kobject is taken
+ * first and deliberately not dropped here.
+ */
+void ib_device_unregister_sysfs(struct ib_device *device)
+{
+	struct device *class_dev = &device->dev;
+	/* Hold kobject until ib_dealloc_device() */
+	struct kobject *held = kobject_get(&class_dev->kobj);
+	int i;
+
+	if (device->node_type == RDMA_NODE_RNIC && device->get_protocol_stats)
+		sysfs_remove_group(held, &iw_stats_group);
+
+	free_port_list_attributes(device);
+
+	for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i)
+		device_remove_file(class_dev, ib_class_attributes[i]);
+
+	device_unregister(class_dev);
+}
+
+int ib_sysfs_setup(void) /* registers ib_class; returns class_register()'s result */
+{
+ return class_register(&ib_class);
+}
+
+void ib_sysfs_cleanup(void) /* counterpart of ib_sysfs_setup() */
+{
+ class_unregister(&ib_class);
+}
diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c
new file mode 100644
index 000000000..f2f63933e
--- /dev/null
+++ b/drivers/infiniband/core/ucm.c
@@ -0,0 +1,1370 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/completion.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/poll.h>
+#include <linux/sched.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/cdev.h>
+#include <linux/idr.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+
+#include <asm/uaccess.h>
+
+#include <rdma/ib_cm.h>
+#include <rdma/ib_user_cm.h>
+#include <rdma/ib_marshall.h>
+
+MODULE_AUTHOR("Libor Michalek");
+MODULE_DESCRIPTION("InfiniBand userspace Connection Manager access");
+MODULE_LICENSE("Dual BSD/GPL");
+
+struct ib_ucm_device {
+ int devnum; /* presumably this device's slot in dev_map - TODO confirm */
+ struct cdev cdev;
+ struct device dev;
+ struct ib_device *ib_dev; /* device handed to ib_create_cm_id() for ids created on this file */
+};
+
+struct ib_ucm_file {
+ struct mutex file_mutex; /* held while the ctxs and events lists are modified */
+ struct file *filp; /* f_flags is checked for O_NONBLOCK when waiting for events */
+ struct ib_ucm_device *device;
+
+ struct list_head ctxs; /* contexts allocated for this file */
+ struct list_head events; /* queued events, consumed in FIFO order */
+ wait_queue_head_t poll_wait; /* woken whenever an event is queued */
+};
+
+struct ib_ucm_context {
+ int id; /* handle allocated from ctx_id_table */
+ struct completion comp; /* completed when ref drops to zero */
+ atomic_t ref; /* starts at 1; raised by ib_ucm_ctx_get() */
+ int events_reported; /* events delivered to user space; returned on destroy */
+
+ struct ib_ucm_file *file; /* owner; lookups from other files are rejected */
+ struct ib_cm_id *cm_id;
+ __u64 uid; /* user-supplied value echoed back in event responses */
+
+ struct list_head events; /* list of pending events. */
+ struct list_head file_list; /* member in file ctx list */
+};
+
+struct ib_ucm_event {
+ struct ib_ucm_context *ctx; /* context the event was raised on */
+ struct list_head file_list; /* member in file event list */
+ struct list_head ctx_list; /* member in ctx event list */
+
+ struct ib_cm_id *cm_id; /* cm_id the handler was called with */
+ struct ib_ucm_event_resp resp; /* response copied to user space */
+ void *data; /* kmemdup() copy of the private data, or NULL */
+ void *info; /* kmemdup() copy of the additional info, or NULL */
+ int data_len; /* size of data in bytes */
+ int info_len; /* size of info in bytes */
+};
+
+enum {
+ IB_UCM_MAJOR = 231,
+ IB_UCM_BASE_MINOR = 224,
+ IB_UCM_MAX_DEVICES = 32 /* number of bits in dev_map */
+};
+
+#define IB_UCM_BASE_DEV MKDEV(IB_UCM_MAJOR, IB_UCM_BASE_MINOR) /* device number built from the major and base minor above */
+
+static void ib_ucm_add_one(struct ib_device *device);
+static void ib_ucm_remove_one(struct ib_device *device);
+
+static struct ib_client ucm_client = { /* add/remove callbacks are the two functions declared above */
+ .name = "ucm",
+ .add = ib_ucm_add_one,
+ .remove = ib_ucm_remove_one
+};
+
+static DEFINE_MUTEX(ctx_id_mutex); /* taken around every ctx_id_table access in this file */
+static DEFINE_IDR(ctx_id_table); /* maps context ids to struct ib_ucm_context */
+static DECLARE_BITMAP(dev_map, IB_UCM_MAX_DEVICES);
+
+/*
+ * Look up context @id and take a reference on it.
+ *
+ * Returns the context, ERR_PTR(-ENOENT) if the id is unknown, or
+ * ERR_PTR(-EINVAL) if the context belongs to a different file.  A
+ * successful lookup must be balanced with ib_ucm_ctx_put().
+ */
+static struct ib_ucm_context *ib_ucm_ctx_get(struct ib_ucm_file *file, int id)
+{
+	struct ib_ucm_context *found;
+
+	mutex_lock(&ctx_id_mutex);
+	found = idr_find(&ctx_id_table, id);
+	if (found && found->file == file)
+		atomic_inc(&found->ref);
+	else
+		found = ERR_PTR(found ? -EINVAL : -ENOENT);
+	mutex_unlock(&ctx_id_mutex);
+
+	return found;
+}
+
+/*
+ * Drop one reference on @ctx; the final put signals ctx->comp so that a
+ * waiter in the destroy path can proceed.
+ */
+static void ib_ucm_ctx_put(struct ib_ucm_context *ctx)
+{
+	if (!atomic_dec_and_test(&ctx->ref))
+		return;
+
+	complete(&ctx->comp);
+}
+
+/*
+ * Returns non-zero for the two event types that are delivered with a
+ * freshly created cm_id (REQ and SIDR REQ), zero otherwise.
+ */
+static inline int ib_ucm_new_cm_id(int event)
+{
+	switch (event) {
+	case IB_CM_REQ_RECEIVED:
+	case IB_CM_SIDR_REQ_RECEIVED:
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+/*
+ * Unlink @ctx from its file and discard every event still queued on it.
+ *
+ * Events that carried a new cm_id (REQ / SIDR REQ) have that cm_id
+ * destroyed.  The file mutex is dropped around the destroy and the frees
+ * and re-taken before the list is examined again.
+ */
+static void ib_ucm_cleanup_events(struct ib_ucm_context *ctx)
+{
+	struct ib_ucm_event *uevent;
+
+	mutex_lock(&ctx->file->file_mutex);
+	list_del(&ctx->file_list);
+	while (!list_empty(&ctx->events)) {
+
+		uevent = list_entry(ctx->events.next,
+				    struct ib_ucm_event, ctx_list);
+		list_del(&uevent->file_list);
+		list_del(&uevent->ctx_list);
+		mutex_unlock(&ctx->file->file_mutex);
+
+		/* clear incoming connections. */
+		if (ib_ucm_new_cm_id(uevent->resp.event))
+			ib_destroy_cm_id(uevent->cm_id);
+
+		/*
+		 * data and info are separate kmemdup() copies made when the
+		 * event was queued; free them with the event so they do not
+		 * leak.  Both are NULL when unused, which kfree() accepts.
+		 */
+		kfree(uevent->data);
+		kfree(uevent->info);
+		kfree(uevent);
+		mutex_lock(&ctx->file->file_mutex);
+	}
+	mutex_unlock(&ctx->file->file_mutex);
+}
+
+/*
+ * Allocate a context for @file, give it an id from ctx_id_table and link
+ * it on file->ctxs.  The new context starts with one reference.
+ *
+ * Returns the context, or NULL if memory or an id could not be obtained.
+ * Both callers in this file hold file->file_mutex across the call.
+ */
+static struct ib_ucm_context *ib_ucm_ctx_alloc(struct ib_ucm_file *file)
+{
+	struct ib_ucm_context *ctx = kzalloc(sizeof *ctx, GFP_KERNEL);
+
+	if (!ctx)
+		return NULL;
+
+	ctx->file = file;
+	atomic_set(&ctx->ref, 1);
+	init_completion(&ctx->comp);
+	INIT_LIST_HEAD(&ctx->events);
+
+	mutex_lock(&ctx_id_mutex);
+	ctx->id = idr_alloc(&ctx_id_table, ctx, 0, 0, GFP_KERNEL);
+	mutex_unlock(&ctx_id_mutex);
+
+	if (ctx->id < 0) {
+		kfree(ctx);
+		return NULL;
+	}
+
+	list_add_tail(&ctx->file_list, &file->ctxs);
+	return ctx;
+}
+
+/*
+ * Translate the kernel REQ event parameters in @kreq into the user-space
+ * response layout @ureq.  The alternate path is only written when the
+ * request carries one.
+ */
+static void ib_ucm_event_req_get(struct ib_ucm_req_event_resp *ureq,
+				 struct ib_cm_req_event_param *kreq)
+{
+	/* Remote endpoint identification. */
+	ureq->remote_ca_guid = kreq->remote_ca_guid;
+	ureq->remote_qkey    = kreq->remote_qkey;
+	ureq->remote_qpn     = kreq->remote_qpn;
+	ureq->qp_type        = kreq->qp_type;
+	ureq->starting_psn   = kreq->starting_psn;
+	ureq->srq            = kreq->srq;
+	ureq->port           = kreq->port;
+
+	/* Connection resources and flow control. */
+	ureq->responder_resources = kreq->responder_resources;
+	ureq->initiator_depth     = kreq->initiator_depth;
+	ureq->flow_control        = kreq->flow_control;
+
+	/* Timeouts and retry limits. */
+	ureq->local_cm_response_timeout  = kreq->local_cm_response_timeout;
+	ureq->remote_cm_response_timeout = kreq->remote_cm_response_timeout;
+	ureq->retry_count                = kreq->retry_count;
+	ureq->rnr_retry_count            = kreq->rnr_retry_count;
+
+	ib_copy_path_rec_to_user(&ureq->primary_path, kreq->primary_path);
+	if (!kreq->alternate_path)
+		return;
+
+	ib_copy_path_rec_to_user(&ureq->alternate_path, kreq->alternate_path);
+}
+
+/*
+ * Translate the kernel REP event parameters in @krep into the user-space
+ * response layout @urep.
+ */
+static void ib_ucm_event_rep_get(struct ib_ucm_rep_event_resp *urep,
+				 struct ib_cm_rep_event_param *krep)
+{
+	/* Remote endpoint identification. */
+	urep->remote_ca_guid = krep->remote_ca_guid;
+	urep->remote_qkey    = krep->remote_qkey;
+	urep->remote_qpn     = krep->remote_qpn;
+	urep->starting_psn   = krep->starting_psn;
+	urep->srq            = krep->srq;
+
+	/* Negotiated connection attributes. */
+	urep->responder_resources = krep->responder_resources;
+	urep->initiator_depth     = krep->initiator_depth;
+	urep->target_ack_delay    = krep->target_ack_delay;
+	urep->failover_accepted   = krep->failover_accepted;
+	urep->flow_control        = krep->flow_control;
+	urep->rnr_retry_count     = krep->rnr_retry_count;
+}
+
+/*
+ * Copy status, qkey and qpn of a SIDR REP event from @krep into the
+ * user-space response layout @urep.
+ */
+static void ib_ucm_event_sidr_rep_get(struct ib_ucm_sidr_rep_event_resp *urep,
+				      struct ib_cm_sidr_rep_event_param *krep)
+{
+	urep->status = krep->status;
+	urep->qkey = krep->qkey;
+	urep->qpn = krep->qpn;
+}
+
+/*
+ * Fill the user-visible part of @uvt from CM event @evt.
+ *
+ * The event-specific response fields are set first, together with the
+ * sizes of the private data and of any additional info.  Both payloads are
+ * then duplicated into uvt->data / uvt->info and the matching "present"
+ * bits are set.
+ *
+ * Returns 0 on success or -ENOMEM if a payload copy cannot be allocated;
+ * in that case a data copy that was already made is freed again.
+ */
+static int ib_ucm_event_process(struct ib_cm_event *evt,
+				struct ib_ucm_event *uvt)
+{
+	void *extra = NULL;
+
+	switch (evt->event) {
+	case IB_CM_REQ_RECEIVED:
+		ib_ucm_event_req_get(&uvt->resp.u.req_resp,
+				     &evt->param.req_rcvd);
+		uvt->data_len = IB_CM_REQ_PRIVATE_DATA_SIZE;
+		uvt->resp.present = IB_UCM_PRES_PRIMARY;
+		uvt->resp.present |= (evt->param.req_rcvd.alternate_path ?
+				      IB_UCM_PRES_ALTERNATE : 0);
+		break;
+	case IB_CM_REP_RECEIVED:
+		ib_ucm_event_rep_get(&uvt->resp.u.rep_resp,
+				     &evt->param.rep_rcvd);
+		uvt->data_len = IB_CM_REP_PRIVATE_DATA_SIZE;
+		break;
+	case IB_CM_RTU_RECEIVED:
+		uvt->data_len = IB_CM_RTU_PRIVATE_DATA_SIZE;
+		uvt->resp.u.send_status = evt->param.send_status;
+		break;
+	case IB_CM_DREQ_RECEIVED:
+		uvt->data_len = IB_CM_DREQ_PRIVATE_DATA_SIZE;
+		uvt->resp.u.send_status = evt->param.send_status;
+		break;
+	case IB_CM_DREP_RECEIVED:
+		uvt->data_len = IB_CM_DREP_PRIVATE_DATA_SIZE;
+		uvt->resp.u.send_status = evt->param.send_status;
+		break;
+	case IB_CM_MRA_RECEIVED:
+		uvt->resp.u.mra_resp.timeout =
+			evt->param.mra_rcvd.service_timeout;
+		uvt->data_len = IB_CM_MRA_PRIVATE_DATA_SIZE;
+		break;
+	case IB_CM_REJ_RECEIVED:
+		uvt->resp.u.rej_resp.reason = evt->param.rej_rcvd.reason;
+		uvt->data_len = IB_CM_REJ_PRIVATE_DATA_SIZE;
+		uvt->info_len = evt->param.rej_rcvd.ari_length;
+		extra = evt->param.rej_rcvd.ari;
+		break;
+	case IB_CM_LAP_RECEIVED:
+		ib_copy_path_rec_to_user(&uvt->resp.u.lap_resp.path,
+					 evt->param.lap_rcvd.alternate_path);
+		uvt->data_len = IB_CM_LAP_PRIVATE_DATA_SIZE;
+		uvt->resp.present = IB_UCM_PRES_ALTERNATE;
+		break;
+	case IB_CM_APR_RECEIVED:
+		uvt->resp.u.apr_resp.status = evt->param.apr_rcvd.ap_status;
+		uvt->data_len = IB_CM_APR_PRIVATE_DATA_SIZE;
+		uvt->info_len = evt->param.apr_rcvd.info_len;
+		extra = evt->param.apr_rcvd.apr_info;
+		break;
+	case IB_CM_SIDR_REQ_RECEIVED:
+		uvt->resp.u.sidr_req_resp.pkey =
+			evt->param.sidr_req_rcvd.pkey;
+		uvt->resp.u.sidr_req_resp.port =
+			evt->param.sidr_req_rcvd.port;
+		uvt->data_len = IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE;
+		break;
+	case IB_CM_SIDR_REP_RECEIVED:
+		ib_ucm_event_sidr_rep_get(&uvt->resp.u.sidr_rep_resp,
+					  &evt->param.sidr_rep_rcvd);
+		uvt->data_len = IB_CM_SIDR_REP_PRIVATE_DATA_SIZE;
+		uvt->info_len = evt->param.sidr_rep_rcvd.info_len;
+		extra = evt->param.sidr_rep_rcvd.info;
+		break;
+	default:
+		/* Remaining events only report a send status, no payload. */
+		uvt->resp.u.send_status = evt->param.send_status;
+		break;
+	}
+
+	if (uvt->data_len) {
+		uvt->data = kmemdup(evt->private_data, uvt->data_len,
+				    GFP_KERNEL);
+		if (!uvt->data)
+			return -ENOMEM;
+
+		uvt->resp.present |= IB_UCM_PRES_DATA;
+	}
+
+	if (uvt->info_len) {
+		uvt->info = kmemdup(extra, uvt->info_len, GFP_KERNEL);
+		if (!uvt->info) {
+			kfree(uvt->data);
+			return -ENOMEM;
+		}
+
+		uvt->resp.present |= IB_UCM_PRES_INFO;
+	}
+
+	return 0;
+}
+
+/*
+ * CM callback: wrap @event in an ib_ucm_event, queue it on both the file's
+ * and the context's event lists and wake any reader.
+ *
+ * Returns 0 when the event was queued.  If the event cannot be allocated
+ * or filled in, the return value is non-zero only for events that carry a
+ * new cm_id (see ib_ucm_new_cm_id()), so that such cm_ids get destroyed.
+ */
+static int ib_ucm_event_handler(struct ib_cm_id *cm_id,
+				struct ib_cm_event *event)
+{
+	struct ib_ucm_context *ctx = cm_id->context;
+	struct ib_ucm_event *uevent;
+
+	uevent = kzalloc(sizeof *uevent, GFP_KERNEL);
+	if (!uevent)
+		goto drop;
+
+	uevent->ctx        = ctx;
+	uevent->cm_id      = cm_id;
+	uevent->resp.uid   = ctx->uid;
+	uevent->resp.id    = ctx->id;
+	uevent->resp.event = event->event;
+
+	if (ib_ucm_event_process(event, uevent)) {
+		kfree(uevent);
+		goto drop;
+	}
+
+	mutex_lock(&ctx->file->file_mutex);
+	list_add_tail(&uevent->file_list, &ctx->file->events);
+	list_add_tail(&uevent->ctx_list, &ctx->events);
+	wake_up_interruptible(&ctx->file->poll_wait);
+	mutex_unlock(&ctx->file->file_mutex);
+	return 0;
+
+drop:
+	/* Destroy new cm_id's */
+	return ib_ucm_new_cm_id(event->event);
+}
+
+static ssize_t ib_ucm_event(struct ib_ucm_file *file, /* deliver the oldest queued event of @file to user space */
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct ib_ucm_context *ctx;
+ struct ib_ucm_event_get cmd;
+ struct ib_ucm_event *uevent;
+ int result = 0;
+
+ if (out_len < sizeof(struct ib_ucm_event_resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ mutex_lock(&file->file_mutex);
+ while (list_empty(&file->events)) { /* the mutex is dropped while sleeping and the list re-checked under it */
+ mutex_unlock(&file->file_mutex);
+
+ if (file->filp->f_flags & O_NONBLOCK)
+ return -EAGAIN;
+
+ if (wait_event_interruptible(file->poll_wait,
+ !list_empty(&file->events)))
+ return -ERESTARTSYS;
+
+ mutex_lock(&file->file_mutex);
+ }
+
+ uevent = list_entry(file->events.next, struct ib_ucm_event, file_list); /* oldest event; it stays queued until fully copied out */
+
+ if (ib_ucm_new_cm_id(uevent->resp.event)) { /* NOTE(review): if a later copy fails, this ctx stays allocated and a retry allocates another one - confirm intended */
+ ctx = ib_ucm_ctx_alloc(file);
+ if (!ctx) {
+ result = -ENOMEM;
+ goto done;
+ }
+
+ ctx->cm_id = uevent->cm_id;
+ ctx->cm_id->context = ctx;
+ uevent->resp.id = ctx->id; /* report the id of the new context rather than the listener's */
+ }
+
+ if (copy_to_user((void __user *)(unsigned long)cmd.response,
+ &uevent->resp, sizeof(uevent->resp))) {
+ result = -EFAULT;
+ goto done;
+ }
+
+ if (uevent->data) {
+ if (cmd.data_len < uevent->data_len) { /* user buffer too small for the private data */
+ result = -ENOMEM;
+ goto done;
+ }
+ if (copy_to_user((void __user *)(unsigned long)cmd.data,
+ uevent->data, uevent->data_len)) {
+ result = -EFAULT;
+ goto done;
+ }
+ }
+
+ if (uevent->info) {
+ if (cmd.info_len < uevent->info_len) { /* user buffer too small for the additional info */
+ result = -ENOMEM;
+ goto done;
+ }
+ if (copy_to_user((void __user *)(unsigned long)cmd.info,
+ uevent->info, uevent->info_len)) {
+ result = -EFAULT;
+ goto done;
+ }
+ }
+
+ list_del(&uevent->file_list); /* everything was delivered: dequeue and free the event */
+ list_del(&uevent->ctx_list);
+ uevent->ctx->events_reported++;
+
+ kfree(uevent->data);
+ kfree(uevent->info);
+ kfree(uevent);
+done:
+ mutex_unlock(&file->file_mutex);
+ return result;
+}
+
+/*
+ * Handle the "create id" command: allocate a context for @file, create a
+ * cm_id bound to it and return the context id to user space.
+ *
+ * Returns 0 on success, -ENOSPC if the output buffer is too small, -EFAULT
+ * on a failed user copy, -ENOMEM if no context could be allocated, or the
+ * error from ib_create_cm_id().  On failure the context is fully undone:
+ * removed from the id table, unlinked from the file and freed.
+ */
+static ssize_t ib_ucm_create_id(struct ib_ucm_file *file,
+				const char __user *inbuf,
+				int in_len, int out_len)
+{
+	struct ib_ucm_create_id cmd;
+	struct ib_ucm_create_id_resp resp;
+	struct ib_ucm_context *ctx;
+	int result;
+
+	if (out_len < sizeof(resp))
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	mutex_lock(&file->file_mutex);
+	ctx = ib_ucm_ctx_alloc(file);
+	mutex_unlock(&file->file_mutex);
+	if (!ctx)
+		return -ENOMEM;
+
+	ctx->uid = cmd.uid;
+	ctx->cm_id = ib_create_cm_id(file->device->ib_dev,
+				     ib_ucm_event_handler, ctx);
+	if (IS_ERR(ctx->cm_id)) {
+		result = PTR_ERR(ctx->cm_id);
+		goto err1;
+	}
+
+	resp.id = ctx->id;
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &resp, sizeof(resp))) {
+		result = -EFAULT;
+		goto err2;
+	}
+	return 0;
+
+err2:
+	ib_destroy_cm_id(ctx->cm_id);
+err1:
+	mutex_lock(&ctx_id_mutex);
+	idr_remove(&ctx_id_table, ctx->id);
+	mutex_unlock(&ctx_id_mutex);
+
+	/*
+	 * ib_ucm_ctx_alloc() linked ctx on file->ctxs.  Unlink it before
+	 * freeing, otherwise the file keeps a dangling list entry.
+	 */
+	mutex_lock(&file->file_mutex);
+	list_del(&ctx->file_list);
+	mutex_unlock(&file->file_mutex);
+
+	kfree(ctx);
+	return result;
+}
+
+/*
+ * Handle the "destroy id" command: remove the context from the id table,
+ * wait for all outstanding references to go away, destroy the cm_id,
+ * discard unreported events and tell user space how many events were
+ * reported for the context.
+ *
+ * Returns 0 on success, -ENOSPC if the output buffer is too small, -ENOENT
+ * for an unknown id, -EINVAL if the id belongs to another file, or -EFAULT
+ * on a failed user copy (the context is destroyed regardless in that last
+ * case).
+ */
+static ssize_t ib_ucm_destroy_id(struct ib_ucm_file *file,
+				 const char __user *inbuf,
+				 int in_len, int out_len)
+{
+	struct ib_ucm_destroy_id cmd;
+	struct ib_ucm_destroy_id_resp resp;
+	struct ib_ucm_context *ctx;
+	int result = 0;
+
+	if (out_len < sizeof(resp))
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	mutex_lock(&ctx_id_mutex);
+	ctx = idr_find(&ctx_id_table, cmd.id);
+	if (!ctx)
+		result = -ENOENT;
+	else if (ctx->file != file)
+		result = -EINVAL;
+	else
+		idr_remove(&ctx_id_table, ctx->id);
+	mutex_unlock(&ctx_id_mutex);
+
+	if (result)
+		return result;
+
+	/* Drop the initial reference and wait until every user is done. */
+	ib_ucm_ctx_put(ctx);
+	wait_for_completion(&ctx->comp);
+
+	/* No new events will be generated after destroying the cm_id. */
+	ib_destroy_cm_id(ctx->cm_id);
+	/* Cleanup events not yet reported to the user. */
+	ib_ucm_cleanup_events(ctx);
+
+	resp.events_reported = ctx->events_reported;
+	kfree(ctx);
+
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &resp, sizeof(resp)))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * Handle the "attr id" command: report the service id/mask and the local
+ * and remote ids of the cm_id behind context cmd.id.
+ *
+ * Returns 0 on success, -ENOSPC if the output buffer is too small, -EFAULT
+ * on a failed user copy, or the lookup error from ib_ucm_ctx_get().
+ */
+static ssize_t ib_ucm_attr_id(struct ib_ucm_file *file,
+			      const char __user *inbuf,
+			      int in_len, int out_len)
+{
+	struct ib_ucm_attr_id_resp resp;
+	struct ib_ucm_attr_id cmd;
+	struct ib_ucm_context *ctx;
+	struct ib_cm_id *id;
+	int result = 0;
+
+	if (out_len < sizeof(resp))
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	id = ctx->cm_id;
+	resp.service_id   = id->service_id;
+	resp.service_mask = id->service_mask;
+	resp.local_id     = id->local_id;
+	resp.remote_id    = id->remote_id;
+
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &resp, sizeof(resp)))
+		result = -EFAULT;
+
+	ib_ucm_ctx_put(ctx);
+	return result;
+}
+
+/*
+ * Handle the "init qp attr" command: ask the CM for the QP attributes
+ * needed to move the QP of context cmd.id to cmd.qp_state and return them
+ * in the uverbs layout.
+ *
+ * Returns 0 on success, -ENOSPC if the output buffer is too small, -EFAULT
+ * on a failed user copy, the lookup error from ib_ucm_ctx_get(), or the
+ * error from ib_cm_init_qp_attr().
+ */
+static ssize_t ib_ucm_init_qp_attr(struct ib_ucm_file *file,
+				   const char __user *inbuf,
+				   int in_len, int out_len)
+{
+	struct ib_uverbs_qp_attr resp;
+	struct ib_ucm_init_qp_attr cmd;
+	struct ib_ucm_context *ctx;
+	struct ib_qp_attr qp_attr;
+	int result = 0;
+
+	if (out_len < sizeof(resp))
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	/*
+	 * resp lives on the stack and is copied to user space as a whole.
+	 * Zero all of it (this also clears qp_attr_mask) so that no stale
+	 * stack bytes can reach user space should the marshalling helper
+	 * leave any field or padding unwritten.
+	 */
+	memset(&resp, 0, sizeof resp);
+	memset(&qp_attr, 0, sizeof qp_attr);
+	qp_attr.qp_state = cmd.qp_state;
+	result = ib_cm_init_qp_attr(ctx->cm_id, &qp_attr, &resp.qp_attr_mask);
+	if (result)
+		goto out;
+
+	ib_copy_qp_attr_to_user(&resp, &qp_attr);
+
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &resp, sizeof(resp)))
+		result = -EFAULT;
+
+out:
+	ib_ucm_ctx_put(ctx);
+	return result;
+}
+
+/*
+ * Reject listens whose masked service id matches the CMA or the SDP
+ * service id pattern.  Returns -EINVAL for those, 0 otherwise.
+ */
+static int ucm_validate_listen(__be64 service_id, __be64 service_mask)
+{
+	__be64 masked = service_id & service_mask;
+
+	if ((masked & IB_CMA_SERVICE_ID_MASK) == IB_CMA_SERVICE_ID)
+		return -EINVAL;
+
+	if ((masked & IB_SDP_SERVICE_ID_MASK) == IB_SDP_SERVICE_ID)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Handle the "listen" command: validate the requested service id/mask and
+ * start listening on the cm_id of context cmd.id.
+ *
+ * Returns the result of ib_cm_listen(), -EINVAL for a rejected service id,
+ * -EFAULT on a failed user copy, or the lookup error from ib_ucm_ctx_get().
+ */
+static ssize_t ib_ucm_listen(struct ib_ucm_file *file,
+			     const char __user *inbuf,
+			     int in_len, int out_len)
+{
+	struct ib_ucm_listen cmd;
+	struct ib_ucm_context *ctx;
+	int result;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	result = ucm_validate_listen(cmd.service_id, cmd.service_mask);
+	if (!result)
+		result = ib_cm_listen(ctx->cm_id, cmd.service_id,
+				      cmd.service_mask, NULL);
+
+	ib_ucm_ctx_put(ctx);
+	return result;
+}
+
+/*
+ * Handle the "notify" command: forward cmd.event to ib_cm_notify() for the
+ * cm_id of context cmd.id.
+ *
+ * Returns the result of ib_cm_notify(), -EFAULT on a failed user copy, or
+ * the lookup error from ib_ucm_ctx_get().
+ */
+static ssize_t ib_ucm_notify(struct ib_ucm_file *file,
+			     const char __user *inbuf,
+			     int in_len, int out_len)
+{
+	struct ib_ucm_notify cmd;
+	struct ib_ucm_context *ctx;
+	enum ib_event_type type;
+	int result;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	type = (enum ib_event_type) cmd.event;
+	result = ib_cm_notify(ctx->cm_id, type);
+
+	ib_ucm_ctx_put(ctx);
+	return result;
+}
+
+/*
+ * Duplicate @len bytes from the user address @src into a kernel buffer.
+ *
+ * On success *dest points at the new buffer (owned by the caller, release
+ * with kfree()) or is NULL when @len is zero.  On failure *dest is NULL and
+ * the memdup_user() error is returned.
+ */
+static int ib_ucm_alloc_data(const void **dest, u64 src, u32 len)
+{
+	void *copy = NULL;
+
+	if (len) {
+		copy = memdup_user((void __user *)(unsigned long)src, len);
+		if (IS_ERR(copy)) {
+			*dest = NULL;
+			return PTR_ERR(copy);
+		}
+	}
+
+	*dest = copy;
+	return 0;
+}
+
+/*
+ * Build a kernel path record from the user path record at address @src.
+ *
+ * A zero @src means "no path": *path is set to NULL and 0 is returned.
+ * Otherwise *path receives a kmalloc()ed record that the caller must
+ * kfree().  Returns -ENOMEM or -EFAULT on failure, with *path left NULL.
+ */
+static int ib_ucm_path_get(struct ib_sa_path_rec **path, u64 src)
+{
+	struct ib_sa_path_rec *rec;
+	struct ib_user_path_rec upath;
+	int ret = 0;
+
+	*path = NULL;
+	if (!src)
+		goto out;
+
+	rec = kmalloc(sizeof(*rec), GFP_KERNEL);
+	if (!rec) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	if (copy_from_user(&upath, (void __user *)(unsigned long)src,
+			   sizeof(upath))) {
+		kfree(rec);
+		ret = -EFAULT;
+		goto out;
+	}
+
+	ib_copy_path_rec_from_user(rec, &upath);
+	*path = rec;
+out:
+	return ret;
+}
+
+/*
+ * SEND_REQ command: translate the user request into an ib_cm_req_param
+ * (private data and up to two path records are copied in from user space)
+ * and send a connection request on the cm_id that belongs to cmd.id.
+ *
+ * All temporary buffers are released before returning, whatever the outcome.
+ */
+static ssize_t ib_ucm_send_req(struct ib_ucm_file *file,
+			       const char __user *inbuf,
+			       int in_len, int out_len)
+{
+	struct ib_ucm_req cmd;
+	struct ib_cm_req_param param;
+	struct ib_ucm_context *ctx;
+	int result;
+
+	/* The cleanup at "done" frees these unconditionally. */
+	param.private_data   = NULL;
+	param.primary_path   = NULL;
+	param.alternate_path = NULL;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	result = ib_ucm_alloc_data(&param.private_data, cmd.data, cmd.len);
+	if (result)
+		goto done;
+
+	result = ib_ucm_path_get(&param.primary_path, cmd.primary_path);
+	if (result)
+		goto done;
+
+	result = ib_ucm_path_get(&param.alternate_path, cmd.alternate_path);
+	if (result)
+		goto done;
+
+	param.service_id                 = cmd.sid;
+	param.qp_num                     = cmd.qpn;
+	param.qp_type                    = cmd.qp_type;
+	param.starting_psn               = cmd.psn;
+	param.private_data_len           = cmd.len;
+	param.peer_to_peer               = cmd.peer_to_peer;
+	param.responder_resources        = cmd.responder_resources;
+	param.initiator_depth            = cmd.initiator_depth;
+	param.remote_cm_response_timeout = cmd.remote_cm_response_timeout;
+	param.local_cm_response_timeout  = cmd.local_cm_response_timeout;
+	param.flow_control               = cmd.flow_control;
+	param.retry_count                = cmd.retry_count;
+	param.rnr_retry_count            = cmd.rnr_retry_count;
+	param.max_cm_retries             = cmd.max_cm_retries;
+	param.srq                        = cmd.srq;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (IS_ERR(ctx)) {
+		result = PTR_ERR(ctx);
+		goto done;
+	}
+
+	result = ib_send_cm_req(ctx->cm_id, &param);
+	ib_ucm_ctx_put(ctx);
+
+done:
+	kfree(param.private_data);
+	kfree(param.primary_path);
+	kfree(param.alternate_path);
+	return result;
+}
+
+/*
+ * SEND_REP command: build an ib_cm_rep_param from the user request, record
+ * the user's uid in the context and send a connection reply on the cm_id
+ * that belongs to cmd.id.
+ */
+static ssize_t ib_ucm_send_rep(struct ib_ucm_file *file,
+			       const char __user *inbuf,
+			       int in_len, int out_len)
+{
+	struct ib_ucm_rep cmd;
+	struct ib_cm_rep_param param;
+	struct ib_ucm_context *ctx;
+	int result;
+
+	param.private_data = NULL;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	result = ib_ucm_alloc_data(&param.private_data, cmd.data, cmd.len);
+	if (result)
+		return result;
+
+	param.qp_num              = cmd.qpn;
+	param.starting_psn        = cmd.psn;
+	param.private_data_len    = cmd.len;
+	param.responder_resources = cmd.responder_resources;
+	param.initiator_depth     = cmd.initiator_depth;
+	param.failover_accepted   = cmd.failover_accepted;
+	param.flow_control        = cmd.flow_control;
+	param.rnr_retry_count     = cmd.rnr_retry_count;
+	param.srq                 = cmd.srq;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (IS_ERR(ctx)) {
+		result = PTR_ERR(ctx);
+	} else {
+		ctx->uid = cmd.uid;
+		result = ib_send_cm_rep(ctx->cm_id, &param);
+		ib_ucm_ctx_put(ctx);
+	}
+
+	kfree(param.private_data);
+	return result;
+}
+
+/*
+ * Common helper for commands that carry only a context id and an optional
+ * private data blob: copy the blob in from user space and hand it to @func
+ * together with the cm_id that belongs to cmd.id.
+ *
+ * Returns the result of @func, or the error from the user copy, the data
+ * allocation or the context lookup.
+ */
+static ssize_t ib_ucm_send_private_data(struct ib_ucm_file *file,
+					const char __user *inbuf, int in_len,
+					int (*func)(struct ib_cm_id *cm_id,
+						    const void *private_data,
+						    u8 private_data_len))
+{
+	struct ib_ucm_private_data cmd;
+	struct ib_ucm_context *ctx;
+	const void *pdata = NULL;
+	int ret;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ret = ib_ucm_alloc_data(&pdata, cmd.data, cmd.len);
+	if (ret)
+		return ret;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (IS_ERR(ctx)) {
+		ret = PTR_ERR(ctx);
+	} else {
+		ret = func(ctx->cm_id, pdata, cmd.len);
+		ib_ucm_ctx_put(ctx);
+	}
+
+	kfree(pdata);
+	return ret;
+}
+
+/* SEND_RTU command: private-data-only message sent via ib_send_cm_rtu(). */
+static ssize_t ib_ucm_send_rtu(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_rtu);
+}
+
+/* SEND_DREQ command: private-data-only message sent via ib_send_cm_dreq(). */
+static ssize_t ib_ucm_send_dreq(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_dreq);
+}
+
+/* SEND_DREP command: private-data-only message sent via ib_send_cm_drep(). */
+static ssize_t ib_ucm_send_drep(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_drep);
+}
+
+/*
+ * Common helper for commands that carry a status plus two optional blobs
+ * ("info" and private "data"): copy both blobs in from user space and pass
+ * them to @func together with the cm_id that belongs to cmd.id.
+ *
+ * Both temporary buffers are released before returning.
+ */
+static ssize_t ib_ucm_send_info(struct ib_ucm_file *file,
+				const char __user *inbuf, int in_len,
+				int (*func)(struct ib_cm_id *cm_id,
+					    int status,
+					    const void *info,
+					    u8 info_len,
+					    const void *data,
+					    u8 data_len))
+{
+	struct ib_ucm_info cmd;
+	struct ib_ucm_context *ctx;
+	const void *info = NULL;
+	const void *data = NULL;
+	int ret;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ret = ib_ucm_alloc_data(&data, cmd.data, cmd.data_len);
+	if (ret)
+		goto done;
+
+	ret = ib_ucm_alloc_data(&info, cmd.info, cmd.info_len);
+	if (ret)
+		goto done;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (IS_ERR(ctx)) {
+		ret = PTR_ERR(ctx);
+		goto done;
+	}
+
+	ret = func(ctx->cm_id, cmd.status, info, cmd.info_len,
+		   data, cmd.data_len);
+	ib_ucm_ctx_put(ctx);
+
+done:
+	kfree(data);
+	kfree(info);
+	return ret;
+}
+
+/*
+ * SEND_REJ command: status + info + private data sent via ib_send_cm_rej().
+ * The (void *) cast adapts ib_send_cm_rej to the callback type expected by
+ * ib_ucm_send_info().
+ */
+static ssize_t ib_ucm_send_rej(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ return ib_ucm_send_info(file, inbuf, in_len, (void *)ib_send_cm_rej);
+}
+
+/*
+ * SEND_APR command: status + info + private data sent via ib_send_cm_apr().
+ * The (void *) cast adapts ib_send_cm_apr to the callback type expected by
+ * ib_ucm_send_info().
+ */
+static ssize_t ib_ucm_send_apr(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ return ib_ucm_send_info(file, inbuf, in_len, (void *)ib_send_cm_apr);
+}
+
+/*
+ * SEND_MRA command: copy the optional private data from user space and send
+ * an MRA with the requested timeout on the cm_id that belongs to cmd.id.
+ */
+static ssize_t ib_ucm_send_mra(struct ib_ucm_file *file,
+			       const char __user *inbuf,
+			       int in_len, int out_len)
+{
+	struct ib_ucm_mra cmd;
+	struct ib_ucm_context *ctx;
+	const void *pdata = NULL;
+	int ret;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ret = ib_ucm_alloc_data(&pdata, cmd.data, cmd.len);
+	if (ret)
+		return ret;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (IS_ERR(ctx)) {
+		ret = PTR_ERR(ctx);
+	} else {
+		ret = ib_send_cm_mra(ctx->cm_id, cmd.timeout, pdata, cmd.len);
+		ib_ucm_ctx_put(ctx);
+	}
+
+	kfree(pdata);
+	return ret;
+}
+
+/*
+ * SEND_LAP command: copy the optional private data and the alternate path
+ * record from user space and send a LAP on the cm_id that belongs to cmd.id.
+ *
+ * Both temporary buffers are released before returning.
+ */
+static ssize_t ib_ucm_send_lap(struct ib_ucm_file *file,
+			       const char __user *inbuf,
+			       int in_len, int out_len)
+{
+	struct ib_ucm_lap cmd;
+	struct ib_ucm_context *ctx;
+	struct ib_sa_path_rec *alt_path = NULL;
+	const void *pdata = NULL;
+	int ret;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ret = ib_ucm_alloc_data(&pdata, cmd.data, cmd.len);
+	if (ret)
+		goto done;
+
+	ret = ib_ucm_path_get(&alt_path, cmd.path);
+	if (ret)
+		goto done;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (IS_ERR(ctx)) {
+		ret = PTR_ERR(ctx);
+		goto done;
+	}
+
+	ret = ib_send_cm_lap(ctx->cm_id, alt_path, pdata, cmd.len);
+	ib_ucm_ctx_put(ctx);
+
+done:
+	kfree(pdata);
+	kfree(alt_path);
+	return ret;
+}
+
+/*
+ * SEND_SIDR_REQ command: build an ib_cm_sidr_req_param from the user request
+ * (private data and path record are copied in) and send a SIDR request on
+ * the cm_id that belongs to cmd.id.
+ *
+ * Both temporary buffers are released before returning.
+ */
+static ssize_t ib_ucm_send_sidr_req(struct ib_ucm_file *file,
+				    const char __user *inbuf,
+				    int in_len, int out_len)
+{
+	struct ib_ucm_sidr_req cmd;
+	struct ib_cm_sidr_req_param param;
+	struct ib_ucm_context *ctx;
+	int ret;
+
+	/* The cleanup at "done" frees these unconditionally. */
+	param.private_data = NULL;
+	param.path         = NULL;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ret = ib_ucm_alloc_data(&param.private_data, cmd.data, cmd.len);
+	if (ret)
+		goto done;
+
+	ret = ib_ucm_path_get(&param.path, cmd.path);
+	if (ret)
+		goto done;
+
+	param.service_id       = cmd.sid;
+	param.timeout_ms       = cmd.timeout;
+	param.max_cm_retries   = cmd.max_cm_retries;
+	param.private_data_len = cmd.len;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (IS_ERR(ctx)) {
+		ret = PTR_ERR(ctx);
+		goto done;
+	}
+
+	ret = ib_send_cm_sidr_req(ctx->cm_id, &param);
+	ib_ucm_ctx_put(ctx);
+
+done:
+	kfree(param.private_data);
+	kfree(param.path);
+	return ret;
+}
+
+/*
+ * SEND_SIDR_REP command: build an ib_cm_sidr_rep_param from the user request
+ * (private data and info blobs are copied in) and send a SIDR reply on the
+ * cm_id that belongs to cmd.id.
+ *
+ * Both temporary buffers are released before returning.
+ */
+static ssize_t ib_ucm_send_sidr_rep(struct ib_ucm_file *file,
+				    const char __user *inbuf,
+				    int in_len, int out_len)
+{
+	struct ib_ucm_sidr_rep cmd;
+	struct ib_cm_sidr_rep_param param;
+	struct ib_ucm_context *ctx;
+	int ret;
+
+	/*
+	 * param.private_data is always written by the first
+	 * ib_ucm_alloc_data() call below; only info needs a default here.
+	 */
+	param.info = NULL;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ret = ib_ucm_alloc_data(&param.private_data, cmd.data, cmd.data_len);
+	if (ret)
+		goto done;
+
+	ret = ib_ucm_alloc_data(&param.info, cmd.info, cmd.info_len);
+	if (ret)
+		goto done;
+
+	param.qp_num           = cmd.qpn;
+	param.qkey             = cmd.qkey;
+	param.status           = cmd.status;
+	param.info_length      = cmd.info_len;
+	param.private_data_len = cmd.data_len;
+
+	ctx = ib_ucm_ctx_get(file, cmd.id);
+	if (IS_ERR(ctx)) {
+		ret = PTR_ERR(ctx);
+		goto done;
+	}
+
+	ret = ib_send_cm_sidr_rep(ctx->cm_id, &param);
+	ib_ucm_ctx_put(ctx);
+
+done:
+	kfree(param.private_data);
+	kfree(param.info);
+	return ret;
+}
+
+/*
+ * Command dispatch table, indexed by the IB_USER_CM_CMD_* value found in the
+ * header of each write().  ib_ucm_write() bounds-checks the index against
+ * ARRAY_SIZE() of this table before calling through it.
+ */
+static ssize_t (*ucm_cmd_table[])(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len) = {
+ [IB_USER_CM_CMD_CREATE_ID] = ib_ucm_create_id,
+ [IB_USER_CM_CMD_DESTROY_ID] = ib_ucm_destroy_id,
+ [IB_USER_CM_CMD_ATTR_ID] = ib_ucm_attr_id,
+ [IB_USER_CM_CMD_LISTEN] = ib_ucm_listen,
+ [IB_USER_CM_CMD_NOTIFY] = ib_ucm_notify,
+ [IB_USER_CM_CMD_SEND_REQ] = ib_ucm_send_req,
+ [IB_USER_CM_CMD_SEND_REP] = ib_ucm_send_rep,
+ [IB_USER_CM_CMD_SEND_RTU] = ib_ucm_send_rtu,
+ [IB_USER_CM_CMD_SEND_DREQ] = ib_ucm_send_dreq,
+ [IB_USER_CM_CMD_SEND_DREP] = ib_ucm_send_drep,
+ [IB_USER_CM_CMD_SEND_REJ] = ib_ucm_send_rej,
+ [IB_USER_CM_CMD_SEND_MRA] = ib_ucm_send_mra,
+ [IB_USER_CM_CMD_SEND_LAP] = ib_ucm_send_lap,
+ [IB_USER_CM_CMD_SEND_APR] = ib_ucm_send_apr,
+ [IB_USER_CM_CMD_SEND_SIDR_REQ] = ib_ucm_send_sidr_req,
+ [IB_USER_CM_CMD_SEND_SIDR_REP] = ib_ucm_send_sidr_rep,
+ [IB_USER_CM_CMD_EVENT] = ib_ucm_event,
+ [IB_USER_CM_CMD_INIT_QP_ATTR] = ib_ucm_init_qp_attr,
+};
+
+/*
+ * write() entry point.  Every write carries a struct ib_ucm_cmd_hdr followed
+ * by the command payload; the header selects the handler in ucm_cmd_table
+ * and states the input/output sizes handed to it.
+ *
+ * Returns @len when the handler succeeds, the handler's error otherwise, or
+ * -EINVAL / -EFAULT when the header is short, unreadable, names an unknown
+ * command or claims more input than was written.
+ */
+static ssize_t ib_ucm_write(struct file *filp, const char __user *buf,
+			    size_t len, loff_t *pos)
+{
+	struct ib_ucm_file *file = filp->private_data;
+	struct ib_ucm_cmd_hdr hdr;
+	ssize_t ret;
+
+	if (len < sizeof(hdr))
+		return -EINVAL;
+
+	if (copy_from_user(&hdr, buf, sizeof(hdr)))
+		return -EFAULT;
+
+	if (hdr.cmd >= ARRAY_SIZE(ucm_cmd_table))
+		return -EINVAL;
+
+	/* The payload announced by the header must fit in this write. */
+	if (hdr.in + sizeof(hdr) > len)
+		return -EINVAL;
+
+	ret = ucm_cmd_table[hdr.cmd](file, buf + sizeof(hdr),
+				     hdr.in, hdr.out);
+	if (ret)
+		return ret;
+
+	return len;
+}
+
+/*
+ * poll() entry point: the file is readable whenever its event list is not
+ * empty.
+ */
+static unsigned int ib_ucm_poll(struct file *filp,
+				struct poll_table_struct *wait)
+{
+	struct ib_ucm_file *file = filp->private_data;
+
+	poll_wait(filp, &file->poll_wait, wait);
+
+	if (list_empty(&file->events))
+		return 0;
+
+	return POLLIN | POLLRDNORM;
+}
+
+/*
+ * open() entry point: allocate and initialise the per-file state and attach
+ * it to @filp.
+ *
+ * ib_ucm_open() does not need the BKL:
+ *
+ *  - no global state is referred to;
+ *  - there is no ioctl method to race against;
+ *  - no further module initialization is required for open to work
+ *    after the device is registered.
+ */
+static int ib_ucm_open(struct inode *inode, struct file *filp)
+{
+	struct ib_ucm_file *file = kmalloc(sizeof(*file), GFP_KERNEL);
+
+	if (!file)
+		return -ENOMEM;
+
+	mutex_init(&file->file_mutex);
+	init_waitqueue_head(&file->poll_wait);
+	INIT_LIST_HEAD(&file->events);
+	INIT_LIST_HEAD(&file->ctxs);
+
+	file->filp = filp;
+	file->device = container_of(inode->i_cdev, struct ib_ucm_device, cdev);
+	filp->private_data = file;
+
+	return nonseekable_open(inode, filp);
+}
+
+/*
+ * release() entry point: tear down every context still attached to the file
+ * and free the per-file state.
+ *
+ * file_mutex is dropped around the per-context teardown and re-taken before
+ * the list is examined again.
+ * NOTE(review): the loop never unlinks ctx from file->ctxs itself; it
+ * presumably relies on ib_ucm_cleanup_events() (defined outside this view)
+ * to do so - confirm.
+ */
+static int ib_ucm_close(struct inode *inode, struct file *filp)
+{
+ struct ib_ucm_file *file = filp->private_data;
+ struct ib_ucm_context *ctx;
+
+ mutex_lock(&file->file_mutex);
+ while (!list_empty(&file->ctxs)) {
+ ctx = list_entry(file->ctxs.next,
+ struct ib_ucm_context, file_list);
+ mutex_unlock(&file->file_mutex);
+
+ /* Make the id unreachable for new lookups before destroying it. */
+ mutex_lock(&ctx_id_mutex);
+ idr_remove(&ctx_id_table, ctx->id);
+ mutex_unlock(&ctx_id_mutex);
+
+ ib_destroy_cm_id(ctx->cm_id);
+ ib_ucm_cleanup_events(ctx);
+ kfree(ctx);
+
+ mutex_lock(&file->file_mutex);
+ }
+ mutex_unlock(&file->file_mutex);
+ kfree(file);
+ return 0;
+}
+
+/*
+ * Bitmap of minors handed out from the dynamically allocated overflow
+ * region.  Tentative declaration so the release callback below can use it;
+ * the bitmap is declared again next to find_overflow_devnum().
+ */
+static DECLARE_BITMAP(overflow_map, IB_UCM_MAX_DEVICES);
+
+/*
+ * Driver-core release callback for a ucm device: remove the character
+ * device, give the minor number back to the bitmap it was allocated from and
+ * free the structure.
+ *
+ * ib_ucm_add_one() stores overflow devices with
+ * devnum = index + IB_UCM_MAX_DEVICES and marks them in overflow_map, so
+ * that is the bitmap that must be cleared for them (the previous code
+ * cleared the corresponding bit in dev_map instead, leaking the overflow
+ * minor and releasing an unrelated static one).
+ */
+static void ib_ucm_release_dev(struct device *dev)
+{
+	struct ib_ucm_device *ucm_dev;
+
+	ucm_dev = container_of(dev, struct ib_ucm_device, dev);
+	cdev_del(&ucm_dev->cdev);
+	if (ucm_dev->devnum < IB_UCM_MAX_DEVICES)
+		clear_bit(ucm_dev->devnum, dev_map);
+	else
+		clear_bit(ucm_dev->devnum - IB_UCM_MAX_DEVICES, overflow_map);
+	kfree(ucm_dev);
+}
+
+/*
+ * File operations of the ucm character device.  There is no read method:
+ * the only data path wired up here is write(), plus poll() for readiness.
+ */
+static const struct file_operations ucm_fops = {
+ .owner = THIS_MODULE,
+ .open = ib_ucm_open,
+ .release = ib_ucm_close,
+ .write = ib_ucm_write,
+ .poll = ib_ucm_poll,
+ .llseek = no_llseek,
+};
+
+/* sysfs "ibdev" attribute: name of the IB device behind this ucm device. */
+static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr,
+			  char *buf)
+{
+	struct ib_ucm_device *ucm_dev =
+		container_of(dev, struct ib_ucm_device, dev);
+
+	return sprintf(buf, "%s\n", ucm_dev->ib_dev->name);
+}
+static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
+
+/* Dynamically allocated char-dev region used once dev_map is exhausted. */
+static dev_t overflow_maj;
+static DECLARE_BITMAP(overflow_map, IB_UCM_MAX_DEVICES);
+
+/*
+ * Find a free index in the overflow region, allocating the region on first
+ * use.
+ *
+ * Returns the free index, the alloc_chrdev_region() error if the region
+ * could not be allocated, or -1 when every overflow index is taken.  The
+ * caller is responsible for marking the index in overflow_map.
+ */
+static int find_overflow_devnum(void)
+{
+	int idx;
+
+	if (!overflow_maj) {
+		int err = alloc_chrdev_region(&overflow_maj, 0,
+					      IB_UCM_MAX_DEVICES,
+					      "infiniband_cm");
+		if (err) {
+			printk(KERN_ERR "ucm: couldn't register dynamic device number\n");
+			return err;
+		}
+	}
+
+	idx = find_first_zero_bit(overflow_map, IB_UCM_MAX_DEVICES);
+	if (idx < IB_UCM_MAX_DEVICES)
+		return idx;
+
+	return -1;
+}
+
+/*
+ * IB client "add" callback: create the ucm character device and sysfs
+ * device for @device.  Devices without user-context support or with a
+ * non-IB transport are ignored.
+ *
+ * Ownership rules on the error paths:
+ *  - before device_register() is called, ucm_dev is plain memory owned by
+ *    this function: undo the minor reservation and kfree() it;
+ *  - once device_register() has been called (even if it failed), the
+ *    structure is reference counted and must only be dropped through the
+ *    driver core.  ib_ucm_release_dev() then removes the cdev, releases the
+ *    minor and frees ucm_dev, so none of that may be repeated here.
+ */
+static void ib_ucm_add_one(struct ib_device *device)
+{
+	int devnum;
+	dev_t base;
+	struct ib_ucm_device *ucm_dev;
+
+	if (!device->alloc_ucontext ||
+	    rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+		return;
+
+	ucm_dev = kzalloc(sizeof *ucm_dev, GFP_KERNEL);
+	if (!ucm_dev)
+		return;
+
+	ucm_dev->ib_dev = device;
+
+	devnum = find_first_zero_bit(dev_map, IB_UCM_MAX_DEVICES);
+	if (devnum >= IB_UCM_MAX_DEVICES) {
+		devnum = find_overflow_devnum();
+		if (devnum < 0)
+			goto err;
+
+		ucm_dev->devnum = devnum + IB_UCM_MAX_DEVICES;
+		base = devnum + overflow_maj;
+		set_bit(devnum, overflow_map);
+	} else {
+		ucm_dev->devnum = devnum;
+		base = devnum + IB_UCM_BASE_DEV;
+		set_bit(devnum, dev_map);
+	}
+
+	cdev_init(&ucm_dev->cdev, &ucm_fops);
+	ucm_dev->cdev.owner = THIS_MODULE;
+	kobject_set_name(&ucm_dev->cdev.kobj, "ucm%d", ucm_dev->devnum);
+	if (cdev_add(&ucm_dev->cdev, base, 1))
+		goto err_devnum;
+
+	ucm_dev->dev.class = &cm_class;
+	ucm_dev->dev.parent = device->dma_device;
+	ucm_dev->dev.devt = ucm_dev->cdev.dev;
+	ucm_dev->dev.release = ib_ucm_release_dev;
+	dev_set_name(&ucm_dev->dev, "ucm%d", ucm_dev->devnum);
+	if (device_register(&ucm_dev->dev)) {
+		/*
+		 * Never free a device directly after device_register(), even
+		 * on failure: drop the reference and let the release
+		 * callback clean up.
+		 */
+		put_device(&ucm_dev->dev);
+		return;
+	}
+
+	if (device_create_file(&ucm_dev->dev, &dev_attr_ibdev)) {
+		/* The release callback frees ucm_dev; do not touch it again. */
+		device_unregister(&ucm_dev->dev);
+		return;
+	}
+
+	ib_set_client_data(device, &ucm_client, ucm_dev);
+	return;
+
+err_devnum:
+	/* cdev_add() failed: give the reserved minor back. */
+	if (ucm_dev->devnum < IB_UCM_MAX_DEVICES)
+		clear_bit(devnum, dev_map);
+	else
+		clear_bit(devnum, overflow_map);
+err:
+	kfree(ucm_dev);
+	return;
+}
+
+/*
+ * IB client "remove" callback: unregister the ucm device created for
+ * @device, if there is one.
+ */
+static void ib_ucm_remove_one(struct ib_device *device)
+{
+	struct ib_ucm_device *ucm_dev;
+
+	ucm_dev = ib_get_client_data(device, &ucm_client);
+	if (ucm_dev)
+		device_unregister(&ucm_dev->dev);
+}
+
+/* Read-only class attribute exposing IB_USER_CM_ABI_VERSION. */
+static CLASS_ATTR_STRING(abi_version, S_IRUGO,
+			 __stringify(IB_USER_CM_ABI_VERSION));
+
+/*
+ * Module init: reserve the static char-dev region, publish the abi_version
+ * class attribute and register as an IB client.  Each step is undone in
+ * reverse order if a later one fails.
+ */
+static int __init ib_ucm_init(void)
+{
+	int ret;
+
+	ret = register_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES,
+				     "infiniband_cm");
+	if (ret) {
+		printk(KERN_ERR "ucm: couldn't register device number\n");
+		return ret;
+	}
+
+	ret = class_create_file(&cm_class, &class_attr_abi_version.attr);
+	if (ret) {
+		printk(KERN_ERR "ucm: couldn't create abi_version attribute\n");
+		goto err_chrdev;
+	}
+
+	ret = ib_register_client(&ucm_client);
+	if (ret) {
+		printk(KERN_ERR "ucm: couldn't register client\n");
+		goto err_attr;
+	}
+
+	return 0;
+
+err_attr:
+	class_remove_file(&cm_class, &class_attr_abi_version.attr);
+err_chrdev:
+	unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES);
+	return ret;
+}
+
+/*
+ * Module exit: undo ib_ucm_init() in reverse order, release the overflow
+ * char-dev region if find_overflow_devnum() ever allocated it, and destroy
+ * the context id table.
+ */
+static void __exit ib_ucm_cleanup(void)
+{
+ ib_unregister_client(&ucm_client);
+ class_remove_file(&cm_class, &class_attr_abi_version.attr);
+ unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES);
+ if (overflow_maj)
+ unregister_chrdev_region(overflow_maj, IB_UCM_MAX_DEVICES);
+ idr_destroy(&ctx_id_table);
+}
+
+module_init(ib_ucm_init);
+module_exit(ib_ucm_cleanup);
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
new file mode 100644
index 000000000..45d67e922
--- /dev/null
+++ b/drivers/infiniband/core/ucma.c
@@ -0,0 +1,1635 @@
+/*
+ * Copyright (c) 2005-2006 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/completion.h>
+#include <linux/file.h>
+#include <linux/mutex.h>
+#include <linux/poll.h>
+#include <linux/sched.h>
+#include <linux/idr.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/miscdevice.h>
+#include <linux/slab.h>
+#include <linux/sysctl.h>
+#include <linux/module.h>
+
+#include <rdma/rdma_user_cm.h>
+#include <rdma/ib_marshall.h>
+#include <rdma/rdma_cm.h>
+#include <rdma/rdma_cm_ib.h>
+#include <rdma/ib_addr.h>
+#include <rdma/ib.h>
+
+MODULE_AUTHOR("Sean Hefty");
+MODULE_DESCRIPTION("RDMA Userspace Connection Manager Access");
+MODULE_LICENSE("Dual BSD/GPL");
+
+/*
+ * Tunable exported through the sysctl table below as "max_backlog"
+ * (mode 0644, handled by proc_dointvec).
+ * NOTE(review): presumably the upper bound for a listen backlog; the user of
+ * this value is outside this view - confirm.
+ */
+static unsigned int max_backlog = 1024;
+
+static struct ctl_table_header *ucma_ctl_table_hdr;
+static struct ctl_table ucma_ctl_table[] = {
+ {
+ .procname = "max_backlog",
+ .data = &max_backlog,
+ .maxlen = sizeof max_backlog,
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ { }
+};
+
+/*
+ * Per-open-file state.
+ *
+ * mut is held by ucma_event_handler() and ucma_get_event() while they touch
+ * event_list, and by ucma_create_id() while a new context is linked onto
+ * ctx_list.  poll_wait is woken whenever an event is queued.
+ */
+struct ucma_file {
+ struct mutex mut;
+ struct file *filp;
+ struct list_head ctx_list;
+ struct list_head event_list;
+ wait_queue_head_t poll_wait;
+};
+
+/*
+ * User-visible connection context, looked up by id in ctx_idr.
+ *
+ * ref/comp implement the get/put scheme used by ucma_get_ctx() and
+ * ucma_put_ctx(): the final put completes comp, which ucma_destroy_id()
+ * waits for before freeing.  events_reported counts events delivered to
+ * user space for this context.  backlog is decremented for every connect
+ * request queued and incremented again when one is handed to user space.
+ * uid is the user's cookie, echoed back in event responses.  list links the
+ * context on ucma_file.ctx_list; mc_list holds its ucma_multicast entries.
+ */
+struct ucma_context {
+ int id;
+ struct completion comp;
+ atomic_t ref;
+ int events_reported;
+ int backlog;
+
+ struct ucma_file *file;
+ struct rdma_cm_id *cm_id;
+ u64 uid;
+
+ struct list_head list;
+ struct list_head mc_list;
+};
+
+/*
+ * Multicast membership owned by a context.  id is allocated from
+ * multicast_idr; list links the entry on ucma_context.mc_list; uid and id
+ * are reported in multicast events; events_reported counts events delivered
+ * to user space for this membership.
+ */
+struct ucma_multicast {
+ struct ucma_context *ctx;
+ int id;
+ int events_reported;
+
+ u64 uid;
+ struct list_head list;
+ struct sockaddr_storage addr;
+};
+
+/*
+ * Event queued on ucma_file.event_list until user space fetches it.  resp is
+ * the structure copied to user space; mc is only set for multicast events;
+ * cm_id is the id the event was raised on.
+ */
+struct ucma_event {
+ struct ucma_context *ctx;
+ struct ucma_multicast *mc;
+ struct list_head list;
+ struct rdma_cm_id *cm_id;
+ struct rdma_ucm_event_resp resp;
+};
+
+/* Global lock taken around every ctx_idr / multicast_idr access below. */
+static DEFINE_MUTEX(mut);
+static DEFINE_IDR(ctx_idr);
+static DEFINE_IDR(multicast_idr);
+
+/*
+ * Look up context @id and check that it belongs to @file.  No locking is
+ * done here; the callers in this file hold the global mutex.
+ *
+ * Returns the context, ERR_PTR(-ENOENT) if the id is unknown, or
+ * ERR_PTR(-EINVAL) if it belongs to a different file.
+ */
+static inline struct ucma_context *_ucma_find_context(int id,
+						      struct ucma_file *file)
+{
+	struct ucma_context *ctx = idr_find(&ctx_idr, id);
+
+	if (!ctx)
+		return ERR_PTR(-ENOENT);
+
+	if (ctx->file != file)
+		return ERR_PTR(-EINVAL);
+
+	return ctx;
+}
+
+/*
+ * Look up context @id for @file and take a reference on it.  The reference
+ * must be dropped with ucma_put_ctx().  Returns an ERR_PTR() on failure.
+ */
+static struct ucma_context *ucma_get_ctx(struct ucma_file *file, int id)
+{
+	struct ucma_context *found;
+
+	mutex_lock(&mut);
+	found = _ucma_find_context(id, file);
+	if (IS_ERR(found))
+		goto unlock;
+
+	atomic_inc(&found->ref);
+unlock:
+	mutex_unlock(&mut);
+	return found;
+}
+
+/*
+ * Drop a context reference; the final put signals ctx->comp so that a
+ * waiter in ucma_destroy_id() can proceed.
+ */
+static void ucma_put_ctx(struct ucma_context *ctx)
+{
+	if (!atomic_dec_and_test(&ctx->ref))
+		return;
+
+	complete(&ctx->comp);
+}
+
+/*
+ * Allocate a context for @file with an initial reference, publish it in
+ * ctx_idr and link it on file->ctx_list.
+ *
+ * The list insertion is not locked here; the callers in this file hold
+ * file->mut.  Returns NULL on allocation or id exhaustion.
+ */
+static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file)
+{
+	struct ucma_context *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+
+	if (!ctx)
+		return NULL;
+
+	/* Fully initialise before the id becomes visible to lookups. */
+	ctx->file = file;
+	INIT_LIST_HEAD(&ctx->mc_list);
+	init_completion(&ctx->comp);
+	atomic_set(&ctx->ref, 1);
+
+	mutex_lock(&mut);
+	ctx->id = idr_alloc(&ctx_idr, ctx, 0, 0, GFP_KERNEL);
+	mutex_unlock(&mut);
+
+	if (ctx->id < 0) {
+		kfree(ctx);
+		return NULL;
+	}
+
+	list_add_tail(&ctx->list, &file->ctx_list);
+	return ctx;
+}
+
+/*
+ * Allocate a multicast entry for @ctx, give it an id from multicast_idr and
+ * link it on ctx->mc_list.  Returns NULL on allocation or id exhaustion.
+ */
+static struct ucma_multicast* ucma_alloc_multicast(struct ucma_context *ctx)
+{
+	struct ucma_multicast *mc = kzalloc(sizeof(*mc), GFP_KERNEL);
+
+	if (!mc)
+		return NULL;
+
+	mutex_lock(&mut);
+	mc->id = idr_alloc(&multicast_idr, mc, 0, 0, GFP_KERNEL);
+	mutex_unlock(&mut);
+
+	if (mc->id < 0) {
+		kfree(mc);
+		return NULL;
+	}
+
+	mc->ctx = ctx;
+	list_add_tail(&mc->list, &ctx->mc_list);
+	return mc;
+}
+
+/*
+ * Marshal the connection parameters of a CM event into the user-space
+ * response layout, including the private data bytes when present.
+ */
+static void ucma_copy_conn_event(struct rdma_ucm_conn_param *dst,
+				 struct rdma_conn_param *src)
+{
+	dst->private_data_len = src->private_data_len;
+	if (src->private_data_len)
+		memcpy(dst->private_data, src->private_data,
+		       src->private_data_len);
+
+	dst->qp_num              = src->qp_num;
+	dst->srq                 = src->srq;
+	dst->responder_resources = src->responder_resources;
+	dst->initiator_depth     = src->initiator_depth;
+	dst->flow_control        = src->flow_control;
+	dst->retry_count         = src->retry_count;
+	dst->rnr_retry_count     = src->rnr_retry_count;
+}
+
+/*
+ * Marshal the UD parameters of a CM event into the user-space response
+ * layout, including the private data bytes when present.
+ */
+static void ucma_copy_ud_event(struct rdma_ucm_ud_param *dst,
+			       struct rdma_ud_param *src)
+{
+	dst->private_data_len = src->private_data_len;
+	if (src->private_data_len)
+		memcpy(dst->private_data, src->private_data,
+		       src->private_data_len);
+
+	ib_copy_ah_attr_to_user(&dst->ah_attr, &src->ah_attr);
+	dst->qkey   = src->qkey;
+	dst->qp_num = src->qp_num;
+}
+
+/*
+ * Fill in the owner of a queued event and the uid/id pair reported to user
+ * space.  Multicast join/error events carry their ucma_multicast in the UD
+ * private_data pointer and report that entry's uid/id; every other event
+ * reports the context's.
+ */
+static void ucma_set_event_context(struct ucma_context *ctx,
+				   struct rdma_cm_event *event,
+				   struct ucma_event *uevent)
+{
+	uevent->ctx = ctx;
+
+	if (event->event == RDMA_CM_EVENT_MULTICAST_JOIN ||
+	    event->event == RDMA_CM_EVENT_MULTICAST_ERROR) {
+		uevent->mc = (struct ucma_multicast *)
+			     event->param.ud.private_data;
+		uevent->resp.uid = uevent->mc->uid;
+		uevent->resp.id = uevent->mc->id;
+	} else {
+		uevent->resp.uid = ctx->uid;
+		uevent->resp.id = ctx->id;
+	}
+}
+
+/*
+ * RDMA CM event callback: translate @event into a ucma_event and queue it on
+ * the owning file's event list, waking any poller.
+ *
+ * Returns 0 normally.  A non-zero return is produced only for connect
+ * requests that cannot be queued (allocation failure or exhausted backlog).
+ * NOTE(review): the RDMA CM presumably destroys the new cm_id when the
+ * handler returns non-zero - confirm against the rdma_cm documentation.
+ */
+static int ucma_event_handler(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event)
+{
+ struct ucma_event *uevent;
+ struct ucma_context *ctx = cm_id->context;
+ int ret = 0;
+
+ uevent = kzalloc(sizeof(*uevent), GFP_KERNEL);
+ if (!uevent)
+ /* Non-zero only for connect requests; other events are dropped. */
+ return event->event == RDMA_CM_EVENT_CONNECT_REQUEST;
+
+ mutex_lock(&ctx->file->mut);
+ uevent->cm_id = cm_id;
+ ucma_set_event_context(ctx, event, uevent);
+ uevent->resp.event = event->event;
+ uevent->resp.status = event->status;
+ if (cm_id->qp_type == IB_QPT_UD)
+ ucma_copy_ud_event(&uevent->resp.param.ud, &event->param.ud);
+ else
+ ucma_copy_conn_event(&uevent->resp.param.conn,
+ &event->param.conn);
+
+ if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST) {
+ /* Each queued connect request consumes one backlog slot. */
+ if (!ctx->backlog) {
+ ret = -ENOMEM;
+ kfree(uevent);
+ goto out;
+ }
+ ctx->backlog--;
+ } else if (!ctx->uid || ctx->cm_id != cm_id) {
+ /*
+ * We ignore events for new connections until userspace has set
+ * their context. This can only happen if an error occurs on a
+ * new connection before the user accepts it. This is okay,
+ * since the accept will just fail later.
+ */
+ kfree(uevent);
+ goto out;
+ }
+
+ list_add_tail(&uevent->list, &ctx->file->event_list);
+ wake_up_interruptible(&ctx->file->poll_wait);
+out:
+ mutex_unlock(&ctx->file->mut);
+ return ret;
+}
+
+/*
+ * GET_EVENT command: deliver the oldest queued event to user space.
+ *
+ * Blocks until an event is available unless the file is O_NONBLOCK, in which
+ * case -EAGAIN is returned.  For a connect request a new context is
+ * allocated for the incoming cm_id and its id is reported in place of the
+ * listener's.  The event stays queued if the copy to user space fails.
+ *
+ * Returns 0 on success, -ENOSPC, -EFAULT, -EAGAIN, -ERESTARTSYS or -ENOMEM.
+ */
+static ssize_t ucma_get_event(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct ucma_context *ctx;
+ struct rdma_ucm_get_event cmd;
+ struct ucma_event *uevent;
+ int ret = 0;
+
+ if (out_len < sizeof uevent->resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ mutex_lock(&file->mut);
+ while (list_empty(&file->event_list)) {
+ /* Never sleep with file->mut held. */
+ mutex_unlock(&file->mut);
+
+ if (file->filp->f_flags & O_NONBLOCK)
+ return -EAGAIN;
+
+ if (wait_event_interruptible(file->poll_wait,
+ !list_empty(&file->event_list)))
+ return -ERESTARTSYS;
+
+ mutex_lock(&file->mut);
+ }
+
+ uevent = list_entry(file->event_list.next, struct ucma_event, list);
+
+ if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) {
+ ctx = ucma_alloc_ctx(file);
+ if (!ctx) {
+ ret = -ENOMEM;
+ goto done;
+ }
+ /* Give the slot taken in ucma_event_handler() back to the listener. */
+ uevent->ctx->backlog++;
+ ctx->cm_id = uevent->cm_id;
+ ctx->cm_id->context = ctx;
+ uevent->resp.id = ctx->id;
+ }
+
+ if (copy_to_user((void __user *)(unsigned long)cmd.response,
+ &uevent->resp, sizeof uevent->resp)) {
+ ret = -EFAULT;
+ goto done;
+ }
+
+ list_del(&uevent->list);
+ uevent->ctx->events_reported++;
+ if (uevent->mc)
+ uevent->mc->events_reported++;
+ kfree(uevent);
+done:
+ mutex_unlock(&file->mut);
+ return ret;
+}
+
+/*
+ * Derive the QP type from the port space requested in a create_id command:
+ * TCP maps to RC, UDP and IPoIB map to UD, and the IB port space uses the
+ * QP type supplied by the caller.
+ *
+ * Returns 0 with *qp_type set, or -EINVAL for an unknown port space (in
+ * which case *qp_type is left untouched).
+ */
+static int ucma_get_qp_type(struct rdma_ucm_create_id *cmd, enum ib_qp_type *qp_type)
+{
+	switch (cmd->ps) {
+	case RDMA_PS_TCP:
+		*qp_type = IB_QPT_RC;
+		break;
+	case RDMA_PS_UDP:
+	case RDMA_PS_IPOIB:
+		*qp_type = IB_QPT_UD;
+		break;
+	case RDMA_PS_IB:
+		*qp_type = cmd->qp_type;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * CREATE_ID command: allocate a context for @file, create the underlying
+ * rdma_cm_id and report the new context id to user space at cmd.response.
+ *
+ * Returns 0 on success, -ENOSPC if the output buffer is too small, -EFAULT
+ * on a failed user copy, -EINVAL for an unknown port space, -ENOMEM, or the
+ * error from rdma_create_id().
+ */
+static ssize_t ucma_create_id(struct ucma_file *file, const char __user *inbuf,
+			      int in_len, int out_len)
+{
+	struct rdma_ucm_create_id cmd;
+	struct rdma_ucm_create_id_resp resp;
+	struct ucma_context *ctx;
+	enum ib_qp_type qp_type;
+	int ret;
+
+	if (out_len < sizeof(resp))
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ret = ucma_get_qp_type(&cmd, &qp_type);
+	if (ret)
+		return ret;
+
+	/* ucma_alloc_ctx() links the context on file->ctx_list. */
+	mutex_lock(&file->mut);
+	ctx = ucma_alloc_ctx(file);
+	mutex_unlock(&file->mut);
+	if (!ctx)
+		return -ENOMEM;
+
+	ctx->uid = cmd.uid;
+	ctx->cm_id = rdma_create_id(ucma_event_handler, ctx, cmd.ps, qp_type);
+	if (IS_ERR(ctx->cm_id)) {
+		ret = PTR_ERR(ctx->cm_id);
+		goto err1;
+	}
+
+	resp.id = ctx->id;
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &resp, sizeof(resp))) {
+		ret = -EFAULT;
+		goto err2;
+	}
+	return 0;
+
+err2:
+	rdma_destroy_id(ctx->cm_id);
+err1:
+	mutex_lock(&mut);
+	idr_remove(&ctx_idr, ctx->id);
+	mutex_unlock(&mut);
+	/*
+	 * The context is still linked on file->ctx_list; unlink it before
+	 * freeing so the list never holds a dangling entry.
+	 */
+	mutex_lock(&file->mut);
+	list_del(&ctx->list);
+	mutex_unlock(&file->mut);
+	kfree(ctx);
+	return ret;
+}
+
+/*
+ * Release every multicast entry still attached to @ctx: unlink it, remove
+ * its id from multicast_idr and free it.
+ */
+static void ucma_cleanup_multicast(struct ucma_context *ctx)
+{
+	struct ucma_multicast *mc;
+
+	mutex_lock(&mut);
+	while (!list_empty(&ctx->mc_list)) {
+		mc = list_entry(ctx->mc_list.next,
+				struct ucma_multicast, list);
+		list_del(&mc->list);
+		idr_remove(&multicast_idr, mc->id);
+		kfree(mc);
+	}
+	mutex_unlock(&mut);
+}
+
+/*
+ * Discard every queued event that refers to @mc.  The event list is walked
+ * without taking any lock here.
+ */
+static void ucma_cleanup_mc_events(struct ucma_multicast *mc)
+{
+	struct list_head *events = &mc->ctx->file->event_list;
+	struct ucma_event *ev, *next;
+
+	list_for_each_entry_safe(ev, next, events, list) {
+		if (ev->mc == mc) {
+			list_del(&ev->list);
+			kfree(ev);
+		}
+	}
+}
+
+/*
+ * Destroy the cm_id of @ctx, release its multicast entries, discard its
+ * unreported events and free the context.  Returns the number of events
+ * that had been reported to user space for this context.
+ *
+ * We cannot hold file->mut when calling rdma_destroy_id() or we can
+ * deadlock. We also acquire file->mut in ucma_event_handler(), and
+ * rdma_destroy_id() will wait until all callbacks have completed.
+ */
+static int ucma_free_ctx(struct ucma_context *ctx)
+{
+ int events_reported;
+ struct ucma_event *uevent, *tmp;
+ LIST_HEAD(list);
+
+ /* No new events will be generated after destroying the id. */
+ rdma_destroy_id(ctx->cm_id);
+
+ ucma_cleanup_multicast(ctx);
+
+ /* Cleanup events not yet reported to the user. */
+ mutex_lock(&ctx->file->mut);
+ list_for_each_entry_safe(uevent, tmp, &ctx->file->event_list, list) {
+ if (uevent->ctx == ctx)
+ list_move_tail(&uevent->list, &list);
+ }
+ list_del(&ctx->list);
+ mutex_unlock(&ctx->file->mut);
+
+ /*
+ * Process the private list outside file->mut: undelivered connect
+ * requests still own a cm_id that has to be destroyed.
+ */
+ list_for_each_entry_safe(uevent, tmp, &list, list) {
+ list_del(&uevent->list);
+ if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST)
+ rdma_destroy_id(uevent->cm_id);
+ kfree(uevent);
+ }
+
+ events_reported = ctx->events_reported;
+ kfree(ctx);
+ return events_reported;
+}
+
+/*
+ * DESTROY_ID command: remove context cmd.id from ctx_idr, wait for all other
+ * references to be dropped, free it and report the number of events that
+ * were delivered for it.
+ *
+ * Returns 0 on success, -ENOSPC, -EFAULT, or the lookup error.  The context
+ * is destroyed even if the final copy to user space fails.
+ */
+static ssize_t ucma_destroy_id(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_destroy_id cmd;
+ struct rdma_ucm_destroy_id_resp resp;
+ struct ucma_context *ctx;
+ int ret = 0;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ /* Unpublish the id so no new reference can be taken. */
+ mutex_lock(&mut);
+ ctx = _ucma_find_context(cmd.id, file);
+ if (!IS_ERR(ctx))
+ idr_remove(&ctx_idr, ctx->id);
+ mutex_unlock(&mut);
+
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ /* Drop the initial reference, then wait for the last put. */
+ ucma_put_ctx(ctx);
+ wait_for_completion(&ctx->comp);
+ resp.events_reported = ucma_free_ctx(ctx);
+
+ if (copy_to_user((void __user *)(unsigned long)cmd.response,
+ &resp, sizeof(resp)))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+/*
+ * BIND_IP command (legacy form with a fixed-size address field): bind the
+ * cm_id that belongs to cmd.id to the address in cmd.addr.
+ *
+ * The address family is supplied by user space, and the core derives the
+ * number of bytes it reads from that family.  Reject families that are
+ * unknown to rdma_addr_size() or whose address would not fit in cmd.addr,
+ * so the core never reads past the end of the field.
+ *
+ * Returns the result of rdma_bind_addr(), -EFAULT on a failed user copy,
+ * -EINVAL for an unusable address, or the error from the context lookup.
+ */
+static ssize_t ucma_bind_ip(struct ucma_file *file, const char __user *inbuf,
+			      int in_len, int out_len)
+{
+	struct rdma_ucm_bind_ip cmd;
+	struct ucma_context *ctx;
+	struct sockaddr *addr;
+	int addr_len;
+	int ret;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	addr = (struct sockaddr *) &cmd.addr;
+	addr_len = rdma_addr_size(addr);
+	if (addr_len <= 0 || (size_t)addr_len > sizeof(cmd.addr))
+		return -EINVAL;
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	ret = rdma_bind_addr(ctx->cm_id, addr);
+	ucma_put_ctx(ctx);
+	return ret;
+}
+
+/*
+ * BIND command: bind the cm_id that belongs to cmd.id to the address in
+ * cmd.addr.  The request is rejected with -EINVAL when the reserved field is
+ * set or when cmd.addr_size is zero or disagrees with the size implied by
+ * the address family.
+ */
+static ssize_t ucma_bind(struct ucma_file *file, const char __user *inbuf,
+			 int in_len, int out_len)
+{
+	struct rdma_ucm_bind cmd;
+	struct ucma_context *ctx;
+	struct sockaddr *sa;
+	int ret;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	sa = (struct sockaddr *) &cmd.addr;
+
+	if (cmd.reserved)
+		return -EINVAL;
+	if (!cmd.addr_size)
+		return -EINVAL;
+	if (cmd.addr_size != rdma_addr_size(sa))
+		return -EINVAL;
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	ret = rdma_bind_addr(ctx->cm_id, sa);
+	ucma_put_ctx(ctx);
+	return ret;
+}
+
+/*
+ * RESOLVE_IP command (legacy form with fixed-size address fields): resolve
+ * cmd.dst_addr, optionally from cmd.src_addr, on the cm_id that belongs to
+ * cmd.id.
+ *
+ * Both address families come from user space.  The destination must have a
+ * family known to rdma_addr_size() whose address fits in cmd.dst_addr.  The
+ * source is checked the same way only when a family is set; a zero family
+ * is passed through unchanged, as before.
+ *
+ * Returns the result of rdma_resolve_addr(), -EFAULT on a failed user copy,
+ * -EINVAL for an unusable address, or the error from the context lookup.
+ */
+static ssize_t ucma_resolve_ip(struct ucma_file *file,
+			       const char __user *inbuf,
+			       int in_len, int out_len)
+{
+	struct rdma_ucm_resolve_ip cmd;
+	struct sockaddr *src, *dst;
+	struct ucma_context *ctx;
+	int len;
+	int ret;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	src = (struct sockaddr *) &cmd.src_addr;
+	dst = (struct sockaddr *) &cmd.dst_addr;
+
+	len = rdma_addr_size(dst);
+	if (len <= 0 || (size_t)len > sizeof(cmd.dst_addr))
+		return -EINVAL;
+
+	if (src->sa_family) {
+		len = rdma_addr_size(src);
+		if (len <= 0 || (size_t)len > sizeof(cmd.src_addr))
+			return -EINVAL;
+	}
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	ret = rdma_resolve_addr(ctx->cm_id, src, dst, cmd.timeout_ms);
+	ucma_put_ctx(ctx);
+	return ret;
+}
+
+/*
+ * RESOLVE_ADDR command: resolve cmd.dst_addr, optionally from cmd.src_addr,
+ * on the cm_id that belongs to cmd.id.
+ *
+ * Rejected with -EINVAL when the reserved field is set, when a non-zero
+ * source size disagrees with the source family, or when the destination
+ * size is zero or disagrees with the destination family.
+ */
+static ssize_t ucma_resolve_addr(struct ucma_file *file,
+				 const char __user *inbuf,
+				 int in_len, int out_len)
+{
+	struct rdma_ucm_resolve_addr cmd;
+	struct ucma_context *ctx;
+	struct sockaddr *from, *to;
+	int ret;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	from = (struct sockaddr *) &cmd.src_addr;
+	to = (struct sockaddr *) &cmd.dst_addr;
+
+	if (cmd.reserved)
+		return -EINVAL;
+	if (cmd.src_size && cmd.src_size != rdma_addr_size(from))
+		return -EINVAL;
+	if (!cmd.dst_size || cmd.dst_size != rdma_addr_size(to))
+		return -EINVAL;
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	ret = rdma_resolve_addr(ctx->cm_id, from, to, cmd.timeout_ms);
+	ucma_put_ctx(ctx);
+	return ret;
+}
+
+/*
+ * RESOLVE_ROUTE command: start route resolution with the given timeout on
+ * the cm_id that belongs to cmd.id.
+ */
+static ssize_t ucma_resolve_route(struct ucma_file *file,
+				  const char __user *inbuf,
+				  int in_len, int out_len)
+{
+	struct rdma_ucm_resolve_route cmd;
+	struct ucma_context *ctx;
+	int status;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	status = rdma_resolve_route(ctx->cm_id, cmd.timeout_ms);
+	ucma_put_ctx(ctx);
+
+	return status;
+}
+
+/*
+ * Fill the route part of a query-route response for an InfiniBand link.
+ *
+ * With no resolved paths, only the GIDs and pkey taken from the device
+ * address are written to ib_route[0]. With one or two paths, the matching
+ * path records are converted by ib_copy_path_rec_to_user(). Any other path
+ * count leaves ib_route untouched; num_paths is always reported.
+ */
+static void ucma_copy_ib_route(struct rdma_ucm_query_route_resp *resp,
+			       struct rdma_route *route)
+{
+	struct rdma_dev_addr *dev;
+
+	resp->num_paths = route->num_paths;
+
+	if (route->num_paths == 0) {
+		dev = &route->addr.dev_addr;
+		rdma_addr_get_dgid(dev,
+				   (union ib_gid *) &resp->ib_route[0].dgid);
+		rdma_addr_get_sgid(dev,
+				   (union ib_gid *) &resp->ib_route[0].sgid);
+		resp->ib_route[0].pkey = cpu_to_be16(ib_addr_get_pkey(dev));
+		return;
+	}
+
+	if (route->num_paths != 1 && route->num_paths != 2)
+		return;
+
+	/* Record 1 (when present) is converted before record 0. */
+	if (route->num_paths == 2)
+		ib_copy_path_rec_to_user(&resp->ib_route[1],
+					 &route->path_rec[1]);
+	ib_copy_path_rec_to_user(&resp->ib_route[0], &route->path_rec[0]);
+}
+
+/*
+ * Fill the route part of a query-route response for an Ethernet link layer.
+ *
+ * With no resolved paths, the GIDs in ib_route[0] are derived from the IP
+ * source/destination addresses via rdma_ip2gid() and the pkey is set to
+ * 0xffff. With one or two paths, the matching path records are converted by
+ * ib_copy_path_rec_to_user(). Any other path count leaves ib_route
+ * untouched; num_paths is always reported.
+ */
+static void ucma_copy_iboe_route(struct rdma_ucm_query_route_resp *resp,
+				 struct rdma_route *route)
+{
+	resp->num_paths = route->num_paths;
+
+	if (route->num_paths == 0) {
+		rdma_ip2gid((struct sockaddr *)&route->addr.dst_addr,
+			    (union ib_gid *)&resp->ib_route[0].dgid);
+		rdma_ip2gid((struct sockaddr *)&route->addr.src_addr,
+			    (union ib_gid *)&resp->ib_route[0].sgid);
+		resp->ib_route[0].pkey = cpu_to_be16(0xffff);
+		return;
+	}
+
+	if (route->num_paths != 1 && route->num_paths != 2)
+		return;
+
+	/* Record 1 (when present) is converted before record 0. */
+	if (route->num_paths == 2)
+		ib_copy_path_rec_to_user(&resp->ib_route[1],
+					 &route->path_rec[1]);
+	ib_copy_path_rec_to_user(&resp->ib_route[0], &route->path_rec[0]);
+}
+
+/*
+ * Fill the route part of a query-route response for an iWARP device: only
+ * the destination and source GIDs of ib_route[0] are written, both taken
+ * from the route's device address.
+ */
+static void ucma_copy_iw_route(struct rdma_ucm_query_route_resp *resp,
+			       struct rdma_route *route)
+{
+	struct rdma_dev_addr *dev = &route->addr.dev_addr;
+
+	rdma_addr_get_dgid(dev, (union ib_gid *) &resp->ib_route[0].dgid);
+	rdma_addr_get_sgid(dev, (union ib_gid *) &resp->ib_route[0].sgid);
+}
+
+/*
+ * ucma_query_route - handle RDMA_USER_CM_CMD_QUERY_ROUTE.
+ *
+ * Builds a struct rdma_ucm_query_route_resp for the context named by the
+ * request id and copies it to the user buffer given in the request. The
+ * source and destination addresses are always reported; node GUID, port
+ * number and route data are only filled in once the cm_id has a device.
+ *
+ * Returns 0 on success, -ENOSPC if @out_len is smaller than the response,
+ * -EFAULT if user memory cannot be accessed, or the error encoded by
+ * ucma_get_ctx().
+ */
+static ssize_t ucma_query_route(struct ucma_file *file,
+				const char __user *inbuf,
+				int in_len, int out_len)
+{
+	struct rdma_ucm_query_route_resp resp;
+	struct rdma_ucm_query req;
+	struct ucma_context *ctx;
+	struct rdma_cm_id *id;
+	struct sockaddr *sa;
+	size_t sa_len;
+	int ret = 0;
+
+	if (out_len < sizeof(resp))
+		return -ENOSPC;
+
+	if (copy_from_user(&req, inbuf, sizeof(req)))
+		return -EFAULT;
+
+	ctx = ucma_get_ctx(file, req.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+	id = ctx->cm_id;
+
+	memset(&resp, 0, sizeof(resp));
+
+	/* Anything that is not AF_INET is copied with sockaddr_in6 size. */
+	sa = (struct sockaddr *) &id->route.addr.src_addr;
+	if (sa->sa_family == AF_INET)
+		sa_len = sizeof(struct sockaddr_in);
+	else
+		sa_len = sizeof(struct sockaddr_in6);
+	memcpy(&resp.src_addr, sa, sa_len);
+
+	sa = (struct sockaddr *) &id->route.addr.dst_addr;
+	if (sa->sa_family == AF_INET)
+		sa_len = sizeof(struct sockaddr_in);
+	else
+		sa_len = sizeof(struct sockaddr_in6);
+	memcpy(&resp.dst_addr, sa, sa_len);
+
+	if (id->device) {
+		enum rdma_transport_type transport;
+
+		resp.node_guid = (__force __u64) id->device->node_guid;
+		resp.port_num = id->port_num;
+
+		transport = rdma_node_get_transport(id->device->node_type);
+		if (transport == RDMA_TRANSPORT_IB) {
+			enum rdma_link_layer ll;
+
+			ll = rdma_port_get_link_layer(id->device, id->port_num);
+			if (ll == IB_LINK_LAYER_INFINIBAND)
+				ucma_copy_ib_route(&resp, &id->route);
+			else if (ll == IB_LINK_LAYER_ETHERNET)
+				ucma_copy_iboe_route(&resp, &id->route);
+		} else if (transport == RDMA_TRANSPORT_IWARP) {
+			ucma_copy_iw_route(&resp, &id->route);
+		}
+	}
+
+	if (copy_to_user((void __user *)(unsigned long)req.response,
+			 &resp, sizeof(resp)))
+		ret = -EFAULT;
+
+	ucma_put_ctx(ctx);
+	return ret;
+}
+
+/*
+ * Copy the device-related fields (node GUID, port number, pkey) of @cm_id
+ * into @resp. Nothing is written when the cm_id has no device yet.
+ */
+static void ucma_query_device_addr(struct rdma_cm_id *cm_id,
+				   struct rdma_ucm_query_addr_resp *resp)
+{
+	if (cm_id->device) {
+		u16 pkey;
+
+		resp->node_guid = (__force __u64) cm_id->device->node_guid;
+		resp->port_num = cm_id->port_num;
+
+		pkey = ib_addr_get_pkey(&cm_id->route.addr.dev_addr);
+		resp->pkey = (__force __u16) cpu_to_be16(pkey);
+	}
+}
+
+/*
+ * ucma_query_addr - RDMA_USER_CM_QUERY_ADDR sub-handler of ucma_query().
+ *
+ * Reports the source and destination addresses of @ctx's cm_id (each with
+ * the size returned by rdma_addr_size()) together with the device fields
+ * filled by ucma_query_device_addr().
+ *
+ * Returns 0 on success, -ENOSPC if @out_len is smaller than the response,
+ * or -EFAULT if @response cannot be written.
+ */
+static ssize_t ucma_query_addr(struct ucma_context *ctx,
+			       void __user *response, int out_len)
+{
+	struct rdma_ucm_query_addr_resp resp;
+	struct rdma_cm_id *id = ctx->cm_id;
+	struct sockaddr *sa;
+
+	if (out_len < sizeof(resp))
+		return -ENOSPC;
+
+	memset(&resp, 0, sizeof(resp));
+
+	sa = (struct sockaddr *) &id->route.addr.src_addr;
+	resp.src_size = rdma_addr_size(sa);
+	memcpy(&resp.src_addr, sa, resp.src_size);
+
+	sa = (struct sockaddr *) &id->route.addr.dst_addr;
+	resp.dst_size = rdma_addr_size(sa);
+	memcpy(&resp.dst_addr, sa, resp.dst_size);
+
+	ucma_query_device_addr(id, &resp);
+
+	return copy_to_user(response, &resp, sizeof(resp)) ? -EFAULT : 0;
+}
+
+/*
+ * ucma_query_path - RDMA_USER_CM_QUERY_PATH sub-handler of ucma_query().
+ *
+ * Packs the path records of @ctx's cm_id into a temporary, zeroed buffer of
+ * @out_len bytes and copies the header plus the packed records to
+ * @response. num_paths always reports the route's path count; packing stops
+ * early once the room left after the header is no longer strictly greater
+ * than one struct ib_path_rec_data.
+ *
+ * Returns 0 on success, -ENOSPC if @out_len cannot hold the header,
+ * -ENOMEM if the buffer cannot be allocated, or -EFAULT if @response cannot
+ * be written.
+ */
+static ssize_t ucma_query_path(struct ucma_context *ctx,
+			       void __user *response, int out_len)
+{
+	struct rdma_ucm_query_path_resp *resp;
+	size_t room;
+	int status = 0;
+	int i = 0;
+
+	if (out_len < sizeof(*resp))
+		return -ENOSPC;
+
+	resp = kzalloc(out_len, GFP_KERNEL);
+	if (!resp)
+		return -ENOMEM;
+
+	resp->num_paths = ctx->cm_id->route.num_paths;
+
+	/* Space left for path records after the fixed header. */
+	room = out_len - sizeof(*resp);
+	while (i < resp->num_paths &&
+	       room > sizeof(struct ib_path_rec_data)) {
+		resp->path_data[i].flags = IB_PATH_GMP | IB_PATH_PRIMARY |
+					   IB_PATH_BIDIRECTIONAL;
+		ib_sa_pack_path(&ctx->cm_id->route.path_rec[i],
+				&resp->path_data[i].path_rec);
+		i++;
+		room -= sizeof(struct ib_path_rec_data);
+	}
+
+	/* Only the header and the records actually packed are returned. */
+	if (copy_to_user(response, resp,
+			 sizeof(*resp) + (i * sizeof(struct ib_path_rec_data))))
+		status = -EFAULT;
+
+	kfree(resp);
+	return status;
+}
+
+/*
+ * ucma_query_gid - RDMA_USER_CM_QUERY_GID sub-handler of ucma_query().
+ *
+ * Reports both endpoints of @ctx's cm_id as struct sockaddr_ib. An address
+ * that already has family AF_IB is copied verbatim; otherwise a sockaddr_ib
+ * is synthesised from the pkey in the response, the GID from the device
+ * address and the service id from rdma_get_service_id().
+ *
+ * Returns 0 on success, -ENOSPC if @out_len is smaller than the response,
+ * or -EFAULT if @response cannot be written.
+ */
+static ssize_t ucma_query_gid(struct ucma_context *ctx,
+			      void __user *response, int out_len)
+{
+	struct rdma_ucm_query_addr_resp resp;
+	struct rdma_cm_id *id = ctx->cm_id;
+	struct sockaddr_ib *sib;
+
+	if (out_len < sizeof(resp))
+		return -ENOSPC;
+
+	memset(&resp, 0, sizeof(resp));
+
+	/* Must run first: resp.pkey is reused for both addresses below. */
+	ucma_query_device_addr(id, &resp);
+
+	sib = (struct sockaddr_ib *) &resp.src_addr;
+	resp.src_size = sizeof(*sib);
+	if (id->route.addr.src_addr.ss_family != AF_IB) {
+		sib->sib_family = AF_IB;
+		sib->sib_pkey = (__force __be16) resp.pkey;
+		rdma_addr_get_sgid(&id->route.addr.dev_addr,
+				   (union ib_gid *) &sib->sib_addr);
+		sib->sib_sid = rdma_get_service_id(id, (struct sockaddr *)
+						   &id->route.addr.src_addr);
+	} else {
+		memcpy(sib, &id->route.addr.src_addr, resp.src_size);
+	}
+
+	sib = (struct sockaddr_ib *) &resp.dst_addr;
+	resp.dst_size = sizeof(*sib);
+	if (id->route.addr.dst_addr.ss_family != AF_IB) {
+		sib->sib_family = AF_IB;
+		sib->sib_pkey = (__force __be16) resp.pkey;
+		rdma_addr_get_dgid(&id->route.addr.dev_addr,
+				   (union ib_gid *) &sib->sib_addr);
+		sib->sib_sid = rdma_get_service_id(id, (struct sockaddr *)
+						   &id->route.addr.dst_addr);
+	} else {
+		memcpy(sib, &id->route.addr.dst_addr, resp.dst_size);
+	}
+
+	return copy_to_user(response, &resp, sizeof(resp)) ? -EFAULT : 0;
+}
+
+/*
+ * ucma_query - handle RDMA_USER_CM_CMD_QUERY.
+ *
+ * Reads a struct rdma_ucm_query from @inbuf and dispatches on its option to
+ * ucma_query_addr(), ucma_query_path() or ucma_query_gid(), each of which
+ * writes its answer to the user buffer named in the request.
+ *
+ * Returns the sub-handler's result, -ENOSYS for an unknown option, -EFAULT
+ * if the request cannot be copied in, or the error encoded by
+ * ucma_get_ctx().
+ */
+static ssize_t ucma_query(struct ucma_file *file,
+			  const char __user *inbuf,
+			  int in_len, int out_len)
+{
+	struct rdma_ucm_query req;
+	struct ucma_context *uctx;
+	void __user *dest;
+	int status;
+
+	if (copy_from_user(&req, inbuf, sizeof(req)))
+		return -EFAULT;
+
+	uctx = ucma_get_ctx(file, req.id);
+	if (IS_ERR(uctx))
+		return PTR_ERR(uctx);
+
+	dest = (void __user *)(unsigned long) req.response;
+
+	if (req.option == RDMA_USER_CM_QUERY_ADDR)
+		status = ucma_query_addr(uctx, dest, out_len);
+	else if (req.option == RDMA_USER_CM_QUERY_PATH)
+		status = ucma_query_path(uctx, dest, out_len);
+	else if (req.option == RDMA_USER_CM_QUERY_GID)
+		status = ucma_query_gid(uctx, dest, out_len);
+	else
+		status = -ENOSYS;
+
+	ucma_put_ctx(uctx);
+	return status;
+}
+
+/*
+ * Translate the user-space connection parameters in @src into the kernel
+ * struct rdma_conn_param @dst. The user qkey is only honoured when the
+ * source address of @id has family AF_IB; otherwise it is forced to 0.
+ * private_data is passed on as a pointer into @src, not copied.
+ */
+static void ucma_copy_conn_param(struct rdma_cm_id *id,
+				 struct rdma_conn_param *dst,
+				 struct rdma_ucm_conn_param *src)
+{
+	if (id->route.addr.src_addr.ss_family == AF_IB)
+		dst->qkey = src->qkey;
+	else
+		dst->qkey = 0;
+
+	dst->qp_num = src->qp_num;
+	dst->srq = src->srq;
+	dst->rnr_retry_count = src->rnr_retry_count;
+	dst->retry_count = src->retry_count;
+	dst->flow_control = src->flow_control;
+	dst->initiator_depth = src->initiator_depth;
+	dst->responder_resources = src->responder_resources;
+	dst->private_data_len = src->private_data_len;
+	dst->private_data = src->private_data;
+}
+
+/*
+ * ucma_connect - handle RDMA_USER_CM_CMD_CONNECT.
+ *
+ * Reads a struct rdma_ucm_connect from @inbuf, converts its connection
+ * parameters and calls rdma_connect() on the context named by the request
+ * id. A request whose conn_param is not marked valid is rejected.
+ *
+ * Returns the rdma_connect() result, -EFAULT if the request cannot be
+ * copied in, -EINVAL if conn_param.valid is clear, or the error encoded by
+ * ucma_get_ctx().
+ */
+static ssize_t ucma_connect(struct ucma_file *file, const char __user *inbuf,
+			    int in_len, int out_len)
+{
+	struct rdma_conn_param param;
+	struct rdma_ucm_connect req;
+	struct ucma_context *uctx;
+	int status;
+
+	if (copy_from_user(&req, inbuf, sizeof(req)))
+		return -EFAULT;
+
+	if (!req.conn_param.valid)
+		return -EINVAL;
+
+	uctx = ucma_get_ctx(file, req.id);
+	if (IS_ERR(uctx))
+		return PTR_ERR(uctx);
+
+	/* param.private_data points into req, which outlives the call. */
+	ucma_copy_conn_param(uctx->cm_id, &param, &req.conn_param);
+	status = rdma_connect(uctx->cm_id, &param);
+
+	ucma_put_ctx(uctx);
+	return status;
+}
+
+/*
+ * ucma_listen - handle RDMA_USER_CM_CMD_LISTEN.
+ *
+ * Reads a struct rdma_ucm_listen from @inbuf and calls rdma_listen() on the
+ * context named by the request id. The requested backlog is used only when
+ * it is positive and below max_backlog; otherwise max_backlog is used. The
+ * chosen value is stored in the context.
+ *
+ * Returns the rdma_listen() result, -EFAULT if the request cannot be copied
+ * in, or the error encoded by ucma_get_ctx().
+ */
+static ssize_t ucma_listen(struct ucma_file *file, const char __user *inbuf,
+			   int in_len, int out_len)
+{
+	struct rdma_ucm_listen req;
+	struct ucma_context *uctx;
+	int status;
+
+	if (copy_from_user(&req, inbuf, sizeof(req)))
+		return -EFAULT;
+
+	uctx = ucma_get_ctx(file, req.id);
+	if (IS_ERR(uctx))
+		return PTR_ERR(uctx);
+
+	if (req.backlog > 0 && req.backlog < max_backlog)
+		uctx->backlog = req.backlog;
+	else
+		uctx->backlog = max_backlog;
+
+	status = rdma_listen(uctx->cm_id, uctx->backlog);
+	ucma_put_ctx(uctx);
+	return status;
+}
+
+/*
+ * ucma_accept - handle RDMA_USER_CM_CMD_ACCEPT.
+ *
+ * Reads a struct rdma_ucm_accept from @inbuf and accepts the connection on
+ * the context named by the request id. When the request carries valid
+ * connection parameters, rdma_accept() runs under file->mut and, on
+ * success, the context's uid is replaced with the one from the request.
+ * Without valid parameters rdma_accept() is called with a NULL parameter
+ * block and the uid is left alone.
+ *
+ * Returns the rdma_accept() result, -EFAULT if the request cannot be copied
+ * in, or the error encoded by ucma_get_ctx().
+ */
+static ssize_t ucma_accept(struct ucma_file *file, const char __user *inbuf,
+			   int in_len, int out_len)
+{
+	struct rdma_conn_param param;
+	struct rdma_ucm_accept req;
+	struct ucma_context *uctx;
+	int status;
+
+	if (copy_from_user(&req, inbuf, sizeof(req)))
+		return -EFAULT;
+
+	uctx = ucma_get_ctx(file, req.id);
+	if (IS_ERR(uctx))
+		return PTR_ERR(uctx);
+
+	if (!req.conn_param.valid) {
+		status = rdma_accept(uctx->cm_id, NULL);
+	} else {
+		ucma_copy_conn_param(uctx->cm_id, &param, &req.conn_param);
+
+		mutex_lock(&file->mut);
+		status = rdma_accept(uctx->cm_id, &param);
+		if (status == 0)
+			uctx->uid = req.uid;
+		mutex_unlock(&file->mut);
+	}
+
+	ucma_put_ctx(uctx);
+	return status;
+}
+
+/*
+ * ucma_reject - handle RDMA_USER_CM_CMD_REJECT.
+ *
+ * Reads a struct rdma_ucm_reject from @inbuf and calls rdma_reject() on the
+ * context named by the request id, passing along the private data embedded
+ * in the request.
+ *
+ * Returns the rdma_reject() result, -EFAULT if the request cannot be copied
+ * in, or the error encoded by ucma_get_ctx().
+ */
+static ssize_t ucma_reject(struct ucma_file *file, const char __user *inbuf,
+			   int in_len, int out_len)
+{
+	struct rdma_ucm_reject req;
+	struct ucma_context *uctx;
+	int status;
+
+	if (copy_from_user(&req, inbuf, sizeof(req)))
+		return -EFAULT;
+
+	uctx = ucma_get_ctx(file, req.id);
+	if (IS_ERR(uctx))
+		return PTR_ERR(uctx);
+
+	status = rdma_reject(uctx->cm_id, req.private_data,
+			     req.private_data_len);
+	ucma_put_ctx(uctx);
+	return status;
+}
+
+/*
+ * ucma_disconnect - handle RDMA_USER_CM_CMD_DISCONNECT.
+ *
+ * Reads a struct rdma_ucm_disconnect from @inbuf and calls
+ * rdma_disconnect() on the context named by the request id.
+ *
+ * Returns the rdma_disconnect() result, -EFAULT if the request cannot be
+ * copied in, or the error encoded by ucma_get_ctx().
+ */
+static ssize_t ucma_disconnect(struct ucma_file *file, const char __user *inbuf,
+			       int in_len, int out_len)
+{
+	struct rdma_ucm_disconnect req;
+	struct ucma_context *uctx;
+	int status;
+
+	if (copy_from_user(&req, inbuf, sizeof(req)))
+		return -EFAULT;
+
+	uctx = ucma_get_ctx(file, req.id);
+	if (IS_ERR(uctx))
+		return PTR_ERR(uctx);
+
+	status = rdma_disconnect(uctx->cm_id);
+	ucma_put_ctx(uctx);
+	return status;
+}
+
+/*
+ * ucma_init_qp_attr - handle RDMA_USER_CM_CMD_INIT_QP_ATTR.
+ *
+ * Reads a struct rdma_ucm_init_qp_attr from @inbuf, asks
+ * rdma_init_qp_attr() for the QP attributes needed to move to the
+ * requested state, and copies them to the user buffer named in the request
+ * as a struct ib_uverbs_qp_attr.
+ *
+ * The requested state is a raw 32-bit value from user space, so it is
+ * range-checked against enum ib_qp_state before being stored in the
+ * kernel attribute structure. The response is zeroed up front so that no
+ * uninitialised stack bytes can reach user space.
+ *
+ * Returns 0 on success, -ENOSPC if @out_len is smaller than the response,
+ * -EFAULT if user memory cannot be accessed, -EINVAL for an out-of-range
+ * QP state, the error encoded by ucma_get_ctx(), or the
+ * rdma_init_qp_attr() error.
+ */
+static ssize_t ucma_init_qp_attr(struct ucma_file *file,
+				 const char __user *inbuf,
+				 int in_len, int out_len)
+{
+	struct rdma_ucm_init_qp_attr cmd;
+	struct ib_uverbs_qp_attr resp;
+	struct ucma_context *ctx;
+	struct ib_qp_attr qp_attr;
+	int ret;
+
+	if (out_len < sizeof(resp))
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	/* IB_QPS_ERR is the last valid enum ib_qp_state value. */
+	if (cmd.qp_state > IB_QPS_ERR)
+		return -EINVAL;
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	memset(&resp, 0, sizeof(resp));
+	memset(&qp_attr, 0, sizeof(qp_attr));
+	qp_attr.qp_state = cmd.qp_state;
+	ret = rdma_init_qp_attr(ctx->cm_id, &qp_attr, &resp.qp_attr_mask);
+	if (ret)
+		goto out;
+
+	ib_copy_qp_attr_to_user(&resp, &qp_attr);
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &resp, sizeof(resp)))
+		ret = -EFAULT;
+
+out:
+	ucma_put_ctx(ctx);
+	return ret;
+}
+
+/*
+ * Apply an RDMA_OPTION_ID level option to @ctx's cm_id.
+ *
+ * @optval points to a kernel copy of the option value and @optlen is its
+ * length; the length must match the option's type exactly (u8 for TOS, int
+ * for REUSEADDR and AFONLY). The int options are normalised to 0/1.
+ *
+ * Returns 0 or the rdma_set_*() result on success paths, -EINVAL on a
+ * length mismatch, and -ENOSYS for an unknown option name.
+ */
+static int ucma_set_option_id(struct ucma_context *ctx, int optname,
+			      void *optval, size_t optlen)
+{
+	int enable;
+
+	switch (optname) {
+	case RDMA_OPTION_ID_TOS:
+		if (optlen != sizeof(u8))
+			return -EINVAL;
+		rdma_set_service_type(ctx->cm_id, *((u8 *) optval));
+		return 0;
+
+	case RDMA_OPTION_ID_REUSEADDR:
+		if (optlen != sizeof(int))
+			return -EINVAL;
+		enable = *((int *) optval) ? 1 : 0;
+		return rdma_set_reuseaddr(ctx->cm_id, enable);
+
+	case RDMA_OPTION_ID_AFONLY:
+		if (optlen != sizeof(int))
+			return -EINVAL;
+		enable = *((int *) optval) ? 1 : 0;
+		return rdma_set_afonly(ctx->cm_id, enable);
+
+	default:
+		return -ENOSYS;
+	}
+}
+
+/*
+ * Install a user-supplied IB path record on @ctx's cm_id.
+ *
+ * @path_data is an array of struct ib_path_rec_data occupying @optlen
+ * bytes. The first entry whose flags are exactly
+ * GMP | PRIMARY | BIDIRECTIONAL is unpacked and handed to
+ * rdma_set_ib_paths(); on success a ROUTE_RESOLVED event is delivered
+ * through ucma_event_handler().
+ *
+ * Returns the ucma_event_handler() result on success, -EINVAL if @optlen
+ * is not a multiple of the entry size or no entry has the wanted flags, or
+ * the rdma_set_ib_paths() error.
+ */
+static int ucma_set_ib_path(struct ucma_context *ctx,
+			    struct ib_path_rec_data *path_data, size_t optlen)
+{
+	struct rdma_cm_event ev;
+	struct ib_sa_path_rec rec;
+	int status;
+
+	if (optlen % sizeof(*path_data))
+		return -EINVAL;
+
+	/* Walk the array until a suitable entry is found or it runs out. */
+	while (optlen) {
+		if (path_data->flags == (IB_PATH_GMP | IB_PATH_PRIMARY |
+					 IB_PATH_BIDIRECTIONAL))
+			break;
+		optlen -= sizeof(*path_data);
+		path_data++;
+	}
+
+	if (!optlen)
+		return -EINVAL;
+
+	memset(&rec, 0, sizeof(rec));
+	rec.vlan_id = 0xffff;
+	ib_sa_unpack_path(path_data->path_rec, &rec);
+
+	status = rdma_set_ib_paths(ctx->cm_id, &rec, 1);
+	if (status)
+		return status;
+
+	memset(&ev, 0, sizeof(ev));
+	ev.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
+	return ucma_event_handler(ctx->cm_id, &ev);
+}
+
+/*
+ * Apply an RDMA_OPTION_IB level option. Only RDMA_OPTION_IB_PATH is
+ * implemented (see ucma_set_ib_path()); any other name yields -ENOSYS.
+ */
+static int ucma_set_option_ib(struct ucma_context *ctx, int optname,
+			      void *optval, size_t optlen)
+{
+	if (optname == RDMA_OPTION_IB_PATH)
+		return ucma_set_ib_path(ctx, optval, optlen);
+
+	return -ENOSYS;
+}
+
+/*
+ * Route a set-option request to the handler for its option level:
+ * RDMA_OPTION_ID or RDMA_OPTION_IB. Unknown levels yield -ENOSYS;
+ * otherwise the level handler's result is returned.
+ */
+static int ucma_set_option_level(struct ucma_context *ctx, int level,
+				 int optname, void *optval, size_t optlen)
+{
+	if (level == RDMA_OPTION_ID)
+		return ucma_set_option_id(ctx, optname, optval, optlen);
+
+	if (level == RDMA_OPTION_IB)
+		return ucma_set_option_ib(ctx, optname, optval, optlen);
+
+	return -ENOSYS;
+}
+
+/*
+ * ucma_set_option - handle RDMA_USER_CM_CMD_SET_OPTION.
+ *
+ * Reads a struct rdma_ucm_set_option from @inbuf, duplicates the option
+ * value from user space and applies it via ucma_set_option_level().
+ *
+ * The option length is a raw 32-bit value from user space. It is bounded
+ * by KMALLOC_MAX_SIZE before being passed to memdup_user(), so that an
+ * absurd length is refused with -EINVAL instead of being handed to the
+ * allocator.
+ *
+ * Returns the ucma_set_option_level() result, -EFAULT if the request
+ * cannot be copied in, -EINVAL for an oversized option, the error encoded
+ * by ucma_get_ctx(), or the memdup_user() error.
+ */
+static ssize_t ucma_set_option(struct ucma_file *file, const char __user *inbuf,
+			       int in_len, int out_len)
+{
+	struct rdma_ucm_set_option cmd;
+	struct ucma_context *ctx;
+	void *optval;
+	int ret;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	/* Never ask the allocator for more than it can ever provide. */
+	if (unlikely(cmd.optlen > KMALLOC_MAX_SIZE))
+		return -EINVAL;
+
+	ctx = ucma_get_ctx(file, cmd.id);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	optval = memdup_user((void __user *) (unsigned long) cmd.optval,
+			     cmd.optlen);
+	if (IS_ERR(optval)) {
+		ret = PTR_ERR(optval);
+		goto out;
+	}
+
+	ret = ucma_set_option_level(ctx, cmd.level, cmd.optname, optval,
+				    cmd.optlen);
+	kfree(optval);
+
+out:
+	ucma_put_ctx(ctx);
+	return ret;
+}
+
+/*
+ * ucma_notify - handle RDMA_USER_CM_CMD_NOTIFY.
+ *
+ * Reads a struct rdma_ucm_notify from @inbuf and forwards the event it
+ * names, cast to enum ib_event_type, to rdma_notify() on the context named
+ * by the request id.
+ *
+ * Returns the rdma_notify() result, -EFAULT if the request cannot be
+ * copied in, or the error encoded by ucma_get_ctx().
+ */
+static ssize_t ucma_notify(struct ucma_file *file, const char __user *inbuf,
+			   int in_len, int out_len)
+{
+	struct rdma_ucm_notify req;
+	struct ucma_context *uctx;
+	int status;
+
+	if (copy_from_user(&req, inbuf, sizeof(req)))
+		return -EFAULT;
+
+	uctx = ucma_get_ctx(file, req.id);
+	if (IS_ERR(uctx))
+		return PTR_ERR(uctx);
+
+	status = rdma_notify(uctx->cm_id, (enum ib_event_type) req.event);
+	ucma_put_ctx(uctx);
+	return status;
+}
+
+/*
+ * ucma_process_join - common implementation of the multicast join commands.
+ *
+ * @cmd is a kernel copy of the join request. Its address is validated
+ * against rdma_addr_size(), a struct ucma_multicast is allocated for the
+ * context named by cmd->id, the group is joined with
+ * rdma_join_multicast(), and the id of the new multicast object is copied
+ * to the user buffer named by cmd->response.
+ *
+ * Returns 0 on success, -ENOSPC if @out_len is smaller than the response,
+ * -EINVAL for a malformed request, -ENOMEM if no multicast object can be
+ * allocated, -EFAULT if the response cannot be written, the error encoded
+ * by ucma_get_ctx(), or the rdma_join_multicast() error.
+ */
+static ssize_t ucma_process_join(struct ucma_file *file,
+ struct rdma_ucm_join_mcast *cmd, int out_len)
+{
+ struct rdma_ucm_create_id_resp resp;
+ struct ucma_context *ctx;
+ struct ucma_multicast *mc;
+ struct sockaddr *addr;
+ int ret;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ addr = (struct sockaddr *) &cmd->addr;
+ /* The declared size must be non-zero and match the address family. */
+ if (cmd->reserved || !cmd->addr_size || (cmd->addr_size != rdma_addr_size(addr)))
+ return -EINVAL;
+
+ ctx = ucma_get_ctx(file, cmd->id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ /* file->mut is held from allocation until the join has fully succeeded or been undone. */
+ mutex_lock(&file->mut);
+ mc = ucma_alloc_multicast(ctx);
+ if (!mc) {
+ ret = -ENOMEM;
+ goto err1;
+ }
+
+ mc->uid = cmd->uid;
+ memcpy(&mc->addr, addr, cmd->addr_size);
+ ret = rdma_join_multicast(ctx->cm_id, (struct sockaddr *) &mc->addr, mc);
+ if (ret)
+ goto err2;
+
+ resp.id = mc->id;
+ if (copy_to_user((void __user *)(unsigned long) cmd->response,
+ &resp, sizeof(resp))) {
+ ret = -EFAULT;
+ goto err3;
+ }
+
+ mutex_unlock(&file->mut);
+ ucma_put_ctx(ctx);
+ return 0;
+
+ /* Unwind in reverse order: leave the group, then drop the object. */
+err3:
+ rdma_leave_multicast(ctx->cm_id, (struct sockaddr *) &mc->addr);
+ ucma_cleanup_mc_events(mc);
+err2:
+ /* The multicast idr is guarded by the global mut, not file->mut. */
+ mutex_lock(&mut);
+ idr_remove(&multicast_idr, mc->id);
+ mutex_unlock(&mut);
+ list_del(&mc->list);
+ kfree(mc);
+err1:
+ mutex_unlock(&file->mut);
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+/*
+ * ucma_join_ip_multicast - handle RDMA_USER_CM_CMD_JOIN_IP_MCAST.
+ *
+ * Reads a struct rdma_ucm_join_ip_mcast from @inbuf, converts it into the
+ * generic struct rdma_ucm_join_mcast form and hands it to
+ * ucma_process_join().
+ *
+ * The address size is derived from the family user space put in the
+ * address. It is checked against the size of the source field before the
+ * copy, because a family describing a larger sockaddr would otherwise make
+ * memcpy() read past the end of the on-stack command.
+ *
+ * Returns the ucma_process_join() result, -EFAULT if the request cannot be
+ * copied in, or -EINVAL if the address family is unknown or too large for
+ * this command.
+ */
+static ssize_t ucma_join_ip_multicast(struct ucma_file *file,
+				      const char __user *inbuf,
+				      int in_len, int out_len)
+{
+	struct rdma_ucm_join_ip_mcast cmd;
+	struct rdma_ucm_join_mcast join_cmd;
+	int addr_size;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	/* 0 means unknown family; anything above the field size over-reads. */
+	addr_size = rdma_addr_size((struct sockaddr *) &cmd.addr);
+	if (addr_size <= 0 || addr_size > (int) sizeof(cmd.addr))
+		return -EINVAL;
+
+	join_cmd.response = cmd.response;
+	join_cmd.uid = cmd.uid;
+	join_cmd.id = cmd.id;
+	join_cmd.addr_size = addr_size;
+	join_cmd.reserved = 0;
+	memcpy(&join_cmd.addr, &cmd.addr, join_cmd.addr_size);
+
+	return ucma_process_join(file, &join_cmd, out_len);
+}
+
+/*
+ * ucma_join_multicast - handle RDMA_USER_CM_CMD_JOIN_MCAST.
+ *
+ * Copies a struct rdma_ucm_join_mcast from @inbuf and lets
+ * ucma_process_join() do the work. Returns -EFAULT if the request cannot
+ * be copied in, otherwise the ucma_process_join() result.
+ */
+static ssize_t ucma_join_multicast(struct ucma_file *file,
+				   const char __user *inbuf,
+				   int in_len, int out_len)
+{
+	struct rdma_ucm_join_mcast req;
+
+	if (copy_from_user(&req, inbuf, sizeof(req)) != 0)
+		return -EFAULT;
+
+	return ucma_process_join(file, &req, out_len);
+}
+
+/*
+ * ucma_leave_multicast - handle RDMA_USER_CM_CMD_LEAVE_MCAST.
+ *
+ * Looks up the multicast object named by the request id, removes it from
+ * the multicast idr, leaves the group, discards its events and frees it.
+ * The number of events reported for the object is returned to user space
+ * in a struct rdma_ucm_destroy_id_resp.
+ *
+ * Returns 0 on success, -ENOSPC if @out_len is smaller than the response,
+ * -EFAULT if user memory cannot be accessed, -ENOENT if the id is unknown,
+ * or -EINVAL if the object belongs to a different file.
+ */
+static ssize_t ucma_leave_multicast(struct ucma_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_destroy_id cmd;
+ struct rdma_ucm_destroy_id_resp resp;
+ struct ucma_multicast *mc;
+ int ret = 0;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ mutex_lock(&mut);
+ mc = idr_find(&multicast_idr, cmd.id);
+ if (!mc)
+ mc = ERR_PTR(-ENOENT);
+ else if (mc->ctx->file != file)
+ mc = ERR_PTR(-EINVAL);
+ else {
+ /* Unpublish the object and pin its context while still under mut. */
+ idr_remove(&multicast_idr, mc->id);
+ atomic_inc(&mc->ctx->ref);
+ }
+ mutex_unlock(&mut);
+
+ if (IS_ERR(mc)) {
+ ret = PTR_ERR(mc);
+ goto out;
+ }
+
+ rdma_leave_multicast(mc->ctx->cm_id, (struct sockaddr *) &mc->addr);
+ mutex_lock(&mc->ctx->file->mut);
+ ucma_cleanup_mc_events(mc);
+ list_del(&mc->list);
+ mutex_unlock(&mc->ctx->file->mut);
+
+ /* Balances the atomic_inc() on the context reference above. */
+ ucma_put_ctx(mc->ctx);
+ resp.events_reported = mc->events_reported;
+ kfree(mc);
+
+ /* The object is already gone even if the response cannot be delivered. */
+ if (copy_to_user((void __user *)(unsigned long)cmd.response,
+ &resp, sizeof(resp)))
+ ret = -EFAULT;
+out:
+ return ret;
+}
+
+/*
+ * Take the mutexes of two ucma files. The file at the lower address is
+ * always locked first, so two tasks locking the same pair cannot deadlock
+ * against each other.
+ */
+static void ucma_lock_files(struct ucma_file *file1, struct ucma_file *file2)
+{
+	struct ucma_file *first = (file1 < file2) ? file1 : file2;
+	struct ucma_file *second = (file1 < file2) ? file2 : file1;
+
+	mutex_lock(&first->mut);
+	mutex_lock(&second->mut);
+}
+
+/*
+ * Release the mutexes taken by ucma_lock_files(), in the reverse of the
+ * acquisition order: the file at the higher address is unlocked first.
+ */
+static void ucma_unlock_files(struct ucma_file *file1, struct ucma_file *file2)
+{
+	struct ucma_file *first = (file1 < file2) ? file1 : file2;
+	struct ucma_file *second = (file1 < file2) ? file2 : file1;
+
+	mutex_unlock(&second->mut);
+	mutex_unlock(&first->mut);
+}
+
+/*
+ * Move every event queued on @ctx's current file that belongs to @ctx to
+ * the tail of @file's event list, keeping the events' relative order.
+ */
+static void ucma_move_events(struct ucma_context *ctx, struct ucma_file *file)
+{
+	struct ucma_event *ev, *next;
+
+	list_for_each_entry_safe(ev, next, &ctx->file->event_list, list) {
+		if (ev->ctx != ctx)
+			continue;
+		list_move_tail(&ev->list, &file->event_list);
+	}
+}
+
+/*
+ * ucma_migrate_id() has to check that a descriptor supplied by user space
+ * refers to a ucma file; ucma_fops itself is defined further down.
+ */
+static const struct file_operations ucma_fops;
+
+/*
+ * ucma_migrate_id - handle RDMA_USER_CM_CMD_MIGRATE_ID.
+ *
+ * Moves the context named by cmd.id from the ucma file behind cmd.fd to
+ * @new_file, together with its pending events, and reports the context's
+ * events_reported count to the user buffer named in the request.
+ *
+ * cmd.fd is an arbitrary descriptor chosen by user space. Its
+ * private_data is only a struct ucma_file when the descriptor was opened
+ * on this device, so the file operations are compared with ucma_fops
+ * before private_data is used.
+ *
+ * Returns 0 on success, -EFAULT if user memory cannot be accessed, -ENOENT
+ * if cmd.fd is not an open descriptor, -EINVAL if it is not a ucma file,
+ * or the error encoded by ucma_get_ctx().
+ */
+static ssize_t ucma_migrate_id(struct ucma_file *new_file,
+			       const char __user *inbuf,
+			       int in_len, int out_len)
+{
+	struct rdma_ucm_migrate_id cmd;
+	struct rdma_ucm_migrate_resp resp;
+	struct ucma_context *ctx;
+	struct fd f;
+	struct ucma_file *cur_file;
+	int ret = 0;
+
+	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+		return -EFAULT;
+
+	/* Get current fd to protect against it being closed */
+	f = fdget(cmd.fd);
+	if (!f.file)
+		return -ENOENT;
+
+	/* Refuse descriptors that were not opened on the ucma device. */
+	if (f.file->f_op != &ucma_fops) {
+		ret = -EINVAL;
+		goto file_put;
+	}
+
+	/* Validate current fd and prevent destruction of id. */
+	ctx = ucma_get_ctx(f.file->private_data, cmd.id);
+	if (IS_ERR(ctx)) {
+		ret = PTR_ERR(ctx);
+		goto file_put;
+	}
+
+	cur_file = ctx->file;
+	if (cur_file == new_file) {
+		/* Nothing to move; just report the counter. */
+		resp.events_reported = ctx->events_reported;
+		goto response;
+	}
+
+	/*
+	 * Migrate events between fd's, maintaining order, and avoiding new
+	 * events being added before existing events.
+	 */
+	ucma_lock_files(cur_file, new_file);
+	mutex_lock(&mut);
+
+	list_move_tail(&ctx->list, &new_file->ctx_list);
+	ucma_move_events(ctx, new_file);
+	ctx->file = new_file;
+	resp.events_reported = ctx->events_reported;
+
+	mutex_unlock(&mut);
+	ucma_unlock_files(cur_file, new_file);
+
+response:
+	if (copy_to_user((void __user *)(unsigned long)cmd.response,
+			 &resp, sizeof(resp)))
+		ret = -EFAULT;
+
+	ucma_put_ctx(ctx);
+file_put:
+	fdput(f);
+	return ret;
+}
+
+/*
+ * Command dispatch table, indexed by the command number from the write()
+ * header (see ucma_write()). Each handler receives the ucma file, a user
+ * pointer to the command payload, and the input/output lengths from the
+ * header. A NULL slot (RDMA_USER_CM_CMD_GET_OPTION) makes ucma_write()
+ * return -ENOSYS.
+ */
+static ssize_t (*ucma_cmd_table[])(struct ucma_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len) = {
+ [RDMA_USER_CM_CMD_CREATE_ID] = ucma_create_id,
+ [RDMA_USER_CM_CMD_DESTROY_ID] = ucma_destroy_id,
+ [RDMA_USER_CM_CMD_BIND_IP] = ucma_bind_ip,
+ [RDMA_USER_CM_CMD_RESOLVE_IP] = ucma_resolve_ip,
+ [RDMA_USER_CM_CMD_RESOLVE_ROUTE] = ucma_resolve_route,
+ [RDMA_USER_CM_CMD_QUERY_ROUTE] = ucma_query_route,
+ [RDMA_USER_CM_CMD_CONNECT] = ucma_connect,
+ [RDMA_USER_CM_CMD_LISTEN] = ucma_listen,
+ [RDMA_USER_CM_CMD_ACCEPT] = ucma_accept,
+ [RDMA_USER_CM_CMD_REJECT] = ucma_reject,
+ [RDMA_USER_CM_CMD_DISCONNECT] = ucma_disconnect,
+ [RDMA_USER_CM_CMD_INIT_QP_ATTR] = ucma_init_qp_attr,
+ [RDMA_USER_CM_CMD_GET_EVENT] = ucma_get_event,
+ [RDMA_USER_CM_CMD_GET_OPTION] = NULL,
+ [RDMA_USER_CM_CMD_SET_OPTION] = ucma_set_option,
+ [RDMA_USER_CM_CMD_NOTIFY] = ucma_notify,
+ [RDMA_USER_CM_CMD_JOIN_IP_MCAST] = ucma_join_ip_multicast,
+ [RDMA_USER_CM_CMD_LEAVE_MCAST] = ucma_leave_multicast,
+ [RDMA_USER_CM_CMD_MIGRATE_ID] = ucma_migrate_id,
+ [RDMA_USER_CM_CMD_QUERY] = ucma_query,
+ [RDMA_USER_CM_CMD_BIND] = ucma_bind,
+ [RDMA_USER_CM_CMD_RESOLVE_ADDR] = ucma_resolve_addr,
+ [RDMA_USER_CM_CMD_JOIN_MCAST] = ucma_join_multicast
+};
+
+/*
+ * ucma_write - write() entry point of the rdma_cm device.
+ *
+ * Every write carries one command: a struct rdma_ucm_cmd_hdr followed by
+ * the command payload. The header selects a handler from ucma_cmd_table
+ * and gives the payload and response lengths passed on to it.
+ *
+ * Returns @len when the handler returns 0, otherwise the handler's return
+ * value; -EINVAL if the write is shorter than a header, the command number
+ * is out of range or the declared payload does not fit in @len; -EFAULT if
+ * the header cannot be copied in; -ENOSYS if the command has no handler.
+ */
+static ssize_t ucma_write(struct file *filp, const char __user *buf,
+			  size_t len, loff_t *pos)
+{
+	ssize_t (*handler)(struct ucma_file *file, const char __user *inbuf,
+			   int in_len, int out_len);
+	struct ucma_file *ufile = filp->private_data;
+	struct rdma_ucm_cmd_hdr hdr;
+	ssize_t ret;
+
+	if (len < sizeof(hdr))
+		return -EINVAL;
+
+	if (copy_from_user(&hdr, buf, sizeof(hdr)))
+		return -EFAULT;
+
+	if (hdr.cmd >= ARRAY_SIZE(ucma_cmd_table) ||
+	    hdr.in + sizeof(hdr) > len)
+		return -EINVAL;
+
+	handler = ucma_cmd_table[hdr.cmd];
+	if (!handler)
+		return -ENOSYS;
+
+	ret = handler(ufile, buf + sizeof(hdr), hdr.in, hdr.out);
+	if (ret == 0)
+		return len;
+
+	return ret;
+}
+
+/*
+ * ucma_poll - poll() entry point of the rdma_cm device.
+ *
+ * Registers the caller on the file's poll wait queue and reports the file
+ * as readable (POLLIN | POLLRDNORM) whenever its event list is non-empty.
+ */
+static unsigned int ucma_poll(struct file *filp, struct poll_table_struct *wait)
+{
+	struct ucma_file *ufile = filp->private_data;
+
+	poll_wait(filp, &ufile->poll_wait, wait);
+
+	if (list_empty(&ufile->event_list))
+		return 0;
+
+	return POLLIN | POLLRDNORM;
+}
+
+/*
+ * ucma_open() does not need the BKL:
+ *
+ * - no global state is referred to;
+ * - there is no ioctl method to race against;
+ * - no further module initialization is required for open to work
+ * after the device is registered.
+ */
+/*
+ * Allocate and initialise the per-open struct ucma_file, link it with
+ * @filp in both directions and mark the file non-seekable. Returns
+ * -ENOMEM if the allocation fails, otherwise the nonseekable_open()
+ * result.
+ */
+static int ucma_open(struct inode *inode, struct file *filp)
+{
+	struct ucma_file *ufile = kmalloc(sizeof(*ufile), GFP_KERNEL);
+
+	if (!ufile)
+		return -ENOMEM;
+
+	mutex_init(&ufile->mut);
+	init_waitqueue_head(&ufile->poll_wait);
+	INIT_LIST_HEAD(&ufile->ctx_list);
+	INIT_LIST_HEAD(&ufile->event_list);
+
+	ufile->filp = filp;
+	filp->private_data = ufile;
+
+	return nonseekable_open(inode, filp);
+}
+
+/*
+ * ucma_close - release() entry point of the rdma_cm device.
+ *
+ * Removes every context still attached to the file from the context idr
+ * and frees it with ucma_free_ctx(), then frees the file itself. Always
+ * returns 0.
+ */
+static int ucma_close(struct inode *inode, struct file *filp)
+{
+ struct ucma_file *file = filp->private_data;
+ struct ucma_context *ctx, *tmp;
+
+ mutex_lock(&file->mut);
+ list_for_each_entry_safe(ctx, tmp, &file->ctx_list, list) {
+ /*
+ * file->mut is dropped around the teardown of each context and
+ * retaken before the walk continues.
+ * NOTE(review): presumably ucma_free_ctx() needs file->mut itself --
+ * confirm against its definition.
+ */
+ mutex_unlock(&file->mut);
+
+ /* The context idr is guarded by the global mut. */
+ mutex_lock(&mut);
+ idr_remove(&ctx_idr, ctx->id);
+ mutex_unlock(&mut);
+
+ ucma_free_ctx(ctx);
+ mutex_lock(&file->mut);
+ }
+ mutex_unlock(&file->mut);
+ kfree(file);
+ return 0;
+}
+
+/*
+ * File operations of the rdma_cm device. Commands arrive through write()
+ * (ucma_write) and readiness is reported through poll(); no read or ioctl
+ * method is set in this table, and seeking is disabled.
+ */
+static const struct file_operations ucma_fops = {
+ .owner = THIS_MODULE,
+ .open = ucma_open,
+ .release = ucma_close,
+ .write = ucma_write,
+ .poll = ucma_poll,
+ .llseek = no_llseek,
+};
+
+/*
+ * Misc device registration data: a dynamically assigned minor, device name
+ * "rdma_cm", node name "infiniband/rdma_cm" and mode 0666, served by
+ * ucma_fops.
+ */
+static struct miscdevice ucma_misc = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "rdma_cm",
+ .nodename = "infiniband/rdma_cm",
+ .mode = 0666,
+ .fops = &ucma_fops,
+};
+
+/*
+ * sysfs "abi_version" show method: prints RDMA_USER_CM_ABI_VERSION as a
+ * decimal number followed by a newline and returns the number of
+ * characters written to @buf.
+ */
+static ssize_t show_abi_version(struct device *dev,
+				struct device_attribute *attr,
+				char *buf)
+{
+	int written;
+
+	written = sprintf(buf, "%d\n", RDMA_USER_CM_ABI_VERSION);
+	return written;
+}
+/* Read-only attribute; there is no store method. */
+static DEVICE_ATTR(abi_version, S_IRUGO, show_abi_version, NULL);
+
+/*
+ * Module init: register the misc device, create its abi_version attribute
+ * and register the net/rdma_ucm sysctl table. Each failure undoes the
+ * steps that already succeeded and returns the error (-ENOMEM when the
+ * sysctl registration fails).
+ */
+static int __init ucma_init(void)
+{
+	int rc;
+
+	rc = misc_register(&ucma_misc);
+	if (rc)
+		return rc;
+
+	rc = device_create_file(ucma_misc.this_device, &dev_attr_abi_version);
+	if (rc) {
+		printk(KERN_ERR "rdma_ucm: couldn't create abi_version attr\n");
+		goto unregister_misc;
+	}
+
+	ucma_ctl_table_hdr = register_net_sysctl(&init_net, "net/rdma_ucm",
+						 ucma_ctl_table);
+	if (ucma_ctl_table_hdr)
+		return 0;
+
+	printk(KERN_ERR "rdma_ucm: couldn't register sysctl paths\n");
+	rc = -ENOMEM;
+	device_remove_file(ucma_misc.this_device, &dev_attr_abi_version);
+unregister_misc:
+	misc_deregister(&ucma_misc);
+	return rc;
+}
+
+/*
+ * Module exit: undo ucma_init() in reverse order (sysctl table, sysfs
+ * attribute, misc device) and finally destroy the context idr.
+ */
+static void __exit ucma_cleanup(void)
+{
+ unregister_net_sysctl_table(ucma_ctl_table_hdr);
+ device_remove_file(ucma_misc.this_device, &dev_attr_abi_version);
+ misc_deregister(&ucma_misc);
+ idr_destroy(&ctx_idr);
+}
+
+/* Hook ucma_init()/ucma_cleanup() up as the module's entry and exit points. */
+module_init(ucma_init);
+module_exit(ucma_cleanup);
diff --git a/drivers/infiniband/core/ud_header.c b/drivers/infiniband/core/ud_header.c
new file mode 100644
index 000000000..72feee620
--- /dev/null
+++ b/drivers/infiniband/core/ud_header.c
@@ -0,0 +1,414 @@
+/*
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/export.h>
+#include <linux/if_ether.h>
+
+#include <rdma/ib_pack.h>
+
+/*
+ * STRUCT_FIELD(header, field) expands to the three designated initialisers
+ * of a struct ib_field that tie a table entry to a member of
+ * struct ib_unpacked_<header>: the member's byte offset, its size in bytes
+ * and a "<header>:<field>" name string.
+ */
+#define STRUCT_FIELD(header, field) \
+ .struct_offset_bytes = offsetof(struct ib_unpacked_ ## header, field), \
+ .struct_size_bytes = sizeof ((struct ib_unpacked_ ## header *) 0)->field, \
+ .field_name = #header ":" #field
+
+/*
+ * Field layout of struct ib_unpacked_lrh for ib_pack(). Each entry gives a
+ * word offset, a bit offset within that word and a width in bits
+ * (presumably 32-bit words in the wire image -- struct ib_field is
+ * declared in <rdma/ib_pack.h>). RESERVED entries have no struct member
+ * behind them.
+ */
+static const struct ib_field lrh_table[] = {
+ { STRUCT_FIELD(lrh, virtual_lane),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 4 },
+ { STRUCT_FIELD(lrh, link_version),
+ .offset_words = 0,
+ .offset_bits = 4,
+ .size_bits = 4 },
+ { STRUCT_FIELD(lrh, service_level),
+ .offset_words = 0,
+ .offset_bits = 8,
+ .size_bits = 4 },
+ { RESERVED,
+ .offset_words = 0,
+ .offset_bits = 12,
+ .size_bits = 2 },
+ { STRUCT_FIELD(lrh, link_next_header),
+ .offset_words = 0,
+ .offset_bits = 14,
+ .size_bits = 2 },
+ { STRUCT_FIELD(lrh, destination_lid),
+ .offset_words = 0,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { RESERVED,
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 5 },
+ { STRUCT_FIELD(lrh, packet_length),
+ .offset_words = 1,
+ .offset_bits = 5,
+ .size_bits = 11 },
+ { STRUCT_FIELD(lrh, source_lid),
+ .offset_words = 1,
+ .offset_bits = 16,
+ .size_bits = 16 }
+};
+
+/*
+ * Field layout of struct ib_unpacked_eth for ib_pack(). The destination
+ * and source MAC addresses are each split into a 32-bit and a 16-bit part
+ * (dmac_h/dmac_l, smac_h/smac_l), followed by the 16-bit type field.
+ */
+static const struct ib_field eth_table[] = {
+ { STRUCT_FIELD(eth, dmac_h),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { STRUCT_FIELD(eth, dmac_l),
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { STRUCT_FIELD(eth, smac_h),
+ .offset_words = 1,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { STRUCT_FIELD(eth, smac_l),
+ .offset_words = 2,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { STRUCT_FIELD(eth, type),
+ .offset_words = 3,
+ .offset_bits = 0,
+ .size_bits = 16 }
+};
+
+/*
+ * Field layout of struct ib_unpacked_vlan for ib_pack(): a 16-bit tag
+ * followed by a 16-bit type, both in word 0.
+ */
+static const struct ib_field vlan_table[] = {
+ { STRUCT_FIELD(vlan, tag),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { STRUCT_FIELD(vlan, type),
+ .offset_words = 0,
+ .offset_bits = 16,
+ .size_bits = 16 }
+};
+
+/*
+ * Field layout of struct ib_unpacked_grh for ib_pack(): two words of
+ * version/class/flow-label/length/next-header/hop-limit fields followed by
+ * the 128-bit source and destination GIDs.
+ */
+static const struct ib_field grh_table[] = {
+ { STRUCT_FIELD(grh, ip_version),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 4 },
+ { STRUCT_FIELD(grh, traffic_class),
+ .offset_words = 0,
+ .offset_bits = 4,
+ .size_bits = 8 },
+ { STRUCT_FIELD(grh, flow_label),
+ .offset_words = 0,
+ .offset_bits = 12,
+ .size_bits = 20 },
+ { STRUCT_FIELD(grh, payload_length),
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { STRUCT_FIELD(grh, next_header),
+ .offset_words = 1,
+ .offset_bits = 16,
+ .size_bits = 8 },
+ { STRUCT_FIELD(grh, hop_limit),
+ .offset_words = 1,
+ .offset_bits = 24,
+ .size_bits = 8 },
+ { STRUCT_FIELD(grh, source_gid),
+ .offset_words = 2,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { STRUCT_FIELD(grh, destination_gid),
+ .offset_words = 6,
+ .offset_bits = 0,
+ .size_bits = 128 }
+};
+
+/*
+ * Field layout of struct ib_unpacked_bth for ib_pack(), spanning words 0
+ * to 2. RESERVED entries have no struct member behind them.
+ */
+static const struct ib_field bth_table[] = {
+ { STRUCT_FIELD(bth, opcode),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 8 },
+ { STRUCT_FIELD(bth, solicited_event),
+ .offset_words = 0,
+ .offset_bits = 8,
+ .size_bits = 1 },
+ { STRUCT_FIELD(bth, mig_req),
+ .offset_words = 0,
+ .offset_bits = 9,
+ .size_bits = 1 },
+ { STRUCT_FIELD(bth, pad_count),
+ .offset_words = 0,
+ .offset_bits = 10,
+ .size_bits = 2 },
+ { STRUCT_FIELD(bth, transport_header_version),
+ .offset_words = 0,
+ .offset_bits = 12,
+ .size_bits = 4 },
+ { STRUCT_FIELD(bth, pkey),
+ .offset_words = 0,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { RESERVED,
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 8 },
+ { STRUCT_FIELD(bth, destination_qpn),
+ .offset_words = 1,
+ .offset_bits = 8,
+ .size_bits = 24 },
+ { STRUCT_FIELD(bth, ack_req),
+ .offset_words = 2,
+ .offset_bits = 0,
+ .size_bits = 1 },
+ { RESERVED,
+ .offset_words = 2,
+ .offset_bits = 1,
+ .size_bits = 7 },
+ { STRUCT_FIELD(bth, psn),
+ .offset_words = 2,
+ .offset_bits = 8,
+ .size_bits = 24 }
+};
+
+/*
+ * Field layout of struct ib_unpacked_deth for ib_pack(): a 32-bit qkey in
+ * word 0, then 8 reserved bits and a 24-bit source QPN in word 1.
+ */
+static const struct ib_field deth_table[] = {
+ { STRUCT_FIELD(deth, qkey),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { RESERVED,
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 8 },
+ { STRUCT_FIELD(deth, source_qpn),
+ .offset_words = 1,
+ .offset_bits = 8,
+ .size_bits = 24 }
+};
+
+/**
+ * ib_ud_header_init - Initialize UD header structure
+ * @payload_bytes: Length of packet payload
+ * @lrh_present: Non-zero if an LRH is part of the header
+ * @eth_present: Non-zero if an Ethernet header is part of the header
+ * @vlan_present: Non-zero if the packet carries a VLAN tag
+ * @grh_present: Non-zero if a GRH is part of the header
+ * @immediate_present: Non-zero if immediate data follows the DETH
+ * @header: Structure to initialize
+ *
+ * Zeroes @header, records which optional parts are present and fills in
+ * the fields that follow from the arguments alone: LRH next-header and
+ * packet length, the Ethernet type for tagged frames, the GRH version,
+ * payload length and next-header, and the BTH opcode and pad count.
+ */
+void ib_ud_header_init(int payload_bytes,
+		       int lrh_present,
+		       int eth_present,
+		       int vlan_present,
+		       int grh_present,
+		       int immediate_present,
+		       struct ib_ud_header *header)
+{
+	memset(header, 0, sizeof(*header));
+
+	header->lrh_present = lrh_present;
+	header->eth_present = eth_present;
+	header->vlan_present = vlan_present;
+	header->grh_present = grh_present;
+	header->immediate_present = immediate_present;
+
+	if (lrh_present) {
+		u16 packet_length;
+		int bytes;
+
+		header->lrh.link_version = 0;
+		if (grh_present)
+			header->lrh.link_next_header = IB_LNH_IBA_GLOBAL;
+		else
+			header->lrh.link_next_header = IB_LNH_IBA_LOCAL;
+
+		/* All headers plus payload and the 4-byte ICRC. */
+		bytes = IB_LRH_BYTES + IB_BTH_BYTES + IB_DETH_BYTES;
+		if (grh_present)
+			bytes += IB_GRH_BYTES;
+		bytes += payload_bytes + 4;
+
+		/* Length is expressed in 4-byte units, rounded up. */
+		packet_length = (bytes + 3) / 4;
+		header->lrh.packet_length = cpu_to_be16(packet_length);
+	}
+
+	if (vlan_present)
+		header->eth.type = cpu_to_be16(ETH_P_8021Q);
+
+	if (grh_present) {
+		/* BTH + DETH + payload + ICRC, rounded up to 4 bytes. */
+		int grh_payload = IB_BTH_BYTES + IB_DETH_BYTES +
+				  payload_bytes + 4;
+
+		header->grh.ip_version = 6;
+		header->grh.payload_length = cpu_to_be16((grh_payload + 3) & ~3);
+		header->grh.next_header = 0x1b;
+	}
+
+	header->bth.opcode = immediate_present ?
+		IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE :
+		IB_OPCODE_UD_SEND_ONLY;
+	/* Bytes needed to pad the payload to a multiple of four. */
+	header->bth.pad_count = (4 - payload_bytes) & 3;
+	header->bth.transport_header_version = 0;
+}
+EXPORT_SYMBOL(ib_ud_header_init);
+
+/**
+ * ib_ud_header_pack - Pack UD header struct into wire format
+ * @header: UD header struct
+ * @buf: Buffer to pack into
+ *
+ * Serializes the headers flagged as present in @header, followed by the
+ * BTH, the DETH and (if flagged) the immediate data, into @buf.  Returns
+ * the number of bytes written.
+ */
+int ib_ud_header_pack(struct ib_ud_header *header,
+		      void *buf)
+{
+	u8 *out = buf;	/* write cursor, advanced past each packed header */
+
+	if (header->lrh_present) {
+		ib_pack(lrh_table, ARRAY_SIZE(lrh_table), &header->lrh, out);
+		out += IB_LRH_BYTES;
+	}
+
+	if (header->eth_present) {
+		ib_pack(eth_table, ARRAY_SIZE(eth_table), &header->eth, out);
+		out += IB_ETH_BYTES;
+	}
+
+	if (header->vlan_present) {
+		ib_pack(vlan_table, ARRAY_SIZE(vlan_table), &header->vlan, out);
+		out += IB_VLAN_BYTES;
+	}
+
+	if (header->grh_present) {
+		ib_pack(grh_table, ARRAY_SIZE(grh_table), &header->grh, out);
+		out += IB_GRH_BYTES;
+	}
+
+	/* BTH and DETH are always emitted. */
+	ib_pack(bth_table, ARRAY_SIZE(bth_table), &header->bth, out);
+	out += IB_BTH_BYTES;
+
+	ib_pack(deth_table, ARRAY_SIZE(deth_table), &header->deth, out);
+	out += IB_DETH_BYTES;
+
+	if (header->immediate_present) {
+		/* Immediate data is copied as stored, without field packing. */
+		memcpy(out, &header->immediate_data,
+		       sizeof(header->immediate_data));
+		out += sizeof(header->immediate_data);
+	}
+
+	return out - (u8 *)buf;
+}
+EXPORT_SYMBOL(ib_ud_header_pack);
+
+/**
+ * ib_ud_header_unpack - Unpack UD header struct from wire format
+ * @buf: Buffer holding the wire-format header to unpack
+ * @header: UD header struct to fill in
+ *
+ * ib_ud_header_unpack() decodes the wire-format UD header in @buf into
+ * @header.  Returns 0 on success, or -EINVAL (after logging a warning)
+ * when a version, next-header or opcode field holds an unsupported value.
+ */
+int ib_ud_header_unpack(void *buf,
+			struct ib_ud_header *header)
+{
+	u8 *in = buf;	/* read cursor, advanced past each decoded header */
+
+	ib_unpack(lrh_table, ARRAY_SIZE(lrh_table), in, &header->lrh);
+	in += IB_LRH_BYTES;
+
+	if (header->lrh.link_version != 0) {
+		printk(KERN_WARNING "Invalid LRH.link_version %d\n",
+		       header->lrh.link_version);
+		return -EINVAL;
+	}
+
+	/* The LRH next-header field says whether a GRH follows. */
+	if (header->lrh.link_next_header == IB_LNH_IBA_LOCAL) {
+		header->grh_present = 0;
+	} else if (header->lrh.link_next_header == IB_LNH_IBA_GLOBAL) {
+		header->grh_present = 1;
+		ib_unpack(grh_table, ARRAY_SIZE(grh_table), in, &header->grh);
+		in += IB_GRH_BYTES;
+
+		if (header->grh.ip_version != 6) {
+			printk(KERN_WARNING "Invalid GRH.ip_version %d\n",
+			       header->grh.ip_version);
+			return -EINVAL;
+		}
+		if (header->grh.next_header != 0x1b) {
+			printk(KERN_WARNING "Invalid GRH.next_header 0x%02x\n",
+			       header->grh.next_header);
+			return -EINVAL;
+		}
+	} else {
+		printk(KERN_WARNING "Invalid LRH.link_next_header %d\n",
+		       header->lrh.link_next_header);
+		return -EINVAL;
+	}
+
+	ib_unpack(bth_table, ARRAY_SIZE(bth_table), in, &header->bth);
+	in += IB_BTH_BYTES;
+
+	/* Only the two UD send opcodes are accepted. */
+	if (header->bth.opcode == IB_OPCODE_UD_SEND_ONLY) {
+		header->immediate_present = 0;
+	} else if (header->bth.opcode ==
+		   IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) {
+		header->immediate_present = 1;
+	} else {
+		printk(KERN_WARNING "Invalid BTH.opcode 0x%02x\n",
+		       header->bth.opcode);
+		return -EINVAL;
+	}
+
+	if (header->bth.transport_header_version != 0) {
+		printk(KERN_WARNING "Invalid BTH.transport_header_version %d\n",
+		       header->bth.transport_header_version);
+		return -EINVAL;
+	}
+
+	ib_unpack(deth_table, ARRAY_SIZE(deth_table), in, &header->deth);
+	in += IB_DETH_BYTES;
+
+	if (header->immediate_present)
+		memcpy(&header->immediate_data, in,
+		       sizeof(header->immediate_data));
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_ud_header_unpack);
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
new file mode 100644
index 000000000..38acb3cfc
--- /dev/null
+++ b/drivers/infiniband/core/umem.c
@@ -0,0 +1,365 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mm.h>
+#include <linux/dma-mapping.h>
+#include <linux/sched.h>
+#include <linux/export.h>
+#include <linux/hugetlb.h>
+#include <linux/dma-attrs.h>
+#include <linux/slab.h>
+#include <rdma/ib_umem_odp.h>
+
+#include "uverbs.h"
+
+
+/*
+ * Undo the DMA mapping and page references held by a pinned umem, then
+ * free its scatter table.  When @dirty is set and the umem is writable,
+ * each page is marked dirty before its reference is dropped.
+ */
+static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
+{
+	const bool mark_dirty = umem->writable && dirty;
+	struct scatterlist *entry;
+	int idx;
+
+	/* Only unmap if the scatterlist was actually DMA mapped. */
+	if (umem->nmap > 0)
+		ib_dma_unmap_sg(dev, umem->sg_head.sgl, umem->nmap,
+				DMA_BIDIRECTIONAL);
+
+	for_each_sg(umem->sg_head.sgl, entry, umem->npages, idx) {
+		struct page *pg = sg_page(entry);
+
+		if (mark_dirty)
+			set_page_dirty_lock(pg);
+		put_page(pg);
+	}
+
+	sg_free_table(&umem->sg_head);
+}
+
+/**
+ * ib_umem_get - Pin and DMA map userspace memory.
+ *
+ * If access flags indicate ODP memory, avoid pinning. Instead, stores
+ * the mm for future page fault handling in conjunction with MMU notifiers.
+ *
+ * @context: userspace context to pin memory for
+ * @addr: userspace virtual address to start at
+ * @size: length of region to pin
+ * @access: IB_ACCESS_xxx flags for memory being pinned
+ * @dmasync: flush in-flight DMA when the memory region is written
+ *
+ * Returns the new umem on success, or an ERR_PTR(): -EINVAL for a zero
+ * size, an overflowing range or zero pages, -EPERM if the caller may not
+ * lock memory, -ENOMEM on allocation failure, on exceeding RLIMIT_MEMLOCK
+ * or on DMA mapping failure, or whatever error ib_umem_odp_get(),
+ * sg_alloc_table() or get_user_pages() reports.
+ */
+struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
+			    size_t size, int access, int dmasync)
+{
+	struct ib_umem *umem;
+	struct page **page_list;
+	struct vm_area_struct **vma_list;
+	unsigned long locked;
+	unsigned long lock_limit;
+	unsigned long cur_base;
+	unsigned long npages;
+	int ret;
+	int i;
+	DEFINE_DMA_ATTRS(attrs);
+	struct scatterlist *sg, *sg_list_start;
+	int need_release = 0;
+
+	if (dmasync)
+		dma_set_attr(DMA_ATTR_WRITE_BARRIER, &attrs);
+
+	if (!size)
+		return ERR_PTR(-EINVAL);
+
+	/*
+	 * If the combination of the addr and size requested for this memory
+	 * region causes an integer overflow, return error.
+	 */
+	if (((addr + size) < addr) ||
+	    PAGE_ALIGN(addr + size) < (addr + size))
+		return ERR_PTR(-EINVAL);
+
+	if (!can_do_mlock())
+		return ERR_PTR(-EPERM);
+
+	umem = kzalloc(sizeof *umem, GFP_KERNEL);
+	if (!umem)
+		return ERR_PTR(-ENOMEM);
+
+	umem->context   = context;
+	umem->length    = size;
+	umem->address   = addr;
+	umem->page_size = PAGE_SIZE;
+	umem->pid       = get_task_pid(current, PIDTYPE_PID);
+	/*
+	 * We ask for writable memory if any of the following
+	 * access flags are set.  "Local write" and "remote write"
+	 * obviously require write access.  "Remote atomic" can do
+	 * things like fetch and add, which will modify memory, and
+	 * "MW bind" can change permissions by binding a window.
+	 */
+	umem->writable  = !!(access &
+		(IB_ACCESS_LOCAL_WRITE   | IB_ACCESS_REMOTE_WRITE |
+		 IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_MW_BIND));
+
+	if (access & IB_ACCESS_ON_DEMAND) {
+		/*
+		 * The pid reference is only dropped on the pinned-memory
+		 * release path, which ODP umems never reach.  Drop it here
+		 * so it is leaked neither on success nor on failure.
+		 */
+		put_pid(umem->pid);
+		umem->pid = NULL;
+
+		ret = ib_umem_odp_get(context, umem);
+		if (ret) {
+			kfree(umem);
+			return ERR_PTR(ret);
+		}
+		return umem;
+	}
+
+	umem->odp_data = NULL;
+
+	/* We assume the memory is from hugetlb until proved otherwise */
+	umem->hugetlb   = 1;
+
+	page_list = (struct page **) __get_free_page(GFP_KERNEL);
+	if (!page_list) {
+		/* Drop the pid reference taken above before bailing out. */
+		put_pid(umem->pid);
+		kfree(umem);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	/*
+	 * if we can't alloc the vma_list, it's not so bad;
+	 * just assume the memory is not hugetlb memory
+	 */
+	vma_list = (struct vm_area_struct **) __get_free_page(GFP_KERNEL);
+	if (!vma_list)
+		umem->hugetlb = 0;
+
+	npages = ib_umem_num_pages(umem);
+
+	down_write(&current->mm->mmap_sem);
+
+	/* Enforce RLIMIT_MEMLOCK unless the caller has CAP_IPC_LOCK. */
+	locked     = npages + current->mm->pinned_vm;
+	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+	if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	cur_base = addr & PAGE_MASK;
+
+	if (npages == 0) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL);
+	if (ret)
+		goto out;
+
+	/* From here on the error path must unpin pages and free the table. */
+	need_release = 1;
+	sg_list_start = umem->sg_head.sgl;
+
+	/* Pin the range in batches of at most one page worth of pointers. */
+	while (npages) {
+		ret = get_user_pages(current, current->mm, cur_base,
+				     min_t(unsigned long, npages,
+					   PAGE_SIZE / sizeof (struct page *)),
+				     1, !umem->writable, page_list, vma_list);
+
+		if (ret < 0)
+			goto out;
+
+		umem->npages += ret;
+		cur_base += ret * PAGE_SIZE;
+		npages   -= ret;
+
+		for_each_sg(sg_list_start, sg, ret, i) {
+			if (vma_list && !is_vm_hugetlb_page(vma_list[i]))
+				umem->hugetlb = 0;
+
+			sg_set_page(sg, page_list[i], PAGE_SIZE, 0);
+		}
+
+		/* preparing for next loop */
+		sg_list_start = sg;
+	}
+
+	umem->nmap = ib_dma_map_sg_attrs(context->device,
+					 umem->sg_head.sgl,
+					 umem->npages,
+					 DMA_BIDIRECTIONAL,
+					 &attrs);
+
+	if (umem->nmap <= 0) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = 0;
+
+out:
+	if (ret < 0) {
+		if (need_release)
+			__ib_umem_release(context->device, umem, 0);
+		put_pid(umem->pid);
+		kfree(umem);
+	} else
+		current->mm->pinned_vm = locked;
+
+	up_write(&current->mm->mmap_sem);
+	if (vma_list)
+		free_page((unsigned long) vma_list);
+	free_page((unsigned long) page_list);
+
+	return ret < 0 ? ERR_PTR(ret) : umem;
+}
+EXPORT_SYMBOL(ib_umem_get);
+
+/*
+ * Deferred-work handler queued by ib_umem_release(): subtracts the umem's
+ * page count from the mm's pinned_vm under mmap_sem, then drops the mm
+ * reference and frees the umem.
+ */
+static void ib_umem_account(struct work_struct *work)
+{
+	struct ib_umem *umem = container_of(work, struct ib_umem, work);
+	struct mm_struct *mm = umem->mm;
+
+	down_write(&mm->mmap_sem);
+	mm->pinned_vm -= umem->diff;
+	up_write(&mm->mmap_sem);
+
+	mmput(mm);
+	kfree(umem);
+}
+
+/**
+ * ib_umem_release - release memory pinned with ib_umem_get
+ * @umem: umem struct to release
+ *
+ * ODP umems are handed to ib_umem_odp_release().  For pinned umems the
+ * pages are unmapped and released, the owning mm's pinned_vm is reduced by
+ * the umem's page count (possibly from a workqueue, see below) and the
+ * umem is freed.
+ */
+void ib_umem_release(struct ib_umem *umem)
+{
+ struct ib_ucontext *context = umem->context;
+ struct mm_struct *mm;
+ struct task_struct *task;
+ unsigned long diff;
+
+ if (umem->odp_data) {
+ ib_umem_odp_release(umem);
+ return;
+ }
+
+ __ib_umem_release(umem->context->device, umem, 1);
+
+ /* Look up the pinning task; if it or its mm is gone, skip accounting. */
+ task = get_pid_task(umem->pid, PIDTYPE_PID);
+ put_pid(umem->pid);
+ if (!task)
+ goto out;
+ mm = get_task_mm(task);
+ put_task_struct(task);
+ if (!mm)
+ goto out;
+
+ diff = ib_umem_num_pages(umem);
+
+ /*
+ * We may be called with the mm's mmap_sem already held. This
+ * can happen when a userspace munmap() is the call that drops
+ * the last reference to our file and calls our release
+ * method. If there are memory regions to destroy, we'll end
+ * up here and not be able to take the mmap_sem. In that case
+ * we defer the vm_locked accounting to the system workqueue.
+ */
+ if (context->closing) {
+ if (!down_write_trylock(&mm->mmap_sem)) {
+ INIT_WORK(&umem->work, ib_umem_account);
+ umem->mm = mm;
+ umem->diff = diff;
+
+ /* ib_umem_account() drops the mm reference and frees umem. */
+ queue_work(ib_wq, &umem->work);
+ return;
+ }
+ } else
+ down_write(&mm->mmap_sem);
+
+ mm->pinned_vm -= diff;
+ up_write(&mm->mmap_sem);
+ mmput(mm);
+out:
+ kfree(umem);
+}
+EXPORT_SYMBOL(ib_umem_release);
+
+/*
+ * Return the number of umem->page_size sized pages in the umem.  For ODP
+ * umems this is ib_umem_num_pages(); otherwise it is the sum of the
+ * DMA-mapped scatterlist entry lengths, shifted by log2(page_size).
+ */
+int ib_umem_page_count(struct ib_umem *umem)
+{
+	struct scatterlist *entry;
+	int page_shift;
+	int total = 0;
+	int idx;
+
+	if (umem->odp_data)
+		return ib_umem_num_pages(umem);
+
+	page_shift = ilog2(umem->page_size);
+
+	for_each_sg(umem->sg_head.sgl, entry, umem->nmap, idx)
+		total += sg_dma_len(entry) >> page_shift;
+
+	return total;
+}
+EXPORT_SYMBOL(ib_umem_page_count);
+
+/*
+ * Copy bytes out of an ib_umem's pages into a kernel buffer.
+ *
+ * dst - destination buffer
+ * umem - the umem to copy from
+ * offset - offset within the umem to start copying from
+ * length - number of bytes to copy (dst must be at least this large)
+ *
+ * Returns 0 on success, or -EINVAL when the requested range falls outside
+ * the umem or fewer than length bytes could be copied.
+ */
+int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
+		      size_t length)
+{
+	int copied;
+
+	/* Written so that neither comparison can overflow. */
+	if (offset > umem->length || length > umem->length - offset) {
+		pr_err("ib_umem_copy_from not in range. offset: %zd umem length: %zd end: %zd\n",
+		       offset, umem->length, offset + length);
+		return -EINVAL;
+	}
+
+	copied = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->nmap, dst, length,
+				    offset + ib_umem_offset(umem));
+	if (copied < 0)
+		return copied;
+
+	return copied != length ? -EINVAL : 0;
+}
+EXPORT_SYMBOL(ib_umem_copy_from);
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
new file mode 100644
index 000000000..40becdb31
--- /dev/null
+++ b/drivers/infiniband/core/umem_odp.c
@@ -0,0 +1,669 @@
+/*
+ * Copyright (c) 2014 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/pid.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/vmalloc.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_umem_odp.h>
+
+/*
+ * Record, under the umem mutex, that an mmu notifier has started working
+ * on this umem.  The first concurrent notifier re-arms the completion
+ * that is signalled again once the count drops back to zero.
+ */
+static void ib_umem_notifier_start_account(struct ib_umem *item)
+{
+	struct ib_umem_odp *odp = item->odp_data;
+
+	mutex_lock(&odp->umem_mutex);
+
+	/*
+	 * A umem without private counters is skipped; all of its page
+	 * faults are delayed anyway.
+	 */
+	if (odp->mn_counters_active) {
+		/*
+		 * Going from 0 to 1: nobody should be waiting right now, so
+		 * the completion can safely be re-initialized.
+		 */
+		if (++odp->notifiers_count == 1)
+			reinit_completion(&odp->notifier_completion);
+	}
+
+	mutex_unlock(&odp->umem_mutex);
+}
+
+/*
+ * Record, under the umem mutex, that an mmu notifier has finished with
+ * this umem: bump the sequence number and wake all waiters once no
+ * notifier is left running.
+ */
+static void ib_umem_notifier_end_account(struct ib_umem *item)
+{
+	struct ib_umem_odp *odp = item->odp_data;
+
+	mutex_lock(&odp->umem_mutex);
+
+	/*
+	 * A umem without private counters is skipped; all of its page
+	 * faults are delayed anyway.
+	 */
+	if (!odp->mn_counters_active)
+		goto unlock;
+
+	/*
+	 * The sequence bump tells a QP page fault that the page it is about
+	 * to map in the spte could have been freed.
+	 */
+	odp->notifiers_seq++;
+	odp->notifiers_count--;
+	if (odp->notifiers_count == 0)
+		complete_all(&odp->notifier_completion);
+
+unlock:
+	mutex_unlock(&odp->umem_mutex);
+}
+
+/* Account for a new mmu notifier in an ib_ucontext by atomically
+ * incrementing the context's count of running notifiers. */
+static void ib_ucontext_notifier_start_account(struct ib_ucontext *context)
+{
+ atomic_inc(&context->notifier_count);
+}
+
+/* Account for a terminating mmu notifier in an ib_ucontext.
+ *
+ * Decrements the context's notifier count. If it reaches zero and umems
+ * are waiting on the no_private_counters list, each of them gets its
+ * private counters activated, is removed from the list and has its
+ * completion signalled.
+ *
+ * Must be called with the ib_ucontext->umem_rwsem semaphore unlocked, since
+ * the function takes the semaphore itself. */
+static void ib_ucontext_notifier_end_account(struct ib_ucontext *context)
+{
+ int zero_notifiers = atomic_dec_and_test(&context->notifier_count);
+
+ if (zero_notifiers &&
+ !list_empty(&context->no_private_counters)) {
+ /* No currently running mmu notifiers. Now is the chance to
+ * add private accounting to all previously added umems. */
+ struct ib_umem_odp *odp_data, *next;
+
+ /* Prevent concurrent mmu notifiers from working on the
+ * no_private_counters list. */
+ down_write(&context->umem_rwsem);
+
+ /* Read the notifier_count again, with the umem_rwsem
+ * semaphore taken for write. */
+ if (!atomic_read(&context->notifier_count)) {
+ list_for_each_entry_safe(odp_data, next,
+ &context->no_private_counters,
+ no_private_counters) {
+ mutex_lock(&odp_data->umem_mutex);
+ odp_data->mn_counters_active = true;
+ list_del(&odp_data->no_private_counters);
+ complete_all(&odp_data->notifier_completion);
+ mutex_unlock(&odp_data->umem_mutex);
+ }
+ }
+
+ up_write(&context->umem_rwsem);
+ }
+}
+
+/*
+ * Per-umem callback used by ib_umem_notifier_release(): marks the umem as
+ * dying, wakes anything waiting on its notifier completion and invalidates
+ * the umem's whole range. start, end and cookie are unused. Always
+ * returns 0.
+ */
+static int ib_umem_notifier_release_trampoline(struct ib_umem *item, u64 start,
+ u64 end, void *cookie) {
+ /*
+ * Increase the number of notifiers running, to
+ * prevent any further fault handling on this MR.
+ */
+ ib_umem_notifier_start_account(item);
+ item->odp_data->dying = 1;
+ /* Make sure that the fact the umem is dying is out before we release
+ * all pending page faults. */
+ smp_wmb();
+ complete_all(&item->odp_data->notifier_completion);
+ item->context->invalidate_range(item, ib_umem_start(item),
+ ib_umem_end(item));
+ return 0;
+}
+
+/*
+ * mmu_notifier .release callback: run the release trampoline on every
+ * umem in the context's interval tree.  The context-level notifier count
+ * is incremented here and not decremented again by this function.
+ */
+static void ib_umem_notifier_release(struct mmu_notifier *mn,
+				     struct mm_struct *mm)
+{
+	struct ib_ucontext *ctx = container_of(mn, struct ib_ucontext, mn);
+
+	/* Nothing to do when no invalidate_range callback is installed. */
+	if (ctx->invalidate_range) {
+		ib_ucontext_notifier_start_account(ctx);
+		down_read(&ctx->umem_rwsem);
+		rbt_ib_umem_for_each_in_range(&ctx->umem_tree, 0, ULLONG_MAX,
+					      ib_umem_notifier_release_trampoline,
+					      NULL);
+		up_read(&ctx->umem_rwsem);
+	}
+}
+
+/*
+ * Per-umem callback: invalidate the single page starting at start,
+ * bracketed by the umem's notifier start/end accounting. end and cookie
+ * are unused. Always returns 0.
+ */
+static int invalidate_page_trampoline(struct ib_umem *item, u64 start,
+ u64 end, void *cookie)
+{
+ ib_umem_notifier_start_account(item);
+ item->context->invalidate_range(item, start, start + PAGE_SIZE);
+ ib_umem_notifier_end_account(item);
+ return 0;
+}
+
+/*
+ * mmu_notifier .invalidate_page callback: invalidate the page at @address
+ * in every umem of the context that overlaps it, with context-level
+ * notifier accounting around the walk.
+ */
+static void ib_umem_notifier_invalidate_page(struct mmu_notifier *mn,
+					     struct mm_struct *mm,
+					     unsigned long address)
+{
+	struct ib_ucontext *ctx = container_of(mn, struct ib_ucontext, mn);
+
+	/* Nothing to do when no invalidate_range callback is installed. */
+	if (ctx->invalidate_range) {
+		ib_ucontext_notifier_start_account(ctx);
+
+		down_read(&ctx->umem_rwsem);
+		rbt_ib_umem_for_each_in_range(&ctx->umem_tree, address,
+					      address + PAGE_SIZE,
+					      invalidate_page_trampoline, NULL);
+		up_read(&ctx->umem_rwsem);
+
+		/* Called after dropping umem_rwsem, as this helper requires. */
+		ib_ucontext_notifier_end_account(ctx);
+	}
+}
+
+/*
+ * Per-umem callback for the start of a range invalidation: take the umem's
+ * notifier start accounting and invalidate [start, end). No end accounting
+ * is done here. cookie is unused. Always returns 0.
+ */
+static int invalidate_range_start_trampoline(struct ib_umem *item, u64 start,
+ u64 end, void *cookie)
+{
+ ib_umem_notifier_start_account(item);
+ item->context->invalidate_range(item, start, end);
+ return 0;
+}
+
+/*
+ * mmu_notifier .invalidate_range_start callback: start notifier accounting
+ * on the context and invalidate [start, end) in every overlapping umem.
+ * The matching end accounting is not done in this function.
+ */
+static void ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
+						    struct mm_struct *mm,
+						    unsigned long start,
+						    unsigned long end)
+{
+	struct ib_ucontext *ctx = container_of(mn, struct ib_ucontext, mn);
+
+	/* Nothing to do when no invalidate_range callback is installed. */
+	if (ctx->invalidate_range) {
+		ib_ucontext_notifier_start_account(ctx);
+
+		down_read(&ctx->umem_rwsem);
+		rbt_ib_umem_for_each_in_range(&ctx->umem_tree, start, end,
+					      invalidate_range_start_trampoline,
+					      NULL);
+		up_read(&ctx->umem_rwsem);
+	}
+}
+
+/*
+ * Per-umem callback for the end of a range invalidation: only performs the
+ * umem's notifier end accounting. start, end and cookie are unused.
+ * Always returns 0.
+ */
+static int invalidate_range_end_trampoline(struct ib_umem *item, u64 start,
+ u64 end, void *cookie)
+{
+ ib_umem_notifier_end_account(item);
+ return 0;
+}
+
+/*
+ * mmu_notifier .invalidate_range_end callback: finish notifier accounting
+ * on every umem overlapping [start, end), then on the context itself.
+ */
+static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn,
+						  struct mm_struct *mm,
+						  unsigned long start,
+						  unsigned long end)
+{
+	struct ib_ucontext *ctx = container_of(mn, struct ib_ucontext, mn);
+
+	/* Nothing to do when no invalidate_range callback is installed. */
+	if (ctx->invalidate_range) {
+		down_read(&ctx->umem_rwsem);
+		rbt_ib_umem_for_each_in_range(&ctx->umem_tree, start, end,
+					      invalidate_range_end_trampoline,
+					      NULL);
+		up_read(&ctx->umem_rwsem);
+
+		/* Called after dropping umem_rwsem, as this helper requires. */
+		ib_ucontext_notifier_end_account(ctx);
+	}
+}
+
+/* MMU notifier callbacks installed on context->mn by ib_umem_odp_get(). */
+static struct mmu_notifier_ops ib_umem_notifiers = {
+ .release = ib_umem_notifier_release,
+ .invalidate_page = ib_umem_notifier_invalidate_page,
+ .invalidate_range_start = ib_umem_notifier_invalidate_range_start,
+ .invalidate_range_end = ib_umem_notifier_invalidate_range_end,
+};
+
+/*
+ * Set up the on-demand-paging state of a umem: allocate odp_data and its
+ * page_list/dma_list arrays, insert the umem into the context's interval
+ * tree and, for the first ODP umem of the context, register the MMU
+ * notifier on the current mm.
+ *
+ * Returns 0 on success, -EINVAL if the current task has no mm or is not
+ * the context's owning process, -ENOMEM on allocation failure, or -EBUSY
+ * if the MMU notifier could not be registered.
+ */
+int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem)
+{
+ int ret_val;
+ struct pid *our_pid;
+ struct mm_struct *mm = get_task_mm(current);
+
+ if (!mm)
+ return -EINVAL;
+
+ /* Prevent creating ODP MRs in child processes */
+ rcu_read_lock();
+ our_pid = get_task_pid(current->group_leader, PIDTYPE_PID);
+ rcu_read_unlock();
+ /* The reference is dropped at once; our_pid is only compared below. */
+ put_pid(our_pid);
+ if (context->tgid != our_pid) {
+ ret_val = -EINVAL;
+ goto out_mm;
+ }
+
+ umem->hugetlb = 0;
+ umem->odp_data = kzalloc(sizeof(*umem->odp_data), GFP_KERNEL);
+ if (!umem->odp_data) {
+ ret_val = -ENOMEM;
+ goto out_mm;
+ }
+ umem->odp_data->umem = umem;
+
+ mutex_init(&umem->odp_data->umem_mutex);
+
+ init_completion(&umem->odp_data->notifier_completion);
+
+ umem->odp_data->page_list = vzalloc(ib_umem_num_pages(umem) *
+ sizeof(*umem->odp_data->page_list));
+ if (!umem->odp_data->page_list) {
+ ret_val = -ENOMEM;
+ goto out_odp_data;
+ }
+
+ umem->odp_data->dma_list = vzalloc(ib_umem_num_pages(umem) *
+ sizeof(*umem->odp_data->dma_list));
+ if (!umem->odp_data->dma_list) {
+ ret_val = -ENOMEM;
+ goto out_page_list;
+ }
+
+ /*
+ * When using MMU notifiers, we will get a
+ * notification before the "current" task (and MM) is
+ * destroyed. We use the umem_rwsem semaphore to synchronize.
+ */
+ down_write(&context->umem_rwsem);
+ context->odp_mrs_count++;
+ if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
+ rbt_ib_umem_insert(&umem->odp_data->interval_tree,
+ &context->umem_tree);
+ /*
+ * Private counters can be enabled right away when no notifier is
+ * running or this is the context's first ODP umem; otherwise the umem
+ * is queued until ib_ucontext_notifier_end_account() enables them.
+ */
+ if (likely(!atomic_read(&context->notifier_count)) ||
+ context->odp_mrs_count == 1)
+ umem->odp_data->mn_counters_active = true;
+ else
+ list_add(&umem->odp_data->no_private_counters,
+ &context->no_private_counters);
+ downgrade_write(&context->umem_rwsem);
+
+ if (context->odp_mrs_count == 1) {
+ /*
+ * Note that at this point, no MMU notifier is running
+ * for this context!
+ */
+ atomic_set(&context->notifier_count, 0);
+ INIT_HLIST_NODE(&context->mn.hlist);
+ context->mn.ops = &ib_umem_notifiers;
+ /*
+ * Lock-dep detects a false positive for mmap_sem vs.
+ * umem_rwsem, due to not grasping downgrade_write correctly.
+ */
+ lockdep_off();
+ ret_val = mmu_notifier_register(&context->mn, mm);
+ lockdep_on();
+ if (ret_val) {
+ pr_err("Failed to register mmu_notifier %d\n", ret_val);
+ ret_val = -EBUSY;
+ goto out_mutex;
+ }
+ }
+
+ up_read(&context->umem_rwsem);
+
+ /*
+ * Note that doing an mmput can cause a notifier for the relevant mm.
+ * If the notifier is called while we hold the umem_rwsem, this will
+ * cause a deadlock. Therefore, we release the reference only after we
+ * released the semaphore.
+ */
+ mmput(mm);
+ return 0;
+
+out_mutex:
+ /*
+ * NOTE(review): this path does not appear to undo the odp_mrs_count
+ * increment or the interval tree insertion done above - confirm
+ * whether callers can observe the stale state.
+ */
+ up_read(&context->umem_rwsem);
+ vfree(umem->odp_data->dma_list);
+out_page_list:
+ vfree(umem->odp_data->page_list);
+out_odp_data:
+ kfree(umem->odp_data);
+out_mm:
+ mmput(mm);
+ return ret_val;
+}
+
+/*
+ * Tear down an ODP umem: unmap all of its pages, remove it from the
+ * context's interval tree, unregister the MMU notifier when this was the
+ * context's last ODP umem, and free the umem together with its odp_data.
+ */
+void ib_umem_odp_release(struct ib_umem *umem)
+{
+ struct ib_ucontext *context = umem->context;
+
+ /*
+ * Ensure that no more pages are mapped in the umem.
+ *
+ * It is the driver's responsibility to ensure, before calling us,
+ * that the hardware will not attempt to access the MR any more.
+ */
+ ib_umem_odp_unmap_dma_pages(umem, ib_umem_start(umem),
+ ib_umem_end(umem));
+
+ down_write(&context->umem_rwsem);
+ if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
+ rbt_ib_umem_remove(&umem->odp_data->interval_tree,
+ &context->umem_tree);
+ context->odp_mrs_count--;
+ /* A umem still waiting for private counters sits on the context list. */
+ if (!umem->odp_data->mn_counters_active) {
+ list_del(&umem->odp_data->no_private_counters);
+ complete_all(&umem->odp_data->notifier_completion);
+ }
+
+ /*
+ * Downgrade the lock to a read lock. This ensures that the notifiers
+ * (who take the semaphore for reading) will be able to finish, and we
+ * will be able to eventually obtain the mmu notifiers SRCU. Note
+ * that since we are doing it atomically, no other user could register
+ * and unregister while we do the check.
+ */
+ downgrade_write(&context->umem_rwsem);
+ if (!context->odp_mrs_count) {
+ struct task_struct *owning_process = NULL;
+ struct mm_struct *owning_mm = NULL;
+
+ owning_process = get_pid_task(context->tgid,
+ PIDTYPE_PID);
+ if (owning_process == NULL)
+ /*
+ * The process is already dead, notifiers were removed
+ * already.
+ */
+ goto out;
+
+ owning_mm = get_task_mm(owning_process);
+ if (owning_mm == NULL)
+ /*
+ * The process' mm is already dead, notifiers were
+ * removed already.
+ */
+ goto out_put_task;
+ mmu_notifier_unregister(&context->mn, owning_mm);
+
+ mmput(owning_mm);
+
+out_put_task:
+ put_task_struct(owning_process);
+ }
+out:
+ up_read(&context->umem_rwsem);
+
+ vfree(umem->odp_data->dma_list);
+ vfree(umem->odp_data->page_list);
+ kfree(umem->odp_data);
+ kfree(umem);
+}
+
+/*
+ * Map for DMA and insert a single page into the on-demand paging page tables.
+ *
+ * @umem: the umem to insert the page to.
+ * @page_index: index in the umem to add the page to, counted in pages from
+ *              the start of the umem.
+ * @base_virt_addr: page-aligned address the caller's mapping request
+ *                  started at. No longer used by this function; kept so
+ *                  the caller's argument list stays unchanged.
+ * @page: the page struct to map and add.
+ * @access_mask: access permissions needed for this page.
+ * @current_seq: sequence number for synchronization with invalidations.
+ *               the sequence number is taken from
+ *               umem->odp_data->notifiers_seq.
+ *
+ * The function returns -EFAULT if the DMA mapping operation fails. It returns
+ * -EAGAIN if a concurrent invalidation prevents us from updating the page,
+ * or if a conflicting existing mapping had to be invalidated.
+ *
+ * The page is released via put_page even if the operation failed. For
+ * on-demand pinning, the page is released whenever it isn't stored in the
+ * umem.
+ */
+static int ib_umem_odp_map_dma_single_page(
+		struct ib_umem *umem,
+		int page_index,
+		u64 base_virt_addr,
+		struct page *page,
+		u64 access_mask,
+		unsigned long current_seq)
+{
+	struct ib_device *dev = umem->context->device;
+	dma_addr_t dma_addr;
+	int stored_page = 0;
+	int remove_existing_mapping = 0;
+	int ret = 0;
+
+	/*
+	 * Note: we avoid writing if seq is different from the initial seq, to
+	 * handle case of a racing notifier. This check also allows us to bail
+	 * early if we have a notifier running in parallel with us.
+	 */
+	if (ib_umem_mmu_notifier_retry(umem, current_seq)) {
+		ret = -EAGAIN;
+		goto out;
+	}
+	if (!(umem->odp_data->dma_list[page_index])) {
+		/* No mapping yet for this slot: create one and record it. */
+		dma_addr = ib_dma_map_page(dev,
+					   page,
+					   0, PAGE_SIZE,
+					   DMA_BIDIRECTIONAL);
+		if (ib_dma_mapping_error(dev, dma_addr)) {
+			ret = -EFAULT;
+			goto out;
+		}
+		umem->odp_data->dma_list[page_index] = dma_addr | access_mask;
+		umem->odp_data->page_list[page_index] = page;
+		stored_page = 1;
+	} else if (umem->odp_data->page_list[page_index] == page) {
+		/* Same page already mapped: just widen the permissions. */
+		umem->odp_data->dma_list[page_index] |= access_mask;
+	} else {
+		pr_err("error: got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n",
+		       umem->odp_data->page_list[page_index], page);
+		/* Better remove the mapping now, to prevent any further
+		 * damage. */
+		remove_existing_mapping = 1;
+	}
+
+out:
+	/* On Demand Paging - avoid pinning the page */
+	if (umem->context->invalidate_range || !stored_page)
+		put_page(page);
+
+	if (remove_existing_mapping && umem->context->invalidate_range) {
+		/*
+		 * page_index is relative to the start of the umem, so the
+		 * address to invalidate has to be derived from
+		 * ib_umem_start(). Deriving it from base_virt_addr would
+		 * target the wrong page whenever the request did not begin
+		 * at the first page of the umem.
+		 */
+		u64 page_addr = ib_umem_start(umem) +
+				(u64)page_index * PAGE_SIZE;
+
+		invalidate_page_trampoline(umem, page_addr,
+					   page_addr + PAGE_SIZE, NULL);
+		ret = -EAGAIN;
+	}
+
+	return ret;
+}
+
+/**
+ * ib_umem_odp_map_dma_pages - Pin and DMA map userspace memory in an ODP MR.
+ *
+ * Pins the range of pages passed in the argument, and maps them to
+ * DMA addresses. The DMA addresses of the mapped pages are updated in
+ * umem->odp_data->dma_list.
+ *
+ * Returns the number of pages mapped on success, negative error code
+ * for failure.
+ * An -EAGAIN error code is returned when a concurrent mmu notifier prevents
+ * the function from completing its task.
+ *
+ * @umem: the umem to map and pin
+ * @user_virt: the address from which we need to map.
+ * @bcnt: the minimal number of bytes to pin and map. The mapping might be
+ * bigger due to alignment, and may also be smaller in case of an error
+ * pinning or mapping a page. The actual pages mapped is returned in
+ * the return value.
+ * @access_mask: bit mask of the requested access permissions for the given
+ * range.
+ * @current_seq: the MMU notifiers sequence value for synchronization with
+ * invalidations. the sequence number is read from
+ * umem->odp_data->notifiers_seq before calling this function
+ */
+int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt,
+ u64 access_mask, unsigned long current_seq)
+{
+ struct task_struct *owning_process = NULL;
+ struct mm_struct *owning_mm = NULL;
+ struct page **local_page_list = NULL;
+ u64 off;
+ int j, k, ret = 0, start_idx, npages = 0;
+ u64 base_virt_addr;
+
+ if (access_mask == 0)
+ return -EINVAL;
+
+ /* The requested range must lie entirely inside the umem. */
+ if (user_virt < ib_umem_start(umem) ||
+ user_virt + bcnt > ib_umem_end(umem))
+ return -EFAULT;
+
+ /* One page of struct page pointers, reused for every batch below. */
+ local_page_list = (struct page **)__get_free_page(GFP_KERNEL);
+ if (!local_page_list)
+ return -ENOMEM;
+
+ off = user_virt & (~PAGE_MASK);
+ user_virt = user_virt & PAGE_MASK;
+ base_virt_addr = user_virt;
+ bcnt += off; /* Charge for the first page offset as well. */
+
+ owning_process = get_pid_task(umem->context->tgid, PIDTYPE_PID);
+ if (owning_process == NULL) {
+ ret = -EINVAL;
+ goto out_no_task;
+ }
+
+ owning_mm = get_task_mm(owning_process);
+ if (owning_mm == NULL) {
+ ret = -EINVAL;
+ goto out_put_task;
+ }
+
+ /* k indexes page_list/dma_list, counted from the start of the umem. */
+ start_idx = (user_virt - ib_umem_start(umem)) >> PAGE_SHIFT;
+ k = start_idx;
+
+ while (bcnt > 0) {
+ const size_t gup_num_pages =
+ min_t(size_t, ALIGN(bcnt, PAGE_SIZE) / PAGE_SIZE,
+ PAGE_SIZE / sizeof(struct page *));
+
+ down_read(&owning_mm->mmap_sem);
+ /*
+ * Note: this might result in redundant page getting. We can
+ * avoid this by checking dma_list to be 0 before calling
+ * get_user_pages. However, this makes the code much more
+ * complex (and doesn't gain us much performance in most use
+ * cases).
+ */
+ npages = get_user_pages(owning_process, owning_mm, user_virt,
+ gup_num_pages,
+ access_mask & ODP_WRITE_ALLOWED_BIT, 0,
+ local_page_list, NULL);
+ up_read(&owning_mm->mmap_sem);
+
+ if (npages < 0)
+ break;
+
+ bcnt -= min_t(size_t, npages << PAGE_SHIFT, bcnt);
+ user_virt += npages << PAGE_SHIFT;
+ mutex_lock(&umem->odp_data->umem_mutex);
+ for (j = 0; j < npages; ++j) {
+ ret = ib_umem_odp_map_dma_single_page(
+ umem, k, base_virt_addr, local_page_list[j],
+ access_mask, current_seq);
+ if (ret < 0)
+ break;
+ k++;
+ }
+ mutex_unlock(&umem->odp_data->umem_mutex);
+
+ if (ret < 0) {
+ /* Release left over pages when handling errors. */
+ /* Page j itself was already released by the helper. */
+ for (++j; j < npages; ++j)
+ put_page(local_page_list[j]);
+ break;
+ }
+ }
+
+ /*
+ * Without a mapping error, report the number of pages mapped, unless
+ * get_user_pages() failed before anything was mapped.
+ */
+ if (ret >= 0) {
+ if (npages < 0 && k == start_idx)
+ ret = npages;
+ else
+ ret = k - start_idx;
+ }
+
+ mmput(owning_mm);
+out_put_task:
+ put_task_struct(owning_process);
+out_no_task:
+ free_page((unsigned long)local_page_list);
+ return ret;
+}
+EXPORT_SYMBOL(ib_umem_odp_map_dma_pages);
+
+/*
+ * Unmap from DMA every currently mapped page of the umem that lies in
+ * [virt, bound), clamped to the umem's own range. Pages that were mapped
+ * with write permission are marked dirty.
+ */
+void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 virt,
+ u64 bound)
+{
+ int idx;
+ u64 addr;
+ struct ib_device *dev = umem->context->device;
+
+ virt = max_t(u64, virt, ib_umem_start(umem));
+ bound = min_t(u64, bound, ib_umem_end(umem));
+ /* Note that during the run of this function, the
+ * notifiers_count of the MR is > 0, preventing any racing
+ * faults from completion. We might be racing with other
+ * invalidations, so we must make sure we free each page only
+ * once. */
+ mutex_lock(&umem->odp_data->umem_mutex);
+ for (addr = virt; addr < bound; addr += (u64)umem->page_size) {
+ idx = (addr - ib_umem_start(umem)) / PAGE_SIZE;
+ if (umem->odp_data->page_list[idx]) {
+ struct page *page = umem->odp_data->page_list[idx];
+ dma_addr_t dma = umem->odp_data->dma_list[idx];
+ /* dma_list entries carry access bits next to the address. */
+ dma_addr_t dma_addr = dma & ODP_DMA_ADDR_MASK;
+
+ WARN_ON(!dma_addr);
+
+ ib_dma_unmap_page(dev, dma_addr, PAGE_SIZE,
+ DMA_BIDIRECTIONAL);
+ if (dma & ODP_WRITE_ALLOWED_BIT) {
+ struct page *head_page = compound_head(page);
+ /*
+ * set_page_dirty prefers being called with
+ * the page lock. However, MMU notifiers are
+ * called sometimes with and sometimes without
+ * the lock. We rely on the umem_mutex instead
+ * to prevent other mmu notifiers from
+ * continuing and allowing the page mapping to
+ * be removed.
+ */
+ set_page_dirty(head_page);
+ }
+ /* on demand pinning support */
+ if (!umem->context->invalidate_range)
+ put_page(page);
+ /* Clear the slot so a racing invalidation skips this page. */
+ umem->odp_data->page_list[idx] = NULL;
+ umem->odp_data->dma_list[idx] = 0;
+ }
+ }
+ mutex_unlock(&umem->odp_data->umem_mutex);
+}
+EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);
diff --git a/drivers/infiniband/core/umem_rbtree.c b/drivers/infiniband/core/umem_rbtree.c
new file mode 100644
index 000000000..727d78844
--- /dev/null
+++ b/drivers/infiniband/core/umem_rbtree.c
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2014 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/interval_tree_generic.h>
+#include <linux/sched.h>
+#include <linux/gfp.h>
+#include <rdma/ib_umem_odp.h>
+
+/*
+ * The ib_umem list keeps track of memory regions for which the HW
+ * device requests to receive a notification when the related memory
+ * mapping is changed.
+ *
+ * ib_umem_lock protects the list.
+ */
+
+/* Interval-tree start key: first address of the umem that embeds @n. */
+static inline u64 node_start(struct umem_odp_node *n)
+{
+	return ib_umem_start(container_of(n, struct ib_umem_odp,
+					  interval_tree)->umem);
+}
+
+/*
+ * Interval-tree end key for the umem that embeds @n.
+ *
+ * The interval tree treats the end point as part of the interval,
+ * whereas ib_umem_end() yields the first address past the umem, so
+ * the key is ib_umem_end() minus one.
+ */
+static inline u64 node_last(struct umem_odp_node *n)
+{
+	return ib_umem_end(container_of(n, struct ib_umem_odp,
+					interval_tree)->umem) - 1;
+}
+
+/*
+ * Instantiate the generic interval tree for struct umem_odp_node, keyed
+ * by node_start()/node_last(). This provides the rbt_ib_umem_* helpers,
+ * such as rbt_ib_umem_iter_first()/rbt_ib_umem_iter_next() used below.
+ */
+INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last,
+ node_start, node_last, , rbt_ib_umem)
+
+/*
+ * Call @cb for every umem in @root overlapping [start, last).
+ *
+ * @last is exclusive here, while the tree stores inclusive end points
+ * (see node_last), hence the "last - 1" passed to the iterators.
+ * @cb receives the original @start/@last together with @cookie.
+ *
+ * Returns 0 when the range is empty or every callback returned 0, and
+ * 1 when at least one callback returned non-zero. A non-zero result
+ * does not stop the walk.
+ */
+int rbt_ib_umem_for_each_in_range(struct rb_root *root,
+				  u64 start, u64 last,
+				  umem_call_back cb,
+				  void *cookie)
+{
+	struct umem_odp_node *it;
+	int any_nonzero = 0;
+
+	if (unlikely(start == last))
+		return 0;
+
+	it = rbt_ib_umem_iter_first(root, start, last - 1);
+	while (it) {
+		struct ib_umem_odp *odp =
+			container_of(it, struct ib_umem_odp, interval_tree);
+
+		if (cb(odp->umem, start, last, cookie))
+			any_nonzero = 1;
+
+		it = rbt_ib_umem_iter_next(it, start, last - 1);
+	}
+
+	return any_nonzero;
+}
diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c
new file mode 100644
index 000000000..928cdd20e
--- /dev/null
+++ b/drivers/infiniband/core/user_mad.c
@@ -0,0 +1,1390 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2008 Cisco. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#define pr_fmt(fmt) "user_mad: " fmt
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/fs.h>
+#include <linux/cdev.h>
+#include <linux/dma-mapping.h>
+#include <linux/poll.h>
+#include <linux/mutex.h>
+#include <linux/kref.h>
+#include <linux/compat.h>
+#include <linux/sched.h>
+#include <linux/semaphore.h>
+#include <linux/slab.h>
+
+#include <asm/uaccess.h>
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_user_mad.h>
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("InfiniBand userspace MAD packet access");
+MODULE_LICENSE("Dual BSD/GPL");
+
+/* Compile-time limits and the fixed character-device numbering. */
+enum {
+ /* number of bits in the dev_map and overflow_map bitmaps */
+ IB_UMAD_MAX_PORTS = 64,
+ /* number of agent slots in each struct ib_umad_file */
+ IB_UMAD_MAX_AGENTS = 32,
+
+ /* major/minor combined into base_dev via MKDEV() */
+ IB_UMAD_MAJOR = 231,
+ IB_UMAD_MINOR_BASE = 0
+};
+
+/*
+ * Our lifetime rules for these structs are the following:
+ * each time a device special file is opened, we take a reference
+ * on the ib_umad_port's struct ib_umad_device. We drop these
+ * references in the corresponding close().
+ *
+ * In addition to references coming from open character devices, there
+ * is one more reference to each ib_umad_device representing the
+ * module's reference taken when allocating the ib_umad_device in
+ * ib_umad_add_one().
+ *
+ * When destroying an ib_umad_device, we drop the module's reference.
+ */
+
+/* Per-port state backing one "umad%d" and one "issm%d" character device. */
+struct ib_umad_port {
+ /* cdev and class device for "umad%d" (uses umad_fops) */
+ struct cdev cdev;
+ struct device *dev;
+
+ /* cdev and class device for "issm%d" (uses umad_sm_fops) */
+ struct cdev sm_cdev;
+ struct device *sm_dev;
+ /* taken in ib_umad_sm_open(), released in ib_umad_sm_close() */
+ struct semaphore sm_sem;
+
+ /* held around open/close and agent (un)registration on this port */
+ struct mutex file_mutex;
+ /* open files on this port, linked through ib_umad_file.port_list */
+ struct list_head file_list;
+
+ /* underlying IB device; open and agent registration fail when NULL */
+ struct ib_device *ib_dev;
+ /* owning device; its kobj is referenced by every open file */
+ struct ib_umad_device *umad_dev;
+ /* number used in the "umad%d"/"issm%d" device names */
+ int dev_num;
+ u8 port_num;
+};
+
+/*
+ * Per-IB-device state. Its lifetime is reference counted through @kobj;
+ * ib_umad_release_dev() frees the structure.
+ *
+ * @port is a trailing variable-length array of per-port state
+ * (presumably one entry per port in [start_port, end_port] - confirm
+ * against the allocation site). It is declared as a C99 flexible array
+ * member rather than a GNU zero-length array, so sizeof() and the
+ * compiler's bounds checking treat it correctly; the layout is
+ * unchanged.
+ */
+struct ib_umad_device {
+	int start_port, end_port;
+	struct kobject kobj;
+	struct ib_umad_port port[];
+};
+
+/* State of one open "umad%d" file. */
+struct ib_umad_file {
+ /* held around recv_list and agent[] accesses (see __get_agent) */
+ struct mutex mutex;
+ struct ib_umad_port *port;
+ /* packets waiting to be returned by read() */
+ struct list_head recv_list;
+ /* packets posted by write() and not yet completed */
+ struct list_head send_list;
+ /* link in port->file_list */
+ struct list_head port_list;
+ /* held around send_list updates */
+ spinlock_t send_lock;
+ /* woken whenever a packet is added to recv_list */
+ wait_queue_head_t recv_wait;
+ struct ib_mad_agent *agent[IB_UMAD_MAX_AGENTS];
+ /* once set, __get_agent() returns NULL for every slot */
+ int agents_dead;
+ /* selects struct ib_user_mad_hdr over ib_user_mad_hdr_old in hdr_size() */
+ u8 use_pkey_index;
+ /* set on the first agent registration; blocks a later ENABLE_PKEY */
+ u8 already_used;
+};
+
+/* One MAD travelling between user space and the MAD layer. */
+struct ib_umad_packet {
+ /* send buffer created in ib_umad_write() */
+ struct ib_mad_send_buf *msg;
+ /* set for received MADs; read() uses it to pick the copy routine */
+ struct ib_mad_recv_wc *recv_wc;
+ /* link in the file's recv_list or send_list */
+ struct list_head list;
+ /* number of MAD bytes, excluding the user header */
+ int length;
+ /* user-visible header followed by MAD data */
+ struct ib_user_mad mad;
+};
+
+/* device class under which the umad%d/issm%d devices are created */
+static struct class *umad_class;
+
+/* first device number of the statically assigned region */
+static const dev_t base_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE);
+
+/* port_lock is held while a bit is claimed in dev_map or overflow_map */
+static DEFINE_SPINLOCK(port_lock);
+static DECLARE_BITMAP(dev_map, IB_UMAD_MAX_PORTS);
+
+static void ib_umad_add_one(struct ib_device *device);
+static void ib_umad_remove_one(struct ib_device *device);
+
+/* kobject release callback: free the ib_umad_device embedding @kobj. */
+static void ib_umad_release_dev(struct kobject *kobj)
+{
+	kfree(container_of(kobj, struct ib_umad_device, kobj));
+}
+
+/* kobject type whose release callback frees the ib_umad_device */
+static struct kobj_type ib_umad_dev_ktype = {
+ .release = ib_umad_release_dev,
+};
+
+/*
+ * Size of the user-space MAD header used on @file: the current
+ * struct ib_user_mad_hdr when P_Key index support is enabled, the old
+ * layout otherwise.
+ */
+static int hdr_size(struct ib_umad_file *file)
+{
+	if (file->use_pkey_index)
+		return sizeof (struct ib_user_mad_hdr);
+
+	return sizeof (struct ib_user_mad_hdr_old);
+}
+
+/*
+ * Return the agent stored in slot @id of @file, or NULL once the
+ * file's agents have been marked dead. @id is not range checked here.
+ *
+ * caller must hold file->mutex
+ */
+static struct ib_mad_agent *__get_agent(struct ib_umad_file *file, int id)
+{
+	if (file->agents_dead)
+		return NULL;
+
+	return file->agent[id];
+}
+
+/*
+ * Append @packet to the receive list of @file and wake up readers.
+ *
+ * The agent table is searched for @agent; the index of the matching
+ * slot is left in packet->mad.hdr.id, which doubles as the loop
+ * counter. Returns 0 if the packet was queued, 1 if @agent is not (or
+ * no longer) registered on @file, in which case the caller keeps
+ * ownership of @packet.
+ */
+static int queue_packet(struct ib_umad_file *file,
+ struct ib_mad_agent *agent,
+ struct ib_umad_packet *packet)
+{
+ int ret = 1;
+
+ mutex_lock(&file->mutex);
+
+ for (packet->mad.hdr.id = 0;
+ packet->mad.hdr.id < IB_UMAD_MAX_AGENTS;
+ packet->mad.hdr.id++)
+ if (agent == __get_agent(file, packet->mad.hdr.id)) {
+ list_add_tail(&packet->list, &file->recv_list);
+ wake_up_interruptible(&file->recv_wait);
+ ret = 0;
+ break;
+ }
+
+ mutex_unlock(&file->mutex);
+
+ return ret;
+}
+
+/* Unlink @packet from the list it is on, holding file->send_lock. */
+static void dequeue_send(struct ib_umad_file *file,
+ struct ib_umad_packet *packet)
+{
+ spin_lock_irq(&file->send_lock);
+ list_del(&packet->list);
+ spin_unlock_irq(&file->send_lock);
+}
+
+/*
+ * Send-completion callback passed to ib_register_mad_agent().
+ *
+ * Removes the packet from the send list and releases its address
+ * handle and send buffer. If the send failed with
+ * IB_WC_RESP_TIMEOUT_ERR, the packet is queued on the receive list
+ * with status ETIMEDOUT and a length of IB_MGMT_MAD_HDR so a reader
+ * sees the timeout; in every other case, or when queueing fails, the
+ * packet is freed.
+ */
+static void send_handler(struct ib_mad_agent *agent,
+ struct ib_mad_send_wc *send_wc)
+{
+ struct ib_umad_file *file = agent->context;
+ struct ib_umad_packet *packet = send_wc->send_buf->context[0];
+
+ dequeue_send(file, packet);
+ ib_destroy_ah(packet->msg->ah);
+ ib_free_send_mad(packet->msg);
+
+ if (send_wc->status == IB_WC_RESP_TIMEOUT_ERR) {
+ packet->length = IB_MGMT_MAD_HDR;
+ packet->mad.hdr.status = ETIMEDOUT;
+ /* on success the receive list owns the packet */
+ if (!queue_packet(file, agent, packet))
+ return;
+ }
+ kfree(packet);
+}
+
+/*
+ * Receive callback passed to ib_register_mad_agent().
+ *
+ * For a successfully received MAD, allocates a packet, fills the user
+ * header from the work completion (and from the GRH when present) and
+ * queues it for read(). If the completion status is not
+ * IB_WC_SUCCESS, the allocation fails, or the packet cannot be
+ * queued, the received MAD is released with ib_free_recv_mad().
+ */
+static void recv_handler(struct ib_mad_agent *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_umad_file *file = agent->context;
+ struct ib_umad_packet *packet;
+
+ if (mad_recv_wc->wc->status != IB_WC_SUCCESS)
+ goto err1;
+
+ packet = kzalloc(sizeof *packet, GFP_KERNEL);
+ if (!packet)
+ goto err1;
+
+ packet->length = mad_recv_wc->mad_len;
+ packet->recv_wc = mad_recv_wc;
+
+ packet->mad.hdr.status = 0;
+ packet->mad.hdr.length = hdr_size(file) + mad_recv_wc->mad_len;
+ packet->mad.hdr.qpn = cpu_to_be32(mad_recv_wc->wc->src_qp);
+ packet->mad.hdr.lid = cpu_to_be16(mad_recv_wc->wc->slid);
+ packet->mad.hdr.sl = mad_recv_wc->wc->sl;
+ packet->mad.hdr.path_bits = mad_recv_wc->wc->dlid_path_bits;
+ packet->mad.hdr.pkey_index = mad_recv_wc->wc->pkey_index;
+ packet->mad.hdr.grh_present = !!(mad_recv_wc->wc->wc_flags & IB_WC_GRH);
+ if (packet->mad.hdr.grh_present) {
+ struct ib_ah_attr ah_attr;
+
+ /* derive the GRH fields reported to user space from the WC */
+ ib_init_ah_from_wc(agent->device, agent->port_num,
+ mad_recv_wc->wc, mad_recv_wc->recv_buf.grh,
+ &ah_attr);
+
+ packet->mad.hdr.gid_index = ah_attr.grh.sgid_index;
+ packet->mad.hdr.hop_limit = ah_attr.grh.hop_limit;
+ packet->mad.hdr.traffic_class = ah_attr.grh.traffic_class;
+ memcpy(packet->mad.hdr.gid, &ah_attr.grh.dgid, 16);
+ packet->mad.hdr.flow_label = cpu_to_be32(ah_attr.grh.flow_label);
+ }
+
+ if (queue_packet(file, agent, packet))
+ goto err2;
+ return;
+
+err2:
+ kfree(packet);
+err1:
+ ib_free_recv_mad(mad_recv_wc);
+}
+
+/*
+ * Copy a received MAD (user header plus data) to the user buffer.
+ *
+ * @buf/@count describe the user buffer passed to read().
+ *
+ * Returns the number of bytes of the whole message (header + data) on
+ * success, -EINVAL if the buffer cannot hold even the first segment,
+ * -ENOSPC if the message spans several segments and the buffer is too
+ * small for all of them (the header and first segment have already
+ * been copied at that point), or -EFAULT on a faulting copy.
+ */
+static ssize_t copy_recv_mad(struct ib_umad_file *file, char __user *buf,
+ struct ib_umad_packet *packet, size_t count)
+{
+ struct ib_mad_recv_buf *recv_buf;
+ int left, seg_payload, offset, max_seg_payload;
+
+ /* We need enough room to copy the first (or only) MAD segment. */
+ recv_buf = &packet->recv_wc->recv_buf;
+ if ((packet->length <= sizeof (*recv_buf->mad) &&
+ count < hdr_size(file) + packet->length) ||
+ (packet->length > sizeof (*recv_buf->mad) &&
+ count < hdr_size(file) + sizeof (*recv_buf->mad)))
+ return -EINVAL;
+
+ if (copy_to_user(buf, &packet->mad, hdr_size(file)))
+ return -EFAULT;
+
+ buf += hdr_size(file);
+ seg_payload = min_t(int, packet->length, sizeof (*recv_buf->mad));
+ if (copy_to_user(buf, recv_buf->mad, seg_payload))
+ return -EFAULT;
+
+ if (seg_payload < packet->length) {
+ /*
+ * Multipacket RMPP MAD message. Copy remainder of message.
+ * Note that last segment may have a shorter payload.
+ */
+ if (count < hdr_size(file) + packet->length) {
+ /*
+ * The buffer is too small, return the first RMPP segment,
+ * which includes the RMPP message length.
+ */
+ return -ENOSPC;
+ }
+ offset = ib_get_mad_data_offset(recv_buf->mad->mad_hdr.mgmt_class);
+ max_seg_payload = sizeof (struct ib_mad) - offset;
+
+ /* follow-on segments contribute only the bytes after @offset */
+ for (left = packet->length - seg_payload, buf += seg_payload;
+ left; left -= seg_payload, buf += seg_payload) {
+ recv_buf = container_of(recv_buf->list.next,
+ struct ib_mad_recv_buf, list);
+ seg_payload = min(left, max_seg_payload);
+ if (copy_to_user(buf, ((void *) recv_buf->mad) + offset,
+ seg_payload))
+ return -EFAULT;
+ }
+ }
+ return hdr_size(file) + packet->length;
+}
+
+/*
+ * Copy a packet that originated from a send (one with no recv_wc) to
+ * the user buffer: first the user header, then packet->length bytes
+ * of MAD data.
+ *
+ * Returns the total number of bytes written, -EINVAL if @count is too
+ * small for header plus data, or -EFAULT on a faulting copy.
+ */
+static ssize_t copy_send_mad(struct ib_umad_file *file, char __user *buf,
+			     struct ib_umad_packet *packet, size_t count)
+{
+	int hdr_bytes = hdr_size(file);
+	ssize_t total = hdr_bytes + packet->length;
+
+	if (count < total)
+		return -EINVAL;
+
+	if (copy_to_user(buf, &packet->mad, hdr_bytes))
+		return -EFAULT;
+
+	if (copy_to_user(buf + hdr_bytes, packet->mad.data, packet->length))
+		return -EFAULT;
+
+	return total;
+}
+
+/*
+ * read() handler: hand the oldest queued packet to user space.
+ *
+ * Sleeps until a packet is available unless the file is O_NONBLOCK.
+ * Returns the number of bytes copied, -EINVAL if @count is smaller
+ * than the user header, -EAGAIN when non-blocking and nothing is
+ * queued, -ERESTARTSYS if the wait was interrupted, or the error from
+ * the copy routine. On a copy error the packet is put back at the
+ * head of the receive list so it can be read again.
+ */
+static ssize_t ib_umad_read(struct file *filp, char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct ib_umad_file *file = filp->private_data;
+ struct ib_umad_packet *packet;
+ ssize_t ret;
+
+ if (count < hdr_size(file))
+ return -EINVAL;
+
+ mutex_lock(&file->mutex);
+
+ /* the mutex is dropped while sleeping and the list re-checked after */
+ while (list_empty(&file->recv_list)) {
+ mutex_unlock(&file->mutex);
+
+ if (filp->f_flags & O_NONBLOCK)
+ return -EAGAIN;
+
+ if (wait_event_interruptible(file->recv_wait,
+ !list_empty(&file->recv_list)))
+ return -ERESTARTSYS;
+
+ mutex_lock(&file->mutex);
+ }
+
+ packet = list_entry(file->recv_list.next, struct ib_umad_packet, list);
+ list_del(&packet->list);
+
+ mutex_unlock(&file->mutex);
+
+ /* packets without recv_wc come from send_handler() */
+ if (packet->recv_wc)
+ ret = copy_recv_mad(file, buf, packet, count);
+ else
+ ret = copy_send_mad(file, buf, packet, count);
+
+ if (ret < 0) {
+ /* Requeue packet */
+ mutex_lock(&file->mutex);
+ list_add(&packet->list, &file->recv_list);
+ mutex_unlock(&file->mutex);
+ } else {
+ if (packet->recv_wc)
+ ib_free_recv_mad(packet->recv_wc);
+ kfree(packet);
+ }
+ return ret;
+}
+
+/*
+ * Fill an RMPP send buffer from user memory.
+ *
+ * @buf points at the start of the MAD in the user buffer (the caller
+ * has already skipped the user header). Any class-specific header
+ * beyond IB_MGMT_RMPP_HDR is copied first, then the data is copied
+ * one segment at a time into the buffers returned by
+ * ib_get_rmpp_segment().
+ *
+ * Returns 0 on success or -EFAULT on a faulting copy.
+ */
+static int copy_rmpp_mad(struct ib_mad_send_buf *msg, const char __user *buf)
+{
+ int left, seg;
+
+ /* Copy class specific header */
+ if ((msg->hdr_len > IB_MGMT_RMPP_HDR) &&
+ copy_from_user(msg->mad + IB_MGMT_RMPP_HDR, buf + IB_MGMT_RMPP_HDR,
+ msg->hdr_len - IB_MGMT_RMPP_HDR))
+ return -EFAULT;
+
+ /* All headers are in place. Copy data segments. */
+ for (seg = 1, left = msg->data_len, buf += msg->hdr_len; left > 0;
+ seg++, left -= msg->seg_size, buf += msg->seg_size) {
+ /* the final segment may be shorter than seg_size */
+ if (copy_from_user(ib_get_rmpp_segment(msg, seg), buf,
+ min(left, msg->seg_size)))
+ return -EFAULT;
+ }
+ return 0;
+}
+
+/*
+ * Tell whether two user MAD headers address the same destination.
+ *
+ * Headers that both carry a GRH are compared by their 16-byte GID,
+ * headers that both lack one are compared by LID, and a mix of the
+ * two never matches. Returns non-zero on a match.
+ */
+static int same_destination(struct ib_user_mad_hdr *hdr1,
+			    struct ib_user_mad_hdr *hdr2)
+{
+	if (hdr1->grh_present) {
+		if (!hdr2->grh_present)
+			return 0;
+
+		return memcmp(hdr1->gid, hdr2->gid, 16) == 0;
+	}
+
+	if (hdr2->grh_present)
+		return 0;
+
+	return hdr1->lid == hdr2->lid;
+}
+
+/*
+ * Check whether @packet duplicates a MAD still on the file's send list.
+ *
+ * Only entries with the same TID and management class are considered.
+ * Two requests with that key are always duplicates; two responses are
+ * duplicates only when they go to the same destination; a request and
+ * a response never are. Returns 1 for a duplicate, 0 otherwise.
+ *
+ * The visible caller, ib_umad_write(), calls this with
+ * file->send_lock held.
+ */
+static int is_duplicate(struct ib_umad_file *file,
+ struct ib_umad_packet *packet)
+{
+ struct ib_umad_packet *sent_packet;
+ struct ib_mad_hdr *sent_hdr, *hdr;
+
+ hdr = (struct ib_mad_hdr *) packet->mad.data;
+ list_for_each_entry(sent_packet, &file->send_list, list) {
+ sent_hdr = (struct ib_mad_hdr *) sent_packet->mad.data;
+
+ if ((hdr->tid != sent_hdr->tid) ||
+ (hdr->mgmt_class != sent_hdr->mgmt_class))
+ continue;
+
+ /*
+ * No need to be overly clever here. If two new operations have
+ * the same TID, reject the second as a duplicate. This is more
+ * restrictive than required by the spec.
+ */
+ if (!ib_response_mad((struct ib_mad *) hdr)) {
+ if (!ib_response_mad((struct ib_mad *) sent_hdr))
+ return 1;
+ continue;
+ } else if (!ib_response_mad((struct ib_mad *) sent_hdr))
+ continue;
+
+ /* both are responses: compare where they are going */
+ if (same_destination(&packet->mad.hdr, &sent_packet->mad.hdr))
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * write() handler: post one MAD supplied by user space.
+ *
+ * The user buffer holds a user MAD header (its size depends on
+ * hdr_size()) followed by the MAD itself. The header selects the agent
+ * (hdr.id) and describes the destination; from it an address handle
+ * and a send buffer are created, the MAD is copied in, and the result
+ * is posted with ib_post_send_mad().
+ *
+ * Returns @count on success. Errors: -EINVAL for a buffer shorter than
+ * header + IB_MGMT_RMPP_HDR, an out-of-range or unregistered agent id,
+ * or a duplicate of a MAD still in flight; -ENOMEM; -EFAULT; or the
+ * error from ib_create_ah(), ib_create_send_mad() or
+ * ib_post_send_mad().
+ */
+static ssize_t ib_umad_write(struct file *filp, const char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct ib_umad_file *file = filp->private_data;
+ struct ib_umad_packet *packet;
+ struct ib_mad_agent *agent;
+ struct ib_ah_attr ah_attr;
+ struct ib_ah *ah;
+ struct ib_rmpp_mad *rmpp_mad;
+ __be64 *tid;
+ int ret, data_len, hdr_len, copy_offset, rmpp_active;
+
+ if (count < hdr_size(file) + IB_MGMT_RMPP_HDR)
+ return -EINVAL;
+
+ /* room for the user header plus the first IB_MGMT_RMPP_HDR MAD bytes */
+ packet = kzalloc(sizeof *packet + IB_MGMT_RMPP_HDR, GFP_KERNEL);
+ if (!packet)
+ return -ENOMEM;
+
+ if (copy_from_user(&packet->mad, buf, hdr_size(file))) {
+ ret = -EFAULT;
+ goto err;
+ }
+
+ if (packet->mad.hdr.id >= IB_UMAD_MAX_AGENTS) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ /* from here on buf points at the MAD itself */
+ buf += hdr_size(file);
+
+ if (copy_from_user(packet->mad.data, buf, IB_MGMT_RMPP_HDR)) {
+ ret = -EFAULT;
+ goto err;
+ }
+
+ mutex_lock(&file->mutex);
+
+ agent = __get_agent(file, packet->mad.hdr.id);
+ if (!agent) {
+ ret = -EINVAL;
+ goto err_up;
+ }
+
+ memset(&ah_attr, 0, sizeof ah_attr);
+ ah_attr.dlid = be16_to_cpu(packet->mad.hdr.lid);
+ ah_attr.sl = packet->mad.hdr.sl;
+ ah_attr.src_path_bits = packet->mad.hdr.path_bits;
+ ah_attr.port_num = file->port->port_num;
+ if (packet->mad.hdr.grh_present) {
+ ah_attr.ah_flags = IB_AH_GRH;
+ memcpy(ah_attr.grh.dgid.raw, packet->mad.hdr.gid, 16);
+ ah_attr.grh.sgid_index = packet->mad.hdr.gid_index;
+ ah_attr.grh.flow_label = be32_to_cpu(packet->mad.hdr.flow_label);
+ ah_attr.grh.hop_limit = packet->mad.hdr.hop_limit;
+ ah_attr.grh.traffic_class = packet->mad.hdr.traffic_class;
+ }
+
+ ah = ib_create_ah(agent->qp->pd, &ah_attr);
+ if (IS_ERR(ah)) {
+ ret = PTR_ERR(ah);
+ goto err_up;
+ }
+
+ rmpp_mad = (struct ib_rmpp_mad *) packet->mad.data;
+ hdr_len = ib_get_mad_data_offset(rmpp_mad->mad_hdr.mgmt_class);
+
+ /*
+ * copy_offset is how much of the MAD is taken from the bytes already
+ * read into packet->mad.data rather than re-read from user space.
+ */
+ if (ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class)
+ && ib_mad_kernel_rmpp_agent(agent)) {
+ copy_offset = IB_MGMT_RMPP_HDR;
+ rmpp_active = ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) &
+ IB_MGMT_RMPP_FLAG_ACTIVE;
+ } else {
+ copy_offset = IB_MGMT_MAD_HDR;
+ rmpp_active = 0;
+ }
+
+ /*
+ * NOTE(review): only count >= hdr_size + IB_MGMT_RMPP_HDR was checked
+ * above, so data_len can be negative when hdr_len for the class is
+ * larger; presumably ib_create_send_mad() copes - confirm.
+ */
+ data_len = count - hdr_size(file) - hdr_len;
+ packet->msg = ib_create_send_mad(agent,
+ be32_to_cpu(packet->mad.hdr.qpn),
+ packet->mad.hdr.pkey_index, rmpp_active,
+ hdr_len, data_len, GFP_KERNEL);
+ if (IS_ERR(packet->msg)) {
+ ret = PTR_ERR(packet->msg);
+ goto err_ah;
+ }
+
+ packet->msg->ah = ah;
+ packet->msg->timeout_ms = packet->mad.hdr.timeout_ms;
+ packet->msg->retries = packet->mad.hdr.retries;
+ /* send_handler() recovers the packet from context[0] */
+ packet->msg->context[0] = packet;
+
+ /* Copy MAD header. Any RMPP header is already in place. */
+ memcpy(packet->msg->mad, packet->mad.data, IB_MGMT_MAD_HDR);
+
+ if (!rmpp_active) {
+ if (copy_from_user(packet->msg->mad + copy_offset,
+ buf + copy_offset,
+ hdr_len + data_len - copy_offset)) {
+ ret = -EFAULT;
+ goto err_msg;
+ }
+ } else {
+ ret = copy_rmpp_mad(packet->msg, buf);
+ if (ret)
+ goto err_msg;
+ }
+
+ /*
+ * Set the high-order part of the transaction ID to make MADs from
+ * different agents unique, and allow routing responses back to the
+ * original requestor.
+ */
+ if (!ib_response_mad(packet->msg->mad)) {
+ tid = &((struct ib_mad_hdr *) packet->msg->mad)->tid;
+ *tid = cpu_to_be64(((u64) agent->hi_tid) << 32 |
+ (be64_to_cpup(tid) & 0xffffffff));
+ /* keep the local copy in sync; is_duplicate() compares it */
+ rmpp_mad->mad_hdr.tid = *tid;
+ }
+
+ /*
+ * Active RMPP MADs sent through an agent that does not use kernel
+ * RMPP are queued without the duplicate check.
+ */
+ if (!ib_mad_kernel_rmpp_agent(agent)
+ && ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class)
+ && (ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE)) {
+ spin_lock_irq(&file->send_lock);
+ list_add_tail(&packet->list, &file->send_list);
+ spin_unlock_irq(&file->send_lock);
+ } else {
+ spin_lock_irq(&file->send_lock);
+ ret = is_duplicate(file, packet);
+ if (!ret)
+ list_add_tail(&packet->list, &file->send_list);
+ spin_unlock_irq(&file->send_lock);
+ if (ret) {
+ ret = -EINVAL;
+ goto err_msg;
+ }
+ }
+
+ ret = ib_post_send_mad(packet->msg, NULL);
+ if (ret)
+ goto err_send;
+
+ mutex_unlock(&file->mutex);
+ return count;
+
+ /* unwind in reverse order of acquisition */
+err_send:
+ dequeue_send(file, packet);
+err_msg:
+ ib_free_send_mad(packet->msg);
+err_ah:
+ ib_destroy_ah(ah);
+err_up:
+ mutex_unlock(&file->mutex);
+err:
+ kfree(packet);
+ return ret;
+}
+
+/*
+ * poll() handler. The file always reports writable; it additionally
+ * reports readable while the receive list is non-empty.
+ */
+static unsigned int ib_umad_poll(struct file *filp, struct poll_table_struct *wait)
+{
+	struct ib_umad_file *file = filp->private_data;
+	unsigned int mask;
+
+	poll_wait(filp, &file->recv_wait, wait);
+
+	/* we will always be able to post a MAD send */
+	mask = POLLOUT | POLLWRNORM;
+	if (!list_empty(&file->recv_list))
+		mask |= POLLIN | POLLRDNORM;
+
+	return mask;
+}
+
+/*
+ * IB_USER_MAD_REGISTER_AGENT: register a MAD agent on behalf of user
+ * space and store it in the first free agent slot of @file.
+ *
+ * @arg: user pointer to a struct ib_user_mad_reg_req; the chosen slot
+ * index is written back to its id field.
+ * @compat_method_mask: non-zero when called from the compat ioctl; the
+ * user method_mask is then read as pairs of u32 that are combined
+ * into each element of req.method_mask.
+ *
+ * Returns 0 on success, -EPIPE if the port has no IB device, -EFAULT
+ * on a faulting user access, -EINVAL for a QPN other than 0 or 1,
+ * -ENOMEM when all IB_UMAD_MAX_AGENTS slots are taken, or the error
+ * from ib_register_mad_agent().
+ */
+static int ib_umad_reg_agent(struct ib_umad_file *file, void __user *arg,
+ int compat_method_mask)
+{
+ struct ib_user_mad_reg_req ureq;
+ struct ib_mad_reg_req req;
+ struct ib_mad_agent *agent = NULL;
+ int agent_id;
+ int ret;
+
+ mutex_lock(&file->port->file_mutex);
+ mutex_lock(&file->mutex);
+
+ if (!file->port->ib_dev) {
+ dev_notice(file->port->dev,
+ "ib_umad_reg_agent: invalid device\n");
+ ret = -EPIPE;
+ goto out;
+ }
+
+ if (copy_from_user(&ureq, arg, sizeof ureq)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (ureq.qpn != 0 && ureq.qpn != 1) {
+ dev_notice(file->port->dev,
+ "ib_umad_reg_agent: invalid QPN %d specified\n",
+ ureq.qpn);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* pick the first unused slot */
+ for (agent_id = 0; agent_id < IB_UMAD_MAX_AGENTS; ++agent_id)
+ if (!__get_agent(file, agent_id))
+ goto found;
+
+ dev_notice(file->port->dev,
+ "ib_umad_reg_agent: Max Agents (%u) reached\n",
+ IB_UMAD_MAX_AGENTS);
+ ret = -ENOMEM;
+ goto out;
+
+found:
+ /* a zero mgmt_class registers without a registration request */
+ if (ureq.mgmt_class) {
+ memset(&req, 0, sizeof(req));
+ req.mgmt_class = ureq.mgmt_class;
+ req.mgmt_class_version = ureq.mgmt_class_version;
+ memcpy(req.oui, ureq.oui, sizeof req.oui);
+
+ if (compat_method_mask) {
+ u32 *umm = (u32 *) ureq.method_mask;
+ int i;
+
+ for (i = 0; i < BITS_TO_LONGS(IB_MGMT_MAX_METHODS); ++i)
+ req.method_mask[i] =
+ umm[i * 2] | ((u64) umm[i * 2 + 1] << 32);
+ } else
+ memcpy(req.method_mask, ureq.method_mask,
+ sizeof req.method_mask);
+ }
+
+ /* QPN 0 selects the SMI QP type, QPN 1 the GSI QP type */
+ agent = ib_register_mad_agent(file->port->ib_dev, file->port->port_num,
+ ureq.qpn ? IB_QPT_GSI : IB_QPT_SMI,
+ ureq.mgmt_class ? &req : NULL,
+ ureq.rmpp_version,
+ send_handler, recv_handler, file, 0);
+ if (IS_ERR(agent)) {
+ ret = PTR_ERR(agent);
+ agent = NULL;
+ goto out;
+ }
+
+ if (put_user(agent_id,
+ (u32 __user *) (arg + offsetof(struct ib_user_mad_reg_req, id)))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (!file->already_used) {
+ file->already_used = 1;
+ if (!file->use_pkey_index) {
+ dev_warn(file->port->dev,
+ "process %s did not enable P_Key index support.\n",
+ current->comm);
+ dev_warn(file->port->dev,
+ " Documentation/infiniband/user_mad.txt has info on the new ABI.\n");
+ }
+ }
+
+ file->agent[agent_id] = agent;
+ ret = 0;
+
+out:
+ mutex_unlock(&file->mutex);
+
+ /* the agent is unregistered only after file->mutex is dropped */
+ if (ret && agent)
+ ib_unregister_mad_agent(agent);
+
+ mutex_unlock(&file->port->file_mutex);
+
+ return ret;
+}
+
+/*
+ * IB_USER_MAD_REGISTER_AGENT2: register a MAD agent described by a
+ * struct ib_user_mad_reg_req2 and store it in the first free agent
+ * slot of @file. The slot index is written back to the id field of
+ * the user structure.
+ *
+ * Unlike ib_umad_reg_agent(), this variant passes registration flags
+ * to the MAD layer, takes the OUI as a 24-bit integer, and turns on
+ * P_Key index support when it is the first registration on the file.
+ *
+ * Returns 0 on success, -EPIPE if the port has no IB device, -EFAULT
+ * on a faulting user access, -EINVAL for a bad QPN, unsupported flags
+ * (the supported mask is written back to the flags field) or an OUI
+ * with bits above 24 set, -ENOMEM when all slots are taken, or the
+ * error from ib_register_mad_agent().
+ */
+static int ib_umad_reg_agent2(struct ib_umad_file *file, void __user *arg)
+{
+ struct ib_user_mad_reg_req2 ureq;
+ struct ib_mad_reg_req req;
+ struct ib_mad_agent *agent = NULL;
+ int agent_id;
+ int ret;
+
+ mutex_lock(&file->port->file_mutex);
+ mutex_lock(&file->mutex);
+
+ if (!file->port->ib_dev) {
+ dev_notice(file->port->dev,
+ "ib_umad_reg_agent2: invalid device\n");
+ ret = -EPIPE;
+ goto out;
+ }
+
+ if (copy_from_user(&ureq, arg, sizeof(ureq))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (ureq.qpn != 0 && ureq.qpn != 1) {
+ dev_notice(file->port->dev,
+ "ib_umad_reg_agent2: invalid QPN %d specified\n",
+ ureq.qpn);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (ureq.flags & ~IB_USER_MAD_REG_FLAGS_CAP) {
+ dev_notice(file->port->dev,
+ "ib_umad_reg_agent2 failed: invalid registration flags specified 0x%x; supported 0x%x\n",
+ ureq.flags, IB_USER_MAD_REG_FLAGS_CAP);
+ ret = -EINVAL;
+
+ /* report the supported flags back to the caller */
+ if (put_user((u32)IB_USER_MAD_REG_FLAGS_CAP,
+ (u32 __user *) (arg + offsetof(struct
+ ib_user_mad_reg_req2, flags))))
+ ret = -EFAULT;
+
+ goto out;
+ }
+
+ /* pick the first unused slot */
+ for (agent_id = 0; agent_id < IB_UMAD_MAX_AGENTS; ++agent_id)
+ if (!__get_agent(file, agent_id))
+ goto found;
+
+ dev_notice(file->port->dev,
+ "ib_umad_reg_agent2: Max Agents (%u) reached\n",
+ IB_UMAD_MAX_AGENTS);
+ ret = -ENOMEM;
+ goto out;
+
+found:
+ if (ureq.mgmt_class) {
+ memset(&req, 0, sizeof(req));
+ req.mgmt_class = ureq.mgmt_class;
+ req.mgmt_class_version = ureq.mgmt_class_version;
+ if (ureq.oui & 0xff000000) {
+ dev_notice(file->port->dev,
+ "ib_umad_reg_agent2 failed: oui invalid 0x%08x\n",
+ ureq.oui);
+ ret = -EINVAL;
+ goto out;
+ }
+ /* split the 24-bit OUI into bytes, most significant first */
+ req.oui[2] = ureq.oui & 0x0000ff;
+ req.oui[1] = (ureq.oui & 0x00ff00) >> 8;
+ req.oui[0] = (ureq.oui & 0xff0000) >> 16;
+ memcpy(req.method_mask, ureq.method_mask,
+ sizeof(req.method_mask));
+ }
+
+ /* QPN 0 selects the SMI QP type, QPN 1 the GSI QP type */
+ agent = ib_register_mad_agent(file->port->ib_dev, file->port->port_num,
+ ureq.qpn ? IB_QPT_GSI : IB_QPT_SMI,
+ ureq.mgmt_class ? &req : NULL,
+ ureq.rmpp_version,
+ send_handler, recv_handler, file,
+ ureq.flags);
+ if (IS_ERR(agent)) {
+ ret = PTR_ERR(agent);
+ agent = NULL;
+ goto out;
+ }
+
+ if (put_user(agent_id,
+ (u32 __user *)(arg +
+ offsetof(struct ib_user_mad_reg_req2, id)))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (!file->already_used) {
+ file->already_used = 1;
+ file->use_pkey_index = 1;
+ }
+
+ file->agent[agent_id] = agent;
+ ret = 0;
+
+out:
+ mutex_unlock(&file->mutex);
+
+ /* the agent is unregistered only after file->mutex is dropped */
+ if (ret && agent)
+ ib_unregister_mad_agent(agent);
+
+ mutex_unlock(&file->port->file_mutex);
+
+ return ret;
+}
+
+
+/*
+ * IB_USER_MAD_UNREGISTER_AGENT: unregister the agent whose slot index
+ * is read from the u32 at @arg.
+ *
+ * The slot is cleared under file->mutex and the agent is unregistered
+ * after that mutex has been released, while port->file_mutex is still
+ * held.
+ *
+ * Returns 0 on success, -EFAULT if the id cannot be read, or -EINVAL
+ * if the id is out of range or the slot holds no live agent.
+ */
+static int ib_umad_unreg_agent(struct ib_umad_file *file, u32 __user *arg)
+{
+ struct ib_mad_agent *agent = NULL;
+ u32 id;
+ int ret = 0;
+
+ if (get_user(id, arg))
+ return -EFAULT;
+
+ mutex_lock(&file->port->file_mutex);
+ mutex_lock(&file->mutex);
+
+ if (id >= IB_UMAD_MAX_AGENTS || !__get_agent(file, id)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ agent = file->agent[id];
+ file->agent[id] = NULL;
+
+out:
+ mutex_unlock(&file->mutex);
+
+ if (agent)
+ ib_unregister_mad_agent(agent);
+
+ mutex_unlock(&file->port->file_mutex);
+
+ return ret;
+}
+
+/*
+ * IB_USER_MAD_ENABLE_PKEY: switch @file to the header layout that
+ * carries a P_Key index. This is only permitted before the file has
+ * been used (no agent registered yet); afterwards -EINVAL is returned.
+ */
+static long ib_umad_enable_pkey(struct ib_umad_file *file)
+{
+	long rc;
+
+	mutex_lock(&file->mutex);
+	if (!file->already_used) {
+		file->use_pkey_index = 1;
+		rc = 0;
+	} else {
+		rc = -EINVAL;
+	}
+	mutex_unlock(&file->mutex);
+
+	return rc;
+}
+
+/*
+ * Native ioctl entry point: dispatch @cmd to its handler, treating
+ * @arg as a user pointer. Unknown commands yield -ENOIOCTLCMD.
+ */
+static long ib_umad_ioctl(struct file *filp, unsigned int cmd,
+			  unsigned long arg)
+{
+	struct ib_umad_file *file = filp->private_data;
+	void __user *uarg = (void __user *) arg;
+
+	switch (cmd) {
+	case IB_USER_MAD_REGISTER_AGENT:
+		return ib_umad_reg_agent(file, uarg, 0);
+	case IB_USER_MAD_UNREGISTER_AGENT:
+		return ib_umad_unreg_agent(file, uarg);
+	case IB_USER_MAD_ENABLE_PKEY:
+		return ib_umad_enable_pkey(file);
+	case IB_USER_MAD_REGISTER_AGENT2:
+		return ib_umad_reg_agent2(file, uarg);
+	default:
+		return -ENOIOCTLCMD;
+	}
+}
+
+#ifdef CONFIG_COMPAT
+/*
+ * Compat ioctl entry point: same dispatch as ib_umad_ioctl(), except
+ * that @arg is converted with compat_ptr() and agent registration is
+ * told to use the compat method-mask layout.
+ */
+static long ib_umad_compat_ioctl(struct file *filp, unsigned int cmd,
+				 unsigned long arg)
+{
+	struct ib_umad_file *file = filp->private_data;
+	void __user *uarg = compat_ptr(arg);
+
+	switch (cmd) {
+	case IB_USER_MAD_REGISTER_AGENT:
+		return ib_umad_reg_agent(file, uarg, 1);
+	case IB_USER_MAD_UNREGISTER_AGENT:
+		return ib_umad_unreg_agent(file, uarg);
+	case IB_USER_MAD_ENABLE_PKEY:
+		return ib_umad_enable_pkey(file);
+	case IB_USER_MAD_REGISTER_AGENT2:
+		return ib_umad_reg_agent2(file, uarg);
+	default:
+		return -ENOIOCTLCMD;
+	}
+}
+#endif
+
+/*
+ * ib_umad_open() does not need the BKL:
+ *
+ * - the ib_umad_port structures are properly reference counted, and
+ * everything else is purely local to the file being created, so
+ * races against other open calls are not a problem;
+ * - the ioctl method does not affect any global state outside of the
+ * file structure being operated on.
+ *
+ * On success a new struct ib_umad_file is allocated, linked into the
+ * port's file_list and stored in filp->private_data, and a reference
+ * is taken on the owning ib_umad_device. Returns 0, -ENXIO if the
+ * port no longer has an IB device, -ENOMEM, or the error from
+ * nonseekable_open().
+ */
+static int ib_umad_open(struct inode *inode, struct file *filp)
+{
+ struct ib_umad_port *port;
+ struct ib_umad_file *file;
+ int ret = -ENXIO;
+
+ port = container_of(inode->i_cdev, struct ib_umad_port, cdev);
+
+ mutex_lock(&port->file_mutex);
+
+ if (!port->ib_dev)
+ goto out;
+
+ ret = -ENOMEM;
+ file = kzalloc(sizeof *file, GFP_KERNEL);
+ if (!file)
+ goto out;
+
+ mutex_init(&file->mutex);
+ spin_lock_init(&file->send_lock);
+ INIT_LIST_HEAD(&file->recv_list);
+ INIT_LIST_HEAD(&file->send_list);
+ init_waitqueue_head(&file->recv_wait);
+
+ file->port = port;
+ filp->private_data = file;
+
+ list_add_tail(&file->port_list, &port->file_list);
+
+ ret = nonseekable_open(inode, filp);
+ if (ret) {
+ /* undo the list insertion before freeing the file */
+ list_del(&file->port_list);
+ kfree(file);
+ goto out;
+ }
+
+ /* dropped again in ib_umad_close() */
+ kobject_get(&port->umad_dev->kobj);
+
+out:
+ mutex_unlock(&port->file_mutex);
+ return ret;
+}
+
+/*
+ * release() handler for the umad device.
+ *
+ * Marks the file's agents dead, frees every packet still on the
+ * receive list, unlinks the file from its port, unregisters the
+ * agents (unless they were already marked dead before this call),
+ * frees the file and drops the reference on the ib_umad_device taken
+ * in ib_umad_open(). Always returns 0.
+ */
+static int ib_umad_close(struct inode *inode, struct file *filp)
+{
+ struct ib_umad_file *file = filp->private_data;
+ /* saved up front: file is freed before the kobject_put() below */
+ struct ib_umad_device *dev = file->port->umad_dev;
+ struct ib_umad_packet *packet, *tmp;
+ int already_dead;
+ int i;
+
+ mutex_lock(&file->port->file_mutex);
+ mutex_lock(&file->mutex);
+
+ already_dead = file->agents_dead;
+ file->agents_dead = 1;
+
+ list_for_each_entry_safe(packet, tmp, &file->recv_list, list) {
+ if (packet->recv_wc)
+ ib_free_recv_mad(packet->recv_wc);
+ kfree(packet);
+ }
+
+ list_del(&file->port_list);
+
+ mutex_unlock(&file->mutex);
+
+ /* agents are unregistered without file->mutex held */
+ if (!already_dead)
+ for (i = 0; i < IB_UMAD_MAX_AGENTS; ++i)
+ if (file->agent[i])
+ ib_unregister_mad_agent(file->agent[i]);
+
+ mutex_unlock(&file->port->file_mutex);
+
+ kfree(file);
+ kobject_put(&dev->kobj);
+
+ return 0;
+}
+
+/* file operations of the "umad%d" character devices */
+static const struct file_operations umad_fops = {
+ .owner = THIS_MODULE,
+ .read = ib_umad_read,
+ .write = ib_umad_write,
+ .poll = ib_umad_poll,
+ .unlocked_ioctl = ib_umad_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = ib_umad_compat_ioctl,
+#endif
+ .open = ib_umad_open,
+ .release = ib_umad_close,
+ .llseek = no_llseek,
+};
+
+/*
+ * open() handler for the "issm%d" device.
+ *
+ * Acquires the port's sm_sem (without blocking when O_NONBLOCK is
+ * set), sets the IB_PORT_SM capability bit on the port through
+ * ib_modify_port() and takes a reference on the owning
+ * ib_umad_device. The semaphore stays held until ib_umad_sm_close().
+ *
+ * Returns 0 on success, -EAGAIN if non-blocking and the semaphore is
+ * busy, -ERESTARTSYS if the wait was interrupted, or the error from
+ * ib_modify_port() / nonseekable_open().
+ */
+static int ib_umad_sm_open(struct inode *inode, struct file *filp)
+{
+ struct ib_umad_port *port;
+ struct ib_port_modify props = {
+ .set_port_cap_mask = IB_PORT_SM
+ };
+ int ret;
+
+ port = container_of(inode->i_cdev, struct ib_umad_port, sm_cdev);
+
+ if (filp->f_flags & O_NONBLOCK) {
+ if (down_trylock(&port->sm_sem)) {
+ ret = -EAGAIN;
+ goto fail;
+ }
+ } else {
+ if (down_interruptible(&port->sm_sem)) {
+ ret = -ERESTARTSYS;
+ goto fail;
+ }
+ }
+
+ ret = ib_modify_port(port->ib_dev, port->port_num, 0, &props);
+ if (ret)
+ goto err_up_sem;
+
+ filp->private_data = port;
+
+ ret = nonseekable_open(inode, filp);
+ if (ret)
+ goto err_clr_sm_cap;
+
+ kobject_get(&port->umad_dev->kobj);
+
+ return 0;
+
+err_clr_sm_cap:
+ /* turn the "set" request into a "clear" request to undo it */
+ swap(props.set_port_cap_mask, props.clr_port_cap_mask);
+ ib_modify_port(port->ib_dev, port->port_num, 0, &props);
+
+err_up_sem:
+ up(&port->sm_sem);
+
+fail:
+ return ret;
+}
+
+/*
+ * release() handler for the "issm%d" device.
+ *
+ * Clears the IB_PORT_SM capability bit if the port still has an IB
+ * device, releases sm_sem and drops the device reference taken in
+ * ib_umad_sm_open(). Returns the result of ib_modify_port(), or 0 if
+ * it was not called.
+ */
+static int ib_umad_sm_close(struct inode *inode, struct file *filp)
+{
+ struct ib_umad_port *port = filp->private_data;
+ struct ib_port_modify props = {
+ .clr_port_cap_mask = IB_PORT_SM
+ };
+ int ret = 0;
+
+ /* ib_dev is checked under file_mutex before it is used */
+ mutex_lock(&port->file_mutex);
+ if (port->ib_dev)
+ ret = ib_modify_port(port->ib_dev, port->port_num, 0, &props);
+ mutex_unlock(&port->file_mutex);
+
+ up(&port->sm_sem);
+
+ kobject_put(&port->umad_dev->kobj);
+
+ return ret;
+}
+
+/* file operations of the "issm%d" character devices: open/close only */
+static const struct file_operations umad_sm_fops = {
+ .owner = THIS_MODULE,
+ .open = ib_umad_sm_open,
+ .release = ib_umad_sm_close,
+ .llseek = no_llseek,
+};
+
+/* IB client descriptor carrying this module's add/remove callbacks */
+static struct ib_client umad_client = {
+ .name = "umad",
+ .add = ib_umad_add_one,
+ .remove = ib_umad_remove_one
+};
+
+/*
+ * sysfs "ibdev" attribute: print the name of the IB device behind the
+ * port stored as driver data, or fail with -ENODEV when no port is
+ * attached to @dev.
+ */
+static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr,
+			  char *buf)
+{
+	struct ib_umad_port *port = dev_get_drvdata(dev);
+
+	if (port)
+		return sprintf(buf, "%s\n", port->ib_dev->name);
+
+	return -ENODEV;
+}
+static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
+
+/*
+ * sysfs "port" attribute: print the port number of the port stored as
+ * driver data, or fail with -ENODEV when no port is attached to @dev.
+ */
+static ssize_t show_port(struct device *dev, struct device_attribute *attr,
+			 char *buf)
+{
+	struct ib_umad_port *port = dev_get_drvdata(dev);
+
+	if (port)
+		return sprintf(buf, "%d\n", port->port_num);
+
+	return -ENODEV;
+}
+static DEVICE_ATTR(port, S_IRUGO, show_port, NULL);
+
+/* read-only class attribute exposing IB_USER_MAD_ABI_VERSION as a string */
+static CLASS_ATTR_STRING(abi_version, S_IRUGO,
+ __stringify(IB_USER_MAD_ABI_VERSION));
+
+/*
+ * Device numbers used once all IB_UMAD_MAX_PORTS bits of dev_map are
+ * taken: overflow_maj is the start of a dynamically allocated region
+ * and overflow_map records which of its slots are in use.
+ */
+static dev_t overflow_maj;
+static DECLARE_BITMAP(overflow_map, IB_UMAD_MAX_PORTS);
+/*
+ * Find a free slot in the overflow region, allocating the region on
+ * first use. @device is only used for the error message.
+ *
+ * Returns the index of the first clear bit in overflow_map, the
+ * negative error from alloc_chrdev_region(), or -1 when the map is
+ * full. The bit is not set here; the caller claims it.
+ *
+ * NOTE(review): the visible caller drops port_lock before calling
+ * this and retakes it before setting the bit, so two callers could
+ * presumably be handed the same index - confirm.
+ */
+static int find_overflow_devnum(struct ib_device *device)
+{
+ int ret;
+
+ if (!overflow_maj) {
+ /* twice MAX_PORTS minors: a umad and an issm node per slot */
+ ret = alloc_chrdev_region(&overflow_maj, 0, IB_UMAD_MAX_PORTS * 2,
+ "infiniband_mad");
+ if (ret) {
+ dev_err(&device->dev,
+ "couldn't register dynamic device number\n");
+ return ret;
+ }
+ }
+
+ ret = find_first_zero_bit(overflow_map, IB_UMAD_MAX_PORTS);
+ if (ret >= IB_UMAD_MAX_PORTS)
+ return -1;
+
+ return ret;
+}
+
+/*
+ * Set up one port of a umad device: reserve a device number, then create
+ * the "umad%d" and "issm%d" character devices, their class devices and
+ * the ibdev/port sysfs attributes on each.
+ *
+ * Returns 0 on success and -1 on any failure.  On failure after a device
+ * number was reserved, the error labels unwind in reverse order and the
+ * reserved bit is cleared again.
+ */
+static int ib_umad_init_port(struct ib_device *device, int port_num,
+ struct ib_umad_device *umad_dev,
+ struct ib_umad_port *port)
+{
+ int devnum;
+ dev_t base;
+
+ /*
+ * Prefer a slot in the statically registered range (dev_map); fall
+ * back to the overflow range when that one is full.
+ */
+ spin_lock(&port_lock);
+ devnum = find_first_zero_bit(dev_map, IB_UMAD_MAX_PORTS);
+ if (devnum >= IB_UMAD_MAX_PORTS) {
+ /*
+ * port_lock is dropped around find_overflow_devnum() --
+ * presumably because alloc_chrdev_region() may sleep; confirm.
+ */
+ spin_unlock(&port_lock);
+ devnum = find_overflow_devnum(device);
+ if (devnum < 0)
+ return -1;
+
+ spin_lock(&port_lock);
+ /* Overflow ports are numbered after the static ones. */
+ port->dev_num = devnum + IB_UMAD_MAX_PORTS;
+ base = devnum + overflow_maj;
+ set_bit(devnum, overflow_map);
+ } else {
+ port->dev_num = devnum;
+ base = devnum + base_dev;
+ set_bit(devnum, dev_map);
+ }
+ spin_unlock(&port_lock);
+
+ port->ib_dev = device;
+ port->port_num = port_num;
+ sema_init(&port->sm_sem, 1);
+ mutex_init(&port->file_mutex);
+ INIT_LIST_HEAD(&port->file_list);
+
+ /* umad character device, parented to the umad device kobject. */
+ cdev_init(&port->cdev, &umad_fops);
+ port->cdev.owner = THIS_MODULE;
+ port->cdev.kobj.parent = &umad_dev->kobj;
+ kobject_set_name(&port->cdev.kobj, "umad%d", port->dev_num);
+ if (cdev_add(&port->cdev, base, 1))
+ goto err_cdev;
+
+ port->dev = device_create(umad_class, device->dma_device,
+ port->cdev.dev, port,
+ "umad%d", port->dev_num);
+ if (IS_ERR(port->dev))
+ goto err_cdev;
+
+ if (device_create_file(port->dev, &dev_attr_ibdev))
+ goto err_dev;
+ if (device_create_file(port->dev, &dev_attr_port))
+ goto err_dev;
+
+ /*
+ * The issm node uses the device number IB_UMAD_MAX_PORTS above the
+ * umad node's, i.e. the second half of the 2 * IB_UMAD_MAX_PORTS
+ * region.
+ */
+ base += IB_UMAD_MAX_PORTS;
+ cdev_init(&port->sm_cdev, &umad_sm_fops);
+ port->sm_cdev.owner = THIS_MODULE;
+ port->sm_cdev.kobj.parent = &umad_dev->kobj;
+ kobject_set_name(&port->sm_cdev.kobj, "issm%d", port->dev_num);
+ if (cdev_add(&port->sm_cdev, base, 1))
+ goto err_sm_cdev;
+
+ port->sm_dev = device_create(umad_class, device->dma_device,
+ port->sm_cdev.dev, port,
+ "issm%d", port->dev_num);
+ if (IS_ERR(port->sm_dev))
+ goto err_sm_cdev;
+
+ if (device_create_file(port->sm_dev, &dev_attr_ibdev))
+ goto err_sm_dev;
+ if (device_create_file(port->sm_dev, &dev_attr_port))
+ goto err_sm_dev;
+
+ return 0;
+
+err_sm_dev:
+ device_destroy(umad_class, port->sm_cdev.dev);
+
+err_sm_cdev:
+ cdev_del(&port->sm_cdev);
+
+err_dev:
+ device_destroy(umad_class, port->cdev.dev);
+
+err_cdev:
+ cdev_del(&port->cdev);
+ /* devnum is the index within whichever bitmap it was taken from. */
+ if (port->dev_num < IB_UMAD_MAX_PORTS)
+ clear_bit(devnum, dev_map);
+ else
+ clear_bit(devnum, overflow_map);
+
+ return -1;
+}
+
+/*
+ * Tear down one port: remove its class devices and character devices,
+ * detach it from the IB device, mark the agents of every file on the
+ * port's file list as dead and unregister them, then release the port's
+ * device number in the bitmap it came from.
+ */
+static void ib_umad_kill_port(struct ib_umad_port *port)
+{
+ struct ib_umad_file *file;
+ int id;
+
+ /* With drvdata cleared, show_ibdev()/show_port() return -ENODEV. */
+ dev_set_drvdata(port->dev, NULL);
+ dev_set_drvdata(port->sm_dev, NULL);
+
+ device_destroy(umad_class, port->cdev.dev);
+ device_destroy(umad_class, port->sm_cdev.dev);
+
+ cdev_del(&port->cdev);
+ cdev_del(&port->sm_cdev);
+
+ mutex_lock(&port->file_mutex);
+
+ /* A NULL ib_dev makes ib_umad_sm_close() skip ib_modify_port(). */
+ port->ib_dev = NULL;
+
+ list_for_each_entry(file, &port->file_list, port_list) {
+ mutex_lock(&file->mutex);
+ file->agents_dead = 1;
+ mutex_unlock(&file->mutex);
+
+ /* Unregistration happens after file->mutex has been dropped. */
+ for (id = 0; id < IB_UMAD_MAX_AGENTS; ++id)
+ if (file->agent[id])
+ ib_unregister_mad_agent(file->agent[id]);
+ }
+
+ mutex_unlock(&port->file_mutex);
+
+ /* dev_num >= IB_UMAD_MAX_PORTS means the slot is in overflow_map. */
+ if (port->dev_num < IB_UMAD_MAX_PORTS)
+ clear_bit(port->dev_num, dev_map);
+ else
+ clear_bit(port->dev_num - IB_UMAD_MAX_PORTS, overflow_map);
+}
+
+/*
+ * IB client "add" callback.  Devices whose node type does not map to
+ * RDMA_TRANSPORT_IB are ignored.  Otherwise allocates an ib_umad_device
+ * with one ib_umad_port per port, initialises each port and stores the
+ * result as client data for the device.
+ *
+ * Failures are silent (the function returns void): if any port fails to
+ * initialise, the ports set up so far are killed and the kobject
+ * reference is dropped.
+ */
+static void ib_umad_add_one(struct ib_device *device)
+{
+ struct ib_umad_device *umad_dev;
+ int s, e, i;
+
+ if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+ return;
+
+ /* Switches use the single port 0; other nodes use 1..phys_port_cnt. */
+ if (device->node_type == RDMA_NODE_IB_SWITCH)
+ s = e = 0;
+ else {
+ s = 1;
+ e = device->phys_port_cnt;
+ }
+
+ /* The port array is allocated together with the device structure. */
+ umad_dev = kzalloc(sizeof *umad_dev +
+ (e - s + 1) * sizeof (struct ib_umad_port),
+ GFP_KERNEL);
+ if (!umad_dev)
+ return;
+
+ kobject_init(&umad_dev->kobj, &ib_umad_dev_ktype);
+
+ umad_dev->start_port = s;
+ umad_dev->end_port = e;
+
+ for (i = s; i <= e; ++i) {
+ umad_dev->port[i - s].umad_dev = umad_dev;
+
+ if (ib_umad_init_port(device, i, umad_dev,
+ &umad_dev->port[i - s]))
+ goto err;
+ }
+
+ ib_set_client_data(device, &umad_client, umad_dev);
+
+ return;
+
+err:
+ /* i is the port that failed; only the ports before it are killed. */
+ while (--i >= s)
+ ib_umad_kill_port(&umad_dev->port[i - s]);
+
+ kobject_put(&umad_dev->kobj);
+}
+
+/*
+ * IB client "remove" callback.  Kills every port of the umad device
+ * stored as client data for this IB device and drops a reference on its
+ * kobject.  Does nothing when no client data is set.
+ */
+static void ib_umad_remove_one(struct ib_device *device)
+{
+	struct ib_umad_device *umad_dev;
+	int idx;
+
+	umad_dev = ib_get_client_data(device, &umad_client);
+	if (!umad_dev)
+		return;
+
+	idx = 0;
+	while (idx <= umad_dev->end_port - umad_dev->start_port) {
+		ib_umad_kill_port(&umad_dev->port[idx]);
+		++idx;
+	}
+
+	kobject_put(&umad_dev->kobj);
+}
+
+/*
+ * Class devnode callback: returns a kasprintf()-allocated string that
+ * places the device node under "infiniband/".  @mode is left untouched.
+ */
+static char *umad_devnode(struct device *dev, umode_t *mode)
+{
+	const char *name = dev_name(dev);
+
+	return kasprintf(GFP_KERNEL, "infiniband/%s", name);
+}
+
+/*
+ * Module init: register the static chrdev region, create the
+ * "infiniband_mad" class with its abi_version attribute, and register
+ * as an IB client.
+ *
+ * Returns 0 on success or the error code of the failing step; steps
+ * already completed are undone through the out_* labels.
+ */
+static int __init ib_umad_init(void)
+{
+ int ret;
+
+ /* Twice IB_UMAD_MAX_PORTS: one umad and one issm node per port. */
+ ret = register_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2,
+ "infiniband_mad");
+ if (ret) {
+ pr_err("couldn't register device number\n");
+ goto out;
+ }
+
+ umad_class = class_create(THIS_MODULE, "infiniband_mad");
+ if (IS_ERR(umad_class)) {
+ ret = PTR_ERR(umad_class);
+ pr_err("couldn't create class infiniband_mad\n");
+ goto out_chrdev;
+ }
+
+ umad_class->devnode = umad_devnode;
+
+ ret = class_create_file(umad_class, &class_attr_abi_version.attr);
+ if (ret) {
+ pr_err("couldn't create abi_version attribute\n");
+ goto out_class;
+ }
+
+ ret = ib_register_client(&umad_client);
+ if (ret) {
+ pr_err("couldn't register ib_umad client\n");
+ goto out_class;
+ }
+
+ return 0;
+
+out_class:
+ class_destroy(umad_class);
+
+out_chrdev:
+ unregister_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2);
+
+out:
+ return ret;
+}
+
+/*
+ * Module exit: undo ib_umad_init() in reverse order, and also release
+ * the overflow chrdev region if find_overflow_devnum() ever allocated
+ * one.
+ */
+static void __exit ib_umad_cleanup(void)
+{
+ ib_unregister_client(&umad_client);
+ class_destroy(umad_class);
+ unregister_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2);
+ /* overflow_maj stays 0 unless the overflow region was allocated. */
+ if (overflow_maj)
+ unregister_chrdev_region(overflow_maj, IB_UMAD_MAX_PORTS * 2);
+}
+
+module_init(ib_umad_init);
+module_exit(ib_umad_cleanup);
diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
new file mode 100644
index 000000000..b716b0815
--- /dev/null
+++ b/drivers/infiniband/core/uverbs.h
@@ -0,0 +1,263 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef UVERBS_H
+#define UVERBS_H
+
+#include <linux/kref.h>
+#include <linux/idr.h>
+#include <linux/mutex.h>
+#include <linux/completion.h>
+#include <linux/cdev.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_user_verbs.h>
+
+/*
+ * Fill in an ib_udata: store ibuf/obuf cast to __user pointers and
+ * record the input and output lengths.  The udata argument is
+ * evaluated four times, so it must be free of side effects.
+ */
+#define INIT_UDATA(udata, ibuf, obuf, ilen, olen) \
+ do { \
+ (udata)->inbuf = (const void __user *) (ibuf); \
+ (udata)->outbuf = (void __user *) (obuf); \
+ (udata)->inlen = (ilen); \
+ (udata)->outlen = (olen); \
+ } while (0)
+
+/*
+ * Fill in an ib_udata, storing NULL instead of the buffer pointer when
+ * the corresponding length is zero.  udata is evaluated four times and
+ * ilen/olen twice each, so the arguments must be free of side effects.
+ */
+#define INIT_UDATA_BUF_OR_NULL(udata, ibuf, obuf, ilen, olen) \
+ do { \
+ (udata)->inbuf = (ilen) ? (const void __user *) (ibuf) : NULL; \
+ (udata)->outbuf = (olen) ? (void __user *) (obuf) : NULL; \
+ (udata)->inlen = (ilen); \
+ (udata)->outlen = (olen); \
+ } while (0)
+
+/*
+ * Our lifetime rules for these structs are the following:
+ *
+ * struct ib_uverbs_device: One reference is held by the module and
+ * released in ib_uverbs_remove_one(). Another reference is taken by
+ * ib_uverbs_open() each time the character special file is opened,
+ * and released in ib_uverbs_release_file() when the file is released.
+ *
+ * struct ib_uverbs_file: One reference is held by the VFS and
+ * released when the file is closed. Another reference is taken when
+ * an asynchronous event queue file is created and released when the
+ * event file is closed.
+ *
+ * struct ib_uverbs_event_file: One reference is held by the VFS and
+ * released when the file is closed. For asynchronous event files,
+ * another reference is held by the corresponding main context file
+ * and released when that file is closed. For completion event files,
+ * a reference is taken when a CQ is created that uses the file, and
+ * released when the CQ is destroyed.
+ */
+
+/*
+ * Per-device uverbs state.  The reference counting on ref follows the
+ * lifetime rules described in the comment above.
+ */
+struct ib_uverbs_device {
+ struct kref ref;
+ /* Reported to userspace in the get_context response. */
+ int num_comp_vectors;
+ struct completion comp;
+ struct device *dev;
+ struct ib_device *ib_dev;
+ int devnum;
+ struct cdev cdev;
+ /* rb-tree of XRC domains keyed by inode pointer (uverbs_cmd.c). */
+ struct rb_root xrcd_tree;
+ /* Taken by ib_uverbs_open_xrcd() before it accesses xrcd_tree. */
+ struct mutex xrcd_tree_mutex;
+};
+
+/*
+ * An event queue file, either asynchronous or completion (is_async).
+ * The reference counting on ref follows the lifetime rules described in
+ * the comment above.
+ */
+struct ib_uverbs_event_file {
+ struct kref ref;
+ int is_async;
+ struct ib_uverbs_file *uverbs_file;
+ /* NOTE(review): presumably protects event_list and is_closed -- confirm. */
+ spinlock_t lock;
+ int is_closed;
+ wait_queue_head_t poll_wait;
+ struct fasync_struct *async_queue;
+ struct list_head event_list;
+};
+
+/*
+ * State for one open uverbs file.  The reference counting on ref
+ * follows the lifetime rules described in the comment above.
+ */
+struct ib_uverbs_file {
+ struct kref ref;
+ /* Held while ucontext is set up and while its object lists change. */
+ struct mutex mutex;
+ struct ib_uverbs_device *device;
+ /* NULL until ib_uverbs_get_context() succeeds. */
+ struct ib_ucontext *ucontext;
+ struct ib_event_handler event_handler;
+ /* Set by ib_uverbs_get_context() to the new async event file. */
+ struct ib_uverbs_event_file *async_file;
+};
+
+/*
+ * One queued event.  desc holds either an asynchronous or a completion
+ * event descriptor; list and obj_list are list linkage.
+ * NOTE(review): counter presumably points at the owning object's
+ * events-reported count -- confirm against the code that queues events.
+ */
+struct ib_uverbs_event {
+ union {
+ struct ib_uverbs_async_event_desc async;
+ struct ib_uverbs_comp_event_desc comp;
+ } desc;
+ struct list_head list;
+ struct list_head obj_list;
+ u32 *counter;
+};
+
+/*
+ * List entry recording one multicast GID/LID pair.
+ * NOTE(review): presumably linked on ib_uqp_object.mcast_list -- confirm.
+ */
+struct ib_uverbs_mcast_entry {
+ struct list_head list;
+ union ib_gid gid;
+ u16 lid;
+};
+
+/* An ib_uobject extended with an event list and an events-reported count. */
+struct ib_uevent_object {
+ struct ib_uobject uobject;
+ struct list_head event_list;
+ u32 events_reported;
+};
+
+/* An ib_uobject for an XRC domain, extended with an atomic reference count. */
+struct ib_uxrcd_object {
+ struct ib_uobject uobject;
+ atomic_t refcnt;
+};
+
+/* SRQ user object: an event-carrying uobject plus a pointer to an XRCD object. */
+struct ib_usrq_object {
+ struct ib_uevent_object uevent;
+ struct ib_uxrcd_object *uxrcd;
+};
+
+/*
+ * QP user object: an event-carrying uobject plus a multicast list head
+ * and a pointer to an XRCD object.
+ */
+struct ib_uqp_object {
+ struct ib_uevent_object uevent;
+ struct list_head mcast_list;
+ struct ib_uxrcd_object *uxrcd;
+};
+
+/*
+ * CQ user object: a uobject with separate list heads and reported-event
+ * counters for completion and asynchronous events, and a back pointer
+ * to the uverbs file.
+ */
+struct ib_ucq_object {
+ struct ib_uobject uobject;
+ struct ib_uverbs_file *uverbs_file;
+ struct list_head comp_list;
+ struct list_head async_list;
+ u32 comp_events_reported;
+ u32 async_events_reported;
+};
+
+/*
+ * Handle tables, one idr per object type.  ib_uverbs_idr_lock protects
+ * all of them and must be held during every idr operation (see the
+ * locking comment in uverbs_cmd.c).
+ */
+extern spinlock_t ib_uverbs_idr_lock;
+extern struct idr ib_uverbs_pd_idr;
+extern struct idr ib_uverbs_mr_idr;
+extern struct idr ib_uverbs_mw_idr;
+extern struct idr ib_uverbs_ah_idr;
+extern struct idr ib_uverbs_cq_idr;
+extern struct idr ib_uverbs_qp_idr;
+extern struct idr ib_uverbs_srq_idr;
+extern struct idr ib_uverbs_xrcd_idr;
+extern struct idr ib_uverbs_rule_idr;
+
+/*
+ * Helpers shared between the uverbs source files.  idr_remove_uobj() is
+ * defined in uverbs_cmd.c and removes uobj->id from the given idr under
+ * ib_uverbs_idr_lock.
+ */
+void idr_remove_uobj(struct idr *idp, struct ib_uobject *uobj);
+
+/* Event file creation and lookup; is_async selects the file type. */
+struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
+ int is_async);
+struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd);
+
+void ib_uverbs_release_ucq(struct ib_uverbs_file *file,
+ struct ib_uverbs_event_file *ev_file,
+ struct ib_ucq_object *uobj);
+void ib_uverbs_release_uevent(struct ib_uverbs_file *file,
+ struct ib_uevent_object *uobj);
+
+/* Completion and event callbacks. */
+void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context);
+void ib_uverbs_cq_event_handler(struct ib_event *event, void *context_ptr);
+void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr);
+void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr);
+void ib_uverbs_event_handler(struct ib_event_handler *handler,
+ struct ib_event *event);
+void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev, struct ib_xrcd *xrcd);
+
+/*
+ * A flow specification of any supported type: a union of the eth, ipv4
+ * and tcp_udp layouts with a common header.  The anonymous struct
+ * shares storage with hdr, so type/size/reserved can be accessed
+ * directly (presumably mirroring the layout of hdr -- confirm against
+ * struct ib_uverbs_flow_spec_hdr).
+ */
+struct ib_uverbs_flow_spec {
+ union {
+ union {
+ struct ib_uverbs_flow_spec_hdr hdr;
+ struct {
+ __u32 type;
+ __u16 size;
+ __u16 reserved;
+ };
+ };
+ struct ib_uverbs_flow_spec_eth eth;
+ struct ib_uverbs_flow_spec_ipv4 ipv4;
+ struct ib_uverbs_flow_spec_tcp_udp tcp_udp;
+ };
+};
+
+/*
+ * Declares the prototype of the command handler ib_uverbs_<name>().
+ * Every handler takes the uverbs file, the user command buffer and the
+ * input/output lengths, and returns ssize_t.
+ */
+#define IB_UVERBS_DECLARE_CMD(name) \
+ ssize_t ib_uverbs_##name(struct ib_uverbs_file *file, \
+ const char __user *buf, int in_len, \
+ int out_len)
+
+IB_UVERBS_DECLARE_CMD(get_context);
+IB_UVERBS_DECLARE_CMD(query_device);
+IB_UVERBS_DECLARE_CMD(query_port);
+IB_UVERBS_DECLARE_CMD(alloc_pd);
+IB_UVERBS_DECLARE_CMD(dealloc_pd);
+IB_UVERBS_DECLARE_CMD(reg_mr);
+IB_UVERBS_DECLARE_CMD(rereg_mr);
+IB_UVERBS_DECLARE_CMD(dereg_mr);
+IB_UVERBS_DECLARE_CMD(alloc_mw);
+IB_UVERBS_DECLARE_CMD(dealloc_mw);
+IB_UVERBS_DECLARE_CMD(create_comp_channel);
+IB_UVERBS_DECLARE_CMD(create_cq);
+IB_UVERBS_DECLARE_CMD(resize_cq);
+IB_UVERBS_DECLARE_CMD(poll_cq);
+IB_UVERBS_DECLARE_CMD(req_notify_cq);
+IB_UVERBS_DECLARE_CMD(destroy_cq);
+IB_UVERBS_DECLARE_CMD(create_qp);
+IB_UVERBS_DECLARE_CMD(open_qp);
+IB_UVERBS_DECLARE_CMD(query_qp);
+IB_UVERBS_DECLARE_CMD(modify_qp);
+IB_UVERBS_DECLARE_CMD(destroy_qp);
+IB_UVERBS_DECLARE_CMD(post_send);
+IB_UVERBS_DECLARE_CMD(post_recv);
+IB_UVERBS_DECLARE_CMD(post_srq_recv);
+IB_UVERBS_DECLARE_CMD(create_ah);
+IB_UVERBS_DECLARE_CMD(destroy_ah);
+IB_UVERBS_DECLARE_CMD(attach_mcast);
+IB_UVERBS_DECLARE_CMD(detach_mcast);
+IB_UVERBS_DECLARE_CMD(create_srq);
+IB_UVERBS_DECLARE_CMD(modify_srq);
+IB_UVERBS_DECLARE_CMD(query_srq);
+IB_UVERBS_DECLARE_CMD(destroy_srq);
+IB_UVERBS_DECLARE_CMD(create_xsrq);
+IB_UVERBS_DECLARE_CMD(open_xrcd);
+IB_UVERBS_DECLARE_CMD(close_xrcd);
+
+/*
+ * Declares the prototype of the extended command handler
+ * ib_uverbs_ex_<name>().  Extended handlers receive two ib_udata
+ * descriptors (ucore and uhw) instead of a raw buffer, and return int.
+ */
+#define IB_UVERBS_DECLARE_EX_CMD(name) \
+ int ib_uverbs_ex_##name(struct ib_uverbs_file *file, \
+ struct ib_udata *ucore, \
+ struct ib_udata *uhw)
+
+IB_UVERBS_DECLARE_EX_CMD(create_flow);
+IB_UVERBS_DECLARE_EX_CMD(destroy_flow);
+IB_UVERBS_DECLARE_EX_CMD(query_device);
+
+#endif /* UVERBS_H */
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
new file mode 100644
index 000000000..a9f048990
--- /dev/null
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -0,0 +1,3357 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 PathScale, Inc. All rights reserved.
+ * Copyright (c) 2006 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+
+#include <asm/uaccess.h>
+
+#include "uverbs.h"
+#include "core_priv.h"
+
+/*
+ * A lockdep class key paired with its name.  init_uobj() applies both
+ * to a uobject's rwsem via lockdep_set_class_and_name().
+ */
+struct uverbs_lock_class {
+ struct lock_class_key key;
+ char name[16];
+};
+
+/* One lock class per uobject type; the names must fit in name[16]. */
+static struct uverbs_lock_class pd_lock_class = { .name = "PD-uobj" };
+static struct uverbs_lock_class mr_lock_class = { .name = "MR-uobj" };
+static struct uverbs_lock_class mw_lock_class = { .name = "MW-uobj" };
+static struct uverbs_lock_class cq_lock_class = { .name = "CQ-uobj" };
+static struct uverbs_lock_class qp_lock_class = { .name = "QP-uobj" };
+static struct uverbs_lock_class ah_lock_class = { .name = "AH-uobj" };
+static struct uverbs_lock_class srq_lock_class = { .name = "SRQ-uobj" };
+static struct uverbs_lock_class xrcd_lock_class = { .name = "XRCD-uobj" };
+static struct uverbs_lock_class rule_lock_class = { .name = "RULE-uobj" };
+
+/*
+ * The ib_uobject locking scheme is as follows:
+ *
+ * - ib_uverbs_idr_lock protects the uverbs idrs themselves, so it
+ * needs to be held during all idr operations. When an object is
+ * looked up, a reference must be taken on the object's kref before
+ * dropping this lock.
+ *
+ * - Each object also has an rwsem. This rwsem must be held for
+ * reading while an operation that uses the object is performed.
+ * For example, while registering an MR, the associated PD's
+ * uobject.mutex must be held for reading. The rwsem must be held
+ * for writing while initializing or destroying an object.
+ *
+ * - In addition, each object has a "live" flag. If this flag is not
+ * set, then lookups of the object will fail even if it is found in
+ * the idr. This handles a reader that blocks and does not acquire
+ * the rwsem until after the object is destroyed. The destroy
+ * operation will set the live flag to 0 and then drop the rwsem;
+ * this will allow the reader to acquire the rwsem, see that the
+ * live flag is 0, and then drop the rwsem and its reference to
+ * the object. The underlying storage will not be freed until the last
+ * reference to the object is dropped.
+ */
+
+/*
+ * Initialise a freshly allocated uobject: record the user handle and
+ * owning context, start the kref, and set up the rwsem with the lockdep
+ * class and name from @c.  The object starts out not live.
+ */
+static void init_uobj(struct ib_uobject *uobj, u64 user_handle,
+		      struct ib_ucontext *context, struct uverbs_lock_class *c)
+{
+	uobj->live = 0;
+	uobj->context = context;
+	uobj->user_handle = user_handle;
+
+	kref_init(&uobj->ref);
+
+	init_rwsem(&uobj->mutex);
+	lockdep_set_class_and_name(&uobj->mutex, &c->key, c->name);
+}
+
+/* kref release callback: frees the ib_uobject that embeds @kref. */
+static void release_uobj(struct kref *kref)
+{
+	struct ib_uobject *uobj = container_of(kref, struct ib_uobject, ref);
+
+	kfree(uobj);
+}
+
+/* Drop one reference on @uobj; release_uobj() frees it on the last put. */
+static void put_uobj(struct ib_uobject *uobj)
+{
+	struct kref *ref = &uobj->ref;
+
+	kref_put(ref, release_uobj);
+}
+
+/* Release the read side of @uobj's rwsem, then drop one reference. */
+static void put_uobj_read(struct ib_uobject *uobj)
+{
+	up_read(&uobj->mutex);
+	kref_put(&uobj->ref, release_uobj);
+}
+
+/* Release the write side of @uobj's rwsem, then drop one reference. */
+static void put_uobj_write(struct ib_uobject *uobj)
+{
+	up_write(&uobj->mutex);
+	kref_put(&uobj->ref, release_uobj);
+}
+
+/*
+ * Insert @uobj into @idr and store the allocated id in uobj->id.
+ * Memory is preloaded with GFP_KERNEL before ib_uverbs_idr_lock is
+ * taken, so the allocation under the spinlock uses GFP_NOWAIT.
+ *
+ * Returns 0 on success or the negative error from idr_alloc().
+ */
+static int idr_add_uobj(struct idr *idr, struct ib_uobject *uobj)
+{
+	int id;
+
+	idr_preload(GFP_KERNEL);
+	spin_lock(&ib_uverbs_idr_lock);
+
+	id = idr_alloc(idr, uobj, 0, 0, GFP_NOWAIT);
+	if (id >= 0)
+		uobj->id = id;
+
+	spin_unlock(&ib_uverbs_idr_lock);
+	idr_preload_end();
+
+	if (id < 0)
+		return id;
+
+	return 0;
+}
+
+/*
+ * Remove uobj->id from @idr under ib_uverbs_idr_lock.  This does not
+ * drop any reference on @uobj.
+ */
+void idr_remove_uobj(struct idr *idr, struct ib_uobject *uobj)
+{
+ spin_lock(&ib_uverbs_idr_lock);
+ idr_remove(idr, uobj->id);
+ spin_unlock(&ib_uverbs_idr_lock);
+}
+
+/*
+ * Look up @id in @idr and take a reference on the object found, all
+ * under ib_uverbs_idr_lock.  Objects belonging to a different context
+ * are treated as not found.  Returns NULL when there is no match.
+ */
+static struct ib_uobject *__idr_get_uobj(struct idr *idr, int id,
+					 struct ib_ucontext *context)
+{
+	struct ib_uobject *uobj;
+
+	spin_lock(&ib_uverbs_idr_lock);
+	uobj = idr_find(idr, id);
+	if (uobj && uobj->context != context)
+		uobj = NULL;
+	if (uobj)
+		kref_get(&uobj->ref);
+	spin_unlock(&ib_uverbs_idr_lock);
+
+	return uobj;
+}
+
+/*
+ * Look up a uobject and lock it for reading.  With @nested set the
+ * rwsem is taken with SINGLE_DEPTH_NESTING.  An object that is no
+ * longer live is unlocked and released again.
+ *
+ * Returns the referenced, read-locked object, or NULL.
+ */
+static struct ib_uobject *idr_read_uobj(struct idr *idr, int id,
+					struct ib_ucontext *context, int nested)
+{
+	struct ib_uobject *uobj = __idr_get_uobj(idr, id, context);
+
+	if (!uobj)
+		return NULL;
+
+	if (!nested)
+		down_read(&uobj->mutex);
+	else
+		down_read_nested(&uobj->mutex, SINGLE_DEPTH_NESTING);
+
+	if (uobj->live)
+		return uobj;
+
+	put_uobj_read(uobj);
+	return NULL;
+}
+
+/*
+ * Look up a uobject and lock it for writing.  An object that is no
+ * longer live is unlocked and released again.
+ *
+ * Returns the referenced, write-locked object, or NULL.
+ */
+static struct ib_uobject *idr_write_uobj(struct idr *idr, int id,
+					 struct ib_ucontext *context)
+{
+	struct ib_uobject *uobj = __idr_get_uobj(idr, id, context);
+
+	if (!uobj)
+		return NULL;
+
+	down_write(&uobj->mutex);
+	if (uobj->live)
+		return uobj;
+
+	put_uobj_write(uobj);
+	return NULL;
+}
+
+/*
+ * Like idr_read_uobj(), but returns the ->object pointer of the
+ * uobject found, or NULL when the lookup fails.
+ */
+static void *idr_read_obj(struct idr *idr, int id, struct ib_ucontext *context,
+			  int nested)
+{
+	struct ib_uobject *uobj = idr_read_uobj(idr, id, context, nested);
+
+	if (!uobj)
+		return NULL;
+
+	return uobj->object;
+}
+
+/* Look up a PD by handle and read-lock its uobject; NULL if not found. */
+static struct ib_pd *idr_read_pd(int pd_handle, struct ib_ucontext *context)
+{
+	struct ib_pd *pd;
+
+	pd = idr_read_obj(&ib_uverbs_pd_idr, pd_handle, context, 0);
+
+	return pd;
+}
+
+/* Undo idr_read_pd(): unlock the PD's uobject and drop the reference. */
+static void put_pd_read(struct ib_pd *pd)
+{
+	struct ib_uobject *uobj = pd->uobject;
+
+	put_uobj_read(uobj);
+}
+
+/*
+ * Look up a CQ by handle and read-lock its uobject; @nested is passed
+ * through to idr_read_obj().  Returns NULL if not found.
+ */
+static struct ib_cq *idr_read_cq(int cq_handle, struct ib_ucontext *context, int nested)
+{
+	struct ib_cq *cq;
+
+	cq = idr_read_obj(&ib_uverbs_cq_idr, cq_handle, context, nested);
+
+	return cq;
+}
+
+/* Undo idr_read_cq(): unlock the CQ's uobject and drop the reference. */
+static void put_cq_read(struct ib_cq *cq)
+{
+	struct ib_uobject *uobj = cq->uobject;
+
+	put_uobj_read(uobj);
+}
+
+/* Look up an AH by handle and read-lock its uobject; NULL if not found. */
+static struct ib_ah *idr_read_ah(int ah_handle, struct ib_ucontext *context)
+{
+	struct ib_ah *ah;
+
+	ah = idr_read_obj(&ib_uverbs_ah_idr, ah_handle, context, 0);
+
+	return ah;
+}
+
+/* Undo idr_read_ah(): unlock the AH's uobject and drop the reference. */
+static void put_ah_read(struct ib_ah *ah)
+{
+	struct ib_uobject *uobj = ah->uobject;
+
+	put_uobj_read(uobj);
+}
+
+/* Look up a QP by handle and read-lock its uobject; NULL if not found. */
+static struct ib_qp *idr_read_qp(int qp_handle, struct ib_ucontext *context)
+{
+	struct ib_qp *qp;
+
+	qp = idr_read_obj(&ib_uverbs_qp_idr, qp_handle, context, 0);
+
+	return qp;
+}
+
+/* Look up a QP by handle and write-lock its uobject; NULL if not found. */
+static struct ib_qp *idr_write_qp(int qp_handle, struct ib_ucontext *context)
+{
+	struct ib_uobject *uobj;
+
+	uobj = idr_write_uobj(&ib_uverbs_qp_idr, qp_handle, context);
+	if (!uobj)
+		return NULL;
+
+	return uobj->object;
+}
+
+/* Undo idr_read_qp(): unlock the QP's uobject and drop the reference. */
+static void put_qp_read(struct ib_qp *qp)
+{
+	struct ib_uobject *uobj = qp->uobject;
+
+	put_uobj_read(uobj);
+}
+
+/* Undo idr_write_qp(): unlock the QP's uobject and drop the reference. */
+static void put_qp_write(struct ib_qp *qp)
+{
+	struct ib_uobject *uobj = qp->uobject;
+
+	put_uobj_write(uobj);
+}
+
+/* Look up an SRQ by handle and read-lock its uobject; NULL if not found. */
+static struct ib_srq *idr_read_srq(int srq_handle, struct ib_ucontext *context)
+{
+	struct ib_srq *srq;
+
+	srq = idr_read_obj(&ib_uverbs_srq_idr, srq_handle, context, 0);
+
+	return srq;
+}
+
+/* Undo idr_read_srq(): unlock the SRQ's uobject and drop the reference. */
+static void put_srq_read(struct ib_srq *srq)
+{
+	struct ib_uobject *uobj = srq->uobject;
+
+	put_uobj_read(uobj);
+}
+
+/*
+ * Look up an XRCD by handle and read-lock its uobject.  The uobject
+ * (or NULL) is always stored through @uobj so the caller can later pass
+ * it to put_xrcd_read().  Returns the XRCD, or NULL if not found.
+ */
+static struct ib_xrcd *idr_read_xrcd(int xrcd_handle, struct ib_ucontext *context,
+				     struct ib_uobject **uobj)
+{
+	struct ib_uobject *found;
+
+	found = idr_read_uobj(&ib_uverbs_xrcd_idr, xrcd_handle, context, 0);
+	*uobj = found;
+	if (!found)
+		return NULL;
+
+	return found->object;
+}
+
+/*
+ * Undo idr_read_xrcd(): unlock the uobject it returned through its
+ * @uobj argument and drop the reference.
+ */
+static void put_xrcd_read(struct ib_uobject *uobj)
+{
+ put_uobj_read(uobj);
+}
+
+/*
+ * GET_CONTEXT command: create the ib_ucontext for this file together
+ * with its asynchronous event file, and report the async fd and the
+ * number of completion vectors to userspace.
+ *
+ * Returns in_len on success.  Fails with -ENOSPC if out_len is too
+ * small for the response, -EFAULT on a failed user copy, -EINVAL if the
+ * file already has a context, or the error of the failing setup step.
+ */
+ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
+ const char __user *buf,
+ int in_len, int out_len)
+{
+ struct ib_uverbs_get_context cmd;
+ struct ib_uverbs_get_context_resp resp;
+ struct ib_udata udata;
+ struct ib_device *ibdev = file->device->ib_dev;
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+ struct ib_device_attr dev_attr;
+#endif
+ struct ib_ucontext *ucontext;
+ struct file *filp;
+ int ret;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ mutex_lock(&file->mutex);
+
+ /* Only one context per file. */
+ if (file->ucontext) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ /* The driver's data follows the core command and core response. */
+ INIT_UDATA(&udata, buf + sizeof cmd,
+ (unsigned long) cmd.response + sizeof resp,
+ in_len - sizeof cmd, out_len - sizeof resp);
+
+ ucontext = ibdev->alloc_ucontext(ibdev, &udata);
+ if (IS_ERR(ucontext)) {
+ ret = PTR_ERR(ucontext);
+ goto err;
+ }
+
+ ucontext->device = ibdev;
+ INIT_LIST_HEAD(&ucontext->pd_list);
+ INIT_LIST_HEAD(&ucontext->mr_list);
+ INIT_LIST_HEAD(&ucontext->mw_list);
+ INIT_LIST_HEAD(&ucontext->cq_list);
+ INIT_LIST_HEAD(&ucontext->qp_list);
+ INIT_LIST_HEAD(&ucontext->srq_list);
+ INIT_LIST_HEAD(&ucontext->ah_list);
+ INIT_LIST_HEAD(&ucontext->xrcd_list);
+ INIT_LIST_HEAD(&ucontext->rule_list);
+ /* Pin the pid of the thread group leader; released on error below. */
+ rcu_read_lock();
+ ucontext->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
+ rcu_read_unlock();
+ ucontext->closing = 0;
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+ ucontext->umem_tree = RB_ROOT;
+ init_rwsem(&ucontext->umem_rwsem);
+ ucontext->odp_mrs_count = 0;
+ INIT_LIST_HEAD(&ucontext->no_private_counters);
+
+ ret = ib_query_device(ibdev, &dev_attr);
+ if (ret)
+ goto err_free;
+ /* Clear the invalidation hook when the device lacks the ODP flag. */
+ if (!(dev_attr.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING))
+ ucontext->invalidate_range = NULL;
+
+#endif
+
+ resp.num_comp_vectors = file->device->num_comp_vectors;
+
+ /* Reserve the fd now; it is installed only once nothing can fail. */
+ ret = get_unused_fd_flags(O_CLOEXEC);
+ if (ret < 0)
+ goto err_free;
+ resp.async_fd = ret;
+
+ filp = ib_uverbs_alloc_event_file(file, 1);
+ if (IS_ERR(filp)) {
+ ret = PTR_ERR(filp);
+ goto err_fd;
+ }
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp)) {
+ ret = -EFAULT;
+ goto err_file;
+ }
+
+ file->async_file = filp->private_data;
+
+ INIT_IB_EVENT_HANDLER(&file->event_handler, file->device->ib_dev,
+ ib_uverbs_event_handler);
+ ret = ib_register_event_handler(&file->event_handler);
+ if (ret)
+ goto err_file;
+
+ /*
+ * Extra references on the async event file and on the uverbs file,
+ * matching the lifetime rules documented in uverbs.h.
+ */
+ kref_get(&file->async_file->ref);
+ kref_get(&file->ref);
+ file->ucontext = ucontext;
+
+ fd_install(resp.async_fd, filp);
+
+ mutex_unlock(&file->mutex);
+
+ return in_len;
+
+err_file:
+ fput(filp);
+
+err_fd:
+ put_unused_fd(resp.async_fd);
+
+err_free:
+ put_pid(ucontext->tgid);
+ ibdev->dealloc_ucontext(ucontext);
+
+err:
+ mutex_unlock(&file->mutex);
+ return ret;
+}
+
+/*
+ * Copy device attributes into a query_device response.  node_guid and
+ * phys_port_cnt are taken from the ib_device itself; every other field
+ * comes from @attr.  Fields of @resp not listed here are left untouched,
+ * so the caller is expected to have zeroed it.
+ */
+static void copy_query_dev_fields(struct ib_uverbs_file *file,
+ struct ib_uverbs_query_device_resp *resp,
+ struct ib_device_attr *attr)
+{
+ resp->fw_ver = attr->fw_ver;
+ resp->node_guid = file->device->ib_dev->node_guid;
+ resp->sys_image_guid = attr->sys_image_guid;
+ resp->max_mr_size = attr->max_mr_size;
+ resp->page_size_cap = attr->page_size_cap;
+ resp->vendor_id = attr->vendor_id;
+ resp->vendor_part_id = attr->vendor_part_id;
+ resp->hw_ver = attr->hw_ver;
+ resp->max_qp = attr->max_qp;
+ resp->max_qp_wr = attr->max_qp_wr;
+ resp->device_cap_flags = attr->device_cap_flags;
+ resp->max_sge = attr->max_sge;
+ resp->max_sge_rd = attr->max_sge_rd;
+ resp->max_cq = attr->max_cq;
+ resp->max_cqe = attr->max_cqe;
+ resp->max_mr = attr->max_mr;
+ resp->max_pd = attr->max_pd;
+ resp->max_qp_rd_atom = attr->max_qp_rd_atom;
+ resp->max_ee_rd_atom = attr->max_ee_rd_atom;
+ resp->max_res_rd_atom = attr->max_res_rd_atom;
+ resp->max_qp_init_rd_atom = attr->max_qp_init_rd_atom;
+ resp->max_ee_init_rd_atom = attr->max_ee_init_rd_atom;
+ resp->atomic_cap = attr->atomic_cap;
+ resp->max_ee = attr->max_ee;
+ resp->max_rdd = attr->max_rdd;
+ resp->max_mw = attr->max_mw;
+ resp->max_raw_ipv6_qp = attr->max_raw_ipv6_qp;
+ resp->max_raw_ethy_qp = attr->max_raw_ethy_qp;
+ resp->max_mcast_grp = attr->max_mcast_grp;
+ resp->max_mcast_qp_attach = attr->max_mcast_qp_attach;
+ resp->max_total_mcast_qp_attach = attr->max_total_mcast_qp_attach;
+ resp->max_ah = attr->max_ah;
+ resp->max_fmr = attr->max_fmr;
+ resp->max_map_per_fmr = attr->max_map_per_fmr;
+ resp->max_srq = attr->max_srq;
+ resp->max_srq_wr = attr->max_srq_wr;
+ resp->max_srq_sge = attr->max_srq_sge;
+ resp->max_pkeys = attr->max_pkeys;
+ resp->local_ca_ack_delay = attr->local_ca_ack_delay;
+ resp->phys_port_cnt = file->device->ib_dev->phys_port_cnt;
+}
+
+/*
+ * QUERY_DEVICE command: query the device attributes and copy them to
+ * the user response buffer named in the command.
+ *
+ * Returns in_len on success, -ENOSPC if out_len is too small for the
+ * response, -EFAULT on a failed user copy, or the error from
+ * ib_query_device().
+ */
+ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
+			       const char __user *buf,
+			       int in_len, int out_len)
+{
+	struct ib_uverbs_query_device_resp resp;
+	struct ib_uverbs_query_device cmd;
+	struct ib_device_attr attr;
+	void __user *response;
+	int err;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	err = ib_query_device(file->device->ib_dev, &attr);
+	if (err)
+		return err;
+
+	/* Zero first so fields that are not copied never leak stack data. */
+	memset(&resp, 0, sizeof resp);
+	copy_query_dev_fields(file, &resp, &attr);
+
+	response = (void __user *) (unsigned long) cmd.response;
+	if (copy_to_user(response, &resp, sizeof resp))
+		return -EFAULT;
+
+	return in_len;
+}
+
+/*
+ * QUERY_PORT command: query the attributes of port cmd.port_num and
+ * copy them, together with the port's link layer, to the user response
+ * buffer named in the command.
+ *
+ * Returns in_len on success, -ENOSPC if out_len is too small for the
+ * response, -EFAULT on a failed user copy, or the error from
+ * ib_query_port().
+ */
+ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
+			     const char __user *buf,
+			     int in_len, int out_len)
+{
+	struct ib_uverbs_query_port_resp resp;
+	struct ib_uverbs_query_port cmd;
+	struct ib_port_attr attr;
+	void __user *response;
+	int err;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	err = ib_query_port(file->device->ib_dev, cmd.port_num, &attr);
+	if (err)
+		return err;
+
+	/* Zero first so fields that are not copied never leak stack data. */
+	memset(&resp, 0, sizeof resp);
+
+	resp.state	     = attr.state;
+	resp.max_mtu	     = attr.max_mtu;
+	resp.active_mtu	     = attr.active_mtu;
+	resp.gid_tbl_len     = attr.gid_tbl_len;
+	resp.port_cap_flags  = attr.port_cap_flags;
+	resp.max_msg_sz	     = attr.max_msg_sz;
+	resp.bad_pkey_cntr   = attr.bad_pkey_cntr;
+	resp.qkey_viol_cntr  = attr.qkey_viol_cntr;
+	resp.pkey_tbl_len    = attr.pkey_tbl_len;
+	resp.lid	     = attr.lid;
+	resp.sm_lid	     = attr.sm_lid;
+	resp.lmc	     = attr.lmc;
+	resp.max_vl_num	     = attr.max_vl_num;
+	resp.sm_sl	     = attr.sm_sl;
+	resp.subnet_timeout  = attr.subnet_timeout;
+	resp.init_type_reply = attr.init_type_reply;
+	resp.active_width    = attr.active_width;
+	resp.active_speed    = attr.active_speed;
+	resp.phys_state	     = attr.phys_state;
+	resp.link_layer	     = rdma_port_get_link_layer(file->device->ib_dev,
+							cmd.port_num);
+
+	response = (void __user *) (unsigned long) cmd.response;
+	if (copy_to_user(response, &resp, sizeof resp))
+		return -EFAULT;
+
+	return in_len;
+}
+
+/*
+ * ALLOC_PD command: allocate a protection domain through the device
+ * driver, publish it in ib_uverbs_pd_idr and return its handle in the
+ * user response buffer.
+ *
+ * Returns in_len on success, -ENOSPC if out_len is too small for the
+ * response, -EFAULT on a failed user copy, -ENOMEM if the uobject
+ * cannot be allocated, or the error from the driver or idr insertion.
+ */
+ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file,
+ const char __user *buf,
+ int in_len, int out_len)
+{
+ struct ib_uverbs_alloc_pd cmd;
+ struct ib_uverbs_alloc_pd_resp resp;
+ struct ib_udata udata;
+ struct ib_uobject *uobj;
+ struct ib_pd *pd;
+ int ret;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ /* The driver's data follows the core command and core response. */
+ INIT_UDATA(&udata, buf + sizeof cmd,
+ (unsigned long) cmd.response + sizeof resp,
+ in_len - sizeof cmd, out_len - sizeof resp);
+
+ uobj = kmalloc(sizeof *uobj, GFP_KERNEL);
+ if (!uobj)
+ return -ENOMEM;
+
+ /* Per the locking scheme: hold the rwsem for writing while initializing. */
+ init_uobj(uobj, 0, file->ucontext, &pd_lock_class);
+ down_write(&uobj->mutex);
+
+ pd = file->device->ib_dev->alloc_pd(file->device->ib_dev,
+ file->ucontext, &udata);
+ if (IS_ERR(pd)) {
+ ret = PTR_ERR(pd);
+ goto err;
+ }
+
+ pd->device = file->device->ib_dev;
+ pd->uobject = uobj;
+ atomic_set(&pd->usecnt, 0);
+
+ uobj->object = pd;
+ ret = idr_add_uobj(&ib_uverbs_pd_idr, uobj);
+ if (ret)
+ goto err_idr;
+
+ memset(&resp, 0, sizeof resp);
+ resp.pd_handle = uobj->id;
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp)) {
+ ret = -EFAULT;
+ goto err_copy;
+ }
+
+ mutex_lock(&file->mutex);
+ list_add_tail(&uobj->list, &file->ucontext->pd_list);
+ mutex_unlock(&file->mutex);
+
+ /* Lookups reject the object until live is set. */
+ uobj->live = 1;
+
+ up_write(&uobj->mutex);
+
+ return in_len;
+
+err_copy:
+ idr_remove_uobj(&ib_uverbs_pd_idr, uobj);
+
+err_idr:
+ ib_dealloc_pd(pd);
+
+err:
+ /* Drops the write lock and the initial reference, freeing uobj. */
+ put_uobj_write(uobj);
+ return ret;
+}
+
+/*
+ * DEALLOC_PD command: destroy the protection domain named by
+ * cmd.pd_handle and remove its uobject from the idr and from the
+ * context's PD list.
+ *
+ * Returns in_len on success, -EFAULT on a failed user copy, -EINVAL if
+ * the handle does not name a live PD of this context, or the error
+ * from ib_dealloc_pd() (in which case the object stays live).
+ */
+ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
+ const char __user *buf,
+ int in_len, int out_len)
+{
+ struct ib_uverbs_dealloc_pd cmd;
+ struct ib_uobject *uobj;
+ int ret;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ uobj = idr_write_uobj(&ib_uverbs_pd_idr, cmd.pd_handle, file->ucontext);
+ if (!uobj)
+ return -EINVAL;
+
+ ret = ib_dealloc_pd(uobj->object);
+ if (!ret)
+ uobj->live = 0;
+
+ /* Drops the write lock and the reference taken by the lookup. */
+ put_uobj_write(uobj);
+
+ if (ret)
+ return ret;
+
+ idr_remove_uobj(&ib_uverbs_pd_idr, uobj);
+
+ mutex_lock(&file->mutex);
+ list_del(&uobj->list);
+ mutex_unlock(&file->mutex);
+
+ /* Drop one more reference now that the object is unlinked. */
+ put_uobj(uobj);
+
+ return in_len;
+}
+
+/*
+ * Node of the per-device xrcd_tree: associates an inode with an XRC
+ * domain.  The tree is ordered by the inode pointer value.
+ */
+struct xrcd_table_entry {
+ struct rb_node node;
+ struct ib_xrcd *xrcd;
+ struct inode *inode;
+};
+
+/*
+ * Add an inode -> XRCD mapping to the device's xrcd_tree and take a
+ * reference on the inode with igrab().
+ *
+ * Returns 0 on success, -ENOMEM if the entry cannot be allocated, or
+ * -EEXIST if the inode already has an entry.
+ */
+static int xrcd_table_insert(struct ib_uverbs_device *dev,
+			     struct inode *inode,
+			     struct ib_xrcd *xrcd)
+{
+	struct rb_node **link = &dev->xrcd_tree.rb_node;
+	struct rb_node *parent = NULL;
+	struct xrcd_table_entry *entry;
+
+	entry = kmalloc(sizeof *entry, GFP_KERNEL);
+	if (!entry)
+		return -ENOMEM;
+
+	entry->xrcd  = xrcd;
+	entry->inode = inode;
+
+	/* Walk down to the insertion point, ordering by inode pointer. */
+	while (*link) {
+		struct xrcd_table_entry *cur;
+
+		parent = *link;
+		cur = rb_entry(parent, struct xrcd_table_entry, node);
+
+		if (inode == cur->inode) {
+			kfree(entry);
+			return -EEXIST;
+		}
+
+		if (inode < cur->inode)
+			link = &parent->rb_left;
+		else
+			link = &parent->rb_right;
+	}
+
+	rb_link_node(&entry->node, parent, link);
+	rb_insert_color(&entry->node, &dev->xrcd_tree);
+	igrab(inode);
+	return 0;
+}
+
+/*
+ * Find the table entry keyed by @inode in @dev's inode -> XRCD rb-tree.
+ *
+ * Returns the matching entry, or NULL when @inode is not in the tree.
+ */
+static struct xrcd_table_entry *xrcd_table_search(struct ib_uverbs_device *dev,
+						  struct inode *inode)
+{
+	struct rb_node *cursor = dev->xrcd_tree.rb_node;
+	struct xrcd_table_entry *cand;
+
+	while (cursor) {
+		cand = rb_entry(cursor, struct xrcd_table_entry, node);
+
+		if (inode == cand->inode)
+			return cand;
+
+		/* Keys are ordered by raw pointer value. */
+		cursor = (inode < cand->inode) ? cursor->rb_left
+					       : cursor->rb_right;
+	}
+
+	return NULL;
+}
+
+/*
+ * Return the XRC domain recorded for @inode on @dev, or NULL if the inode
+ * has no entry in the table.
+ */
+static struct ib_xrcd *find_xrcd(struct ib_uverbs_device *dev, struct inode *inode)
+{
+	struct xrcd_table_entry *found = xrcd_table_search(dev, inode);
+
+	return found ? found->xrcd : NULL;
+}
+
+/*
+ * Remove the entry for @inode from @dev's inode -> XRCD rb-tree, if one
+ * exists: drop the inode reference with iput(), unlink the node and free
+ * the entry. Does nothing when @inode is not in the tree.
+ */
+static void xrcd_table_delete(struct ib_uverbs_device *dev,
+			      struct inode *inode)
+{
+	struct xrcd_table_entry *victim = xrcd_table_search(dev, inode);
+
+	if (!victim)
+		return;
+
+	iput(inode);
+	rb_erase(&victim->node, &dev->xrcd_tree);
+	kfree(victim);
+}
+
+/*
+ * OPEN_XRCD command handler.
+ *
+ * With cmd.fd == -1 a new XRC domain is allocated that is not tied to any
+ * inode. Otherwise the inode behind cmd.fd is used as a key into the
+ * device's inode -> XRCD table: an existing XRCD is reused, or a new one is
+ * allocated and inserted when none exists and O_CREAT is set in cmd.oflags.
+ * In both cases a new user object is created and its id is returned to
+ * user space as resp.xrcd_handle.
+ *
+ * Returns in_len on success, or:
+ *   -ENOSPC  out_len is smaller than the response structure
+ *   -EFAULT  the command or the response cannot be copied
+ *   -EBADF   cmd.fd does not name an open file
+ *   -EAGAIN  no XRCD exists for the inode and O_CREAT was not given
+ *   -EINVAL  an XRCD exists for the inode and O_EXCL was given
+ *   -ENOMEM  the user object cannot be allocated
+ *   or the error from alloc_xrcd(), idr_add_uobj() or xrcd_table_insert().
+ */
+ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_open_xrcd cmd;
+ struct ib_uverbs_open_xrcd_resp resp;
+ struct ib_udata udata;
+ struct ib_uxrcd_object *obj;
+ struct ib_xrcd *xrcd = NULL;
+ struct fd f = {NULL, 0};
+ struct inode *inode = NULL;
+ int ret = 0;
+ int new_xrcd = 0;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ INIT_UDATA(&udata, buf + sizeof cmd,
+ (unsigned long) cmd.response + sizeof resp,
+ in_len - sizeof cmd, out_len - sizeof resp);
+
+ /* Held until return so the table lookup and later insert are one unit. */
+ mutex_lock(&file->device->xrcd_tree_mutex);
+
+ if (cmd.fd != -1) {
+ /* search for file descriptor */
+ f = fdget(cmd.fd);
+ if (!f.file) {
+ ret = -EBADF;
+ goto err_tree_mutex_unlock;
+ }
+
+ inode = file_inode(f.file);
+ xrcd = find_xrcd(file->device, inode);
+ if (!xrcd && !(cmd.oflags & O_CREAT)) {
+ /* no file descriptor. Need CREATE flag */
+ ret = -EAGAIN;
+ goto err_tree_mutex_unlock;
+ }
+
+ if (xrcd && cmd.oflags & O_EXCL) {
+ ret = -EINVAL;
+ goto err_tree_mutex_unlock;
+ }
+ }
+
+ obj = kmalloc(sizeof *obj, GFP_KERNEL);
+ if (!obj) {
+ ret = -ENOMEM;
+ goto err_tree_mutex_unlock;
+ }
+
+ init_uobj(&obj->uobject, 0, file->ucontext, &xrcd_lock_class);
+
+ down_write(&obj->uobject.mutex);
+
+ /* No XRCD found above (or no fd given): allocate a fresh one. */
+ if (!xrcd) {
+ xrcd = file->device->ib_dev->alloc_xrcd(file->device->ib_dev,
+ file->ucontext, &udata);
+ if (IS_ERR(xrcd)) {
+ ret = PTR_ERR(xrcd);
+ goto err;
+ }
+
+ xrcd->inode = inode;
+ xrcd->device = file->device->ib_dev;
+ atomic_set(&xrcd->usecnt, 0);
+ mutex_init(&xrcd->tgt_qp_mutex);
+ INIT_LIST_HEAD(&xrcd->tgt_qp_list);
+ new_xrcd = 1;
+ }
+
+ atomic_set(&obj->refcnt, 0);
+ obj->uobject.object = xrcd;
+ ret = idr_add_uobj(&ib_uverbs_xrcd_idr, &obj->uobject);
+ if (ret)
+ goto err_idr;
+
+ memset(&resp, 0, sizeof resp);
+ resp.xrcd_handle = obj->uobject.id;
+
+ if (inode) {
+ if (new_xrcd) {
+ /* create new inode/xrcd table entry */
+ ret = xrcd_table_insert(file->device, inode, xrcd);
+ if (ret)
+ goto err_insert_xrcd;
+ }
+ /* Each open of an inode-backed XRCD takes one usecnt reference. */
+ atomic_inc(&xrcd->usecnt);
+ }
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp)) {
+ ret = -EFAULT;
+ goto err_copy;
+ }
+
+ if (f.file)
+ fdput(f);
+
+ mutex_lock(&file->mutex);
+ list_add_tail(&obj->uobject.list, &file->ucontext->xrcd_list);
+ mutex_unlock(&file->mutex);
+
+ obj->uobject.live = 1;
+ up_write(&obj->uobject.mutex);
+
+ mutex_unlock(&file->device->xrcd_tree_mutex);
+ return in_len;
+
+ /* Error unwind: each label undoes one step, falling through to the next. */
+err_copy:
+ if (inode) {
+ if (new_xrcd)
+ xrcd_table_delete(file->device, inode);
+ atomic_dec(&xrcd->usecnt);
+ }
+
+err_insert_xrcd:
+ idr_remove_uobj(&ib_uverbs_xrcd_idr, &obj->uobject);
+
+err_idr:
+ /* NOTE(review): also reached for a reused XRCD; presumably ib_dealloc_xrcd() refuses while usecnt is non-zero - confirm. */
+ ib_dealloc_xrcd(xrcd);
+
+err:
+ put_uobj_write(&obj->uobject);
+
+err_tree_mutex_unlock:
+ if (f.file)
+ fdput(f);
+
+ mutex_unlock(&file->device->xrcd_tree_mutex);
+
+ return ret;
+}
+
+/*
+ * CLOSE_XRCD command handler.
+ *
+ * Releases the user object named by cmd.xrcd_handle. The underlying XRC
+ * domain is deallocated when it has no inode, or when this close drops the
+ * last usecnt reference of an inode-backed domain; in the latter case the
+ * inode table entry is removed as well.
+ *
+ * Returns in_len on success, -EFAULT if the command cannot be copied in,
+ * -EINVAL if the handle lookup fails, -EBUSY if obj->refcnt is non-zero,
+ * or the error from ib_dealloc_xrcd().
+ */
+ssize_t ib_uverbs_close_xrcd(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_close_xrcd cmd;
+ struct ib_uobject *uobj;
+ struct ib_xrcd *xrcd = NULL;
+ struct inode *inode = NULL;
+ struct ib_uxrcd_object *obj;
+ int live;
+ int ret = 0;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ mutex_lock(&file->device->xrcd_tree_mutex);
+ uobj = idr_write_uobj(&ib_uverbs_xrcd_idr, cmd.xrcd_handle, file->ucontext);
+ if (!uobj) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ xrcd = uobj->object;
+ inode = xrcd->inode;
+ obj = container_of(uobj, struct ib_uxrcd_object, uobject);
+ /* refcnt is raised by the QP create/open handlers that use this XRCD object. */
+ if (atomic_read(&obj->refcnt)) {
+ put_uobj_write(uobj);
+ ret = -EBUSY;
+ goto out;
+ }
+
+ /* No inode: always deallocate. With inode: only on the last reference. */
+ if (!inode || atomic_dec_and_test(&xrcd->usecnt)) {
+ ret = ib_dealloc_xrcd(uobj->object);
+ if (!ret)
+ uobj->live = 0;
+ }
+
+ /* Sample live before put_uobj_write(); it decides the table removal below. */
+ live = uobj->live;
+ /* Deallocation failed: give back the reference dropped above. */
+ if (inode && ret)
+ atomic_inc(&xrcd->usecnt);
+
+ put_uobj_write(uobj);
+
+ if (ret)
+ goto out;
+
+ if (inode && !live)
+ xrcd_table_delete(file->device, inode);
+
+ idr_remove_uobj(&ib_uverbs_xrcd_idr, uobj);
+ mutex_lock(&file->mutex);
+ list_del(&uobj->list);
+ mutex_unlock(&file->mutex);
+
+ put_uobj(uobj);
+ ret = in_len;
+
+out:
+ mutex_unlock(&file->device->xrcd_tree_mutex);
+ return ret;
+}
+
+/*
+ * Drop one use of @xrcd on behalf of @dev.
+ *
+ * An XRCD without an inode is deallocated unconditionally. An inode-backed
+ * XRCD is deallocated, and its inode table entry removed, only when this
+ * call drops the last usecnt reference.
+ */
+void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev,
+			    struct ib_xrcd *xrcd)
+{
+	struct inode *backing = xrcd->inode;
+
+	if (!backing) {
+		ib_dealloc_xrcd(xrcd);
+		return;
+	}
+
+	/* Other users still hold references: nothing more to do. */
+	if (!atomic_dec_and_test(&xrcd->usecnt))
+		return;
+
+	ib_dealloc_xrcd(xrcd);
+	xrcd_table_delete(dev, backing);
+}
+
+/*
+ * REG_MR command handler.
+ *
+ * Registers a user memory region on the protection domain named by
+ * cmd.pd_handle through the device's reg_user_mr() method, creates a user
+ * object for it and returns lkey, rkey and the new handle to user space.
+ *
+ * Returns in_len on success, or:
+ *   -ENOSPC  out_len is smaller than the response structure
+ *   -EFAULT  the command or the response cannot be copied
+ *   -EINVAL  cmd.start and cmd.hca_va differ in their in-page offset, the
+ *            PD lookup fails, or on-demand paging is requested but not
+ *            reported by the device
+ *   -ENOMEM  the user object cannot be allocated
+ *   or the error from ib_check_mr_access(), reg_user_mr() or idr_add_uobj().
+ */
+ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_reg_mr cmd;
+ struct ib_uverbs_reg_mr_resp resp;
+ struct ib_udata udata;
+ struct ib_uobject *uobj;
+ struct ib_pd *pd;
+ struct ib_mr *mr;
+ int ret;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ INIT_UDATA(&udata, buf + sizeof cmd,
+ (unsigned long) cmd.response + sizeof resp,
+ in_len - sizeof cmd, out_len - sizeof resp);
+
+ /* The user address and the HCA address must share the same page offset. */
+ if ((cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK))
+ return -EINVAL;
+
+ ret = ib_check_mr_access(cmd.access_flags);
+ if (ret)
+ return ret;
+
+ uobj = kmalloc(sizeof *uobj, GFP_KERNEL);
+ if (!uobj)
+ return -ENOMEM;
+
+ init_uobj(uobj, 0, file->ucontext, &mr_lock_class);
+ down_write(&uobj->mutex);
+
+ pd = idr_read_pd(cmd.pd_handle, file->ucontext);
+ if (!pd) {
+ ret = -EINVAL;
+ goto err_free;
+ }
+
+ /* On-demand paging is only allowed if the device reports the capability. */
+ if (cmd.access_flags & IB_ACCESS_ON_DEMAND) {
+ struct ib_device_attr attr;
+
+ ret = ib_query_device(pd->device, &attr);
+ if (ret || !(attr.device_cap_flags &
+ IB_DEVICE_ON_DEMAND_PAGING)) {
+ pr_debug("ODP support not available\n");
+ ret = -EINVAL;
+ goto err_put;
+ }
+ }
+
+ mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va,
+ cmd.access_flags, &udata);
+ if (IS_ERR(mr)) {
+ ret = PTR_ERR(mr);
+ goto err_put;
+ }
+
+ /* Fill in the fields of the new MR and account for it on the PD. */
+ mr->device = pd->device;
+ mr->pd = pd;
+ mr->uobject = uobj;
+ atomic_inc(&pd->usecnt);
+ atomic_set(&mr->usecnt, 0);
+
+ uobj->object = mr;
+ ret = idr_add_uobj(&ib_uverbs_mr_idr, uobj);
+ if (ret)
+ goto err_unreg;
+
+ memset(&resp, 0, sizeof resp);
+ resp.lkey = mr->lkey;
+ resp.rkey = mr->rkey;
+ resp.mr_handle = uobj->id;
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp)) {
+ ret = -EFAULT;
+ goto err_copy;
+ }
+
+ put_pd_read(pd);
+
+ mutex_lock(&file->mutex);
+ list_add_tail(&uobj->list, &file->ucontext->mr_list);
+ mutex_unlock(&file->mutex);
+
+ uobj->live = 1;
+
+ up_write(&uobj->mutex);
+
+ return in_len;
+
+ /* Error unwind: each label undoes one step, falling through to the next. */
+err_copy:
+ idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
+
+err_unreg:
+ ib_dereg_mr(mr);
+
+err_put:
+ put_pd_read(pd);
+
+err_free:
+ put_uobj_write(uobj);
+ return ret;
+}
+
+/*
+ * REREG_MR command handler.
+ *
+ * Changes the translation, access flags and/or protection domain of the
+ * memory region named by cmd.mr_handle, as selected by cmd.flags, through
+ * the device's rereg_user_mr() method, and returns the resulting lkey and
+ * rkey to user space.
+ *
+ * Returns in_len on success, or:
+ *   -ENOSPC  out_len is smaller than the response structure
+ *   -EFAULT  the command or the response cannot be copied
+ *   -EINVAL  cmd.flags is empty or has unsupported bits, the translation
+ *            arguments are invalid, or the MR or PD lookup fails
+ *   -EBUSY   mr->usecnt is non-zero
+ *   or the error from ib_check_mr_access() or rereg_user_mr().
+ */
+ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_rereg_mr cmd;
+ struct ib_uverbs_rereg_mr_resp resp;
+ struct ib_udata udata;
+ struct ib_pd *pd = NULL;
+ struct ib_mr *mr;
+ struct ib_pd *old_pd;
+ int ret;
+ struct ib_uobject *uobj;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof(cmd)))
+ return -EFAULT;
+
+ INIT_UDATA(&udata, buf + sizeof(cmd),
+ (unsigned long) cmd.response + sizeof(resp),
+ in_len - sizeof(cmd), out_len - sizeof(resp));
+
+ /* Reject unknown flag bits and an empty flag set. */
+ if (cmd.flags & ~IB_MR_REREG_SUPPORTED || !cmd.flags)
+ return -EINVAL;
+
+ /* A new translation needs non-zero addresses and length, and matching page offsets. */
+ if ((cmd.flags & IB_MR_REREG_TRANS) &&
+ (!cmd.start || !cmd.hca_va || 0 >= cmd.length ||
+ (cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK)))
+ return -EINVAL;
+
+ uobj = idr_write_uobj(&ib_uverbs_mr_idr, cmd.mr_handle,
+ file->ucontext);
+
+ if (!uobj)
+ return -EINVAL;
+
+ mr = uobj->object;
+
+ if (cmd.flags & IB_MR_REREG_ACCESS) {
+ ret = ib_check_mr_access(cmd.access_flags);
+ if (ret)
+ goto put_uobjs;
+ }
+
+ /* The target PD is only looked up (and later released) when it is being changed. */
+ if (cmd.flags & IB_MR_REREG_PD) {
+ pd = idr_read_pd(cmd.pd_handle, file->ucontext);
+ if (!pd) {
+ ret = -EINVAL;
+ goto put_uobjs;
+ }
+ }
+
+ if (atomic_read(&mr->usecnt)) {
+ ret = -EBUSY;
+ goto put_uobj_pd;
+ }
+
+ old_pd = mr->pd;
+ ret = mr->device->rereg_user_mr(mr, cmd.flags, cmd.start,
+ cmd.length, cmd.hca_va,
+ cmd.access_flags, pd, &udata);
+ if (!ret) {
+ /* Move the MR's PD usage count from the old PD to the new one. */
+ if (cmd.flags & IB_MR_REREG_PD) {
+ atomic_inc(&pd->usecnt);
+ mr->pd = pd;
+ atomic_dec(&old_pd->usecnt);
+ }
+ } else {
+ goto put_uobj_pd;
+ }
+
+ memset(&resp, 0, sizeof(resp));
+ resp.lkey = mr->lkey;
+ resp.rkey = mr->rkey;
+
+ if (copy_to_user((void __user *)(unsigned long)cmd.response,
+ &resp, sizeof(resp)))
+ ret = -EFAULT;
+ else
+ ret = in_len;
+
+ /* Success and failure share the release path below. */
+put_uobj_pd:
+ if (cmd.flags & IB_MR_REREG_PD)
+ put_pd_read(pd);
+
+put_uobjs:
+
+ /* NOTE(review): mr->uobject is presumably the uobj looked up above - confirm. */
+ put_uobj_write(mr->uobject);
+
+ return ret;
+}
+
+/*
+ * DEREG_MR command handler.
+ *
+ * Looks up the user object for cmd.mr_handle, deregisters the memory region
+ * it points at and, on success, removes the user object from the MR idr and
+ * from its list under file->mutex.
+ *
+ * Returns @in_len on success, -EFAULT if the command cannot be copied in,
+ * -EINVAL if the handle lookup fails, or the error from ib_dereg_mr().
+ */
+ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
+			   const char __user *buf, int in_len,
+			   int out_len)
+{
+	struct ib_uverbs_dereg_mr cmd;
+	struct ib_uobject *mr_uobj;
+	struct ib_mr *region;
+	int err;
+
+	if (copy_from_user(&cmd, buf, sizeof(cmd)))
+		return -EFAULT;
+
+	mr_uobj = idr_write_uobj(&ib_uverbs_mr_idr, cmd.mr_handle, file->ucontext);
+	if (!mr_uobj)
+		return -EINVAL;
+
+	region = mr_uobj->object;
+
+	err = ib_dereg_mr(region);
+	if (err) {
+		/* Deregistration refused: the object stays live and registered. */
+		put_uobj_write(mr_uobj);
+		return err;
+	}
+
+	mr_uobj->live = 0;
+	put_uobj_write(mr_uobj);
+
+	idr_remove_uobj(&ib_uverbs_mr_idr, mr_uobj);
+
+	mutex_lock(&file->mutex);
+	list_del(&mr_uobj->list);
+	mutex_unlock(&file->mutex);
+
+	put_uobj(mr_uobj);
+
+	return in_len;
+}
+
+/*
+ * ALLOC_MW command handler.
+ *
+ * Allocates a memory window of type cmd.mw_type on the protection domain
+ * named by cmd.pd_handle through the device's alloc_mw() method, creates a
+ * user object for it and returns the rkey and the new handle to user space.
+ *
+ * Returns in_len on success, or:
+ *   -ENOSPC  out_len is smaller than the response structure
+ *   -EFAULT  the command or the response cannot be copied
+ *   -ENOMEM  the user object cannot be allocated
+ *   -EINVAL  the PD lookup fails
+ *   or the error from alloc_mw() or idr_add_uobj().
+ */
+ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_alloc_mw cmd;
+ struct ib_uverbs_alloc_mw_resp resp;
+ struct ib_uobject *uobj;
+ struct ib_pd *pd;
+ struct ib_mw *mw;
+ int ret;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof(cmd)))
+ return -EFAULT;
+
+ uobj = kmalloc(sizeof(*uobj), GFP_KERNEL);
+ if (!uobj)
+ return -ENOMEM;
+
+ init_uobj(uobj, 0, file->ucontext, &mw_lock_class);
+ down_write(&uobj->mutex);
+
+ pd = idr_read_pd(cmd.pd_handle, file->ucontext);
+ if (!pd) {
+ ret = -EINVAL;
+ goto err_free;
+ }
+
+ mw = pd->device->alloc_mw(pd, cmd.mw_type);
+ if (IS_ERR(mw)) {
+ ret = PTR_ERR(mw);
+ goto err_put;
+ }
+
+ /* Fill in the fields of the new MW and account for it on the PD. */
+ mw->device = pd->device;
+ mw->pd = pd;
+ mw->uobject = uobj;
+ atomic_inc(&pd->usecnt);
+
+ uobj->object = mw;
+ ret = idr_add_uobj(&ib_uverbs_mw_idr, uobj);
+ if (ret)
+ goto err_unalloc;
+
+ memset(&resp, 0, sizeof(resp));
+ resp.rkey = mw->rkey;
+ resp.mw_handle = uobj->id;
+
+ if (copy_to_user((void __user *)(unsigned long)cmd.response,
+ &resp, sizeof(resp))) {
+ ret = -EFAULT;
+ goto err_copy;
+ }
+
+ put_pd_read(pd);
+
+ mutex_lock(&file->mutex);
+ list_add_tail(&uobj->list, &file->ucontext->mw_list);
+ mutex_unlock(&file->mutex);
+
+ uobj->live = 1;
+
+ up_write(&uobj->mutex);
+
+ return in_len;
+
+ /* Error unwind: each label undoes one step, falling through to the next. */
+err_copy:
+ idr_remove_uobj(&ib_uverbs_mw_idr, uobj);
+
+err_unalloc:
+ ib_dealloc_mw(mw);
+
+err_put:
+ put_pd_read(pd);
+
+err_free:
+ put_uobj_write(uobj);
+ return ret;
+}
+
+/*
+ * DEALLOC_MW command handler.
+ *
+ * Looks up the user object for cmd.mw_handle, deallocates the memory window
+ * it points at and, on success, removes the user object from the MW idr and
+ * from its list under file->mutex.
+ *
+ * Returns @in_len on success, -EFAULT if the command cannot be copied in,
+ * -EINVAL if the handle lookup fails, or the error from ib_dealloc_mw().
+ */
+ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file,
+			     const char __user *buf, int in_len,
+			     int out_len)
+{
+	struct ib_uverbs_dealloc_mw cmd;
+	struct ib_uobject *mw_uobj;
+	struct ib_mw *window;
+	int err;
+
+	if (copy_from_user(&cmd, buf, sizeof(cmd)))
+		return -EFAULT;
+
+	mw_uobj = idr_write_uobj(&ib_uverbs_mw_idr, cmd.mw_handle, file->ucontext);
+	if (!mw_uobj)
+		return -EINVAL;
+
+	window = mw_uobj->object;
+
+	err = ib_dealloc_mw(window);
+	if (err) {
+		/* Deallocation refused: the object stays live and registered. */
+		put_uobj_write(mw_uobj);
+		return err;
+	}
+
+	mw_uobj->live = 0;
+	put_uobj_write(mw_uobj);
+
+	idr_remove_uobj(&ib_uverbs_mw_idr, mw_uobj);
+
+	mutex_lock(&file->mutex);
+	list_del(&mw_uobj->list);
+	mutex_unlock(&file->mutex);
+
+	put_uobj(mw_uobj);
+
+	return in_len;
+}
+
+/*
+ * CREATE_COMP_CHANNEL command handler.
+ *
+ * Reserves a close-on-exec file descriptor, allocates an event file for it
+ * and reports the descriptor number to user space. The descriptor is only
+ * installed once the response has been copied out successfully.
+ *
+ * Returns @in_len on success, -ENOSPC if @out_len is smaller than the
+ * response, -EFAULT if the command or response cannot be copied, or the
+ * error from get_unused_fd_flags() or ib_uverbs_alloc_event_file().
+ */
+ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file,
+				      const char __user *buf, int in_len,
+				      int out_len)
+{
+	struct ib_uverbs_create_comp_channel cmd;
+	struct ib_uverbs_create_comp_channel_resp resp;
+	struct file *filp;
+	ssize_t status;
+	int fd;
+
+	if (out_len < sizeof(resp))
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof(cmd)))
+		return -EFAULT;
+
+	fd = get_unused_fd_flags(O_CLOEXEC);
+	if (fd < 0)
+		return fd;
+	resp.fd = fd;
+
+	filp = ib_uverbs_alloc_event_file(file, 0);
+	if (IS_ERR(filp)) {
+		status = PTR_ERR(filp);
+		filp = NULL;
+		goto release;
+	}
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof(resp))) {
+		status = -EFAULT;
+		goto release;
+	}
+
+	fd_install(resp.fd, filp);
+	return in_len;
+
+release:
+	/* Give the reserved descriptor back first, then drop the file if any. */
+	put_unused_fd(resp.fd);
+	if (filp)
+		fput(filp);
+	return status;
+}
+
+/*
+ * CREATE_CQ command handler.
+ *
+ * Creates a completion queue with cmd.cqe entries on cmd.comp_vector through
+ * the device's create_cq() method, optionally bound to the completion
+ * channel cmd.comp_channel, and returns the new handle and the actual
+ * cq->cqe value to user space.
+ *
+ * Returns in_len on success, or:
+ *   -ENOSPC  out_len is smaller than the response structure
+ *   -EFAULT  the command or the response cannot be copied
+ *   -EINVAL  cmd.comp_vector is out of range or the completion channel
+ *            lookup fails
+ *   -ENOMEM  the user object cannot be allocated
+ *   or the error from create_cq() or idr_add_uobj().
+ */
+ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_create_cq cmd;
+ struct ib_uverbs_create_cq_resp resp;
+ struct ib_udata udata;
+ struct ib_ucq_object *obj;
+ struct ib_uverbs_event_file *ev_file = NULL;
+ struct ib_cq *cq;
+ int ret;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ INIT_UDATA(&udata, buf + sizeof cmd,
+ (unsigned long) cmd.response + sizeof resp,
+ in_len - sizeof cmd, out_len - sizeof resp);
+
+ if (cmd.comp_vector >= file->device->num_comp_vectors)
+ return -EINVAL;
+
+ obj = kmalloc(sizeof *obj, GFP_KERNEL);
+ if (!obj)
+ return -ENOMEM;
+
+ init_uobj(&obj->uobject, cmd.user_handle, file->ucontext, &cq_lock_class);
+ down_write(&obj->uobject.mutex);
+
+ /* A negative comp_channel means the CQ has no completion channel. */
+ if (cmd.comp_channel >= 0) {
+ ev_file = ib_uverbs_lookup_comp_file(cmd.comp_channel);
+ if (!ev_file) {
+ ret = -EINVAL;
+ goto err;
+ }
+ }
+
+ obj->uverbs_file = file;
+ obj->comp_events_reported = 0;
+ obj->async_events_reported = 0;
+ INIT_LIST_HEAD(&obj->comp_list);
+ INIT_LIST_HEAD(&obj->async_list);
+
+ cq = file->device->ib_dev->create_cq(file->device->ib_dev, cmd.cqe,
+ cmd.comp_vector,
+ file->ucontext, &udata);
+ if (IS_ERR(cq)) {
+ ret = PTR_ERR(cq);
+ goto err_file;
+ }
+
+ /* Fill in the fields of the new CQ; cq_context carries the event file (may be NULL). */
+ cq->device = file->device->ib_dev;
+ cq->uobject = &obj->uobject;
+ cq->comp_handler = ib_uverbs_comp_handler;
+ cq->event_handler = ib_uverbs_cq_event_handler;
+ cq->cq_context = ev_file;
+ atomic_set(&cq->usecnt, 0);
+
+ obj->uobject.object = cq;
+ ret = idr_add_uobj(&ib_uverbs_cq_idr, &obj->uobject);
+ if (ret)
+ goto err_free;
+
+ memset(&resp, 0, sizeof resp);
+ resp.cq_handle = obj->uobject.id;
+ resp.cqe = cq->cqe;
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp)) {
+ ret = -EFAULT;
+ goto err_copy;
+ }
+
+ mutex_lock(&file->mutex);
+ list_add_tail(&obj->uobject.list, &file->ucontext->cq_list);
+ mutex_unlock(&file->mutex);
+
+ obj->uobject.live = 1;
+
+ up_write(&obj->uobject.mutex);
+
+ return in_len;
+
+ /* Error unwind: each label undoes one step, falling through to the next. */
+err_copy:
+ idr_remove_uobj(&ib_uverbs_cq_idr, &obj->uobject);
+
+err_free:
+ ib_destroy_cq(cq);
+
+err_file:
+ if (ev_file)
+ ib_uverbs_release_ucq(file, ev_file, obj);
+
+err:
+ put_uobj_write(&obj->uobject);
+ return ret;
+}
+
+/*
+ * RESIZE_CQ command handler.
+ *
+ * Asks the device to resize the completion queue named by cmd.cq_handle to
+ * cmd.cqe entries and reports the resulting cq->cqe to user space. Only the
+ * cqe member of the response is copied out.
+ *
+ * Returns @in_len on success, -EFAULT if the command or response cannot be
+ * copied, -EINVAL if the CQ lookup fails, or the error from resize_cq().
+ */
+ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file,
+			    const char __user *buf, int in_len,
+			    int out_len)
+{
+	struct ib_uverbs_resize_cq cmd;
+	struct ib_uverbs_resize_cq_resp resp;
+	struct ib_udata udata;
+	struct ib_cq *cq;
+	int err;
+
+	if (copy_from_user(&cmd, buf, sizeof(cmd)))
+		return -EFAULT;
+
+	INIT_UDATA(&udata, buf + sizeof cmd,
+		   (unsigned long) cmd.response + sizeof resp,
+		   in_len - sizeof cmd, out_len - sizeof resp);
+
+	cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0);
+	if (!cq)
+		return -EINVAL;
+
+	err = cq->device->resize_cq(cq, cmd.cqe, &udata);
+	if (!err) {
+		resp.cqe = cq->cqe;
+
+		if (copy_to_user((void __user *) (unsigned long) cmd.response,
+				 &resp, sizeof resp.cqe))
+			err = -EFAULT;
+	}
+
+	put_cq_read(cq);
+
+	if (err)
+		return err;
+
+	return in_len;
+}
+
+/*
+ * Convert the kernel work completion @wc into the user-space layout
+ * (struct ib_uverbs_wc) and copy it to @dest.
+ *
+ * Returns 0 on success or -EFAULT if the copy to user space fails.
+ */
+static int copy_wc_to_user(void __user *dest, struct ib_wc *wc)
+{
+	struct ib_uverbs_wc tmp = {
+		.wr_id		= wc->wr_id,
+		.status		= wc->status,
+		.opcode		= wc->opcode,
+		.vendor_err	= wc->vendor_err,
+		.byte_len	= wc->byte_len,
+		.ex.imm_data	= (__u32 __force) wc->ex.imm_data,
+		.qp_num		= wc->qp->qp_num,
+		.src_qp		= wc->src_qp,
+		.wc_flags	= wc->wc_flags,
+		.pkey_index	= wc->pkey_index,
+		.slid		= wc->slid,
+		.sl		= wc->sl,
+		.dlid_path_bits	= wc->dlid_path_bits,
+		.port_num	= wc->port_num,
+		.reserved	= 0,
+	};
+
+	return copy_to_user(dest, &tmp, sizeof(tmp)) ? -EFAULT : 0;
+}
+
+/*
+ * POLL_CQ command handler.
+ *
+ * Polls up to cmd.ne completions, one at a time, from the completion queue
+ * named by cmd.cq_handle. Each completion is written to user space directly
+ * after the response header at cmd.response; the header, carrying the
+ * number of completions written, is copied out last.
+ *
+ * Returns @in_len on success, -EFAULT if the command or any part of the
+ * response cannot be copied, -EINVAL if the CQ lookup fails, or the
+ * negative value returned by ib_poll_cq().
+ */
+ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file,
+			  const char __user *buf, int in_len,
+			  int out_len)
+{
+	struct ib_uverbs_poll_cq cmd;
+	struct ib_uverbs_poll_cq_resp resp;
+	u8 __user *resp_hdr;
+	u8 __user *next_wc;
+	struct ib_cq *cq;
+	struct ib_wc wc;
+	int ret;
+
+	if (copy_from_user(&cmd, buf, sizeof(cmd)))
+		return -EFAULT;
+
+	cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0);
+	if (!cq)
+		return -EINVAL;
+
+	/* The header goes first; completions are laid out right behind it. */
+	resp_hdr = (void __user *)(unsigned long) cmd.response;
+	next_wc = resp_hdr + sizeof(resp);
+
+	memset(&resp, 0, sizeof(resp));
+	for (; resp.count < cmd.ne; ++resp.count) {
+		ret = ib_poll_cq(cq, 1, &wc);
+		if (ret < 0)
+			goto out_put;
+		if (ret == 0)
+			break;
+
+		ret = copy_wc_to_user(next_wc, &wc);
+		if (ret)
+			goto out_put;
+
+		next_wc += sizeof(struct ib_uverbs_wc);
+	}
+
+	if (copy_to_user(resp_hdr, &resp, sizeof(resp)))
+		ret = -EFAULT;
+	else
+		ret = in_len;
+
+out_put:
+	put_cq_read(cq);
+	return ret;
+}
+
+/*
+ * REQ_NOTIFY_CQ command handler.
+ *
+ * Requests a notification on the completion queue named by cmd.cq_handle:
+ * IB_CQ_SOLICITED when cmd.solicited_only is set, IB_CQ_NEXT_COMP
+ * otherwise. The result of ib_req_notify_cq() is not checked.
+ *
+ * Returns @in_len on success, -EFAULT if the command cannot be copied in,
+ * or -EINVAL if the CQ lookup fails.
+ */
+ssize_t ib_uverbs_req_notify_cq(struct ib_uverbs_file *file,
+				const char __user *buf, int in_len,
+				int out_len)
+{
+	struct ib_uverbs_req_notify_cq cmd;
+	struct ib_cq *cq;
+
+	if (copy_from_user(&cmd, buf, sizeof(cmd)))
+		return -EFAULT;
+
+	cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0);
+	if (!cq)
+		return -EINVAL;
+
+	if (cmd.solicited_only)
+		ib_req_notify_cq(cq, IB_CQ_SOLICITED);
+	else
+		ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+
+	put_cq_read(cq);
+
+	return in_len;
+}
+
+/*
+ * DESTROY_CQ command handler.
+ *
+ * Destroys the completion queue named by cmd.cq_handle, removes its user
+ * object from the CQ idr and its list, calls ib_uverbs_release_ucq() for
+ * it, and reports the number of completion and async events that were
+ * reported for the CQ.
+ *
+ * Returns in_len on success, -EFAULT if the command or the response cannot
+ * be copied, -EINVAL if the handle lookup fails, or the error from
+ * ib_destroy_cq().
+ */
+ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_destroy_cq cmd;
+ struct ib_uverbs_destroy_cq_resp resp;
+ struct ib_uobject *uobj;
+ struct ib_cq *cq;
+ struct ib_ucq_object *obj;
+ struct ib_uverbs_event_file *ev_file;
+ int ret = -EINVAL;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ uobj = idr_write_uobj(&ib_uverbs_cq_idr, cmd.cq_handle, file->ucontext);
+ if (!uobj)
+ return -EINVAL;
+ /* Read everything needed from cq before ib_destroy_cq() is called on it. */
+ cq = uobj->object;
+ ev_file = cq->cq_context;
+ obj = container_of(cq->uobject, struct ib_ucq_object, uobject);
+
+ ret = ib_destroy_cq(cq);
+ if (!ret)
+ uobj->live = 0;
+
+ put_uobj_write(uobj);
+
+ if (ret)
+ return ret;
+
+ idr_remove_uobj(&ib_uverbs_cq_idr, uobj);
+
+ mutex_lock(&file->mutex);
+ list_del(&uobj->list);
+ mutex_unlock(&file->mutex);
+
+ ib_uverbs_release_ucq(file, ev_file, obj);
+
+ /* The counters are read before the final put_uobj() on the object holding them. */
+ memset(&resp, 0, sizeof resp);
+ resp.comp_events_reported = obj->comp_events_reported;
+ resp.async_events_reported = obj->async_events_reported;
+
+ put_uobj(uobj);
+
+ /* The CQ is already gone at this point even if the copy-out fails. */
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp))
+ return -EFAULT;
+
+ return in_len;
+}
+
+/*
+ * CREATE_QP command handler.
+ *
+ * Creates a queue pair of type cmd.qp_type and returns its handle, QP
+ * number and the capabilities left in attr.cap to user space.
+ *
+ * For IB_QPT_XRC_TGT, cmd.pd_handle names an XRC domain and the QP is
+ * created with ib_create_qp(). For every other type the PD and send CQ are
+ * looked up, and the QP is created through the device's create_qp() method;
+ * except for IB_QPT_XRC_INI, the optional SRQ and a distinct receive CQ are
+ * looked up as well.
+ *
+ * Returns in_len on success, or:
+ *   -ENOSPC  out_len is smaller than the response structure
+ *   -EFAULT  the command or the response cannot be copied
+ *   -EPERM   a raw packet QP is requested without CAP_NET_RAW
+ *   -ENOMEM  the user object cannot be allocated
+ *   -EINVAL  a referenced XRCD, PD, CQ or SRQ lookup fails, or the SRQ is
+ *            not of type IB_SRQT_BASIC
+ *   or the error from QP creation or idr_add_uobj().
+ */
+ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_create_qp cmd;
+ struct ib_uverbs_create_qp_resp resp;
+ struct ib_udata udata;
+ struct ib_uqp_object *obj;
+ struct ib_device *device;
+ struct ib_pd *pd = NULL;
+ struct ib_xrcd *xrcd = NULL;
+ struct ib_uobject *uninitialized_var(xrcd_uobj);
+ struct ib_cq *scq = NULL, *rcq = NULL;
+ struct ib_srq *srq = NULL;
+ struct ib_qp *qp;
+ struct ib_qp_init_attr attr;
+ int ret;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ if (cmd.qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW))
+ return -EPERM;
+
+ INIT_UDATA(&udata, buf + sizeof cmd,
+ (unsigned long) cmd.response + sizeof resp,
+ in_len - sizeof cmd, out_len - sizeof resp);
+
+ obj = kzalloc(sizeof *obj, GFP_KERNEL);
+ if (!obj)
+ return -ENOMEM;
+
+ init_uobj(&obj->uevent.uobject, cmd.user_handle, file->ucontext, &qp_lock_class);
+ down_write(&obj->uevent.uobject.mutex);
+
+ if (cmd.qp_type == IB_QPT_XRC_TGT) {
+ /* For XRC target QPs the pd_handle field carries an XRCD handle. */
+ xrcd = idr_read_xrcd(cmd.pd_handle, file->ucontext, &xrcd_uobj);
+ if (!xrcd) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+ device = xrcd->device;
+ } else {
+ if (cmd.qp_type == IB_QPT_XRC_INI) {
+ /* XRC initiator QPs get no receive queue: force its sizes to zero. */
+ cmd.max_recv_wr = cmd.max_recv_sge = 0;
+ } else {
+ if (cmd.is_srq) {
+ srq = idr_read_srq(cmd.srq_handle, file->ucontext);
+ if (!srq || srq->srq_type != IB_SRQT_BASIC) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+ }
+
+ /* Only look up the receive CQ separately when it differs from the send CQ. */
+ if (cmd.recv_cq_handle != cmd.send_cq_handle) {
+ rcq = idr_read_cq(cmd.recv_cq_handle, file->ucontext, 0);
+ if (!rcq) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+ }
+ }
+
+ /* NOTE(review): the last argument is 1 when a distinct recv CQ is already held - presumably a lock-nesting hint; confirm in idr_read_cq(). */
+ scq = idr_read_cq(cmd.send_cq_handle, file->ucontext, !!rcq);
+ rcq = rcq ?: scq;
+ pd = idr_read_pd(cmd.pd_handle, file->ucontext);
+ if (!pd || !scq) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+
+ device = pd->device;
+ }
+
+ attr.event_handler = ib_uverbs_qp_event_handler;
+ attr.qp_context = file;
+ attr.send_cq = scq;
+ attr.recv_cq = rcq;
+ attr.srq = srq;
+ attr.xrcd = xrcd;
+ attr.sq_sig_type = cmd.sq_sig_all ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR;
+ attr.qp_type = cmd.qp_type;
+ attr.create_flags = 0;
+
+ attr.cap.max_send_wr = cmd.max_send_wr;
+ attr.cap.max_recv_wr = cmd.max_recv_wr;
+ attr.cap.max_send_sge = cmd.max_send_sge;
+ attr.cap.max_recv_sge = cmd.max_recv_sge;
+ attr.cap.max_inline_data = cmd.max_inline_data;
+
+ obj->uevent.events_reported = 0;
+ INIT_LIST_HEAD(&obj->uevent.event_list);
+ INIT_LIST_HEAD(&obj->mcast_list);
+
+ /* pd is NULL on the XRC_TGT path; the XRCD is passed through attr.xrcd. */
+ if (cmd.qp_type == IB_QPT_XRC_TGT)
+ qp = ib_create_qp(pd, &attr);
+ else
+ qp = device->create_qp(pd, &attr, &udata);
+
+ if (IS_ERR(qp)) {
+ ret = PTR_ERR(qp);
+ goto err_put;
+ }
+
+ /* QPs created directly through the driver have their fields and use counts set up here. */
+ if (cmd.qp_type != IB_QPT_XRC_TGT) {
+ qp->real_qp = qp;
+ qp->device = device;
+ qp->pd = pd;
+ qp->send_cq = attr.send_cq;
+ qp->recv_cq = attr.recv_cq;
+ qp->srq = attr.srq;
+ qp->event_handler = attr.event_handler;
+ qp->qp_context = attr.qp_context;
+ qp->qp_type = attr.qp_type;
+ atomic_set(&qp->usecnt, 0);
+ atomic_inc(&pd->usecnt);
+ atomic_inc(&attr.send_cq->usecnt);
+ if (attr.recv_cq)
+ atomic_inc(&attr.recv_cq->usecnt);
+ if (attr.srq)
+ atomic_inc(&attr.srq->usecnt);
+ }
+ qp->uobject = &obj->uevent.uobject;
+
+ obj->uevent.uobject.object = qp;
+ ret = idr_add_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject);
+ if (ret)
+ goto err_destroy;
+
+ memset(&resp, 0, sizeof resp);
+ resp.qpn = qp->qp_num;
+ resp.qp_handle = obj->uevent.uobject.id;
+ resp.max_recv_sge = attr.cap.max_recv_sge;
+ resp.max_send_sge = attr.cap.max_send_sge;
+ resp.max_recv_wr = attr.cap.max_recv_wr;
+ resp.max_send_wr = attr.cap.max_send_wr;
+ resp.max_inline_data = attr.cap.max_inline_data;
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp)) {
+ ret = -EFAULT;
+ goto err_copy;
+ }
+
+ /* Pin the XRCD user object through its refcnt for as long as this QP object refers to it. */
+ if (xrcd) {
+ obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object,
+ uobject);
+ atomic_inc(&obj->uxrcd->refcnt);
+ put_xrcd_read(xrcd_uobj);
+ }
+
+ /* Release every object looked up above; rcq may alias scq and must not be put twice. */
+ if (pd)
+ put_pd_read(pd);
+ if (scq)
+ put_cq_read(scq);
+ if (rcq && rcq != scq)
+ put_cq_read(rcq);
+ if (srq)
+ put_srq_read(srq);
+
+ mutex_lock(&file->mutex);
+ list_add_tail(&obj->uevent.uobject.list, &file->ucontext->qp_list);
+ mutex_unlock(&file->mutex);
+
+ obj->uevent.uobject.live = 1;
+
+ up_write(&obj->uevent.uobject.mutex);
+
+ return in_len;
+
+ /* Error unwind: each label undoes one step, falling through to the next. */
+err_copy:
+ idr_remove_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject);
+
+err_destroy:
+ ib_destroy_qp(qp);
+
+err_put:
+ /* Only objects whose lookup succeeded are released. */
+ if (xrcd)
+ put_xrcd_read(xrcd_uobj);
+ if (pd)
+ put_pd_read(pd);
+ if (scq)
+ put_cq_read(scq);
+ if (rcq && rcq != scq)
+ put_cq_read(rcq);
+ if (srq)
+ put_srq_read(srq);
+
+ put_uobj_write(&obj->uevent.uobject);
+ return ret;
+}
+
+/*
+ * OPEN_QP command handler.
+ *
+ * Opens the existing QP number cmd.qpn of type cmd.qp_type in the XRC
+ * domain named by cmd.pd_handle via ib_open_qp(), creates a user object for
+ * it and returns the new handle and QP number to user space. On success the
+ * XRCD user object's refcnt is raised for this QP object.
+ *
+ * Returns @in_len on success, or:
+ *   -ENOSPC  @out_len is smaller than the response structure
+ *   -EFAULT  the command or the response cannot be copied
+ *   -ENOMEM  the user object cannot be allocated
+ *   -EINVAL  the XRCD lookup fails
+ *   or the error from ib_open_qp() or idr_add_uobj().
+ */
+ssize_t ib_uverbs_open_qp(struct ib_uverbs_file *file,
+			  const char __user *buf, int in_len, int out_len)
+{
+	struct ib_uverbs_open_qp cmd;
+	struct ib_uverbs_create_qp_resp resp;
+	struct ib_udata udata;
+	struct ib_uqp_object *obj;
+	struct ib_xrcd *xrcd;
+	struct ib_uobject *uninitialized_var(xrcd_uobj);
+	struct ib_qp *qp;
+	struct ib_qp_open_attr attr;
+	int ret;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	INIT_UDATA(&udata, buf + sizeof cmd,
+		   (unsigned long) cmd.response + sizeof resp,
+		   in_len - sizeof cmd, out_len - sizeof resp);
+
+	obj = kmalloc(sizeof *obj, GFP_KERNEL);
+	if (!obj)
+		return -ENOMEM;
+
+	init_uobj(&obj->uevent.uobject, cmd.user_handle, file->ucontext, &qp_lock_class);
+	down_write(&obj->uevent.uobject.mutex);
+
+	xrcd = idr_read_xrcd(cmd.pd_handle, file->ucontext, &xrcd_uobj);
+	if (!xrcd) {
+		/*
+		 * The lookup failed, so no XRCD object is held and xrcd_uobj
+		 * must not be released: skip put_xrcd_read(), matching the
+		 * "if (xrcd)" guard in ib_uverbs_create_qp().
+		 */
+		ret = -EINVAL;
+		goto err_uobj;
+	}
+
+	attr.event_handler = ib_uverbs_qp_event_handler;
+	attr.qp_context    = file;
+	attr.qp_num        = cmd.qpn;
+	attr.qp_type       = cmd.qp_type;
+
+	obj->uevent.events_reported = 0;
+	INIT_LIST_HEAD(&obj->uevent.event_list);
+	INIT_LIST_HEAD(&obj->mcast_list);
+
+	qp = ib_open_qp(xrcd, &attr);
+	if (IS_ERR(qp)) {
+		ret = PTR_ERR(qp);
+		goto err_put;
+	}
+
+	qp->uobject = &obj->uevent.uobject;
+
+	obj->uevent.uobject.object = qp;
+	ret = idr_add_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject);
+	if (ret)
+		goto err_destroy;
+
+	memset(&resp, 0, sizeof resp);
+	resp.qpn       = qp->qp_num;
+	resp.qp_handle = obj->uevent.uobject.id;
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp)) {
+		ret = -EFAULT;
+		goto err_remove;
+	}
+
+	/* Raise the XRCD user object's refcnt on behalf of this QP object. */
+	obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject);
+	atomic_inc(&obj->uxrcd->refcnt);
+	put_xrcd_read(xrcd_uobj);
+
+	mutex_lock(&file->mutex);
+	list_add_tail(&obj->uevent.uobject.list, &file->ucontext->qp_list);
+	mutex_unlock(&file->mutex);
+
+	obj->uevent.uobject.live = 1;
+
+	up_write(&obj->uevent.uobject.mutex);
+
+	return in_len;
+
+	/* Error unwind: each label undoes one step, falling through to the next. */
+err_remove:
+	idr_remove_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject);
+
+err_destroy:
+	ib_destroy_qp(qp);
+
+err_put:
+	put_xrcd_read(xrcd_uobj);
+
+err_uobj:
+	put_uobj_write(&obj->uevent.uobject);
+	return ret;
+}
+
+/*
+ * QUERY_QP command handler.
+ *
+ * Queries the QP named by cmd.qp_handle with ib_query_qp() using
+ * cmd.attr_mask, and copies the resulting attributes, the primary and
+ * alternate path, and the capabilities from the init attributes into the
+ * user-space response.
+ *
+ * Returns in_len on success, -EFAULT if the command or the response cannot
+ * be copied, -ENOMEM if the attribute buffers cannot be allocated, -EINVAL
+ * if the QP lookup fails, or the error from ib_query_qp().
+ */
+ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_query_qp cmd;
+ struct ib_uverbs_query_qp_resp resp;
+ struct ib_qp *qp;
+ struct ib_qp_attr *attr;
+ struct ib_qp_init_attr *init_attr;
+ int ret;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ /* Both buffers are heap-allocated; a partial failure is cleaned up at "out" (kfree(NULL) is a no-op). */
+ attr = kmalloc(sizeof *attr, GFP_KERNEL);
+ init_attr = kmalloc(sizeof *init_attr, GFP_KERNEL);
+ if (!attr || !init_attr) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ qp = idr_read_qp(cmd.qp_handle, file->ucontext);
+ if (!qp) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = ib_query_qp(qp, attr, cmd.attr_mask, init_attr);
+
+ /* The QP is released right after the query; only the copied attributes are used below. */
+ put_qp_read(qp);
+
+ if (ret)
+ goto out;
+
+ memset(&resp, 0, sizeof resp);
+
+ resp.qp_state = attr->qp_state;
+ resp.cur_qp_state = attr->cur_qp_state;
+ resp.path_mtu = attr->path_mtu;
+ resp.path_mig_state = attr->path_mig_state;
+ resp.qkey = attr->qkey;
+ resp.rq_psn = attr->rq_psn;
+ resp.sq_psn = attr->sq_psn;
+ resp.dest_qp_num = attr->dest_qp_num;
+ resp.qp_access_flags = attr->qp_access_flags;
+ resp.pkey_index = attr->pkey_index;
+ resp.alt_pkey_index = attr->alt_pkey_index;
+ resp.sq_draining = attr->sq_draining;
+ resp.max_rd_atomic = attr->max_rd_atomic;
+ resp.max_dest_rd_atomic = attr->max_dest_rd_atomic;
+ resp.min_rnr_timer = attr->min_rnr_timer;
+ resp.port_num = attr->port_num;
+ resp.timeout = attr->timeout;
+ resp.retry_cnt = attr->retry_cnt;
+ resp.rnr_retry = attr->rnr_retry;
+ resp.alt_port_num = attr->alt_port_num;
+ resp.alt_timeout = attr->alt_timeout;
+
+ /* Primary path (ah_attr). */
+ memcpy(resp.dest.dgid, attr->ah_attr.grh.dgid.raw, 16);
+ resp.dest.flow_label = attr->ah_attr.grh.flow_label;
+ resp.dest.sgid_index = attr->ah_attr.grh.sgid_index;
+ resp.dest.hop_limit = attr->ah_attr.grh.hop_limit;
+ resp.dest.traffic_class = attr->ah_attr.grh.traffic_class;
+ resp.dest.dlid = attr->ah_attr.dlid;
+ resp.dest.sl = attr->ah_attr.sl;
+ resp.dest.src_path_bits = attr->ah_attr.src_path_bits;
+ resp.dest.static_rate = attr->ah_attr.static_rate;
+ resp.dest.is_global = !!(attr->ah_attr.ah_flags & IB_AH_GRH);
+ resp.dest.port_num = attr->ah_attr.port_num;
+
+ /* Alternate path (alt_ah_attr). */
+ memcpy(resp.alt_dest.dgid, attr->alt_ah_attr.grh.dgid.raw, 16);
+ resp.alt_dest.flow_label = attr->alt_ah_attr.grh.flow_label;
+ resp.alt_dest.sgid_index = attr->alt_ah_attr.grh.sgid_index;
+ resp.alt_dest.hop_limit = attr->alt_ah_attr.grh.hop_limit;
+ resp.alt_dest.traffic_class = attr->alt_ah_attr.grh.traffic_class;
+ resp.alt_dest.dlid = attr->alt_ah_attr.dlid;
+ resp.alt_dest.sl = attr->alt_ah_attr.sl;
+ resp.alt_dest.src_path_bits = attr->alt_ah_attr.src_path_bits;
+ resp.alt_dest.static_rate = attr->alt_ah_attr.static_rate;
+ resp.alt_dest.is_global = !!(attr->alt_ah_attr.ah_flags & IB_AH_GRH);
+ resp.alt_dest.port_num = attr->alt_ah_attr.port_num;
+
+ /* Capabilities and signalling type come from the init attributes. */
+ resp.max_send_wr = init_attr->cap.max_send_wr;
+ resp.max_recv_wr = init_attr->cap.max_recv_wr;
+ resp.max_send_sge = init_attr->cap.max_send_sge;
+ resp.max_recv_sge = init_attr->cap.max_recv_sge;
+ resp.max_inline_data = init_attr->cap.max_inline_data;
+ resp.sq_sig_all = init_attr->sq_sig_type == IB_SIGNAL_ALL_WR;
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp))
+ ret = -EFAULT;
+
+out:
+ kfree(attr);
+ kfree(init_attr);
+
+ return ret ? ret : in_len;
+}
+
+/*
+ * Strip from @mask the attribute bits that are ignored for @qp_type:
+ * the destination RD-atomic and min-RNR-timer bits for XRC initiator QPs,
+ * and the RD-atomic, retry-count and RNR-retry bits for XRC target QPs.
+ * Every other QP type gets @mask back unchanged.
+ */
+static int modify_qp_mask(enum ib_qp_type qp_type, int mask)
+{
+	int ignored = 0;
+
+	if (qp_type == IB_QPT_XRC_INI)
+		ignored = IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER;
+	else if (qp_type == IB_QPT_XRC_TGT)
+		ignored = IB_QP_MAX_QP_RD_ATOMIC | IB_QP_RETRY_CNT |
+			  IB_QP_RNR_RETRY;
+
+	return mask & ~ignored;
+}
+
+/*
+ * MODIFY_QP command handler.
+ *
+ * Copies the attributes from the command into a struct ib_qp_attr and
+ * applies them to the QP named by cmd.qp_handle. The attribute mask is
+ * first filtered by modify_qp_mask() for the QP's type. A QP that is its
+ * own real_qp is modified through the device's modify_qp() method after
+ * ib_resolve_eth_l2_attrs(); any other QP goes through ib_modify_qp().
+ *
+ * Returns in_len on success, -EFAULT if the command cannot be copied in,
+ * -ENOMEM if the attribute buffer cannot be allocated, -EINVAL if the QP
+ * lookup fails, or the error from the resolve/modify calls.
+ */
+ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_modify_qp cmd;
+ struct ib_udata udata;
+ struct ib_qp *qp;
+ struct ib_qp_attr *attr;
+ int ret;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ /* This command has no response structure, so the udata output pointer is NULL. */
+ INIT_UDATA(&udata, buf + sizeof cmd, NULL, in_len - sizeof cmd,
+ out_len);
+
+ attr = kmalloc(sizeof *attr, GFP_KERNEL);
+ if (!attr)
+ return -ENOMEM;
+
+ qp = idr_read_qp(cmd.qp_handle, file->ucontext);
+ if (!qp) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* All fields are copied unconditionally; cmd.attr_mask selects which ones are applied. */
+ attr->qp_state = cmd.qp_state;
+ attr->cur_qp_state = cmd.cur_qp_state;
+ attr->path_mtu = cmd.path_mtu;
+ attr->path_mig_state = cmd.path_mig_state;
+ attr->qkey = cmd.qkey;
+ attr->rq_psn = cmd.rq_psn;
+ attr->sq_psn = cmd.sq_psn;
+ attr->dest_qp_num = cmd.dest_qp_num;
+ attr->qp_access_flags = cmd.qp_access_flags;
+ attr->pkey_index = cmd.pkey_index;
+ attr->alt_pkey_index = cmd.alt_pkey_index;
+ attr->en_sqd_async_notify = cmd.en_sqd_async_notify;
+ attr->max_rd_atomic = cmd.max_rd_atomic;
+ attr->max_dest_rd_atomic = cmd.max_dest_rd_atomic;
+ attr->min_rnr_timer = cmd.min_rnr_timer;
+ attr->port_num = cmd.port_num;
+ attr->timeout = cmd.timeout;
+ attr->retry_cnt = cmd.retry_cnt;
+ attr->rnr_retry = cmd.rnr_retry;
+ attr->alt_port_num = cmd.alt_port_num;
+ attr->alt_timeout = cmd.alt_timeout;
+
+ /* Primary path (ah_attr). */
+ memcpy(attr->ah_attr.grh.dgid.raw, cmd.dest.dgid, 16);
+ attr->ah_attr.grh.flow_label = cmd.dest.flow_label;
+ attr->ah_attr.grh.sgid_index = cmd.dest.sgid_index;
+ attr->ah_attr.grh.hop_limit = cmd.dest.hop_limit;
+ attr->ah_attr.grh.traffic_class = cmd.dest.traffic_class;
+ attr->ah_attr.dlid = cmd.dest.dlid;
+ attr->ah_attr.sl = cmd.dest.sl;
+ attr->ah_attr.src_path_bits = cmd.dest.src_path_bits;
+ attr->ah_attr.static_rate = cmd.dest.static_rate;
+ attr->ah_attr.ah_flags = cmd.dest.is_global ? IB_AH_GRH : 0;
+ attr->ah_attr.port_num = cmd.dest.port_num;
+
+ /* Alternate path (alt_ah_attr). */
+ memcpy(attr->alt_ah_attr.grh.dgid.raw, cmd.alt_dest.dgid, 16);
+ attr->alt_ah_attr.grh.flow_label = cmd.alt_dest.flow_label;
+ attr->alt_ah_attr.grh.sgid_index = cmd.alt_dest.sgid_index;
+ attr->alt_ah_attr.grh.hop_limit = cmd.alt_dest.hop_limit;
+ attr->alt_ah_attr.grh.traffic_class = cmd.alt_dest.traffic_class;
+ attr->alt_ah_attr.dlid = cmd.alt_dest.dlid;
+ attr->alt_ah_attr.sl = cmd.alt_dest.sl;
+ attr->alt_ah_attr.src_path_bits = cmd.alt_dest.src_path_bits;
+ attr->alt_ah_attr.static_rate = cmd.alt_dest.static_rate;
+ attr->alt_ah_attr.ah_flags = cmd.alt_dest.is_global ? IB_AH_GRH : 0;
+ attr->alt_ah_attr.port_num = cmd.alt_dest.port_num;
+
+ /* A QP that is its own real_qp is modified through the driver, with udata. */
+ if (qp->real_qp == qp) {
+ ret = ib_resolve_eth_l2_attrs(qp, attr, &cmd.attr_mask);
+ if (ret)
+ goto release_qp;
+ ret = qp->device->modify_qp(qp, attr,
+ modify_qp_mask(qp->qp_type, cmd.attr_mask), &udata);
+ } else {
+ ret = ib_modify_qp(qp, attr, modify_qp_mask(qp->qp_type, cmd.attr_mask));
+ }
+
+ if (ret)
+ goto release_qp;
+
+ ret = in_len;
+
+release_qp:
+ put_qp_read(qp);
+
+out:
+ kfree(attr);
+
+ return ret;
+}
+
+/*
+ * Handle the destroy-QP command written by userspace.
+ *
+ * Looks up the QP user object by cmd.qp_handle, destroys the QP and, on
+ * success, removes the user object from the QP idr and from its list,
+ * releases its pending events and reports obj->uevent.events_reported to
+ * userspace through cmd.response.
+ *
+ * Returns @in_len on success, -EFAULT if the command or the response
+ * cannot be copied, -EINVAL if the handle does not resolve, -EBUSY if the
+ * QP object still has entries on its mcast_list, or the error returned by
+ * ib_destroy_qp().
+ */
+ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_destroy_qp cmd;
+ struct ib_uverbs_destroy_qp_resp resp;
+ struct ib_uobject *uobj;
+ struct ib_qp *qp;
+ struct ib_uqp_object *obj;
+ int ret = -EINVAL;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ /* Zero the whole response before any field is filled in. */
+ memset(&resp, 0, sizeof resp);
+
+ uobj = idr_write_uobj(&ib_uverbs_qp_idr, cmd.qp_handle, file->ucontext);
+ if (!uobj)
+ return -EINVAL;
+ qp = uobj->object;
+ obj = container_of(uobj, struct ib_uqp_object, uevent.uobject);
+
+ /* Refuse while multicast attachments are still recorded for this QP. */
+ if (!list_empty(&obj->mcast_list)) {
+ put_uobj_write(uobj);
+ return -EBUSY;
+ }
+
+ /* The object is only marked not-live if the destroy succeeded. */
+ ret = ib_destroy_qp(qp);
+ if (!ret)
+ uobj->live = 0;
+
+ put_uobj_write(uobj);
+
+ /* On failure the handle is left in place and the error is returned. */
+ if (ret)
+ return ret;
+
+ if (obj->uxrcd)
+ atomic_dec(&obj->uxrcd->refcnt);
+
+ idr_remove_uobj(&ib_uverbs_qp_idr, uobj);
+
+ mutex_lock(&file->mutex);
+ list_del(&uobj->list);
+ mutex_unlock(&file->mutex);
+
+ ib_uverbs_release_uevent(file, &obj->uevent);
+
+ /* Read the counter before the put_uobj() below drops our reference. */
+ resp.events_reported = obj->uevent.events_reported;
+
+ put_uobj(uobj);
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp))
+ return -EFAULT;
+
+ return in_len;
+}
+
+/*
+ * Handle the post-send command written by userspace.
+ *
+ * Layout of @buf: a struct ib_uverbs_post_send header, then cmd.wr_count
+ * work requests of cmd.wqe_size bytes each, then cmd.sge_count
+ * scatter/gather entries that the work requests consume in order.
+ *
+ * Every user work request is converted into a struct ib_send_wr whose
+ * scatter/gather list lives in the same allocation.  The resulting chain
+ * is passed to the device's post_send method.  If that fails,
+ * resp.bad_wr holds the number of requests up to and including the one
+ * the device flagged; otherwise it is 0.  The response is written to
+ * cmd.response.
+ *
+ * Returns @in_len on success or a negative errno (-EFAULT, -EINVAL,
+ * -ENOMEM, or the error from the device's post_send method).
+ */
+ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
+			    const char __user *buf, int in_len,
+			    int out_len)
+{
+	struct ib_uverbs_post_send      cmd;
+	struct ib_uverbs_post_send_resp resp;
+	struct ib_uverbs_send_wr       *user_wr;
+	struct ib_send_wr              *wr = NULL, *last, *next, *bad_wr;
+	struct ib_qp                   *qp;
+	size_t                          avail;
+	int                             i, sg_ind;
+	int                             is_ud;
+	ssize_t                         ret = -EINVAL;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	if (cmd.wqe_size < sizeof (struct ib_uverbs_send_wr))
+		return -EINVAL;
+
+	/*
+	 * Check that the header, the work requests and the scatter/gather
+	 * entries all fit in in_len.  The check is done by division, one
+	 * factor at a time, because the user-supplied 32-bit counts can wrap
+	 * when multiplied and would then slip past a single summed
+	 * comparison.  cmd.wqe_size is known to be non-zero here.
+	 */
+	if (in_len < (int) sizeof cmd)
+		return -EINVAL;
+	avail = in_len - sizeof cmd;
+	if (cmd.wr_count > avail / cmd.wqe_size)
+		return -EINVAL;
+	avail -= (size_t) cmd.wr_count * cmd.wqe_size;
+	if (cmd.sge_count > avail / sizeof (struct ib_uverbs_sge))
+		return -EINVAL;
+
+	/* Bounce buffer reused for each user work request in turn. */
+	user_wr = kmalloc(cmd.wqe_size, GFP_KERNEL);
+	if (!user_wr)
+		return -ENOMEM;
+
+	qp = idr_read_qp(cmd.qp_handle, file->ucontext);
+	if (!qp)
+		goto out;
+
+	is_ud = qp->qp_type == IB_QPT_UD;
+	sg_ind = 0;
+	last = NULL;
+	for (i = 0; i < cmd.wr_count; ++i) {
+		if (copy_from_user(user_wr,
+				   buf + sizeof cmd + i * cmd.wqe_size,
+				   cmd.wqe_size)) {
+			ret = -EFAULT;
+			goto out_put;
+		}
+
+		/*
+		 * sg_ind never exceeds cmd.sge_count, so the subtraction
+		 * cannot wrap; comparing this way also rejects a huge
+		 * num_sge that would wrap if it were added to sg_ind.
+		 */
+		if (user_wr->num_sge > cmd.sge_count - sg_ind) {
+			ret = -EINVAL;
+			goto out_put;
+		}
+
+		/* One allocation: the work request followed by its SGEs. */
+		next = kmalloc(ALIGN(sizeof *next, sizeof (struct ib_sge)) +
+			       user_wr->num_sge * sizeof (struct ib_sge),
+			       GFP_KERNEL);
+		if (!next) {
+			ret = -ENOMEM;
+			goto out_put;
+		}
+
+		/* Link into the chain first so out_put frees it on error. */
+		if (!last)
+			wr = next;
+		else
+			last->next = next;
+		last = next;
+
+		next->next       = NULL;
+		next->wr_id      = user_wr->wr_id;
+		next->num_sge    = user_wr->num_sge;
+		next->opcode     = user_wr->opcode;
+		next->send_flags = user_wr->send_flags;
+
+		if (is_ud) {
+			next->wr.ud.ah = idr_read_ah(user_wr->wr.ud.ah,
+						     file->ucontext);
+			if (!next->wr.ud.ah) {
+				ret = -EINVAL;
+				goto out_put;
+			}
+			next->wr.ud.remote_qpn  = user_wr->wr.ud.remote_qpn;
+			next->wr.ud.remote_qkey = user_wr->wr.ud.remote_qkey;
+			if (next->opcode == IB_WR_SEND_WITH_IMM)
+				next->ex.imm_data =
+					(__be32 __force) user_wr->ex.imm_data;
+		} else {
+			switch (next->opcode) {
+			case IB_WR_RDMA_WRITE_WITH_IMM:
+				next->ex.imm_data =
+					(__be32 __force) user_wr->ex.imm_data;
+				/* fall through: also needs the rdma fields */
+			case IB_WR_RDMA_WRITE:
+			case IB_WR_RDMA_READ:
+				next->wr.rdma.remote_addr =
+					user_wr->wr.rdma.remote_addr;
+				next->wr.rdma.rkey        =
+					user_wr->wr.rdma.rkey;
+				break;
+			case IB_WR_SEND_WITH_IMM:
+				next->ex.imm_data =
+					(__be32 __force) user_wr->ex.imm_data;
+				break;
+			case IB_WR_SEND_WITH_INV:
+				next->ex.invalidate_rkey =
+					user_wr->ex.invalidate_rkey;
+				break;
+			case IB_WR_ATOMIC_CMP_AND_SWP:
+			case IB_WR_ATOMIC_FETCH_AND_ADD:
+				next->wr.atomic.remote_addr =
+					user_wr->wr.atomic.remote_addr;
+				next->wr.atomic.compare_add =
+					user_wr->wr.atomic.compare_add;
+				next->wr.atomic.swap = user_wr->wr.atomic.swap;
+				next->wr.atomic.rkey = user_wr->wr.atomic.rkey;
+				break;
+			default:
+				break;
+			}
+		}
+
+		if (next->num_sge) {
+			next->sg_list = (void *) next +
+				ALIGN(sizeof *next, sizeof (struct ib_sge));
+			if (copy_from_user(next->sg_list,
+					   buf + sizeof cmd +
+					   cmd.wr_count * cmd.wqe_size +
+					   sg_ind * sizeof (struct ib_sge),
+					   next->num_sge * sizeof (struct ib_sge))) {
+				ret = -EFAULT;
+				goto out_put;
+			}
+			sg_ind += next->num_sge;
+		} else
+			next->sg_list = NULL;
+	}
+
+	resp.bad_wr = 0;
+	ret = qp->device->post_send(qp->real_qp, wr, &bad_wr);
+	if (ret)
+		for (next = wr; next; next = next->next) {
+			++resp.bad_wr;
+			if (next == bad_wr)
+				break;
+		}
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp))
+		ret = -EFAULT;
+
+out_put:
+	put_qp_read(qp);
+
+	/* Free the chain, releasing any address handle each UD request holds. */
+	while (wr) {
+		if (is_ud && wr->wr.ud.ah)
+			put_ah_read(wr->wr.ud.ah);
+		next = wr->next;
+		kfree(wr);
+		wr = next;
+	}
+
+out:
+	kfree(user_wr);
+
+	return ret ? ret : in_len;
+}
+
+/*
+ * Build a chain of struct ib_recv_wr from a userspace buffer.
+ *
+ * @buf points just past the command header and holds @wr_count work
+ * requests of @wqe_size bytes each, followed by @sge_count scatter/gather
+ * entries that the work requests consume in order.  @in_len is the number
+ * of bytes available at @buf.
+ *
+ * Each returned work request is a single kmalloc() allocation holding the
+ * request followed by its scatter/gather list; the caller frees the chain
+ * with kfree() on every element.
+ *
+ * Returns the head of the chain (NULL when @wr_count is 0), or an ERR_PTR
+ * carrying -EINVAL, -ENOMEM or -EFAULT.
+ */
+static struct ib_recv_wr *ib_uverbs_unmarshall_recv(const char __user *buf,
+						    int in_len,
+						    u32 wr_count,
+						    u32 sge_count,
+						    u32 wqe_size)
+{
+	struct ib_uverbs_recv_wr *user_wr;
+	struct ib_recv_wr        *wr = NULL, *last, *next;
+	size_t                    avail;
+	int                       sg_ind;
+	int                       i;
+	int                       ret;
+
+	/*
+	 * Callers pass "in_len - sizeof cmd"; a negative value means the
+	 * write was shorter than the header and must not be treated as a
+	 * huge unsigned length.
+	 */
+	if (in_len < 0)
+		return ERR_PTR(-EINVAL);
+
+	if (wqe_size < sizeof (struct ib_uverbs_recv_wr))
+		return ERR_PTR(-EINVAL);
+
+	/*
+	 * Check that the work requests and the scatter/gather entries fit
+	 * in in_len.  The check is done by division, one factor at a time,
+	 * because the user-supplied 32-bit counts can wrap when multiplied.
+	 * wqe_size is known to be non-zero here.
+	 */
+	avail = in_len;
+	if (wr_count > avail / wqe_size)
+		return ERR_PTR(-EINVAL);
+	avail -= (size_t) wr_count * wqe_size;
+	if (sge_count > avail / sizeof (struct ib_uverbs_sge))
+		return ERR_PTR(-EINVAL);
+
+	/* Bounce buffer reused for each user work request in turn. */
+	user_wr = kmalloc(wqe_size, GFP_KERNEL);
+	if (!user_wr)
+		return ERR_PTR(-ENOMEM);
+
+	sg_ind = 0;
+	last = NULL;
+	for (i = 0; i < wr_count; ++i) {
+		if (copy_from_user(user_wr, buf + i * wqe_size,
+				   wqe_size)) {
+			ret = -EFAULT;
+			goto err;
+		}
+
+		/*
+		 * sg_ind never exceeds sge_count, so the subtraction cannot
+		 * wrap; comparing this way also rejects a huge num_sge that
+		 * would wrap if it were added to sg_ind.
+		 */
+		if (user_wr->num_sge > sge_count - sg_ind) {
+			ret = -EINVAL;
+			goto err;
+		}
+
+		/* One allocation: the work request followed by its SGEs. */
+		next = kmalloc(ALIGN(sizeof *next, sizeof (struct ib_sge)) +
+			       user_wr->num_sge * sizeof (struct ib_sge),
+			       GFP_KERNEL);
+		if (!next) {
+			ret = -ENOMEM;
+			goto err;
+		}
+
+		/* Link into the chain first so the err path frees it. */
+		if (!last)
+			wr = next;
+		else
+			last->next = next;
+		last = next;
+
+		next->next    = NULL;
+		next->wr_id   = user_wr->wr_id;
+		next->num_sge = user_wr->num_sge;
+
+		if (next->num_sge) {
+			next->sg_list = (void *) next +
+				ALIGN(sizeof *next, sizeof (struct ib_sge));
+			if (copy_from_user(next->sg_list,
+					   buf + wr_count * wqe_size +
+					   sg_ind * sizeof (struct ib_sge),
+					   next->num_sge * sizeof (struct ib_sge))) {
+				ret = -EFAULT;
+				goto err;
+			}
+			sg_ind += next->num_sge;
+		} else
+			next->sg_list = NULL;
+	}
+
+	kfree(user_wr);
+	return wr;
+
+err:
+	kfree(user_wr);
+
+	while (wr) {
+		next = wr->next;
+		kfree(wr);
+		wr = next;
+	}
+
+	return ERR_PTR(ret);
+}
+
+/*
+ * Handle the post-receive command for a QP.
+ *
+ * The work requests following the command header are unmarshalled with
+ * ib_uverbs_unmarshall_recv() and passed to the device's post_recv method
+ * for the QP named by cmd.qp_handle.  If posting fails, resp.bad_wr is the
+ * number of requests up to and including the one the device flagged;
+ * otherwise it is 0.  The response is written to cmd.response.
+ *
+ * Returns @in_len on success or a negative errno.
+ */
+ssize_t ib_uverbs_post_recv(struct ib_uverbs_file *file,
+			    const char __user *buf, int in_len,
+			    int out_len)
+{
+	struct ib_uverbs_post_recv      cmd;
+	struct ib_uverbs_post_recv_resp resp;
+	struct ib_recv_wr              *head, *cur, *failed;
+	struct ib_qp                   *qp;
+	ssize_t                         rc;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	head = ib_uverbs_unmarshall_recv(buf + sizeof cmd,
+					 in_len - sizeof cmd, cmd.wr_count,
+					 cmd.sge_count, cmd.wqe_size);
+	if (IS_ERR(head))
+		return PTR_ERR(head);
+
+	qp = idr_read_qp(cmd.qp_handle, file->ucontext);
+	if (!qp) {
+		rc = -EINVAL;
+		goto free_chain;
+	}
+
+	resp.bad_wr = 0;
+	rc = qp->device->post_recv(qp->real_qp, head, &failed);
+
+	put_qp_read(qp);
+
+	if (rc) {
+		/* Count requests up to and including the failing one. */
+		cur = head;
+		while (cur) {
+			++resp.bad_wr;
+			if (cur == failed)
+				break;
+			cur = cur->next;
+		}
+	}
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp))
+		rc = -EFAULT;
+
+free_chain:
+	while (head) {
+		cur = head->next;
+		kfree(head);
+		head = cur;
+	}
+
+	return rc ? rc : in_len;
+}
+
+/*
+ * Handle the post-receive command for an SRQ.
+ *
+ * The work requests following the command header are unmarshalled with
+ * ib_uverbs_unmarshall_recv() and passed to the device's post_srq_recv
+ * method for the SRQ named by cmd.srq_handle.  If posting fails,
+ * resp.bad_wr is the number of requests up to and including the one the
+ * device flagged; otherwise it is 0.  The response is written to
+ * cmd.response.
+ *
+ * Returns @in_len on success or a negative errno.
+ */
+ssize_t ib_uverbs_post_srq_recv(struct ib_uverbs_file *file,
+				const char __user *buf, int in_len,
+				int out_len)
+{
+	struct ib_uverbs_post_srq_recv      cmd;
+	struct ib_uverbs_post_srq_recv_resp resp;
+	struct ib_recv_wr                  *head, *cur, *failed;
+	struct ib_srq                      *srq;
+	ssize_t                             rc;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	head = ib_uverbs_unmarshall_recv(buf + sizeof cmd,
+					 in_len - sizeof cmd, cmd.wr_count,
+					 cmd.sge_count, cmd.wqe_size);
+	if (IS_ERR(head))
+		return PTR_ERR(head);
+
+	srq = idr_read_srq(cmd.srq_handle, file->ucontext);
+	if (!srq) {
+		rc = -EINVAL;
+		goto free_chain;
+	}
+
+	resp.bad_wr = 0;
+	rc = srq->device->post_srq_recv(srq, head, &failed);
+
+	put_srq_read(srq);
+
+	if (rc) {
+		/* Count requests up to and including the failing one. */
+		cur = head;
+		while (cur) {
+			++resp.bad_wr;
+			if (cur == failed)
+				break;
+			cur = cur->next;
+		}
+	}
+
+	if (copy_to_user((void __user *) (unsigned long) cmd.response,
+			 &resp, sizeof resp))
+		rc = -EFAULT;
+
+free_chain:
+	while (head) {
+		cur = head->next;
+		kfree(head);
+		head = cur;
+	}
+
+	return rc ? rc : in_len;
+}
+
+/*
+ * Handle the create-AH command written by userspace.
+ *
+ * Allocates a user object, creates an address handle on the PD named by
+ * cmd.pd_handle from the attributes in cmd.attr, registers the user object
+ * in the AH idr and returns the new handle through cmd.response.
+ *
+ * Returns @in_len on success, -ENOSPC if @out_len is smaller than the
+ * response, -EFAULT if the command or response cannot be copied, -ENOMEM
+ * if the user object cannot be allocated, -EINVAL if the PD handle does
+ * not resolve, or the error from ib_create_ah() / idr_add_uobj().
+ */
+ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_create_ah cmd;
+ struct ib_uverbs_create_ah_resp resp;
+ struct ib_uobject *uobj;
+ struct ib_pd *pd;
+ struct ib_ah *ah;
+ struct ib_ah_attr attr;
+ int ret;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ uobj = kmalloc(sizeof *uobj, GFP_KERNEL);
+ if (!uobj)
+ return -ENOMEM;
+
+ /*
+ * The new object's mutex is held for writing until the object has been
+ * marked live at the end of the function.
+ */
+ init_uobj(uobj, cmd.user_handle, file->ucontext, &ah_lock_class);
+ down_write(&uobj->mutex);
+
+ pd = idr_read_pd(cmd.pd_handle, file->ucontext);
+ if (!pd) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ /* Translate the user attributes; vlan_id and dmac are zeroed here. */
+ attr.dlid = cmd.attr.dlid;
+ attr.sl = cmd.attr.sl;
+ attr.src_path_bits = cmd.attr.src_path_bits;
+ attr.static_rate = cmd.attr.static_rate;
+ attr.ah_flags = cmd.attr.is_global ? IB_AH_GRH : 0;
+ attr.port_num = cmd.attr.port_num;
+ attr.grh.flow_label = cmd.attr.grh.flow_label;
+ attr.grh.sgid_index = cmd.attr.grh.sgid_index;
+ attr.grh.hop_limit = cmd.attr.grh.hop_limit;
+ attr.grh.traffic_class = cmd.attr.grh.traffic_class;
+ attr.vlan_id = 0;
+ memset(&attr.dmac, 0, sizeof(attr.dmac));
+ memcpy(attr.grh.dgid.raw, cmd.attr.grh.dgid, 16);
+
+ ah = ib_create_ah(pd, &attr);
+ if (IS_ERR(ah)) {
+ ret = PTR_ERR(ah);
+ goto err_put;
+ }
+
+ /* Cross-link the address handle and its user object. */
+ ah->uobject = uobj;
+ uobj->object = ah;
+
+ ret = idr_add_uobj(&ib_uverbs_ah_idr, uobj);
+ if (ret)
+ goto err_destroy;
+
+ resp.ah_handle = uobj->id;
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp)) {
+ ret = -EFAULT;
+ goto err_copy;
+ }
+
+ put_pd_read(pd);
+
+ mutex_lock(&file->mutex);
+ list_add_tail(&uobj->list, &file->ucontext->ah_list);
+ mutex_unlock(&file->mutex);
+
+ uobj->live = 1;
+
+ up_write(&uobj->mutex);
+
+ return in_len;
+
+ /* Error unwinding: each label undoes one more step, newest first. */
+err_copy:
+ idr_remove_uobj(&ib_uverbs_ah_idr, uobj);
+
+err_destroy:
+ ib_destroy_ah(ah);
+
+err_put:
+ put_pd_read(pd);
+
+err:
+ put_uobj_write(uobj);
+ return ret;
+}
+
+/*
+ * Handle the destroy-AH command written by userspace.
+ *
+ * Looks up the user object for cmd.ah_handle and destroys the address
+ * handle it refers to.  If the destroy fails, the handle stays registered
+ * and the error is returned.  Otherwise the object is marked not-live,
+ * removed from the AH idr and from its list, and released.
+ *
+ * Returns @in_len on success, -EFAULT if the command cannot be copied,
+ * -EINVAL if the handle does not resolve, or the error from
+ * ib_destroy_ah().
+ */
+ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file,
+			     const char __user *buf, int in_len, int out_len)
+{
+	struct ib_uverbs_destroy_ah cmd;
+	struct ib_uobject          *uobj;
+	int                         rc;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	uobj = idr_write_uobj(&ib_uverbs_ah_idr, cmd.ah_handle, file->ucontext);
+	if (!uobj)
+		return -EINVAL;
+
+	rc = ib_destroy_ah(uobj->object);
+	if (rc) {
+		/* Destroy failed: leave the handle registered. */
+		put_uobj_write(uobj);
+		return rc;
+	}
+
+	uobj->live = 0;
+	put_uobj_write(uobj);
+
+	idr_remove_uobj(&ib_uverbs_ah_idr, uobj);
+
+	mutex_lock(&file->mutex);
+	list_del(&uobj->list);
+	mutex_unlock(&file->mutex);
+
+	put_uobj(uobj);
+
+	return in_len;
+}
+
+/*
+ * Handle the attach-multicast command written by userspace.
+ *
+ * If the (gid, mlid) pair from the command is already recorded on the QP
+ * object's mcast_list, nothing is done and the call succeeds.  Otherwise
+ * a new list entry is allocated, the QP is attached with
+ * ib_attach_mcast() and, on success, the entry is appended to mcast_list.
+ *
+ * Returns @in_len on success, -EFAULT if the command cannot be copied,
+ * -EINVAL if the QP handle does not resolve, -ENOMEM if the entry cannot
+ * be allocated, or the error from ib_attach_mcast().
+ */
+ssize_t ib_uverbs_attach_mcast(struct ib_uverbs_file *file,
+			       const char __user *buf, int in_len,
+			       int out_len)
+{
+	struct ib_uverbs_attach_mcast cmd;
+	struct ib_uverbs_mcast_entry *entry;
+	struct ib_uqp_object         *uqp;
+	struct ib_qp                 *qp;
+	int                           rc = 0;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	qp = idr_write_qp(cmd.qp_handle, file->ucontext);
+	if (!qp)
+		return -EINVAL;
+
+	uqp = container_of(qp->uobject, struct ib_uqp_object, uevent.uobject);
+
+	list_for_each_entry(entry, &uqp->mcast_list, list) {
+		if (cmd.mlid != entry->lid)
+			continue;
+		if (memcmp(cmd.gid, entry->gid.raw, sizeof entry->gid.raw))
+			continue;
+		/* Already attached: report success without doing anything. */
+		goto unlock;
+	}
+
+	entry = kmalloc(sizeof *entry, GFP_KERNEL);
+	if (!entry) {
+		rc = -ENOMEM;
+		goto unlock;
+	}
+
+	entry->lid = cmd.mlid;
+	memcpy(entry->gid.raw, cmd.gid, sizeof entry->gid.raw);
+
+	rc = ib_attach_mcast(qp, &entry->gid, cmd.mlid);
+	if (rc)
+		kfree(entry);
+	else
+		list_add_tail(&entry->list, &uqp->mcast_list);
+
+unlock:
+	put_qp_write(qp);
+
+	return rc ? rc : in_len;
+}
+
+/*
+ * Handle the detach-multicast command written by userspace.
+ *
+ * Detaches the QP named by cmd.qp_handle from the (gid, mlid) group with
+ * ib_detach_mcast() and, if that succeeds, removes and frees the first
+ * matching entry on the QP object's mcast_list.
+ *
+ * Returns @in_len on success, -EFAULT if the command cannot be copied,
+ * -EINVAL if the QP handle does not resolve, or the error from
+ * ib_detach_mcast().
+ */
+ssize_t ib_uverbs_detach_mcast(struct ib_uverbs_file *file,
+			       const char __user *buf, int in_len,
+			       int out_len)
+{
+	struct ib_uverbs_detach_mcast cmd;
+	struct ib_uverbs_mcast_entry *entry;
+	struct ib_uqp_object         *uqp;
+	struct ib_qp                 *qp;
+	int                           rc;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	qp = idr_write_qp(cmd.qp_handle, file->ucontext);
+	if (!qp)
+		return -EINVAL;
+
+	rc = ib_detach_mcast(qp, (union ib_gid *) cmd.gid, cmd.mlid);
+	if (!rc) {
+		uqp = container_of(qp->uobject, struct ib_uqp_object,
+				   uevent.uobject);
+
+		list_for_each_entry(entry, &uqp->mcast_list, list) {
+			if (cmd.mlid != entry->lid)
+				continue;
+			if (memcmp(cmd.gid, entry->gid.raw,
+				   sizeof entry->gid.raw))
+				continue;
+			/* Drop the bookkeeping entry and stop iterating. */
+			list_del(&entry->list);
+			kfree(entry);
+			break;
+		}
+	}
+
+	put_qp_write(qp);
+
+	return rc ? rc : in_len;
+}
+
+/*
+ * Convert one user flow specification into its kernel representation.
+ *
+ * The type is copied from @kern_spec to @ib_spec.  For the ETH, IPV4 and
+ * TCP/UDP spec types the kernel size is stored in @ib_spec, compared
+ * against the size supplied in @kern_spec, and the value and mask filters
+ * are copied across.
+ *
+ * Returns 0 on success, or -EINVAL if the reserved field is set, the
+ * type is not one of the above, or the sizes do not match.
+ */
+static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec,
+				union ib_flow_spec *ib_spec)
+{
+	if (kern_spec->reserved)
+		return -EINVAL;
+
+	ib_spec->type = kern_spec->type;
+
+	switch (ib_spec->type) {
+	case IB_FLOW_SPEC_ETH:
+		ib_spec->eth.size = sizeof(struct ib_flow_spec_eth);
+		if (kern_spec->eth.size != ib_spec->eth.size)
+			return -EINVAL;
+		memcpy(&ib_spec->eth.val, &kern_spec->eth.val,
+		       sizeof(struct ib_flow_eth_filter));
+		memcpy(&ib_spec->eth.mask, &kern_spec->eth.mask,
+		       sizeof(struct ib_flow_eth_filter));
+		return 0;
+
+	case IB_FLOW_SPEC_IPV4:
+		ib_spec->ipv4.size = sizeof(struct ib_flow_spec_ipv4);
+		if (kern_spec->ipv4.size != ib_spec->ipv4.size)
+			return -EINVAL;
+		memcpy(&ib_spec->ipv4.val, &kern_spec->ipv4.val,
+		       sizeof(struct ib_flow_ipv4_filter));
+		memcpy(&ib_spec->ipv4.mask, &kern_spec->ipv4.mask,
+		       sizeof(struct ib_flow_ipv4_filter));
+		return 0;
+
+	case IB_FLOW_SPEC_TCP:
+	case IB_FLOW_SPEC_UDP:
+		/* TCP and UDP share one spec layout. */
+		ib_spec->tcp_udp.size = sizeof(struct ib_flow_spec_tcp_udp);
+		if (kern_spec->tcp_udp.size != ib_spec->tcp_udp.size)
+			return -EINVAL;
+		memcpy(&ib_spec->tcp_udp.val, &kern_spec->tcp_udp.val,
+		       sizeof(struct ib_flow_tcp_udp_filter));
+		memcpy(&ib_spec->tcp_udp.mask, &kern_spec->tcp_udp.mask,
+		       sizeof(struct ib_flow_tcp_udp_filter));
+		return 0;
+
+	default:
+		return -EINVAL;
+	}
+}
+
+/*
+ * Handle the extended create-flow command.
+ *
+ * Reads a struct ib_uverbs_create_flow followed by cmd.flow_attr.size
+ * bytes of flow specifications from @ucore, converts them into a struct
+ * ib_flow_attr, creates the flow on the QP named by cmd.qp_handle and
+ * returns the new flow handle through @ucore.
+ *
+ * Returns 0 on success or a negative errno (-EINVAL, -ENOSPC, -EPERM,
+ * -ENOMEM, or an error from the copy helpers, ib_create_flow() or
+ * idr_add_uobj()).
+ */
+int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
+ struct ib_udata *ucore,
+ struct ib_udata *uhw)
+{
+ struct ib_uverbs_create_flow cmd;
+ struct ib_uverbs_create_flow_resp resp;
+ struct ib_uobject *uobj;
+ struct ib_flow *flow_id;
+ struct ib_uverbs_flow_attr *kern_flow_attr;
+ struct ib_flow_attr *flow_attr;
+ struct ib_qp *qp;
+ int err = 0;
+ void *kern_spec;
+ void *ib_spec;
+ int i;
+
+ if (ucore->inlen < sizeof(cmd))
+ return -EINVAL;
+
+ if (ucore->outlen < sizeof(resp))
+ return -ENOSPC;
+
+ err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd));
+ if (err)
+ return err;
+
+ /*
+ * Step the input cursor past the fixed header so that the later
+ * ib_copy_from_udata() call reads the trailing flow specifications.
+ */
+ ucore->inbuf += sizeof(cmd);
+ ucore->inlen -= sizeof(cmd);
+
+ if (cmd.comp_mask)
+ return -EINVAL;
+
+ /*
+ * CAP_NET_RAW is always required; IB_FLOW_ATTR_SNIFFER additionally
+ * requires CAP_NET_ADMIN.
+ */
+ if ((cmd.flow_attr.type == IB_FLOW_ATTR_SNIFFER &&
+ !capable(CAP_NET_ADMIN)) || !capable(CAP_NET_RAW))
+ return -EPERM;
+
+ if (cmd.flow_attr.num_of_specs > IB_FLOW_SPEC_SUPPORT_LAYERS)
+ return -EINVAL;
+
+ /*
+ * The spec area must fit in the remaining input and cannot be larger
+ * than num_of_specs maximum-sized user specs.
+ */
+ if (cmd.flow_attr.size > ucore->inlen ||
+ cmd.flow_attr.size >
+ (cmd.flow_attr.num_of_specs * sizeof(struct ib_uverbs_flow_spec)))
+ return -EINVAL;
+
+ if (cmd.flow_attr.reserved[0] ||
+ cmd.flow_attr.reserved[1])
+ return -EINVAL;
+
+ /*
+ * kern_flow_attr is heap-allocated only when there are specs to read;
+ * otherwise it aliases cmd.flow_attr.  Every kfree() of it below is
+ * therefore guarded by cmd.flow_attr.num_of_specs.
+ */
+ if (cmd.flow_attr.num_of_specs) {
+ kern_flow_attr = kmalloc(sizeof(*kern_flow_attr) + cmd.flow_attr.size,
+ GFP_KERNEL);
+ if (!kern_flow_attr)
+ return -ENOMEM;
+
+ memcpy(kern_flow_attr, &cmd.flow_attr, sizeof(*kern_flow_attr));
+ err = ib_copy_from_udata(kern_flow_attr + 1, ucore,
+ cmd.flow_attr.size);
+ if (err)
+ goto err_free_attr;
+ } else {
+ kern_flow_attr = &cmd.flow_attr;
+ }
+
+ uobj = kmalloc(sizeof(*uobj), GFP_KERNEL);
+ if (!uobj) {
+ err = -ENOMEM;
+ goto err_free_attr;
+ }
+ /* The new object's mutex is held for writing until it is marked live. */
+ init_uobj(uobj, 0, file->ucontext, &rule_lock_class);
+ down_write(&uobj->mutex);
+
+ qp = idr_read_qp(cmd.qp_handle, file->ucontext);
+ if (!qp) {
+ err = -EINVAL;
+ goto err_uobj;
+ }
+
+ flow_attr = kmalloc(sizeof(*flow_attr) + cmd.flow_attr.size, GFP_KERNEL);
+ if (!flow_attr) {
+ err = -ENOMEM;
+ goto err_put;
+ }
+
+ flow_attr->type = kern_flow_attr->type;
+ flow_attr->priority = kern_flow_attr->priority;
+ flow_attr->num_of_specs = kern_flow_attr->num_of_specs;
+ flow_attr->port = kern_flow_attr->port;
+ flow_attr->flags = kern_flow_attr->flags;
+ /* Starts at the header size and grows by each converted spec below. */
+ flow_attr->size = sizeof(*flow_attr);
+
+ /*
+ * Walk the user specs that follow kern_flow_attr, converting each into
+ * the area that follows flow_attr.  cmd.flow_attr.size is reused as the
+ * count of user spec bytes not yet consumed; the loop stops early when
+ * too few bytes remain for the next spec.
+ */
+ kern_spec = kern_flow_attr + 1;
+ ib_spec = flow_attr + 1;
+ for (i = 0; i < flow_attr->num_of_specs &&
+ cmd.flow_attr.size > offsetof(struct ib_uverbs_flow_spec, reserved) &&
+ cmd.flow_attr.size >=
+ ((struct ib_uverbs_flow_spec *)kern_spec)->size; i++) {
+ err = kern_spec_to_ib_spec(kern_spec, ib_spec);
+ if (err)
+ goto err_free;
+ flow_attr->size +=
+ ((union ib_flow_spec *) ib_spec)->size;
+ cmd.flow_attr.size -= ((struct ib_uverbs_flow_spec *)kern_spec)->size;
+ kern_spec += ((struct ib_uverbs_flow_spec *) kern_spec)->size;
+ ib_spec += ((union ib_flow_spec *) ib_spec)->size;
+ }
+ /* Reject leftover bytes or fewer converted specs than announced. */
+ if (cmd.flow_attr.size || (i != flow_attr->num_of_specs)) {
+ pr_warn("create flow failed, flow %d: %d bytes left from uverb cmd\n",
+ i, cmd.flow_attr.size);
+ err = -EINVAL;
+ goto err_free;
+ }
+ flow_id = ib_create_flow(qp, flow_attr, IB_FLOW_DOMAIN_USER);
+ if (IS_ERR(flow_id)) {
+ err = PTR_ERR(flow_id);
+ goto err_free;
+ }
+ flow_id->qp = qp;
+ flow_id->uobject = uobj;
+ uobj->object = flow_id;
+
+ err = idr_add_uobj(&ib_uverbs_rule_idr, uobj);
+ if (err)
+ goto destroy_flow;
+
+ memset(&resp, 0, sizeof(resp));
+ resp.flow_handle = uobj->id;
+
+ err = ib_copy_to_udata(ucore,
+ &resp, sizeof(resp));
+ if (err)
+ goto err_copy;
+
+ put_qp_read(qp);
+ mutex_lock(&file->mutex);
+ list_add_tail(&uobj->list, &file->ucontext->rule_list);
+ mutex_unlock(&file->mutex);
+
+ uobj->live = 1;
+
+ up_write(&uobj->mutex);
+ kfree(flow_attr);
+ if (cmd.flow_attr.num_of_specs)
+ kfree(kern_flow_attr);
+ return 0;
+ /* Error unwinding: each label undoes one more step, newest first. */
+err_copy:
+ idr_remove_uobj(&ib_uverbs_rule_idr, uobj);
+destroy_flow:
+ ib_destroy_flow(flow_id);
+err_free:
+ kfree(flow_attr);
+err_put:
+ put_qp_read(qp);
+err_uobj:
+ put_uobj_write(uobj);
+err_free_attr:
+ if (cmd.flow_attr.num_of_specs)
+ kfree(kern_flow_attr);
+ return err;
+}
+
+/*
+ * Handle the extended destroy-flow command.
+ *
+ * Looks up the user object for cmd.flow_handle and destroys the flow it
+ * refers to.  If ib_destroy_flow() fails, the handle is left registered
+ * and the error is returned, matching the other destroy handlers in this
+ * file.  Removing the handle in that case would orphan a flow that still
+ * exists.  On success the object is marked not-live, removed from the
+ * rule idr and from its list, and released.
+ *
+ * Returns 0 on success, -EINVAL if the input is too short, comp_mask is
+ * set or the handle does not resolve, or the error from
+ * ib_copy_from_udata() / ib_destroy_flow().
+ */
+int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file,
+			      struct ib_udata *ucore,
+			      struct ib_udata *uhw)
+{
+	struct ib_uverbs_destroy_flow	cmd;
+	struct ib_flow			*flow_id;
+	struct ib_uobject		*uobj;
+	int				ret;
+
+	if (ucore->inlen < sizeof(cmd))
+		return -EINVAL;
+
+	ret = ib_copy_from_udata(&cmd, ucore, sizeof(cmd));
+	if (ret)
+		return ret;
+
+	if (cmd.comp_mask)
+		return -EINVAL;
+
+	uobj = idr_write_uobj(&ib_uverbs_rule_idr, cmd.flow_handle,
+			      file->ucontext);
+	if (!uobj)
+		return -EINVAL;
+	flow_id = uobj->object;
+
+	ret = ib_destroy_flow(flow_id);
+	if (!ret)
+		uobj->live = 0;
+
+	put_uobj_write(uobj);
+
+	/* Keep the handle if the flow could not be destroyed. */
+	if (ret)
+		return ret;
+
+	idr_remove_uobj(&ib_uverbs_rule_idr, uobj);
+
+	mutex_lock(&file->mutex);
+	list_del(&uobj->list);
+	mutex_unlock(&file->mutex);
+
+	put_uobj(uobj);
+
+	return 0;
+}
+
+/*
+ * Common implementation behind the create-SRQ and create-XSRQ commands.
+ *
+ * Allocates an SRQ user object, resolves the PD (and, for IB_SRQT_XRC,
+ * the XRC domain and CQ) named in @cmd, asks the device to create the
+ * SRQ, registers the user object in the SRQ idr and writes a struct
+ * ib_uverbs_create_srq_resp to cmd->response.
+ *
+ * Returns 0 on success, -ENOMEM if the user object cannot be allocated,
+ * -EINVAL if a handle does not resolve, -EFAULT if the response cannot be
+ * copied, or the error from the device's create_srq method or
+ * idr_add_uobj().
+ */
+static int __uverbs_create_xsrq(struct ib_uverbs_file *file,
+ struct ib_uverbs_create_xsrq *cmd,
+ struct ib_udata *udata)
+{
+ struct ib_uverbs_create_srq_resp resp;
+ struct ib_usrq_object *obj;
+ struct ib_pd *pd;
+ struct ib_srq *srq;
+ struct ib_uobject *uninitialized_var(xrcd_uobj);
+ struct ib_srq_init_attr attr;
+ int ret;
+
+ obj = kmalloc(sizeof *obj, GFP_KERNEL);
+ if (!obj)
+ return -ENOMEM;
+
+ /* The new object's mutex is held for writing until it is marked live. */
+ init_uobj(&obj->uevent.uobject, cmd->user_handle, file->ucontext, &srq_lock_class);
+ down_write(&obj->uevent.uobject.mutex);
+
+ /* xrcd_uobj, obj->uxrcd and attr.ext.xrc are only set for XRC SRQs. */
+ if (cmd->srq_type == IB_SRQT_XRC) {
+ attr.ext.xrc.xrcd = idr_read_xrcd(cmd->xrcd_handle, file->ucontext, &xrcd_uobj);
+ if (!attr.ext.xrc.xrcd) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject);
+ atomic_inc(&obj->uxrcd->refcnt);
+
+ attr.ext.xrc.cq = idr_read_cq(cmd->cq_handle, file->ucontext, 0);
+ if (!attr.ext.xrc.cq) {
+ ret = -EINVAL;
+ goto err_put_xrcd;
+ }
+ }
+
+ pd = idr_read_pd(cmd->pd_handle, file->ucontext);
+ if (!pd) {
+ ret = -EINVAL;
+ goto err_put_cq;
+ }
+
+ attr.event_handler = ib_uverbs_srq_event_handler;
+ attr.srq_context = file;
+ attr.srq_type = cmd->srq_type;
+ attr.attr.max_wr = cmd->max_wr;
+ attr.attr.max_sge = cmd->max_sge;
+ attr.attr.srq_limit = cmd->srq_limit;
+
+ obj->uevent.events_reported = 0;
+ INIT_LIST_HEAD(&obj->uevent.event_list);
+
+ srq = pd->device->create_srq(pd, &attr, udata);
+ if (IS_ERR(srq)) {
+ ret = PTR_ERR(srq);
+ goto err_put;
+ }
+
+ /*
+ * The device method is called directly, so the generic SRQ fields and
+ * the use counts are set up here.
+ */
+ srq->device = pd->device;
+ srq->pd = pd;
+ srq->srq_type = cmd->srq_type;
+ srq->uobject = &obj->uevent.uobject;
+ srq->event_handler = attr.event_handler;
+ srq->srq_context = attr.srq_context;
+
+ if (cmd->srq_type == IB_SRQT_XRC) {
+ srq->ext.xrc.cq = attr.ext.xrc.cq;
+ srq->ext.xrc.xrcd = attr.ext.xrc.xrcd;
+ atomic_inc(&attr.ext.xrc.cq->usecnt);
+ atomic_inc(&attr.ext.xrc.xrcd->usecnt);
+ }
+
+ atomic_inc(&pd->usecnt);
+ atomic_set(&srq->usecnt, 0);
+
+ obj->uevent.uobject.object = srq;
+ ret = idr_add_uobj(&ib_uverbs_srq_idr, &obj->uevent.uobject);
+ if (ret)
+ goto err_destroy;
+
+ memset(&resp, 0, sizeof resp);
+ resp.srq_handle = obj->uevent.uobject.id;
+ /* Report the values left in attr after the create_srq call. */
+ resp.max_wr = attr.attr.max_wr;
+ resp.max_sge = attr.attr.max_sge;
+ if (cmd->srq_type == IB_SRQT_XRC)
+ resp.srqn = srq->ext.xrc.srq_num;
+
+ if (copy_to_user((void __user *) (unsigned long) cmd->response,
+ &resp, sizeof resp)) {
+ ret = -EFAULT;
+ goto err_copy;
+ }
+
+ if (cmd->srq_type == IB_SRQT_XRC) {
+ put_uobj_read(xrcd_uobj);
+ put_cq_read(attr.ext.xrc.cq);
+ }
+ put_pd_read(pd);
+
+ mutex_lock(&file->mutex);
+ list_add_tail(&obj->uevent.uobject.list, &file->ucontext->srq_list);
+ mutex_unlock(&file->mutex);
+
+ obj->uevent.uobject.live = 1;
+
+ up_write(&obj->uevent.uobject.mutex);
+
+ return 0;
+
+ /* Error unwinding: each label undoes one more step, newest first. */
+err_copy:
+ idr_remove_uobj(&ib_uverbs_srq_idr, &obj->uevent.uobject);
+
+err_destroy:
+ ib_destroy_srq(srq);
+
+err_put:
+ put_pd_read(pd);
+
+err_put_cq:
+ if (cmd->srq_type == IB_SRQT_XRC)
+ put_cq_read(attr.ext.xrc.cq);
+
+err_put_xrcd:
+ if (cmd->srq_type == IB_SRQT_XRC) {
+ atomic_dec(&obj->uxrcd->refcnt);
+ put_uobj_read(xrcd_uobj);
+ }
+
+err:
+ put_uobj_write(&obj->uevent.uobject);
+ return ret;
+}
+
+/*
+ * Handle the create-SRQ command written by userspace.
+ *
+ * The command is repackaged as a struct ib_uverbs_create_xsrq with
+ * srq_type IB_SRQT_BASIC and passed to __uverbs_create_xsrq().  Input
+ * bytes after the command header and output space after the response are
+ * described by the struct ib_udata handed to that function.
+ *
+ * Returns @in_len on success, -ENOSPC if @out_len is smaller than the
+ * response, -EFAULT if the command cannot be copied, or the error from
+ * __uverbs_create_xsrq().
+ */
+ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file,
+			     const char __user *buf, int in_len,
+			     int out_len)
+{
+	struct ib_uverbs_create_srq      cmd;
+	struct ib_uverbs_create_xsrq     xcmd;
+	struct ib_uverbs_create_srq_resp resp;
+	struct ib_udata                  udata;
+	int                              rc;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	/* Only the fields used for a basic SRQ are filled in. */
+	xcmd.srq_type    = IB_SRQT_BASIC;
+	xcmd.response    = cmd.response;
+	xcmd.user_handle = cmd.user_handle;
+	xcmd.pd_handle   = cmd.pd_handle;
+	xcmd.max_wr      = cmd.max_wr;
+	xcmd.max_sge     = cmd.max_sge;
+	xcmd.srq_limit   = cmd.srq_limit;
+
+	INIT_UDATA(&udata, buf + sizeof cmd,
+		   (unsigned long) cmd.response + sizeof resp,
+		   in_len - sizeof cmd, out_len - sizeof resp);
+
+	rc = __uverbs_create_xsrq(file, &xcmd, &udata);
+
+	return rc ? rc : in_len;
+}
+
+/*
+ * Handle the create-XSRQ command written by userspace.
+ *
+ * The command is passed as-is to __uverbs_create_xsrq().  Input bytes
+ * after the command header and output space after the response are
+ * described by the struct ib_udata handed to that function.
+ *
+ * Returns @in_len on success, -ENOSPC if @out_len is smaller than the
+ * response, -EFAULT if the command cannot be copied, or the error from
+ * __uverbs_create_xsrq().
+ */
+ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file,
+			      const char __user *buf, int in_len, int out_len)
+{
+	struct ib_uverbs_create_xsrq     cmd;
+	struct ib_uverbs_create_srq_resp resp;
+	struct ib_udata                  udata;
+	int                              rc;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	INIT_UDATA(&udata, buf + sizeof cmd,
+		   (unsigned long) cmd.response + sizeof resp,
+		   in_len - sizeof cmd, out_len - sizeof resp);
+
+	rc = __uverbs_create_xsrq(file, &cmd, &udata);
+
+	return rc ? rc : in_len;
+}
+
+/*
+ * Handle the modify-SRQ command written by userspace.
+ *
+ * Passes cmd.max_wr, cmd.srq_limit and cmd.attr_mask to the device's
+ * modify_srq method for the SRQ named by cmd.srq_handle.  Input bytes
+ * after the command header are described by the struct ib_udata handed
+ * to the device.
+ *
+ * Returns @in_len on success, -EFAULT if the command cannot be copied,
+ * -EINVAL if the handle does not resolve, or the error from the device.
+ */
+ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file,
+			     const char __user *buf, int in_len,
+			     int out_len)
+{
+	struct ib_uverbs_modify_srq cmd;
+	struct ib_srq_attr          srq_attr;
+	struct ib_udata             udata;
+	struct ib_srq              *srq;
+	int                         rc;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	INIT_UDATA(&udata, buf + sizeof cmd, NULL, in_len - sizeof cmd,
+		   out_len);
+
+	srq = idr_read_srq(cmd.srq_handle, file->ucontext);
+	if (!srq)
+		return -EINVAL;
+
+	srq_attr.srq_limit = cmd.srq_limit;
+	srq_attr.max_wr    = cmd.max_wr;
+
+	rc = srq->device->modify_srq(srq, &srq_attr, cmd.attr_mask, &udata);
+
+	put_srq_read(srq);
+
+	if (rc)
+		return rc;
+
+	return in_len;
+}
+
+/*
+ * Handle the query-SRQ command written by userspace.
+ *
+ * Queries the SRQ named by cmd.srq_handle with ib_query_srq() and writes
+ * its max_wr, max_sge and srq_limit to cmd.response.
+ *
+ * Returns @in_len on success, -ENOSPC if @out_len is smaller than the
+ * response, -EFAULT if the command or response cannot be copied, -EINVAL
+ * if the handle does not resolve, or the error from ib_query_srq().
+ */
+ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file,
+			    const char __user *buf,
+			    int in_len, int out_len)
+{
+	struct ib_uverbs_query_srq      cmd;
+	struct ib_uverbs_query_srq_resp resp;
+	struct ib_srq_attr              srq_attr;
+	struct ib_srq                  *srq;
+	int                             rc;
+
+	if (out_len < sizeof resp)
+		return -ENOSPC;
+
+	if (copy_from_user(&cmd, buf, sizeof cmd))
+		return -EFAULT;
+
+	srq = idr_read_srq(cmd.srq_handle, file->ucontext);
+	if (!srq)
+		return -EINVAL;
+
+	rc = ib_query_srq(srq, &srq_attr);
+
+	/* The SRQ is not needed any more once it has been queried. */
+	put_srq_read(srq);
+
+	if (rc)
+		return rc;
+
+	memset(&resp, 0, sizeof resp);
+
+	resp.srq_limit = srq_attr.srq_limit;
+	resp.max_sge   = srq_attr.max_sge;
+	resp.max_wr    = srq_attr.max_wr;
+
+	return copy_to_user((void __user *) (unsigned long) cmd.response,
+			    &resp, sizeof resp) ? -EFAULT : in_len;
+}
+
+/*
+ * Handle the destroy-SRQ command written by userspace.
+ *
+ * Looks up the SRQ user object by cmd.srq_handle, destroys the SRQ and,
+ * on success, removes the user object from the SRQ idr and from its list,
+ * releases its pending events and reports obj->events_reported to
+ * userspace through cmd.response.
+ *
+ * Returns @in_len on success, -EFAULT if the command or the response
+ * cannot be copied, -EINVAL if the handle does not resolve, or the error
+ * returned by ib_destroy_srq().
+ */
+ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_destroy_srq cmd;
+ struct ib_uverbs_destroy_srq_resp resp;
+ struct ib_uobject *uobj;
+ struct ib_srq *srq;
+ struct ib_uevent_object *obj;
+ int ret = -EINVAL;
+ struct ib_usrq_object *us;
+ enum ib_srq_type srq_type;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ uobj = idr_write_uobj(&ib_uverbs_srq_idr, cmd.srq_handle, file->ucontext);
+ if (!uobj)
+ return -EINVAL;
+ srq = uobj->object;
+ obj = container_of(uobj, struct ib_uevent_object, uobject);
+ /* Saved now because srq is not dereferenced again after the destroy. */
+ srq_type = srq->srq_type;
+
+ /* The object is only marked not-live if the destroy succeeded. */
+ ret = ib_destroy_srq(srq);
+ if (!ret)
+ uobj->live = 0;
+
+ put_uobj_write(uobj);
+
+ /* On failure the handle is left in place and the error is returned. */
+ if (ret)
+ return ret;
+
+ /* XRC SRQs took a count on their XRC domain object at creation. */
+ if (srq_type == IB_SRQT_XRC) {
+ us = container_of(obj, struct ib_usrq_object, uevent);
+ atomic_dec(&us->uxrcd->refcnt);
+ }
+
+ idr_remove_uobj(&ib_uverbs_srq_idr, uobj);
+
+ mutex_lock(&file->mutex);
+ list_del(&uobj->list);
+ mutex_unlock(&file->mutex);
+
+ ib_uverbs_release_uevent(file, obj);
+
+ memset(&resp, 0, sizeof resp);
+ /* Read the counter before the put_uobj() below drops our reference. */
+ resp.events_reported = obj->events_reported;
+
+ put_uobj(uobj);
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp))
+ ret = -EFAULT;
+
+ return ret ? ret : in_len;
+}
+
+/*
+ * Handle the extended query-device command.
+ *
+ * Fills a struct ib_uverbs_ex_query_device_resp from the device's
+ * attributes.  The base part (everything before odp_caps) is always
+ * returned; odp_caps is appended only when the caller's output buffer has
+ * room for it.  resp.response_length tells userspace how many bytes were
+ * written.
+ *
+ * The response is zeroed before any field is filled in, so that fields or
+ * padding not explicitly assigned never carry stale kernel stack contents
+ * to userspace.
+ *
+ * Returns 0 on success, -EINVAL if the input is too short or comp_mask /
+ * reserved are set, -ENOSPC if the output buffer cannot hold the base
+ * part, or the error from the copy helpers or the device's query_device
+ * method.
+ */
+int ib_uverbs_ex_query_device(struct ib_uverbs_file *file,
+			      struct ib_udata *ucore,
+			      struct ib_udata *uhw)
+{
+	struct ib_uverbs_ex_query_device_resp resp;
+	struct ib_uverbs_ex_query_device  cmd;
+	struct ib_device_attr attr;
+	struct ib_device *device;
+	int err;
+
+	device = file->device->ib_dev;
+	if (ucore->inlen < sizeof(cmd))
+		return -EINVAL;
+
+	err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd));
+	if (err)
+		return err;
+
+	if (cmd.comp_mask)
+		return -EINVAL;
+
+	if (cmd.reserved)
+		return -EINVAL;
+
+	/* Also leaves comp_mask and all of odp_caps zero by default. */
+	memset(&resp, 0, sizeof(resp));
+
+	resp.response_length = offsetof(typeof(resp), odp_caps);
+
+	if (ucore->outlen < resp.response_length)
+		return -ENOSPC;
+
+	err = device->query_device(device, &attr);
+	if (err)
+		return err;
+
+	copy_query_dev_fields(file, &resp.base, &attr);
+
+	/* Older callers with a short buffer only get the base part. */
+	if (ucore->outlen < resp.response_length + sizeof(resp.odp_caps))
+		goto end;
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+	resp.odp_caps.general_caps = attr.odp_caps.general_caps;
+	resp.odp_caps.per_transport_caps.rc_odp_caps =
+		attr.odp_caps.per_transport_caps.rc_odp_caps;
+	resp.odp_caps.per_transport_caps.uc_odp_caps =
+		attr.odp_caps.per_transport_caps.uc_odp_caps;
+	resp.odp_caps.per_transport_caps.ud_odp_caps =
+		attr.odp_caps.per_transport_caps.ud_odp_caps;
+#endif
+	resp.response_length += sizeof(resp.odp_caps);
+
+end:
+	return ib_copy_to_udata(ucore, &resp, resp.response_length);
+}
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
new file mode 100644
index 000000000..88cce9bb7
--- /dev/null
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -0,0 +1,1039 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/sched.h>
+#include <linux/file.h>
+#include <linux/cdev.h>
+#include <linux/anon_inodes.h>
+#include <linux/slab.h>
+
+#include <asm/uaccess.h>
+
+#include "uverbs.h"
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("InfiniBand userspace verbs access");
+MODULE_LICENSE("Dual BSD/GPL");
+
+/*
+ * Character-device numbering for uverbs: the fixed major, the first minor,
+ * and the number of minors in the statically registered region.
+ */
+enum {
+ IB_UVERBS_MAJOR = 231,
+ IB_UVERBS_BASE_MINOR = 192,
+ IB_UVERBS_MAX_DEVICES = 32
+};
+
+/* dev_t of the first uverbs device in the static region. */
+#define IB_UVERBS_BASE_DEV MKDEV(IB_UVERBS_MAJOR, IB_UVERBS_BASE_MINOR)
+
+/* Device class created in ib_uverbs_init(); uverbs devices are created under it. */
+static struct class *uverbs_class;
+
+/*
+ * One IDR per kind of user object.  ib_uverbs_cleanup_ucontext() removes
+ * leftover objects from these when a context is torn down.
+ * NOTE(review): ib_uverbs_idr_lock presumably serialises access to the IDRs;
+ * its users are outside this part of the file -- confirm.
+ */
+DEFINE_SPINLOCK(ib_uverbs_idr_lock);
+DEFINE_IDR(ib_uverbs_pd_idr);
+DEFINE_IDR(ib_uverbs_mr_idr);
+DEFINE_IDR(ib_uverbs_mw_idr);
+DEFINE_IDR(ib_uverbs_ah_idr);
+DEFINE_IDR(ib_uverbs_cq_idr);
+DEFINE_IDR(ib_uverbs_qp_idr);
+DEFINE_IDR(ib_uverbs_srq_idr);
+DEFINE_IDR(ib_uverbs_xrcd_idr);
+DEFINE_IDR(ib_uverbs_rule_idr);
+
+/* map_lock is held by ib_uverbs_add_one() while it claims a bit in dev_map. */
+static DEFINE_SPINLOCK(map_lock);
+/* One bit per minor in the static region; a set bit means the minor is in use. */
+static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES);
+
+/*
+ * Dispatch table for the legacy write() commands, indexed by command number.
+ * Slots without an initialiser are NULL; ib_uverbs_write() rejects those
+ * (and out-of-range indices) with -EINVAL before dispatching.
+ */
+static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len) = {
+ [IB_USER_VERBS_CMD_GET_CONTEXT] = ib_uverbs_get_context,
+ [IB_USER_VERBS_CMD_QUERY_DEVICE] = ib_uverbs_query_device,
+ [IB_USER_VERBS_CMD_QUERY_PORT] = ib_uverbs_query_port,
+ [IB_USER_VERBS_CMD_ALLOC_PD] = ib_uverbs_alloc_pd,
+ [IB_USER_VERBS_CMD_DEALLOC_PD] = ib_uverbs_dealloc_pd,
+ [IB_USER_VERBS_CMD_REG_MR] = ib_uverbs_reg_mr,
+ [IB_USER_VERBS_CMD_REREG_MR] = ib_uverbs_rereg_mr,
+ [IB_USER_VERBS_CMD_DEREG_MR] = ib_uverbs_dereg_mr,
+ [IB_USER_VERBS_CMD_ALLOC_MW] = ib_uverbs_alloc_mw,
+ [IB_USER_VERBS_CMD_DEALLOC_MW] = ib_uverbs_dealloc_mw,
+ [IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL] = ib_uverbs_create_comp_channel,
+ [IB_USER_VERBS_CMD_CREATE_CQ] = ib_uverbs_create_cq,
+ [IB_USER_VERBS_CMD_RESIZE_CQ] = ib_uverbs_resize_cq,
+ [IB_USER_VERBS_CMD_POLL_CQ] = ib_uverbs_poll_cq,
+ [IB_USER_VERBS_CMD_REQ_NOTIFY_CQ] = ib_uverbs_req_notify_cq,
+ [IB_USER_VERBS_CMD_DESTROY_CQ] = ib_uverbs_destroy_cq,
+ [IB_USER_VERBS_CMD_CREATE_QP] = ib_uverbs_create_qp,
+ [IB_USER_VERBS_CMD_QUERY_QP] = ib_uverbs_query_qp,
+ [IB_USER_VERBS_CMD_MODIFY_QP] = ib_uverbs_modify_qp,
+ [IB_USER_VERBS_CMD_DESTROY_QP] = ib_uverbs_destroy_qp,
+ [IB_USER_VERBS_CMD_POST_SEND] = ib_uverbs_post_send,
+ [IB_USER_VERBS_CMD_POST_RECV] = ib_uverbs_post_recv,
+ [IB_USER_VERBS_CMD_POST_SRQ_RECV] = ib_uverbs_post_srq_recv,
+ [IB_USER_VERBS_CMD_CREATE_AH] = ib_uverbs_create_ah,
+ [IB_USER_VERBS_CMD_DESTROY_AH] = ib_uverbs_destroy_ah,
+ [IB_USER_VERBS_CMD_ATTACH_MCAST] = ib_uverbs_attach_mcast,
+ [IB_USER_VERBS_CMD_DETACH_MCAST] = ib_uverbs_detach_mcast,
+ [IB_USER_VERBS_CMD_CREATE_SRQ] = ib_uverbs_create_srq,
+ [IB_USER_VERBS_CMD_MODIFY_SRQ] = ib_uverbs_modify_srq,
+ [IB_USER_VERBS_CMD_QUERY_SRQ] = ib_uverbs_query_srq,
+ [IB_USER_VERBS_CMD_DESTROY_SRQ] = ib_uverbs_destroy_srq,
+ [IB_USER_VERBS_CMD_OPEN_XRCD] = ib_uverbs_open_xrcd,
+ [IB_USER_VERBS_CMD_CLOSE_XRCD] = ib_uverbs_close_xrcd,
+ [IB_USER_VERBS_CMD_CREATE_XSRQ] = ib_uverbs_create_xsrq,
+ [IB_USER_VERBS_CMD_OPEN_QP] = ib_uverbs_open_qp,
+};
+
+/*
+ * Dispatch table for extended commands (those sent with
+ * IB_USER_VERBS_CMD_FLAG_EXTENDED), indexed by command number.  Handlers
+ * receive the core and provider buffers as two ib_udata descriptors.
+ * ib_uverbs_write() answers -ENOSYS for a missing or out-of-range entry.
+ */
+static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file,
+ struct ib_udata *ucore,
+ struct ib_udata *uhw) = {
+ [IB_USER_VERBS_EX_CMD_CREATE_FLOW] = ib_uverbs_ex_create_flow,
+ [IB_USER_VERBS_EX_CMD_DESTROY_FLOW] = ib_uverbs_ex_destroy_flow,
+ [IB_USER_VERBS_EX_CMD_QUERY_DEVICE] = ib_uverbs_ex_query_device,
+};
+
+static void ib_uverbs_add_one(struct ib_device *device);
+static void ib_uverbs_remove_one(struct ib_device *device);
+
+/*
+ * kref release callback for struct ib_uverbs_device: signal dev->comp so
+ * that whoever is blocked in wait_for_completion() on it can go on to free
+ * the device.  The structure itself is not freed here.
+ */
+static void ib_uverbs_release_dev(struct kref *ref)
+{
+	complete(&container_of(ref, struct ib_uverbs_device, ref)->comp);
+}
+
+/* kref release callback for an event file: free the containing structure. */
+static void ib_uverbs_release_event_file(struct kref *ref)
+{
+	kfree(container_of(ref, struct ib_uverbs_event_file, ref));
+}
+
+/*
+ * Discard every event still queued on behalf of a CQ object.
+ *
+ * Completion events (uobj->comp_list) are unlinked from the completion
+ * event file's queue under that file's lock, after which one reference on
+ * ev_file is dropped.  Async events (uobj->async_list) are unlinked from
+ * the async event file's queue under its lock.  ev_file may be NULL, in
+ * which case only the async events are handled.
+ */
+void ib_uverbs_release_ucq(struct ib_uverbs_file *file,
+			  struct ib_uverbs_event_file *ev_file,
+			  struct ib_ucq_object *uobj)
+{
+	struct ib_uverbs_event *ev, *next;
+
+	if (ev_file != NULL) {
+		spin_lock_irq(&ev_file->lock);
+		list_for_each_entry_safe(ev, next, &uobj->comp_list, obj_list) {
+			/* walk via obj_list, unlink from the file's queue */
+			list_del(&ev->list);
+			kfree(ev);
+		}
+		spin_unlock_irq(&ev_file->lock);
+
+		kref_put(&ev_file->ref, ib_uverbs_release_event_file);
+	}
+
+	spin_lock_irq(&file->async_file->lock);
+	list_for_each_entry_safe(ev, next, &uobj->async_list, obj_list) {
+		list_del(&ev->list);
+		kfree(ev);
+	}
+	spin_unlock_irq(&file->async_file->lock);
+}
+
+/*
+ * Discard every async event still queued on behalf of an event-capable
+ * object: each entry on uobj->event_list is unlinked from the async event
+ * file's queue and freed, all under the async file's lock.
+ */
+void ib_uverbs_release_uevent(struct ib_uverbs_file *file,
+			      struct ib_uevent_object *uobj)
+{
+	struct ib_uverbs_event *ev, *next;
+
+	spin_lock_irq(&file->async_file->lock);
+	list_for_each_entry_safe(ev, next, &uobj->event_list, obj_list) {
+		list_del(&ev->list);
+		kfree(ev);
+	}
+	spin_unlock_irq(&file->async_file->lock);
+}
+
+/*
+ * Detach @qp from every multicast group recorded on @uobj's mcast_list and
+ * free the bookkeeping entries.
+ */
+static void ib_uverbs_detach_umcast(struct ib_qp *qp,
+				    struct ib_uqp_object *uobj)
+{
+	struct ib_uverbs_mcast_entry *entry, *next;
+
+	list_for_each_entry_safe(entry, next, &uobj->mcast_list, list) {
+		ib_detach_mcast(qp, &entry->gid, entry->lid);
+		list_del(&entry->list);
+		kfree(entry);
+	}
+}
+
+/*
+ * Destroy every object still owned by a user context, then the context
+ * itself.  Called from ib_uverbs_close().
+ *
+ * Each per-type list is walked in turn: the object is removed from its IDR,
+ * destroyed through the matching ib_* verb, and its uobject freed.  The
+ * lists are processed in the order AH, MW, flow rule, QP, SRQ, CQ, MR,
+ * XRCD, PD.
+ *
+ * Returns 0 if @context is NULL, otherwise the result of the device's
+ * dealloc_ucontext() method.
+ */
+static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
+ struct ib_ucontext *context)
+{
+ struct ib_uobject *uobj, *tmp;
+
+ if (!context)
+ return 0;
+
+ context->closing = 1;
+
+ list_for_each_entry_safe(uobj, tmp, &context->ah_list, list) {
+ struct ib_ah *ah = uobj->object;
+
+ idr_remove_uobj(&ib_uverbs_ah_idr, uobj);
+ ib_destroy_ah(ah);
+ kfree(uobj);
+ }
+
+ /* Remove MWs before QPs, in order to support type 2A MWs. */
+ list_for_each_entry_safe(uobj, tmp, &context->mw_list, list) {
+ struct ib_mw *mw = uobj->object;
+
+ idr_remove_uobj(&ib_uverbs_mw_idr, uobj);
+ ib_dealloc_mw(mw);
+ kfree(uobj);
+ }
+
+ list_for_each_entry_safe(uobj, tmp, &context->rule_list, list) {
+ struct ib_flow *flow_id = uobj->object;
+
+ idr_remove_uobj(&ib_uverbs_rule_idr, uobj);
+ ib_destroy_flow(flow_id);
+ kfree(uobj);
+ }
+
+ list_for_each_entry_safe(uobj, tmp, &context->qp_list, list) {
+ struct ib_qp *qp = uobj->object;
+ struct ib_uqp_object *uqp =
+ container_of(uobj, struct ib_uqp_object, uevent.uobject);
+
+ idr_remove_uobj(&ib_uverbs_qp_idr, uobj);
+ /* A QP that is not its own real_qp is only closed, not destroyed. */
+ if (qp != qp->real_qp) {
+ ib_close_qp(qp);
+ } else {
+ ib_uverbs_detach_umcast(qp, uqp);
+ ib_destroy_qp(qp);
+ }
+ /* Drop async events still queued for this QP. */
+ ib_uverbs_release_uevent(file, &uqp->uevent);
+ kfree(uqp);
+ }
+
+ list_for_each_entry_safe(uobj, tmp, &context->srq_list, list) {
+ struct ib_srq *srq = uobj->object;
+ struct ib_uevent_object *uevent =
+ container_of(uobj, struct ib_uevent_object, uobject);
+
+ idr_remove_uobj(&ib_uverbs_srq_idr, uobj);
+ ib_destroy_srq(srq);
+ ib_uverbs_release_uevent(file, uevent);
+ kfree(uevent);
+ }
+
+ list_for_each_entry_safe(uobj, tmp, &context->cq_list, list) {
+ struct ib_cq *cq = uobj->object;
+ /* Read cq_context before the CQ is destroyed. */
+ struct ib_uverbs_event_file *ev_file = cq->cq_context;
+ struct ib_ucq_object *ucq =
+ container_of(uobj, struct ib_ucq_object, uobject);
+
+ idr_remove_uobj(&ib_uverbs_cq_idr, uobj);
+ ib_destroy_cq(cq);
+ ib_uverbs_release_ucq(file, ev_file, ucq);
+ kfree(ucq);
+ }
+
+ list_for_each_entry_safe(uobj, tmp, &context->mr_list, list) {
+ struct ib_mr *mr = uobj->object;
+
+ idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
+ ib_dereg_mr(mr);
+ kfree(uobj);
+ }
+
+ /* XRCD teardown runs with the device's xrcd_tree_mutex held. */
+ mutex_lock(&file->device->xrcd_tree_mutex);
+ list_for_each_entry_safe(uobj, tmp, &context->xrcd_list, list) {
+ struct ib_xrcd *xrcd = uobj->object;
+ struct ib_uxrcd_object *uxrcd =
+ container_of(uobj, struct ib_uxrcd_object, uobject);
+
+ idr_remove_uobj(&ib_uverbs_xrcd_idr, uobj);
+ ib_uverbs_dealloc_xrcd(file->device, xrcd);
+ kfree(uxrcd);
+ }
+ mutex_unlock(&file->device->xrcd_tree_mutex);
+
+ list_for_each_entry_safe(uobj, tmp, &context->pd_list, list) {
+ struct ib_pd *pd = uobj->object;
+
+ idr_remove_uobj(&ib_uverbs_pd_idr, uobj);
+ ib_dealloc_pd(pd);
+ kfree(uobj);
+ }
+
+ put_pid(context->tgid);
+
+ return context->device->dealloc_ucontext(context);
+}
+
+/*
+ * kref release callback for struct ib_uverbs_file: give back the module
+ * reference and the uverbs-device reference taken in ib_uverbs_open(), then
+ * free the file structure.
+ */
+static void ib_uverbs_release_file(struct kref *ref)
+{
+	struct ib_uverbs_file *ufile;
+
+	ufile = container_of(ref, struct ib_uverbs_file, ref);
+
+	module_put(ufile->device->ib_dev->owner);
+	kref_put(&ufile->device->ref, ib_uverbs_release_dev);
+
+	kfree(ufile);
+}
+
+/*
+ * read() on an event file: deliver exactly one queued event descriptor.
+ *
+ * Blocks until an event is available unless the file is O_NONBLOCK
+ * (-EAGAIN); returns -ERESTARTSYS if the wait is interrupted, -EINVAL if
+ * @count is too small for one descriptor (the event stays queued), -EFAULT
+ * if the copy to user space fails, otherwise the descriptor size.
+ */
+static ssize_t ib_uverbs_event_read(struct file *filp, char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct ib_uverbs_event_file *file = filp->private_data;
+ struct ib_uverbs_event *event;
+ int eventsz;
+ int ret = 0;
+
+ spin_lock_irq(&file->lock);
+
+ /* The lock is dropped while sleeping and re-taken before re-checking. */
+ while (list_empty(&file->event_list)) {
+ spin_unlock_irq(&file->lock);
+
+ if (filp->f_flags & O_NONBLOCK)
+ return -EAGAIN;
+
+ if (wait_event_interruptible(file->poll_wait,
+ !list_empty(&file->event_list)))
+ return -ERESTARTSYS;
+
+ spin_lock_irq(&file->lock);
+ }
+
+ event = list_entry(file->event_list.next, struct ib_uverbs_event, list);
+
+ /* Descriptor size depends on whether this is the async or a completion file. */
+ if (file->is_async)
+ eventsz = sizeof (struct ib_uverbs_async_event_desc);
+ else
+ eventsz = sizeof (struct ib_uverbs_comp_event_desc);
+
+ if (eventsz > count) {
+ ret = -EINVAL;
+ /* Leave the event on the queue; NULL also makes the kfree() below a no-op. */
+ event = NULL;
+ } else {
+ list_del(file->event_list.next);
+ /* Events tied to an object bump its reported-count and leave its list. */
+ if (event->counter) {
+ ++(*event->counter);
+ list_del(&event->obj_list);
+ }
+ }
+
+ spin_unlock_irq(&file->lock);
+
+ /* The copy happens after the lock is released. */
+ if (event) {
+ if (copy_to_user(buf, event, eventsz))
+ ret = -EFAULT;
+ else
+ ret = eventsz;
+ }
+
+ kfree(event);
+
+ return ret;
+}
+
+/*
+ * poll() on an event file: register on the file's wait queue and report the
+ * file readable (POLLIN | POLLRDNORM) when at least one event is queued.
+ */
+static unsigned int ib_uverbs_event_poll(struct file *filp,
+					 struct poll_table_struct *wait)
+{
+	struct ib_uverbs_event_file *ev_file = filp->private_data;
+	unsigned int mask;
+
+	poll_wait(filp, &ev_file->poll_wait, wait);
+
+	spin_lock_irq(&ev_file->lock);
+	mask = list_empty(&ev_file->event_list) ? 0 : (POLLIN | POLLRDNORM);
+	spin_unlock_irq(&ev_file->lock);
+
+	return mask;
+}
+
+/* fasync() on an event file: hand the request to fasync_helper(). */
+static int ib_uverbs_event_fasync(int fd, struct file *filp, int on)
+{
+	struct ib_uverbs_event_file *ev_file = filp->private_data;
+	struct fasync_struct **queue = &ev_file->async_queue;
+
+	return fasync_helper(fd, filp, on, queue);
+}
+
+/*
+ * release() for an event file.
+ *
+ * Marks the file closed so the event handlers stop queueing to it, frees
+ * every pending event, and drops this file's reference.  For the async
+ * file it also unregisters the device event handler and drops a reference
+ * on the owning uverbs file.  Always returns 0.
+ */
+static int ib_uverbs_event_close(struct inode *inode, struct file *filp)
+{
+	struct ib_uverbs_event_file *ev_file = filp->private_data;
+	struct ib_uverbs_event *ev, *next;
+
+	spin_lock_irq(&ev_file->lock);
+	ev_file->is_closed = 1;
+	list_for_each_entry_safe(ev, next, &ev_file->event_list, list) {
+		/* only events with a counter are linked on an object's list */
+		if (ev->counter)
+			list_del(&ev->obj_list);
+		kfree(ev);
+	}
+	spin_unlock_irq(&ev_file->lock);
+
+	if (ev_file->is_async) {
+		ib_unregister_event_handler(&ev_file->uverbs_file->event_handler);
+		kref_put(&ev_file->uverbs_file->ref, ib_uverbs_release_file);
+	}
+	kref_put(&ev_file->ref, ib_uverbs_release_event_file);
+
+	return 0;
+}
+
+/*
+ * File operations for event files created by ib_uverbs_alloc_event_file().
+ * ib_uverbs_lookup_comp_file() also compares f_op against this table to
+ * recognise an event file.
+ */
+static const struct file_operations uverbs_event_fops = {
+ .owner = THIS_MODULE,
+ .read = ib_uverbs_event_read,
+ .poll = ib_uverbs_event_poll,
+ .release = ib_uverbs_event_close,
+ .fasync = ib_uverbs_event_fasync,
+ .llseek = no_llseek,
+};
+
+/*
+ * CQ completion callback: queue a completion event on the CQ's event file
+ * and wake any reader.
+ *
+ * @cq_context is the completion event file, or NULL if the CQ has none.
+ * The event is silently dropped if the file is already closed or the
+ * atomic allocation fails.
+ */
+void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context)
+{
+ struct ib_uverbs_event_file *file = cq_context;
+ struct ib_ucq_object *uobj;
+ struct ib_uverbs_event *entry;
+ unsigned long flags;
+
+ if (!file)
+ return;
+
+ spin_lock_irqsave(&file->lock, flags);
+ if (file->is_closed) {
+ spin_unlock_irqrestore(&file->lock, flags);
+ return;
+ }
+
+ /* Allocated with the spinlock held, hence GFP_ATOMIC. */
+ entry = kmalloc(sizeof *entry, GFP_ATOMIC);
+ if (!entry) {
+ spin_unlock_irqrestore(&file->lock, flags);
+ return;
+ }
+
+ uobj = container_of(cq->uobject, struct ib_ucq_object, uobject);
+
+ entry->desc.comp.cq_handle = cq->uobject->user_handle;
+ entry->counter = &uobj->comp_events_reported;
+
+ /* Link on both the file's queue and the CQ object's own list. */
+ list_add_tail(&entry->list, &file->event_list);
+ list_add_tail(&entry->obj_list, &uobj->comp_list);
+ spin_unlock_irqrestore(&file->lock, flags);
+
+ wake_up_interruptible(&file->poll_wait);
+ kill_fasync(&file->async_queue, SIGIO, POLL_IN);
+}
+
+/*
+ * Queue one async event on @file's async event file and wake any reader.
+ *
+ * @element:  value stored in the descriptor's element field
+ * @event:    value stored in the descriptor's event_type field
+ * @obj_list: per-object list to link the event on as well, or NULL
+ * @counter:  per-object reported-events counter, or NULL
+ *
+ * The event is silently dropped if the async file is already closed or the
+ * atomic allocation fails.
+ */
+static void ib_uverbs_async_handler(struct ib_uverbs_file *file,
+				    __u64 element, __u64 event,
+				    struct list_head *obj_list,
+				    u32 *counter)
+{
+	struct ib_uverbs_event_file *async = file->async_file;
+	struct ib_uverbs_event *ev;
+	unsigned long flags;
+
+	spin_lock_irqsave(&async->lock, flags);
+
+	/* Nothing is allocated once the file has been closed. */
+	ev = async->is_closed ? NULL : kmalloc(sizeof *ev, GFP_ATOMIC);
+	if (!ev) {
+		spin_unlock_irqrestore(&async->lock, flags);
+		return;
+	}
+
+	ev->desc.async.element = element;
+	ev->desc.async.event_type = event;
+	ev->desc.async.reserved = 0;
+	ev->counter = counter;
+
+	list_add_tail(&ev->list, &async->event_list);
+	if (obj_list)
+		list_add_tail(&ev->obj_list, obj_list);
+
+	spin_unlock_irqrestore(&async->lock, flags);
+
+	wake_up_interruptible(&async->poll_wait);
+	kill_fasync(&async->async_queue, SIGIO, POLL_IN);
+}
+
+/*
+ * Async event callback for a CQ: queue the event against the CQ's user
+ * object on the async file of the uverbs file that owns it.  @context_ptr
+ * is not used.
+ */
+void ib_uverbs_cq_event_handler(struct ib_event *event, void *context_ptr)
+{
+	struct ib_ucq_object *ucq;
+
+	ucq = container_of(event->element.cq->uobject,
+			   struct ib_ucq_object, uobject);
+
+	ib_uverbs_async_handler(ucq->uverbs_file,
+				ucq->uobject.user_handle,
+				event->event,
+				&ucq->async_list,
+				&ucq->async_events_reported);
+}
+
+/*
+ * Async event callback for a QP.  @context_ptr is the uverbs file to queue
+ * the event on.  Events for a QP without a live user object are ignored.
+ */
+void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr)
+{
+	struct ib_uevent_object *uevent;
+
+	/* for XRC target qp's, check that qp is live */
+	if (!event->element.qp->uobject)
+		return;
+	if (!event->element.qp->uobject->live)
+		return;
+
+	uevent = container_of(event->element.qp->uobject,
+			      struct ib_uevent_object, uobject);
+
+	ib_uverbs_async_handler(context_ptr,
+				uevent->uobject.user_handle,
+				event->event,
+				&uevent->event_list,
+				&uevent->events_reported);
+}
+
+/*
+ * Async event callback for an SRQ.  @context_ptr is the uverbs file to
+ * queue the event on.
+ */
+void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr)
+{
+	struct ib_uevent_object *uevent =
+		container_of(event->element.srq->uobject,
+			     struct ib_uevent_object, uobject);
+
+	ib_uverbs_async_handler(context_ptr,
+				uevent->uobject.user_handle,
+				event->event,
+				&uevent->event_list,
+				&uevent->events_reported);
+}
+
+/*
+ * Device-level event callback registered through the uverbs file's
+ * event_handler.  The port number is reported as the event's element; the
+ * event is not tied to any object list or counter.
+ */
+void ib_uverbs_event_handler(struct ib_event_handler *handler,
+			     struct ib_event *event)
+{
+	struct ib_uverbs_file *ufile;
+
+	ufile = container_of(handler, struct ib_uverbs_file, event_handler);
+
+	ib_uverbs_async_handler(ufile, event->element.port_num,
+				event->event, NULL, NULL);
+}
+
+/*
+ * Allocate an event file and wrap it in an anonymous-inode struct file.
+ *
+ * @uverbs_file: owning uverbs file, stored in the new event file
+ * @is_async:    non-zero for the async event file, zero for a completion
+ *               channel
+ *
+ * Returns the new struct file, ERR_PTR(-ENOMEM) if the allocation fails,
+ * or the error pointer from anon_inode_getfile() (the event file is freed
+ * in that case).
+ */
+struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
+					int is_async)
+{
+	struct ib_uverbs_event_file *evf;
+	struct file *anon;
+
+	evf = kmalloc(sizeof *evf, GFP_KERNEL);
+	if (evf == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	/* kmalloc() does not zero: every field used later is set here. */
+	kref_init(&evf->ref);
+	spin_lock_init(&evf->lock);
+	INIT_LIST_HEAD(&evf->event_list);
+	init_waitqueue_head(&evf->poll_wait);
+	evf->uverbs_file = uverbs_file;
+	evf->async_queue = NULL;
+	evf->is_async = is_async;
+	evf->is_closed = 0;
+
+	anon = anon_inode_getfile("[infinibandevent]", &uverbs_event_fops,
+				  evf, O_RDONLY);
+	if (IS_ERR(anon))
+		kfree(evf);
+
+	return anon;
+}
+
+/*
+ * Look up a completion event file by FD. If lookup is successful,
+ * takes a ref to the event file struct that it returns; if
+ * unsuccessful, returns NULL.
+ */
+struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd)
+{
+	struct ib_uverbs_event_file *found = NULL;
+	struct fd f = fdget(fd);
+
+	if (!f.file)
+		return NULL;
+
+	/* Only files created with uverbs_event_fops are event files. */
+	if (f.file->f_op == &uverbs_event_fops) {
+		struct ib_uverbs_event_file *candidate = f.file->private_data;
+
+		/* The async event file is not a completion channel. */
+		if (!candidate->is_async) {
+			kref_get(&candidate->ref);
+			found = candidate;
+		}
+	}
+
+	fdput(f);
+	return found;
+}
+
+/*
+ * write() on a uverbs device: parse the command header and dispatch to the
+ * legacy or extended command table.
+ *
+ * Legacy commands (no flags) return whatever the handler returns.  Extended
+ * commands return the number of bytes written on success or the handler's
+ * error.  Commands with any other flag value get -ENOSYS.
+ */
+static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct ib_uverbs_file *file = filp->private_data;
+ struct ib_uverbs_cmd_hdr hdr;
+ __u32 flags;
+
+ if (count < sizeof hdr)
+ return -EINVAL;
+
+ if (copy_from_user(&hdr, buf, sizeof hdr))
+ return -EFAULT;
+
+ /* The flag bits live in the same word as the command number. */
+ flags = (hdr.command &
+ IB_USER_VERBS_CMD_FLAGS_MASK) >> IB_USER_VERBS_CMD_FLAGS_SHIFT;
+
+ if (!flags) {
+ __u32 command;
+
+ /* Reject bits outside the flags and command fields. */
+ if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK |
+ IB_USER_VERBS_CMD_COMMAND_MASK))
+ return -EINVAL;
+
+ command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK;
+
+ if (command >= ARRAY_SIZE(uverbs_cmd_table) ||
+ !uverbs_cmd_table[command])
+ return -EINVAL;
+
+ /* Only GET_CONTEXT is allowed before a context exists. */
+ if (!file->ucontext &&
+ command != IB_USER_VERBS_CMD_GET_CONTEXT)
+ return -EINVAL;
+
+ /* The device advertises the commands it supports as a bitmask. */
+ if (!(file->device->ib_dev->uverbs_cmd_mask & (1ull << command)))
+ return -ENOSYS;
+
+ /* Legacy lengths are in 4-byte words and must match the write size. */
+ if (hdr.in_words * 4 != count)
+ return -EINVAL;
+
+ return uverbs_cmd_table[command](file,
+ buf + sizeof(hdr),
+ hdr.in_words * 4,
+ hdr.out_words * 4);
+
+ } else if (flags == IB_USER_VERBS_CMD_FLAG_EXTENDED) {
+ __u32 command;
+
+ struct ib_uverbs_ex_cmd_hdr ex_hdr;
+ struct ib_udata ucore;
+ struct ib_udata uhw;
+ int err;
+ /* count is adjusted below; keep the original for the return value. */
+ size_t written_count = count;
+
+ if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK |
+ IB_USER_VERBS_CMD_COMMAND_MASK))
+ return -EINVAL;
+
+ command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK;
+
+ if (command >= ARRAY_SIZE(uverbs_ex_cmd_table) ||
+ !uverbs_ex_cmd_table[command])
+ return -ENOSYS;
+
+ if (!file->ucontext)
+ return -EINVAL;
+
+ if (!(file->device->ib_dev->uverbs_ex_cmd_mask & (1ull << command)))
+ return -ENOSYS;
+
+ if (count < (sizeof(hdr) + sizeof(ex_hdr)))
+ return -EINVAL;
+
+ if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr)))
+ return -EFAULT;
+
+ /* From here on count/buf describe the payload after both headers. */
+ count -= sizeof(hdr) + sizeof(ex_hdr);
+ buf += sizeof(hdr) + sizeof(ex_hdr);
+
+ /* Extended lengths are in 8-byte words: core part plus provider part. */
+ if ((hdr.in_words + ex_hdr.provider_in_words) * 8 != count)
+ return -EINVAL;
+
+ if (ex_hdr.cmd_hdr_reserved)
+ return -EINVAL;
+
+ /* A response pointer and a non-zero response length must come together. */
+ if (ex_hdr.response) {
+ if (!hdr.out_words && !ex_hdr.provider_out_words)
+ return -EINVAL;
+
+ if (!access_ok(VERIFY_WRITE,
+ (void __user *) (unsigned long) ex_hdr.response,
+ (hdr.out_words + ex_hdr.provider_out_words) * 8))
+ return -EFAULT;
+ } else {
+ if (hdr.out_words || ex_hdr.provider_out_words)
+ return -EINVAL;
+ }
+
+ /* Core buffers first; the provider buffers follow them directly. */
+ INIT_UDATA_BUF_OR_NULL(&ucore, buf, (unsigned long) ex_hdr.response,
+ hdr.in_words * 8, hdr.out_words * 8);
+
+ INIT_UDATA_BUF_OR_NULL(&uhw,
+ buf + ucore.inlen,
+ (unsigned long) ex_hdr.response + ucore.outlen,
+ ex_hdr.provider_in_words * 8,
+ ex_hdr.provider_out_words * 8);
+
+ err = uverbs_ex_cmd_table[command](file,
+ &ucore,
+ &uhw);
+
+ if (err)
+ return err;
+
+ return written_count;
+ }
+
+ return -ENOSYS;
+}
+
+/*
+ * mmap() on a uverbs device: forward to the device's mmap method once a
+ * user context exists; -ENODEV before that.
+ */
+static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	struct ib_uverbs_file *ufile = filp->private_data;
+
+	if (!ufile->ucontext)
+		return -ENODEV;
+
+	return ufile->device->ib_dev->mmap(ufile->ucontext, vma);
+}
+
+/*
+ * ib_uverbs_open() does not need the BKL:
+ *
+ * - the ib_uverbs_device structures are properly reference counted and
+ * everything else is purely local to the file being created, so
+ * races against other open calls are not a problem;
+ * - there is no ioctl method to race against;
+ * - the open method will either immediately return -ENXIO, or all
+ * required initialization will be done.
+ */
+static int ib_uverbs_open(struct inode *inode, struct file *filp)
+{
+ struct ib_uverbs_device *dev;
+ struct ib_uverbs_file *file;
+ int ret;
+
+ /* The cdev is embedded in the uverbs device; take a reference on it. */
+ dev = container_of(inode->i_cdev, struct ib_uverbs_device, cdev);
+ if (dev)
+ kref_get(&dev->ref);
+ else
+ return -ENXIO;
+
+ /* Pin the module that owns the underlying IB device. */
+ if (!try_module_get(dev->ib_dev->owner)) {
+ ret = -ENODEV;
+ goto err;
+ }
+
+ file = kmalloc(sizeof *file, GFP_KERNEL);
+ if (!file) {
+ ret = -ENOMEM;
+ goto err_module;
+ }
+
+ /* No user context or async event file yet. */
+ file->device = dev;
+ file->ucontext = NULL;
+ file->async_file = NULL;
+ kref_init(&file->ref);
+ mutex_init(&file->mutex);
+
+ filp->private_data = file;
+
+ return nonseekable_open(inode, filp);
+
+ /* Error unwinding: release in reverse order of acquisition. */
+err_module:
+ module_put(dev->ib_dev->owner);
+
+err:
+ kref_put(&dev->ref, ib_uverbs_release_dev);
+ return ret;
+}
+
+/*
+ * release() for a uverbs device file: tear down the user context and all
+ * objects it still owns, drop the reference this file holds on its async
+ * event file (if one exists), then drop the file's own reference.  Always
+ * returns 0.
+ */
+static int ib_uverbs_close(struct inode *inode, struct file *filp)
+{
+	struct ib_uverbs_file *ufile = filp->private_data;
+
+	ib_uverbs_cleanup_ucontext(ufile, ufile->ucontext);
+
+	if (ufile->async_file != NULL)
+		kref_put(&ufile->async_file->ref,
+			 ib_uverbs_release_event_file);
+
+	kref_put(&ufile->ref, ib_uverbs_release_file);
+
+	return 0;
+}
+
+/* File operations for devices whose IB driver provides no mmap method. */
+static const struct file_operations uverbs_fops = {
+ .owner = THIS_MODULE,
+ .write = ib_uverbs_write,
+ .open = ib_uverbs_open,
+ .release = ib_uverbs_close,
+ .llseek = no_llseek,
+};
+
+/*
+ * Same as uverbs_fops plus .mmap; ib_uverbs_add_one() selects this table
+ * when the IB device has an mmap method.
+ */
+static const struct file_operations uverbs_mmap_fops = {
+ .owner = THIS_MODULE,
+ .write = ib_uverbs_write,
+ .mmap = ib_uverbs_mmap,
+ .open = ib_uverbs_open,
+ .release = ib_uverbs_close,
+ .llseek = no_llseek,
+};
+
+/* IB client registered in ib_uverbs_init(); supplies the add/remove callbacks. */
+static struct ib_client uverbs_client = {
+ .name = "uverbs",
+ .add = ib_uverbs_add_one,
+ .remove = ib_uverbs_remove_one
+};
+
+/*
+ * sysfs "ibdev" attribute: name of the IB device behind this uverbs device.
+ * Returns -ENODEV once the driver data has been cleared.
+ */
+static ssize_t show_ibdev(struct device *device, struct device_attribute *attr,
+			  char *buf)
+{
+	struct ib_uverbs_device *udev = dev_get_drvdata(device);
+
+	if (udev)
+		return sprintf(buf, "%s\n", udev->ib_dev->name);
+
+	return -ENODEV;
+}
+static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
+
+/*
+ * sysfs per-device "abi_version" attribute: the IB device's uverbs ABI
+ * version.  Returns -ENODEV once the driver data has been cleared.
+ */
+static ssize_t show_dev_abi_version(struct device *device,
+				    struct device_attribute *attr, char *buf)
+{
+	struct ib_uverbs_device *udev = dev_get_drvdata(device);
+
+	if (udev)
+		return sprintf(buf, "%d\n", udev->ib_dev->uverbs_abi_ver);
+
+	return -ENODEV;
+}
+static DEVICE_ATTR(abi_version, S_IRUGO, show_dev_abi_version, NULL);
+
+/* Class-level "abi_version" attribute: the core uverbs ABI version string. */
+static CLASS_ATTR_STRING(abi_version, S_IRUGO,
+ __stringify(IB_USER_VERBS_ABI_VERSION));
+
+/*
+ * Dynamically allocated char-device region used once the static region is
+ * full.  overflow_maj stays 0 until find_overflow_devnum() allocates it;
+ * overflow_map tracks which of its minors are in use.
+ */
+static dev_t overflow_maj;
+static DECLARE_BITMAP(overflow_map, IB_UVERBS_MAX_DEVICES);
+
+/*
+ * If we have more than IB_UVERBS_MAX_DEVICES, dynamically overflow by
+ * requesting a new major number and doubling the number of max devices we
+ * support. It's stupid, but simple.
+ */
+static int find_overflow_devnum(void)
+{
+	int slot;
+
+	/* Allocate the overflow region the first time it is needed. */
+	if (!overflow_maj) {
+		int rc = alloc_chrdev_region(&overflow_maj, 0,
+					     IB_UVERBS_MAX_DEVICES,
+					     "infiniband_verbs");
+
+		if (rc) {
+			printk(KERN_ERR "user_verbs: couldn't register dynamic device number\n");
+			return rc;
+		}
+	}
+
+	/* -1 when every overflow minor is already taken. */
+	slot = find_first_zero_bit(overflow_map, IB_UVERBS_MAX_DEVICES);
+
+	return slot < IB_UVERBS_MAX_DEVICES ? slot : -1;
+}
+
+/*
+ * IB client "add" callback: create the uverbs character device for a newly
+ * registered IB device.
+ *
+ * Devices without an alloc_ucontext method are skipped.  On any failure the
+ * partially built state is undone and the device is left without uverbs
+ * client data; no error is reported to the caller.
+ */
+static void ib_uverbs_add_one(struct ib_device *device)
+{
+ int devnum;
+ dev_t base;
+ struct ib_uverbs_device *uverbs_dev;
+
+ if (!device->alloc_ucontext)
+ return;
+
+ uverbs_dev = kzalloc(sizeof *uverbs_dev, GFP_KERNEL);
+ if (!uverbs_dev)
+ return;
+
+ kref_init(&uverbs_dev->ref);
+ init_completion(&uverbs_dev->comp);
+ uverbs_dev->xrcd_tree = RB_ROOT;
+ mutex_init(&uverbs_dev->xrcd_tree_mutex);
+
+ /* Claim a minor: the static region first, the overflow region if it is full. */
+ spin_lock(&map_lock);
+ devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES);
+ if (devnum >= IB_UVERBS_MAX_DEVICES) {
+ /* map_lock is dropped around find_overflow_devnum() and re-taken after. */
+ spin_unlock(&map_lock);
+ devnum = find_overflow_devnum();
+ if (devnum < 0)
+ goto err;
+
+ spin_lock(&map_lock);
+ /* Overflow devices are numbered after the static ones. */
+ uverbs_dev->devnum = devnum + IB_UVERBS_MAX_DEVICES;
+ base = devnum + overflow_maj;
+ set_bit(devnum, overflow_map);
+ } else {
+ uverbs_dev->devnum = devnum;
+ base = devnum + IB_UVERBS_BASE_DEV;
+ set_bit(devnum, dev_map);
+ }
+ spin_unlock(&map_lock);
+
+ uverbs_dev->ib_dev = device;
+ uverbs_dev->num_comp_vectors = device->num_comp_vectors;
+
+ cdev_init(&uverbs_dev->cdev, NULL);
+ uverbs_dev->cdev.owner = THIS_MODULE;
+ /* Expose mmap only when the IB device implements it. */
+ uverbs_dev->cdev.ops = device->mmap ? &uverbs_mmap_fops : &uverbs_fops;
+ kobject_set_name(&uverbs_dev->cdev.kobj, "uverbs%d", uverbs_dev->devnum);
+ if (cdev_add(&uverbs_dev->cdev, base, 1))
+ goto err_cdev;
+
+ uverbs_dev->dev = device_create(uverbs_class, device->dma_device,
+ uverbs_dev->cdev.dev, uverbs_dev,
+ "uverbs%d", uverbs_dev->devnum);
+ if (IS_ERR(uverbs_dev->dev))
+ goto err_cdev;
+
+ if (device_create_file(uverbs_dev->dev, &dev_attr_ibdev))
+ goto err_class;
+ if (device_create_file(uverbs_dev->dev, &dev_attr_abi_version))
+ goto err_class;
+
+ ib_set_client_data(device, &uverbs_client, uverbs_dev);
+
+ return;
+
+err_class:
+ device_destroy(uverbs_class, uverbs_dev->cdev.dev);
+
+err_cdev:
+ cdev_del(&uverbs_dev->cdev);
+ /* devnum is still the index within whichever bitmap was used. */
+ if (uverbs_dev->devnum < IB_UVERBS_MAX_DEVICES)
+ clear_bit(devnum, dev_map);
+ else
+ clear_bit(devnum, overflow_map);
+
+err:
+ /* Drop the initial reference, wait for the release callback, then free. */
+ kref_put(&uverbs_dev->ref, ib_uverbs_release_dev);
+ wait_for_completion(&uverbs_dev->comp);
+ kfree(uverbs_dev);
+ return;
+}
+
+/*
+ * IB client "remove" callback: tear down the uverbs character device for an
+ * IB device that is going away.  Does nothing if no uverbs device was set
+ * up for it.  Blocks until the last reference on the uverbs device has been
+ * dropped before freeing it.
+ */
+static void ib_uverbs_remove_one(struct ib_device *device)
+{
+	struct ib_uverbs_device *udev;
+
+	udev = ib_get_client_data(device, &uverbs_client);
+	if (udev == NULL)
+		return;
+
+	/* Clear drvdata first so the sysfs show routines return -ENODEV. */
+	dev_set_drvdata(udev->dev, NULL);
+	device_destroy(uverbs_class, udev->cdev.dev);
+	cdev_del(&udev->cdev);
+
+	/* Give the minor back to whichever region it was taken from. */
+	if (udev->devnum >= IB_UVERBS_MAX_DEVICES)
+		clear_bit(udev->devnum - IB_UVERBS_MAX_DEVICES, overflow_map);
+	else
+		clear_bit(udev->devnum, dev_map);
+
+	kref_put(&udev->ref, ib_uverbs_release_dev);
+	wait_for_completion(&udev->comp);
+	kfree(udev);
+}
+
+/*
+ * Class devnode callback: place device nodes under "infiniband/" and, when
+ * the caller asks for a mode, request 0666.  Returns a kasprintf()-allocated
+ * string.
+ */
+static char *uverbs_devnode(struct device *dev, umode_t *mode)
+{
+	const char *name = dev_name(dev);
+
+	if (mode != NULL)
+		*mode = 0666;
+
+	return kasprintf(GFP_KERNEL, "infiniband/%s", name);
+}
+
+/*
+ * Module init: reserve the static char-device region, create the
+ * "infiniband_verbs" class with its abi_version attribute, and register as
+ * an IB client.  Returns 0 on success or the error of the step that failed,
+ * after undoing the earlier steps.
+ */
+static int __init ib_uverbs_init(void)
+{
+ int ret;
+
+ ret = register_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES,
+ "infiniband_verbs");
+ if (ret) {
+ printk(KERN_ERR "user_verbs: couldn't register device number\n");
+ goto out;
+ }
+
+ uverbs_class = class_create(THIS_MODULE, "infiniband_verbs");
+ if (IS_ERR(uverbs_class)) {
+ ret = PTR_ERR(uverbs_class);
+ printk(KERN_ERR "user_verbs: couldn't create class infiniband_verbs\n");
+ goto out_chrdev;
+ }
+
+ /* Device nodes go under infiniband/ (see uverbs_devnode()). */
+ uverbs_class->devnode = uverbs_devnode;
+
+ ret = class_create_file(uverbs_class, &class_attr_abi_version.attr);
+ if (ret) {
+ printk(KERN_ERR "user_verbs: couldn't create abi_version attribute\n");
+ goto out_class;
+ }
+
+ ret = ib_register_client(&uverbs_client);
+ if (ret) {
+ printk(KERN_ERR "user_verbs: couldn't register client\n");
+ goto out_class;
+ }
+
+ return 0;
+
+ /* Unwind in reverse order of setup. */
+out_class:
+ class_destroy(uverbs_class);
+
+out_chrdev:
+ unregister_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES);
+
+out:
+ return ret;
+}
+
+/*
+ * Module exit: unregister the IB client, destroy the class, give back both
+ * char-device regions (the overflow one only if it was ever allocated) and
+ * release the memory held by every uverbs IDR.
+ */
+static void __exit ib_uverbs_cleanup(void)
+{
+	ib_unregister_client(&uverbs_client);
+	class_destroy(uverbs_class);
+	unregister_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES);
+	if (overflow_maj)
+		unregister_chrdev_region(overflow_maj, IB_UVERBS_MAX_DEVICES);
+
+	/* Every IDR defined with DEFINE_IDR() in this file is destroyed here. */
+	idr_destroy(&ib_uverbs_pd_idr);
+	idr_destroy(&ib_uverbs_mr_idr);
+	idr_destroy(&ib_uverbs_mw_idr);
+	idr_destroy(&ib_uverbs_ah_idr);
+	idr_destroy(&ib_uverbs_cq_idr);
+	idr_destroy(&ib_uverbs_qp_idr);
+	idr_destroy(&ib_uverbs_srq_idr);
+	/* These two were previously left out and leaked their IDR memory. */
+	idr_destroy(&ib_uverbs_xrcd_idr);
+	idr_destroy(&ib_uverbs_rule_idr);
+}
+
+module_init(ib_uverbs_init);
+module_exit(ib_uverbs_cleanup);
diff --git a/drivers/infiniband/core/uverbs_marshall.c b/drivers/infiniband/core/uverbs_marshall.c
new file mode 100644
index 000000000..abd972474
--- /dev/null
+++ b/drivers/infiniband/core/uverbs_marshall.c
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/export.h>
+#include <rdma/ib_marshall.h>
+
+/*
+ * Fill the user-visible address-handle attributes @dst from the kernel
+ * representation @src.  Reserved fields in @dst are zeroed, and is_global
+ * is set to 1 exactly when IB_AH_GRH is set in src->ah_flags.
+ */
+void ib_copy_ah_attr_to_user(struct ib_uverbs_ah_attr *dst,
+			     struct ib_ah_attr *src)
+{
+	/* Addressing fields outside the GRH. */
+	dst->dlid = src->dlid;
+	dst->sl = src->sl;
+	dst->src_path_bits = src->src_path_bits;
+	dst->static_rate = src->static_rate;
+	dst->port_num = src->port_num;
+	dst->is_global = !!(src->ah_flags & IB_AH_GRH);
+	dst->reserved = 0;
+
+	/* Global routing header. */
+	memcpy(dst->grh.dgid, src->grh.dgid.raw, sizeof src->grh.dgid);
+	dst->grh.flow_label = src->grh.flow_label;
+	dst->grh.sgid_index = src->grh.sgid_index;
+	dst->grh.hop_limit = src->grh.hop_limit;
+	dst->grh.traffic_class = src->grh.traffic_class;
+	memset(&dst->grh.reserved, 0, sizeof(dst->grh.reserved));
+}
+EXPORT_SYMBOL(ib_copy_ah_attr_to_user);
+
+/*
+ * Fill the user-visible QP attributes @dst from the kernel representation
+ * @src.  The capability fields come from src->cap, both address vectors go
+ * through ib_copy_ah_attr_to_user(), and dst->reserved is zeroed.
+ */
+void ib_copy_qp_attr_to_user(struct ib_uverbs_qp_attr *dst,
+			     struct ib_qp_attr *src)
+{
+	/* Primary and alternate address vectors. */
+	ib_copy_ah_attr_to_user(&dst->ah_attr, &src->ah_attr);
+	ib_copy_ah_attr_to_user(&dst->alt_ah_attr, &src->alt_ah_attr);
+
+	/* State. */
+	dst->qp_state = src->qp_state;
+	dst->cur_qp_state = src->cur_qp_state;
+	dst->path_mtu = src->path_mtu;
+	dst->path_mig_state = src->path_mig_state;
+	dst->qkey = src->qkey;
+	dst->rq_psn = src->rq_psn;
+	dst->sq_psn = src->sq_psn;
+	dst->dest_qp_num = src->dest_qp_num;
+	dst->qp_access_flags = src->qp_access_flags;
+
+	/* Capabilities, flattened out of src->cap. */
+	dst->max_send_wr = src->cap.max_send_wr;
+	dst->max_recv_wr = src->cap.max_recv_wr;
+	dst->max_send_sge = src->cap.max_send_sge;
+	dst->max_recv_sge = src->cap.max_recv_sge;
+	dst->max_inline_data = src->cap.max_inline_data;
+
+	/* Remaining scalar attributes. */
+	dst->pkey_index = src->pkey_index;
+	dst->alt_pkey_index = src->alt_pkey_index;
+	dst->en_sqd_async_notify = src->en_sqd_async_notify;
+	dst->sq_draining = src->sq_draining;
+	dst->max_rd_atomic = src->max_rd_atomic;
+	dst->max_dest_rd_atomic = src->max_dest_rd_atomic;
+	dst->min_rnr_timer = src->min_rnr_timer;
+	dst->port_num = src->port_num;
+	dst->timeout = src->timeout;
+	dst->retry_cnt = src->retry_cnt;
+	dst->rnr_retry = src->rnr_retry;
+	dst->alt_port_num = src->alt_port_num;
+	dst->alt_timeout = src->alt_timeout;
+
+	memset(dst->reserved, 0, sizeof(dst->reserved));
+}
+EXPORT_SYMBOL(ib_copy_qp_attr_to_user);
+
+/*
+ * Fill the user-visible path record @dst from the kernel SA path record
+ * @src, field by field.
+ */
+void ib_copy_path_rec_to_user(struct ib_user_path_rec *dst,
+			      struct ib_sa_path_rec *src)
+{
+	/* Endpoints. */
+	memcpy(dst->sgid, src->sgid.raw, sizeof src->sgid);
+	memcpy(dst->dgid, src->dgid.raw, sizeof src->dgid);
+	dst->slid = src->slid;
+	dst->dlid = src->dlid;
+
+	/* Routing. */
+	dst->raw_traffic = src->raw_traffic;
+	dst->flow_label = src->flow_label;
+	dst->hop_limit = src->hop_limit;
+	dst->traffic_class = src->traffic_class;
+	dst->reversible = src->reversible;
+	dst->numb_path = src->numb_path;
+	dst->pkey = src->pkey;
+	dst->sl = src->sl;
+
+	/* MTU, rate and packet lifetime, each with its selector. */
+	dst->mtu_selector = src->mtu_selector;
+	dst->mtu = src->mtu;
+	dst->rate_selector = src->rate_selector;
+	dst->rate = src->rate;
+	dst->packet_life_time_selector = src->packet_life_time_selector;
+	dst->packet_life_time = src->packet_life_time;
+
+	dst->preference = src->preference;
+}
+EXPORT_SYMBOL(ib_copy_path_rec_to_user);
+
+/*
+ * Fill the kernel SA path record @dst from the user-visible path record
+ * @src.  The fields with no user-space counterpart are given fixed values:
+ * smac and dmac are zeroed and vlan_id is set to 0xffff.
+ */
+void ib_copy_path_rec_from_user(struct ib_sa_path_rec *dst,
+				struct ib_user_path_rec *src)
+{
+	/* Endpoints. */
+	memcpy(dst->sgid.raw, src->sgid, sizeof dst->sgid);
+	memcpy(dst->dgid.raw, src->dgid, sizeof dst->dgid);
+	dst->slid = src->slid;
+	dst->dlid = src->dlid;
+
+	/* Routing. */
+	dst->raw_traffic = src->raw_traffic;
+	dst->flow_label = src->flow_label;
+	dst->hop_limit = src->hop_limit;
+	dst->traffic_class = src->traffic_class;
+	dst->reversible = src->reversible;
+	dst->numb_path = src->numb_path;
+	dst->pkey = src->pkey;
+	dst->sl = src->sl;
+
+	/* MTU, rate and packet lifetime, each with its selector. */
+	dst->mtu_selector = src->mtu_selector;
+	dst->mtu = src->mtu;
+	dst->rate_selector = src->rate_selector;
+	dst->rate = src->rate;
+	dst->packet_life_time_selector = src->packet_life_time_selector;
+	dst->packet_life_time = src->packet_life_time;
+
+	dst->preference = src->preference;
+
+	/* Kernel-only fields. */
+	memset(dst->smac, 0, sizeof(dst->smac));
+	memset(dst->dmac, 0, sizeof(dst->dmac));
+	dst->vlan_id = 0xffff;
+}
+EXPORT_SYMBOL(ib_copy_path_rec_from_user);
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
new file mode 100644
index 000000000..f93eb8da7
--- /dev/null
+++ b/drivers/infiniband/core/verbs.c
@@ -0,0 +1,1448 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation. All rights reserved.
+ * Copyright (c) 2004 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/export.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_cache.h>
+#include <rdma/ib_addr.h>
+
+#include "core_priv.h"
+
+/*
+ * Map an ib_rate enumerator to its multiple of the 2.5 Gbit/s base
+ * rate (IB_RATE_2_5_GBPS maps to 1).  Returns -1 for any enumerator
+ * not listed below.
+ */
+__attribute_const__ int ib_rate_to_mult(enum ib_rate rate)
+{
+	int mult = -1;
+
+	switch (rate) {
+	case IB_RATE_2_5_GBPS:
+		mult = 1;
+		break;
+	case IB_RATE_5_GBPS:
+		mult = 2;
+		break;
+	case IB_RATE_10_GBPS:
+		mult = 4;
+		break;
+	case IB_RATE_20_GBPS:
+		mult = 8;
+		break;
+	case IB_RATE_30_GBPS:
+		mult = 12;
+		break;
+	case IB_RATE_40_GBPS:
+		mult = 16;
+		break;
+	case IB_RATE_60_GBPS:
+		mult = 24;
+		break;
+	case IB_RATE_80_GBPS:
+		mult = 32;
+		break;
+	case IB_RATE_120_GBPS:
+		mult = 48;
+		break;
+	default:
+		break;
+	}
+
+	return mult;
+}
+EXPORT_SYMBOL(ib_rate_to_mult);
+
+/*
+ * Inverse of ib_rate_to_mult(): map a multiple of the base rate to the
+ * matching ib_rate enumerator.  Any multiple without an exact match
+ * yields IB_RATE_PORT_CURRENT.
+ */
+__attribute_const__ enum ib_rate mult_to_ib_rate(int mult)
+{
+	enum ib_rate rate = IB_RATE_PORT_CURRENT;
+
+	switch (mult) {
+	case 1:
+		rate = IB_RATE_2_5_GBPS;
+		break;
+	case 2:
+		rate = IB_RATE_5_GBPS;
+		break;
+	case 4:
+		rate = IB_RATE_10_GBPS;
+		break;
+	case 8:
+		rate = IB_RATE_20_GBPS;
+		break;
+	case 12:
+		rate = IB_RATE_30_GBPS;
+		break;
+	case 16:
+		rate = IB_RATE_40_GBPS;
+		break;
+	case 24:
+		rate = IB_RATE_60_GBPS;
+		break;
+	case 32:
+		rate = IB_RATE_80_GBPS;
+		break;
+	case 48:
+		rate = IB_RATE_120_GBPS;
+		break;
+	default:
+		break;
+	}
+
+	return rate;
+}
+EXPORT_SYMBOL(mult_to_ib_rate);
+
+/*
+ * Map an ib_rate enumerator to a rate in Mbit/s.  Returns -1 for any
+ * enumerator not listed below.
+ */
+__attribute_const__ int ib_rate_to_mbps(enum ib_rate rate)
+{
+	int mbps = -1;
+
+	switch (rate) {
+	case IB_RATE_2_5_GBPS:
+		mbps = 2500;
+		break;
+	case IB_RATE_5_GBPS:
+		mbps = 5000;
+		break;
+	case IB_RATE_10_GBPS:
+		mbps = 10000;
+		break;
+	case IB_RATE_20_GBPS:
+		mbps = 20000;
+		break;
+	case IB_RATE_30_GBPS:
+		mbps = 30000;
+		break;
+	case IB_RATE_40_GBPS:
+		mbps = 40000;
+		break;
+	case IB_RATE_60_GBPS:
+		mbps = 60000;
+		break;
+	case IB_RATE_80_GBPS:
+		mbps = 80000;
+		break;
+	case IB_RATE_120_GBPS:
+		mbps = 120000;
+		break;
+	case IB_RATE_14_GBPS:
+		mbps = 14062;
+		break;
+	case IB_RATE_56_GBPS:
+		mbps = 56250;
+		break;
+	case IB_RATE_112_GBPS:
+		mbps = 112500;
+		break;
+	case IB_RATE_168_GBPS:
+		mbps = 168750;
+		break;
+	case IB_RATE_25_GBPS:
+		mbps = 25781;
+		break;
+	case IB_RATE_100_GBPS:
+		mbps = 103125;
+		break;
+	case IB_RATE_200_GBPS:
+		mbps = 206250;
+		break;
+	case IB_RATE_300_GBPS:
+		mbps = 309375;
+		break;
+	default:
+		break;
+	}
+
+	return mbps;
+}
+EXPORT_SYMBOL(ib_rate_to_mbps);
+
+/*
+ * Map an RDMA node type to the transport it uses.  An unrecognised
+ * node type triggers BUG().
+ */
+__attribute_const__ enum rdma_transport_type
+rdma_node_get_transport(enum rdma_node_type node_type)
+{
+	switch (node_type) {
+	case RDMA_NODE_USNIC_UDP:
+		return RDMA_TRANSPORT_USNIC_UDP;
+	case RDMA_NODE_USNIC:
+		return RDMA_TRANSPORT_USNIC;
+	case RDMA_NODE_RNIC:
+		return RDMA_TRANSPORT_IWARP;
+	case RDMA_NODE_IB_ROUTER:
+	case RDMA_NODE_IB_SWITCH:
+	case RDMA_NODE_IB_CA:
+		return RDMA_TRANSPORT_IB;
+	default:
+		break;
+	}
+
+	/* No known node type matched. */
+	BUG();
+	return 0;
+}
+EXPORT_SYMBOL(rdma_node_get_transport);
+
+/*
+ * Return the link layer of @port_num on @device.  If the driver
+ * supplies a get_link_layer() callback its answer is used directly;
+ * otherwise the link layer is derived from the device's node type:
+ * IB transport -> InfiniBand, iWARP/usNIC transports -> Ethernet,
+ * anything else -> unspecified.
+ */
+enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_num)
+{
+	enum rdma_transport_type transport;
+
+	if (device->get_link_layer)
+		return device->get_link_layer(device, port_num);
+
+	transport = rdma_node_get_transport(device->node_type);
+
+	if (transport == RDMA_TRANSPORT_IB)
+		return IB_LINK_LAYER_INFINIBAND;
+
+	if (transport == RDMA_TRANSPORT_IWARP ||
+	    transport == RDMA_TRANSPORT_USNIC ||
+	    transport == RDMA_TRANSPORT_USNIC_UDP)
+		return IB_LINK_LAYER_ETHERNET;
+
+	return IB_LINK_LAYER_UNSPECIFIED;
+}
+EXPORT_SYMBOL(rdma_port_get_link_layer);
+
+/* Protection domains */
+
+/*
+ * Allocate a protection domain through the driver's alloc_pd()
+ * callback.  On success the common fields are initialised (no user
+ * object, use count 0).  A driver error pointer is returned unchanged.
+ */
+struct ib_pd *ib_alloc_pd(struct ib_device *device)
+{
+	struct ib_pd *new_pd = device->alloc_pd(device, NULL, NULL);
+
+	if (IS_ERR(new_pd))
+		return new_pd;
+
+	new_pd->device = device;
+	new_pd->uobject = NULL;
+	atomic_set(&new_pd->usecnt, 0);
+
+	return new_pd;
+}
+EXPORT_SYMBOL(ib_alloc_pd);
+
+/*
+ * Release a protection domain.  Returns -EBUSY while its use count is
+ * non-zero; otherwise returns the result of the driver's dealloc_pd().
+ */
+int ib_dealloc_pd(struct ib_pd *pd)
+{
+	return atomic_read(&pd->usecnt) ? -EBUSY : pd->device->dealloc_pd(pd);
+}
+EXPORT_SYMBOL(ib_dealloc_pd);
+
+/* Address handles */
+
+/*
+ * Create an address handle in @pd from @ah_attr via the driver's
+ * create_ah().  On success the handle is bound to the PD and the PD's
+ * use count is incremented.  A driver error pointer is returned
+ * unchanged.
+ */
+struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
+{
+	struct ib_ah *new_ah = pd->device->create_ah(pd, ah_attr);
+
+	if (IS_ERR(new_ah))
+		return new_ah;
+
+	new_ah->device = pd->device;
+	new_ah->pd = pd;
+	new_ah->uobject = NULL;
+	atomic_inc(&pd->usecnt);
+
+	return new_ah;
+}
+EXPORT_SYMBOL(ib_create_ah);
+
+/*
+ * Fill @ah_attr with the attributes needed to reply to the sender of
+ * the message described by work completion @wc and header @grh.
+ *
+ * @ah_attr is zeroed first.  On an Ethernet link layer a GRH is
+ * mandatory (-EPROTOTYPE otherwise) and the destination MAC/VLAN are
+ * taken from @wc when it carries both, or resolved from the GRH GIDs.
+ * On other link layers vlan_id is set to 0xffff.
+ *
+ * Returns 0 on success or the error from the address/GID lookups.
+ * Note that @ah_attr may be partially filled when an error is returned.
+ */
+int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc,
+ struct ib_grh *grh, struct ib_ah_attr *ah_attr)
+{
+ u32 flow_class;
+ u16 gid_index;
+ int ret;
+ int is_eth = (rdma_port_get_link_layer(device, port_num) ==
+ IB_LINK_LAYER_ETHERNET);
+
+ memset(ah_attr, 0, sizeof *ah_attr);
+ if (is_eth) {
+ /* Ethernet addressing is derived from the GRH, so it must be present. */
+ if (!(wc->wc_flags & IB_WC_GRH))
+ return -EPROTOTYPE;
+
+ /* Use the L2 info from the completion only if both SMAC and VLAN are valid. */
+ if (wc->wc_flags & IB_WC_WITH_SMAC &&
+ wc->wc_flags & IB_WC_WITH_VLAN) {
+ memcpy(ah_attr->dmac, wc->smac, ETH_ALEN);
+ ah_attr->vlan_id = wc->vlan_id;
+ } else {
+ ret = rdma_addr_find_dmac_by_grh(&grh->dgid, &grh->sgid,
+ ah_attr->dmac, &ah_attr->vlan_id);
+ if (ret)
+ return ret;
+ }
+ } else {
+ ah_attr->vlan_id = 0xffff;
+ }
+
+ /* The reply goes back to the sender: its source LID is our destination. */
+ ah_attr->dlid = wc->slid;
+ ah_attr->sl = wc->sl;
+ ah_attr->src_path_bits = wc->dlid_path_bits;
+ ah_attr->port_num = port_num;
+
+ if (wc->wc_flags & IB_WC_GRH) {
+ ah_attr->ah_flags = IB_AH_GRH;
+ ah_attr->grh.dgid = grh->sgid;
+
+ /*
+ * Look up the index of the GID the message was addressed to.
+ * NOTE(review): port_num is passed by address here, after
+ * ah_attr->port_num was already stored above - confirm whether
+ * the lookup can change it.
+ */
+ ret = ib_find_cached_gid(device, &grh->dgid, &port_num,
+ &gid_index);
+ if (ret)
+ return ret;
+
+ ah_attr->grh.sgid_index = (u8) gid_index;
+ /* Low 20 bits: flow label; next 8 bits: traffic class. */
+ flow_class = be32_to_cpu(grh->version_tclass_flow);
+ ah_attr->grh.flow_label = flow_class & 0xFFFFF;
+ ah_attr->grh.hop_limit = 0xFF;
+ ah_attr->grh.traffic_class = (flow_class >> 20) & 0xFF;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(ib_init_ah_from_wc);
+
+/*
+ * Create an address handle for replying to the message described by
+ * @wc and @grh: the attributes are derived with ib_init_ah_from_wc()
+ * and then handed to ib_create_ah().  Returns an ERR_PTR() if the
+ * attributes cannot be derived.
+ */
+struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, struct ib_wc *wc,
+				   struct ib_grh *grh, u8 port_num)
+{
+	struct ib_ah_attr attr;
+	int err = ib_init_ah_from_wc(pd->device, port_num, wc, grh, &attr);
+
+	return err ? ERR_PTR(err) : ib_create_ah(pd, &attr);
+}
+EXPORT_SYMBOL(ib_create_ah_from_wc);
+
+/*
+ * Modify an address handle through the driver's optional modify_ah()
+ * callback.  Returns -ENOSYS when the driver does not provide it.
+ */
+int ib_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr)
+{
+	if (!ah->device->modify_ah)
+		return -ENOSYS;
+
+	return ah->device->modify_ah(ah, ah_attr);
+}
+EXPORT_SYMBOL(ib_modify_ah);
+
+/*
+ * Query an address handle through the driver's optional query_ah()
+ * callback.  Returns -ENOSYS when the driver does not provide it.
+ */
+int ib_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr)
+{
+	if (!ah->device->query_ah)
+		return -ENOSYS;
+
+	return ah->device->query_ah(ah, ah_attr);
+}
+EXPORT_SYMBOL(ib_query_ah);
+
+/*
+ * Destroy an address handle via the driver.  The owning PD's use count
+ * is dropped only when the driver reports success.
+ */
+int ib_destroy_ah(struct ib_ah *ah)
+{
+	/* Remember the PD before the handle is given to the driver. */
+	struct ib_pd *owner = ah->pd;
+	int err = ah->device->destroy_ah(ah);
+
+	if (err)
+		return err;
+
+	atomic_dec(&owner->usecnt);
+	return 0;
+}
+EXPORT_SYMBOL(ib_destroy_ah);
+
+/* Shared receive queues */
+
+/*
+ * Create a shared receive queue in @pd.  Returns ERR_PTR(-ENOSYS) if
+ * the driver has no create_srq() callback, or the driver's error
+ * pointer on failure.  On success the common fields are filled from
+ * @srq_init_attr and use counts are taken on the PD and, for XRC SRQs,
+ * on the XRC domain and completion queue.
+ */
+struct ib_srq *ib_create_srq(struct ib_pd *pd,
+			     struct ib_srq_init_attr *srq_init_attr)
+{
+	struct ib_srq *new_srq;
+
+	if (!pd->device->create_srq)
+		return ERR_PTR(-ENOSYS);
+
+	new_srq = pd->device->create_srq(pd, srq_init_attr, NULL);
+	if (IS_ERR(new_srq))
+		return new_srq;
+
+	new_srq->device = pd->device;
+	new_srq->pd = pd;
+	new_srq->uobject = NULL;
+	new_srq->event_handler = srq_init_attr->event_handler;
+	new_srq->srq_context = srq_init_attr->srq_context;
+	new_srq->srq_type = srq_init_attr->srq_type;
+
+	if (new_srq->srq_type == IB_SRQT_XRC) {
+		/* XRC SRQs additionally pin their XRC domain and CQ. */
+		new_srq->ext.xrc.xrcd = srq_init_attr->ext.xrc.xrcd;
+		new_srq->ext.xrc.cq = srq_init_attr->ext.xrc.cq;
+		atomic_inc(&new_srq->ext.xrc.xrcd->usecnt);
+		atomic_inc(&new_srq->ext.xrc.cq->usecnt);
+	}
+
+	atomic_inc(&pd->usecnt);
+	atomic_set(&new_srq->usecnt, 0);
+
+	return new_srq;
+}
+EXPORT_SYMBOL(ib_create_srq);
+
+/*
+ * Modify SRQ attributes through the driver's optional modify_srq()
+ * callback.  Returns -ENOSYS when the driver does not provide it.
+ */
+int ib_modify_srq(struct ib_srq *srq,
+		  struct ib_srq_attr *srq_attr,
+		  enum ib_srq_attr_mask srq_attr_mask)
+{
+	if (!srq->device->modify_srq)
+		return -ENOSYS;
+
+	return srq->device->modify_srq(srq, srq_attr, srq_attr_mask, NULL);
+}
+EXPORT_SYMBOL(ib_modify_srq);
+
+/*
+ * Query SRQ attributes through the driver's optional query_srq()
+ * callback.  Returns -ENOSYS when the driver does not provide it.
+ */
+int ib_query_srq(struct ib_srq *srq,
+		 struct ib_srq_attr *srq_attr)
+{
+	if (!srq->device->query_srq)
+		return -ENOSYS;
+
+	return srq->device->query_srq(srq, srq_attr);
+}
+EXPORT_SYMBOL(ib_query_srq);
+
+/*
+ * Destroy a shared receive queue.  Returns -EBUSY while the SRQ's use
+ * count is non-zero.  When the driver's destroy_srq() succeeds, the
+ * use counts taken at creation (PD, and XRC domain + CQ for XRC SRQs)
+ * are released; on driver failure nothing is released.
+ */
+int ib_destroy_srq(struct ib_srq *srq)
+{
+	struct ib_xrcd *xrcd = NULL;
+	struct ib_cq *cq = NULL;
+	struct ib_pd *pd;
+	int is_xrc;
+	int ret;
+
+	if (atomic_read(&srq->usecnt))
+		return -EBUSY;
+
+	/* Snapshot what is needed afterwards before the driver gets the SRQ. */
+	pd = srq->pd;
+	is_xrc = (srq->srq_type == IB_SRQT_XRC);
+	if (is_xrc) {
+		xrcd = srq->ext.xrc.xrcd;
+		cq = srq->ext.xrc.cq;
+	}
+
+	ret = srq->device->destroy_srq(srq);
+	if (ret)
+		return ret;
+
+	atomic_dec(&pd->usecnt);
+	if (is_xrc) {
+		atomic_dec(&xrcd->usecnt);
+		atomic_dec(&cq->usecnt);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_destroy_srq);
+
+/* Queue pairs */
+
+/*
+ * Event handler installed on a real XRC target QP (@context).  It
+ * forwards @event to every QP on the real QP's open_list that has an
+ * event handler, under the device's event_handler_lock.
+ */
+static void __ib_shared_qp_event_handler(struct ib_event *event, void *context)
+{
+ struct ib_qp *qp = context;
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->device->event_handler_lock, flags);
+ /*
+ * event->element.qp itself is the loop cursor, so each handler sees
+ * the QP it was registered on as the event's element.
+ */
+ list_for_each_entry(event->element.qp, &qp->open_list, open_list)
+ if (event->element.qp->event_handler)
+ event->element.qp->event_handler(event, event->element.qp->qp_context);
+ spin_unlock_irqrestore(&qp->device->event_handler_lock, flags);
+}
+
+/*
+ * Add @qp to @xrcd's list of target QPs, holding the XRC domain's
+ * tgt_qp_mutex for the list update.
+ */
+static void __ib_insert_xrcd_qp(struct ib_xrcd *xrcd, struct ib_qp *qp)
+{
+	struct list_head *tgt_qps = &xrcd->tgt_qp_list;
+
+	mutex_lock(&xrcd->tgt_qp_mutex);
+	list_add(&qp->xrcd_list, tgt_qps);
+	mutex_unlock(&xrcd->tgt_qp_mutex);
+}
+
+/*
+ * Allocate a new ib_qp that refers to @real_qp, with its own event
+ * handler and context.  The new QP takes a use count on @real_qp and
+ * is linked onto the real QP's open_list under the device's
+ * event_handler_lock.  Returns ERR_PTR(-ENOMEM) if allocation fails.
+ */
+static struct ib_qp *__ib_open_qp(struct ib_qp *real_qp,
+				  void (*event_handler)(struct ib_event *, void *),
+				  void *qp_context)
+{
+	unsigned long irq_flags;
+	struct ib_qp *handle = kzalloc(sizeof *handle, GFP_KERNEL);
+
+	if (!handle)
+		return ERR_PTR(-ENOMEM);
+
+	/* Identity is inherited from the real QP. */
+	handle->device = real_qp->device;
+	handle->qp_num = real_qp->qp_num;
+	handle->qp_type = real_qp->qp_type;
+
+	/* Event delivery is private to this handle. */
+	handle->event_handler = event_handler;
+	handle->qp_context = qp_context;
+
+	handle->real_qp = real_qp;
+	atomic_inc(&real_qp->usecnt);
+
+	spin_lock_irqsave(&real_qp->device->event_handler_lock, irq_flags);
+	list_add(&handle->open_list, &real_qp->open_list);
+	spin_unlock_irqrestore(&real_qp->device->event_handler_lock, irq_flags);
+
+	return handle;
+}
+
+/*
+ * Open an existing XRC target QP in @xrcd, looked up by the QP number
+ * in @qp_open_attr.  Returns ERR_PTR(-EINVAL) if the requested type is
+ * not IB_QPT_XRC_TGT or no QP with that number is on the domain's
+ * target list; otherwise returns whatever __ib_open_qp() returns.
+ */
+struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd,
+			 struct ib_qp_open_attr *qp_open_attr)
+{
+	struct ib_qp *opened = ERR_PTR(-EINVAL);
+	struct ib_qp *candidate;
+
+	if (qp_open_attr->qp_type != IB_QPT_XRC_TGT)
+		return ERR_PTR(-EINVAL);
+
+	mutex_lock(&xrcd->tgt_qp_mutex);
+	list_for_each_entry(candidate, &xrcd->tgt_qp_list, xrcd_list) {
+		if (candidate->qp_num != qp_open_attr->qp_num)
+			continue;
+
+		/* First match wins. */
+		opened = __ib_open_qp(candidate, qp_open_attr->event_handler,
+				      qp_open_attr->qp_context);
+		break;
+	}
+	mutex_unlock(&xrcd->tgt_qp_mutex);
+
+	return opened;
+}
+EXPORT_SYMBOL(ib_open_qp);
+
+/*
+ * Create a queue pair.
+ *
+ * @pd: protection domain for the QP; may be NULL, in which case the
+ *      device is taken from qp_init_attr->xrcd.
+ * @qp_init_attr: creation attributes (type, CQs, SRQ, XRC domain,
+ *      event handler and context).
+ *
+ * For IB_QPT_XRC_TGT the driver-created "real" QP is given the shared
+ * event handler, takes a use count on the XRC domain, and a separate
+ * handle opened with __ib_open_qp() is returned to the caller.  For
+ * all other types the QP is returned directly and use counts are
+ * taken on the PD, the send CQ and (except for IB_QPT_XRC_INI) the
+ * receive CQ and optional SRQ.
+ *
+ * Returns the QP, or an ERR_PTR() from the driver or __ib_open_qp().
+ */
+struct ib_qp *ib_create_qp(struct ib_pd *pd,
+			   struct ib_qp_init_attr *qp_init_attr)
+{
+	struct ib_qp *qp, *real_qp;
+	struct ib_device *device;
+
+	device = pd ? pd->device : qp_init_attr->xrcd->device;
+	qp = device->create_qp(pd, qp_init_attr, NULL);
+
+	if (!IS_ERR(qp)) {
+		qp->device = device;
+		qp->real_qp = qp;
+		qp->uobject = NULL;
+		qp->qp_type = qp_init_attr->qp_type;
+
+		atomic_set(&qp->usecnt, 0);
+		if (qp_init_attr->qp_type == IB_QPT_XRC_TGT) {
+			qp->event_handler = __ib_shared_qp_event_handler;
+			qp->qp_context = qp;
+			qp->pd = NULL;
+			qp->send_cq = qp->recv_cq = NULL;
+			qp->srq = NULL;
+			qp->xrcd = qp_init_attr->xrcd;
+			atomic_inc(&qp_init_attr->xrcd->usecnt);
+			INIT_LIST_HEAD(&qp->open_list);
+
+			real_qp = qp;
+			qp = __ib_open_qp(real_qp, qp_init_attr->event_handler,
+					  qp_init_attr->qp_context);
+			if (!IS_ERR(qp)) {
+				__ib_insert_xrcd_qp(qp_init_attr->xrcd, real_qp);
+			} else if (!real_qp->device->destroy_qp(real_qp)) {
+				/*
+				 * The real QP is gone, so drop the XRC domain
+				 * reference taken above; otherwise the domain
+				 * could never be deallocated (-EBUSY).
+				 */
+				atomic_dec(&qp_init_attr->xrcd->usecnt);
+			}
+		} else {
+			qp->event_handler = qp_init_attr->event_handler;
+			qp->qp_context = qp_init_attr->qp_context;
+			if (qp_init_attr->qp_type == IB_QPT_XRC_INI) {
+				/* XRC initiator QPs have no receive side. */
+				qp->recv_cq = NULL;
+				qp->srq = NULL;
+			} else {
+				qp->recv_cq = qp_init_attr->recv_cq;
+				atomic_inc(&qp_init_attr->recv_cq->usecnt);
+				qp->srq = qp_init_attr->srq;
+				if (qp->srq)
+					atomic_inc(&qp_init_attr->srq->usecnt);
+			}
+
+			qp->pd = pd;
+			qp->send_cq = qp_init_attr->send_cq;
+			qp->xrcd = NULL;
+
+			atomic_inc(&pd->usecnt);
+			atomic_inc(&qp_init_attr->send_cq->usecnt);
+		}
+	}
+
+	return qp;
+}
+EXPORT_SYMBOL(ib_create_qp);
+
+/*
+ * QP state transition table, indexed [current state][next state] and
+ * consulted by ib_modify_qp_is_ok().
+ *
+ * valid             - non-zero if the transition is allowed at all
+ * req_param         - per QP type, attribute mask bits that must be set
+ * opt_param         - per QP type, attribute mask bits that may be set
+ * req_param_add_eth - extra required bits, OR'd in on Ethernet link layer
+ * opt_param_add_eth - extra optional bits, OR'd in on Ethernet link layer
+ *
+ * Entries not listed are zero-initialised, i.e. invalid transitions /
+ * empty masks.
+ */
+static const struct {
+ int valid;
+ enum ib_qp_attr_mask req_param[IB_QPT_MAX];
+ enum ib_qp_attr_mask req_param_add_eth[IB_QPT_MAX];
+ enum ib_qp_attr_mask opt_param[IB_QPT_MAX];
+ enum ib_qp_attr_mask opt_param_add_eth[IB_QPT_MAX];
+} qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = {
+ [IB_QPS_RESET] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_INIT] = {
+ .valid = 1,
+ .req_param = {
+ [IB_QPT_UD] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_QKEY),
+ [IB_QPT_RAW_PACKET] = IB_QP_PORT,
+ [IB_QPT_UC] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_RC] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_XRC_TGT] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_SMI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ }
+ },
+ },
+ [IB_QPS_INIT] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 },
+ [IB_QPS_INIT] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_RC] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_XRC_TGT] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_SMI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ }
+ },
+ [IB_QPS_RTR] = {
+ .valid = 1,
+ .req_param = {
+ [IB_QPT_UC] = (IB_QP_AV |
+ IB_QP_PATH_MTU |
+ IB_QP_DEST_QPN |
+ IB_QP_RQ_PSN),
+ [IB_QPT_RC] = (IB_QP_AV |
+ IB_QP_PATH_MTU |
+ IB_QP_DEST_QPN |
+ IB_QP_RQ_PSN |
+ IB_QP_MAX_DEST_RD_ATOMIC |
+ IB_QP_MIN_RNR_TIMER),
+ [IB_QPT_XRC_INI] = (IB_QP_AV |
+ IB_QP_PATH_MTU |
+ IB_QP_DEST_QPN |
+ IB_QP_RQ_PSN),
+ [IB_QPT_XRC_TGT] = (IB_QP_AV |
+ IB_QP_PATH_MTU |
+ IB_QP_DEST_QPN |
+ IB_QP_RQ_PSN |
+ IB_QP_MAX_DEST_RD_ATOMIC |
+ IB_QP_MIN_RNR_TIMER),
+ },
+ .req_param_add_eth = {
+ [IB_QPT_RC] = (IB_QP_SMAC),
+ [IB_QPT_UC] = (IB_QP_SMAC),
+ [IB_QPT_XRC_INI] = (IB_QP_SMAC),
+ [IB_QPT_XRC_TGT] = (IB_QP_SMAC)
+ },
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX),
+ [IB_QPT_RC] = (IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX),
+ [IB_QPT_XRC_INI] = (IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX),
+ [IB_QPT_XRC_TGT] = (IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX),
+ [IB_QPT_SMI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ },
+ .opt_param_add_eth = {
+ [IB_QPT_RC] = (IB_QP_ALT_SMAC |
+ IB_QP_VID |
+ IB_QP_ALT_VID),
+ [IB_QPT_UC] = (IB_QP_ALT_SMAC |
+ IB_QP_VID |
+ IB_QP_ALT_VID),
+ [IB_QPT_XRC_INI] = (IB_QP_ALT_SMAC |
+ IB_QP_VID |
+ IB_QP_ALT_VID),
+ [IB_QPT_XRC_TGT] = (IB_QP_ALT_SMAC |
+ IB_QP_VID |
+ IB_QP_ALT_VID)
+ }
+ }
+ },
+ [IB_QPS_RTR] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 },
+ [IB_QPS_RTS] = {
+ .valid = 1,
+ .req_param = {
+ [IB_QPT_UD] = IB_QP_SQ_PSN,
+ [IB_QPT_UC] = IB_QP_SQ_PSN,
+ [IB_QPT_RC] = (IB_QP_TIMEOUT |
+ IB_QP_RETRY_CNT |
+ IB_QP_RNR_RETRY |
+ IB_QP_SQ_PSN |
+ IB_QP_MAX_QP_RD_ATOMIC),
+ [IB_QPT_XRC_INI] = (IB_QP_TIMEOUT |
+ IB_QP_RETRY_CNT |
+ IB_QP_RNR_RETRY |
+ IB_QP_SQ_PSN |
+ IB_QP_MAX_QP_RD_ATOMIC),
+ [IB_QPT_XRC_TGT] = (IB_QP_TIMEOUT |
+ IB_QP_SQ_PSN),
+ [IB_QPT_SMI] = IB_QP_SQ_PSN,
+ [IB_QPT_GSI] = IB_QP_SQ_PSN,
+ },
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_RC] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_SMI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ }
+ }
+ },
+ [IB_QPS_RTS] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 },
+ [IB_QPS_RTS] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_CUR_STATE |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_ALT_PATH |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_RC] = (IB_QP_CUR_STATE |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_ALT_PATH |
+ IB_QP_PATH_MIG_STATE |
+ IB_QP_MIN_RNR_TIMER),
+ [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_ALT_PATH |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_ALT_PATH |
+ IB_QP_PATH_MIG_STATE |
+ IB_QP_MIN_RNR_TIMER),
+ [IB_QPT_SMI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ }
+ },
+ [IB_QPS_SQD] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+ [IB_QPT_UC] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+ [IB_QPT_RC] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+ [IB_QPT_XRC_INI] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+ [IB_QPT_XRC_TGT] = IB_QP_EN_SQD_ASYNC_NOTIFY, /* ??? */
+ [IB_QPT_SMI] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+ [IB_QPT_GSI] = IB_QP_EN_SQD_ASYNC_NOTIFY
+ }
+ },
+ },
+ [IB_QPS_SQD] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 },
+ [IB_QPS_RTS] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_RC] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_SMI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ }
+ },
+ [IB_QPS_SQD] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_AV |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_RC] = (IB_QP_PORT |
+ IB_QP_AV |
+ IB_QP_TIMEOUT |
+ IB_QP_RETRY_CNT |
+ IB_QP_RNR_RETRY |
+ IB_QP_MAX_QP_RD_ATOMIC |
+ IB_QP_MAX_DEST_RD_ATOMIC |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_XRC_INI] = (IB_QP_PORT |
+ IB_QP_AV |
+ IB_QP_TIMEOUT |
+ IB_QP_RETRY_CNT |
+ IB_QP_RNR_RETRY |
+ IB_QP_MAX_QP_RD_ATOMIC |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_XRC_TGT] = (IB_QP_PORT |
+ IB_QP_AV |
+ IB_QP_TIMEOUT |
+ IB_QP_MAX_DEST_RD_ATOMIC |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_SMI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ }
+ }
+ },
+ [IB_QPS_SQE] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 },
+ [IB_QPS_RTS] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_CUR_STATE |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_SMI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ }
+ }
+ },
+ [IB_QPS_ERR] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 }
+ }
+};
+
+/*
+ * Check whether a QP modify request is permitted by qp_state_table.
+ *
+ * @cur_state:  state the QP is transitioning from
+ * @next_state: state the QP is transitioning to
+ * @type:       QP type, used to select the per-type attribute masks
+ * @mask:       attribute mask supplied with the modify request
+ * @ll:         link layer; on Ethernet the *_add_eth masks also apply
+ *
+ * Returns 1 if the transition is valid, every required attribute bit
+ * is present in @mask, and @mask has no bit outside the required and
+ * optional sets (IB_QP_STATE is always allowed).  Returns 0 otherwise,
+ * including when a state or the QP type is outside the table's range.
+ */
+int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state,
+		       enum ib_qp_type type, enum ib_qp_attr_mask mask,
+		       enum rdma_link_layer ll)
+{
+	enum ib_qp_attr_mask req_param, opt_param;
+
+	if (cur_state < 0 || cur_state > IB_QPS_ERR ||
+	    next_state < 0 || next_state > IB_QPS_ERR)
+		return 0;
+
+	/*
+	 * The per-type arrays hold IB_QPT_MAX entries; reject anything
+	 * that would index past them instead of reading out of bounds.
+	 */
+	if (type < 0 || type >= IB_QPT_MAX)
+		return 0;
+
+	/* IB_QP_CUR_STATE is only meaningful from RTR, RTS, SQD or SQE. */
+	if (mask & IB_QP_CUR_STATE &&
+	    cur_state != IB_QPS_RTR && cur_state != IB_QPS_RTS &&
+	    cur_state != IB_QPS_SQD && cur_state != IB_QPS_SQE)
+		return 0;
+
+	if (!qp_state_table[cur_state][next_state].valid)
+		return 0;
+
+	req_param = qp_state_table[cur_state][next_state].req_param[type];
+	opt_param = qp_state_table[cur_state][next_state].opt_param[type];
+
+	if (ll == IB_LINK_LAYER_ETHERNET) {
+		req_param |= qp_state_table[cur_state][next_state].
+			req_param_add_eth[type];
+		opt_param |= qp_state_table[cur_state][next_state].
+			opt_param_add_eth[type];
+	}
+
+	/* All required attributes must be supplied... */
+	if ((mask & req_param) != req_param)
+		return 0;
+
+	/* ...and nothing beyond required, optional and the state itself. */
+	if (mask & ~(req_param | opt_param | IB_QP_STATE))
+		return 0;
+
+	return 1;
+}
+EXPORT_SYMBOL(ib_modify_qp_is_ok);
+
+/*
+ * Resolve the Ethernet L2 attributes (source/destination MAC, VLAN)
+ * for a QP modify that sets the address vector on an Ethernet port.
+ *
+ * Nothing is done, and 0 is returned, unless IB_QP_AV is set in
+ * *@qp_attr_mask and the port's link layer is Ethernet.  Otherwise the
+ * source GID is queried and the MACs are either derived directly from
+ * the GIDs (link-local destination) or looked up.  On success
+ * IB_QP_SMAC is added to *@qp_attr_mask, and IB_QP_VID as well when
+ * the resulting vlan_id is below 0xFFFF.
+ *
+ * Returns 0 on success or the error from the failing lookup.
+ */
+int ib_resolve_eth_l2_attrs(struct ib_qp *qp,
+			    struct ib_qp_attr *qp_attr, int *qp_attr_mask)
+{
+	union ib_gid sgid;
+	int err;
+
+	if (!(*qp_attr_mask & IB_QP_AV))
+		return 0;
+
+	if (rdma_port_get_link_layer(qp->device, qp_attr->ah_attr.port_num) !=
+	    IB_LINK_LAYER_ETHERNET)
+		return 0;
+
+	err = ib_query_gid(qp->device, qp_attr->ah_attr.port_num,
+			   qp_attr->ah_attr.grh.sgid_index, &sgid);
+	if (err)
+		return err;
+
+	if (rdma_link_local_addr((struct in6_addr *)qp_attr->ah_attr.grh.dgid.raw)) {
+		/* Link-local: the MACs are derived from the GIDs themselves. */
+		rdma_get_ll_mac((struct in6_addr *)qp_attr->ah_attr.grh.dgid.raw,
+				qp_attr->ah_attr.dmac);
+		rdma_get_ll_mac((struct in6_addr *)sgid.raw, qp_attr->smac);
+		if (!(*qp_attr_mask & IB_QP_VID))
+			qp_attr->vlan_id = rdma_get_vlan_id(&sgid);
+	} else {
+		err = rdma_addr_find_dmac_by_grh(&sgid, &qp_attr->ah_attr.grh.dgid,
+						 qp_attr->ah_attr.dmac,
+						 &qp_attr->vlan_id);
+		if (err)
+			return err;
+
+		err = rdma_addr_find_smac_by_sgid(&sgid, qp_attr->smac, NULL);
+		if (err)
+			return err;
+	}
+
+	*qp_attr_mask |= IB_QP_SMAC;
+	if (qp_attr->vlan_id < 0xFFFF)
+		*qp_attr_mask |= IB_QP_VID;
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_resolve_eth_l2_attrs);
+
+
+/*
+ * Modify a QP.  Ethernet L2 attributes are resolved first (which may
+ * add bits to the attribute mask); the driver's modify_qp() is then
+ * invoked on the underlying real QP.
+ */
+int ib_modify_qp(struct ib_qp *qp,
+		 struct ib_qp_attr *qp_attr,
+		 int qp_attr_mask)
+{
+	int err = ib_resolve_eth_l2_attrs(qp, qp_attr, &qp_attr_mask);
+
+	if (err)
+		return err;
+
+	return qp->device->modify_qp(qp->real_qp, qp_attr, qp_attr_mask, NULL);
+}
+EXPORT_SYMBOL(ib_modify_qp);
+
+/*
+ * Query QP attributes through the driver's optional query_qp()
+ * callback, invoked on the underlying real QP.  Returns -ENOSYS when
+ * the driver does not provide it.
+ */
+int ib_query_qp(struct ib_qp *qp,
+		struct ib_qp_attr *qp_attr,
+		int qp_attr_mask,
+		struct ib_qp_init_attr *qp_init_attr)
+{
+	if (!qp->device->query_qp)
+		return -ENOSYS;
+
+	return qp->device->query_qp(qp->real_qp, qp_attr, qp_attr_mask,
+				    qp_init_attr);
+}
+EXPORT_SYMBOL(ib_query_qp);
+
+/*
+ * Close a QP handle obtained by opening a shared QP.  Returns -EINVAL
+ * if @qp is itself the real QP.  Otherwise the handle is unlinked from
+ * the real QP's open_list under the device's event_handler_lock, the
+ * real QP's use count is dropped and the handle is freed.
+ */
+int ib_close_qp(struct ib_qp *qp)
+{
+	struct ib_qp *shared = qp->real_qp;
+	unsigned long irq_flags;
+
+	if (qp == shared)
+		return -EINVAL;
+
+	spin_lock_irqsave(&shared->device->event_handler_lock, irq_flags);
+	list_del(&qp->open_list);
+	spin_unlock_irqrestore(&shared->device->event_handler_lock, irq_flags);
+
+	atomic_dec(&shared->usecnt);
+	kfree(qp);
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_close_qp);
+
+/*
+ * Destroy a handle to a shared (XRC target) QP.  The handle is closed
+ * under the XRC domain's tgt_qp_mutex; if that leaves the real QP with
+ * a zero use count, the real QP is taken off the domain's target list
+ * and destroyed.  If destroying the real QP fails it is put back on
+ * the list; if it succeeds the domain's use count is dropped.
+ *
+ * Always returns 0; a failure to destroy the real QP is not reported.
+ */
+static int __ib_destroy_shared_qp(struct ib_qp *qp)
+{
+	struct ib_qp *real_qp = qp->real_qp;
+	struct ib_xrcd *xrcd = real_qp->xrcd;
+	int last_user;
+
+	mutex_lock(&xrcd->tgt_qp_mutex);
+	ib_close_qp(qp);
+	last_user = (atomic_read(&real_qp->usecnt) == 0);
+	if (last_user)
+		list_del(&real_qp->xrcd_list);
+	mutex_unlock(&xrcd->tgt_qp_mutex);
+
+	if (!last_user)
+		return 0;
+
+	if (ib_destroy_qp(real_qp))
+		__ib_insert_xrcd_qp(xrcd, real_qp);
+	else
+		atomic_dec(&xrcd->usecnt);
+
+	return 0;
+}
+
+/*
+ * Destroy a QP.  Returns -EBUSY while the QP's use count is non-zero.
+ * A handle to a shared QP is routed to __ib_destroy_shared_qp().  For
+ * a real QP the driver's destroy_qp() is called and, on success, the
+ * use counts on the PD, send CQ, receive CQ and SRQ are dropped for
+ * whichever of those the QP referenced.
+ */
+int ib_destroy_qp(struct ib_qp *qp)
+{
+	struct ib_cq *send_cq, *recv_cq;
+	struct ib_srq *srq;
+	struct ib_pd *pd;
+	int err;
+
+	if (atomic_read(&qp->usecnt))
+		return -EBUSY;
+
+	if (qp->real_qp != qp)
+		return __ib_destroy_shared_qp(qp);
+
+	/* Snapshot the referenced objects before the driver gets the QP. */
+	pd = qp->pd;
+	send_cq = qp->send_cq;
+	recv_cq = qp->recv_cq;
+	srq = qp->srq;
+
+	err = qp->device->destroy_qp(qp);
+	if (err)
+		return err;
+
+	if (pd)
+		atomic_dec(&pd->usecnt);
+	if (send_cq)
+		atomic_dec(&send_cq->usecnt);
+	if (recv_cq)
+		atomic_dec(&recv_cq->usecnt);
+	if (srq)
+		atomic_dec(&srq->usecnt);
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_destroy_qp);
+
+/* Completion queues */
+
+/*
+ * Create a completion queue via the driver's create_cq().  On success
+ * the handlers and context supplied by the caller are recorded and the
+ * use count starts at 0.  A driver error pointer is returned unchanged.
+ */
+struct ib_cq *ib_create_cq(struct ib_device *device,
+			   ib_comp_handler comp_handler,
+			   void (*event_handler)(struct ib_event *, void *),
+			   void *cq_context, int cqe, int comp_vector)
+{
+	struct ib_cq *new_cq;
+
+	new_cq = device->create_cq(device, cqe, comp_vector, NULL, NULL);
+	if (IS_ERR(new_cq))
+		return new_cq;
+
+	new_cq->device = device;
+	new_cq->uobject = NULL;
+	new_cq->comp_handler = comp_handler;
+	new_cq->event_handler = event_handler;
+	new_cq->cq_context = cq_context;
+	atomic_set(&new_cq->usecnt, 0);
+
+	return new_cq;
+}
+EXPORT_SYMBOL(ib_create_cq);
+
+/*
+ * Modify CQ moderation parameters through the driver's optional
+ * modify_cq() callback.  Returns -ENOSYS when it is not provided.
+ */
+int ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period)
+{
+	if (!cq->device->modify_cq)
+		return -ENOSYS;
+
+	return cq->device->modify_cq(cq, cq_count, cq_period);
+}
+EXPORT_SYMBOL(ib_modify_cq);
+
+/*
+ * Destroy a completion queue.  Returns -EBUSY while its use count is
+ * non-zero; otherwise returns the result of the driver's destroy_cq().
+ */
+int ib_destroy_cq(struct ib_cq *cq)
+{
+	return atomic_read(&cq->usecnt) ? -EBUSY : cq->device->destroy_cq(cq);
+}
+EXPORT_SYMBOL(ib_destroy_cq);
+
+/*
+ * Resize a CQ to @cqe entries through the driver's optional
+ * resize_cq() callback.  Returns -ENOSYS when it is not provided.
+ */
+int ib_resize_cq(struct ib_cq *cq, int cqe)
+{
+	if (!cq->device->resize_cq)
+		return -ENOSYS;
+
+	return cq->device->resize_cq(cq, cqe, NULL);
+}
+EXPORT_SYMBOL(ib_resize_cq);
+
+/* Memory regions */
+
+/*
+ * Obtain a DMA memory region in @pd with the given access flags.  The
+ * flags are validated with ib_check_mr_access() first; an invalid
+ * combination is returned as ERR_PTR().  On success the MR is bound to
+ * the PD, whose use count is incremented.
+ */
+struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags)
+{
+	struct ib_mr *new_mr;
+	int rc = ib_check_mr_access(mr_access_flags);
+
+	if (rc)
+		return ERR_PTR(rc);
+
+	new_mr = pd->device->get_dma_mr(pd, mr_access_flags);
+	if (IS_ERR(new_mr))
+		return new_mr;
+
+	new_mr->device = pd->device;
+	new_mr->pd = pd;
+	new_mr->uobject = NULL;
+	atomic_inc(&pd->usecnt);
+	atomic_set(&new_mr->usecnt, 0);
+
+	return new_mr;
+}
+EXPORT_SYMBOL(ib_get_dma_mr);
+
+/*
+ * Register a memory region over an array of physical buffers.  The
+ * access flags are validated first; ERR_PTR(-ENOSYS) is returned if
+ * the driver has no reg_phys_mr() callback.  On success the MR is
+ * bound to the PD, whose use count is incremented.
+ */
+struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd,
+			     struct ib_phys_buf *phys_buf_array,
+			     int num_phys_buf,
+			     int mr_access_flags,
+			     u64 *iova_start)
+{
+	struct ib_mr *new_mr;
+	int rc = ib_check_mr_access(mr_access_flags);
+
+	if (rc)
+		return ERR_PTR(rc);
+
+	if (!pd->device->reg_phys_mr)
+		return ERR_PTR(-ENOSYS);
+
+	new_mr = pd->device->reg_phys_mr(pd, phys_buf_array, num_phys_buf,
+					 mr_access_flags, iova_start);
+	if (IS_ERR(new_mr))
+		return new_mr;
+
+	new_mr->device = pd->device;
+	new_mr->pd = pd;
+	new_mr->uobject = NULL;
+	atomic_inc(&pd->usecnt);
+	atomic_set(&new_mr->usecnt, 0);
+
+	return new_mr;
+}
+EXPORT_SYMBOL(ib_reg_phys_mr);
+
+/*
+ * Re-register a physical memory region.  Fails with the access-flag
+ * validation error, with -ENOSYS if the driver has no rereg_phys_mr()
+ * callback, or with -EBUSY while the MR's use count is non-zero.
+ * When the driver succeeds and IB_MR_REREG_PD was requested, the use
+ * count moves from the MR's previous PD to @pd.
+ */
+int ib_rereg_phys_mr(struct ib_mr *mr,
+		     int mr_rereg_mask,
+		     struct ib_pd *pd,
+		     struct ib_phys_buf *phys_buf_array,
+		     int num_phys_buf,
+		     int mr_access_flags,
+		     u64 *iova_start)
+{
+	struct ib_pd *prev_pd;
+	int err;
+
+	err = ib_check_mr_access(mr_access_flags);
+	if (err)
+		return err;
+
+	if (!mr->device->rereg_phys_mr)
+		return -ENOSYS;
+
+	if (atomic_read(&mr->usecnt))
+		return -EBUSY;
+
+	/* Remember the current PD before the driver call. */
+	prev_pd = mr->pd;
+
+	err = mr->device->rereg_phys_mr(mr, mr_rereg_mask, pd,
+					phys_buf_array, num_phys_buf,
+					mr_access_flags, iova_start);
+	if (err)
+		return err;
+
+	if (mr_rereg_mask & IB_MR_REREG_PD) {
+		atomic_dec(&prev_pd->usecnt);
+		atomic_inc(&pd->usecnt);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(ib_rereg_phys_mr);
+
+/*
+ * Query MR attributes through the driver's optional query_mr()
+ * callback.  Returns -ENOSYS when it is not provided.
+ */
+int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr)
+{
+	if (!mr->device->query_mr)
+		return -ENOSYS;
+
+	return mr->device->query_mr(mr, mr_attr);
+}
+EXPORT_SYMBOL(ib_query_mr);
+
+/*
+ * Deregister a memory region.  Returns -EBUSY while the MR's use count
+ * is non-zero.  The owning PD's use count is dropped only when the
+ * driver's dereg_mr() succeeds.
+ */
+int ib_dereg_mr(struct ib_mr *mr)
+{
+	struct ib_pd *owner;
+	int err;
+
+	if (atomic_read(&mr->usecnt))
+		return -EBUSY;
+
+	owner = mr->pd;
+	err = mr->device->dereg_mr(mr);
+	if (err)
+		return err;
+
+	atomic_dec(&owner->usecnt);
+	return 0;
+}
+EXPORT_SYMBOL(ib_dereg_mr);
+
+/*
+ * Create a memory region from @mr_init_attr.  Returns
+ * ERR_PTR(-ENOSYS) if the driver has no create_mr() callback.  On
+ * success the MR is bound to the PD, whose use count is incremented.
+ */
+struct ib_mr *ib_create_mr(struct ib_pd *pd,
+			   struct ib_mr_init_attr *mr_init_attr)
+{
+	struct ib_mr *new_mr;
+
+	if (!pd->device->create_mr)
+		return ERR_PTR(-ENOSYS);
+
+	new_mr = pd->device->create_mr(pd, mr_init_attr);
+	if (IS_ERR(new_mr))
+		return new_mr;
+
+	new_mr->device = pd->device;
+	new_mr->pd = pd;
+	new_mr->uobject = NULL;
+	atomic_inc(&pd->usecnt);
+	atomic_set(&new_mr->usecnt, 0);
+
+	return new_mr;
+}
+EXPORT_SYMBOL(ib_create_mr);
+
+/*
+ * Destroy a memory region through the driver's destroy_mr().  Returns
+ * -EBUSY while the MR's use count is non-zero.  The owning PD's use
+ * count is dropped only when the driver succeeds.
+ */
+int ib_destroy_mr(struct ib_mr *mr)
+{
+	struct ib_pd *owner;
+	int err;
+
+	if (atomic_read(&mr->usecnt))
+		return -EBUSY;
+
+	owner = mr->pd;
+	err = mr->device->destroy_mr(mr);
+	if (err)
+		return err;
+
+	atomic_dec(&owner->usecnt);
+	return 0;
+}
+EXPORT_SYMBOL(ib_destroy_mr);
+
+/*
+ * Allocate a fast-registration MR with room for @max_page_list_len
+ * pages.  Returns ERR_PTR(-ENOSYS) if the driver has no
+ * alloc_fast_reg_mr() callback.  On success the MR is bound to the PD,
+ * whose use count is incremented.
+ */
+struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len)
+{
+	struct ib_mr *new_mr;
+
+	if (!pd->device->alloc_fast_reg_mr)
+		return ERR_PTR(-ENOSYS);
+
+	new_mr = pd->device->alloc_fast_reg_mr(pd, max_page_list_len);
+	if (IS_ERR(new_mr))
+		return new_mr;
+
+	new_mr->device = pd->device;
+	new_mr->pd = pd;
+	new_mr->uobject = NULL;
+	atomic_inc(&pd->usecnt);
+	atomic_set(&new_mr->usecnt, 0);
+
+	return new_mr;
+}
+EXPORT_SYMBOL(ib_alloc_fast_reg_mr);
+
+/*
+ * Allocate a page list for fast registration.  Returns
+ * ERR_PTR(-ENOSYS) if the driver has no alloc_fast_reg_page_list()
+ * callback.  On success the list records its device and the requested
+ * maximum length.
+ */
+struct ib_fast_reg_page_list *ib_alloc_fast_reg_page_list(struct ib_device *device,
+							  int max_page_list_len)
+{
+	struct ib_fast_reg_page_list *pl;
+
+	if (!device->alloc_fast_reg_page_list)
+		return ERR_PTR(-ENOSYS);
+
+	pl = device->alloc_fast_reg_page_list(device, max_page_list_len);
+	if (IS_ERR(pl))
+		return pl;
+
+	pl->device = device;
+	pl->max_page_list_len = max_page_list_len;
+
+	return pl;
+}
+EXPORT_SYMBOL(ib_alloc_fast_reg_page_list);
+
+/*
+ * Release a fast-registration page list by handing it to the
+ * free_fast_reg_page_list() callback of the device recorded in it.
+ */
+void ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list)
+{
+	struct ib_device *owner = page_list->device;
+
+	owner->free_fast_reg_page_list(page_list);
+}
+EXPORT_SYMBOL(ib_free_fast_reg_page_list);
+
+/* Memory windows */
+
+/*
+ * Allocate a memory window of the given @type in @pd.  Returns
+ * ERR_PTR(-ENOSYS) if the driver has no alloc_mw() callback.  On
+ * success the window is bound to the PD, whose use count is
+ * incremented.
+ */
+struct ib_mw *ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type)
+{
+	struct ib_mw *new_mw;
+
+	if (!pd->device->alloc_mw)
+		return ERR_PTR(-ENOSYS);
+
+	new_mw = pd->device->alloc_mw(pd, type);
+	if (IS_ERR(new_mw))
+		return new_mw;
+
+	new_mw->device = pd->device;
+	new_mw->pd = pd;
+	new_mw->uobject = NULL;
+	new_mw->type = type;
+	atomic_inc(&pd->usecnt);
+
+	return new_mw;
+}
+EXPORT_SYMBOL(ib_alloc_mw);
+
+/*
+ * Release a memory window via the driver's dealloc_mw().  The owning
+ * PD's use count is dropped only when the driver succeeds.
+ */
+int ib_dealloc_mw(struct ib_mw *mw)
+{
+	struct ib_pd *owner = mw->pd;
+	int err = mw->device->dealloc_mw(mw);
+
+	if (err)
+		return err;
+
+	atomic_dec(&owner->usecnt);
+	return 0;
+}
+EXPORT_SYMBOL(ib_dealloc_mw);
+
+/* "Fast" memory regions */
+
+/*
+ * Allocate a "fast" memory region in @pd.  Returns ERR_PTR(-ENOSYS) if
+ * the driver has no alloc_fmr() callback.  On success the FMR is bound
+ * to the PD, whose use count is incremented.
+ */
+struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd,
+			    int mr_access_flags,
+			    struct ib_fmr_attr *fmr_attr)
+{
+	struct ib_fmr *new_fmr;
+
+	if (!pd->device->alloc_fmr)
+		return ERR_PTR(-ENOSYS);
+
+	new_fmr = pd->device->alloc_fmr(pd, mr_access_flags, fmr_attr);
+	if (IS_ERR(new_fmr))
+		return new_fmr;
+
+	new_fmr->device = pd->device;
+	new_fmr->pd = pd;
+	atomic_inc(&pd->usecnt);
+
+	return new_fmr;
+}
+EXPORT_SYMBOL(ib_alloc_fmr);
+
+/*
+ * Unmap every FMR on @fmr_list.  An empty list is a no-op returning 0.
+ * The driver is chosen from the first FMR on the list and its
+ * unmap_fmr() callback receives the whole list.
+ */
+int ib_unmap_fmr(struct list_head *fmr_list)
+{
+	struct ib_fmr *first;
+
+	if (list_empty(fmr_list))
+		return 0;
+
+	first = list_entry(fmr_list->next, struct ib_fmr, list);
+
+	return first->device->unmap_fmr(fmr_list);
+}
+EXPORT_SYMBOL(ib_unmap_fmr);
+
+/*
+ * Release an FMR via the driver's dealloc_fmr().  The owning PD's use
+ * count is dropped only when the driver succeeds.
+ */
+int ib_dealloc_fmr(struct ib_fmr *fmr)
+{
+	struct ib_pd *owner = fmr->pd;
+	int err = fmr->device->dealloc_fmr(fmr);
+
+	if (err)
+		return err;
+
+	atomic_dec(&owner->usecnt);
+	return 0;
+}
+EXPORT_SYMBOL(ib_dealloc_fmr);
+
+/* Multicast groups */
+
+/*
+ * Attach @qp to the multicast group (@gid, @lid).  Returns -ENOSYS if
+ * the driver has no attach_mcast() callback, and -EINVAL unless the
+ * GID's first byte is 0xff and the QP is of type IB_QPT_UD.  A
+ * successful attach takes a use count on the QP.
+ */
+int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid)
+{
+	int err;
+
+	if (!qp->device->attach_mcast)
+		return -ENOSYS;
+
+	if (gid->raw[0] != 0xff)
+		return -EINVAL;
+	if (qp->qp_type != IB_QPT_UD)
+		return -EINVAL;
+
+	err = qp->device->attach_mcast(qp, gid, lid);
+	if (err)
+		return err;
+
+	atomic_inc(&qp->usecnt);
+	return 0;
+}
+EXPORT_SYMBOL(ib_attach_mcast);
+
+/**
+ * ib_detach_mcast - Detach a QP from a multicast group.
+ * @qp: queue pair to detach; must be of type IB_QPT_UD.
+ * @gid: group GID; its first raw byte must be 0xff.
+ * @lid: group LID, handed through to the driver.
+ *
+ * Returns -ENOSYS when the device has no detach_mcast() method, -EINVAL
+ * when the GID or QP type check fails, and otherwise the driver's result.
+ * A zero result also drops @qp's use count by one.
+ */
+int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid)
+{
+	int rc;
+
+	if (!qp->device->detach_mcast)
+		return -ENOSYS;
+
+	if (!(gid->raw[0] == 0xff && qp->qp_type == IB_QPT_UD))
+		return -EINVAL;
+
+	rc = qp->device->detach_mcast(qp, gid, lid);
+	if (rc)
+		return rc;
+
+	atomic_dec(&qp->usecnt);
+	return 0;
+}
+EXPORT_SYMBOL(ib_detach_mcast);
+
+/**
+ * ib_alloc_xrcd - Allocate an XRC domain on a device.
+ * @device: device to allocate the domain on.
+ *
+ * Returns the new domain, ERR_PTR(-ENOSYS) when the device has no
+ * alloc_xrcd() method, or the error pointer produced by the driver.
+ * The driver is called with its second and third arguments NULL. On
+ * success the domain's device, inode, use count, target-QP mutex and
+ * target-QP list are initialised here.
+ */
+struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device)
+{
+	struct ib_xrcd *domain;
+
+	if (!device->alloc_xrcd)
+		return ERR_PTR(-ENOSYS);
+
+	domain = device->alloc_xrcd(device, NULL, NULL);
+	if (IS_ERR(domain))
+		return domain;
+
+	domain->device = device;
+	domain->inode = NULL;
+	atomic_set(&domain->usecnt, 0);
+	mutex_init(&domain->tgt_qp_mutex);
+	INIT_LIST_HEAD(&domain->tgt_qp_list);
+
+	return domain;
+}
+EXPORT_SYMBOL(ib_alloc_xrcd);
+
+/**
+ * ib_dealloc_xrcd - Release an XRC domain.
+ * @xrcd: domain to release.
+ *
+ * Returns -EBUSY while the domain's use count is non-zero. Otherwise
+ * every QP still on the domain's target-QP list is destroyed with
+ * ib_destroy_qp(); the first failure aborts the teardown and is returned.
+ * Once the list is empty the result of the device's dealloc_xrcd()
+ * method is returned.
+ */
+int ib_dealloc_xrcd(struct ib_xrcd *xrcd)
+{
+	struct list_head *targets = &xrcd->tgt_qp_list;
+
+	if (atomic_read(&xrcd->usecnt))
+		return -EBUSY;
+
+	/*
+	 * Always take the current head of the list; the loop ends only when
+	 * the list drains (presumably ib_destroy_qp() unlinks the QP).
+	 */
+	while (!list_empty(targets)) {
+		struct ib_qp *victim;
+		int rc;
+
+		victim = list_entry(targets->next, struct ib_qp, xrcd_list);
+		rc = ib_destroy_qp(victim);
+		if (rc)
+			return rc;
+	}
+
+	return xrcd->device->dealloc_xrcd(xrcd);
+}
+EXPORT_SYMBOL(ib_dealloc_xrcd);
+
+/**
+ * ib_create_flow - Attach a flow steering rule to a QP.
+ * @qp: queue pair the rule is created on.
+ * @flow_attr: rule description, handed through to the driver.
+ * @domain: handed through to the driver unchanged.
+ *
+ * Returns the driver's flow handle, ERR_PTR(-ENOSYS) when the device has
+ * no create_flow() method, or the error pointer produced by the driver.
+ * On success the handle is linked back to @qp and @qp's use count is
+ * raised by one.
+ */
+struct ib_flow *ib_create_flow(struct ib_qp *qp,
+			       struct ib_flow_attr *flow_attr,
+			       int domain)
+{
+	struct ib_flow *flow_id;
+
+	if (!qp->device->create_flow)
+		return ERR_PTR(-ENOSYS);
+
+	flow_id = qp->device->create_flow(qp, flow_attr, domain);
+	if (!IS_ERR(flow_id)) {
+		/*
+		 * ib_destroy_flow() locates the QP through flow_id->qp, so
+		 * record it here instead of relying on the driver to do so.
+		 */
+		flow_id->qp = qp;
+		atomic_inc(&qp->usecnt);
+	}
+
+	return flow_id;
+}
+EXPORT_SYMBOL(ib_create_flow);
+
+/**
+ * ib_destroy_flow - Remove a flow steering rule.
+ * @flow_id: flow handle to destroy; its ->qp field identifies the QP.
+ *
+ * Returns the result of the device's destroy_flow() method. When that
+ * result is zero, the QP's use count is dropped by one.
+ */
+int ib_destroy_flow(struct ib_flow *flow_id)
+{
+	/* Read the QP before the driver call; @flow_id is not used after it. */
+	struct ib_qp *owner = flow_id->qp;
+	int rc;
+
+	rc = owner->device->destroy_flow(flow_id);
+	if (rc)
+		return rc;
+
+	atomic_dec(&owner->usecnt);
+	return 0;
+}
+EXPORT_SYMBOL(ib_destroy_flow);
+
+/**
+ * ib_check_mr_status - Query the status of a memory region.
+ * @mr: memory region to query.
+ * @check_mask: handed through to the driver.
+ * @mr_status: handed through to the driver.
+ *
+ * Returns -ENOSYS when the device has no check_mr_status() method,
+ * otherwise the result of that method.
+ */
+int ib_check_mr_status(struct ib_mr *mr, u32 check_mask,
+		       struct ib_mr_status *mr_status)
+{
+	if (!mr->device->check_mr_status)
+		return -ENOSYS;
+
+	return mr->device->check_mr_status(mr, check_mask, mr_status);
+}
+EXPORT_SYMBOL(ib_check_mr_status);