diff options
Diffstat (limited to 'drivers/infiniband/core')
-rw-r--r-- | drivers/infiniband/core/Makefile | 14 | ||||
-rw-r--r-- | drivers/infiniband/core/addr.c | 226 | ||||
-rw-r--r-- | drivers/infiniband/core/cache.c | 14 | ||||
-rw-r--r-- | drivers/infiniband/core/cma.c | 66 | ||||
-rw-r--r-- | drivers/infiniband/core/core_priv.h | 16 | ||||
-rw-r--r-- | drivers/infiniband/core/device.c | 62 | ||||
-rw-r--r-- | drivers/infiniband/core/iwcm.c | 4 | ||||
-rw-r--r-- | drivers/infiniband/core/iwpm_msg.c | 2 | ||||
-rw-r--r-- | drivers/infiniband/core/iwpm_util.c | 1 | ||||
-rw-r--r-- | drivers/infiniband/core/mad.c | 19 | ||||
-rw-r--r-- | drivers/infiniband/core/mr_pool.c | 86 | ||||
-rw-r--r-- | drivers/infiniband/core/multicast.c | 23 | ||||
-rw-r--r-- | drivers/infiniband/core/netlink.c | 5 | ||||
-rw-r--r-- | drivers/infiniband/core/rw.c | 727 | ||||
-rw-r--r-- | drivers/infiniband/core/sa_query.c | 213 | ||||
-rw-r--r-- | drivers/infiniband/core/sysfs.c | 378 | ||||
-rw-r--r-- | drivers/infiniband/core/uverbs_cmd.c | 13 | ||||
-rw-r--r-- | drivers/infiniband/core/verbs.c | 188 |
18 files changed, 1730 insertions, 327 deletions
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index f818538a7..edaae9f98 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -1,23 +1,19 @@ infiniband-$(CONFIG_INFINIBAND_ADDR_TRANS) := rdma_cm.o user_access-$(CONFIG_INFINIBAND_ADDR_TRANS) := rdma_ucm.o -obj-$(CONFIG_INFINIBAND) += ib_core.o ib_mad.o ib_sa.o \ - ib_cm.o iw_cm.o ib_addr.o \ +obj-$(CONFIG_INFINIBAND) += ib_core.o ib_cm.o iw_cm.o \ $(infiniband-y) obj-$(CONFIG_INFINIBAND_USER_MAD) += ib_umad.o obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \ $(user_access-y) -ib_core-y := packer.o ud_header.o verbs.o cq.o sysfs.o \ +ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \ device.o fmr_pool.o cache.o netlink.o \ - roce_gid_mgmt.o + roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \ + multicast.o mad.o smi.o agent.o mad_rmpp.o ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o -ib_mad-y := mad.o smi.o agent.o mad_rmpp.o - -ib_sa-y := sa_query.o multicast.o - ib_cm-y := cm.o iw_cm-y := iwcm.o iwpm_util.o iwpm_msg.o @@ -28,8 +24,6 @@ rdma_cm-$(CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS) += cma_configfs.o rdma_ucm-y := ucma.o -ib_addr-y := addr.o - ib_umad-y := user_mad.o ib_ucm-y := ucm.o diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 337353d86..1374541a4 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -46,10 +46,10 @@ #include <net/ip6_route.h> #include <rdma/ib_addr.h> #include <rdma/ib.h> +#include <rdma/rdma_netlink.h> +#include <net/netlink.h> -MODULE_AUTHOR("Sean Hefty"); -MODULE_DESCRIPTION("IB Address Translation"); -MODULE_LICENSE("Dual BSD/GPL"); +#include "core_priv.h" struct addr_req { struct list_head list; @@ -62,8 +62,11 @@ struct addr_req { struct rdma_dev_addr *addr, void *context); unsigned long timeout; int status; + u32 seq; }; +static atomic_t ib_nl_addr_request_seq = ATOMIC_INIT(0); + static void process_req(struct work_struct *work); static DEFINE_MUTEX(lock); @@ -71,6 +74,126 @@ static LIST_HEAD(req_list); static DECLARE_DELAYED_WORK(work, process_req); static struct workqueue_struct *addr_wq; +static const struct nla_policy ib_nl_addr_policy[LS_NLA_TYPE_MAX] = { + [LS_NLA_TYPE_DGID] = {.type = NLA_BINARY, + .len = sizeof(struct rdma_nla_ls_gid)}, +}; + +static inline bool ib_nl_is_good_ip_resp(const struct nlmsghdr *nlh) +{ + struct nlattr *tb[LS_NLA_TYPE_MAX] = {}; + int ret; + + if (nlh->nlmsg_flags & RDMA_NL_LS_F_ERR) + return false; + + ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh), + nlmsg_len(nlh), ib_nl_addr_policy); + if (ret) + return false; + + return true; +} + +static void ib_nl_process_good_ip_rsep(const struct nlmsghdr *nlh) +{ + const struct nlattr *head, *curr; + union ib_gid gid; + struct addr_req *req; + int len, rem; + int found = 0; + + head = (const struct nlattr *)nlmsg_data(nlh); + len = nlmsg_len(nlh); + + nla_for_each_attr(curr, head, len, rem) { + if (curr->nla_type == LS_NLA_TYPE_DGID) + memcpy(&gid, nla_data(curr), nla_len(curr)); + } + + mutex_lock(&lock); + list_for_each_entry(req, &req_list, list) { + if (nlh->nlmsg_seq != req->seq) + continue; + /* We set the DGID part, the rest was set earlier */ + rdma_addr_set_dgid(req->addr, &gid); + req->status = 0; + found = 1; + break; + } + mutex_unlock(&lock); + + if (!found) + pr_info("Couldn't find request waiting for DGID: %pI6\n", + &gid); +} + +int ib_nl_handle_ip_res_resp(struct sk_buff *skb, + struct netlink_callback *cb) +{ + const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh; + + if ((nlh->nlmsg_flags & NLM_F_REQUEST) || + !(NETLINK_CB(skb).sk) || + !netlink_capable(skb, CAP_NET_ADMIN)) + return -EPERM; + + if (ib_nl_is_good_ip_resp(nlh)) + ib_nl_process_good_ip_rsep(nlh); + + return skb->len; +} + +static int ib_nl_ip_send_msg(struct rdma_dev_addr *dev_addr, + const void *daddr, + u32 seq, u16 family) +{ + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; + struct rdma_ls_ip_resolve_header *header; + void *data; + size_t size; + int attrtype; + int len; + + if (family == AF_INET) { + size = sizeof(struct in_addr); + attrtype = RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_IPV4; + } else { + size = sizeof(struct in6_addr); + attrtype = RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_IPV6; + } + + len = nla_total_size(sizeof(size)); + len += NLMSG_ALIGN(sizeof(*header)); + + skb = nlmsg_new(len, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + data = ibnl_put_msg(skb, &nlh, seq, 0, RDMA_NL_LS, + RDMA_NL_LS_OP_IP_RESOLVE, NLM_F_REQUEST); + if (!data) { + nlmsg_free(skb); + return -ENODATA; + } + + /* Construct the family header first */ + header = (struct rdma_ls_ip_resolve_header *) + skb_put(skb, NLMSG_ALIGN(sizeof(*header))); + header->ifindex = dev_addr->bound_dev_if; + nla_put(skb, attrtype, size, daddr); + + /* Repair the nlmsg header length */ + nlmsg_end(skb, nlh); + ibnl_multicast(skb, nlh, RDMA_NL_GROUP_LS, GFP_KERNEL); + + /* Make the request retry, so when we get the response from userspace + * we will have something. + */ + return -ENODATA; +} + int rdma_addr_size(struct sockaddr *addr) { switch (addr->sa_family) { @@ -199,6 +322,17 @@ static void queue_req(struct addr_req *req) mutex_unlock(&lock); } +static int ib_nl_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr, + const void *daddr, u32 seq, u16 family) +{ + if (ibnl_chk_listeners(RDMA_NL_GROUP_LS)) + return -EADDRNOTAVAIL; + + /* We fill in what we can, the response will fill the rest */ + rdma_copy_addr(dev_addr, dst->dev, NULL); + return ib_nl_ip_send_msg(dev_addr, daddr, seq, family); +} + static int dst_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr, const void *daddr) { @@ -223,6 +357,39 @@ static int dst_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr, return ret; } +static bool has_gateway(struct dst_entry *dst, sa_family_t family) +{ + struct rtable *rt; + struct rt6_info *rt6; + + if (family == AF_INET) { + rt = container_of(dst, struct rtable, dst); + return rt->rt_uses_gateway; + } + + rt6 = container_of(dst, struct rt6_info, dst); + return rt6->rt6i_flags & RTF_GATEWAY; +} + +static int fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr, + const struct sockaddr *dst_in, u32 seq) +{ + const struct sockaddr_in *dst_in4 = + (const struct sockaddr_in *)dst_in; + const struct sockaddr_in6 *dst_in6 = + (const struct sockaddr_in6 *)dst_in; + const void *daddr = (dst_in->sa_family == AF_INET) ? + (const void *)&dst_in4->sin_addr.s_addr : + (const void *)&dst_in6->sin6_addr; + sa_family_t family = dst_in->sa_family; + + /* Gateway + ARPHRD_INFINIBAND -> IB router */ + if (has_gateway(dst, family) && dst->dev->type == ARPHRD_INFINIBAND) + return ib_nl_fetch_ha(dst, dev_addr, daddr, seq, family); + else + return dst_fetch_ha(dst, dev_addr, daddr); +} + static int addr4_resolve(struct sockaddr_in *src_in, const struct sockaddr_in *dst_in, struct rdma_dev_addr *addr, @@ -246,10 +413,11 @@ static int addr4_resolve(struct sockaddr_in *src_in, src_in->sin_family = AF_INET; src_in->sin_addr.s_addr = fl4.saddr; - /* If there's a gateway, we're definitely in RoCE v2 (as RoCE v1 isn't - * routable) and we could set the network type accordingly. + /* If there's a gateway and type of device not ARPHRD_INFINIBAND, we're + * definitely in RoCE v2 (as RoCE v1 isn't routable) set the network + * type accordingly. */ - if (rt->rt_uses_gateway) + if (rt->rt_uses_gateway && rt->dst.dev->type != ARPHRD_INFINIBAND) addr->network = RDMA_NETWORK_IPV4; addr->hoplimit = ip4_dst_hoplimit(&rt->dst); @@ -291,10 +459,12 @@ static int addr6_resolve(struct sockaddr_in6 *src_in, src_in->sin6_addr = fl6.saddr; } - /* If there's a gateway, we're definitely in RoCE v2 (as RoCE v1 isn't - * routable) and we could set the network type accordingly. + /* If there's a gateway and type of device not ARPHRD_INFINIBAND, we're + * definitely in RoCE v2 (as RoCE v1 isn't routable) set the network + * type accordingly. */ - if (rt->rt6i_flags & RTF_GATEWAY) + if (rt->rt6i_flags & RTF_GATEWAY && + ip6_dst_idev(dst)->dev->type != ARPHRD_INFINIBAND) addr->network = RDMA_NETWORK_IPV6; addr->hoplimit = ip6_dst_hoplimit(dst); @@ -317,7 +487,8 @@ static int addr6_resolve(struct sockaddr_in6 *src_in, static int addr_resolve_neigh(struct dst_entry *dst, const struct sockaddr *dst_in, - struct rdma_dev_addr *addr) + struct rdma_dev_addr *addr, + u32 seq) { if (dst->dev->flags & IFF_LOOPBACK) { int ret; @@ -331,17 +502,8 @@ static int addr_resolve_neigh(struct dst_entry *dst, } /* If the device doesn't do ARP internally */ - if (!(dst->dev->flags & IFF_NOARP)) { - const struct sockaddr_in *dst_in4 = - (const struct sockaddr_in *)dst_in; - const struct sockaddr_in6 *dst_in6 = - (const struct sockaddr_in6 *)dst_in; - - return dst_fetch_ha(dst, addr, - dst_in->sa_family == AF_INET ? - (const void *)&dst_in4->sin_addr.s_addr : - (const void *)&dst_in6->sin6_addr); - } + if (!(dst->dev->flags & IFF_NOARP)) + return fetch_ha(dst, addr, dst_in, seq); return rdma_copy_addr(addr, dst->dev, NULL); } @@ -349,7 +511,8 @@ static int addr_resolve_neigh(struct dst_entry *dst, static int addr_resolve(struct sockaddr *src_in, const struct sockaddr *dst_in, struct rdma_dev_addr *addr, - bool resolve_neigh) + bool resolve_neigh, + u32 seq) { struct net_device *ndev; struct dst_entry *dst; @@ -366,7 +529,7 @@ static int addr_resolve(struct sockaddr *src_in, return ret; if (resolve_neigh) - ret = addr_resolve_neigh(&rt->dst, dst_in, addr); + ret = addr_resolve_neigh(&rt->dst, dst_in, addr, seq); ndev = rt->dst.dev; dev_hold(ndev); @@ -383,7 +546,7 @@ static int addr_resolve(struct sockaddr *src_in, return ret; if (resolve_neigh) - ret = addr_resolve_neigh(dst, dst_in, addr); + ret = addr_resolve_neigh(dst, dst_in, addr, seq); ndev = dst->dev; dev_hold(ndev); @@ -412,7 +575,7 @@ static void process_req(struct work_struct *work) src_in = (struct sockaddr *) &req->src_addr; dst_in = (struct sockaddr *) &req->dst_addr; req->status = addr_resolve(src_in, dst_in, req->addr, - true); + true, req->seq); if (req->status && time_after_eq(jiffies, req->timeout)) req->status = -ETIMEDOUT; else if (req->status == -ENODATA) @@ -471,8 +634,9 @@ int rdma_resolve_ip(struct rdma_addr_client *client, req->context = context; req->client = client; atomic_inc(&client->refcount); + req->seq = (u32)atomic_inc_return(&ib_nl_addr_request_seq); - req->status = addr_resolve(src_in, dst_in, addr, true); + req->status = addr_resolve(src_in, dst_in, addr, true, req->seq); switch (req->status) { case 0: req->timeout = jiffies; @@ -510,7 +674,7 @@ int rdma_resolve_ip_route(struct sockaddr *src_addr, src_in->sa_family = dst_addr->sa_family; } - return addr_resolve(src_in, dst_addr, addr, false); + return addr_resolve(src_in, dst_addr, addr, false, 0); } EXPORT_SYMBOL(rdma_resolve_ip_route); @@ -634,7 +798,7 @@ static struct notifier_block nb = { .notifier_call = netevent_callback }; -static int __init addr_init(void) +int addr_init(void) { addr_wq = create_singlethread_workqueue("ib_addr"); if (!addr_wq) @@ -642,15 +806,13 @@ static int __init addr_init(void) register_netevent_notifier(&nb); rdma_addr_register_client(&self); + return 0; } -static void __exit addr_cleanup(void) +void addr_cleanup(void) { rdma_addr_unregister_client(&self); unregister_netevent_notifier(&nb); destroy_workqueue(addr_wq); } - -module_init(addr_init); -module_exit(addr_cleanup); diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index c2e257d97..1a2984c28 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -178,6 +178,7 @@ static int write_gid(struct ib_device *ib_dev, u8 port, { int ret = 0; struct net_device *old_net_dev; + enum ib_gid_type old_gid_type; /* in rdma_cap_roce_gid_table, this funciton should be protected by a * sleep-able lock. @@ -199,6 +200,7 @@ static int write_gid(struct ib_device *ib_dev, u8 port, } old_net_dev = table->data_vec[ix].attr.ndev; + old_gid_type = table->data_vec[ix].attr.gid_type; if (old_net_dev && old_net_dev != attr->ndev) dev_put(old_net_dev); /* if modify_gid failed, just delete the old gid */ @@ -207,10 +209,14 @@ static int write_gid(struct ib_device *ib_dev, u8 port, attr = &zattr; table->data_vec[ix].context = NULL; } - if (default_gid) - table->data_vec[ix].props |= GID_TABLE_ENTRY_DEFAULT; + memcpy(&table->data_vec[ix].gid, gid, sizeof(*gid)); memcpy(&table->data_vec[ix].attr, attr, sizeof(*attr)); + if (default_gid) { + table->data_vec[ix].props |= GID_TABLE_ENTRY_DEFAULT; + if (action == GID_TABLE_WRITE_ACTION_DEL) + table->data_vec[ix].attr.gid_type = old_gid_type; + } if (table->data_vec[ix].attr.ndev && table->data_vec[ix].attr.ndev != old_net_dev) dev_hold(table->data_vec[ix].attr.ndev); @@ -405,7 +411,9 @@ int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port, for (ix = 0; ix < table->sz; ix++) if (table->data_vec[ix].attr.ndev == ndev) - if (!del_gid(ib_dev, port, table, ix, false)) + if (!del_gid(ib_dev, port, table, ix, + !!(table->data_vec[ix].props & + GID_TABLE_ENTRY_DEFAULT))) deleted = true; write_unlock_irq(&table->rwlock); diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 93ab0ae97..ad1b1adcf 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -708,17 +708,6 @@ static void cma_deref_id(struct rdma_id_private *id_priv) complete(&id_priv->comp); } -static int cma_disable_callback(struct rdma_id_private *id_priv, - enum rdma_cm_state state) -{ - mutex_lock(&id_priv->handler_mutex); - if (id_priv->state != state) { - mutex_unlock(&id_priv->handler_mutex); - return -EINVAL; - } - return 0; -} - struct rdma_cm_id *rdma_create_id(struct net *net, rdma_cm_event_handler event_handler, void *context, enum rdma_port_space ps, @@ -800,6 +789,7 @@ int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd, if (id->device != pd->device) return -EINVAL; + qp_init_attr->port_num = id->port_num; qp = ib_create_qp(pd, qp_init_attr); if (IS_ERR(qp)) return PTR_ERR(qp); @@ -1670,11 +1660,12 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) struct rdma_cm_event event; int ret = 0; + mutex_lock(&id_priv->handler_mutex); if ((ib_event->event != IB_CM_TIMEWAIT_EXIT && - cma_disable_callback(id_priv, RDMA_CM_CONNECT)) || + id_priv->state != RDMA_CM_CONNECT) || (ib_event->event == IB_CM_TIMEWAIT_EXIT && - cma_disable_callback(id_priv, RDMA_CM_DISCONNECT))) - return 0; + id_priv->state != RDMA_CM_DISCONNECT)) + goto out; memset(&event, 0, sizeof event); switch (ib_event->event) { @@ -1869,7 +1860,7 @@ static int cma_check_req_qp_type(struct rdma_cm_id *id, struct ib_cm_event *ib_e static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) { - struct rdma_id_private *listen_id, *conn_id; + struct rdma_id_private *listen_id, *conn_id = NULL; struct rdma_cm_event event; struct net_device *net_dev; int offset, ret; @@ -1883,9 +1874,10 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) goto net_dev_put; } - if (cma_disable_callback(listen_id, RDMA_CM_LISTEN)) { + mutex_lock(&listen_id->handler_mutex); + if (listen_id->state != RDMA_CM_LISTEN) { ret = -ECONNABORTED; - goto net_dev_put; + goto err1; } memset(&event, 0, sizeof event); @@ -1975,8 +1967,9 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr; struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr; - if (cma_disable_callback(id_priv, RDMA_CM_CONNECT)) - return 0; + mutex_lock(&id_priv->handler_mutex); + if (id_priv->state != RDMA_CM_CONNECT) + goto out; memset(&event, 0, sizeof event); switch (iw_event->event) { @@ -2028,6 +2021,7 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) return ret; } +out: mutex_unlock(&id_priv->handler_mutex); return ret; } @@ -2038,13 +2032,15 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, struct rdma_cm_id *new_cm_id; struct rdma_id_private *listen_id, *conn_id; struct rdma_cm_event event; - int ret; + int ret = -ECONNABORTED; struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr; struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr; listen_id = cm_id->context; - if (cma_disable_callback(listen_id, RDMA_CM_LISTEN)) - return -ECONNABORTED; + + mutex_lock(&listen_id->handler_mutex); + if (listen_id->state != RDMA_CM_LISTEN) + goto out; /* Create a new RDMA id for the new IW CM ID */ new_cm_id = rdma_create_id(listen_id->id.route.addr.dev_addr.net, @@ -3215,8 +3211,9 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_sidr_rep_event_param *rep = &ib_event->param.sidr_rep_rcvd; int ret = 0; - if (cma_disable_callback(id_priv, RDMA_CM_CONNECT)) - return 0; + mutex_lock(&id_priv->handler_mutex); + if (id_priv->state != RDMA_CM_CONNECT) + goto out; memset(&event, 0, sizeof event); switch (ib_event->event) { @@ -3672,12 +3669,13 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) struct rdma_id_private *id_priv; struct cma_multicast *mc = multicast->context; struct rdma_cm_event event; - int ret; + int ret = 0; id_priv = mc->id_priv; - if (cma_disable_callback(id_priv, RDMA_CM_ADDR_BOUND) && - cma_disable_callback(id_priv, RDMA_CM_ADDR_RESOLVED)) - return 0; + mutex_lock(&id_priv->handler_mutex); + if (id_priv->state != RDMA_CM_ADDR_BOUND && + id_priv->state != RDMA_CM_ADDR_RESOLVED) + goto out; if (!status) status = cma_set_qkey(id_priv, be32_to_cpu(multicast->rec.qkey)); @@ -3719,6 +3717,7 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) return 0; } +out: mutex_unlock(&id_priv->handler_mutex); return 0; } @@ -3877,12 +3876,12 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, gid_type = id_priv->cma_dev->default_gid_type[id_priv->id.port_num - rdma_start_port(id_priv->cma_dev->device)]; if (addr->sa_family == AF_INET) { - if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) + if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) { + mc->multicast.ib->rec.hop_limit = IPV6_DEFAULT_HOPLIMIT; err = cma_igmp_send(ndev, &mc->multicast.ib->rec.mgid, true); - if (!err) { - mc->igmp_joined = true; - mc->multicast.ib->rec.hop_limit = IPV6_DEFAULT_HOPLIMIT; + if (!err) + mc->igmp_joined = true; } } else { if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) @@ -4294,7 +4293,8 @@ static int __init cma_init(void) if (ret) goto err; - if (ibnl_add_client(RDMA_NL_RDMA_CM, RDMA_NL_RDMA_CM_NUM_OPS, cma_cb_table)) + if (ibnl_add_client(RDMA_NL_RDMA_CM, ARRAY_SIZE(cma_cb_table), + cma_cb_table)) pr_warn("RDMA CMA: failed to add netlink callback\n"); cma_configfs_init(); diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index eab322157..19d499dca 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -137,4 +137,20 @@ static inline bool rdma_is_upper_dev_rcu(struct net_device *dev, return _upper == upper; } +int addr_init(void); +void addr_cleanup(void); + +int ib_mad_init(void); +void ib_mad_cleanup(void); + +int ib_sa_init(void); +void ib_sa_cleanup(void); + +int ib_nl_handle_resolve_resp(struct sk_buff *skb, + struct netlink_callback *cb); +int ib_nl_handle_set_timeout(struct sk_buff *skb, + struct netlink_callback *cb); +int ib_nl_handle_ip_res_resp(struct sk_buff *skb, + struct netlink_callback *cb); + #endif /* _CORE_PRIV_H */ diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 109798440..5c155fa91 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -661,6 +661,9 @@ int ib_query_port(struct ib_device *device, if (err || port_attr->subnet_prefix) return err; + if (rdma_port_get_link_layer(device, port_num) != IB_LINK_LAYER_INFINIBAND) + return 0; + err = ib_query_gid(device, port_num, 0, &gid, NULL); if (err) return err; @@ -955,6 +958,29 @@ struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, } EXPORT_SYMBOL(ib_get_net_dev_by_params); +static struct ibnl_client_cbs ibnl_ls_cb_table[] = { + [RDMA_NL_LS_OP_RESOLVE] = { + .dump = ib_nl_handle_resolve_resp, + .module = THIS_MODULE }, + [RDMA_NL_LS_OP_SET_TIMEOUT] = { + .dump = ib_nl_handle_set_timeout, + .module = THIS_MODULE }, + [RDMA_NL_LS_OP_IP_RESOLVE] = { + .dump = ib_nl_handle_ip_res_resp, + .module = THIS_MODULE }, +}; + +static int ib_add_ibnl_clients(void) +{ + return ibnl_add_client(RDMA_NL_LS, ARRAY_SIZE(ibnl_ls_cb_table), + ibnl_ls_cb_table); +} + +static void ib_remove_ibnl_clients(void) +{ + ibnl_remove_client(RDMA_NL_LS); +} + static int __init ib_core_init(void) { int ret; @@ -983,10 +1009,42 @@ static int __init ib_core_init(void) goto err_sysfs; } + ret = addr_init(); + if (ret) { + pr_warn("Could't init IB address resolution\n"); + goto err_ibnl; + } + + ret = ib_mad_init(); + if (ret) { + pr_warn("Couldn't init IB MAD\n"); + goto err_addr; + } + + ret = ib_sa_init(); + if (ret) { + pr_warn("Couldn't init SA\n"); + goto err_mad; + } + + ret = ib_add_ibnl_clients(); + if (ret) { + pr_warn("Couldn't register ibnl clients\n"); + goto err_sa; + } + ib_cache_setup(); return 0; +err_sa: + ib_sa_cleanup(); +err_mad: + ib_mad_cleanup(); +err_addr: + addr_cleanup(); +err_ibnl: + ibnl_cleanup(); err_sysfs: class_unregister(&ib_class); err_comp: @@ -999,6 +1057,10 @@ err: static void __exit ib_core_cleanup(void) { ib_cache_cleanup(); + ib_remove_ibnl_clients(); + ib_sa_cleanup(); + ib_mad_cleanup(); + addr_cleanup(); ibnl_cleanup(); class_unregister(&ib_class); destroy_workqueue(ib_comp_wq); diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index e28a160cd..f0572049d 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -459,7 +459,7 @@ static void iw_cm_check_wildcard(struct sockaddr_storage *pm_addr, if (pm_addr->ss_family == AF_INET) { struct sockaddr_in *pm4_addr = (struct sockaddr_in *)pm_addr; - if (pm4_addr->sin_addr.s_addr == INADDR_ANY) { + if (pm4_addr->sin_addr.s_addr == htonl(INADDR_ANY)) { struct sockaddr_in *cm4_addr = (struct sockaddr_in *)cm_addr; struct sockaddr_in *cm4_outaddr = @@ -1175,7 +1175,7 @@ static int __init iw_cm_init(void) if (ret) pr_err("iw_cm: couldn't init iwpm\n"); - ret = ibnl_add_client(RDMA_NL_IWCM, RDMA_NL_IWPM_NUM_OPS, + ret = ibnl_add_client(RDMA_NL_IWCM, ARRAY_SIZE(iwcm_nl_cb_table), iwcm_nl_cb_table); if (ret) pr_err("iw_cm: couldn't register netlink callbacks\n"); diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c index 43e3fa271..1c41b95ce 100644 --- a/drivers/infiniband/core/iwpm_msg.c +++ b/drivers/infiniband/core/iwpm_msg.c @@ -506,7 +506,7 @@ int iwpm_add_and_query_mapping_cb(struct sk_buff *skb, if (!nlmsg_request) { pr_info("%s: Could not find a matching request (seq = %u)\n", __func__, msg_seq); - return -EINVAL; + return -EINVAL; } pm_msg = nlmsg_request->req_buffer; local_sockaddr = (struct sockaddr_storage *) diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c index 9b2bf2fb2..b65e06c56 100644 --- a/drivers/infiniband/core/iwpm_util.c +++ b/drivers/infiniband/core/iwpm_util.c @@ -634,6 +634,7 @@ static int send_nlmsg_done(struct sk_buff *skb, u8 nl_client, int iwpm_pid) if (!(ibnl_put_msg(skb, &nlh, 0, 0, nl_client, RDMA_NL_IWPM_MAPINFO, NLM_F_MULTI))) { pr_warn("%s Unable to put NLMSG_DONE\n", __func__); + dev_kfree_skb(skb); return -ENOMEM; } nlh->nlmsg_type = NLMSG_DONE; diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index 9fa5bf33f..2d49228f2 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -47,11 +47,7 @@ #include "smi.h" #include "opa_smi.h" #include "agent.h" - -MODULE_LICENSE("Dual BSD/GPL"); -MODULE_DESCRIPTION("kernel IB MAD API"); -MODULE_AUTHOR("Hal Rosenstock"); -MODULE_AUTHOR("Sean Hefty"); +#include "core_priv.h" static int mad_sendq_size = IB_MAD_QP_SEND_SIZE; static int mad_recvq_size = IB_MAD_QP_RECV_SIZE; @@ -1642,9 +1638,9 @@ static void remove_mad_reg_req(struct ib_mad_agent_private *agent_priv) /* Now, check to see if there are any methods still in use */ if (!check_method_table(method)) { /* If not, release management method table */ - kfree(method); - class->method_table[mgmt_class] = NULL; - /* Any management classes left ? */ + kfree(method); + class->method_table[mgmt_class] = NULL; + /* Any management classes left ? */ if (!check_class_table(class)) { /* If not, release management class table */ kfree(class); @@ -3316,7 +3312,7 @@ static struct ib_client mad_client = { .remove = ib_mad_remove_device }; -static int __init ib_mad_init_module(void) +int ib_mad_init(void) { mad_recvq_size = min(mad_recvq_size, IB_MAD_QP_MAX_SIZE); mad_recvq_size = max(mad_recvq_size, IB_MAD_QP_MIN_SIZE); @@ -3334,10 +3330,7 @@ static int __init ib_mad_init_module(void) return 0; } -static void __exit ib_mad_cleanup_module(void) +void ib_mad_cleanup(void) { ib_unregister_client(&mad_client); } - -module_init(ib_mad_init_module); -module_exit(ib_mad_cleanup_module); diff --git a/drivers/infiniband/core/mr_pool.c b/drivers/infiniband/core/mr_pool.c new file mode 100644 index 000000000..49d478b2e --- /dev/null +++ b/drivers/infiniband/core/mr_pool.c @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#include <rdma/ib_verbs.h> +#include <rdma/mr_pool.h> + +struct ib_mr *ib_mr_pool_get(struct ib_qp *qp, struct list_head *list) +{ + struct ib_mr *mr; + unsigned long flags; + + spin_lock_irqsave(&qp->mr_lock, flags); + mr = list_first_entry_or_null(list, struct ib_mr, qp_entry); + if (mr) { + list_del(&mr->qp_entry); + qp->mrs_used++; + } + spin_unlock_irqrestore(&qp->mr_lock, flags); + + return mr; +} +EXPORT_SYMBOL(ib_mr_pool_get); + +void ib_mr_pool_put(struct ib_qp *qp, struct list_head *list, struct ib_mr *mr) +{ + unsigned long flags; + + spin_lock_irqsave(&qp->mr_lock, flags); + list_add(&mr->qp_entry, list); + qp->mrs_used--; + spin_unlock_irqrestore(&qp->mr_lock, flags); +} +EXPORT_SYMBOL(ib_mr_pool_put); + +int ib_mr_pool_init(struct ib_qp *qp, struct list_head *list, int nr, + enum ib_mr_type type, u32 max_num_sg) +{ + struct ib_mr *mr; + unsigned long flags; + int ret, i; + + for (i = 0; i < nr; i++) { + mr = ib_alloc_mr(qp->pd, type, max_num_sg); + if (IS_ERR(mr)) { + ret = PTR_ERR(mr); + goto out; + } + + spin_lock_irqsave(&qp->mr_lock, flags); + list_add_tail(&mr->qp_entry, list); + spin_unlock_irqrestore(&qp->mr_lock, flags); + } + + return 0; +out: + ib_mr_pool_destroy(qp, list); + return ret; +} +EXPORT_SYMBOL(ib_mr_pool_init); + +void ib_mr_pool_destroy(struct ib_qp *qp, struct list_head *list) +{ + struct ib_mr *mr; + unsigned long flags; + + spin_lock_irqsave(&qp->mr_lock, flags); + while (!list_empty(list)) { + mr = list_first_entry(list, struct ib_mr, qp_entry); + list_del(&mr->qp_entry); + + spin_unlock_irqrestore(&qp->mr_lock, flags); + ib_dereg_mr(mr); + spin_lock_irqsave(&qp->mr_lock, flags); + } + spin_unlock_irqrestore(&qp->mr_lock, flags); +} +EXPORT_SYMBOL(ib_mr_pool_destroy); diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c index 250937cb9..a83ec28a1 100644 --- a/drivers/infiniband/core/multicast.c +++ b/drivers/infiniband/core/multicast.c @@ -93,6 +93,18 @@ enum { struct mcast_member; +/* +* There are 4 types of join states: +* FullMember, NonMember, SendOnlyNonMember, SendOnlyFullMember. +*/ +enum { + FULLMEMBER_JOIN, + NONMEMBER_JOIN, + SENDONLY_NONMEBER_JOIN, + SENDONLY_FULLMEMBER_JOIN, + NUM_JOIN_MEMBERSHIP_TYPES, +}; + struct mcast_group { struct ib_sa_mcmember_rec rec; struct rb_node node; @@ -102,7 +114,7 @@ struct mcast_group { struct list_head pending_list; struct list_head active_list; struct mcast_member *last_join; - int members[3]; + int members[NUM_JOIN_MEMBERSHIP_TYPES]; atomic_t refcount; enum mcast_group_state state; struct ib_sa_query *query; @@ -220,8 +232,9 @@ static void queue_join(struct mcast_member *member) } /* - * A multicast group has three types of members: full member, non member, and - * send only member. We need to keep track of the number of members of each + * A multicast group has four types of members: full member, non member, + * sendonly non member and sendonly full member. + * We need to keep track of the number of members of each * type based on their join state. Adjust the number of members the belong to * the specified join states. */ @@ -229,7 +242,7 @@ static void adjust_membership(struct mcast_group *group, u8 join_state, int inc) { int i; - for (i = 0; i < 3; i++, join_state >>= 1) + for (i = 0; i < NUM_JOIN_MEMBERSHIP_TYPES; i++, join_state >>= 1) if (join_state & 0x1) group->members[i] += inc; } @@ -245,7 +258,7 @@ static u8 get_leave_state(struct mcast_group *group) u8 leave_state = 0; int i; - for (i = 0; i < 3; i++) + for (i = 0; i < NUM_JOIN_MEMBERSHIP_TYPES; i++) if (!group->members[i]) leave_state |= (0x1 << i); diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index d47df9356..9b8c20c82 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -151,12 +151,11 @@ static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) struct ibnl_client *client; int type = nlh->nlmsg_type; int index = RDMA_NL_GET_CLIENT(type); - int op = RDMA_NL_GET_OP(type); + unsigned int op = RDMA_NL_GET_OP(type); list_for_each_entry(client, &client_list, list) { if (client->index == index) { - if (op < 0 || op >= client->nops || - !client->cb_table[op].dump) + if (op >= client->nops || !client->cb_table[op].dump) return -EINVAL; /* diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c new file mode 100644 index 000000000..1eb9b1294 --- /dev/null +++ b/drivers/infiniband/core/rw.c @@ -0,0 +1,727 @@ +/* + * Copyright (c) 2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#include <linux/moduleparam.h> +#include <linux/slab.h> +#include <rdma/mr_pool.h> +#include <rdma/rw.h> + +enum { + RDMA_RW_SINGLE_WR, + RDMA_RW_MULTI_WR, + RDMA_RW_MR, + RDMA_RW_SIG_MR, +}; + +static bool rdma_rw_force_mr; +module_param_named(force_mr, rdma_rw_force_mr, bool, 0); +MODULE_PARM_DESC(force_mr, "Force usage of MRs for RDMA READ/WRITE operations"); + +/* + * Check if the device might use memory registration. This is currently only + * true for iWarp devices. In the future we can hopefully fine tune this based + * on HCA driver input. + */ +static inline bool rdma_rw_can_use_mr(struct ib_device *dev, u8 port_num) +{ + if (rdma_protocol_iwarp(dev, port_num)) + return true; + if (unlikely(rdma_rw_force_mr)) + return true; + return false; +} + +/* + * Check if the device will use memory registration for this RW operation. + * We currently always use memory registrations for iWarp RDMA READs, and + * have a debug option to force usage of MRs. + * + * XXX: In the future we can hopefully fine tune this based on HCA driver + * input. + */ +static inline bool rdma_rw_io_needs_mr(struct ib_device *dev, u8 port_num, + enum dma_data_direction dir, int dma_nents) +{ + if (rdma_protocol_iwarp(dev, port_num) && dir == DMA_FROM_DEVICE) + return true; + if (unlikely(rdma_rw_force_mr)) + return true; + return false; +} + +static inline u32 rdma_rw_max_sge(struct ib_device *dev, + enum dma_data_direction dir) +{ + return dir == DMA_TO_DEVICE ? + dev->attrs.max_sge : dev->attrs.max_sge_rd; +} + +static inline u32 rdma_rw_fr_page_list_len(struct ib_device *dev) +{ + /* arbitrary limit to avoid allocating gigantic resources */ + return min_t(u32, dev->attrs.max_fast_reg_page_list_len, 256); +} + +static int rdma_rw_init_one_mr(struct ib_qp *qp, u8 port_num, + struct rdma_rw_reg_ctx *reg, struct scatterlist *sg, + u32 sg_cnt, u32 offset) +{ + u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device); + u32 nents = min(sg_cnt, pages_per_mr); + int count = 0, ret; + + reg->mr = ib_mr_pool_get(qp, &qp->rdma_mrs); + if (!reg->mr) + return -EAGAIN; + + if (reg->mr->need_inval) { + reg->inv_wr.opcode = IB_WR_LOCAL_INV; + reg->inv_wr.ex.invalidate_rkey = reg->mr->lkey; + reg->inv_wr.next = ®->reg_wr.wr; + count++; + } else { + reg->inv_wr.next = NULL; + } + + ret = ib_map_mr_sg(reg->mr, sg, nents, &offset, PAGE_SIZE); + if (ret < nents) { + ib_mr_pool_put(qp, &qp->rdma_mrs, reg->mr); + return -EINVAL; + } + + reg->reg_wr.wr.opcode = IB_WR_REG_MR; + reg->reg_wr.mr = reg->mr; + reg->reg_wr.access = IB_ACCESS_LOCAL_WRITE; + if (rdma_protocol_iwarp(qp->device, port_num)) + reg->reg_wr.access |= IB_ACCESS_REMOTE_WRITE; + count++; + + reg->sge.addr = reg->mr->iova; + reg->sge.length = reg->mr->length; + return count; +} + +static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + u8 port_num, struct scatterlist *sg, u32 sg_cnt, u32 offset, + u64 remote_addr, u32 rkey, enum dma_data_direction dir) +{ + u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device); + int i, j, ret = 0, count = 0; + + ctx->nr_ops = (sg_cnt + pages_per_mr - 1) / pages_per_mr; + ctx->reg = kcalloc(ctx->nr_ops, sizeof(*ctx->reg), GFP_KERNEL); + if (!ctx->reg) { + ret = -ENOMEM; + goto out; + } + + for (i = 0; i < ctx->nr_ops; i++) { + struct rdma_rw_reg_ctx *prev = i ? &ctx->reg[i - 1] : NULL; + struct rdma_rw_reg_ctx *reg = &ctx->reg[i]; + u32 nents = min(sg_cnt, pages_per_mr); + + ret = rdma_rw_init_one_mr(qp, port_num, reg, sg, sg_cnt, + offset); + if (ret < 0) + goto out_free; + count += ret; + + if (prev) { + if (reg->mr->need_inval) + prev->wr.wr.next = ®->inv_wr; + else + prev->wr.wr.next = ®->reg_wr.wr; + } + + reg->reg_wr.wr.next = ®->wr.wr; + + reg->wr.wr.sg_list = ®->sge; + reg->wr.wr.num_sge = 1; + reg->wr.remote_addr = remote_addr; + reg->wr.rkey = rkey; + if (dir == DMA_TO_DEVICE) { + reg->wr.wr.opcode = IB_WR_RDMA_WRITE; + } else if (!rdma_cap_read_inv(qp->device, port_num)) { + reg->wr.wr.opcode = IB_WR_RDMA_READ; + } else { + reg->wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV; + reg->wr.wr.ex.invalidate_rkey = reg->mr->lkey; + } + count++; + + remote_addr += reg->sge.length; + sg_cnt -= nents; + for (j = 0; j < nents; j++) + sg = sg_next(sg); + offset = 0; + } + + ctx->type = RDMA_RW_MR; + return count; + +out_free: + while (--i >= 0) + ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr); + kfree(ctx->reg); +out: + return ret; +} + +static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + struct scatterlist *sg, u32 sg_cnt, u32 offset, + u64 remote_addr, u32 rkey, enum dma_data_direction dir) +{ + struct ib_device *dev = qp->pd->device; + u32 max_sge = rdma_rw_max_sge(dev, dir); + struct ib_sge *sge; + u32 total_len = 0, i, j; + + ctx->nr_ops = DIV_ROUND_UP(sg_cnt, max_sge); + + ctx->map.sges = sge = kcalloc(sg_cnt, sizeof(*sge), GFP_KERNEL); + if (!ctx->map.sges) + goto out; + + ctx->map.wrs = kcalloc(ctx->nr_ops, sizeof(*ctx->map.wrs), GFP_KERNEL); + if (!ctx->map.wrs) + goto out_free_sges; + + for (i = 0; i < ctx->nr_ops; i++) { + struct ib_rdma_wr *rdma_wr = &ctx->map.wrs[i]; + u32 nr_sge = min(sg_cnt, max_sge); + + if (dir == DMA_TO_DEVICE) + rdma_wr->wr.opcode = IB_WR_RDMA_WRITE; + else + rdma_wr->wr.opcode = IB_WR_RDMA_READ; + rdma_wr->remote_addr = remote_addr + total_len; + rdma_wr->rkey = rkey; + rdma_wr->wr.sg_list = sge; + + for (j = 0; j < nr_sge; j++, sg = sg_next(sg)) { + rdma_wr->wr.num_sge++; + + sge->addr = ib_sg_dma_address(dev, sg) + offset; + sge->length = ib_sg_dma_len(dev, sg) - offset; + sge->lkey = qp->pd->local_dma_lkey; + + total_len += sge->length; + sge++; + sg_cnt--; + offset = 0; + } + + if (i + 1 < ctx->nr_ops) + rdma_wr->wr.next = &ctx->map.wrs[i + 1].wr; + } + + ctx->type = RDMA_RW_MULTI_WR; + return ctx->nr_ops; + +out_free_sges: + kfree(ctx->map.sges); +out: + return -ENOMEM; +} + +static int rdma_rw_init_single_wr(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + struct scatterlist *sg, u32 offset, u64 remote_addr, u32 rkey, + enum dma_data_direction dir) +{ + struct ib_device *dev = qp->pd->device; + struct ib_rdma_wr *rdma_wr = &ctx->single.wr; + + ctx->nr_ops = 1; + + ctx->single.sge.lkey = qp->pd->local_dma_lkey; + ctx->single.sge.addr = ib_sg_dma_address(dev, sg) + offset; + ctx->single.sge.length = ib_sg_dma_len(dev, sg) - offset; + + memset(rdma_wr, 0, sizeof(*rdma_wr)); + if (dir == DMA_TO_DEVICE) + rdma_wr->wr.opcode = IB_WR_RDMA_WRITE; + else + rdma_wr->wr.opcode = IB_WR_RDMA_READ; + rdma_wr->wr.sg_list = &ctx->single.sge; + rdma_wr->wr.num_sge = 1; + rdma_wr->remote_addr = remote_addr; + rdma_wr->rkey = rkey; + + ctx->type = RDMA_RW_SINGLE_WR; + return 1; +} + +/** + * rdma_rw_ctx_init - initialize a RDMA READ/WRITE context + * @ctx: context to initialize + * @qp: queue pair to operate on + * @port_num: port num to which the connection is bound + * @sg: scatterlist to READ/WRITE from/to + * @sg_cnt: number of entries in @sg + * @sg_offset: current byte offset into @sg + * @remote_addr:remote address to read/write (relative to @rkey) + * @rkey: remote key to operate on + * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ + * + * Returns the number of WQEs that will be needed on the workqueue if + * successful, or a negative error code. + */ +int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num, + struct scatterlist *sg, u32 sg_cnt, u32 sg_offset, + u64 remote_addr, u32 rkey, enum dma_data_direction dir) +{ + struct ib_device *dev = qp->pd->device; + int ret; + + ret = ib_dma_map_sg(dev, sg, sg_cnt, dir); + if (!ret) + return -ENOMEM; + sg_cnt = ret; + + /* + * Skip to the S/G entry that sg_offset falls into: + */ + for (;;) { + u32 len = ib_sg_dma_len(dev, sg); + + if (sg_offset < len) + break; + + sg = sg_next(sg); + sg_offset -= len; + sg_cnt--; + } + + ret = -EIO; + if (WARN_ON_ONCE(sg_cnt == 0)) + goto out_unmap_sg; + + if (rdma_rw_io_needs_mr(qp->device, port_num, dir, sg_cnt)) { + ret = rdma_rw_init_mr_wrs(ctx, qp, port_num, sg, sg_cnt, + sg_offset, remote_addr, rkey, dir); + } else if (sg_cnt > 1) { + ret = rdma_rw_init_map_wrs(ctx, qp, sg, sg_cnt, sg_offset, + remote_addr, rkey, dir); + } else { + ret = rdma_rw_init_single_wr(ctx, qp, sg, sg_offset, + remote_addr, rkey, dir); + } + + if (ret < 0) + goto out_unmap_sg; + return ret; + +out_unmap_sg: + ib_dma_unmap_sg(dev, sg, sg_cnt, dir); + return ret; +} +EXPORT_SYMBOL(rdma_rw_ctx_init); + +/** + * rdma_rw_ctx_signature init - initialize a RW context with signature offload + * @ctx: context to initialize + * @qp: queue pair to operate on + * @port_num: port num to which the connection is bound + * @sg: scatterlist to READ/WRITE from/to + * @sg_cnt: number of entries in @sg + * @prot_sg: scatterlist to READ/WRITE protection information from/to + * @prot_sg_cnt: number of entries in @prot_sg + * @sig_attrs: signature offloading algorithms + * @remote_addr:remote address to read/write (relative to @rkey) + * @rkey: remote key to operate on + * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ + * + * Returns the number of WQEs that will be needed on the workqueue if + * successful, or a negative error code. + */ +int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + u8 port_num, struct scatterlist *sg, u32 sg_cnt, + struct scatterlist *prot_sg, u32 prot_sg_cnt, + struct ib_sig_attrs *sig_attrs, + u64 remote_addr, u32 rkey, enum dma_data_direction dir) +{ + struct ib_device *dev = qp->pd->device; + u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device); + struct ib_rdma_wr *rdma_wr; + struct ib_send_wr *prev_wr = NULL; + int count = 0, ret; + + if (sg_cnt > pages_per_mr || prot_sg_cnt > pages_per_mr) { + pr_err("SG count too large\n"); + return -EINVAL; + } + + ret = ib_dma_map_sg(dev, sg, sg_cnt, dir); + if (!ret) + return -ENOMEM; + sg_cnt = ret; + + ret = ib_dma_map_sg(dev, prot_sg, prot_sg_cnt, dir); + if (!ret) { + ret = -ENOMEM; + goto out_unmap_sg; + } + prot_sg_cnt = ret; + + ctx->type = RDMA_RW_SIG_MR; + ctx->nr_ops = 1; + ctx->sig = kcalloc(1, sizeof(*ctx->sig), GFP_KERNEL); + if (!ctx->sig) { + ret = -ENOMEM; + goto out_unmap_prot_sg; + } + + ret = rdma_rw_init_one_mr(qp, port_num, &ctx->sig->data, sg, sg_cnt, 0); + if (ret < 0) + goto out_free_ctx; + count += ret; + prev_wr = &ctx->sig->data.reg_wr.wr; + + if (prot_sg_cnt) { + ret = rdma_rw_init_one_mr(qp, port_num, &ctx->sig->prot, + prot_sg, prot_sg_cnt, 0); + if (ret < 0) + goto out_destroy_data_mr; + count += ret; + + if (ctx->sig->prot.inv_wr.next) + prev_wr->next = &ctx->sig->prot.inv_wr; + else + prev_wr->next = &ctx->sig->prot.reg_wr.wr; + prev_wr = &ctx->sig->prot.reg_wr.wr; + } else { + ctx->sig->prot.mr = NULL; + } + + ctx->sig->sig_mr = ib_mr_pool_get(qp, &qp->sig_mrs); + if (!ctx->sig->sig_mr) { + ret = -EAGAIN; + goto out_destroy_prot_mr; + } + + if (ctx->sig->sig_mr->need_inval) { + memset(&ctx->sig->sig_inv_wr, 0, sizeof(ctx->sig->sig_inv_wr)); + + ctx->sig->sig_inv_wr.opcode = IB_WR_LOCAL_INV; + ctx->sig->sig_inv_wr.ex.invalidate_rkey = ctx->sig->sig_mr->rkey; + + prev_wr->next = &ctx->sig->sig_inv_wr; + prev_wr = &ctx->sig->sig_inv_wr; + } + + ctx->sig->sig_wr.wr.opcode = IB_WR_REG_SIG_MR; + ctx->sig->sig_wr.wr.wr_cqe = NULL; + ctx->sig->sig_wr.wr.sg_list = &ctx->sig->data.sge; + ctx->sig->sig_wr.wr.num_sge = 1; + ctx->sig->sig_wr.access_flags = IB_ACCESS_LOCAL_WRITE; + ctx->sig->sig_wr.sig_attrs = sig_attrs; + ctx->sig->sig_wr.sig_mr = ctx->sig->sig_mr; + if (prot_sg_cnt) + ctx->sig->sig_wr.prot = &ctx->sig->prot.sge; + prev_wr->next = &ctx->sig->sig_wr.wr; + prev_wr = &ctx->sig->sig_wr.wr; + count++; + + ctx->sig->sig_sge.addr = 0; + ctx->sig->sig_sge.length = ctx->sig->data.sge.length; + if (sig_attrs->wire.sig_type != IB_SIG_TYPE_NONE) + ctx->sig->sig_sge.length += ctx->sig->prot.sge.length; + + rdma_wr = &ctx->sig->data.wr; + rdma_wr->wr.sg_list = &ctx->sig->sig_sge; + rdma_wr->wr.num_sge = 1; + rdma_wr->remote_addr = remote_addr; + rdma_wr->rkey = rkey; + if (dir == DMA_TO_DEVICE) + rdma_wr->wr.opcode = IB_WR_RDMA_WRITE; + else + rdma_wr->wr.opcode = IB_WR_RDMA_READ; + prev_wr->next = &rdma_wr->wr; + prev_wr = &rdma_wr->wr; + count++; + + return count; + +out_destroy_prot_mr: + if (prot_sg_cnt) + ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->prot.mr); +out_destroy_data_mr: + ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->data.mr); +out_free_ctx: + kfree(ctx->sig); +out_unmap_prot_sg: + ib_dma_unmap_sg(dev, prot_sg, prot_sg_cnt, dir); +out_unmap_sg: + ib_dma_unmap_sg(dev, sg, sg_cnt, dir); + return ret; +} +EXPORT_SYMBOL(rdma_rw_ctx_signature_init); + +/* + * Now that we are going to post the WRs we can update the lkey and need_inval + * state on the MRs. If we were doing this at init time, we would get double + * or missing invalidations if a context was initialized but not actually + * posted. + */ +static void rdma_rw_update_lkey(struct rdma_rw_reg_ctx *reg, bool need_inval) +{ + reg->mr->need_inval = need_inval; + ib_update_fast_reg_key(reg->mr, ib_inc_rkey(reg->mr->lkey)); + reg->reg_wr.key = reg->mr->lkey; + reg->sge.lkey = reg->mr->lkey; +} + +/** + * rdma_rw_ctx_wrs - return chain of WRs for a RDMA READ or WRITE operation + * @ctx: context to operate on + * @qp: queue pair to operate on + * @port_num: port num to which the connection is bound + * @cqe: completion queue entry for the last WR + * @chain_wr: WR to append to the posted chain + * + * Return the WR chain for the set of RDMA READ/WRITE operations described by + * @ctx, as well as any memory registration operations needed. If @chain_wr + * is non-NULL the WR it points to will be appended to the chain of WRs posted. + * If @chain_wr is not set @cqe must be set so that the caller gets a + * completion notification. + */ +struct ib_send_wr *rdma_rw_ctx_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + u8 port_num, struct ib_cqe *cqe, struct ib_send_wr *chain_wr) +{ + struct ib_send_wr *first_wr, *last_wr; + int i; + + switch (ctx->type) { + case RDMA_RW_SIG_MR: + rdma_rw_update_lkey(&ctx->sig->data, true); + if (ctx->sig->prot.mr) + rdma_rw_update_lkey(&ctx->sig->prot, true); + + ctx->sig->sig_mr->need_inval = true; + ib_update_fast_reg_key(ctx->sig->sig_mr, + ib_inc_rkey(ctx->sig->sig_mr->lkey)); + ctx->sig->sig_sge.lkey = ctx->sig->sig_mr->lkey; + + if (ctx->sig->data.inv_wr.next) + first_wr = &ctx->sig->data.inv_wr; + else + first_wr = &ctx->sig->data.reg_wr.wr; + last_wr = &ctx->sig->data.wr.wr; + break; + case RDMA_RW_MR: + for (i = 0; i < ctx->nr_ops; i++) { + rdma_rw_update_lkey(&ctx->reg[i], + ctx->reg[i].wr.wr.opcode != + IB_WR_RDMA_READ_WITH_INV); + } + + if (ctx->reg[0].inv_wr.next) + first_wr = &ctx->reg[0].inv_wr; + else + first_wr = &ctx->reg[0].reg_wr.wr; + last_wr = &ctx->reg[ctx->nr_ops - 1].wr.wr; + break; + case RDMA_RW_MULTI_WR: + first_wr = &ctx->map.wrs[0].wr; + last_wr = &ctx->map.wrs[ctx->nr_ops - 1].wr; + break; + case RDMA_RW_SINGLE_WR: + first_wr = &ctx->single.wr.wr; + last_wr = &ctx->single.wr.wr; + break; + default: + BUG(); + } + + if (chain_wr) { + last_wr->next = chain_wr; + } else { + last_wr->wr_cqe = cqe; + last_wr->send_flags |= IB_SEND_SIGNALED; + } + + return first_wr; +} +EXPORT_SYMBOL(rdma_rw_ctx_wrs); + +/** + * rdma_rw_ctx_post - post a RDMA READ or RDMA WRITE operation + * @ctx: context to operate on + * @qp: queue pair to operate on + * @port_num: port num to which the connection is bound + * @cqe: completion queue entry for the last WR + * @chain_wr: WR to append to the posted chain + * + * Post the set of RDMA READ/WRITE operations described by @ctx, as well as + * any memory registration operations needed. If @chain_wr is non-NULL the + * WR it points to will be appended to the chain of WRs posted. If @chain_wr + * is not set @cqe must be set so that the caller gets a completion + * notification. + */ +int rdma_rw_ctx_post(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num, + struct ib_cqe *cqe, struct ib_send_wr *chain_wr) +{ + struct ib_send_wr *first_wr, *bad_wr; + + first_wr = rdma_rw_ctx_wrs(ctx, qp, port_num, cqe, chain_wr); + return ib_post_send(qp, first_wr, &bad_wr); +} +EXPORT_SYMBOL(rdma_rw_ctx_post); + +/** + * rdma_rw_ctx_destroy - release all resources allocated by rdma_rw_ctx_init + * @ctx: context to release + * @qp: queue pair to operate on + * @port_num: port num to which the connection is bound + * @sg: scatterlist that was used for the READ/WRITE + * @sg_cnt: number of entries in @sg + * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ + */ +void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num, + struct scatterlist *sg, u32 sg_cnt, enum dma_data_direction dir) +{ + int i; + + switch (ctx->type) { + case RDMA_RW_MR: + for (i = 0; i < ctx->nr_ops; i++) + ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr); + kfree(ctx->reg); + break; + case RDMA_RW_MULTI_WR: + kfree(ctx->map.wrs); + kfree(ctx->map.sges); + break; + case RDMA_RW_SINGLE_WR: + break; + default: + BUG(); + break; + } + + ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir); +} +EXPORT_SYMBOL(rdma_rw_ctx_destroy); + +/** + * rdma_rw_ctx_destroy_signature - release all resources allocated by + * rdma_rw_ctx_init_signature + * @ctx: context to release + * @qp: queue pair to operate on + * @port_num: port num to which the connection is bound + * @sg: scatterlist that was used for the READ/WRITE + * @sg_cnt: number of entries in @sg + * @prot_sg: scatterlist that was used for the READ/WRITE of the PI + * @prot_sg_cnt: number of entries in @prot_sg + * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ + */ +void rdma_rw_ctx_destroy_signature(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + u8 port_num, struct scatterlist *sg, u32 sg_cnt, + struct scatterlist *prot_sg, u32 prot_sg_cnt, + enum dma_data_direction dir) +{ + if (WARN_ON_ONCE(ctx->type != RDMA_RW_SIG_MR)) + return; + + ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->data.mr); + ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir); + + if (ctx->sig->prot.mr) { + ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->prot.mr); + ib_dma_unmap_sg(qp->pd->device, prot_sg, prot_sg_cnt, dir); + } + + ib_mr_pool_put(qp, &qp->sig_mrs, ctx->sig->sig_mr); + kfree(ctx->sig); +} +EXPORT_SYMBOL(rdma_rw_ctx_destroy_signature); + +void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr) +{ + u32 factor; + + WARN_ON_ONCE(attr->port_num == 0); + + /* + * Each context needs at least one RDMA READ or WRITE WR. + * + * For some hardware we might need more, eventually we should ask the + * HCA driver for a multiplier here. + */ + factor = 1; + + /* + * If the devices needs MRs to perform RDMA READ or WRITE operations, + * we'll need two additional MRs for the registrations and the + * invalidation. + */ + if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN) + factor += 6; /* (inv + reg) * (data + prot + sig) */ + else if (rdma_rw_can_use_mr(dev, attr->port_num)) + factor += 2; /* inv + reg */ + + attr->cap.max_send_wr += factor * attr->cap.max_rdma_ctxs; + + /* + * But maybe we were just too high in the sky and the device doesn't + * even support all we need, and we'll have to live with what we get.. + */ + attr->cap.max_send_wr = + min_t(u32, attr->cap.max_send_wr, dev->attrs.max_qp_wr); +} + +int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr) +{ + struct ib_device *dev = qp->pd->device; + u32 nr_mrs = 0, nr_sig_mrs = 0; + int ret = 0; + + if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN) { + nr_sig_mrs = attr->cap.max_rdma_ctxs; + nr_mrs = attr->cap.max_rdma_ctxs * 2; + } else if (rdma_rw_can_use_mr(dev, attr->port_num)) { + nr_mrs = attr->cap.max_rdma_ctxs; + } + + if (nr_mrs) { + ret = ib_mr_pool_init(qp, &qp->rdma_mrs, nr_mrs, + IB_MR_TYPE_MEM_REG, + rdma_rw_fr_page_list_len(dev)); + if (ret) { + pr_err("%s: failed to allocated %d MRs\n", + __func__, nr_mrs); + return ret; + } + } + + if (nr_sig_mrs) { + ret = ib_mr_pool_init(qp, &qp->sig_mrs, nr_sig_mrs, + IB_MR_TYPE_SIGNATURE, 2); + if (ret) { + pr_err("%s: failed to allocated %d SIG MRs\n", + __func__, nr_mrs); + goto out_free_rdma_mrs; + } + } + + return 0; + +out_free_rdma_mrs: + ib_mr_pool_destroy(qp, &qp->rdma_mrs); + return ret; +} + +void rdma_rw_cleanup_mrs(struct ib_qp *qp) +{ + ib_mr_pool_destroy(qp, &qp->sig_mrs); + ib_mr_pool_destroy(qp, &qp->rdma_mrs); +} diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 8a09c0fb2..e95538650 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -53,10 +53,6 @@ #include "sa.h" #include "core_priv.h" -MODULE_AUTHOR("Roland Dreier"); -MODULE_DESCRIPTION("InfiniBand subnet administration query support"); -MODULE_LICENSE("Dual BSD/GPL"); - #define IB_SA_LOCAL_SVC_TIMEOUT_MIN 100 #define IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT 2000 #define IB_SA_LOCAL_SVC_TIMEOUT_MAX 200000 @@ -119,6 +115,12 @@ struct ib_sa_guidinfo_query { struct ib_sa_query sa_query; }; +struct ib_sa_classport_info_query { + void (*callback)(int, struct ib_class_port_info *, void *); + void *context; + struct ib_sa_query sa_query; +}; + struct ib_sa_mcmember_query { void (*callback)(int, struct ib_sa_mcmember_rec *, void *); void *context; @@ -392,6 +394,82 @@ static const struct ib_field service_rec_table[] = { .size_bits = 2*64 }, }; +#define CLASSPORTINFO_REC_FIELD(field) \ + .struct_offset_bytes = offsetof(struct ib_class_port_info, field), \ + .struct_size_bytes = sizeof((struct ib_class_port_info *)0)->field, \ + .field_name = "ib_class_port_info:" #field + +static const struct ib_field classport_info_rec_table[] = { + { CLASSPORTINFO_REC_FIELD(base_version), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 8 }, + { CLASSPORTINFO_REC_FIELD(class_version), + .offset_words = 0, + .offset_bits = 8, + .size_bits = 8 }, + { CLASSPORTINFO_REC_FIELD(capability_mask), + .offset_words = 0, + .offset_bits = 16, + .size_bits = 16 }, + { CLASSPORTINFO_REC_FIELD(cap_mask2_resp_time), + .offset_words = 1, + .offset_bits = 0, + .size_bits = 32 }, + { CLASSPORTINFO_REC_FIELD(redirect_gid), + .offset_words = 2, + .offset_bits = 0, + .size_bits = 128 }, + { CLASSPORTINFO_REC_FIELD(redirect_tcslfl), + .offset_words = 6, + .offset_bits = 0, + .size_bits = 32 }, + { CLASSPORTINFO_REC_FIELD(redirect_lid), + .offset_words = 7, + .offset_bits = 0, + .size_bits = 16 }, + { CLASSPORTINFO_REC_FIELD(redirect_pkey), + .offset_words = 7, + .offset_bits = 16, + .size_bits = 16 }, + + { CLASSPORTINFO_REC_FIELD(redirect_qp), + .offset_words = 8, + .offset_bits = 0, + .size_bits = 32 }, + { CLASSPORTINFO_REC_FIELD(redirect_qkey), + .offset_words = 9, + .offset_bits = 0, + .size_bits = 32 }, + + { CLASSPORTINFO_REC_FIELD(trap_gid), + .offset_words = 10, + .offset_bits = 0, + .size_bits = 128 }, + { CLASSPORTINFO_REC_FIELD(trap_tcslfl), + .offset_words = 14, + .offset_bits = 0, + .size_bits = 32 }, + + { CLASSPORTINFO_REC_FIELD(trap_lid), + .offset_words = 15, + .offset_bits = 0, + .size_bits = 16 }, + { CLASSPORTINFO_REC_FIELD(trap_pkey), + .offset_words = 15, + .offset_bits = 16, + .size_bits = 16 }, + + { CLASSPORTINFO_REC_FIELD(trap_hlqp), + .offset_words = 16, + .offset_bits = 0, + .size_bits = 32 }, + { CLASSPORTINFO_REC_FIELD(trap_qkey), + .offset_words = 17, + .offset_bits = 0, + .size_bits = 32 }, +}; + #define GUIDINFO_REC_FIELD(field) \ .struct_offset_bytes = offsetof(struct ib_sa_guidinfo_rec, field), \ .struct_size_bytes = sizeof((struct ib_sa_guidinfo_rec *) 0)->field, \ @@ -536,7 +614,7 @@ static int ib_nl_send_msg(struct ib_sa_query *query, gfp_t gfp_mask) data = ibnl_put_msg(skb, &nlh, query->seq, 0, RDMA_NL_LS, RDMA_NL_LS_OP_RESOLVE, NLM_F_REQUEST); if (!data) { - kfree_skb(skb); + nlmsg_free(skb); return -EMSGSIZE; } @@ -705,8 +783,8 @@ static void ib_nl_request_timeout(struct work_struct *work) spin_unlock_irqrestore(&ib_nl_request_lock, flags); } -static int ib_nl_handle_set_timeout(struct sk_buff *skb, - struct netlink_callback *cb) +int ib_nl_handle_set_timeout(struct sk_buff *skb, + struct netlink_callback *cb) { const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh; int timeout, delta, abs_delta; @@ -782,8 +860,8 @@ static inline int ib_nl_is_good_resolve_resp(const struct nlmsghdr *nlh) return 1; } -static int ib_nl_handle_resolve_resp(struct sk_buff *skb, - struct netlink_callback *cb) +int ib_nl_handle_resolve_resp(struct sk_buff *skb, + struct netlink_callback *cb) { const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh; unsigned long flags; @@ -838,15 +916,6 @@ resp_out: return skb->len; } -static struct ibnl_client_cbs ib_sa_cb_table[] = { - [RDMA_NL_LS_OP_RESOLVE] = { - .dump = ib_nl_handle_resolve_resp, - .module = THIS_MODULE }, - [RDMA_NL_LS_OP_SET_TIMEOUT] = { - .dump = ib_nl_handle_set_timeout, - .module = THIS_MODULE }, -}; - static void free_sm_ah(struct kref *kref) { struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref); @@ -1645,6 +1714,97 @@ err1: } EXPORT_SYMBOL(ib_sa_guid_info_rec_query); +/* Support get SA ClassPortInfo */ +static void ib_sa_classport_info_rec_callback(struct ib_sa_query *sa_query, + int status, + struct ib_sa_mad *mad) +{ + struct ib_sa_classport_info_query *query = + container_of(sa_query, struct ib_sa_classport_info_query, sa_query); + + if (mad) { + struct ib_class_port_info rec; + + ib_unpack(classport_info_rec_table, + ARRAY_SIZE(classport_info_rec_table), + mad->data, &rec); + query->callback(status, &rec, query->context); + } else { + query->callback(status, NULL, query->context); + } +} + +static void ib_sa_portclass_info_rec_release(struct ib_sa_query *sa_query) +{ + kfree(container_of(sa_query, struct ib_sa_classport_info_query, + sa_query)); +} + +int ib_sa_classport_info_rec_query(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, + int timeout_ms, gfp_t gfp_mask, + void (*callback)(int status, + struct ib_class_port_info *resp, + void *context), + void *context, + struct ib_sa_query **sa_query) +{ + struct ib_sa_classport_info_query *query; + struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); + struct ib_sa_port *port; + struct ib_mad_agent *agent; + struct ib_sa_mad *mad; + int ret; + + if (!sa_dev) + return -ENODEV; + + port = &sa_dev->port[port_num - sa_dev->start_port]; + agent = port->agent; + + query = kzalloc(sizeof(*query), gfp_mask); + if (!query) + return -ENOMEM; + + query->sa_query.port = port; + ret = alloc_mad(&query->sa_query, gfp_mask); + if (ret) + goto err1; + + ib_sa_client_get(client); + query->sa_query.client = client; + query->callback = callback; + query->context = context; + + mad = query->sa_query.mad_buf->mad; + init_mad(mad, agent); + + query->sa_query.callback = callback ? ib_sa_classport_info_rec_callback : NULL; + + query->sa_query.release = ib_sa_portclass_info_rec_release; + /* support GET only */ + mad->mad_hdr.method = IB_MGMT_METHOD_GET; + mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_CLASS_PORTINFO); + mad->sa_hdr.comp_mask = 0; + *sa_query = &query->sa_query; + + ret = send_mad(&query->sa_query, timeout_ms, gfp_mask); + if (ret < 0) + goto err2; + + return ret; + +err2: + *sa_query = NULL; + ib_sa_client_put(query->sa_query.client); + free_mad(&query->sa_query); + +err1: + kfree(query); + return ret; +} +EXPORT_SYMBOL(ib_sa_classport_info_rec_query); + static void send_handler(struct ib_mad_agent *agent, struct ib_mad_send_wc *mad_send_wc) { @@ -1794,7 +1954,7 @@ static void ib_sa_remove_one(struct ib_device *device, void *client_data) kfree(sa_dev); } -static int __init ib_sa_init(void) +int ib_sa_init(void) { int ret; @@ -1820,17 +1980,10 @@ static int __init ib_sa_init(void) goto err3; } - if (ibnl_add_client(RDMA_NL_LS, RDMA_NL_LS_NUM_OPS, - ib_sa_cb_table)) { - pr_err("Failed to add netlink callback\n"); - ret = -EINVAL; - goto err4; - } INIT_DELAYED_WORK(&ib_nl_timed_work, ib_nl_request_timeout); return 0; -err4: - destroy_workqueue(ib_nl_wq); + err3: mcast_cleanup(); err2: @@ -1839,9 +1992,8 @@ err1: return ret; } -static void __exit ib_sa_cleanup(void) +void ib_sa_cleanup(void) { - ibnl_remove_client(RDMA_NL_LS); cancel_delayed_work(&ib_nl_timed_work); flush_workqueue(ib_nl_wq); destroy_workqueue(ib_nl_wq); @@ -1849,6 +2001,3 @@ static void __exit ib_sa_cleanup(void) ib_unregister_client(&sa_client); idr_destroy(&query_idr); } - -module_init(ib_sa_init); -module_exit(ib_sa_cleanup); diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 14606afbf..60df4f8e8 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -56,8 +56,10 @@ struct ib_port { struct gid_attr_group *gid_attr_group; struct attribute_group gid_group; struct attribute_group pkey_group; - u8 port_num; struct attribute_group *pma_table; + struct attribute_group *hw_stats_ag; + struct rdma_hw_stats *hw_stats; + u8 port_num; }; struct port_attribute { @@ -80,6 +82,18 @@ struct port_table_attribute { __be16 attr_id; }; +struct hw_stats_attribute { + struct attribute attr; + ssize_t (*show)(struct kobject *kobj, + struct attribute *attr, char *buf); + ssize_t (*store)(struct kobject *kobj, + struct attribute *attr, + const char *buf, + size_t count); + int index; + u8 port_num; +}; + static ssize_t port_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { @@ -516,6 +530,7 @@ static PORT_PMA_ATTR(port_xmit_data , 12, 32, 192); static PORT_PMA_ATTR(port_rcv_data , 13, 32, 224); static PORT_PMA_ATTR(port_xmit_packets , 14, 32, 256); static PORT_PMA_ATTR(port_rcv_packets , 15, 32, 288); +static PORT_PMA_ATTR(port_xmit_wait , 0, 32, 320); /* * Counters added by extended set @@ -546,6 +561,7 @@ static struct attribute *pma_attrs[] = { &port_pma_attr_port_rcv_data.attr.attr, &port_pma_attr_port_xmit_packets.attr.attr, &port_pma_attr_port_rcv_packets.attr.attr, + &port_pma_attr_port_xmit_wait.attr.attr, NULL }; @@ -565,6 +581,7 @@ static struct attribute *pma_attrs_ext[] = { &port_pma_attr_ext_port_xmit_data.attr.attr, &port_pma_attr_ext_port_rcv_data.attr.attr, &port_pma_attr_ext_port_xmit_packets.attr.attr, + &port_pma_attr_port_xmit_wait.attr.attr, &port_pma_attr_ext_port_rcv_packets.attr.attr, &port_pma_attr_ext_unicast_rcv_packets.attr.attr, &port_pma_attr_ext_unicast_xmit_packets.attr.attr, @@ -590,6 +607,7 @@ static struct attribute *pma_attrs_noietf[] = { &port_pma_attr_ext_port_rcv_data.attr.attr, &port_pma_attr_ext_port_xmit_packets.attr.attr, &port_pma_attr_ext_port_rcv_packets.attr.attr, + &port_pma_attr_port_xmit_wait.attr.attr, NULL }; @@ -733,6 +751,220 @@ static struct attribute_group *get_counter_table(struct ib_device *dev, return &pma_group; } +static int update_hw_stats(struct ib_device *dev, struct rdma_hw_stats *stats, + u8 port_num, int index) +{ + int ret; + + if (time_is_after_eq_jiffies(stats->timestamp + stats->lifespan)) + return 0; + ret = dev->get_hw_stats(dev, stats, port_num, index); + if (ret < 0) + return ret; + if (ret == stats->num_counters) + stats->timestamp = jiffies; + + return 0; +} + +static ssize_t print_hw_stat(struct rdma_hw_stats *stats, int index, char *buf) +{ + return sprintf(buf, "%llu\n", stats->value[index]); +} + +static ssize_t show_hw_stats(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ib_device *dev; + struct ib_port *port; + struct hw_stats_attribute *hsa; + struct rdma_hw_stats *stats; + int ret; + + hsa = container_of(attr, struct hw_stats_attribute, attr); + if (!hsa->port_num) { + dev = container_of((struct device *)kobj, + struct ib_device, dev); + stats = dev->hw_stats; + } else { + port = container_of(kobj, struct ib_port, kobj); + dev = port->ibdev; + stats = port->hw_stats; + } + ret = update_hw_stats(dev, stats, hsa->port_num, hsa->index); + if (ret) + return ret; + return print_hw_stat(stats, hsa->index, buf); +} + +static ssize_t show_stats_lifespan(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct hw_stats_attribute *hsa; + int msecs; + + hsa = container_of(attr, struct hw_stats_attribute, attr); + if (!hsa->port_num) { + struct ib_device *dev = container_of((struct device *)kobj, + struct ib_device, dev); + msecs = jiffies_to_msecs(dev->hw_stats->lifespan); + } else { + struct ib_port *p = container_of(kobj, struct ib_port, kobj); + msecs = jiffies_to_msecs(p->hw_stats->lifespan); + } + return sprintf(buf, "%d\n", msecs); +} + +static ssize_t set_stats_lifespan(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t count) +{ + struct hw_stats_attribute *hsa; + int msecs; + int jiffies; + int ret; + + ret = kstrtoint(buf, 10, &msecs); + if (ret) + return ret; + if (msecs < 0 || msecs > 10000) + return -EINVAL; + jiffies = msecs_to_jiffies(msecs); + hsa = container_of(attr, struct hw_stats_attribute, attr); + if (!hsa->port_num) { + struct ib_device *dev = container_of((struct device *)kobj, + struct ib_device, dev); + dev->hw_stats->lifespan = jiffies; + } else { + struct ib_port *p = container_of(kobj, struct ib_port, kobj); + p->hw_stats->lifespan = jiffies; + } + return count; +} + +static void free_hsag(struct kobject *kobj, struct attribute_group *attr_group) +{ + struct attribute **attr; + + sysfs_remove_group(kobj, attr_group); + + for (attr = attr_group->attrs; *attr; attr++) + kfree(*attr); + kfree(attr_group); +} + +static struct attribute *alloc_hsa(int index, u8 port_num, const char *name) +{ + struct hw_stats_attribute *hsa; + + hsa = kmalloc(sizeof(*hsa), GFP_KERNEL); + if (!hsa) + return NULL; + + hsa->attr.name = (char *)name; + hsa->attr.mode = S_IRUGO; + hsa->show = show_hw_stats; + hsa->store = NULL; + hsa->index = index; + hsa->port_num = port_num; + + return &hsa->attr; +} + +static struct attribute *alloc_hsa_lifespan(char *name, u8 port_num) +{ + struct hw_stats_attribute *hsa; + + hsa = kmalloc(sizeof(*hsa), GFP_KERNEL); + if (!hsa) + return NULL; + + hsa->attr.name = name; + hsa->attr.mode = S_IWUSR | S_IRUGO; + hsa->show = show_stats_lifespan; + hsa->store = set_stats_lifespan; + hsa->index = 0; + hsa->port_num = port_num; + + return &hsa->attr; +} + +static void setup_hw_stats(struct ib_device *device, struct ib_port *port, + u8 port_num) +{ + struct attribute_group *hsag; + struct rdma_hw_stats *stats; + int i, ret; + + stats = device->alloc_hw_stats(device, port_num); + + if (!stats) + return; + + if (!stats->names || stats->num_counters <= 0) + goto err_free_stats; + + /* + * Two extra attribue elements here, one for the lifespan entry and + * one to NULL terminate the list for the sysfs core code + */ + hsag = kzalloc(sizeof(*hsag) + + sizeof(void *) * (stats->num_counters + 2), + GFP_KERNEL); + if (!hsag) + goto err_free_stats; + + ret = device->get_hw_stats(device, stats, port_num, + stats->num_counters); + if (ret != stats->num_counters) + goto err_free_hsag; + + stats->timestamp = jiffies; + + hsag->name = "hw_counters"; + hsag->attrs = (void *)hsag + sizeof(*hsag); + + for (i = 0; i < stats->num_counters; i++) { + hsag->attrs[i] = alloc_hsa(i, port_num, stats->names[i]); + if (!hsag->attrs[i]) + goto err; + sysfs_attr_init(hsag->attrs[i]); + } + + /* treat an error here as non-fatal */ + hsag->attrs[i] = alloc_hsa_lifespan("lifespan", port_num); + if (hsag->attrs[i]) + sysfs_attr_init(hsag->attrs[i]); + + if (port) { + struct kobject *kobj = &port->kobj; + ret = sysfs_create_group(kobj, hsag); + if (ret) + goto err; + port->hw_stats_ag = hsag; + port->hw_stats = stats; + } else { + struct kobject *kobj = &device->dev.kobj; + ret = sysfs_create_group(kobj, hsag); + if (ret) + goto err; + device->hw_stats_ag = hsag; + device->hw_stats = stats; + } + + return; + +err: + for (; i >= 0; i--) + kfree(hsag->attrs[i]); +err_free_hsag: + kfree(hsag); +err_free_stats: + kfree(stats); + return; +} + static int add_port(struct ib_device *device, int port_num, int (*port_callback)(struct ib_device *, u8, struct kobject *)) @@ -835,6 +1067,14 @@ static int add_port(struct ib_device *device, int port_num, goto err_remove_pkey; } + /* + * If port == 0, it means we have only one port and the parent + * device, not this port device, should be the holder of the + * hw_counters + */ + if (device->alloc_hw_stats && port_num) + setup_hw_stats(device, p, port_num); + list_add_tail(&p->kobj.entry, &device->port_list); kobject_uevent(&p->kobj, KOBJ_ADD); @@ -972,120 +1212,6 @@ static struct device_attribute *ib_class_attributes[] = { &dev_attr_node_desc }; -/* Show a given an attribute in the statistics group */ -static ssize_t show_protocol_stat(const struct device *device, - struct device_attribute *attr, char *buf, - unsigned offset) -{ - struct ib_device *dev = container_of(device, struct ib_device, dev); - union rdma_protocol_stats stats; - ssize_t ret; - - ret = dev->get_protocol_stats(dev, &stats); - if (ret) - return ret; - - return sprintf(buf, "%llu\n", - (unsigned long long) ((u64 *) &stats)[offset]); -} - -/* generate a read-only iwarp statistics attribute */ -#define IW_STATS_ENTRY(name) \ -static ssize_t show_##name(struct device *device, \ - struct device_attribute *attr, char *buf) \ -{ \ - return show_protocol_stat(device, attr, buf, \ - offsetof(struct iw_protocol_stats, name) / \ - sizeof (u64)); \ -} \ -static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) - -IW_STATS_ENTRY(ipInReceives); -IW_STATS_ENTRY(ipInHdrErrors); -IW_STATS_ENTRY(ipInTooBigErrors); -IW_STATS_ENTRY(ipInNoRoutes); -IW_STATS_ENTRY(ipInAddrErrors); -IW_STATS_ENTRY(ipInUnknownProtos); -IW_STATS_ENTRY(ipInTruncatedPkts); -IW_STATS_ENTRY(ipInDiscards); -IW_STATS_ENTRY(ipInDelivers); -IW_STATS_ENTRY(ipOutForwDatagrams); -IW_STATS_ENTRY(ipOutRequests); -IW_STATS_ENTRY(ipOutDiscards); -IW_STATS_ENTRY(ipOutNoRoutes); -IW_STATS_ENTRY(ipReasmTimeout); -IW_STATS_ENTRY(ipReasmReqds); -IW_STATS_ENTRY(ipReasmOKs); -IW_STATS_ENTRY(ipReasmFails); -IW_STATS_ENTRY(ipFragOKs); -IW_STATS_ENTRY(ipFragFails); -IW_STATS_ENTRY(ipFragCreates); -IW_STATS_ENTRY(ipInMcastPkts); -IW_STATS_ENTRY(ipOutMcastPkts); -IW_STATS_ENTRY(ipInBcastPkts); -IW_STATS_ENTRY(ipOutBcastPkts); -IW_STATS_ENTRY(tcpRtoAlgorithm); -IW_STATS_ENTRY(tcpRtoMin); -IW_STATS_ENTRY(tcpRtoMax); -IW_STATS_ENTRY(tcpMaxConn); -IW_STATS_ENTRY(tcpActiveOpens); -IW_STATS_ENTRY(tcpPassiveOpens); -IW_STATS_ENTRY(tcpAttemptFails); -IW_STATS_ENTRY(tcpEstabResets); -IW_STATS_ENTRY(tcpCurrEstab); -IW_STATS_ENTRY(tcpInSegs); -IW_STATS_ENTRY(tcpOutSegs); -IW_STATS_ENTRY(tcpRetransSegs); -IW_STATS_ENTRY(tcpInErrs); -IW_STATS_ENTRY(tcpOutRsts); - -static struct attribute *iw_proto_stats_attrs[] = { - &dev_attr_ipInReceives.attr, - &dev_attr_ipInHdrErrors.attr, - &dev_attr_ipInTooBigErrors.attr, - &dev_attr_ipInNoRoutes.attr, - &dev_attr_ipInAddrErrors.attr, - &dev_attr_ipInUnknownProtos.attr, - &dev_attr_ipInTruncatedPkts.attr, - &dev_attr_ipInDiscards.attr, - &dev_attr_ipInDelivers.attr, - &dev_attr_ipOutForwDatagrams.attr, - &dev_attr_ipOutRequests.attr, - &dev_attr_ipOutDiscards.attr, - &dev_attr_ipOutNoRoutes.attr, - &dev_attr_ipReasmTimeout.attr, - &dev_attr_ipReasmReqds.attr, - &dev_attr_ipReasmOKs.attr, - &dev_attr_ipReasmFails.attr, - &dev_attr_ipFragOKs.attr, - &dev_attr_ipFragFails.attr, - &dev_attr_ipFragCreates.attr, - &dev_attr_ipInMcastPkts.attr, - &dev_attr_ipOutMcastPkts.attr, - &dev_attr_ipInBcastPkts.attr, - &dev_attr_ipOutBcastPkts.attr, - &dev_attr_tcpRtoAlgorithm.attr, - &dev_attr_tcpRtoMin.attr, - &dev_attr_tcpRtoMax.attr, - &dev_attr_tcpMaxConn.attr, - &dev_attr_tcpActiveOpens.attr, - &dev_attr_tcpPassiveOpens.attr, - &dev_attr_tcpAttemptFails.attr, - &dev_attr_tcpEstabResets.attr, - &dev_attr_tcpCurrEstab.attr, - &dev_attr_tcpInSegs.attr, - &dev_attr_tcpOutSegs.attr, - &dev_attr_tcpRetransSegs.attr, - &dev_attr_tcpInErrs.attr, - &dev_attr_tcpOutRsts.attr, - NULL -}; - -static struct attribute_group iw_stats_group = { - .name = "proto_stats", - .attrs = iw_proto_stats_attrs, -}; - static void free_port_list_attributes(struct ib_device *device) { struct kobject *p, *t; @@ -1093,6 +1219,10 @@ static void free_port_list_attributes(struct ib_device *device) list_for_each_entry_safe(p, t, &device->port_list, entry) { struct ib_port *port = container_of(p, struct ib_port, kobj); list_del(&p->entry); + if (port->hw_stats) { + kfree(port->hw_stats); + free_hsag(&port->kobj, port->hw_stats_ag); + } sysfs_remove_group(p, port->pma_table); sysfs_remove_group(p, &port->pkey_group); sysfs_remove_group(p, &port->gid_group); @@ -1149,11 +1279,8 @@ int ib_device_register_sysfs(struct ib_device *device, } } - if (device->node_type == RDMA_NODE_RNIC && device->get_protocol_stats) { - ret = sysfs_create_group(&class_dev->kobj, &iw_stats_group); - if (ret) - goto err_put; - } + if (device->alloc_hw_stats) + setup_hw_stats(device, NULL, 0); return 0; @@ -1169,15 +1296,18 @@ err: void ib_device_unregister_sysfs(struct ib_device *device) { - /* Hold kobject until ib_dealloc_device() */ - struct kobject *kobj_dev = kobject_get(&device->dev.kobj); int i; - if (device->node_type == RDMA_NODE_RNIC && device->get_protocol_stats) - sysfs_remove_group(kobj_dev, &iw_stats_group); + /* Hold kobject until ib_dealloc_device() */ + kobject_get(&device->dev.kobj); free_port_list_attributes(device); + if (device->hw_stats) { + kfree(device->hw_stats); + free_hsag(&device->dev.kobj, device->hw_stats_ag); + } + for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) device_remove_file(&device->dev, ib_class_attributes[i]); diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 6fdc7ecda..825021d10 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1747,7 +1747,7 @@ static int create_qp(struct ib_uverbs_file *file, struct ib_srq *srq = NULL; struct ib_qp *qp; char *buf; - struct ib_qp_init_attr attr; + struct ib_qp_init_attr attr = {}; struct ib_uverbs_ex_create_qp_resp resp; int ret; @@ -1833,7 +1833,8 @@ static int create_qp(struct ib_uverbs_file *file, if (attr.create_flags & ~(IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK | IB_QP_CREATE_CROSS_CHANNEL | IB_QP_CREATE_MANAGED_SEND | - IB_QP_CREATE_MANAGED_RECV)) { + IB_QP_CREATE_MANAGED_RECV | + IB_QP_CREATE_SCATTER_FCS)) { ret = -EINVAL; goto err_put; } @@ -3088,8 +3089,7 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, if (cmd.comp_mask) return -EINVAL; - if ((cmd.flow_attr.type == IB_FLOW_ATTR_SNIFFER && - !capable(CAP_NET_ADMIN)) || !capable(CAP_NET_RAW)) + if (!capable(CAP_NET_RAW)) return -EPERM; if (cmd.flow_attr.flags >= IB_FLOW_ATTR_FLAGS_RESERVED) @@ -3655,6 +3655,11 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file, resp.hca_core_clock = attr.hca_core_clock; resp.response_length += sizeof(resp.hca_core_clock); + if (ucore->outlen < resp.response_length + sizeof(resp.device_cap_flags_ex)) + goto end; + + resp.device_cap_flags_ex = attr.device_cap_flags; + resp.response_length += sizeof(resp.device_cap_flags_ex); end: err = ib_copy_to_udata(ucore, &resp, resp.response_length); return err; diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index b65b3541e..6298f54b4 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -48,6 +48,7 @@ #include <rdma/ib_verbs.h> #include <rdma/ib_cache.h> #include <rdma/ib_addr.h> +#include <rdma/rw.h> #include "core_priv.h" @@ -510,12 +511,16 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, ah_attr->grh.dgid = sgid; if (!rdma_cap_eth_ah(device, port_num)) { - ret = ib_find_cached_gid_by_port(device, &dgid, - IB_GID_TYPE_IB, - port_num, NULL, - &gid_index); - if (ret) - return ret; + if (dgid.global.interface_id != cpu_to_be64(IB_SA_WELL_KNOWN_GUID)) { + ret = ib_find_cached_gid_by_port(device, &dgid, + IB_GID_TYPE_IB, + port_num, NULL, + &gid_index); + if (ret) + return ret; + } else { + gid_index = 0; + } } ah_attr->grh.sgid_index = (u8) gid_index; @@ -723,59 +728,89 @@ struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd, } EXPORT_SYMBOL(ib_open_qp); +static struct ib_qp *ib_create_xrc_qp(struct ib_qp *qp, + struct ib_qp_init_attr *qp_init_attr) +{ + struct ib_qp *real_qp = qp; + + qp->event_handler = __ib_shared_qp_event_handler; + qp->qp_context = qp; + qp->pd = NULL; + qp->send_cq = qp->recv_cq = NULL; + qp->srq = NULL; + qp->xrcd = qp_init_attr->xrcd; + atomic_inc(&qp_init_attr->xrcd->usecnt); + INIT_LIST_HEAD(&qp->open_list); + + qp = __ib_open_qp(real_qp, qp_init_attr->event_handler, + qp_init_attr->qp_context); + if (!IS_ERR(qp)) + __ib_insert_xrcd_qp(qp_init_attr->xrcd, real_qp); + else + real_qp->device->destroy_qp(real_qp); + return qp; +} + struct ib_qp *ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *qp_init_attr) { - struct ib_qp *qp, *real_qp; - struct ib_device *device; + struct ib_device *device = pd ? pd->device : qp_init_attr->xrcd->device; + struct ib_qp *qp; + int ret; + + /* + * If the callers is using the RDMA API calculate the resources + * needed for the RDMA READ/WRITE operations. + * + * Note that these callers need to pass in a port number. + */ + if (qp_init_attr->cap.max_rdma_ctxs) + rdma_rw_init_qp(device, qp_init_attr); - device = pd ? pd->device : qp_init_attr->xrcd->device; qp = device->create_qp(pd, qp_init_attr, NULL); + if (IS_ERR(qp)) + return qp; + + qp->device = device; + qp->real_qp = qp; + qp->uobject = NULL; + qp->qp_type = qp_init_attr->qp_type; + + atomic_set(&qp->usecnt, 0); + qp->mrs_used = 0; + spin_lock_init(&qp->mr_lock); + INIT_LIST_HEAD(&qp->rdma_mrs); + INIT_LIST_HEAD(&qp->sig_mrs); + + if (qp_init_attr->qp_type == IB_QPT_XRC_TGT) + return ib_create_xrc_qp(qp, qp_init_attr); + + qp->event_handler = qp_init_attr->event_handler; + qp->qp_context = qp_init_attr->qp_context; + if (qp_init_attr->qp_type == IB_QPT_XRC_INI) { + qp->recv_cq = NULL; + qp->srq = NULL; + } else { + qp->recv_cq = qp_init_attr->recv_cq; + atomic_inc(&qp_init_attr->recv_cq->usecnt); + qp->srq = qp_init_attr->srq; + if (qp->srq) + atomic_inc(&qp_init_attr->srq->usecnt); + } - if (!IS_ERR(qp)) { - qp->device = device; - qp->real_qp = qp; - qp->uobject = NULL; - qp->qp_type = qp_init_attr->qp_type; - - atomic_set(&qp->usecnt, 0); - if (qp_init_attr->qp_type == IB_QPT_XRC_TGT) { - qp->event_handler = __ib_shared_qp_event_handler; - qp->qp_context = qp; - qp->pd = NULL; - qp->send_cq = qp->recv_cq = NULL; - qp->srq = NULL; - qp->xrcd = qp_init_attr->xrcd; - atomic_inc(&qp_init_attr->xrcd->usecnt); - INIT_LIST_HEAD(&qp->open_list); - - real_qp = qp; - qp = __ib_open_qp(real_qp, qp_init_attr->event_handler, - qp_init_attr->qp_context); - if (!IS_ERR(qp)) - __ib_insert_xrcd_qp(qp_init_attr->xrcd, real_qp); - else - real_qp->device->destroy_qp(real_qp); - } else { - qp->event_handler = qp_init_attr->event_handler; - qp->qp_context = qp_init_attr->qp_context; - if (qp_init_attr->qp_type == IB_QPT_XRC_INI) { - qp->recv_cq = NULL; - qp->srq = NULL; - } else { - qp->recv_cq = qp_init_attr->recv_cq; - atomic_inc(&qp_init_attr->recv_cq->usecnt); - qp->srq = qp_init_attr->srq; - if (qp->srq) - atomic_inc(&qp_init_attr->srq->usecnt); - } + qp->pd = pd; + qp->send_cq = qp_init_attr->send_cq; + qp->xrcd = NULL; - qp->pd = pd; - qp->send_cq = qp_init_attr->send_cq; - qp->xrcd = NULL; + atomic_inc(&pd->usecnt); + atomic_inc(&qp_init_attr->send_cq->usecnt); - atomic_inc(&pd->usecnt); - atomic_inc(&qp_init_attr->send_cq->usecnt); + if (qp_init_attr->cap.max_rdma_ctxs) { + ret = rdma_rw_init_mrs(qp, qp_init_attr); + if (ret) { + pr_err("failed to init MR pool ret= %d\n", ret); + ib_destroy_qp(qp); + qp = ERR_PTR(ret); } } @@ -1250,6 +1285,8 @@ int ib_destroy_qp(struct ib_qp *qp) struct ib_srq *srq; int ret; + WARN_ON_ONCE(qp->mrs_used > 0); + if (atomic_read(&qp->usecnt)) return -EBUSY; @@ -1261,6 +1298,9 @@ int ib_destroy_qp(struct ib_qp *qp) rcq = qp->recv_cq; srq = qp->srq; + if (!qp->uobject) + rdma_rw_cleanup_mrs(qp); + ret = qp->device->destroy_qp(qp); if (!ret) { if (pd) @@ -1343,6 +1383,7 @@ struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags) mr->pd = pd; mr->uobject = NULL; atomic_inc(&pd->usecnt); + mr->need_inval = false; } return mr; @@ -1389,6 +1430,7 @@ struct ib_mr *ib_alloc_mr(struct ib_pd *pd, mr->pd = pd; mr->uobject = NULL; atomic_inc(&pd->usecnt); + mr->need_inval = false; } return mr; @@ -1597,6 +1639,7 @@ EXPORT_SYMBOL(ib_set_vf_guid); * @mr: memory region * @sg: dma mapped scatterlist * @sg_nents: number of entries in sg + * @sg_offset: offset in bytes into sg * @page_size: page vector desired page size * * Constraints: @@ -1615,17 +1658,15 @@ EXPORT_SYMBOL(ib_set_vf_guid); * After this completes successfully, the memory region * is ready for registration. */ -int ib_map_mr_sg(struct ib_mr *mr, - struct scatterlist *sg, - int sg_nents, - unsigned int page_size) +int ib_map_mr_sg(struct ib_mr *mr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset, unsigned int page_size) { if (unlikely(!mr->device->map_mr_sg)) return -ENOSYS; mr->page_size = page_size; - return mr->device->map_mr_sg(mr, sg, sg_nents); + return mr->device->map_mr_sg(mr, sg, sg_nents, sg_offset); } EXPORT_SYMBOL(ib_map_mr_sg); @@ -1635,6 +1676,10 @@ EXPORT_SYMBOL(ib_map_mr_sg); * @mr: memory region * @sgl: dma mapped scatterlist * @sg_nents: number of entries in sg + * @sg_offset_p: IN: start offset in bytes into sg + * OUT: offset in bytes for element n of the sg of the first + * byte that has not been processed where n is the return + * value of this function. * @set_page: driver page assignment function pointer * * Core service helper for drivers to convert the largest @@ -1645,23 +1690,26 @@ EXPORT_SYMBOL(ib_map_mr_sg); * Returns the number of sg elements that were assigned to * a page vector. */ -int ib_sg_to_pages(struct ib_mr *mr, - struct scatterlist *sgl, - int sg_nents, - int (*set_page)(struct ib_mr *, u64)) +int ib_sg_to_pages(struct ib_mr *mr, struct scatterlist *sgl, int sg_nents, + unsigned int *sg_offset_p, int (*set_page)(struct ib_mr *, u64)) { struct scatterlist *sg; u64 last_end_dma_addr = 0; + unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0; unsigned int last_page_off = 0; u64 page_mask = ~((u64)mr->page_size - 1); int i, ret; - mr->iova = sg_dma_address(&sgl[0]); + if (unlikely(sg_nents <= 0 || sg_offset > sg_dma_len(&sgl[0]))) + return -EINVAL; + + mr->iova = sg_dma_address(&sgl[0]) + sg_offset; mr->length = 0; for_each_sg(sgl, sg, sg_nents, i) { - u64 dma_addr = sg_dma_address(sg); - unsigned int dma_len = sg_dma_len(sg); + u64 dma_addr = sg_dma_address(sg) + sg_offset; + u64 prev_addr = dma_addr; + unsigned int dma_len = sg_dma_len(sg) - sg_offset; u64 end_dma_addr = dma_addr + dma_len; u64 page_addr = dma_addr & page_mask; @@ -1685,8 +1733,14 @@ int ib_sg_to_pages(struct ib_mr *mr, do { ret = set_page(mr, page_addr); - if (unlikely(ret < 0)) - return i ? : ret; + if (unlikely(ret < 0)) { + sg_offset = prev_addr - sg_dma_address(sg); + mr->length += prev_addr - dma_addr; + if (sg_offset_p) + *sg_offset_p = sg_offset; + return i || sg_offset ? i : ret; + } + prev_addr = page_addr; next_page: page_addr += mr->page_size; } while (page_addr < end_dma_addr); @@ -1694,8 +1748,12 @@ next_page: mr->length += dma_len; last_end_dma_addr = end_dma_addr; last_page_off = end_dma_addr & ~page_mask; + + sg_offset = 0; } + if (sg_offset_p) + *sg_offset_p = 0; return i; } EXPORT_SYMBOL(ib_sg_to_pages); |