diff options
Diffstat (limited to 'drivers/infiniband')
156 files changed, 65464 insertions, 2623 deletions
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index 6425c0e5d..2137adfbd 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -85,4 +85,6 @@ source "drivers/infiniband/ulp/isert/Kconfig" source "drivers/infiniband/sw/rdmavt/Kconfig" +source "drivers/infiniband/hw/hfi1/Kconfig" + endif # INFINIBAND diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index f818538a7..edaae9f98 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -1,23 +1,19 @@ infiniband-$(CONFIG_INFINIBAND_ADDR_TRANS) := rdma_cm.o user_access-$(CONFIG_INFINIBAND_ADDR_TRANS) := rdma_ucm.o -obj-$(CONFIG_INFINIBAND) += ib_core.o ib_mad.o ib_sa.o \ - ib_cm.o iw_cm.o ib_addr.o \ +obj-$(CONFIG_INFINIBAND) += ib_core.o ib_cm.o iw_cm.o \ $(infiniband-y) obj-$(CONFIG_INFINIBAND_USER_MAD) += ib_umad.o obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \ $(user_access-y) -ib_core-y := packer.o ud_header.o verbs.o cq.o sysfs.o \ +ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \ device.o fmr_pool.o cache.o netlink.o \ - roce_gid_mgmt.o + roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \ + multicast.o mad.o smi.o agent.o mad_rmpp.o ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o -ib_mad-y := mad.o smi.o agent.o mad_rmpp.o - -ib_sa-y := sa_query.o multicast.o - ib_cm-y := cm.o iw_cm-y := iwcm.o iwpm_util.o iwpm_msg.o @@ -28,8 +24,6 @@ rdma_cm-$(CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS) += cma_configfs.o rdma_ucm-y := ucma.o -ib_addr-y := addr.o - ib_umad-y := user_mad.o ib_ucm-y := ucm.o diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 337353d86..1374541a4 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -46,10 +46,10 @@ #include <net/ip6_route.h> #include <rdma/ib_addr.h> #include <rdma/ib.h> +#include <rdma/rdma_netlink.h> +#include <net/netlink.h> -MODULE_AUTHOR("Sean Hefty"); -MODULE_DESCRIPTION("IB Address Translation"); -MODULE_LICENSE("Dual BSD/GPL"); +#include "core_priv.h" struct addr_req { struct list_head list; @@ -62,8 +62,11 @@ struct addr_req { struct rdma_dev_addr *addr, void *context); unsigned long timeout; int status; + u32 seq; }; +static atomic_t ib_nl_addr_request_seq = ATOMIC_INIT(0); + static void process_req(struct work_struct *work); static DEFINE_MUTEX(lock); @@ -71,6 +74,126 @@ static LIST_HEAD(req_list); static DECLARE_DELAYED_WORK(work, process_req); static struct workqueue_struct *addr_wq; +static const struct nla_policy ib_nl_addr_policy[LS_NLA_TYPE_MAX] = { + [LS_NLA_TYPE_DGID] = {.type = NLA_BINARY, + .len = sizeof(struct rdma_nla_ls_gid)}, +}; + +static inline bool ib_nl_is_good_ip_resp(const struct nlmsghdr *nlh) +{ + struct nlattr *tb[LS_NLA_TYPE_MAX] = {}; + int ret; + + if (nlh->nlmsg_flags & RDMA_NL_LS_F_ERR) + return false; + + ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh), + nlmsg_len(nlh), ib_nl_addr_policy); + if (ret) + return false; + + return true; +} + +static void ib_nl_process_good_ip_rsep(const struct nlmsghdr *nlh) +{ + const struct nlattr *head, *curr; + union ib_gid gid; + struct addr_req *req; + int len, rem; + int found = 0; + + head = (const struct nlattr *)nlmsg_data(nlh); + len = nlmsg_len(nlh); + + nla_for_each_attr(curr, head, len, rem) { + if (curr->nla_type == LS_NLA_TYPE_DGID) + memcpy(&gid, nla_data(curr), nla_len(curr)); + } + + mutex_lock(&lock); + list_for_each_entry(req, &req_list, list) { + if (nlh->nlmsg_seq != req->seq) + continue; + /* We set the DGID part, the rest was set earlier */ + rdma_addr_set_dgid(req->addr, &gid); + req->status = 0; + found = 1; + break; + } + mutex_unlock(&lock); + + if (!found) + pr_info("Couldn't find request waiting for DGID: %pI6\n", + &gid); +} + +int ib_nl_handle_ip_res_resp(struct sk_buff *skb, + struct netlink_callback *cb) +{ + const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh; + + if ((nlh->nlmsg_flags & NLM_F_REQUEST) || + !(NETLINK_CB(skb).sk) || + !netlink_capable(skb, CAP_NET_ADMIN)) + return -EPERM; + + if (ib_nl_is_good_ip_resp(nlh)) + ib_nl_process_good_ip_rsep(nlh); + + return skb->len; +} + +static int ib_nl_ip_send_msg(struct rdma_dev_addr *dev_addr, + const void *daddr, + u32 seq, u16 family) +{ + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; + struct rdma_ls_ip_resolve_header *header; + void *data; + size_t size; + int attrtype; + int len; + + if (family == AF_INET) { + size = sizeof(struct in_addr); + attrtype = RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_IPV4; + } else { + size = sizeof(struct in6_addr); + attrtype = RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_IPV6; + } + + len = nla_total_size(sizeof(size)); + len += NLMSG_ALIGN(sizeof(*header)); + + skb = nlmsg_new(len, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + data = ibnl_put_msg(skb, &nlh, seq, 0, RDMA_NL_LS, + RDMA_NL_LS_OP_IP_RESOLVE, NLM_F_REQUEST); + if (!data) { + nlmsg_free(skb); + return -ENODATA; + } + + /* Construct the family header first */ + header = (struct rdma_ls_ip_resolve_header *) + skb_put(skb, NLMSG_ALIGN(sizeof(*header))); + header->ifindex = dev_addr->bound_dev_if; + nla_put(skb, attrtype, size, daddr); + + /* Repair the nlmsg header length */ + nlmsg_end(skb, nlh); + ibnl_multicast(skb, nlh, RDMA_NL_GROUP_LS, GFP_KERNEL); + + /* Make the request retry, so when we get the response from userspace + * we will have something. + */ + return -ENODATA; +} + int rdma_addr_size(struct sockaddr *addr) { switch (addr->sa_family) { @@ -199,6 +322,17 @@ static void queue_req(struct addr_req *req) mutex_unlock(&lock); } +static int ib_nl_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr, + const void *daddr, u32 seq, u16 family) +{ + if (ibnl_chk_listeners(RDMA_NL_GROUP_LS)) + return -EADDRNOTAVAIL; + + /* We fill in what we can, the response will fill the rest */ + rdma_copy_addr(dev_addr, dst->dev, NULL); + return ib_nl_ip_send_msg(dev_addr, daddr, seq, family); +} + static int dst_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr, const void *daddr) { @@ -223,6 +357,39 @@ static int dst_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr, return ret; } +static bool has_gateway(struct dst_entry *dst, sa_family_t family) +{ + struct rtable *rt; + struct rt6_info *rt6; + + if (family == AF_INET) { + rt = container_of(dst, struct rtable, dst); + return rt->rt_uses_gateway; + } + + rt6 = container_of(dst, struct rt6_info, dst); + return rt6->rt6i_flags & RTF_GATEWAY; +} + +static int fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr, + const struct sockaddr *dst_in, u32 seq) +{ + const struct sockaddr_in *dst_in4 = + (const struct sockaddr_in *)dst_in; + const struct sockaddr_in6 *dst_in6 = + (const struct sockaddr_in6 *)dst_in; + const void *daddr = (dst_in->sa_family == AF_INET) ? + (const void *)&dst_in4->sin_addr.s_addr : + (const void *)&dst_in6->sin6_addr; + sa_family_t family = dst_in->sa_family; + + /* Gateway + ARPHRD_INFINIBAND -> IB router */ + if (has_gateway(dst, family) && dst->dev->type == ARPHRD_INFINIBAND) + return ib_nl_fetch_ha(dst, dev_addr, daddr, seq, family); + else + return dst_fetch_ha(dst, dev_addr, daddr); +} + static int addr4_resolve(struct sockaddr_in *src_in, const struct sockaddr_in *dst_in, struct rdma_dev_addr *addr, @@ -246,10 +413,11 @@ static int addr4_resolve(struct sockaddr_in *src_in, src_in->sin_family = AF_INET; src_in->sin_addr.s_addr = fl4.saddr; - /* If there's a gateway, we're definitely in RoCE v2 (as RoCE v1 isn't - * routable) and we could set the network type accordingly. + /* If there's a gateway and type of device not ARPHRD_INFINIBAND, we're + * definitely in RoCE v2 (as RoCE v1 isn't routable) set the network + * type accordingly. */ - if (rt->rt_uses_gateway) + if (rt->rt_uses_gateway && rt->dst.dev->type != ARPHRD_INFINIBAND) addr->network = RDMA_NETWORK_IPV4; addr->hoplimit = ip4_dst_hoplimit(&rt->dst); @@ -291,10 +459,12 @@ static int addr6_resolve(struct sockaddr_in6 *src_in, src_in->sin6_addr = fl6.saddr; } - /* If there's a gateway, we're definitely in RoCE v2 (as RoCE v1 isn't - * routable) and we could set the network type accordingly. + /* If there's a gateway and type of device not ARPHRD_INFINIBAND, we're + * definitely in RoCE v2 (as RoCE v1 isn't routable) set the network + * type accordingly. */ - if (rt->rt6i_flags & RTF_GATEWAY) + if (rt->rt6i_flags & RTF_GATEWAY && + ip6_dst_idev(dst)->dev->type != ARPHRD_INFINIBAND) addr->network = RDMA_NETWORK_IPV6; addr->hoplimit = ip6_dst_hoplimit(dst); @@ -317,7 +487,8 @@ static int addr6_resolve(struct sockaddr_in6 *src_in, static int addr_resolve_neigh(struct dst_entry *dst, const struct sockaddr *dst_in, - struct rdma_dev_addr *addr) + struct rdma_dev_addr *addr, + u32 seq) { if (dst->dev->flags & IFF_LOOPBACK) { int ret; @@ -331,17 +502,8 @@ static int addr_resolve_neigh(struct dst_entry *dst, } /* If the device doesn't do ARP internally */ - if (!(dst->dev->flags & IFF_NOARP)) { - const struct sockaddr_in *dst_in4 = - (const struct sockaddr_in *)dst_in; - const struct sockaddr_in6 *dst_in6 = - (const struct sockaddr_in6 *)dst_in; - - return dst_fetch_ha(dst, addr, - dst_in->sa_family == AF_INET ? - (const void *)&dst_in4->sin_addr.s_addr : - (const void *)&dst_in6->sin6_addr); - } + if (!(dst->dev->flags & IFF_NOARP)) + return fetch_ha(dst, addr, dst_in, seq); return rdma_copy_addr(addr, dst->dev, NULL); } @@ -349,7 +511,8 @@ static int addr_resolve_neigh(struct dst_entry *dst, static int addr_resolve(struct sockaddr *src_in, const struct sockaddr *dst_in, struct rdma_dev_addr *addr, - bool resolve_neigh) + bool resolve_neigh, + u32 seq) { struct net_device *ndev; struct dst_entry *dst; @@ -366,7 +529,7 @@ static int addr_resolve(struct sockaddr *src_in, return ret; if (resolve_neigh) - ret = addr_resolve_neigh(&rt->dst, dst_in, addr); + ret = addr_resolve_neigh(&rt->dst, dst_in, addr, seq); ndev = rt->dst.dev; dev_hold(ndev); @@ -383,7 +546,7 @@ static int addr_resolve(struct sockaddr *src_in, return ret; if (resolve_neigh) - ret = addr_resolve_neigh(dst, dst_in, addr); + ret = addr_resolve_neigh(dst, dst_in, addr, seq); ndev = dst->dev; dev_hold(ndev); @@ -412,7 +575,7 @@ static void process_req(struct work_struct *work) src_in = (struct sockaddr *) &req->src_addr; dst_in = (struct sockaddr *) &req->dst_addr; req->status = addr_resolve(src_in, dst_in, req->addr, - true); + true, req->seq); if (req->status && time_after_eq(jiffies, req->timeout)) req->status = -ETIMEDOUT; else if (req->status == -ENODATA) @@ -471,8 +634,9 @@ int rdma_resolve_ip(struct rdma_addr_client *client, req->context = context; req->client = client; atomic_inc(&client->refcount); + req->seq = (u32)atomic_inc_return(&ib_nl_addr_request_seq); - req->status = addr_resolve(src_in, dst_in, addr, true); + req->status = addr_resolve(src_in, dst_in, addr, true, req->seq); switch (req->status) { case 0: req->timeout = jiffies; @@ -510,7 +674,7 @@ int rdma_resolve_ip_route(struct sockaddr *src_addr, src_in->sa_family = dst_addr->sa_family; } - return addr_resolve(src_in, dst_addr, addr, false); + return addr_resolve(src_in, dst_addr, addr, false, 0); } EXPORT_SYMBOL(rdma_resolve_ip_route); @@ -634,7 +798,7 @@ static struct notifier_block nb = { .notifier_call = netevent_callback }; -static int __init addr_init(void) +int addr_init(void) { addr_wq = create_singlethread_workqueue("ib_addr"); if (!addr_wq) @@ -642,15 +806,13 @@ static int __init addr_init(void) register_netevent_notifier(&nb); rdma_addr_register_client(&self); + return 0; } -static void __exit addr_cleanup(void) +void addr_cleanup(void) { rdma_addr_unregister_client(&self); unregister_netevent_notifier(&nb); destroy_workqueue(addr_wq); } - -module_init(addr_init); -module_exit(addr_cleanup); diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index c2e257d97..1a2984c28 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -178,6 +178,7 @@ static int write_gid(struct ib_device *ib_dev, u8 port, { int ret = 0; struct net_device *old_net_dev; + enum ib_gid_type old_gid_type; /* in rdma_cap_roce_gid_table, this funciton should be protected by a * sleep-able lock. @@ -199,6 +200,7 @@ static int write_gid(struct ib_device *ib_dev, u8 port, } old_net_dev = table->data_vec[ix].attr.ndev; + old_gid_type = table->data_vec[ix].attr.gid_type; if (old_net_dev && old_net_dev != attr->ndev) dev_put(old_net_dev); /* if modify_gid failed, just delete the old gid */ @@ -207,10 +209,14 @@ static int write_gid(struct ib_device *ib_dev, u8 port, attr = &zattr; table->data_vec[ix].context = NULL; } - if (default_gid) - table->data_vec[ix].props |= GID_TABLE_ENTRY_DEFAULT; + memcpy(&table->data_vec[ix].gid, gid, sizeof(*gid)); memcpy(&table->data_vec[ix].attr, attr, sizeof(*attr)); + if (default_gid) { + table->data_vec[ix].props |= GID_TABLE_ENTRY_DEFAULT; + if (action == GID_TABLE_WRITE_ACTION_DEL) + table->data_vec[ix].attr.gid_type = old_gid_type; + } if (table->data_vec[ix].attr.ndev && table->data_vec[ix].attr.ndev != old_net_dev) dev_hold(table->data_vec[ix].attr.ndev); @@ -405,7 +411,9 @@ int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port, for (ix = 0; ix < table->sz; ix++) if (table->data_vec[ix].attr.ndev == ndev) - if (!del_gid(ib_dev, port, table, ix, false)) + if (!del_gid(ib_dev, port, table, ix, + !!(table->data_vec[ix].props & + GID_TABLE_ENTRY_DEFAULT))) deleted = true; write_unlock_irq(&table->rwlock); diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 93ab0ae97..ad1b1adcf 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -708,17 +708,6 @@ static void cma_deref_id(struct rdma_id_private *id_priv) complete(&id_priv->comp); } -static int cma_disable_callback(struct rdma_id_private *id_priv, - enum rdma_cm_state state) -{ - mutex_lock(&id_priv->handler_mutex); - if (id_priv->state != state) { - mutex_unlock(&id_priv->handler_mutex); - return -EINVAL; - } - return 0; -} - struct rdma_cm_id *rdma_create_id(struct net *net, rdma_cm_event_handler event_handler, void *context, enum rdma_port_space ps, @@ -800,6 +789,7 @@ int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd, if (id->device != pd->device) return -EINVAL; + qp_init_attr->port_num = id->port_num; qp = ib_create_qp(pd, qp_init_attr); if (IS_ERR(qp)) return PTR_ERR(qp); @@ -1670,11 +1660,12 @@ static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) struct rdma_cm_event event; int ret = 0; + mutex_lock(&id_priv->handler_mutex); if ((ib_event->event != IB_CM_TIMEWAIT_EXIT && - cma_disable_callback(id_priv, RDMA_CM_CONNECT)) || + id_priv->state != RDMA_CM_CONNECT) || (ib_event->event == IB_CM_TIMEWAIT_EXIT && - cma_disable_callback(id_priv, RDMA_CM_DISCONNECT))) - return 0; + id_priv->state != RDMA_CM_DISCONNECT)) + goto out; memset(&event, 0, sizeof event); switch (ib_event->event) { @@ -1869,7 +1860,7 @@ static int cma_check_req_qp_type(struct rdma_cm_id *id, struct ib_cm_event *ib_e static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) { - struct rdma_id_private *listen_id, *conn_id; + struct rdma_id_private *listen_id, *conn_id = NULL; struct rdma_cm_event event; struct net_device *net_dev; int offset, ret; @@ -1883,9 +1874,10 @@ static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event) goto net_dev_put; } - if (cma_disable_callback(listen_id, RDMA_CM_LISTEN)) { + mutex_lock(&listen_id->handler_mutex); + if (listen_id->state != RDMA_CM_LISTEN) { ret = -ECONNABORTED; - goto net_dev_put; + goto err1; } memset(&event, 0, sizeof event); @@ -1975,8 +1967,9 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr; struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr; - if (cma_disable_callback(id_priv, RDMA_CM_CONNECT)) - return 0; + mutex_lock(&id_priv->handler_mutex); + if (id_priv->state != RDMA_CM_CONNECT) + goto out; memset(&event, 0, sizeof event); switch (iw_event->event) { @@ -2028,6 +2021,7 @@ static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) return ret; } +out: mutex_unlock(&id_priv->handler_mutex); return ret; } @@ -2038,13 +2032,15 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, struct rdma_cm_id *new_cm_id; struct rdma_id_private *listen_id, *conn_id; struct rdma_cm_event event; - int ret; + int ret = -ECONNABORTED; struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr; struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr; listen_id = cm_id->context; - if (cma_disable_callback(listen_id, RDMA_CM_LISTEN)) - return -ECONNABORTED; + + mutex_lock(&listen_id->handler_mutex); + if (listen_id->state != RDMA_CM_LISTEN) + goto out; /* Create a new RDMA id for the new IW CM ID */ new_cm_id = rdma_create_id(listen_id->id.route.addr.dev_addr.net, @@ -3215,8 +3211,9 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_sidr_rep_event_param *rep = &ib_event->param.sidr_rep_rcvd; int ret = 0; - if (cma_disable_callback(id_priv, RDMA_CM_CONNECT)) - return 0; + mutex_lock(&id_priv->handler_mutex); + if (id_priv->state != RDMA_CM_CONNECT) + goto out; memset(&event, 0, sizeof event); switch (ib_event->event) { @@ -3672,12 +3669,13 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) struct rdma_id_private *id_priv; struct cma_multicast *mc = multicast->context; struct rdma_cm_event event; - int ret; + int ret = 0; id_priv = mc->id_priv; - if (cma_disable_callback(id_priv, RDMA_CM_ADDR_BOUND) && - cma_disable_callback(id_priv, RDMA_CM_ADDR_RESOLVED)) - return 0; + mutex_lock(&id_priv->handler_mutex); + if (id_priv->state != RDMA_CM_ADDR_BOUND && + id_priv->state != RDMA_CM_ADDR_RESOLVED) + goto out; if (!status) status = cma_set_qkey(id_priv, be32_to_cpu(multicast->rec.qkey)); @@ -3719,6 +3717,7 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) return 0; } +out: mutex_unlock(&id_priv->handler_mutex); return 0; } @@ -3877,12 +3876,12 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, gid_type = id_priv->cma_dev->default_gid_type[id_priv->id.port_num - rdma_start_port(id_priv->cma_dev->device)]; if (addr->sa_family == AF_INET) { - if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) + if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) { + mc->multicast.ib->rec.hop_limit = IPV6_DEFAULT_HOPLIMIT; err = cma_igmp_send(ndev, &mc->multicast.ib->rec.mgid, true); - if (!err) { - mc->igmp_joined = true; - mc->multicast.ib->rec.hop_limit = IPV6_DEFAULT_HOPLIMIT; + if (!err) + mc->igmp_joined = true; } } else { if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) @@ -4294,7 +4293,8 @@ static int __init cma_init(void) if (ret) goto err; - if (ibnl_add_client(RDMA_NL_RDMA_CM, RDMA_NL_RDMA_CM_NUM_OPS, cma_cb_table)) + if (ibnl_add_client(RDMA_NL_RDMA_CM, ARRAY_SIZE(cma_cb_table), + cma_cb_table)) pr_warn("RDMA CMA: failed to add netlink callback\n"); cma_configfs_init(); diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index eab322157..19d499dca 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -137,4 +137,20 @@ static inline bool rdma_is_upper_dev_rcu(struct net_device *dev, return _upper == upper; } +int addr_init(void); +void addr_cleanup(void); + +int ib_mad_init(void); +void ib_mad_cleanup(void); + +int ib_sa_init(void); +void ib_sa_cleanup(void); + +int ib_nl_handle_resolve_resp(struct sk_buff *skb, + struct netlink_callback *cb); +int ib_nl_handle_set_timeout(struct sk_buff *skb, + struct netlink_callback *cb); +int ib_nl_handle_ip_res_resp(struct sk_buff *skb, + struct netlink_callback *cb); + #endif /* _CORE_PRIV_H */ diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 109798440..5c155fa91 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -661,6 +661,9 @@ int ib_query_port(struct ib_device *device, if (err || port_attr->subnet_prefix) return err; + if (rdma_port_get_link_layer(device, port_num) != IB_LINK_LAYER_INFINIBAND) + return 0; + err = ib_query_gid(device, port_num, 0, &gid, NULL); if (err) return err; @@ -955,6 +958,29 @@ struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, } EXPORT_SYMBOL(ib_get_net_dev_by_params); +static struct ibnl_client_cbs ibnl_ls_cb_table[] = { + [RDMA_NL_LS_OP_RESOLVE] = { + .dump = ib_nl_handle_resolve_resp, + .module = THIS_MODULE }, + [RDMA_NL_LS_OP_SET_TIMEOUT] = { + .dump = ib_nl_handle_set_timeout, + .module = THIS_MODULE }, + [RDMA_NL_LS_OP_IP_RESOLVE] = { + .dump = ib_nl_handle_ip_res_resp, + .module = THIS_MODULE }, +}; + +static int ib_add_ibnl_clients(void) +{ + return ibnl_add_client(RDMA_NL_LS, ARRAY_SIZE(ibnl_ls_cb_table), + ibnl_ls_cb_table); +} + +static void ib_remove_ibnl_clients(void) +{ + ibnl_remove_client(RDMA_NL_LS); +} + static int __init ib_core_init(void) { int ret; @@ -983,10 +1009,42 @@ static int __init ib_core_init(void) goto err_sysfs; } + ret = addr_init(); + if (ret) { + pr_warn("Could't init IB address resolution\n"); + goto err_ibnl; + } + + ret = ib_mad_init(); + if (ret) { + pr_warn("Couldn't init IB MAD\n"); + goto err_addr; + } + + ret = ib_sa_init(); + if (ret) { + pr_warn("Couldn't init SA\n"); + goto err_mad; + } + + ret = ib_add_ibnl_clients(); + if (ret) { + pr_warn("Couldn't register ibnl clients\n"); + goto err_sa; + } + ib_cache_setup(); return 0; +err_sa: + ib_sa_cleanup(); +err_mad: + ib_mad_cleanup(); +err_addr: + addr_cleanup(); +err_ibnl: + ibnl_cleanup(); err_sysfs: class_unregister(&ib_class); err_comp: @@ -999,6 +1057,10 @@ err: static void __exit ib_core_cleanup(void) { ib_cache_cleanup(); + ib_remove_ibnl_clients(); + ib_sa_cleanup(); + ib_mad_cleanup(); + addr_cleanup(); ibnl_cleanup(); class_unregister(&ib_class); destroy_workqueue(ib_comp_wq); diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index e28a160cd..f0572049d 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -459,7 +459,7 @@ static void iw_cm_check_wildcard(struct sockaddr_storage *pm_addr, if (pm_addr->ss_family == AF_INET) { struct sockaddr_in *pm4_addr = (struct sockaddr_in *)pm_addr; - if (pm4_addr->sin_addr.s_addr == INADDR_ANY) { + if (pm4_addr->sin_addr.s_addr == htonl(INADDR_ANY)) { struct sockaddr_in *cm4_addr = (struct sockaddr_in *)cm_addr; struct sockaddr_in *cm4_outaddr = @@ -1175,7 +1175,7 @@ static int __init iw_cm_init(void) if (ret) pr_err("iw_cm: couldn't init iwpm\n"); - ret = ibnl_add_client(RDMA_NL_IWCM, RDMA_NL_IWPM_NUM_OPS, + ret = ibnl_add_client(RDMA_NL_IWCM, ARRAY_SIZE(iwcm_nl_cb_table), iwcm_nl_cb_table); if (ret) pr_err("iw_cm: couldn't register netlink callbacks\n"); diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c index 43e3fa271..1c41b95ce 100644 --- a/drivers/infiniband/core/iwpm_msg.c +++ b/drivers/infiniband/core/iwpm_msg.c @@ -506,7 +506,7 @@ int iwpm_add_and_query_mapping_cb(struct sk_buff *skb, if (!nlmsg_request) { pr_info("%s: Could not find a matching request (seq = %u)\n", __func__, msg_seq); - return -EINVAL; + return -EINVAL; } pm_msg = nlmsg_request->req_buffer; local_sockaddr = (struct sockaddr_storage *) diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c index 9b2bf2fb2..b65e06c56 100644 --- a/drivers/infiniband/core/iwpm_util.c +++ b/drivers/infiniband/core/iwpm_util.c @@ -634,6 +634,7 @@ static int send_nlmsg_done(struct sk_buff *skb, u8 nl_client, int iwpm_pid) if (!(ibnl_put_msg(skb, &nlh, 0, 0, nl_client, RDMA_NL_IWPM_MAPINFO, NLM_F_MULTI))) { pr_warn("%s Unable to put NLMSG_DONE\n", __func__); + dev_kfree_skb(skb); return -ENOMEM; } nlh->nlmsg_type = NLMSG_DONE; diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index 9fa5bf33f..2d49228f2 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -47,11 +47,7 @@ #include "smi.h" #include "opa_smi.h" #include "agent.h" - -MODULE_LICENSE("Dual BSD/GPL"); -MODULE_DESCRIPTION("kernel IB MAD API"); -MODULE_AUTHOR("Hal Rosenstock"); -MODULE_AUTHOR("Sean Hefty"); +#include "core_priv.h" static int mad_sendq_size = IB_MAD_QP_SEND_SIZE; static int mad_recvq_size = IB_MAD_QP_RECV_SIZE; @@ -1642,9 +1638,9 @@ static void remove_mad_reg_req(struct ib_mad_agent_private *agent_priv) /* Now, check to see if there are any methods still in use */ if (!check_method_table(method)) { /* If not, release management method table */ - kfree(method); - class->method_table[mgmt_class] = NULL; - /* Any management classes left ? */ + kfree(method); + class->method_table[mgmt_class] = NULL; + /* Any management classes left ? */ if (!check_class_table(class)) { /* If not, release management class table */ kfree(class); @@ -3316,7 +3312,7 @@ static struct ib_client mad_client = { .remove = ib_mad_remove_device }; -static int __init ib_mad_init_module(void) +int ib_mad_init(void) { mad_recvq_size = min(mad_recvq_size, IB_MAD_QP_MAX_SIZE); mad_recvq_size = max(mad_recvq_size, IB_MAD_QP_MIN_SIZE); @@ -3334,10 +3330,7 @@ static int __init ib_mad_init_module(void) return 0; } -static void __exit ib_mad_cleanup_module(void) +void ib_mad_cleanup(void) { ib_unregister_client(&mad_client); } - -module_init(ib_mad_init_module); -module_exit(ib_mad_cleanup_module); diff --git a/drivers/infiniband/core/mr_pool.c b/drivers/infiniband/core/mr_pool.c new file mode 100644 index 000000000..49d478b2e --- /dev/null +++ b/drivers/infiniband/core/mr_pool.c @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#include <rdma/ib_verbs.h> +#include <rdma/mr_pool.h> + +struct ib_mr *ib_mr_pool_get(struct ib_qp *qp, struct list_head *list) +{ + struct ib_mr *mr; + unsigned long flags; + + spin_lock_irqsave(&qp->mr_lock, flags); + mr = list_first_entry_or_null(list, struct ib_mr, qp_entry); + if (mr) { + list_del(&mr->qp_entry); + qp->mrs_used++; + } + spin_unlock_irqrestore(&qp->mr_lock, flags); + + return mr; +} +EXPORT_SYMBOL(ib_mr_pool_get); + +void ib_mr_pool_put(struct ib_qp *qp, struct list_head *list, struct ib_mr *mr) +{ + unsigned long flags; + + spin_lock_irqsave(&qp->mr_lock, flags); + list_add(&mr->qp_entry, list); + qp->mrs_used--; + spin_unlock_irqrestore(&qp->mr_lock, flags); +} +EXPORT_SYMBOL(ib_mr_pool_put); + +int ib_mr_pool_init(struct ib_qp *qp, struct list_head *list, int nr, + enum ib_mr_type type, u32 max_num_sg) +{ + struct ib_mr *mr; + unsigned long flags; + int ret, i; + + for (i = 0; i < nr; i++) { + mr = ib_alloc_mr(qp->pd, type, max_num_sg); + if (IS_ERR(mr)) { + ret = PTR_ERR(mr); + goto out; + } + + spin_lock_irqsave(&qp->mr_lock, flags); + list_add_tail(&mr->qp_entry, list); + spin_unlock_irqrestore(&qp->mr_lock, flags); + } + + return 0; +out: + ib_mr_pool_destroy(qp, list); + return ret; +} +EXPORT_SYMBOL(ib_mr_pool_init); + +void ib_mr_pool_destroy(struct ib_qp *qp, struct list_head *list) +{ + struct ib_mr *mr; + unsigned long flags; + + spin_lock_irqsave(&qp->mr_lock, flags); + while (!list_empty(list)) { + mr = list_first_entry(list, struct ib_mr, qp_entry); + list_del(&mr->qp_entry); + + spin_unlock_irqrestore(&qp->mr_lock, flags); + ib_dereg_mr(mr); + spin_lock_irqsave(&qp->mr_lock, flags); + } + spin_unlock_irqrestore(&qp->mr_lock, flags); +} +EXPORT_SYMBOL(ib_mr_pool_destroy); diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c index 250937cb9..a83ec28a1 100644 --- a/drivers/infiniband/core/multicast.c +++ b/drivers/infiniband/core/multicast.c @@ -93,6 +93,18 @@ enum { struct mcast_member; +/* +* There are 4 types of join states: +* FullMember, NonMember, SendOnlyNonMember, SendOnlyFullMember. +*/ +enum { + FULLMEMBER_JOIN, + NONMEMBER_JOIN, + SENDONLY_NONMEBER_JOIN, + SENDONLY_FULLMEMBER_JOIN, + NUM_JOIN_MEMBERSHIP_TYPES, +}; + struct mcast_group { struct ib_sa_mcmember_rec rec; struct rb_node node; @@ -102,7 +114,7 @@ struct mcast_group { struct list_head pending_list; struct list_head active_list; struct mcast_member *last_join; - int members[3]; + int members[NUM_JOIN_MEMBERSHIP_TYPES]; atomic_t refcount; enum mcast_group_state state; struct ib_sa_query *query; @@ -220,8 +232,9 @@ static void queue_join(struct mcast_member *member) } /* - * A multicast group has three types of members: full member, non member, and - * send only member. We need to keep track of the number of members of each + * A multicast group has four types of members: full member, non member, + * sendonly non member and sendonly full member. + * We need to keep track of the number of members of each * type based on their join state. Adjust the number of members the belong to * the specified join states. */ @@ -229,7 +242,7 @@ static void adjust_membership(struct mcast_group *group, u8 join_state, int inc) { int i; - for (i = 0; i < 3; i++, join_state >>= 1) + for (i = 0; i < NUM_JOIN_MEMBERSHIP_TYPES; i++, join_state >>= 1) if (join_state & 0x1) group->members[i] += inc; } @@ -245,7 +258,7 @@ static u8 get_leave_state(struct mcast_group *group) u8 leave_state = 0; int i; - for (i = 0; i < 3; i++) + for (i = 0; i < NUM_JOIN_MEMBERSHIP_TYPES; i++) if (!group->members[i]) leave_state |= (0x1 << i); diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index d47df9356..9b8c20c82 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -151,12 +151,11 @@ static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) struct ibnl_client *client; int type = nlh->nlmsg_type; int index = RDMA_NL_GET_CLIENT(type); - int op = RDMA_NL_GET_OP(type); + unsigned int op = RDMA_NL_GET_OP(type); list_for_each_entry(client, &client_list, list) { if (client->index == index) { - if (op < 0 || op >= client->nops || - !client->cb_table[op].dump) + if (op >= client->nops || !client->cb_table[op].dump) return -EINVAL; /* diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c new file mode 100644 index 000000000..1eb9b1294 --- /dev/null +++ b/drivers/infiniband/core/rw.c @@ -0,0 +1,727 @@ +/* + * Copyright (c) 2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#include <linux/moduleparam.h> +#include <linux/slab.h> +#include <rdma/mr_pool.h> +#include <rdma/rw.h> + +enum { + RDMA_RW_SINGLE_WR, + RDMA_RW_MULTI_WR, + RDMA_RW_MR, + RDMA_RW_SIG_MR, +}; + +static bool rdma_rw_force_mr; +module_param_named(force_mr, rdma_rw_force_mr, bool, 0); +MODULE_PARM_DESC(force_mr, "Force usage of MRs for RDMA READ/WRITE operations"); + +/* + * Check if the device might use memory registration. This is currently only + * true for iWarp devices. In the future we can hopefully fine tune this based + * on HCA driver input. + */ +static inline bool rdma_rw_can_use_mr(struct ib_device *dev, u8 port_num) +{ + if (rdma_protocol_iwarp(dev, port_num)) + return true; + if (unlikely(rdma_rw_force_mr)) + return true; + return false; +} + +/* + * Check if the device will use memory registration for this RW operation. + * We currently always use memory registrations for iWarp RDMA READs, and + * have a debug option to force usage of MRs. + * + * XXX: In the future we can hopefully fine tune this based on HCA driver + * input. + */ +static inline bool rdma_rw_io_needs_mr(struct ib_device *dev, u8 port_num, + enum dma_data_direction dir, int dma_nents) +{ + if (rdma_protocol_iwarp(dev, port_num) && dir == DMA_FROM_DEVICE) + return true; + if (unlikely(rdma_rw_force_mr)) + return true; + return false; +} + +static inline u32 rdma_rw_max_sge(struct ib_device *dev, + enum dma_data_direction dir) +{ + return dir == DMA_TO_DEVICE ? + dev->attrs.max_sge : dev->attrs.max_sge_rd; +} + +static inline u32 rdma_rw_fr_page_list_len(struct ib_device *dev) +{ + /* arbitrary limit to avoid allocating gigantic resources */ + return min_t(u32, dev->attrs.max_fast_reg_page_list_len, 256); +} + +static int rdma_rw_init_one_mr(struct ib_qp *qp, u8 port_num, + struct rdma_rw_reg_ctx *reg, struct scatterlist *sg, + u32 sg_cnt, u32 offset) +{ + u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device); + u32 nents = min(sg_cnt, pages_per_mr); + int count = 0, ret; + + reg->mr = ib_mr_pool_get(qp, &qp->rdma_mrs); + if (!reg->mr) + return -EAGAIN; + + if (reg->mr->need_inval) { + reg->inv_wr.opcode = IB_WR_LOCAL_INV; + reg->inv_wr.ex.invalidate_rkey = reg->mr->lkey; + reg->inv_wr.next = ®->reg_wr.wr; + count++; + } else { + reg->inv_wr.next = NULL; + } + + ret = ib_map_mr_sg(reg->mr, sg, nents, &offset, PAGE_SIZE); + if (ret < nents) { + ib_mr_pool_put(qp, &qp->rdma_mrs, reg->mr); + return -EINVAL; + } + + reg->reg_wr.wr.opcode = IB_WR_REG_MR; + reg->reg_wr.mr = reg->mr; + reg->reg_wr.access = IB_ACCESS_LOCAL_WRITE; + if (rdma_protocol_iwarp(qp->device, port_num)) + reg->reg_wr.access |= IB_ACCESS_REMOTE_WRITE; + count++; + + reg->sge.addr = reg->mr->iova; + reg->sge.length = reg->mr->length; + return count; +} + +static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + u8 port_num, struct scatterlist *sg, u32 sg_cnt, u32 offset, + u64 remote_addr, u32 rkey, enum dma_data_direction dir) +{ + u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device); + int i, j, ret = 0, count = 0; + + ctx->nr_ops = (sg_cnt + pages_per_mr - 1) / pages_per_mr; + ctx->reg = kcalloc(ctx->nr_ops, sizeof(*ctx->reg), GFP_KERNEL); + if (!ctx->reg) { + ret = -ENOMEM; + goto out; + } + + for (i = 0; i < ctx->nr_ops; i++) { + struct rdma_rw_reg_ctx *prev = i ? &ctx->reg[i - 1] : NULL; + struct rdma_rw_reg_ctx *reg = &ctx->reg[i]; + u32 nents = min(sg_cnt, pages_per_mr); + + ret = rdma_rw_init_one_mr(qp, port_num, reg, sg, sg_cnt, + offset); + if (ret < 0) + goto out_free; + count += ret; + + if (prev) { + if (reg->mr->need_inval) + prev->wr.wr.next = ®->inv_wr; + else + prev->wr.wr.next = ®->reg_wr.wr; + } + + reg->reg_wr.wr.next = ®->wr.wr; + + reg->wr.wr.sg_list = ®->sge; + reg->wr.wr.num_sge = 1; + reg->wr.remote_addr = remote_addr; + reg->wr.rkey = rkey; + if (dir == DMA_TO_DEVICE) { + reg->wr.wr.opcode = IB_WR_RDMA_WRITE; + } else if (!rdma_cap_read_inv(qp->device, port_num)) { + reg->wr.wr.opcode = IB_WR_RDMA_READ; + } else { + reg->wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV; + reg->wr.wr.ex.invalidate_rkey = reg->mr->lkey; + } + count++; + + remote_addr += reg->sge.length; + sg_cnt -= nents; + for (j = 0; j < nents; j++) + sg = sg_next(sg); + offset = 0; + } + + ctx->type = RDMA_RW_MR; + return count; + +out_free: + while (--i >= 0) + ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr); + kfree(ctx->reg); +out: + return ret; +} + +static int rdma_rw_init_map_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + struct scatterlist *sg, u32 sg_cnt, u32 offset, + u64 remote_addr, u32 rkey, enum dma_data_direction dir) +{ + struct ib_device *dev = qp->pd->device; + u32 max_sge = rdma_rw_max_sge(dev, dir); + struct ib_sge *sge; + u32 total_len = 0, i, j; + + ctx->nr_ops = DIV_ROUND_UP(sg_cnt, max_sge); + + ctx->map.sges = sge = kcalloc(sg_cnt, sizeof(*sge), GFP_KERNEL); + if (!ctx->map.sges) + goto out; + + ctx->map.wrs = kcalloc(ctx->nr_ops, sizeof(*ctx->map.wrs), GFP_KERNEL); + if (!ctx->map.wrs) + goto out_free_sges; + + for (i = 0; i < ctx->nr_ops; i++) { + struct ib_rdma_wr *rdma_wr = &ctx->map.wrs[i]; + u32 nr_sge = min(sg_cnt, max_sge); + + if (dir == DMA_TO_DEVICE) + rdma_wr->wr.opcode = IB_WR_RDMA_WRITE; + else + rdma_wr->wr.opcode = IB_WR_RDMA_READ; + rdma_wr->remote_addr = remote_addr + total_len; + rdma_wr->rkey = rkey; + rdma_wr->wr.sg_list = sge; + + for (j = 0; j < nr_sge; j++, sg = sg_next(sg)) { + rdma_wr->wr.num_sge++; + + sge->addr = ib_sg_dma_address(dev, sg) + offset; + sge->length = ib_sg_dma_len(dev, sg) - offset; + sge->lkey = qp->pd->local_dma_lkey; + + total_len += sge->length; + sge++; + sg_cnt--; + offset = 0; + } + + if (i + 1 < ctx->nr_ops) + rdma_wr->wr.next = &ctx->map.wrs[i + 1].wr; + } + + ctx->type = RDMA_RW_MULTI_WR; + return ctx->nr_ops; + +out_free_sges: + kfree(ctx->map.sges); +out: + return -ENOMEM; +} + +static int rdma_rw_init_single_wr(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + struct scatterlist *sg, u32 offset, u64 remote_addr, u32 rkey, + enum dma_data_direction dir) +{ + struct ib_device *dev = qp->pd->device; + struct ib_rdma_wr *rdma_wr = &ctx->single.wr; + + ctx->nr_ops = 1; + + ctx->single.sge.lkey = qp->pd->local_dma_lkey; + ctx->single.sge.addr = ib_sg_dma_address(dev, sg) + offset; + ctx->single.sge.length = ib_sg_dma_len(dev, sg) - offset; + + memset(rdma_wr, 0, sizeof(*rdma_wr)); + if (dir == DMA_TO_DEVICE) + rdma_wr->wr.opcode = IB_WR_RDMA_WRITE; + else + rdma_wr->wr.opcode = IB_WR_RDMA_READ; + rdma_wr->wr.sg_list = &ctx->single.sge; + rdma_wr->wr.num_sge = 1; + rdma_wr->remote_addr = remote_addr; + rdma_wr->rkey = rkey; + + ctx->type = RDMA_RW_SINGLE_WR; + return 1; +} + +/** + * rdma_rw_ctx_init - initialize a RDMA READ/WRITE context + * @ctx: context to initialize + * @qp: queue pair to operate on + * @port_num: port num to which the connection is bound + * @sg: scatterlist to READ/WRITE from/to + * @sg_cnt: number of entries in @sg + * @sg_offset: current byte offset into @sg + * @remote_addr:remote address to read/write (relative to @rkey) + * @rkey: remote key to operate on + * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ + * + * Returns the number of WQEs that will be needed on the workqueue if + * successful, or a negative error code. + */ +int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num, + struct scatterlist *sg, u32 sg_cnt, u32 sg_offset, + u64 remote_addr, u32 rkey, enum dma_data_direction dir) +{ + struct ib_device *dev = qp->pd->device; + int ret; + + ret = ib_dma_map_sg(dev, sg, sg_cnt, dir); + if (!ret) + return -ENOMEM; + sg_cnt = ret; + + /* + * Skip to the S/G entry that sg_offset falls into: + */ + for (;;) { + u32 len = ib_sg_dma_len(dev, sg); + + if (sg_offset < len) + break; + + sg = sg_next(sg); + sg_offset -= len; + sg_cnt--; + } + + ret = -EIO; + if (WARN_ON_ONCE(sg_cnt == 0)) + goto out_unmap_sg; + + if (rdma_rw_io_needs_mr(qp->device, port_num, dir, sg_cnt)) { + ret = rdma_rw_init_mr_wrs(ctx, qp, port_num, sg, sg_cnt, + sg_offset, remote_addr, rkey, dir); + } else if (sg_cnt > 1) { + ret = rdma_rw_init_map_wrs(ctx, qp, sg, sg_cnt, sg_offset, + remote_addr, rkey, dir); + } else { + ret = rdma_rw_init_single_wr(ctx, qp, sg, sg_offset, + remote_addr, rkey, dir); + } + + if (ret < 0) + goto out_unmap_sg; + return ret; + +out_unmap_sg: + ib_dma_unmap_sg(dev, sg, sg_cnt, dir); + return ret; +} +EXPORT_SYMBOL(rdma_rw_ctx_init); + +/** + * rdma_rw_ctx_signature init - initialize a RW context with signature offload + * @ctx: context to initialize + * @qp: queue pair to operate on + * @port_num: port num to which the connection is bound + * @sg: scatterlist to READ/WRITE from/to + * @sg_cnt: number of entries in @sg + * @prot_sg: scatterlist to READ/WRITE protection information from/to + * @prot_sg_cnt: number of entries in @prot_sg + * @sig_attrs: signature offloading algorithms + * @remote_addr:remote address to read/write (relative to @rkey) + * @rkey: remote key to operate on + * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ + * + * Returns the number of WQEs that will be needed on the workqueue if + * successful, or a negative error code. + */ +int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + u8 port_num, struct scatterlist *sg, u32 sg_cnt, + struct scatterlist *prot_sg, u32 prot_sg_cnt, + struct ib_sig_attrs *sig_attrs, + u64 remote_addr, u32 rkey, enum dma_data_direction dir) +{ + struct ib_device *dev = qp->pd->device; + u32 pages_per_mr = rdma_rw_fr_page_list_len(qp->pd->device); + struct ib_rdma_wr *rdma_wr; + struct ib_send_wr *prev_wr = NULL; + int count = 0, ret; + + if (sg_cnt > pages_per_mr || prot_sg_cnt > pages_per_mr) { + pr_err("SG count too large\n"); + return -EINVAL; + } + + ret = ib_dma_map_sg(dev, sg, sg_cnt, dir); + if (!ret) + return -ENOMEM; + sg_cnt = ret; + + ret = ib_dma_map_sg(dev, prot_sg, prot_sg_cnt, dir); + if (!ret) { + ret = -ENOMEM; + goto out_unmap_sg; + } + prot_sg_cnt = ret; + + ctx->type = RDMA_RW_SIG_MR; + ctx->nr_ops = 1; + ctx->sig = kcalloc(1, sizeof(*ctx->sig), GFP_KERNEL); + if (!ctx->sig) { + ret = -ENOMEM; + goto out_unmap_prot_sg; + } + + ret = rdma_rw_init_one_mr(qp, port_num, &ctx->sig->data, sg, sg_cnt, 0); + if (ret < 0) + goto out_free_ctx; + count += ret; + prev_wr = &ctx->sig->data.reg_wr.wr; + + if (prot_sg_cnt) { + ret = rdma_rw_init_one_mr(qp, port_num, &ctx->sig->prot, + prot_sg, prot_sg_cnt, 0); + if (ret < 0) + goto out_destroy_data_mr; + count += ret; + + if (ctx->sig->prot.inv_wr.next) + prev_wr->next = &ctx->sig->prot.inv_wr; + else + prev_wr->next = &ctx->sig->prot.reg_wr.wr; + prev_wr = &ctx->sig->prot.reg_wr.wr; + } else { + ctx->sig->prot.mr = NULL; + } + + ctx->sig->sig_mr = ib_mr_pool_get(qp, &qp->sig_mrs); + if (!ctx->sig->sig_mr) { + ret = -EAGAIN; + goto out_destroy_prot_mr; + } + + if (ctx->sig->sig_mr->need_inval) { + memset(&ctx->sig->sig_inv_wr, 0, sizeof(ctx->sig->sig_inv_wr)); + + ctx->sig->sig_inv_wr.opcode = IB_WR_LOCAL_INV; + ctx->sig->sig_inv_wr.ex.invalidate_rkey = ctx->sig->sig_mr->rkey; + + prev_wr->next = &ctx->sig->sig_inv_wr; + prev_wr = &ctx->sig->sig_inv_wr; + } + + ctx->sig->sig_wr.wr.opcode = IB_WR_REG_SIG_MR; + ctx->sig->sig_wr.wr.wr_cqe = NULL; + ctx->sig->sig_wr.wr.sg_list = &ctx->sig->data.sge; + ctx->sig->sig_wr.wr.num_sge = 1; + ctx->sig->sig_wr.access_flags = IB_ACCESS_LOCAL_WRITE; + ctx->sig->sig_wr.sig_attrs = sig_attrs; + ctx->sig->sig_wr.sig_mr = ctx->sig->sig_mr; + if (prot_sg_cnt) + ctx->sig->sig_wr.prot = &ctx->sig->prot.sge; + prev_wr->next = &ctx->sig->sig_wr.wr; + prev_wr = &ctx->sig->sig_wr.wr; + count++; + + ctx->sig->sig_sge.addr = 0; + ctx->sig->sig_sge.length = ctx->sig->data.sge.length; + if (sig_attrs->wire.sig_type != IB_SIG_TYPE_NONE) + ctx->sig->sig_sge.length += ctx->sig->prot.sge.length; + + rdma_wr = &ctx->sig->data.wr; + rdma_wr->wr.sg_list = &ctx->sig->sig_sge; + rdma_wr->wr.num_sge = 1; + rdma_wr->remote_addr = remote_addr; + rdma_wr->rkey = rkey; + if (dir == DMA_TO_DEVICE) + rdma_wr->wr.opcode = IB_WR_RDMA_WRITE; + else + rdma_wr->wr.opcode = IB_WR_RDMA_READ; + prev_wr->next = &rdma_wr->wr; + prev_wr = &rdma_wr->wr; + count++; + + return count; + +out_destroy_prot_mr: + if (prot_sg_cnt) + ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->prot.mr); +out_destroy_data_mr: + ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->data.mr); +out_free_ctx: + kfree(ctx->sig); +out_unmap_prot_sg: + ib_dma_unmap_sg(dev, prot_sg, prot_sg_cnt, dir); +out_unmap_sg: + ib_dma_unmap_sg(dev, sg, sg_cnt, dir); + return ret; +} +EXPORT_SYMBOL(rdma_rw_ctx_signature_init); + +/* + * Now that we are going to post the WRs we can update the lkey and need_inval + * state on the MRs. If we were doing this at init time, we would get double + * or missing invalidations if a context was initialized but not actually + * posted. + */ +static void rdma_rw_update_lkey(struct rdma_rw_reg_ctx *reg, bool need_inval) +{ + reg->mr->need_inval = need_inval; + ib_update_fast_reg_key(reg->mr, ib_inc_rkey(reg->mr->lkey)); + reg->reg_wr.key = reg->mr->lkey; + reg->sge.lkey = reg->mr->lkey; +} + +/** + * rdma_rw_ctx_wrs - return chain of WRs for a RDMA READ or WRITE operation + * @ctx: context to operate on + * @qp: queue pair to operate on + * @port_num: port num to which the connection is bound + * @cqe: completion queue entry for the last WR + * @chain_wr: WR to append to the posted chain + * + * Return the WR chain for the set of RDMA READ/WRITE operations described by + * @ctx, as well as any memory registration operations needed. If @chain_wr + * is non-NULL the WR it points to will be appended to the chain of WRs posted. + * If @chain_wr is not set @cqe must be set so that the caller gets a + * completion notification. + */ +struct ib_send_wr *rdma_rw_ctx_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + u8 port_num, struct ib_cqe *cqe, struct ib_send_wr *chain_wr) +{ + struct ib_send_wr *first_wr, *last_wr; + int i; + + switch (ctx->type) { + case RDMA_RW_SIG_MR: + rdma_rw_update_lkey(&ctx->sig->data, true); + if (ctx->sig->prot.mr) + rdma_rw_update_lkey(&ctx->sig->prot, true); + + ctx->sig->sig_mr->need_inval = true; + ib_update_fast_reg_key(ctx->sig->sig_mr, + ib_inc_rkey(ctx->sig->sig_mr->lkey)); + ctx->sig->sig_sge.lkey = ctx->sig->sig_mr->lkey; + + if (ctx->sig->data.inv_wr.next) + first_wr = &ctx->sig->data.inv_wr; + else + first_wr = &ctx->sig->data.reg_wr.wr; + last_wr = &ctx->sig->data.wr.wr; + break; + case RDMA_RW_MR: + for (i = 0; i < ctx->nr_ops; i++) { + rdma_rw_update_lkey(&ctx->reg[i], + ctx->reg[i].wr.wr.opcode != + IB_WR_RDMA_READ_WITH_INV); + } + + if (ctx->reg[0].inv_wr.next) + first_wr = &ctx->reg[0].inv_wr; + else + first_wr = &ctx->reg[0].reg_wr.wr; + last_wr = &ctx->reg[ctx->nr_ops - 1].wr.wr; + break; + case RDMA_RW_MULTI_WR: + first_wr = &ctx->map.wrs[0].wr; + last_wr = &ctx->map.wrs[ctx->nr_ops - 1].wr; + break; + case RDMA_RW_SINGLE_WR: + first_wr = &ctx->single.wr.wr; + last_wr = &ctx->single.wr.wr; + break; + default: + BUG(); + } + + if (chain_wr) { + last_wr->next = chain_wr; + } else { + last_wr->wr_cqe = cqe; + last_wr->send_flags |= IB_SEND_SIGNALED; + } + + return first_wr; +} +EXPORT_SYMBOL(rdma_rw_ctx_wrs); + +/** + * rdma_rw_ctx_post - post a RDMA READ or RDMA WRITE operation + * @ctx: context to operate on + * @qp: queue pair to operate on + * @port_num: port num to which the connection is bound + * @cqe: completion queue entry for the last WR + * @chain_wr: WR to append to the posted chain + * + * Post the set of RDMA READ/WRITE operations described by @ctx, as well as + * any memory registration operations needed. If @chain_wr is non-NULL the + * WR it points to will be appended to the chain of WRs posted. If @chain_wr + * is not set @cqe must be set so that the caller gets a completion + * notification. + */ +int rdma_rw_ctx_post(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num, + struct ib_cqe *cqe, struct ib_send_wr *chain_wr) +{ + struct ib_send_wr *first_wr, *bad_wr; + + first_wr = rdma_rw_ctx_wrs(ctx, qp, port_num, cqe, chain_wr); + return ib_post_send(qp, first_wr, &bad_wr); +} +EXPORT_SYMBOL(rdma_rw_ctx_post); + +/** + * rdma_rw_ctx_destroy - release all resources allocated by rdma_rw_ctx_init + * @ctx: context to release + * @qp: queue pair to operate on + * @port_num: port num to which the connection is bound + * @sg: scatterlist that was used for the READ/WRITE + * @sg_cnt: number of entries in @sg + * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ + */ +void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num, + struct scatterlist *sg, u32 sg_cnt, enum dma_data_direction dir) +{ + int i; + + switch (ctx->type) { + case RDMA_RW_MR: + for (i = 0; i < ctx->nr_ops; i++) + ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr); + kfree(ctx->reg); + break; + case RDMA_RW_MULTI_WR: + kfree(ctx->map.wrs); + kfree(ctx->map.sges); + break; + case RDMA_RW_SINGLE_WR: + break; + default: + BUG(); + break; + } + + ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir); +} +EXPORT_SYMBOL(rdma_rw_ctx_destroy); + +/** + * rdma_rw_ctx_destroy_signature - release all resources allocated by + * rdma_rw_ctx_init_signature + * @ctx: context to release + * @qp: queue pair to operate on + * @port_num: port num to which the connection is bound + * @sg: scatterlist that was used for the READ/WRITE + * @sg_cnt: number of entries in @sg + * @prot_sg: scatterlist that was used for the READ/WRITE of the PI + * @prot_sg_cnt: number of entries in @prot_sg + * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ + */ +void rdma_rw_ctx_destroy_signature(struct rdma_rw_ctx *ctx, struct ib_qp *qp, + u8 port_num, struct scatterlist *sg, u32 sg_cnt, + struct scatterlist *prot_sg, u32 prot_sg_cnt, + enum dma_data_direction dir) +{ + if (WARN_ON_ONCE(ctx->type != RDMA_RW_SIG_MR)) + return; + + ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->data.mr); + ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir); + + if (ctx->sig->prot.mr) { + ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->sig->prot.mr); + ib_dma_unmap_sg(qp->pd->device, prot_sg, prot_sg_cnt, dir); + } + + ib_mr_pool_put(qp, &qp->sig_mrs, ctx->sig->sig_mr); + kfree(ctx->sig); +} +EXPORT_SYMBOL(rdma_rw_ctx_destroy_signature); + +void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr) +{ + u32 factor; + + WARN_ON_ONCE(attr->port_num == 0); + + /* + * Each context needs at least one RDMA READ or WRITE WR. + * + * For some hardware we might need more, eventually we should ask the + * HCA driver for a multiplier here. + */ + factor = 1; + + /* + * If the devices needs MRs to perform RDMA READ or WRITE operations, + * we'll need two additional MRs for the registrations and the + * invalidation. + */ + if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN) + factor += 6; /* (inv + reg) * (data + prot + sig) */ + else if (rdma_rw_can_use_mr(dev, attr->port_num)) + factor += 2; /* inv + reg */ + + attr->cap.max_send_wr += factor * attr->cap.max_rdma_ctxs; + + /* + * But maybe we were just too high in the sky and the device doesn't + * even support all we need, and we'll have to live with what we get.. + */ + attr->cap.max_send_wr = + min_t(u32, attr->cap.max_send_wr, dev->attrs.max_qp_wr); +} + +int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr) +{ + struct ib_device *dev = qp->pd->device; + u32 nr_mrs = 0, nr_sig_mrs = 0; + int ret = 0; + + if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN) { + nr_sig_mrs = attr->cap.max_rdma_ctxs; + nr_mrs = attr->cap.max_rdma_ctxs * 2; + } else if (rdma_rw_can_use_mr(dev, attr->port_num)) { + nr_mrs = attr->cap.max_rdma_ctxs; + } + + if (nr_mrs) { + ret = ib_mr_pool_init(qp, &qp->rdma_mrs, nr_mrs, + IB_MR_TYPE_MEM_REG, + rdma_rw_fr_page_list_len(dev)); + if (ret) { + pr_err("%s: failed to allocated %d MRs\n", + __func__, nr_mrs); + return ret; + } + } + + if (nr_sig_mrs) { + ret = ib_mr_pool_init(qp, &qp->sig_mrs, nr_sig_mrs, + IB_MR_TYPE_SIGNATURE, 2); + if (ret) { + pr_err("%s: failed to allocated %d SIG MRs\n", + __func__, nr_mrs); + goto out_free_rdma_mrs; + } + } + + return 0; + +out_free_rdma_mrs: + ib_mr_pool_destroy(qp, &qp->rdma_mrs); + return ret; +} + +void rdma_rw_cleanup_mrs(struct ib_qp *qp) +{ + ib_mr_pool_destroy(qp, &qp->sig_mrs); + ib_mr_pool_destroy(qp, &qp->rdma_mrs); +} diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 8a09c0fb2..e95538650 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -53,10 +53,6 @@ #include "sa.h" #include "core_priv.h" -MODULE_AUTHOR("Roland Dreier"); -MODULE_DESCRIPTION("InfiniBand subnet administration query support"); -MODULE_LICENSE("Dual BSD/GPL"); - #define IB_SA_LOCAL_SVC_TIMEOUT_MIN 100 #define IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT 2000 #define IB_SA_LOCAL_SVC_TIMEOUT_MAX 200000 @@ -119,6 +115,12 @@ struct ib_sa_guidinfo_query { struct ib_sa_query sa_query; }; +struct ib_sa_classport_info_query { + void (*callback)(int, struct ib_class_port_info *, void *); + void *context; + struct ib_sa_query sa_query; +}; + struct ib_sa_mcmember_query { void (*callback)(int, struct ib_sa_mcmember_rec *, void *); void *context; @@ -392,6 +394,82 @@ static const struct ib_field service_rec_table[] = { .size_bits = 2*64 }, }; +#define CLASSPORTINFO_REC_FIELD(field) \ + .struct_offset_bytes = offsetof(struct ib_class_port_info, field), \ + .struct_size_bytes = sizeof((struct ib_class_port_info *)0)->field, \ + .field_name = "ib_class_port_info:" #field + +static const struct ib_field classport_info_rec_table[] = { + { CLASSPORTINFO_REC_FIELD(base_version), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 8 }, + { CLASSPORTINFO_REC_FIELD(class_version), + .offset_words = 0, + .offset_bits = 8, + .size_bits = 8 }, + { CLASSPORTINFO_REC_FIELD(capability_mask), + .offset_words = 0, + .offset_bits = 16, + .size_bits = 16 }, + { CLASSPORTINFO_REC_FIELD(cap_mask2_resp_time), + .offset_words = 1, + .offset_bits = 0, + .size_bits = 32 }, + { CLASSPORTINFO_REC_FIELD(redirect_gid), + .offset_words = 2, + .offset_bits = 0, + .size_bits = 128 }, + { CLASSPORTINFO_REC_FIELD(redirect_tcslfl), + .offset_words = 6, + .offset_bits = 0, + .size_bits = 32 }, + { CLASSPORTINFO_REC_FIELD(redirect_lid), + .offset_words = 7, + .offset_bits = 0, + .size_bits = 16 }, + { CLASSPORTINFO_REC_FIELD(redirect_pkey), + .offset_words = 7, + .offset_bits = 16, + .size_bits = 16 }, + + { CLASSPORTINFO_REC_FIELD(redirect_qp), + .offset_words = 8, + .offset_bits = 0, + .size_bits = 32 }, + { CLASSPORTINFO_REC_FIELD(redirect_qkey), + .offset_words = 9, + .offset_bits = 0, + .size_bits = 32 }, + + { CLASSPORTINFO_REC_FIELD(trap_gid), + .offset_words = 10, + .offset_bits = 0, + .size_bits = 128 }, + { CLASSPORTINFO_REC_FIELD(trap_tcslfl), + .offset_words = 14, + .offset_bits = 0, + .size_bits = 32 }, + + { CLASSPORTINFO_REC_FIELD(trap_lid), + .offset_words = 15, + .offset_bits = 0, + .size_bits = 16 }, + { CLASSPORTINFO_REC_FIELD(trap_pkey), + .offset_words = 15, + .offset_bits = 16, + .size_bits = 16 }, + + { CLASSPORTINFO_REC_FIELD(trap_hlqp), + .offset_words = 16, + .offset_bits = 0, + .size_bits = 32 }, + { CLASSPORTINFO_REC_FIELD(trap_qkey), + .offset_words = 17, + .offset_bits = 0, + .size_bits = 32 }, +}; + #define GUIDINFO_REC_FIELD(field) \ .struct_offset_bytes = offsetof(struct ib_sa_guidinfo_rec, field), \ .struct_size_bytes = sizeof((struct ib_sa_guidinfo_rec *) 0)->field, \ @@ -536,7 +614,7 @@ static int ib_nl_send_msg(struct ib_sa_query *query, gfp_t gfp_mask) data = ibnl_put_msg(skb, &nlh, query->seq, 0, RDMA_NL_LS, RDMA_NL_LS_OP_RESOLVE, NLM_F_REQUEST); if (!data) { - kfree_skb(skb); + nlmsg_free(skb); return -EMSGSIZE; } @@ -705,8 +783,8 @@ static void ib_nl_request_timeout(struct work_struct *work) spin_unlock_irqrestore(&ib_nl_request_lock, flags); } -static int ib_nl_handle_set_timeout(struct sk_buff *skb, - struct netlink_callback *cb) +int ib_nl_handle_set_timeout(struct sk_buff *skb, + struct netlink_callback *cb) { const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh; int timeout, delta, abs_delta; @@ -782,8 +860,8 @@ static inline int ib_nl_is_good_resolve_resp(const struct nlmsghdr *nlh) return 1; } -static int ib_nl_handle_resolve_resp(struct sk_buff *skb, - struct netlink_callback *cb) +int ib_nl_handle_resolve_resp(struct sk_buff *skb, + struct netlink_callback *cb) { const struct nlmsghdr *nlh = (struct nlmsghdr *)cb->nlh; unsigned long flags; @@ -838,15 +916,6 @@ resp_out: return skb->len; } -static struct ibnl_client_cbs ib_sa_cb_table[] = { - [RDMA_NL_LS_OP_RESOLVE] = { - .dump = ib_nl_handle_resolve_resp, - .module = THIS_MODULE }, - [RDMA_NL_LS_OP_SET_TIMEOUT] = { - .dump = ib_nl_handle_set_timeout, - .module = THIS_MODULE }, -}; - static void free_sm_ah(struct kref *kref) { struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref); @@ -1645,6 +1714,97 @@ err1: } EXPORT_SYMBOL(ib_sa_guid_info_rec_query); +/* Support get SA ClassPortInfo */ +static void ib_sa_classport_info_rec_callback(struct ib_sa_query *sa_query, + int status, + struct ib_sa_mad *mad) +{ + struct ib_sa_classport_info_query *query = + container_of(sa_query, struct ib_sa_classport_info_query, sa_query); + + if (mad) { + struct ib_class_port_info rec; + + ib_unpack(classport_info_rec_table, + ARRAY_SIZE(classport_info_rec_table), + mad->data, &rec); + query->callback(status, &rec, query->context); + } else { + query->callback(status, NULL, query->context); + } +} + +static void ib_sa_portclass_info_rec_release(struct ib_sa_query *sa_query) +{ + kfree(container_of(sa_query, struct ib_sa_classport_info_query, + sa_query)); +} + +int ib_sa_classport_info_rec_query(struct ib_sa_client *client, + struct ib_device *device, u8 port_num, + int timeout_ms, gfp_t gfp_mask, + void (*callback)(int status, + struct ib_class_port_info *resp, + void *context), + void *context, + struct ib_sa_query **sa_query) +{ + struct ib_sa_classport_info_query *query; + struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); + struct ib_sa_port *port; + struct ib_mad_agent *agent; + struct ib_sa_mad *mad; + int ret; + + if (!sa_dev) + return -ENODEV; + + port = &sa_dev->port[port_num - sa_dev->start_port]; + agent = port->agent; + + query = kzalloc(sizeof(*query), gfp_mask); + if (!query) + return -ENOMEM; + + query->sa_query.port = port; + ret = alloc_mad(&query->sa_query, gfp_mask); + if (ret) + goto err1; + + ib_sa_client_get(client); + query->sa_query.client = client; + query->callback = callback; + query->context = context; + + mad = query->sa_query.mad_buf->mad; + init_mad(mad, agent); + + query->sa_query.callback = callback ? ib_sa_classport_info_rec_callback : NULL; + + query->sa_query.release = ib_sa_portclass_info_rec_release; + /* support GET only */ + mad->mad_hdr.method = IB_MGMT_METHOD_GET; + mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_CLASS_PORTINFO); + mad->sa_hdr.comp_mask = 0; + *sa_query = &query->sa_query; + + ret = send_mad(&query->sa_query, timeout_ms, gfp_mask); + if (ret < 0) + goto err2; + + return ret; + +err2: + *sa_query = NULL; + ib_sa_client_put(query->sa_query.client); + free_mad(&query->sa_query); + +err1: + kfree(query); + return ret; +} +EXPORT_SYMBOL(ib_sa_classport_info_rec_query); + static void send_handler(struct ib_mad_agent *agent, struct ib_mad_send_wc *mad_send_wc) { @@ -1794,7 +1954,7 @@ static void ib_sa_remove_one(struct ib_device *device, void *client_data) kfree(sa_dev); } -static int __init ib_sa_init(void) +int ib_sa_init(void) { int ret; @@ -1820,17 +1980,10 @@ static int __init ib_sa_init(void) goto err3; } - if (ibnl_add_client(RDMA_NL_LS, RDMA_NL_LS_NUM_OPS, - ib_sa_cb_table)) { - pr_err("Failed to add netlink callback\n"); - ret = -EINVAL; - goto err4; - } INIT_DELAYED_WORK(&ib_nl_timed_work, ib_nl_request_timeout); return 0; -err4: - destroy_workqueue(ib_nl_wq); + err3: mcast_cleanup(); err2: @@ -1839,9 +1992,8 @@ err1: return ret; } -static void __exit ib_sa_cleanup(void) +void ib_sa_cleanup(void) { - ibnl_remove_client(RDMA_NL_LS); cancel_delayed_work(&ib_nl_timed_work); flush_workqueue(ib_nl_wq); destroy_workqueue(ib_nl_wq); @@ -1849,6 +2001,3 @@ static void __exit ib_sa_cleanup(void) ib_unregister_client(&sa_client); idr_destroy(&query_idr); } - -module_init(ib_sa_init); -module_exit(ib_sa_cleanup); diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 14606afbf..60df4f8e8 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -56,8 +56,10 @@ struct ib_port { struct gid_attr_group *gid_attr_group; struct attribute_group gid_group; struct attribute_group pkey_group; - u8 port_num; struct attribute_group *pma_table; + struct attribute_group *hw_stats_ag; + struct rdma_hw_stats *hw_stats; + u8 port_num; }; struct port_attribute { @@ -80,6 +82,18 @@ struct port_table_attribute { __be16 attr_id; }; +struct hw_stats_attribute { + struct attribute attr; + ssize_t (*show)(struct kobject *kobj, + struct attribute *attr, char *buf); + ssize_t (*store)(struct kobject *kobj, + struct attribute *attr, + const char *buf, + size_t count); + int index; + u8 port_num; +}; + static ssize_t port_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { @@ -516,6 +530,7 @@ static PORT_PMA_ATTR(port_xmit_data , 12, 32, 192); static PORT_PMA_ATTR(port_rcv_data , 13, 32, 224); static PORT_PMA_ATTR(port_xmit_packets , 14, 32, 256); static PORT_PMA_ATTR(port_rcv_packets , 15, 32, 288); +static PORT_PMA_ATTR(port_xmit_wait , 0, 32, 320); /* * Counters added by extended set @@ -546,6 +561,7 @@ static struct attribute *pma_attrs[] = { &port_pma_attr_port_rcv_data.attr.attr, &port_pma_attr_port_xmit_packets.attr.attr, &port_pma_attr_port_rcv_packets.attr.attr, + &port_pma_attr_port_xmit_wait.attr.attr, NULL }; @@ -565,6 +581,7 @@ static struct attribute *pma_attrs_ext[] = { &port_pma_attr_ext_port_xmit_data.attr.attr, &port_pma_attr_ext_port_rcv_data.attr.attr, &port_pma_attr_ext_port_xmit_packets.attr.attr, + &port_pma_attr_port_xmit_wait.attr.attr, &port_pma_attr_ext_port_rcv_packets.attr.attr, &port_pma_attr_ext_unicast_rcv_packets.attr.attr, &port_pma_attr_ext_unicast_xmit_packets.attr.attr, @@ -590,6 +607,7 @@ static struct attribute *pma_attrs_noietf[] = { &port_pma_attr_ext_port_rcv_data.attr.attr, &port_pma_attr_ext_port_xmit_packets.attr.attr, &port_pma_attr_ext_port_rcv_packets.attr.attr, + &port_pma_attr_port_xmit_wait.attr.attr, NULL }; @@ -733,6 +751,220 @@ static struct attribute_group *get_counter_table(struct ib_device *dev, return &pma_group; } +static int update_hw_stats(struct ib_device *dev, struct rdma_hw_stats *stats, + u8 port_num, int index) +{ + int ret; + + if (time_is_after_eq_jiffies(stats->timestamp + stats->lifespan)) + return 0; + ret = dev->get_hw_stats(dev, stats, port_num, index); + if (ret < 0) + return ret; + if (ret == stats->num_counters) + stats->timestamp = jiffies; + + return 0; +} + +static ssize_t print_hw_stat(struct rdma_hw_stats *stats, int index, char *buf) +{ + return sprintf(buf, "%llu\n", stats->value[index]); +} + +static ssize_t show_hw_stats(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct ib_device *dev; + struct ib_port *port; + struct hw_stats_attribute *hsa; + struct rdma_hw_stats *stats; + int ret; + + hsa = container_of(attr, struct hw_stats_attribute, attr); + if (!hsa->port_num) { + dev = container_of((struct device *)kobj, + struct ib_device, dev); + stats = dev->hw_stats; + } else { + port = container_of(kobj, struct ib_port, kobj); + dev = port->ibdev; + stats = port->hw_stats; + } + ret = update_hw_stats(dev, stats, hsa->port_num, hsa->index); + if (ret) + return ret; + return print_hw_stat(stats, hsa->index, buf); +} + +static ssize_t show_stats_lifespan(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct hw_stats_attribute *hsa; + int msecs; + + hsa = container_of(attr, struct hw_stats_attribute, attr); + if (!hsa->port_num) { + struct ib_device *dev = container_of((struct device *)kobj, + struct ib_device, dev); + msecs = jiffies_to_msecs(dev->hw_stats->lifespan); + } else { + struct ib_port *p = container_of(kobj, struct ib_port, kobj); + msecs = jiffies_to_msecs(p->hw_stats->lifespan); + } + return sprintf(buf, "%d\n", msecs); +} + +static ssize_t set_stats_lifespan(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t count) +{ + struct hw_stats_attribute *hsa; + int msecs; + int jiffies; + int ret; + + ret = kstrtoint(buf, 10, &msecs); + if (ret) + return ret; + if (msecs < 0 || msecs > 10000) + return -EINVAL; + jiffies = msecs_to_jiffies(msecs); + hsa = container_of(attr, struct hw_stats_attribute, attr); + if (!hsa->port_num) { + struct ib_device *dev = container_of((struct device *)kobj, + struct ib_device, dev); + dev->hw_stats->lifespan = jiffies; + } else { + struct ib_port *p = container_of(kobj, struct ib_port, kobj); + p->hw_stats->lifespan = jiffies; + } + return count; +} + +static void free_hsag(struct kobject *kobj, struct attribute_group *attr_group) +{ + struct attribute **attr; + + sysfs_remove_group(kobj, attr_group); + + for (attr = attr_group->attrs; *attr; attr++) + kfree(*attr); + kfree(attr_group); +} + +static struct attribute *alloc_hsa(int index, u8 port_num, const char *name) +{ + struct hw_stats_attribute *hsa; + + hsa = kmalloc(sizeof(*hsa), GFP_KERNEL); + if (!hsa) + return NULL; + + hsa->attr.name = (char *)name; + hsa->attr.mode = S_IRUGO; + hsa->show = show_hw_stats; + hsa->store = NULL; + hsa->index = index; + hsa->port_num = port_num; + + return &hsa->attr; +} + +static struct attribute *alloc_hsa_lifespan(char *name, u8 port_num) +{ + struct hw_stats_attribute *hsa; + + hsa = kmalloc(sizeof(*hsa), GFP_KERNEL); + if (!hsa) + return NULL; + + hsa->attr.name = name; + hsa->attr.mode = S_IWUSR | S_IRUGO; + hsa->show = show_stats_lifespan; + hsa->store = set_stats_lifespan; + hsa->index = 0; + hsa->port_num = port_num; + + return &hsa->attr; +} + +static void setup_hw_stats(struct ib_device *device, struct ib_port *port, + u8 port_num) +{ + struct attribute_group *hsag; + struct rdma_hw_stats *stats; + int i, ret; + + stats = device->alloc_hw_stats(device, port_num); + + if (!stats) + return; + + if (!stats->names || stats->num_counters <= 0) + goto err_free_stats; + + /* + * Two extra attribue elements here, one for the lifespan entry and + * one to NULL terminate the list for the sysfs core code + */ + hsag = kzalloc(sizeof(*hsag) + + sizeof(void *) * (stats->num_counters + 2), + GFP_KERNEL); + if (!hsag) + goto err_free_stats; + + ret = device->get_hw_stats(device, stats, port_num, + stats->num_counters); + if (ret != stats->num_counters) + goto err_free_hsag; + + stats->timestamp = jiffies; + + hsag->name = "hw_counters"; + hsag->attrs = (void *)hsag + sizeof(*hsag); + + for (i = 0; i < stats->num_counters; i++) { + hsag->attrs[i] = alloc_hsa(i, port_num, stats->names[i]); + if (!hsag->attrs[i]) + goto err; + sysfs_attr_init(hsag->attrs[i]); + } + + /* treat an error here as non-fatal */ + hsag->attrs[i] = alloc_hsa_lifespan("lifespan", port_num); + if (hsag->attrs[i]) + sysfs_attr_init(hsag->attrs[i]); + + if (port) { + struct kobject *kobj = &port->kobj; + ret = sysfs_create_group(kobj, hsag); + if (ret) + goto err; + port->hw_stats_ag = hsag; + port->hw_stats = stats; + } else { + struct kobject *kobj = &device->dev.kobj; + ret = sysfs_create_group(kobj, hsag); + if (ret) + goto err; + device->hw_stats_ag = hsag; + device->hw_stats = stats; + } + + return; + +err: + for (; i >= 0; i--) + kfree(hsag->attrs[i]); +err_free_hsag: + kfree(hsag); +err_free_stats: + kfree(stats); + return; +} + static int add_port(struct ib_device *device, int port_num, int (*port_callback)(struct ib_device *, u8, struct kobject *)) @@ -835,6 +1067,14 @@ static int add_port(struct ib_device *device, int port_num, goto err_remove_pkey; } + /* + * If port == 0, it means we have only one port and the parent + * device, not this port device, should be the holder of the + * hw_counters + */ + if (device->alloc_hw_stats && port_num) + setup_hw_stats(device, p, port_num); + list_add_tail(&p->kobj.entry, &device->port_list); kobject_uevent(&p->kobj, KOBJ_ADD); @@ -972,120 +1212,6 @@ static struct device_attribute *ib_class_attributes[] = { &dev_attr_node_desc }; -/* Show a given an attribute in the statistics group */ -static ssize_t show_protocol_stat(const struct device *device, - struct device_attribute *attr, char *buf, - unsigned offset) -{ - struct ib_device *dev = container_of(device, struct ib_device, dev); - union rdma_protocol_stats stats; - ssize_t ret; - - ret = dev->get_protocol_stats(dev, &stats); - if (ret) - return ret; - - return sprintf(buf, "%llu\n", - (unsigned long long) ((u64 *) &stats)[offset]); -} - -/* generate a read-only iwarp statistics attribute */ -#define IW_STATS_ENTRY(name) \ -static ssize_t show_##name(struct device *device, \ - struct device_attribute *attr, char *buf) \ -{ \ - return show_protocol_stat(device, attr, buf, \ - offsetof(struct iw_protocol_stats, name) / \ - sizeof (u64)); \ -} \ -static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) - -IW_STATS_ENTRY(ipInReceives); -IW_STATS_ENTRY(ipInHdrErrors); -IW_STATS_ENTRY(ipInTooBigErrors); -IW_STATS_ENTRY(ipInNoRoutes); -IW_STATS_ENTRY(ipInAddrErrors); -IW_STATS_ENTRY(ipInUnknownProtos); -IW_STATS_ENTRY(ipInTruncatedPkts); -IW_STATS_ENTRY(ipInDiscards); -IW_STATS_ENTRY(ipInDelivers); -IW_STATS_ENTRY(ipOutForwDatagrams); -IW_STATS_ENTRY(ipOutRequests); -IW_STATS_ENTRY(ipOutDiscards); -IW_STATS_ENTRY(ipOutNoRoutes); -IW_STATS_ENTRY(ipReasmTimeout); -IW_STATS_ENTRY(ipReasmReqds); -IW_STATS_ENTRY(ipReasmOKs); -IW_STATS_ENTRY(ipReasmFails); -IW_STATS_ENTRY(ipFragOKs); -IW_STATS_ENTRY(ipFragFails); -IW_STATS_ENTRY(ipFragCreates); -IW_STATS_ENTRY(ipInMcastPkts); -IW_STATS_ENTRY(ipOutMcastPkts); -IW_STATS_ENTRY(ipInBcastPkts); -IW_STATS_ENTRY(ipOutBcastPkts); -IW_STATS_ENTRY(tcpRtoAlgorithm); -IW_STATS_ENTRY(tcpRtoMin); -IW_STATS_ENTRY(tcpRtoMax); -IW_STATS_ENTRY(tcpMaxConn); -IW_STATS_ENTRY(tcpActiveOpens); -IW_STATS_ENTRY(tcpPassiveOpens); -IW_STATS_ENTRY(tcpAttemptFails); -IW_STATS_ENTRY(tcpEstabResets); -IW_STATS_ENTRY(tcpCurrEstab); -IW_STATS_ENTRY(tcpInSegs); -IW_STATS_ENTRY(tcpOutSegs); -IW_STATS_ENTRY(tcpRetransSegs); -IW_STATS_ENTRY(tcpInErrs); -IW_STATS_ENTRY(tcpOutRsts); - -static struct attribute *iw_proto_stats_attrs[] = { - &dev_attr_ipInReceives.attr, - &dev_attr_ipInHdrErrors.attr, - &dev_attr_ipInTooBigErrors.attr, - &dev_attr_ipInNoRoutes.attr, - &dev_attr_ipInAddrErrors.attr, - &dev_attr_ipInUnknownProtos.attr, - &dev_attr_ipInTruncatedPkts.attr, - &dev_attr_ipInDiscards.attr, - &dev_attr_ipInDelivers.attr, - &dev_attr_ipOutForwDatagrams.attr, - &dev_attr_ipOutRequests.attr, - &dev_attr_ipOutDiscards.attr, - &dev_attr_ipOutNoRoutes.attr, - &dev_attr_ipReasmTimeout.attr, - &dev_attr_ipReasmReqds.attr, - &dev_attr_ipReasmOKs.attr, - &dev_attr_ipReasmFails.attr, - &dev_attr_ipFragOKs.attr, - &dev_attr_ipFragFails.attr, - &dev_attr_ipFragCreates.attr, - &dev_attr_ipInMcastPkts.attr, - &dev_attr_ipOutMcastPkts.attr, - &dev_attr_ipInBcastPkts.attr, - &dev_attr_ipOutBcastPkts.attr, - &dev_attr_tcpRtoAlgorithm.attr, - &dev_attr_tcpRtoMin.attr, - &dev_attr_tcpRtoMax.attr, - &dev_attr_tcpMaxConn.attr, - &dev_attr_tcpActiveOpens.attr, - &dev_attr_tcpPassiveOpens.attr, - &dev_attr_tcpAttemptFails.attr, - &dev_attr_tcpEstabResets.attr, - &dev_attr_tcpCurrEstab.attr, - &dev_attr_tcpInSegs.attr, - &dev_attr_tcpOutSegs.attr, - &dev_attr_tcpRetransSegs.attr, - &dev_attr_tcpInErrs.attr, - &dev_attr_tcpOutRsts.attr, - NULL -}; - -static struct attribute_group iw_stats_group = { - .name = "proto_stats", - .attrs = iw_proto_stats_attrs, -}; - static void free_port_list_attributes(struct ib_device *device) { struct kobject *p, *t; @@ -1093,6 +1219,10 @@ static void free_port_list_attributes(struct ib_device *device) list_for_each_entry_safe(p, t, &device->port_list, entry) { struct ib_port *port = container_of(p, struct ib_port, kobj); list_del(&p->entry); + if (port->hw_stats) { + kfree(port->hw_stats); + free_hsag(&port->kobj, port->hw_stats_ag); + } sysfs_remove_group(p, port->pma_table); sysfs_remove_group(p, &port->pkey_group); sysfs_remove_group(p, &port->gid_group); @@ -1149,11 +1279,8 @@ int ib_device_register_sysfs(struct ib_device *device, } } - if (device->node_type == RDMA_NODE_RNIC && device->get_protocol_stats) { - ret = sysfs_create_group(&class_dev->kobj, &iw_stats_group); - if (ret) - goto err_put; - } + if (device->alloc_hw_stats) + setup_hw_stats(device, NULL, 0); return 0; @@ -1169,15 +1296,18 @@ err: void ib_device_unregister_sysfs(struct ib_device *device) { - /* Hold kobject until ib_dealloc_device() */ - struct kobject *kobj_dev = kobject_get(&device->dev.kobj); int i; - if (device->node_type == RDMA_NODE_RNIC && device->get_protocol_stats) - sysfs_remove_group(kobj_dev, &iw_stats_group); + /* Hold kobject until ib_dealloc_device() */ + kobject_get(&device->dev.kobj); free_port_list_attributes(device); + if (device->hw_stats) { + kfree(device->hw_stats); + free_hsag(&device->dev.kobj, device->hw_stats_ag); + } + for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) device_remove_file(&device->dev, ib_class_attributes[i]); diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 6fdc7ecda..825021d10 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1747,7 +1747,7 @@ static int create_qp(struct ib_uverbs_file *file, struct ib_srq *srq = NULL; struct ib_qp *qp; char *buf; - struct ib_qp_init_attr attr; + struct ib_qp_init_attr attr = {}; struct ib_uverbs_ex_create_qp_resp resp; int ret; @@ -1833,7 +1833,8 @@ static int create_qp(struct ib_uverbs_file *file, if (attr.create_flags & ~(IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK | IB_QP_CREATE_CROSS_CHANNEL | IB_QP_CREATE_MANAGED_SEND | - IB_QP_CREATE_MANAGED_RECV)) { + IB_QP_CREATE_MANAGED_RECV | + IB_QP_CREATE_SCATTER_FCS)) { ret = -EINVAL; goto err_put; } @@ -3088,8 +3089,7 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, if (cmd.comp_mask) return -EINVAL; - if ((cmd.flow_attr.type == IB_FLOW_ATTR_SNIFFER && - !capable(CAP_NET_ADMIN)) || !capable(CAP_NET_RAW)) + if (!capable(CAP_NET_RAW)) return -EPERM; if (cmd.flow_attr.flags >= IB_FLOW_ATTR_FLAGS_RESERVED) @@ -3655,6 +3655,11 @@ int ib_uverbs_ex_query_device(struct ib_uverbs_file *file, resp.hca_core_clock = attr.hca_core_clock; resp.response_length += sizeof(resp.hca_core_clock); + if (ucore->outlen < resp.response_length + sizeof(resp.device_cap_flags_ex)) + goto end; + + resp.device_cap_flags_ex = attr.device_cap_flags; + resp.response_length += sizeof(resp.device_cap_flags_ex); end: err = ib_copy_to_udata(ucore, &resp, resp.response_length); return err; diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index b65b3541e..6298f54b4 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -48,6 +48,7 @@ #include <rdma/ib_verbs.h> #include <rdma/ib_cache.h> #include <rdma/ib_addr.h> +#include <rdma/rw.h> #include "core_priv.h" @@ -510,12 +511,16 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, ah_attr->grh.dgid = sgid; if (!rdma_cap_eth_ah(device, port_num)) { - ret = ib_find_cached_gid_by_port(device, &dgid, - IB_GID_TYPE_IB, - port_num, NULL, - &gid_index); - if (ret) - return ret; + if (dgid.global.interface_id != cpu_to_be64(IB_SA_WELL_KNOWN_GUID)) { + ret = ib_find_cached_gid_by_port(device, &dgid, + IB_GID_TYPE_IB, + port_num, NULL, + &gid_index); + if (ret) + return ret; + } else { + gid_index = 0; + } } ah_attr->grh.sgid_index = (u8) gid_index; @@ -723,59 +728,89 @@ struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd, } EXPORT_SYMBOL(ib_open_qp); +static struct ib_qp *ib_create_xrc_qp(struct ib_qp *qp, + struct ib_qp_init_attr *qp_init_attr) +{ + struct ib_qp *real_qp = qp; + + qp->event_handler = __ib_shared_qp_event_handler; + qp->qp_context = qp; + qp->pd = NULL; + qp->send_cq = qp->recv_cq = NULL; + qp->srq = NULL; + qp->xrcd = qp_init_attr->xrcd; + atomic_inc(&qp_init_attr->xrcd->usecnt); + INIT_LIST_HEAD(&qp->open_list); + + qp = __ib_open_qp(real_qp, qp_init_attr->event_handler, + qp_init_attr->qp_context); + if (!IS_ERR(qp)) + __ib_insert_xrcd_qp(qp_init_attr->xrcd, real_qp); + else + real_qp->device->destroy_qp(real_qp); + return qp; +} + struct ib_qp *ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *qp_init_attr) { - struct ib_qp *qp, *real_qp; - struct ib_device *device; + struct ib_device *device = pd ? pd->device : qp_init_attr->xrcd->device; + struct ib_qp *qp; + int ret; + + /* + * If the callers is using the RDMA API calculate the resources + * needed for the RDMA READ/WRITE operations. + * + * Note that these callers need to pass in a port number. + */ + if (qp_init_attr->cap.max_rdma_ctxs) + rdma_rw_init_qp(device, qp_init_attr); - device = pd ? pd->device : qp_init_attr->xrcd->device; qp = device->create_qp(pd, qp_init_attr, NULL); + if (IS_ERR(qp)) + return qp; + + qp->device = device; + qp->real_qp = qp; + qp->uobject = NULL; + qp->qp_type = qp_init_attr->qp_type; + + atomic_set(&qp->usecnt, 0); + qp->mrs_used = 0; + spin_lock_init(&qp->mr_lock); + INIT_LIST_HEAD(&qp->rdma_mrs); + INIT_LIST_HEAD(&qp->sig_mrs); + + if (qp_init_attr->qp_type == IB_QPT_XRC_TGT) + return ib_create_xrc_qp(qp, qp_init_attr); + + qp->event_handler = qp_init_attr->event_handler; + qp->qp_context = qp_init_attr->qp_context; + if (qp_init_attr->qp_type == IB_QPT_XRC_INI) { + qp->recv_cq = NULL; + qp->srq = NULL; + } else { + qp->recv_cq = qp_init_attr->recv_cq; + atomic_inc(&qp_init_attr->recv_cq->usecnt); + qp->srq = qp_init_attr->srq; + if (qp->srq) + atomic_inc(&qp_init_attr->srq->usecnt); + } - if (!IS_ERR(qp)) { - qp->device = device; - qp->real_qp = qp; - qp->uobject = NULL; - qp->qp_type = qp_init_attr->qp_type; - - atomic_set(&qp->usecnt, 0); - if (qp_init_attr->qp_type == IB_QPT_XRC_TGT) { - qp->event_handler = __ib_shared_qp_event_handler; - qp->qp_context = qp; - qp->pd = NULL; - qp->send_cq = qp->recv_cq = NULL; - qp->srq = NULL; - qp->xrcd = qp_init_attr->xrcd; - atomic_inc(&qp_init_attr->xrcd->usecnt); - INIT_LIST_HEAD(&qp->open_list); - - real_qp = qp; - qp = __ib_open_qp(real_qp, qp_init_attr->event_handler, - qp_init_attr->qp_context); - if (!IS_ERR(qp)) - __ib_insert_xrcd_qp(qp_init_attr->xrcd, real_qp); - else - real_qp->device->destroy_qp(real_qp); - } else { - qp->event_handler = qp_init_attr->event_handler; - qp->qp_context = qp_init_attr->qp_context; - if (qp_init_attr->qp_type == IB_QPT_XRC_INI) { - qp->recv_cq = NULL; - qp->srq = NULL; - } else { - qp->recv_cq = qp_init_attr->recv_cq; - atomic_inc(&qp_init_attr->recv_cq->usecnt); - qp->srq = qp_init_attr->srq; - if (qp->srq) - atomic_inc(&qp_init_attr->srq->usecnt); - } + qp->pd = pd; + qp->send_cq = qp_init_attr->send_cq; + qp->xrcd = NULL; - qp->pd = pd; - qp->send_cq = qp_init_attr->send_cq; - qp->xrcd = NULL; + atomic_inc(&pd->usecnt); + atomic_inc(&qp_init_attr->send_cq->usecnt); - atomic_inc(&pd->usecnt); - atomic_inc(&qp_init_attr->send_cq->usecnt); + if (qp_init_attr->cap.max_rdma_ctxs) { + ret = rdma_rw_init_mrs(qp, qp_init_attr); + if (ret) { + pr_err("failed to init MR pool ret= %d\n", ret); + ib_destroy_qp(qp); + qp = ERR_PTR(ret); } } @@ -1250,6 +1285,8 @@ int ib_destroy_qp(struct ib_qp *qp) struct ib_srq *srq; int ret; + WARN_ON_ONCE(qp->mrs_used > 0); + if (atomic_read(&qp->usecnt)) return -EBUSY; @@ -1261,6 +1298,9 @@ int ib_destroy_qp(struct ib_qp *qp) rcq = qp->recv_cq; srq = qp->srq; + if (!qp->uobject) + rdma_rw_cleanup_mrs(qp); + ret = qp->device->destroy_qp(qp); if (!ret) { if (pd) @@ -1343,6 +1383,7 @@ struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags) mr->pd = pd; mr->uobject = NULL; atomic_inc(&pd->usecnt); + mr->need_inval = false; } return mr; @@ -1389,6 +1430,7 @@ struct ib_mr *ib_alloc_mr(struct ib_pd *pd, mr->pd = pd; mr->uobject = NULL; atomic_inc(&pd->usecnt); + mr->need_inval = false; } return mr; @@ -1597,6 +1639,7 @@ EXPORT_SYMBOL(ib_set_vf_guid); * @mr: memory region * @sg: dma mapped scatterlist * @sg_nents: number of entries in sg + * @sg_offset: offset in bytes into sg * @page_size: page vector desired page size * * Constraints: @@ -1615,17 +1658,15 @@ EXPORT_SYMBOL(ib_set_vf_guid); * After this completes successfully, the memory region * is ready for registration. */ -int ib_map_mr_sg(struct ib_mr *mr, - struct scatterlist *sg, - int sg_nents, - unsigned int page_size) +int ib_map_mr_sg(struct ib_mr *mr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset, unsigned int page_size) { if (unlikely(!mr->device->map_mr_sg)) return -ENOSYS; mr->page_size = page_size; - return mr->device->map_mr_sg(mr, sg, sg_nents); + return mr->device->map_mr_sg(mr, sg, sg_nents, sg_offset); } EXPORT_SYMBOL(ib_map_mr_sg); @@ -1635,6 +1676,10 @@ EXPORT_SYMBOL(ib_map_mr_sg); * @mr: memory region * @sgl: dma mapped scatterlist * @sg_nents: number of entries in sg + * @sg_offset_p: IN: start offset in bytes into sg + * OUT: offset in bytes for element n of the sg of the first + * byte that has not been processed where n is the return + * value of this function. * @set_page: driver page assignment function pointer * * Core service helper for drivers to convert the largest @@ -1645,23 +1690,26 @@ EXPORT_SYMBOL(ib_map_mr_sg); * Returns the number of sg elements that were assigned to * a page vector. */ -int ib_sg_to_pages(struct ib_mr *mr, - struct scatterlist *sgl, - int sg_nents, - int (*set_page)(struct ib_mr *, u64)) +int ib_sg_to_pages(struct ib_mr *mr, struct scatterlist *sgl, int sg_nents, + unsigned int *sg_offset_p, int (*set_page)(struct ib_mr *, u64)) { struct scatterlist *sg; u64 last_end_dma_addr = 0; + unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0; unsigned int last_page_off = 0; u64 page_mask = ~((u64)mr->page_size - 1); int i, ret; - mr->iova = sg_dma_address(&sgl[0]); + if (unlikely(sg_nents <= 0 || sg_offset > sg_dma_len(&sgl[0]))) + return -EINVAL; + + mr->iova = sg_dma_address(&sgl[0]) + sg_offset; mr->length = 0; for_each_sg(sgl, sg, sg_nents, i) { - u64 dma_addr = sg_dma_address(sg); - unsigned int dma_len = sg_dma_len(sg); + u64 dma_addr = sg_dma_address(sg) + sg_offset; + u64 prev_addr = dma_addr; + unsigned int dma_len = sg_dma_len(sg) - sg_offset; u64 end_dma_addr = dma_addr + dma_len; u64 page_addr = dma_addr & page_mask; @@ -1685,8 +1733,14 @@ int ib_sg_to_pages(struct ib_mr *mr, do { ret = set_page(mr, page_addr); - if (unlikely(ret < 0)) - return i ? : ret; + if (unlikely(ret < 0)) { + sg_offset = prev_addr - sg_dma_address(sg); + mr->length += prev_addr - dma_addr; + if (sg_offset_p) + *sg_offset_p = sg_offset; + return i || sg_offset ? i : ret; + } + prev_addr = page_addr; next_page: page_addr += mr->page_size; } while (page_addr < end_dma_addr); @@ -1694,8 +1748,12 @@ next_page: mr->length += dma_len; last_end_dma_addr = end_dma_addr; last_page_off = end_dma_addr & ~page_mask; + + sg_offset = 0; } + if (sg_offset_p) + *sg_offset_p = 0; return i; } EXPORT_SYMBOL(ib_sg_to_pages); diff --git a/drivers/infiniband/hw/Makefile b/drivers/infiniband/hw/Makefile index c7ad0a4c8..c0c7cf8af 100644 --- a/drivers/infiniband/hw/Makefile +++ b/drivers/infiniband/hw/Makefile @@ -8,3 +8,4 @@ obj-$(CONFIG_MLX5_INFINIBAND) += mlx5/ obj-$(CONFIG_INFINIBAND_NES) += nes/ obj-$(CONFIG_INFINIBAND_OCRDMA) += ocrdma/ obj-$(CONFIG_INFINIBAND_USNIC) += usnic/ +obj-$(CONFIG_INFINIBAND_HFI1) += hfi1/ diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.c b/drivers/infiniband/hw/cxgb3/cxio_hal.c index de1c61b41..ada2e5009 100644 --- a/drivers/infiniband/hw/cxgb3/cxio_hal.c +++ b/drivers/infiniband/hw/cxgb3/cxio_hal.c @@ -327,7 +327,7 @@ int cxio_destroy_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq) kfree(cq->sw_queue); dma_free_coherent(&(rdev_p->rnic_info.pdev->dev), (1UL << (cq->size_log2)) - * sizeof(struct t3_cqe), cq->queue, + * sizeof(struct t3_cqe) + 1, cq->queue, dma_unmap_addr(cq, mapping)); cxio_hal_put_cqid(rdev_p->rscp, cq->cqid); return err; diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.c b/drivers/infiniband/hw/cxgb3/iwch_cm.c index d403231a4..3e8431b5c 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_cm.c +++ b/drivers/infiniband/hw/cxgb3/iwch_cm.c @@ -367,7 +367,7 @@ static void arp_failure_discard(struct t3cdev *dev, struct sk_buff *skb) */ static void act_open_req_arp_failure(struct t3cdev *dev, struct sk_buff *skb) { - printk(KERN_ERR MOD "ARP failure duing connect\n"); + printk(KERN_ERR MOD "ARP failure during connect\n"); kfree_skb(skb); } diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index 3234a8be1..bb1a839d4 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -783,15 +783,14 @@ static int iwch_set_page(struct ib_mr *ibmr, u64 addr) return 0; } -static int iwch_map_mr_sg(struct ib_mr *ibmr, - struct scatterlist *sg, - int sg_nents) +static int iwch_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, + int sg_nents, unsigned int *sg_offset) { struct iwch_mr *mhp = to_iwch_mr(ibmr); mhp->npages = 0; - return ib_sg_to_pages(ibmr, sg, sg_nents, iwch_set_page); + return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, iwch_set_page); } static int iwch_destroy_qp(struct ib_qp *ib_qp) @@ -1219,59 +1218,119 @@ static ssize_t show_board(struct device *dev, struct device_attribute *attr, iwch_dev->rdev.rnic_info.pdev->device); } -static int iwch_get_mib(struct ib_device *ibdev, - union rdma_protocol_stats *stats) +enum counters { + IPINRECEIVES, + IPINHDRERRORS, + IPINADDRERRORS, + IPINUNKNOWNPROTOS, + IPINDISCARDS, + IPINDELIVERS, + IPOUTREQUESTS, + IPOUTDISCARDS, + IPOUTNOROUTES, + IPREASMTIMEOUT, + IPREASMREQDS, + IPREASMOKS, + IPREASMFAILS, + TCPACTIVEOPENS, + TCPPASSIVEOPENS, + TCPATTEMPTFAILS, + TCPESTABRESETS, + TCPCURRESTAB, + TCPINSEGS, + TCPOUTSEGS, + TCPRETRANSSEGS, + TCPINERRS, + TCPOUTRSTS, + TCPRTOMIN, + TCPRTOMAX, + NR_COUNTERS +}; + +static const char * const names[] = { + [IPINRECEIVES] = "ipInReceives", + [IPINHDRERRORS] = "ipInHdrErrors", + [IPINADDRERRORS] = "ipInAddrErrors", + [IPINUNKNOWNPROTOS] = "ipInUnknownProtos", + [IPINDISCARDS] = "ipInDiscards", + [IPINDELIVERS] = "ipInDelivers", + [IPOUTREQUESTS] = "ipOutRequests", + [IPOUTDISCARDS] = "ipOutDiscards", + [IPOUTNOROUTES] = "ipOutNoRoutes", + [IPREASMTIMEOUT] = "ipReasmTimeout", + [IPREASMREQDS] = "ipReasmReqds", + [IPREASMOKS] = "ipReasmOKs", + [IPREASMFAILS] = "ipReasmFails", + [TCPACTIVEOPENS] = "tcpActiveOpens", + [TCPPASSIVEOPENS] = "tcpPassiveOpens", + [TCPATTEMPTFAILS] = "tcpAttemptFails", + [TCPESTABRESETS] = "tcpEstabResets", + [TCPCURRESTAB] = "tcpCurrEstab", + [TCPINSEGS] = "tcpInSegs", + [TCPOUTSEGS] = "tcpOutSegs", + [TCPRETRANSSEGS] = "tcpRetransSegs", + [TCPINERRS] = "tcpInErrs", + [TCPOUTRSTS] = "tcpOutRsts", + [TCPRTOMIN] = "tcpRtoMin", + [TCPRTOMAX] = "tcpRtoMax", +}; + +static struct rdma_hw_stats *iwch_alloc_stats(struct ib_device *ibdev, + u8 port_num) +{ + BUILD_BUG_ON(ARRAY_SIZE(names) != NR_COUNTERS); + + /* Our driver only supports device level stats */ + if (port_num != 0) + return NULL; + + return rdma_alloc_hw_stats_struct(names, NR_COUNTERS, + RDMA_HW_STATS_DEFAULT_LIFESPAN); +} + +static int iwch_get_mib(struct ib_device *ibdev, struct rdma_hw_stats *stats, + u8 port, int index) { struct iwch_dev *dev; struct tp_mib_stats m; int ret; + if (port != 0 || !stats) + return -ENOSYS; + PDBG("%s ibdev %p\n", __func__, ibdev); dev = to_iwch_dev(ibdev); ret = dev->rdev.t3cdev_p->ctl(dev->rdev.t3cdev_p, RDMA_GET_MIB, &m); if (ret) return -ENOSYS; - memset(stats, 0, sizeof *stats); - stats->iw.ipInReceives = ((u64) m.ipInReceive_hi << 32) + - m.ipInReceive_lo; - stats->iw.ipInHdrErrors = ((u64) m.ipInHdrErrors_hi << 32) + - m.ipInHdrErrors_lo; - stats->iw.ipInAddrErrors = ((u64) m.ipInAddrErrors_hi << 32) + - m.ipInAddrErrors_lo; - stats->iw.ipInUnknownProtos = ((u64) m.ipInUnknownProtos_hi << 32) + - m.ipInUnknownProtos_lo; - stats->iw.ipInDiscards = ((u64) m.ipInDiscards_hi << 32) + - m.ipInDiscards_lo; - stats->iw.ipInDelivers = ((u64) m.ipInDelivers_hi << 32) + - m.ipInDelivers_lo; - stats->iw.ipOutRequests = ((u64) m.ipOutRequests_hi << 32) + - m.ipOutRequests_lo; - stats->iw.ipOutDiscards = ((u64) m.ipOutDiscards_hi << 32) + - m.ipOutDiscards_lo; - stats->iw.ipOutNoRoutes = ((u64) m.ipOutNoRoutes_hi << 32) + - m.ipOutNoRoutes_lo; - stats->iw.ipReasmTimeout = (u64) m.ipReasmTimeout; - stats->iw.ipReasmReqds = (u64) m.ipReasmReqds; - stats->iw.ipReasmOKs = (u64) m.ipReasmOKs; - stats->iw.ipReasmFails = (u64) m.ipReasmFails; - stats->iw.tcpActiveOpens = (u64) m.tcpActiveOpens; - stats->iw.tcpPassiveOpens = (u64) m.tcpPassiveOpens; - stats->iw.tcpAttemptFails = (u64) m.tcpAttemptFails; - stats->iw.tcpEstabResets = (u64) m.tcpEstabResets; - stats->iw.tcpOutRsts = (u64) m.tcpOutRsts; - stats->iw.tcpCurrEstab = (u64) m.tcpCurrEstab; - stats->iw.tcpInSegs = ((u64) m.tcpInSegs_hi << 32) + - m.tcpInSegs_lo; - stats->iw.tcpOutSegs = ((u64) m.tcpOutSegs_hi << 32) + - m.tcpOutSegs_lo; - stats->iw.tcpRetransSegs = ((u64) m.tcpRetransSeg_hi << 32) + - m.tcpRetransSeg_lo; - stats->iw.tcpInErrs = ((u64) m.tcpInErrs_hi << 32) + - m.tcpInErrs_lo; - stats->iw.tcpRtoMin = (u64) m.tcpRtoMin; - stats->iw.tcpRtoMax = (u64) m.tcpRtoMax; - return 0; + stats->value[IPINRECEIVES] = ((u64)m.ipInReceive_hi << 32) + m.ipInReceive_lo; + stats->value[IPINHDRERRORS] = ((u64)m.ipInHdrErrors_hi << 32) + m.ipInHdrErrors_lo; + stats->value[IPINADDRERRORS] = ((u64)m.ipInAddrErrors_hi << 32) + m.ipInAddrErrors_lo; + stats->value[IPINUNKNOWNPROTOS] = ((u64)m.ipInUnknownProtos_hi << 32) + m.ipInUnknownProtos_lo; + stats->value[IPINDISCARDS] = ((u64)m.ipInDiscards_hi << 32) + m.ipInDiscards_lo; + stats->value[IPINDELIVERS] = ((u64)m.ipInDelivers_hi << 32) + m.ipInDelivers_lo; + stats->value[IPOUTREQUESTS] = ((u64)m.ipOutRequests_hi << 32) + m.ipOutRequests_lo; + stats->value[IPOUTDISCARDS] = ((u64)m.ipOutDiscards_hi << 32) + m.ipOutDiscards_lo; + stats->value[IPOUTNOROUTES] = ((u64)m.ipOutNoRoutes_hi << 32) + m.ipOutNoRoutes_lo; + stats->value[IPREASMTIMEOUT] = m.ipReasmTimeout; + stats->value[IPREASMREQDS] = m.ipReasmReqds; + stats->value[IPREASMOKS] = m.ipReasmOKs; + stats->value[IPREASMFAILS] = m.ipReasmFails; + stats->value[TCPACTIVEOPENS] = m.tcpActiveOpens; + stats->value[TCPPASSIVEOPENS] = m.tcpPassiveOpens; + stats->value[TCPATTEMPTFAILS] = m.tcpAttemptFails; + stats->value[TCPESTABRESETS] = m.tcpEstabResets; + stats->value[TCPCURRESTAB] = m.tcpOutRsts; + stats->value[TCPINSEGS] = m.tcpCurrEstab; + stats->value[TCPOUTSEGS] = ((u64)m.tcpInSegs_hi << 32) + m.tcpInSegs_lo; + stats->value[TCPRETRANSSEGS] = ((u64)m.tcpOutSegs_hi << 32) + m.tcpOutSegs_lo; + stats->value[TCPINERRS] = ((u64)m.tcpRetransSeg_hi << 32) + m.tcpRetransSeg_lo, + stats->value[TCPOUTRSTS] = ((u64)m.tcpInErrs_hi << 32) + m.tcpInErrs_lo; + stats->value[TCPRTOMIN] = m.tcpRtoMin; + stats->value[TCPRTOMAX] = m.tcpRtoMax; + + return stats->num_counters; } static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); @@ -1374,7 +1433,8 @@ int iwch_register_device(struct iwch_dev *dev) dev->ibdev.req_notify_cq = iwch_arm_cq; dev->ibdev.post_send = iwch_post_send; dev->ibdev.post_recv = iwch_post_receive; - dev->ibdev.get_protocol_stats = iwch_get_mib; + dev->ibdev.alloc_hw_stats = iwch_alloc_stats; + dev->ibdev.get_hw_stats = iwch_get_mib; dev->ibdev.uverbs_abi_ver = IWCH_UVERBS_ABI_VERSION; dev->ibdev.get_port_immutable = iwch_port_immutable; diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 651711370..a3a67216b 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -119,7 +119,7 @@ MODULE_PARM_DESC(ep_timeout_secs, "CM Endpoint operation timeout " static int mpa_rev = 2; module_param(mpa_rev, int, 0644); MODULE_PARM_DESC(mpa_rev, "MPA Revision, 0 supports amso1100, " - "1 is RFC0544 spec compliant, 2 is IETF MPA Peer Connect Draft" + "1 is RFC5044 spec compliant, 2 is IETF MPA Peer Connect Draft" " compliant (default=2)"); static int markers_enabled; @@ -145,19 +145,35 @@ static struct sk_buff_head rxq; static struct sk_buff *get_skb(struct sk_buff *skb, int len, gfp_t gfp); static void ep_timeout(unsigned long arg); static void connect_reply_upcall(struct c4iw_ep *ep, int status); +static int sched(struct c4iw_dev *dev, struct sk_buff *skb); static LIST_HEAD(timeout_list); static spinlock_t timeout_lock; +static void deref_cm_id(struct c4iw_ep_common *epc) +{ + epc->cm_id->rem_ref(epc->cm_id); + epc->cm_id = NULL; + set_bit(CM_ID_DEREFED, &epc->history); +} + +static void ref_cm_id(struct c4iw_ep_common *epc) +{ + set_bit(CM_ID_REFED, &epc->history); + epc->cm_id->add_ref(epc->cm_id); +} + static void deref_qp(struct c4iw_ep *ep) { c4iw_qp_rem_ref(&ep->com.qp->ibqp); clear_bit(QP_REFERENCED, &ep->com.flags); + set_bit(QP_DEREFED, &ep->com.history); } static void ref_qp(struct c4iw_ep *ep) { set_bit(QP_REFERENCED, &ep->com.flags); + set_bit(QP_REFED, &ep->com.history); c4iw_qp_add_ref(&ep->com.qp->ibqp); } @@ -201,6 +217,8 @@ static int c4iw_l2t_send(struct c4iw_rdev *rdev, struct sk_buff *skb, error = cxgb4_l2t_send(rdev->lldi.ports[0], skb, l2e); if (error < 0) kfree_skb(skb); + else if (error == NET_XMIT_DROP) + return -ENOMEM; return error < 0 ? error : 0; } @@ -290,12 +308,63 @@ static void *alloc_ep(int size, gfp_t gfp) return epc; } +static void remove_ep_tid(struct c4iw_ep *ep) +{ + unsigned long flags; + + spin_lock_irqsave(&ep->com.dev->lock, flags); + _remove_handle(ep->com.dev, &ep->com.dev->hwtid_idr, ep->hwtid, 0); + spin_unlock_irqrestore(&ep->com.dev->lock, flags); +} + +static void insert_ep_tid(struct c4iw_ep *ep) +{ + unsigned long flags; + + spin_lock_irqsave(&ep->com.dev->lock, flags); + _insert_handle(ep->com.dev, &ep->com.dev->hwtid_idr, ep, ep->hwtid, 0); + spin_unlock_irqrestore(&ep->com.dev->lock, flags); +} + +/* + * Atomically lookup the ep ptr given the tid and grab a reference on the ep. + */ +static struct c4iw_ep *get_ep_from_tid(struct c4iw_dev *dev, unsigned int tid) +{ + struct c4iw_ep *ep; + unsigned long flags; + + spin_lock_irqsave(&dev->lock, flags); + ep = idr_find(&dev->hwtid_idr, tid); + if (ep) + c4iw_get_ep(&ep->com); + spin_unlock_irqrestore(&dev->lock, flags); + return ep; +} + +/* + * Atomically lookup the ep ptr given the stid and grab a reference on the ep. + */ +static struct c4iw_listen_ep *get_ep_from_stid(struct c4iw_dev *dev, + unsigned int stid) +{ + struct c4iw_listen_ep *ep; + unsigned long flags; + + spin_lock_irqsave(&dev->lock, flags); + ep = idr_find(&dev->stid_idr, stid); + if (ep) + c4iw_get_ep(&ep->com); + spin_unlock_irqrestore(&dev->lock, flags); + return ep; +} + void _c4iw_free_ep(struct kref *kref) { struct c4iw_ep *ep; ep = container_of(kref, struct c4iw_ep, com.kref); - PDBG("%s ep %p state %s\n", __func__, ep, states[state_read(&ep->com)]); + PDBG("%s ep %p state %s\n", __func__, ep, states[ep->com.state]); if (test_bit(QP_REFERENCED, &ep->com.flags)) deref_qp(ep); if (test_bit(RELEASE_RESOURCES, &ep->com.flags)) { @@ -309,10 +378,11 @@ void _c4iw_free_ep(struct kref *kref) (const u32 *)&sin6->sin6_addr.s6_addr, 1); } - remove_handle(ep->com.dev, &ep->com.dev->hwtid_idr, ep->hwtid); cxgb4_remove_tid(ep->com.dev->rdev.lldi.tids, 0, ep->hwtid); dst_release(ep->dst); cxgb4_l2t_release(ep->l2t); + if (ep->mpa_skb) + kfree_skb(ep->mpa_skb); } kfree(ep); } @@ -320,6 +390,15 @@ void _c4iw_free_ep(struct kref *kref) static void release_ep_resources(struct c4iw_ep *ep) { set_bit(RELEASE_RESOURCES, &ep->com.flags); + + /* + * If we have a hwtid, then remove it from the idr table + * so lookups will no longer find this endpoint. Otherwise + * we have a race where one thread finds the ep ptr just + * before the other thread is freeing the ep memory. + */ + if (ep->hwtid != -1) + remove_ep_tid(ep); c4iw_put_ep(&ep->com); } @@ -432,10 +511,74 @@ static struct dst_entry *find_route(struct c4iw_dev *dev, __be32 local_ip, static void arp_failure_discard(void *handle, struct sk_buff *skb) { - PDBG("%s c4iw_dev %p\n", __func__, handle); + pr_err(MOD "ARP failure\n"); kfree_skb(skb); } +static void mpa_start_arp_failure(void *handle, struct sk_buff *skb) +{ + pr_err("ARP failure during MPA Negotiation - Closing Connection\n"); +} + +enum { + NUM_FAKE_CPLS = 2, + FAKE_CPL_PUT_EP_SAFE = NUM_CPL_CMDS + 0, + FAKE_CPL_PASS_PUT_EP_SAFE = NUM_CPL_CMDS + 1, +}; + +static int _put_ep_safe(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct c4iw_ep *ep; + + ep = *((struct c4iw_ep **)(skb->cb + 2 * sizeof(void *))); + release_ep_resources(ep); + return 0; +} + +static int _put_pass_ep_safe(struct c4iw_dev *dev, struct sk_buff *skb) +{ + struct c4iw_ep *ep; + + ep = *((struct c4iw_ep **)(skb->cb + 2 * sizeof(void *))); + c4iw_put_ep(&ep->parent_ep->com); + release_ep_resources(ep); + return 0; +} + +/* + * Fake up a special CPL opcode and call sched() so process_work() will call + * _put_ep_safe() in a safe context to free the ep resources. This is needed + * because ARP error handlers are called in an ATOMIC context, and + * _c4iw_free_ep() needs to block. + */ +static void queue_arp_failure_cpl(struct c4iw_ep *ep, struct sk_buff *skb, + int cpl) +{ + struct cpl_act_establish *rpl = cplhdr(skb); + + /* Set our special ARP_FAILURE opcode */ + rpl->ot.opcode = cpl; + + /* + * Save ep in the skb->cb area, after where sched() will save the dev + * ptr. + */ + *((struct c4iw_ep **)(skb->cb + 2 * sizeof(void *))) = ep; + sched(ep->com.dev, skb); +} + +/* Handle an ARP failure for an accept */ +static void pass_accept_rpl_arp_failure(void *handle, struct sk_buff *skb) +{ + struct c4iw_ep *ep = handle; + + pr_err(MOD "ARP failure during accept - tid %u -dropping connection\n", + ep->hwtid); + + __state_set(&ep->com, DEAD); + queue_arp_failure_cpl(ep, skb, FAKE_CPL_PASS_PUT_EP_SAFE); +} + /* * Handle an ARP failure for an active open. */ @@ -444,9 +587,8 @@ static void act_open_req_arp_failure(void *handle, struct sk_buff *skb) struct c4iw_ep *ep = handle; printk(KERN_ERR MOD "ARP failure during connect\n"); - kfree_skb(skb); connect_reply_upcall(ep, -EHOSTUNREACH); - state_set(&ep->com, DEAD); + __state_set(&ep->com, DEAD); if (ep->com.remote_addr.ss_family == AF_INET6) { struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&ep->com.local_addr; @@ -455,9 +597,7 @@ static void act_open_req_arp_failure(void *handle, struct sk_buff *skb) } remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid); cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid); - dst_release(ep->dst); - cxgb4_l2t_release(ep->l2t); - c4iw_put_ep(&ep->com); + queue_arp_failure_cpl(ep, skb, FAKE_CPL_PUT_EP_SAFE); } /* @@ -466,15 +606,21 @@ static void act_open_req_arp_failure(void *handle, struct sk_buff *skb) */ static void abort_arp_failure(void *handle, struct sk_buff *skb) { - struct c4iw_rdev *rdev = handle; + int ret; + struct c4iw_ep *ep = handle; + struct c4iw_rdev *rdev = &ep->com.dev->rdev; struct cpl_abort_req *req = cplhdr(skb); PDBG("%s rdev %p\n", __func__, rdev); req->cmd = CPL_ABORT_NO_RST; - c4iw_ofld_send(rdev, skb); + ret = c4iw_ofld_send(rdev, skb); + if (ret) { + __state_set(&ep->com, DEAD); + queue_arp_failure_cpl(ep, skb, FAKE_CPL_PUT_EP_SAFE); + } } -static void send_flowc(struct c4iw_ep *ep, struct sk_buff *skb) +static int send_flowc(struct c4iw_ep *ep, struct sk_buff *skb) { unsigned int flowclen = 80; struct fw_flowc_wr *flowc; @@ -530,7 +676,7 @@ static void send_flowc(struct c4iw_ep *ep, struct sk_buff *skb) } set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx); - c4iw_ofld_send(&ep->com.dev->rdev, skb); + return c4iw_ofld_send(&ep->com.dev->rdev, skb); } static int send_halfclose(struct c4iw_ep *ep, gfp_t gfp) @@ -568,7 +714,7 @@ static int send_abort(struct c4iw_ep *ep, struct sk_buff *skb, gfp_t gfp) return -ENOMEM; } set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx); - t4_set_arp_err_handler(skb, &ep->com.dev->rdev, abort_arp_failure); + t4_set_arp_err_handler(skb, ep, abort_arp_failure); req = (struct cpl_abort_req *) skb_put(skb, wrlen); memset(req, 0, wrlen); INIT_TP_WR(req, ep->hwtid); @@ -807,10 +953,10 @@ clip_release: return ret; } -static void send_mpa_req(struct c4iw_ep *ep, struct sk_buff *skb, - u8 mpa_rev_to_use) +static int send_mpa_req(struct c4iw_ep *ep, struct sk_buff *skb, + u8 mpa_rev_to_use) { - int mpalen, wrlen; + int mpalen, wrlen, ret; struct fw_ofld_tx_data_wr *req; struct mpa_message *mpa; struct mpa_v2_conn_params mpa_v2_params; @@ -826,7 +972,7 @@ static void send_mpa_req(struct c4iw_ep *ep, struct sk_buff *skb, skb = get_skb(skb, wrlen, GFP_KERNEL); if (!skb) { connect_reply_upcall(ep, -ENOMEM); - return; + return -ENOMEM; } set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx); @@ -894,12 +1040,14 @@ static void send_mpa_req(struct c4iw_ep *ep, struct sk_buff *skb, t4_set_arp_err_handler(skb, NULL, arp_failure_discard); BUG_ON(ep->mpa_skb); ep->mpa_skb = skb; - c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); + ret = c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); + if (ret) + return ret; start_ep_timer(ep); __state_set(&ep->com, MPA_REQ_SENT); ep->mpa_attr.initiator = 1; ep->snd_seq += mpalen; - return; + return ret; } static int send_mpa_reject(struct c4iw_ep *ep, const void *pdata, u8 plen) @@ -975,7 +1123,7 @@ static int send_mpa_reject(struct c4iw_ep *ep, const void *pdata, u8 plen) */ skb_get(skb); set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx); - t4_set_arp_err_handler(skb, NULL, arp_failure_discard); + t4_set_arp_err_handler(skb, NULL, mpa_start_arp_failure); BUG_ON(ep->mpa_skb); ep->mpa_skb = skb; ep->snd_seq += mpalen; @@ -1060,7 +1208,7 @@ static int send_mpa_reply(struct c4iw_ep *ep, const void *pdata, u8 plen) * Function fw4_ack() will deref it. */ skb_get(skb); - t4_set_arp_err_handler(skb, NULL, arp_failure_discard); + t4_set_arp_err_handler(skb, NULL, mpa_start_arp_failure); ep->mpa_skb = skb; __state_set(&ep->com, MPA_REP_SENT); ep->snd_seq += mpalen; @@ -1074,6 +1222,7 @@ static int act_establish(struct c4iw_dev *dev, struct sk_buff *skb) unsigned int tid = GET_TID(req); unsigned int atid = TID_TID_G(ntohl(req->tos_atid)); struct tid_info *t = dev->rdev.lldi.tids; + int ret; ep = lookup_atid(t, atid); @@ -1086,7 +1235,7 @@ static int act_establish(struct c4iw_dev *dev, struct sk_buff *skb) /* setup the hwtid for this connection */ ep->hwtid = tid; cxgb4_insert_tid(t, ep, tid); - insert_handle(dev, &dev->hwtid_idr, ep, ep->hwtid); + insert_ep_tid(ep); ep->snd_seq = be32_to_cpu(req->snd_isn); ep->rcv_seq = be32_to_cpu(req->rcv_isn); @@ -1099,13 +1248,22 @@ static int act_establish(struct c4iw_dev *dev, struct sk_buff *skb) set_bit(ACT_ESTAB, &ep->com.history); /* start MPA negotiation */ - send_flowc(ep, NULL); + ret = send_flowc(ep, NULL); + if (ret) + goto err; if (ep->retry_with_mpa_v1) - send_mpa_req(ep, skb, 1); + ret = send_mpa_req(ep, skb, 1); else - send_mpa_req(ep, skb, mpa_rev); + ret = send_mpa_req(ep, skb, mpa_rev); + if (ret) + goto err; mutex_unlock(&ep->com.mutex); return 0; +err: + mutex_unlock(&ep->com.mutex); + connect_reply_upcall(ep, -ENOMEM); + c4iw_ep_disconnect(ep, 0, GFP_KERNEL); + return 0; } static void close_complete_upcall(struct c4iw_ep *ep, int status) @@ -1120,20 +1278,11 @@ static void close_complete_upcall(struct c4iw_ep *ep, int status) PDBG("close complete delivered ep %p cm_id %p tid %u\n", ep, ep->com.cm_id, ep->hwtid); ep->com.cm_id->event_handler(ep->com.cm_id, &event); - ep->com.cm_id->rem_ref(ep->com.cm_id); - ep->com.cm_id = NULL; + deref_cm_id(&ep->com); set_bit(CLOSE_UPCALL, &ep->com.history); } } -static int abort_connection(struct c4iw_ep *ep, struct sk_buff *skb, gfp_t gfp) -{ - PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); - __state_set(&ep->com, ABORTING); - set_bit(ABORT_CONN, &ep->com.history); - return send_abort(ep, skb, gfp); -} - static void peer_close_upcall(struct c4iw_ep *ep) { struct iw_cm_event event; @@ -1161,8 +1310,7 @@ static void peer_abort_upcall(struct c4iw_ep *ep) PDBG("abort delivered ep %p cm_id %p tid %u\n", ep, ep->com.cm_id, ep->hwtid); ep->com.cm_id->event_handler(ep->com.cm_id, &event); - ep->com.cm_id->rem_ref(ep->com.cm_id); - ep->com.cm_id = NULL; + deref_cm_id(&ep->com); set_bit(ABORT_UPCALL, &ep->com.history); } } @@ -1205,10 +1353,8 @@ static void connect_reply_upcall(struct c4iw_ep *ep, int status) set_bit(CONN_RPL_UPCALL, &ep->com.history); ep->com.cm_id->event_handler(ep->com.cm_id, &event); - if (status < 0) { - ep->com.cm_id->rem_ref(ep->com.cm_id); - ep->com.cm_id = NULL; - } + if (status < 0) + deref_cm_id(&ep->com); } static int connect_request_upcall(struct c4iw_ep *ep) @@ -1301,6 +1447,18 @@ static int update_rx_credits(struct c4iw_ep *ep, u32 credits) #define RELAXED_IRD_NEGOTIATION 1 +/* + * process_mpa_reply - process streaming mode MPA reply + * + * Returns: + * + * 0 upon success indicating a connect request was delivered to the ULP + * or the mpa request is incomplete but valid so far. + * + * 1 if a failure requires the caller to close the connection. + * + * 2 if a failure requires the caller to abort the connection. + */ static int process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb) { struct mpa_message *mpa; @@ -1316,20 +1474,12 @@ static int process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb) PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); /* - * Stop mpa timer. If it expired, then - * we ignore the MPA reply. process_timeout() - * will abort the connection. - */ - if (stop_ep_timer(ep)) - return 0; - - /* * If we get more than the supported amount of private data * then we must fail this connection. */ if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) { err = -EINVAL; - goto err; + goto err_stop_timer; } /* @@ -1351,11 +1501,11 @@ static int process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb) printk(KERN_ERR MOD "%s MPA version mismatch. Local = %d," " Received = %d\n", __func__, mpa_rev, mpa->revision); err = -EPROTO; - goto err; + goto err_stop_timer; } if (memcmp(mpa->key, MPA_KEY_REP, sizeof(mpa->key))) { err = -EPROTO; - goto err; + goto err_stop_timer; } plen = ntohs(mpa->private_data_size); @@ -1365,7 +1515,7 @@ static int process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb) */ if (plen > MPA_MAX_PRIVATE_DATA) { err = -EPROTO; - goto err; + goto err_stop_timer; } /* @@ -1373,7 +1523,7 @@ static int process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb) */ if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) { err = -EPROTO; - goto err; + goto err_stop_timer; } ep->plen = (u8) plen; @@ -1387,10 +1537,18 @@ static int process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb) if (mpa->flags & MPA_REJECT) { err = -ECONNREFUSED; - goto err; + goto err_stop_timer; } /* + * Stop mpa timer. If it expired, then + * we ignore the MPA reply. process_timeout() + * will abort the connection. + */ + if (stop_ep_timer(ep)) + return 0; + + /* * If we get here we have accumulated the entire mpa * start reply message including private data. And * the MPA header is valid. @@ -1529,15 +1687,28 @@ static int process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb) goto out; } goto out; +err_stop_timer: + stop_ep_timer(ep); err: - __state_set(&ep->com, ABORTING); - send_abort(ep, skb, GFP_KERNEL); + disconnect = 2; out: connect_reply_upcall(ep, err); return disconnect; } -static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb) +/* + * process_mpa_request - process streaming mode MPA request + * + * Returns: + * + * 0 upon success indicating a connect request was delivered to the ULP + * or the mpa request is incomplete but valid so far. + * + * 1 if a failure requires the caller to close the connection. + * + * 2 if a failure requires the caller to abort the connection. + */ +static int process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb) { struct mpa_message *mpa; struct mpa_v2_conn_params *mpa_v2_params; @@ -1549,11 +1720,8 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb) * If we get more than the supported amount of private data * then we must fail this connection. */ - if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) { - (void)stop_ep_timer(ep); - abort_connection(ep, skb, GFP_KERNEL); - return; - } + if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) + goto err_stop_timer; PDBG("%s enter (%s line %u)\n", __func__, __FILE__, __LINE__); @@ -1569,7 +1737,7 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb) * We'll continue process when more data arrives. */ if (ep->mpa_pkt_len < sizeof(*mpa)) - return; + return 0; PDBG("%s enter (%s line %u)\n", __func__, __FILE__, __LINE__); mpa = (struct mpa_message *) ep->mpa_pkt; @@ -1580,43 +1748,32 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb) if (mpa->revision > mpa_rev) { printk(KERN_ERR MOD "%s MPA version mismatch. Local = %d," " Received = %d\n", __func__, mpa_rev, mpa->revision); - (void)stop_ep_timer(ep); - abort_connection(ep, skb, GFP_KERNEL); - return; + goto err_stop_timer; } - if (memcmp(mpa->key, MPA_KEY_REQ, sizeof(mpa->key))) { - (void)stop_ep_timer(ep); - abort_connection(ep, skb, GFP_KERNEL); - return; - } + if (memcmp(mpa->key, MPA_KEY_REQ, sizeof(mpa->key))) + goto err_stop_timer; plen = ntohs(mpa->private_data_size); /* * Fail if there's too much private data. */ - if (plen > MPA_MAX_PRIVATE_DATA) { - (void)stop_ep_timer(ep); - abort_connection(ep, skb, GFP_KERNEL); - return; - } + if (plen > MPA_MAX_PRIVATE_DATA) + goto err_stop_timer; /* * If plen does not account for pkt size */ - if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) { - (void)stop_ep_timer(ep); - abort_connection(ep, skb, GFP_KERNEL); - return; - } + if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) + goto err_stop_timer; ep->plen = (u8) plen; /* * If we don't have all the pdata yet, then bail. */ if (ep->mpa_pkt_len < (sizeof(*mpa) + plen)) - return; + return 0; /* * If we get here we have accumulated the entire mpa @@ -1665,26 +1822,26 @@ static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb) ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version, ep->mpa_attr.p2p_type); - /* - * If the endpoint timer already expired, then we ignore - * the start request. process_timeout() will abort - * the connection. - */ - if (!stop_ep_timer(ep)) { - __state_set(&ep->com, MPA_REQ_RCVD); - - /* drive upcall */ - mutex_lock_nested(&ep->parent_ep->com.mutex, - SINGLE_DEPTH_NESTING); - if (ep->parent_ep->com.state != DEAD) { - if (connect_request_upcall(ep)) - abort_connection(ep, skb, GFP_KERNEL); - } else { - abort_connection(ep, skb, GFP_KERNEL); - } - mutex_unlock(&ep->parent_ep->com.mutex); + __state_set(&ep->com, MPA_REQ_RCVD); + + /* drive upcall */ + mutex_lock_nested(&ep->parent_ep->com.mutex, SINGLE_DEPTH_NESTING); + if (ep->parent_ep->com.state != DEAD) { + if (connect_request_upcall(ep)) + goto err_unlock_parent; + } else { + goto err_unlock_parent; } - return; + mutex_unlock(&ep->parent_ep->com.mutex); + return 0; + +err_unlock_parent: + mutex_unlock(&ep->parent_ep->com.mutex); + goto err_out; +err_stop_timer: + (void)stop_ep_timer(ep); +err_out: + return 2; } static int rx_data(struct c4iw_dev *dev, struct sk_buff *skb) @@ -1693,11 +1850,10 @@ static int rx_data(struct c4iw_dev *dev, struct sk_buff *skb) struct cpl_rx_data *hdr = cplhdr(skb); unsigned int dlen = ntohs(hdr->len); unsigned int tid = GET_TID(hdr); - struct tid_info *t = dev->rdev.lldi.tids; __u8 status = hdr->status; int disconnect = 0; - ep = lookup_tid(t, tid); + ep = get_ep_from_tid(dev, tid); if (!ep) return 0; PDBG("%s ep %p tid %u dlen %u\n", __func__, ep, ep->hwtid, dlen); @@ -1715,7 +1871,7 @@ static int rx_data(struct c4iw_dev *dev, struct sk_buff *skb) break; case MPA_REQ_WAIT: ep->rcv_seq += dlen; - process_mpa_request(ep, skb); + disconnect = process_mpa_request(ep, skb); break; case FPDU_MODE: { struct c4iw_qp_attributes attrs; @@ -1736,7 +1892,8 @@ static int rx_data(struct c4iw_dev *dev, struct sk_buff *skb) } mutex_unlock(&ep->com.mutex); if (disconnect) - c4iw_ep_disconnect(ep, 0, GFP_KERNEL); + c4iw_ep_disconnect(ep, disconnect == 2, GFP_KERNEL); + c4iw_put_ep(&ep->com); return 0; } @@ -1746,9 +1903,8 @@ static int abort_rpl(struct c4iw_dev *dev, struct sk_buff *skb) struct cpl_abort_rpl_rss *rpl = cplhdr(skb); int release = 0; unsigned int tid = GET_TID(rpl); - struct tid_info *t = dev->rdev.lldi.tids; - ep = lookup_tid(t, tid); + ep = get_ep_from_tid(dev, tid); if (!ep) { printk(KERN_WARNING MOD "Abort rpl to freed endpoint\n"); return 0; @@ -1770,10 +1926,11 @@ static int abort_rpl(struct c4iw_dev *dev, struct sk_buff *skb) if (release) release_ep_resources(ep); + c4iw_put_ep(&ep->com); return 0; } -static void send_fw_act_open_req(struct c4iw_ep *ep, unsigned int atid) +static int send_fw_act_open_req(struct c4iw_ep *ep, unsigned int atid) { struct sk_buff *skb; struct fw_ofld_connection_wr *req; @@ -1843,7 +2000,7 @@ static void send_fw_act_open_req(struct c4iw_ep *ep, unsigned int atid) req->tcb.opt2 = cpu_to_be32((__force u32)req->tcb.opt2); set_wr_txq(skb, CPL_PRIORITY_CONTROL, ep->ctrlq_idx); set_bit(ACT_OFLD_CONN, &ep->com.history); - c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); + return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); } /* @@ -1986,6 +2143,7 @@ static int c4iw_reconnect(struct c4iw_ep *ep) PDBG("%s qp %p cm_id %p\n", __func__, ep->com.qp, ep->com.cm_id); init_timer(&ep->timer); + c4iw_init_wr_wait(&ep->com.wr_wait); /* * Allocate an active TID to initiate a TCP connection. @@ -2069,6 +2227,7 @@ static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb) struct sockaddr_in *ra; struct sockaddr_in6 *la6; struct sockaddr_in6 *ra6; + int ret = 0; ep = lookup_atid(t, atid); la = (struct sockaddr_in *)&ep->com.local_addr; @@ -2104,9 +2263,10 @@ static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb) mutex_unlock(&dev->rdev.stats.lock); if (ep->com.local_addr.ss_family == AF_INET && dev->rdev.lldi.enable_fw_ofld_conn) { - send_fw_act_open_req(ep, - TID_TID_G(AOPEN_ATID_G( - ntohl(rpl->atid_status)))); + ret = send_fw_act_open_req(ep, TID_TID_G(AOPEN_ATID_G( + ntohl(rpl->atid_status)))); + if (ret) + goto fail; return 0; } break; @@ -2146,6 +2306,7 @@ static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb) break; } +fail: connect_reply_upcall(ep, status2errno(status)); state_set(&ep->com, DEAD); @@ -2170,9 +2331,8 @@ static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb) static int pass_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb) { struct cpl_pass_open_rpl *rpl = cplhdr(skb); - struct tid_info *t = dev->rdev.lldi.tids; unsigned int stid = GET_TID(rpl); - struct c4iw_listen_ep *ep = lookup_stid(t, stid); + struct c4iw_listen_ep *ep = get_ep_from_stid(dev, stid); if (!ep) { PDBG("%s stid %d lookup failure!\n", __func__, stid); @@ -2181,7 +2341,7 @@ static int pass_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb) PDBG("%s ep %p status %d error %d\n", __func__, ep, rpl->status, status2errno(rpl->status)); c4iw_wake_up(&ep->com.wr_wait, status2errno(rpl->status)); - + c4iw_put_ep(&ep->com); out: return 0; } @@ -2189,17 +2349,17 @@ out: static int close_listsrv_rpl(struct c4iw_dev *dev, struct sk_buff *skb) { struct cpl_close_listsvr_rpl *rpl = cplhdr(skb); - struct tid_info *t = dev->rdev.lldi.tids; unsigned int stid = GET_TID(rpl); - struct c4iw_listen_ep *ep = lookup_stid(t, stid); + struct c4iw_listen_ep *ep = get_ep_from_stid(dev, stid); PDBG("%s ep %p\n", __func__, ep); c4iw_wake_up(&ep->com.wr_wait, status2errno(rpl->status)); + c4iw_put_ep(&ep->com); return 0; } -static void accept_cr(struct c4iw_ep *ep, struct sk_buff *skb, - struct cpl_pass_accept_req *req) +static int accept_cr(struct c4iw_ep *ep, struct sk_buff *skb, + struct cpl_pass_accept_req *req) { struct cpl_pass_accept_rpl *rpl; unsigned int mtu_idx; @@ -2287,10 +2447,9 @@ static void accept_cr(struct c4iw_ep *ep, struct sk_buff *skb, rpl->opt0 = cpu_to_be64(opt0); rpl->opt2 = cpu_to_be32(opt2); set_wr_txq(skb, CPL_PRIORITY_SETUP, ep->ctrlq_idx); - t4_set_arp_err_handler(skb, NULL, arp_failure_discard); - c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); + t4_set_arp_err_handler(skb, ep, pass_accept_rpl_arp_failure); - return; + return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t); } static void reject_cr(struct c4iw_dev *dev, u32 hwtid, struct sk_buff *skb) @@ -2355,7 +2514,7 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb) unsigned short hdrs; u8 tos = PASS_OPEN_TOS_G(ntohl(req->tos_stid)); - parent_ep = lookup_stid(t, stid); + parent_ep = (struct c4iw_ep *)get_ep_from_stid(dev, stid); if (!parent_ep) { PDBG("%s connect request on invalid stid %d\n", __func__, stid); goto reject; @@ -2468,9 +2627,13 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb) init_timer(&child_ep->timer); cxgb4_insert_tid(t, child_ep, hwtid); - insert_handle(dev, &dev->hwtid_idr, child_ep, child_ep->hwtid); - accept_cr(child_ep, skb, req); - set_bit(PASS_ACCEPT_REQ, &child_ep->com.history); + insert_ep_tid(child_ep); + if (accept_cr(child_ep, skb, req)) { + c4iw_put_ep(&parent_ep->com); + release_ep_resources(child_ep); + } else { + set_bit(PASS_ACCEPT_REQ, &child_ep->com.history); + } if (iptype == 6) { sin6 = (struct sockaddr_in6 *)&child_ep->com.local_addr; cxgb4_clip_get(child_ep->com.dev->rdev.lldi.ports[0], @@ -2479,6 +2642,8 @@ static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb) goto out; reject: reject_cr(dev, hwtid, skb); + if (parent_ep) + c4iw_put_ep(&parent_ep->com); out: return 0; } @@ -2487,10 +2652,10 @@ static int pass_establish(struct c4iw_dev *dev, struct sk_buff *skb) { struct c4iw_ep *ep; struct cpl_pass_establish *req = cplhdr(skb); - struct tid_info *t = dev->rdev.lldi.tids; unsigned int tid = GET_TID(req); + int ret; - ep = lookup_tid(t, tid); + ep = get_ep_from_tid(dev, tid); PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); ep->snd_seq = be32_to_cpu(req->snd_isn); ep->rcv_seq = be32_to_cpu(req->rcv_isn); @@ -2501,10 +2666,15 @@ static int pass_establish(struct c4iw_dev *dev, struct sk_buff *skb) set_emss(ep, ntohs(req->tcp_opt)); dst_confirm(ep->dst); - state_set(&ep->com, MPA_REQ_WAIT); + mutex_lock(&ep->com.mutex); + ep->com.state = MPA_REQ_WAIT; start_ep_timer(ep); - send_flowc(ep, skb); set_bit(PASS_ESTAB, &ep->com.history); + ret = send_flowc(ep, skb); + mutex_unlock(&ep->com.mutex); + if (ret) + c4iw_ep_disconnect(ep, 1, GFP_KERNEL); + c4iw_put_ep(&ep->com); return 0; } @@ -2516,11 +2686,13 @@ static int peer_close(struct c4iw_dev *dev, struct sk_buff *skb) struct c4iw_qp_attributes attrs; int disconnect = 1; int release = 0; - struct tid_info *t = dev->rdev.lldi.tids; unsigned int tid = GET_TID(hdr); int ret; - ep = lookup_tid(t, tid); + ep = get_ep_from_tid(dev, tid); + if (!ep) + return 0; + PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); dst_confirm(ep->dst); @@ -2592,6 +2764,7 @@ static int peer_close(struct c4iw_dev *dev, struct sk_buff *skb) c4iw_ep_disconnect(ep, 0, GFP_KERNEL); if (release) release_ep_resources(ep); + c4iw_put_ep(&ep->com); return 0; } @@ -2604,10 +2777,12 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb) struct c4iw_qp_attributes attrs; int ret; int release = 0; - struct tid_info *t = dev->rdev.lldi.tids; unsigned int tid = GET_TID(req); - ep = lookup_tid(t, tid); + ep = get_ep_from_tid(dev, tid); + if (!ep) + return 0; + if (is_neg_adv(req->status)) { PDBG("%s Negative advice on abort- tid %u status %d (%s)\n", __func__, ep->hwtid, req->status, @@ -2616,7 +2791,7 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb) mutex_lock(&dev->rdev.stats.lock); dev->rdev.stats.neg_adv++; mutex_unlock(&dev->rdev.stats.lock); - return 0; + goto deref_ep; } PDBG("%s ep %p tid %u state %u\n", __func__, ep, ep->hwtid, ep->com.state); @@ -2633,6 +2808,7 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb) mutex_lock(&ep->com.mutex); switch (ep->com.state) { case CONNECTING: + c4iw_put_ep(&ep->parent_ep->com); break; case MPA_REQ_WAIT: (void)stop_ep_timer(ep); @@ -2681,7 +2857,7 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb) case DEAD: PDBG("%s PEER_ABORT IN DEAD STATE!!!!\n", __func__); mutex_unlock(&ep->com.mutex); - return 0; + goto deref_ep; default: BUG_ON(1); break; @@ -2728,6 +2904,10 @@ out: c4iw_reconnect(ep); } +deref_ep: + c4iw_put_ep(&ep->com); + /* Dereferencing ep, referenced in peer_abort_intr() */ + c4iw_put_ep(&ep->com); return 0; } @@ -2737,16 +2917,18 @@ static int close_con_rpl(struct c4iw_dev *dev, struct sk_buff *skb) struct c4iw_qp_attributes attrs; struct cpl_close_con_rpl *rpl = cplhdr(skb); int release = 0; - struct tid_info *t = dev->rdev.lldi.tids; unsigned int tid = GET_TID(rpl); - ep = lookup_tid(t, tid); + ep = get_ep_from_tid(dev, tid); + if (!ep) + return 0; PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); BUG_ON(!ep); /* The cm_id may be null if we failed to connect */ mutex_lock(&ep->com.mutex); + set_bit(CLOSE_CON_RPL, &ep->com.history); switch (ep->com.state) { case CLOSING: __state_set(&ep->com, MORIBUND); @@ -2774,18 +2956,18 @@ static int close_con_rpl(struct c4iw_dev *dev, struct sk_buff *skb) mutex_unlock(&ep->com.mutex); if (release) release_ep_resources(ep); + c4iw_put_ep(&ep->com); return 0; } static int terminate(struct c4iw_dev *dev, struct sk_buff *skb) { struct cpl_rdma_terminate *rpl = cplhdr(skb); - struct tid_info *t = dev->rdev.lldi.tids; unsigned int tid = GET_TID(rpl); struct c4iw_ep *ep; struct c4iw_qp_attributes attrs; - ep = lookup_tid(t, tid); + ep = get_ep_from_tid(dev, tid); BUG_ON(!ep); if (ep && ep->com.qp) { @@ -2796,6 +2978,7 @@ static int terminate(struct c4iw_dev *dev, struct sk_buff *skb) C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); } else printk(KERN_WARNING MOD "TERM received tid %u no ep/qp\n", tid); + c4iw_put_ep(&ep->com); return 0; } @@ -2811,15 +2994,16 @@ static int fw4_ack(struct c4iw_dev *dev, struct sk_buff *skb) struct cpl_fw4_ack *hdr = cplhdr(skb); u8 credits = hdr->credits; unsigned int tid = GET_TID(hdr); - struct tid_info *t = dev->rdev.lldi.tids; - ep = lookup_tid(t, tid); + ep = get_ep_from_tid(dev, tid); + if (!ep) + return 0; PDBG("%s ep %p tid %u credits %u\n", __func__, ep, ep->hwtid, credits); if (credits == 0) { PDBG("%s 0 credit ack ep %p tid %u state %u\n", __func__, ep, ep->hwtid, state_read(&ep->com)); - return 0; + goto out; } dst_confirm(ep->dst); @@ -2829,7 +3013,13 @@ static int fw4_ack(struct c4iw_dev *dev, struct sk_buff *skb) state_read(&ep->com), ep->mpa_attr.initiator ? 1 : 0); kfree_skb(ep->mpa_skb); ep->mpa_skb = NULL; + mutex_lock(&ep->com.mutex); + if (test_bit(STOP_MPA_TIMER, &ep->com.flags)) + stop_ep_timer(ep); + mutex_unlock(&ep->com.mutex); } +out: + c4iw_put_ep(&ep->com); return 0; } @@ -2841,22 +3031,23 @@ int c4iw_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len) PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); mutex_lock(&ep->com.mutex); - if (ep->com.state == DEAD) { + if (ep->com.state != MPA_REQ_RCVD) { mutex_unlock(&ep->com.mutex); c4iw_put_ep(&ep->com); return -ECONNRESET; } set_bit(ULP_REJECT, &ep->com.history); - BUG_ON(ep->com.state != MPA_REQ_RCVD); if (mpa_rev == 0) - abort_connection(ep, NULL, GFP_KERNEL); + disconnect = 2; else { err = send_mpa_reject(ep, pdata, pdata_len); disconnect = 1; } mutex_unlock(&ep->com.mutex); - if (disconnect) - err = c4iw_ep_disconnect(ep, 0, GFP_KERNEL); + if (disconnect) { + stop_ep_timer(ep); + err = c4iw_ep_disconnect(ep, disconnect == 2, GFP_KERNEL); + } c4iw_put_ep(&ep->com); return 0; } @@ -2869,24 +3060,23 @@ int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) struct c4iw_ep *ep = to_ep(cm_id); struct c4iw_dev *h = to_c4iw_dev(cm_id->device); struct c4iw_qp *qp = get_qhp(h, conn_param->qpn); + int abort = 0; PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid); mutex_lock(&ep->com.mutex); - if (ep->com.state == DEAD) { + if (ep->com.state != MPA_REQ_RCVD) { err = -ECONNRESET; - goto err; + goto err_out; } - BUG_ON(ep->com.state != MPA_REQ_RCVD); BUG_ON(!qp); set_bit(ULP_ACCEPT, &ep->com.history); if ((conn_param->ord > cur_max_read_depth(ep->com.dev)) || (conn_param->ird > cur_max_read_depth(ep->com.dev))) { - abort_connection(ep, NULL, GFP_KERNEL); err = -EINVAL; - goto err; + goto err_abort; } if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) { @@ -2898,9 +3088,8 @@ int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) ep->ord = conn_param->ord; send_mpa_reject(ep, conn_param->private_data, conn_param->private_data_len); - abort_connection(ep, NULL, GFP_KERNEL); err = -ENOMEM; - goto err; + goto err_abort; } } if (conn_param->ird < ep->ord) { @@ -2908,9 +3097,8 @@ int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) ep->ord <= h->rdev.lldi.max_ordird_qp) { conn_param->ird = ep->ord; } else { - abort_connection(ep, NULL, GFP_KERNEL); err = -ENOMEM; - goto err; + goto err_abort; } } } @@ -2929,8 +3117,8 @@ int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) PDBG("%s %d ird %d ord %d\n", __func__, __LINE__, ep->ird, ep->ord); - cm_id->add_ref(cm_id); ep->com.cm_id = cm_id; + ref_cm_id(&ep->com); ep->com.qp = qp; ref_qp(ep); @@ -2951,23 +3139,27 @@ int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) err = c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp, mask, &attrs, 1); if (err) - goto err1; + goto err_deref_cm_id; + + set_bit(STOP_MPA_TIMER, &ep->com.flags); err = send_mpa_reply(ep, conn_param->private_data, conn_param->private_data_len); if (err) - goto err1; + goto err_deref_cm_id; __state_set(&ep->com, FPDU_MODE); established_upcall(ep); mutex_unlock(&ep->com.mutex); c4iw_put_ep(&ep->com); return 0; -err1: - ep->com.cm_id = NULL; - abort_connection(ep, NULL, GFP_KERNEL); - cm_id->rem_ref(cm_id); -err: +err_deref_cm_id: + deref_cm_id(&ep->com); +err_abort: + abort = 1; +err_out: mutex_unlock(&ep->com.mutex); + if (abort) + c4iw_ep_disconnect(ep, 1, GFP_KERNEL); c4iw_put_ep(&ep->com); return err; } @@ -3067,9 +3259,9 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) if (peer2peer && ep->ord == 0) ep->ord = 1; - cm_id->add_ref(cm_id); - ep->com.dev = dev; ep->com.cm_id = cm_id; + ref_cm_id(&ep->com); + ep->com.dev = dev; ep->com.qp = get_qhp(dev, conn_param->qpn); if (!ep->com.qp) { PDBG("%s qpn 0x%x not found!\n", __func__, conn_param->qpn); @@ -3108,7 +3300,7 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) /* * Handle loopback requests to INADDR_ANY. */ - if ((__force int)raddr->sin_addr.s_addr == INADDR_ANY) { + if (raddr->sin_addr.s_addr == htonl(INADDR_ANY)) { err = pick_local_ipaddrs(dev, cm_id); if (err) goto fail1; @@ -3176,7 +3368,7 @@ fail2: remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid); cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid); fail1: - cm_id->rem_ref(cm_id); + deref_cm_id(&ep->com); c4iw_put_ep(&ep->com); out: return err; @@ -3270,8 +3462,8 @@ int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog) goto fail1; } PDBG("%s ep %p\n", __func__, ep); - cm_id->add_ref(cm_id); ep->com.cm_id = cm_id; + ref_cm_id(&ep->com); ep->com.dev = dev; ep->backlog = backlog; memcpy(&ep->com.local_addr, &cm_id->m_local_addr, @@ -3311,7 +3503,7 @@ int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog) cxgb4_free_stid(ep->com.dev->rdev.lldi.tids, ep->stid, ep->com.local_addr.ss_family); fail2: - cm_id->rem_ref(cm_id); + deref_cm_id(&ep->com); c4iw_put_ep(&ep->com); fail1: out: @@ -3350,7 +3542,7 @@ int c4iw_destroy_listen(struct iw_cm_id *cm_id) cxgb4_free_stid(ep->com.dev->rdev.lldi.tids, ep->stid, ep->com.local_addr.ss_family); done: - cm_id->rem_ref(cm_id); + deref_cm_id(&ep->com); c4iw_put_ep(&ep->com); return err; } @@ -3367,6 +3559,12 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp) PDBG("%s ep %p state %s, abrupt %d\n", __func__, ep, states[ep->com.state], abrupt); + /* + * Ref the ep here in case we have fatal errors causing the + * ep to be released and freed. + */ + c4iw_get_ep(&ep->com); + rdev = &ep->com.dev->rdev; if (c4iw_fatal_error(rdev)) { fatal = 1; @@ -3418,10 +3616,30 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp) set_bit(EP_DISC_CLOSE, &ep->com.history); ret = send_halfclose(ep, gfp); } - if (ret) + if (ret) { + set_bit(EP_DISC_FAIL, &ep->com.history); + if (!abrupt) { + stop_ep_timer(ep); + close_complete_upcall(ep, -EIO); + } + if (ep->com.qp) { + struct c4iw_qp_attributes attrs; + + attrs.next_state = C4IW_QP_STATE_ERROR; + ret = c4iw_modify_qp(ep->com.qp->rhp, + ep->com.qp, + C4IW_QP_ATTR_NEXT_STATE, + &attrs, 1); + if (ret) + pr_err(MOD + "%s - qp <- error failed!\n", + __func__); + } fatal = 1; + } } mutex_unlock(&ep->com.mutex); + c4iw_put_ep(&ep->com); if (fatal) release_ep_resources(ep); return ret; @@ -3676,7 +3894,7 @@ static int rx_pkt(struct c4iw_dev *dev, struct sk_buff *skb) struct cpl_pass_accept_req *req = (void *)(rss + 1); struct l2t_entry *e; struct dst_entry *dst; - struct c4iw_ep *lep; + struct c4iw_ep *lep = NULL; u16 window; struct port_info *pi; struct net_device *pdev; @@ -3701,7 +3919,7 @@ static int rx_pkt(struct c4iw_dev *dev, struct sk_buff *skb) */ stid = (__force int) cpu_to_be32((__force u32) rss->hash_val); - lep = (struct c4iw_ep *)lookup_stid(dev->rdev.lldi.tids, stid); + lep = (struct c4iw_ep *)get_ep_from_stid(dev, stid); if (!lep) { PDBG("%s connect request on invalid stid %d\n", __func__, stid); goto reject; @@ -3802,6 +4020,8 @@ static int rx_pkt(struct c4iw_dev *dev, struct sk_buff *skb) free_dst: dst_release(dst); reject: + if (lep) + c4iw_put_ep(&lep->com); return 0; } @@ -3809,7 +4029,7 @@ reject: * These are the real handlers that are called from a * work queue. */ -static c4iw_handler_func work_handlers[NUM_CPL_CMDS] = { +static c4iw_handler_func work_handlers[NUM_CPL_CMDS + NUM_FAKE_CPLS] = { [CPL_ACT_ESTABLISH] = act_establish, [CPL_ACT_OPEN_RPL] = act_open_rpl, [CPL_RX_DATA] = rx_data, @@ -3825,7 +4045,9 @@ static c4iw_handler_func work_handlers[NUM_CPL_CMDS] = { [CPL_RDMA_TERMINATE] = terminate, [CPL_FW4_ACK] = fw4_ack, [CPL_FW6_MSG] = deferred_fw6_msg, - [CPL_RX_PKT] = rx_pkt + [CPL_RX_PKT] = rx_pkt, + [FAKE_CPL_PUT_EP_SAFE] = _put_ep_safe, + [FAKE_CPL_PASS_PUT_EP_SAFE] = _put_pass_ep_safe }; static void process_timeout(struct c4iw_ep *ep) @@ -3839,11 +4061,12 @@ static void process_timeout(struct c4iw_ep *ep) set_bit(TIMEDOUT, &ep->com.history); switch (ep->com.state) { case MPA_REQ_SENT: - __state_set(&ep->com, ABORTING); connect_reply_upcall(ep, -ETIMEDOUT); break; case MPA_REQ_WAIT: - __state_set(&ep->com, ABORTING); + case MPA_REQ_RCVD: + case MPA_REP_SENT: + case FPDU_MODE: break; case CLOSING: case MORIBUND: @@ -3853,7 +4076,6 @@ static void process_timeout(struct c4iw_ep *ep) ep->com.qp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 1); } - __state_set(&ep->com, ABORTING); close_complete_upcall(ep, -ETIMEDOUT); break; case ABORTING: @@ -3871,9 +4093,9 @@ static void process_timeout(struct c4iw_ep *ep) __func__, ep, ep->hwtid, ep->com.state); abort = 0; } - if (abort) - abort_connection(ep, NULL, GFP_KERNEL); mutex_unlock(&ep->com.mutex); + if (abort) + c4iw_ep_disconnect(ep, 1, GFP_KERNEL); c4iw_put_ep(&ep->com); } @@ -4006,10 +4228,10 @@ static int peer_abort_intr(struct c4iw_dev *dev, struct sk_buff *skb) { struct cpl_abort_req_rss *req = cplhdr(skb); struct c4iw_ep *ep; - struct tid_info *t = dev->rdev.lldi.tids; unsigned int tid = GET_TID(req); - ep = lookup_tid(t, tid); + ep = get_ep_from_tid(dev, tid); + /* This EP will be dereferenced in peer_abort() */ if (!ep) { printk(KERN_WARNING MOD "Abort on non-existent endpoint, tid %d\n", tid); @@ -4020,24 +4242,13 @@ static int peer_abort_intr(struct c4iw_dev *dev, struct sk_buff *skb) PDBG("%s Negative advice on abort- tid %u status %d (%s)\n", __func__, ep->hwtid, req->status, neg_adv_str(req->status)); - ep->stats.abort_neg_adv++; - dev->rdev.stats.neg_adv++; - kfree_skb(skb); - return 0; + goto out; } PDBG("%s ep %p tid %u state %u\n", __func__, ep, ep->hwtid, ep->com.state); - /* - * Wake up any threads in rdma_init() or rdma_fini(). - * However, if we are on MPAv2 and want to retry with MPAv1 - * then, don't wake up yet. - */ - if (mpa_rev == 2 && !ep->tried_with_mpa_v1) { - if (ep->com.state != MPA_REQ_SENT) - c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET); - } else - c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET); + c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET); +out: sched(dev, skb); return 0; } diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h index df43f871a..f6f34a75a 100644 --- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -755,6 +755,7 @@ enum c4iw_ep_flags { CLOSE_SENT = 3, TIMEOUT = 4, QP_REFERENCED = 5, + STOP_MPA_TIMER = 7, }; enum c4iw_ep_history { @@ -779,7 +780,13 @@ enum c4iw_ep_history { EP_DISC_ABORT = 18, CONN_RPL_UPCALL = 19, ACT_RETRY_NOMEM = 20, - ACT_RETRY_INUSE = 21 + ACT_RETRY_INUSE = 21, + CLOSE_CON_RPL = 22, + EP_DISC_FAIL = 24, + QP_REFED = 25, + QP_DEREFED = 26, + CM_ID_REFED = 27, + CM_ID_DEREFED = 28, }; struct c4iw_ep_common { @@ -917,9 +924,8 @@ void c4iw_qp_rem_ref(struct ib_qp *qp); struct ib_mr *c4iw_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg); -int c4iw_map_mr_sg(struct ib_mr *ibmr, - struct scatterlist *sg, - int sg_nents); +int c4iw_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset); int c4iw_dealloc_mw(struct ib_mw *mw); struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, struct ib_udata *udata); diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c index 008be07d5..55d0651ee 100644 --- a/drivers/infiniband/hw/cxgb4/mem.c +++ b/drivers/infiniband/hw/cxgb4/mem.c @@ -86,8 +86,9 @@ static int _c4iw_write_mem_dma_aligned(struct c4iw_rdev *rdev, u32 addr, (wait ? FW_WR_COMPL_F : 0)); req->wr.wr_lo = wait ? (__force __be64)(unsigned long) &wr_wait : 0L; req->wr.wr_mid = cpu_to_be32(FW_WR_LEN16_V(DIV_ROUND_UP(wr_len, 16))); - req->cmd = cpu_to_be32(ULPTX_CMD_V(ULP_TX_MEM_WRITE)); - req->cmd |= cpu_to_be32(T5_ULP_MEMIO_ORDER_V(1)); + req->cmd = cpu_to_be32(ULPTX_CMD_V(ULP_TX_MEM_WRITE) | + T5_ULP_MEMIO_ORDER_V(1) | + T5_ULP_MEMIO_FID_V(rdev->lldi.rxq_ids[0])); req->dlen = cpu_to_be32(ULP_MEMIO_DATA_LEN_V(len>>5)); req->len16 = cpu_to_be32(DIV_ROUND_UP(wr_len-sizeof(req->wr), 16)); req->lock_addr = cpu_to_be32(ULP_MEMIO_ADDR_V(addr)); @@ -690,15 +691,14 @@ static int c4iw_set_page(struct ib_mr *ibmr, u64 addr) return 0; } -int c4iw_map_mr_sg(struct ib_mr *ibmr, - struct scatterlist *sg, - int sg_nents) +int c4iw_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset) { struct c4iw_mr *mhp = to_c4iw_mr(ibmr); mhp->mpl_len = 0; - return ib_sg_to_pages(ibmr, sg, sg_nents, c4iw_set_page); + return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, c4iw_set_page); } int c4iw_dereg_mr(struct ib_mr *ib_mr) diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index 7574f394f..dd8a86b72 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -446,20 +446,59 @@ static ssize_t show_board(struct device *dev, struct device_attribute *attr, c4iw_dev->rdev.lldi.pdev->device); } +enum counters { + IP4INSEGS, + IP4OUTSEGS, + IP4RETRANSSEGS, + IP4OUTRSTS, + IP6INSEGS, + IP6OUTSEGS, + IP6RETRANSSEGS, + IP6OUTRSTS, + NR_COUNTERS +}; + +static const char * const names[] = { + [IP4INSEGS] = "ip4InSegs", + [IP4OUTSEGS] = "ip4OutSegs", + [IP4RETRANSSEGS] = "ip4RetransSegs", + [IP4OUTRSTS] = "ip4OutRsts", + [IP6INSEGS] = "ip6InSegs", + [IP6OUTSEGS] = "ip6OutSegs", + [IP6RETRANSSEGS] = "ip6RetransSegs", + [IP6OUTRSTS] = "ip6OutRsts" +}; + +static struct rdma_hw_stats *c4iw_alloc_stats(struct ib_device *ibdev, + u8 port_num) +{ + BUILD_BUG_ON(ARRAY_SIZE(names) != NR_COUNTERS); + + if (port_num != 0) + return NULL; + + return rdma_alloc_hw_stats_struct(names, NR_COUNTERS, + RDMA_HW_STATS_DEFAULT_LIFESPAN); +} + static int c4iw_get_mib(struct ib_device *ibdev, - union rdma_protocol_stats *stats) + struct rdma_hw_stats *stats, + u8 port, int index) { struct tp_tcp_stats v4, v6; struct c4iw_dev *c4iw_dev = to_c4iw_dev(ibdev); cxgb4_get_tcp_stats(c4iw_dev->rdev.lldi.pdev, &v4, &v6); - memset(stats, 0, sizeof *stats); - stats->iw.tcpInSegs = v4.tcp_in_segs + v6.tcp_in_segs; - stats->iw.tcpOutSegs = v4.tcp_out_segs + v6.tcp_out_segs; - stats->iw.tcpRetransSegs = v4.tcp_retrans_segs + v6.tcp_retrans_segs; - stats->iw.tcpOutRsts = v4.tcp_out_rsts + v6.tcp_out_rsts; - - return 0; + stats->value[IP4INSEGS] = v4.tcp_in_segs; + stats->value[IP4OUTSEGS] = v4.tcp_out_segs; + stats->value[IP4RETRANSSEGS] = v4.tcp_retrans_segs; + stats->value[IP4OUTRSTS] = v4.tcp_out_rsts; + stats->value[IP6INSEGS] = v6.tcp_in_segs; + stats->value[IP6OUTSEGS] = v6.tcp_out_segs; + stats->value[IP6RETRANSSEGS] = v6.tcp_retrans_segs; + stats->value[IP6OUTRSTS] = v6.tcp_out_rsts; + + return stats->num_counters; } static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); @@ -562,7 +601,8 @@ int c4iw_register_device(struct c4iw_dev *dev) dev->ibdev.req_notify_cq = c4iw_arm_cq; dev->ibdev.post_send = c4iw_post_send; dev->ibdev.post_recv = c4iw_post_receive; - dev->ibdev.get_protocol_stats = c4iw_get_mib; + dev->ibdev.alloc_hw_stats = c4iw_alloc_stats; + dev->ibdev.get_hw_stats = c4iw_get_mib; dev->ibdev.uverbs_abi_ver = C4IW_UVERBS_ABI_VERSION; dev->ibdev.get_port_immutable = c4iw_port_immutable; dev->ibdev.drain_sq = c4iw_drain_sq; diff --git a/drivers/infiniband/hw/hfi1/Kconfig b/drivers/infiniband/hw/hfi1/Kconfig new file mode 100644 index 000000000..f846fd51b --- /dev/null +++ b/drivers/infiniband/hw/hfi1/Kconfig @@ -0,0 +1,28 @@ +config INFINIBAND_HFI1 + tristate "Intel OPA Gen1 support" + depends on X86_64 && INFINIBAND_RDMAVT + select MMU_NOTIFIER + select CRC32 + ---help--- + This is a low-level driver for Intel OPA Gen1 adapter. +config HFI1_DEBUG_SDMA_ORDER + bool "HFI1 SDMA Order debug" + depends on INFINIBAND_HFI1 + default n + ---help--- + This is a debug flag to test for out of order + sdma completions for unit testing +config HFI1_VERBS_31BIT_PSN + bool "HFI1 enable 31 bit PSN" + depends on INFINIBAND_HFI1 + default y + ---help--- + Setting this enables 31 BIT PSN + For verbs RC/UC +config SDMA_VERBOSITY + bool "Config SDMA Verbosity" + depends on INFINIBAND_HFI1 + default n + ---help--- + This is a configuration flag to enable verbose + SDMA debug diff --git a/drivers/infiniband/hw/hfi1/Makefile b/drivers/infiniband/hw/hfi1/Makefile new file mode 100644 index 000000000..9b5382c94 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/Makefile @@ -0,0 +1,21 @@ +# +# HFI driver +# +# +# +# Called from the kernel module build system. +# +obj-$(CONFIG_INFINIBAND_HFI1) += hfi1.o + +hfi1-y := affinity.o chip.o device.o driver.o efivar.o \ + eprom.o file_ops.o firmware.o \ + init.o intr.o mad.o mmu_rb.o pcie.o pio.o pio_copy.o platform.o \ + qp.o qsfp.o rc.o ruc.o sdma.o sysfs.o trace.o twsi.o \ + uc.o ud.o user_exp_rcv.o user_pages.o user_sdma.o verbs.o \ + verbs_txreq.o +hfi1-$(CONFIG_DEBUG_FS) += debugfs.o + +CFLAGS_trace.o = -I$(src) +ifdef MVERSION +CFLAGS_driver.o = -DHFI_DRIVER_VERSION_BASE=\"$(MVERSION)\" +endif diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c new file mode 100644 index 000000000..14d7eeb09 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/affinity.c @@ -0,0 +1,428 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#include <linux/topology.h> +#include <linux/cpumask.h> +#include <linux/module.h> + +#include "hfi.h" +#include "affinity.h" +#include "sdma.h" +#include "trace.h" + +/* Name of IRQ types, indexed by enum irq_type */ +static const char * const irq_type_names[] = { + "SDMA", + "RCVCTXT", + "GENERAL", + "OTHER", +}; + +static inline void init_cpu_mask_set(struct cpu_mask_set *set) +{ + cpumask_clear(&set->mask); + cpumask_clear(&set->used); + set->gen = 0; +} + +/* Initialize non-HT cpu cores mask */ +int init_real_cpu_mask(struct hfi1_devdata *dd) +{ + struct hfi1_affinity *info; + int possible, curr_cpu, i, ht; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return -ENOMEM; + + cpumask_clear(&info->real_cpu_mask); + + /* Start with cpu online mask as the real cpu mask */ + cpumask_copy(&info->real_cpu_mask, cpu_online_mask); + + /* + * Remove HT cores from the real cpu mask. Do this in two steps below. + */ + possible = cpumask_weight(&info->real_cpu_mask); + ht = cpumask_weight(topology_sibling_cpumask( + cpumask_first(&info->real_cpu_mask))); + /* + * Step 1. Skip over the first N HT siblings and use them as the + * "real" cores. Assumes that HT cores are not enumerated in + * succession (except in the single core case). + */ + curr_cpu = cpumask_first(&info->real_cpu_mask); + for (i = 0; i < possible / ht; i++) + curr_cpu = cpumask_next(curr_cpu, &info->real_cpu_mask); + /* + * Step 2. Remove the remaining HT siblings. Use cpumask_next() to + * skip any gaps. + */ + for (; i < possible; i++) { + cpumask_clear_cpu(curr_cpu, &info->real_cpu_mask); + curr_cpu = cpumask_next(curr_cpu, &info->real_cpu_mask); + } + + dd->affinity = info; + return 0; +} + +/* + * Interrupt affinity. + * + * non-rcv avail gets a default mask that + * starts as possible cpus with threads reset + * and each rcv avail reset. + * + * rcv avail gets node relative 1 wrapping back + * to the node relative 1 as necessary. + * + */ +void hfi1_dev_affinity_init(struct hfi1_devdata *dd) +{ + int node = pcibus_to_node(dd->pcidev->bus); + struct hfi1_affinity *info = dd->affinity; + const struct cpumask *local_mask; + int curr_cpu, possible, i; + + if (node < 0) + node = numa_node_id(); + dd->node = node; + + spin_lock_init(&info->lock); + + init_cpu_mask_set(&info->def_intr); + init_cpu_mask_set(&info->rcv_intr); + init_cpu_mask_set(&info->proc); + + local_mask = cpumask_of_node(dd->node); + if (cpumask_first(local_mask) >= nr_cpu_ids) + local_mask = topology_core_cpumask(0); + /* Use the "real" cpu mask of this node as the default */ + cpumask_and(&info->def_intr.mask, &info->real_cpu_mask, local_mask); + + /* fill in the receive list */ + possible = cpumask_weight(&info->def_intr.mask); + curr_cpu = cpumask_first(&info->def_intr.mask); + if (possible == 1) { + /* only one CPU, everyone will use it */ + cpumask_set_cpu(curr_cpu, &info->rcv_intr.mask); + } else { + /* + * Retain the first CPU in the default list for the control + * context. + */ + curr_cpu = cpumask_next(curr_cpu, &info->def_intr.mask); + /* + * Remove the remaining kernel receive queues from + * the default list and add them to the receive list. + */ + for (i = 0; i < dd->n_krcv_queues - 1; i++) { + cpumask_clear_cpu(curr_cpu, &info->def_intr.mask); + cpumask_set_cpu(curr_cpu, &info->rcv_intr.mask); + curr_cpu = cpumask_next(curr_cpu, &info->def_intr.mask); + if (curr_cpu >= nr_cpu_ids) + break; + } + } + + cpumask_copy(&info->proc.mask, cpu_online_mask); +} + +void hfi1_dev_affinity_free(struct hfi1_devdata *dd) +{ + kfree(dd->affinity); +} + +int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix) +{ + int ret; + cpumask_var_t diff; + struct cpu_mask_set *set; + struct sdma_engine *sde = NULL; + struct hfi1_ctxtdata *rcd = NULL; + char extra[64]; + int cpu = -1; + + extra[0] = '\0'; + cpumask_clear(&msix->mask); + + ret = zalloc_cpumask_var(&diff, GFP_KERNEL); + if (!ret) + return -ENOMEM; + + switch (msix->type) { + case IRQ_SDMA: + sde = (struct sdma_engine *)msix->arg; + scnprintf(extra, 64, "engine %u", sde->this_idx); + /* fall through */ + case IRQ_GENERAL: + set = &dd->affinity->def_intr; + break; + case IRQ_RCVCTXT: + rcd = (struct hfi1_ctxtdata *)msix->arg; + if (rcd->ctxt == HFI1_CTRL_CTXT) { + set = &dd->affinity->def_intr; + cpu = cpumask_first(&set->mask); + } else { + set = &dd->affinity->rcv_intr; + } + scnprintf(extra, 64, "ctxt %u", rcd->ctxt); + break; + default: + dd_dev_err(dd, "Invalid IRQ type %d\n", msix->type); + return -EINVAL; + } + + /* + * The control receive context is placed on a particular CPU, which + * is set above. Skip accounting for it. Everything else finds its + * CPU here. + */ + if (cpu == -1) { + spin_lock(&dd->affinity->lock); + if (cpumask_equal(&set->mask, &set->used)) { + /* + * We've used up all the CPUs, bump up the generation + * and reset the 'used' map + */ + set->gen++; + cpumask_clear(&set->used); + } + cpumask_andnot(diff, &set->mask, &set->used); + cpu = cpumask_first(diff); + cpumask_set_cpu(cpu, &set->used); + spin_unlock(&dd->affinity->lock); + } + + switch (msix->type) { + case IRQ_SDMA: + sde->cpu = cpu; + break; + case IRQ_GENERAL: + case IRQ_RCVCTXT: + case IRQ_OTHER: + break; + } + + cpumask_set_cpu(cpu, &msix->mask); + dd_dev_info(dd, "IRQ vector: %u, type %s %s -> cpu: %d\n", + msix->msix.vector, irq_type_names[msix->type], + extra, cpu); + irq_set_affinity_hint(msix->msix.vector, &msix->mask); + + free_cpumask_var(diff); + return 0; +} + +void hfi1_put_irq_affinity(struct hfi1_devdata *dd, + struct hfi1_msix_entry *msix) +{ + struct cpu_mask_set *set = NULL; + struct hfi1_ctxtdata *rcd; + + switch (msix->type) { + case IRQ_SDMA: + case IRQ_GENERAL: + set = &dd->affinity->def_intr; + break; + case IRQ_RCVCTXT: + rcd = (struct hfi1_ctxtdata *)msix->arg; + /* only do accounting for non control contexts */ + if (rcd->ctxt != HFI1_CTRL_CTXT) + set = &dd->affinity->rcv_intr; + break; + default: + return; + } + + if (set) { + spin_lock(&dd->affinity->lock); + cpumask_andnot(&set->used, &set->used, &msix->mask); + if (cpumask_empty(&set->used) && set->gen) { + set->gen--; + cpumask_copy(&set->used, &set->mask); + } + spin_unlock(&dd->affinity->lock); + } + + irq_set_affinity_hint(msix->msix.vector, NULL); + cpumask_clear(&msix->mask); +} + +int hfi1_get_proc_affinity(struct hfi1_devdata *dd, int node) +{ + int cpu = -1, ret; + cpumask_var_t diff, mask, intrs; + const struct cpumask *node_mask, + *proc_mask = tsk_cpus_allowed(current); + struct cpu_mask_set *set = &dd->affinity->proc; + + /* + * check whether process/context affinity has already + * been set + */ + if (cpumask_weight(proc_mask) == 1) { + hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl", + current->pid, current->comm, + cpumask_pr_args(proc_mask)); + /* + * Mark the pre-set CPU as used. This is atomic so we don't + * need the lock + */ + cpu = cpumask_first(proc_mask); + cpumask_set_cpu(cpu, &set->used); + goto done; + } else if (cpumask_weight(proc_mask) < cpumask_weight(&set->mask)) { + hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl", + current->pid, current->comm, + cpumask_pr_args(proc_mask)); + goto done; + } + + /* + * The process does not have a preset CPU affinity so find one to + * recommend. We prefer CPUs on the same NUMA as the device. + */ + + ret = zalloc_cpumask_var(&diff, GFP_KERNEL); + if (!ret) + goto done; + ret = zalloc_cpumask_var(&mask, GFP_KERNEL); + if (!ret) + goto free_diff; + ret = zalloc_cpumask_var(&intrs, GFP_KERNEL); + if (!ret) + goto free_mask; + + spin_lock(&dd->affinity->lock); + /* + * If we've used all available CPUs, clear the mask and start + * overloading. + */ + if (cpumask_equal(&set->mask, &set->used)) { + set->gen++; + cpumask_clear(&set->used); + } + + /* CPUs used by interrupt handlers */ + cpumask_copy(intrs, (dd->affinity->def_intr.gen ? + &dd->affinity->def_intr.mask : + &dd->affinity->def_intr.used)); + cpumask_or(intrs, intrs, (dd->affinity->rcv_intr.gen ? + &dd->affinity->rcv_intr.mask : + &dd->affinity->rcv_intr.used)); + hfi1_cdbg(PROC, "CPUs used by interrupts: %*pbl", + cpumask_pr_args(intrs)); + + /* + * If we don't have a NUMA node requested, preference is towards + * device NUMA node + */ + if (node == -1) + node = dd->node; + node_mask = cpumask_of_node(node); + hfi1_cdbg(PROC, "device on NUMA %u, CPUs %*pbl", node, + cpumask_pr_args(node_mask)); + + /* diff will hold all unused cpus */ + cpumask_andnot(diff, &set->mask, &set->used); + hfi1_cdbg(PROC, "unused CPUs (all) %*pbl", cpumask_pr_args(diff)); + + /* get cpumask of available CPUs on preferred NUMA */ + cpumask_and(mask, diff, node_mask); + hfi1_cdbg(PROC, "available cpus on NUMA %*pbl", cpumask_pr_args(mask)); + + /* + * At first, we don't want to place processes on the same + * CPUs as interrupt handlers. + */ + cpumask_andnot(diff, mask, intrs); + if (!cpumask_empty(diff)) + cpumask_copy(mask, diff); + + /* + * if we don't have a cpu on the preferred NUMA, get + * the list of the remaining available CPUs + */ + if (cpumask_empty(mask)) { + cpumask_andnot(diff, &set->mask, &set->used); + cpumask_andnot(mask, diff, node_mask); + } + hfi1_cdbg(PROC, "possible CPUs for process %*pbl", + cpumask_pr_args(mask)); + + cpu = cpumask_first(mask); + if (cpu >= nr_cpu_ids) /* empty */ + cpu = -1; + else + cpumask_set_cpu(cpu, &set->used); + spin_unlock(&dd->affinity->lock); + + free_cpumask_var(intrs); +free_mask: + free_cpumask_var(mask); +free_diff: + free_cpumask_var(diff); +done: + return cpu; +} + +void hfi1_put_proc_affinity(struct hfi1_devdata *dd, int cpu) +{ + struct cpu_mask_set *set = &dd->affinity->proc; + + if (cpu < 0) + return; + spin_lock(&dd->affinity->lock); + cpumask_clear_cpu(cpu, &set->used); + if (cpumask_empty(&set->used) && set->gen) { + set->gen--; + cpumask_copy(&set->used, &set->mask); + } + spin_unlock(&dd->affinity->lock); +} + diff --git a/drivers/infiniband/hw/hfi1/affinity.h b/drivers/infiniband/hw/hfi1/affinity.h new file mode 100644 index 000000000..20f52fe74 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/affinity.h @@ -0,0 +1,108 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#ifndef _HFI1_AFFINITY_H +#define _HFI1_AFFINITY_H + +#include "hfi.h" + +enum irq_type { + IRQ_SDMA, + IRQ_RCVCTXT, + IRQ_GENERAL, + IRQ_OTHER +}; + +/* Can be used for both memory and cpu */ +enum affinity_flags { + AFF_AUTO, + AFF_NUMA_LOCAL, + AFF_DEV_LOCAL, + AFF_IRQ_LOCAL +}; + +struct cpu_mask_set { + struct cpumask mask; + struct cpumask used; + uint gen; +}; + +struct hfi1_affinity { + struct cpu_mask_set def_intr; + struct cpu_mask_set rcv_intr; + struct cpu_mask_set proc; + struct cpumask real_cpu_mask; + /* spin lock to protect affinity struct */ + spinlock_t lock; +}; + +struct hfi1_msix_entry; + +/* Initialize non-HT cpu cores mask */ +int init_real_cpu_mask(struct hfi1_devdata *); +/* Initialize driver affinity data */ +void hfi1_dev_affinity_init(struct hfi1_devdata *); +/* Free driver affinity data */ +void hfi1_dev_affinity_free(struct hfi1_devdata *); +/* + * Set IRQ affinity to a CPU. The function will determine the + * CPU and set the affinity to it. + */ +int hfi1_get_irq_affinity(struct hfi1_devdata *, struct hfi1_msix_entry *); +/* + * Remove the IRQ's CPU affinity. This function also updates + * any internal CPU tracking data + */ +void hfi1_put_irq_affinity(struct hfi1_devdata *, struct hfi1_msix_entry *); +/* + * Determine a CPU affinity for a user process, if the process does not + * have an affinity set yet. + */ +int hfi1_get_proc_affinity(struct hfi1_devdata *, int); +/* Release a CPU used by a user process. */ +void hfi1_put_proc_affinity(struct hfi1_devdata *, int); + +#endif /* _HFI1_AFFINITY_H */ diff --git a/drivers/infiniband/hw/hfi1/aspm.h b/drivers/infiniband/hw/hfi1/aspm.h new file mode 100644 index 000000000..0d58fe3b4 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/aspm.h @@ -0,0 +1,309 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#ifndef _ASPM_H +#define _ASPM_H + +#include "hfi.h" + +extern uint aspm_mode; + +enum aspm_mode { + ASPM_MODE_DISABLED = 0, /* ASPM always disabled, performance mode */ + ASPM_MODE_ENABLED = 1, /* ASPM always enabled, power saving mode */ + ASPM_MODE_DYNAMIC = 2, /* ASPM enabled/disabled dynamically */ +}; + +/* Time after which the timer interrupt will re-enable ASPM */ +#define ASPM_TIMER_MS 1000 +/* Time for which interrupts are ignored after a timer has been scheduled */ +#define ASPM_RESCHED_TIMER_MS (ASPM_TIMER_MS / 2) +/* Two interrupts within this time trigger ASPM disable */ +#define ASPM_TRIGGER_MS 1 +#define ASPM_TRIGGER_NS (ASPM_TRIGGER_MS * 1000 * 1000ull) +#define ASPM_L1_SUPPORTED(reg) \ + (((reg & PCI_EXP_LNKCAP_ASPMS) >> 10) & 0x2) + +static inline bool aspm_hw_l1_supported(struct hfi1_devdata *dd) +{ + struct pci_dev *parent = dd->pcidev->bus->self; + u32 up, dn; + + /* + * If the driver does not have access to the upstream component, + * it cannot support ASPM L1 at all. + */ + if (!parent) + return false; + + pcie_capability_read_dword(dd->pcidev, PCI_EXP_LNKCAP, &dn); + dn = ASPM_L1_SUPPORTED(dn); + + pcie_capability_read_dword(parent, PCI_EXP_LNKCAP, &up); + up = ASPM_L1_SUPPORTED(up); + + /* ASPM works on A-step but is reported as not supported */ + return (!!dn || is_ax(dd)) && !!up; +} + +/* Set L1 entrance latency for slower entry to L1 */ +static inline void aspm_hw_set_l1_ent_latency(struct hfi1_devdata *dd) +{ + u32 l1_ent_lat = 0x4u; + u32 reg32; + + pci_read_config_dword(dd->pcidev, PCIE_CFG_REG_PL3, ®32); + reg32 &= ~PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SMASK; + reg32 |= l1_ent_lat << PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SHIFT; + pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL3, reg32); +} + +static inline void aspm_hw_enable_l1(struct hfi1_devdata *dd) +{ + struct pci_dev *parent = dd->pcidev->bus->self; + + /* + * If the driver does not have access to the upstream component, + * it cannot support ASPM L1 at all. + */ + if (!parent) + return; + + /* Enable ASPM L1 first in upstream component and then downstream */ + pcie_capability_clear_and_set_word(parent, PCI_EXP_LNKCTL, + PCI_EXP_LNKCTL_ASPMC, + PCI_EXP_LNKCTL_ASPM_L1); + pcie_capability_clear_and_set_word(dd->pcidev, PCI_EXP_LNKCTL, + PCI_EXP_LNKCTL_ASPMC, + PCI_EXP_LNKCTL_ASPM_L1); +} + +static inline void aspm_hw_disable_l1(struct hfi1_devdata *dd) +{ + struct pci_dev *parent = dd->pcidev->bus->self; + + /* Disable ASPM L1 first in downstream component and then upstream */ + pcie_capability_clear_and_set_word(dd->pcidev, PCI_EXP_LNKCTL, + PCI_EXP_LNKCTL_ASPMC, 0x0); + if (parent) + pcie_capability_clear_and_set_word(parent, PCI_EXP_LNKCTL, + PCI_EXP_LNKCTL_ASPMC, 0x0); +} + +static inline void aspm_enable(struct hfi1_devdata *dd) +{ + if (dd->aspm_enabled || aspm_mode == ASPM_MODE_DISABLED || + !dd->aspm_supported) + return; + + aspm_hw_enable_l1(dd); + dd->aspm_enabled = true; +} + +static inline void aspm_disable(struct hfi1_devdata *dd) +{ + if (!dd->aspm_enabled || aspm_mode == ASPM_MODE_ENABLED) + return; + + aspm_hw_disable_l1(dd); + dd->aspm_enabled = false; +} + +static inline void aspm_disable_inc(struct hfi1_devdata *dd) +{ + unsigned long flags; + + spin_lock_irqsave(&dd->aspm_lock, flags); + aspm_disable(dd); + atomic_inc(&dd->aspm_disabled_cnt); + spin_unlock_irqrestore(&dd->aspm_lock, flags); +} + +static inline void aspm_enable_dec(struct hfi1_devdata *dd) +{ + unsigned long flags; + + spin_lock_irqsave(&dd->aspm_lock, flags); + if (atomic_dec_and_test(&dd->aspm_disabled_cnt)) + aspm_enable(dd); + spin_unlock_irqrestore(&dd->aspm_lock, flags); +} + +/* ASPM processing for each receive context interrupt */ +static inline void aspm_ctx_disable(struct hfi1_ctxtdata *rcd) +{ + bool restart_timer; + bool close_interrupts; + unsigned long flags; + ktime_t now, prev; + + /* Quickest exit for minimum impact */ + if (!rcd->aspm_intr_supported) + return; + + spin_lock_irqsave(&rcd->aspm_lock, flags); + /* PSM contexts are open */ + if (!rcd->aspm_intr_enable) + goto unlock; + + prev = rcd->aspm_ts_last_intr; + now = ktime_get(); + rcd->aspm_ts_last_intr = now; + + /* An interrupt pair close together in time */ + close_interrupts = ktime_to_ns(ktime_sub(now, prev)) < ASPM_TRIGGER_NS; + + /* Don't push out our timer till this much time has elapsed */ + restart_timer = ktime_to_ns(ktime_sub(now, rcd->aspm_ts_timer_sched)) > + ASPM_RESCHED_TIMER_MS * NSEC_PER_MSEC; + restart_timer = restart_timer && close_interrupts; + + /* Disable ASPM and schedule timer */ + if (rcd->aspm_enabled && close_interrupts) { + aspm_disable_inc(rcd->dd); + rcd->aspm_enabled = false; + restart_timer = true; + } + + if (restart_timer) { + mod_timer(&rcd->aspm_timer, + jiffies + msecs_to_jiffies(ASPM_TIMER_MS)); + rcd->aspm_ts_timer_sched = now; + } +unlock: + spin_unlock_irqrestore(&rcd->aspm_lock, flags); +} + +/* Timer function for re-enabling ASPM in the absence of interrupt activity */ +static inline void aspm_ctx_timer_function(unsigned long data) +{ + struct hfi1_ctxtdata *rcd = (struct hfi1_ctxtdata *)data; + unsigned long flags; + + spin_lock_irqsave(&rcd->aspm_lock, flags); + aspm_enable_dec(rcd->dd); + rcd->aspm_enabled = true; + spin_unlock_irqrestore(&rcd->aspm_lock, flags); +} + +/* Disable interrupt processing for verbs contexts when PSM contexts are open */ +static inline void aspm_disable_all(struct hfi1_devdata *dd) +{ + struct hfi1_ctxtdata *rcd; + unsigned long flags; + unsigned i; + + for (i = 0; i < dd->first_user_ctxt; i++) { + rcd = dd->rcd[i]; + del_timer_sync(&rcd->aspm_timer); + spin_lock_irqsave(&rcd->aspm_lock, flags); + rcd->aspm_intr_enable = false; + spin_unlock_irqrestore(&rcd->aspm_lock, flags); + } + + aspm_disable(dd); + atomic_set(&dd->aspm_disabled_cnt, 0); +} + +/* Re-enable interrupt processing for verbs contexts */ +static inline void aspm_enable_all(struct hfi1_devdata *dd) +{ + struct hfi1_ctxtdata *rcd; + unsigned long flags; + unsigned i; + + aspm_enable(dd); + + if (aspm_mode != ASPM_MODE_DYNAMIC) + return; + + for (i = 0; i < dd->first_user_ctxt; i++) { + rcd = dd->rcd[i]; + spin_lock_irqsave(&rcd->aspm_lock, flags); + rcd->aspm_intr_enable = true; + rcd->aspm_enabled = true; + spin_unlock_irqrestore(&rcd->aspm_lock, flags); + } +} + +static inline void aspm_ctx_init(struct hfi1_ctxtdata *rcd) +{ + spin_lock_init(&rcd->aspm_lock); + setup_timer(&rcd->aspm_timer, aspm_ctx_timer_function, + (unsigned long)rcd); + rcd->aspm_intr_supported = rcd->dd->aspm_supported && + aspm_mode == ASPM_MODE_DYNAMIC && + rcd->ctxt < rcd->dd->first_user_ctxt; +} + +static inline void aspm_init(struct hfi1_devdata *dd) +{ + unsigned i; + + spin_lock_init(&dd->aspm_lock); + dd->aspm_supported = aspm_hw_l1_supported(dd); + + for (i = 0; i < dd->first_user_ctxt; i++) + aspm_ctx_init(dd->rcd[i]); + + /* Start with ASPM disabled */ + aspm_hw_set_l1_ent_latency(dd); + dd->aspm_enabled = false; + aspm_hw_disable_l1(dd); + + /* Now turn on ASPM if configured */ + aspm_enable_all(dd); +} + +static inline void aspm_exit(struct hfi1_devdata *dd) +{ + aspm_disable_all(dd); + + /* Turn on ASPM on exit to conserve power */ + aspm_enable(dd); +} + +#endif /* _ASPM_H */ diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c new file mode 100644 index 000000000..dad4d0ebb --- /dev/null +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -0,0 +1,14726 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * This file contains all of the code that is specific to the HFI chip + */ + +#include <linux/pci.h> +#include <linux/delay.h> +#include <linux/interrupt.h> +#include <linux/module.h> + +#include "hfi.h" +#include "trace.h" +#include "mad.h" +#include "pio.h" +#include "sdma.h" +#include "eprom.h" +#include "efivar.h" +#include "platform.h" +#include "aspm.h" + +#define NUM_IB_PORTS 1 + +uint kdeth_qp; +module_param_named(kdeth_qp, kdeth_qp, uint, S_IRUGO); +MODULE_PARM_DESC(kdeth_qp, "Set the KDETH queue pair prefix"); + +uint num_vls = HFI1_MAX_VLS_SUPPORTED; +module_param(num_vls, uint, S_IRUGO); +MODULE_PARM_DESC(num_vls, "Set number of Virtual Lanes to use (1-8)"); + +/* + * Default time to aggregate two 10K packets from the idle state + * (timer not running). The timer starts at the end of the first packet, + * so only the time for one 10K packet and header plus a bit extra is needed. + * 10 * 1024 + 64 header byte = 10304 byte + * 10304 byte / 12.5 GB/s = 824.32ns + */ +uint rcv_intr_timeout = (824 + 16); /* 16 is for coalescing interrupt */ +module_param(rcv_intr_timeout, uint, S_IRUGO); +MODULE_PARM_DESC(rcv_intr_timeout, "Receive interrupt mitigation timeout in ns"); + +uint rcv_intr_count = 16; /* same as qib */ +module_param(rcv_intr_count, uint, S_IRUGO); +MODULE_PARM_DESC(rcv_intr_count, "Receive interrupt mitigation count"); + +ushort link_crc_mask = SUPPORTED_CRCS; +module_param(link_crc_mask, ushort, S_IRUGO); +MODULE_PARM_DESC(link_crc_mask, "CRCs to use on the link"); + +uint loopback; +module_param_named(loopback, loopback, uint, S_IRUGO); +MODULE_PARM_DESC(loopback, "Put into loopback mode (1 = serdes, 3 = external cable"); + +/* Other driver tunables */ +uint rcv_intr_dynamic = 1; /* enable dynamic mode for rcv int mitigation*/ +static ushort crc_14b_sideband = 1; +static uint use_flr = 1; +uint quick_linkup; /* skip LNI */ + +struct flag_table { + u64 flag; /* the flag */ + char *str; /* description string */ + u16 extra; /* extra information */ + u16 unused0; + u32 unused1; +}; + +/* str must be a string constant */ +#define FLAG_ENTRY(str, extra, flag) {flag, str, extra} +#define FLAG_ENTRY0(str, flag) {flag, str, 0} + +/* Send Error Consequences */ +#define SEC_WRITE_DROPPED 0x1 +#define SEC_PACKET_DROPPED 0x2 +#define SEC_SC_HALTED 0x4 /* per-context only */ +#define SEC_SPC_FREEZE 0x8 /* per-HFI only */ + +#define MIN_KERNEL_KCTXTS 2 +#define FIRST_KERNEL_KCTXT 1 +/* sizes for both the QP and RSM map tables */ +#define NUM_MAP_ENTRIES 256 +#define NUM_MAP_REGS 32 + +/* Bit offset into the GUID which carries HFI id information */ +#define GUID_HFI_INDEX_SHIFT 39 + +/* extract the emulation revision */ +#define emulator_rev(dd) ((dd)->irev >> 8) +/* parallel and serial emulation versions are 3 and 4 respectively */ +#define is_emulator_p(dd) ((((dd)->irev) & 0xf) == 3) +#define is_emulator_s(dd) ((((dd)->irev) & 0xf) == 4) + +/* RSM fields */ + +/* packet type */ +#define IB_PACKET_TYPE 2ull +#define QW_SHIFT 6ull +/* QPN[7..1] */ +#define QPN_WIDTH 7ull + +/* LRH.BTH: QW 0, OFFSET 48 - for match */ +#define LRH_BTH_QW 0ull +#define LRH_BTH_BIT_OFFSET 48ull +#define LRH_BTH_OFFSET(off) ((LRH_BTH_QW << QW_SHIFT) | (off)) +#define LRH_BTH_MATCH_OFFSET LRH_BTH_OFFSET(LRH_BTH_BIT_OFFSET) +#define LRH_BTH_SELECT +#define LRH_BTH_MASK 3ull +#define LRH_BTH_VALUE 2ull + +/* LRH.SC[3..0] QW 0, OFFSET 56 - for match */ +#define LRH_SC_QW 0ull +#define LRH_SC_BIT_OFFSET 56ull +#define LRH_SC_OFFSET(off) ((LRH_SC_QW << QW_SHIFT) | (off)) +#define LRH_SC_MATCH_OFFSET LRH_SC_OFFSET(LRH_SC_BIT_OFFSET) +#define LRH_SC_MASK 128ull +#define LRH_SC_VALUE 0ull + +/* SC[n..0] QW 0, OFFSET 60 - for select */ +#define LRH_SC_SELECT_OFFSET ((LRH_SC_QW << QW_SHIFT) | (60ull)) + +/* QPN[m+n:1] QW 1, OFFSET 1 */ +#define QPN_SELECT_OFFSET ((1ull << QW_SHIFT) | (1ull)) + +/* defines to build power on SC2VL table */ +#define SC2VL_VAL( \ + num, \ + sc0, sc0val, \ + sc1, sc1val, \ + sc2, sc2val, \ + sc3, sc3val, \ + sc4, sc4val, \ + sc5, sc5val, \ + sc6, sc6val, \ + sc7, sc7val) \ +( \ + ((u64)(sc0val) << SEND_SC2VLT##num##_SC##sc0##_SHIFT) | \ + ((u64)(sc1val) << SEND_SC2VLT##num##_SC##sc1##_SHIFT) | \ + ((u64)(sc2val) << SEND_SC2VLT##num##_SC##sc2##_SHIFT) | \ + ((u64)(sc3val) << SEND_SC2VLT##num##_SC##sc3##_SHIFT) | \ + ((u64)(sc4val) << SEND_SC2VLT##num##_SC##sc4##_SHIFT) | \ + ((u64)(sc5val) << SEND_SC2VLT##num##_SC##sc5##_SHIFT) | \ + ((u64)(sc6val) << SEND_SC2VLT##num##_SC##sc6##_SHIFT) | \ + ((u64)(sc7val) << SEND_SC2VLT##num##_SC##sc7##_SHIFT) \ +) + +#define DC_SC_VL_VAL( \ + range, \ + e0, e0val, \ + e1, e1val, \ + e2, e2val, \ + e3, e3val, \ + e4, e4val, \ + e5, e5val, \ + e6, e6val, \ + e7, e7val, \ + e8, e8val, \ + e9, e9val, \ + e10, e10val, \ + e11, e11val, \ + e12, e12val, \ + e13, e13val, \ + e14, e14val, \ + e15, e15val) \ +( \ + ((u64)(e0val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e0##_SHIFT) | \ + ((u64)(e1val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e1##_SHIFT) | \ + ((u64)(e2val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e2##_SHIFT) | \ + ((u64)(e3val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e3##_SHIFT) | \ + ((u64)(e4val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e4##_SHIFT) | \ + ((u64)(e5val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e5##_SHIFT) | \ + ((u64)(e6val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e6##_SHIFT) | \ + ((u64)(e7val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e7##_SHIFT) | \ + ((u64)(e8val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e8##_SHIFT) | \ + ((u64)(e9val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e9##_SHIFT) | \ + ((u64)(e10val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e10##_SHIFT) | \ + ((u64)(e11val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e11##_SHIFT) | \ + ((u64)(e12val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e12##_SHIFT) | \ + ((u64)(e13val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e13##_SHIFT) | \ + ((u64)(e14val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e14##_SHIFT) | \ + ((u64)(e15val) << DCC_CFG_SC_VL_TABLE_##range##_ENTRY##e15##_SHIFT) \ +) + +/* all CceStatus sub-block freeze bits */ +#define ALL_FROZE (CCE_STATUS_SDMA_FROZE_SMASK \ + | CCE_STATUS_RXE_FROZE_SMASK \ + | CCE_STATUS_TXE_FROZE_SMASK \ + | CCE_STATUS_TXE_PIO_FROZE_SMASK) +/* all CceStatus sub-block TXE pause bits */ +#define ALL_TXE_PAUSE (CCE_STATUS_TXE_PIO_PAUSED_SMASK \ + | CCE_STATUS_TXE_PAUSED_SMASK \ + | CCE_STATUS_SDMA_PAUSED_SMASK) +/* all CceStatus sub-block RXE pause bits */ +#define ALL_RXE_PAUSE CCE_STATUS_RXE_PAUSED_SMASK + +/* + * CCE Error flags. + */ +static struct flag_table cce_err_status_flags[] = { +/* 0*/ FLAG_ENTRY0("CceCsrParityErr", + CCE_ERR_STATUS_CCE_CSR_PARITY_ERR_SMASK), +/* 1*/ FLAG_ENTRY0("CceCsrReadBadAddrErr", + CCE_ERR_STATUS_CCE_CSR_READ_BAD_ADDR_ERR_SMASK), +/* 2*/ FLAG_ENTRY0("CceCsrWriteBadAddrErr", + CCE_ERR_STATUS_CCE_CSR_WRITE_BAD_ADDR_ERR_SMASK), +/* 3*/ FLAG_ENTRY0("CceTrgtAsyncFifoParityErr", + CCE_ERR_STATUS_CCE_TRGT_ASYNC_FIFO_PARITY_ERR_SMASK), +/* 4*/ FLAG_ENTRY0("CceTrgtAccessErr", + CCE_ERR_STATUS_CCE_TRGT_ACCESS_ERR_SMASK), +/* 5*/ FLAG_ENTRY0("CceRspdDataParityErr", + CCE_ERR_STATUS_CCE_RSPD_DATA_PARITY_ERR_SMASK), +/* 6*/ FLAG_ENTRY0("CceCli0AsyncFifoParityErr", + CCE_ERR_STATUS_CCE_CLI0_ASYNC_FIFO_PARITY_ERR_SMASK), +/* 7*/ FLAG_ENTRY0("CceCsrCfgBusParityErr", + CCE_ERR_STATUS_CCE_CSR_CFG_BUS_PARITY_ERR_SMASK), +/* 8*/ FLAG_ENTRY0("CceCli2AsyncFifoParityErr", + CCE_ERR_STATUS_CCE_CLI2_ASYNC_FIFO_PARITY_ERR_SMASK), +/* 9*/ FLAG_ENTRY0("CceCli1AsyncFifoPioCrdtParityErr", + CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_PIO_CRDT_PARITY_ERR_SMASK), +/*10*/ FLAG_ENTRY0("CceCli1AsyncFifoPioCrdtParityErr", + CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_SDMA_HD_PARITY_ERR_SMASK), +/*11*/ FLAG_ENTRY0("CceCli1AsyncFifoRxdmaParityError", + CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_RXDMA_PARITY_ERROR_SMASK), +/*12*/ FLAG_ENTRY0("CceCli1AsyncFifoDbgParityError", + CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_DBG_PARITY_ERROR_SMASK), +/*13*/ FLAG_ENTRY0("PcicRetryMemCorErr", + CCE_ERR_STATUS_PCIC_RETRY_MEM_COR_ERR_SMASK), +/*14*/ FLAG_ENTRY0("PcicRetryMemCorErr", + CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_COR_ERR_SMASK), +/*15*/ FLAG_ENTRY0("PcicPostHdQCorErr", + CCE_ERR_STATUS_PCIC_POST_HD_QCOR_ERR_SMASK), +/*16*/ FLAG_ENTRY0("PcicPostHdQCorErr", + CCE_ERR_STATUS_PCIC_POST_DAT_QCOR_ERR_SMASK), +/*17*/ FLAG_ENTRY0("PcicPostHdQCorErr", + CCE_ERR_STATUS_PCIC_CPL_HD_QCOR_ERR_SMASK), +/*18*/ FLAG_ENTRY0("PcicCplDatQCorErr", + CCE_ERR_STATUS_PCIC_CPL_DAT_QCOR_ERR_SMASK), +/*19*/ FLAG_ENTRY0("PcicNPostHQParityErr", + CCE_ERR_STATUS_PCIC_NPOST_HQ_PARITY_ERR_SMASK), +/*20*/ FLAG_ENTRY0("PcicNPostDatQParityErr", + CCE_ERR_STATUS_PCIC_NPOST_DAT_QPARITY_ERR_SMASK), +/*21*/ FLAG_ENTRY0("PcicRetryMemUncErr", + CCE_ERR_STATUS_PCIC_RETRY_MEM_UNC_ERR_SMASK), +/*22*/ FLAG_ENTRY0("PcicRetrySotMemUncErr", + CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_UNC_ERR_SMASK), +/*23*/ FLAG_ENTRY0("PcicPostHdQUncErr", + CCE_ERR_STATUS_PCIC_POST_HD_QUNC_ERR_SMASK), +/*24*/ FLAG_ENTRY0("PcicPostDatQUncErr", + CCE_ERR_STATUS_PCIC_POST_DAT_QUNC_ERR_SMASK), +/*25*/ FLAG_ENTRY0("PcicCplHdQUncErr", + CCE_ERR_STATUS_PCIC_CPL_HD_QUNC_ERR_SMASK), +/*26*/ FLAG_ENTRY0("PcicCplDatQUncErr", + CCE_ERR_STATUS_PCIC_CPL_DAT_QUNC_ERR_SMASK), +/*27*/ FLAG_ENTRY0("PcicTransmitFrontParityErr", + CCE_ERR_STATUS_PCIC_TRANSMIT_FRONT_PARITY_ERR_SMASK), +/*28*/ FLAG_ENTRY0("PcicTransmitBackParityErr", + CCE_ERR_STATUS_PCIC_TRANSMIT_BACK_PARITY_ERR_SMASK), +/*29*/ FLAG_ENTRY0("PcicReceiveParityErr", + CCE_ERR_STATUS_PCIC_RECEIVE_PARITY_ERR_SMASK), +/*30*/ FLAG_ENTRY0("CceTrgtCplTimeoutErr", + CCE_ERR_STATUS_CCE_TRGT_CPL_TIMEOUT_ERR_SMASK), +/*31*/ FLAG_ENTRY0("LATriggered", + CCE_ERR_STATUS_LA_TRIGGERED_SMASK), +/*32*/ FLAG_ENTRY0("CceSegReadBadAddrErr", + CCE_ERR_STATUS_CCE_SEG_READ_BAD_ADDR_ERR_SMASK), +/*33*/ FLAG_ENTRY0("CceSegWriteBadAddrErr", + CCE_ERR_STATUS_CCE_SEG_WRITE_BAD_ADDR_ERR_SMASK), +/*34*/ FLAG_ENTRY0("CceRcplAsyncFifoParityErr", + CCE_ERR_STATUS_CCE_RCPL_ASYNC_FIFO_PARITY_ERR_SMASK), +/*35*/ FLAG_ENTRY0("CceRxdmaConvFifoParityErr", + CCE_ERR_STATUS_CCE_RXDMA_CONV_FIFO_PARITY_ERR_SMASK), +/*36*/ FLAG_ENTRY0("CceMsixTableCorErr", + CCE_ERR_STATUS_CCE_MSIX_TABLE_COR_ERR_SMASK), +/*37*/ FLAG_ENTRY0("CceMsixTableUncErr", + CCE_ERR_STATUS_CCE_MSIX_TABLE_UNC_ERR_SMASK), +/*38*/ FLAG_ENTRY0("CceIntMapCorErr", + CCE_ERR_STATUS_CCE_INT_MAP_COR_ERR_SMASK), +/*39*/ FLAG_ENTRY0("CceIntMapUncErr", + CCE_ERR_STATUS_CCE_INT_MAP_UNC_ERR_SMASK), +/*40*/ FLAG_ENTRY0("CceMsixCsrParityErr", + CCE_ERR_STATUS_CCE_MSIX_CSR_PARITY_ERR_SMASK), +/*41-63 reserved*/ +}; + +/* + * Misc Error flags + */ +#define MES(text) MISC_ERR_STATUS_MISC_##text##_ERR_SMASK +static struct flag_table misc_err_status_flags[] = { +/* 0*/ FLAG_ENTRY0("CSR_PARITY", MES(CSR_PARITY)), +/* 1*/ FLAG_ENTRY0("CSR_READ_BAD_ADDR", MES(CSR_READ_BAD_ADDR)), +/* 2*/ FLAG_ENTRY0("CSR_WRITE_BAD_ADDR", MES(CSR_WRITE_BAD_ADDR)), +/* 3*/ FLAG_ENTRY0("SBUS_WRITE_FAILED", MES(SBUS_WRITE_FAILED)), +/* 4*/ FLAG_ENTRY0("KEY_MISMATCH", MES(KEY_MISMATCH)), +/* 5*/ FLAG_ENTRY0("FW_AUTH_FAILED", MES(FW_AUTH_FAILED)), +/* 6*/ FLAG_ENTRY0("EFUSE_CSR_PARITY", MES(EFUSE_CSR_PARITY)), +/* 7*/ FLAG_ENTRY0("EFUSE_READ_BAD_ADDR", MES(EFUSE_READ_BAD_ADDR)), +/* 8*/ FLAG_ENTRY0("EFUSE_WRITE", MES(EFUSE_WRITE)), +/* 9*/ FLAG_ENTRY0("EFUSE_DONE_PARITY", MES(EFUSE_DONE_PARITY)), +/*10*/ FLAG_ENTRY0("INVALID_EEP_CMD", MES(INVALID_EEP_CMD)), +/*11*/ FLAG_ENTRY0("MBIST_FAIL", MES(MBIST_FAIL)), +/*12*/ FLAG_ENTRY0("PLL_LOCK_FAIL", MES(PLL_LOCK_FAIL)) +}; + +/* + * TXE PIO Error flags and consequences + */ +static struct flag_table pio_err_status_flags[] = { +/* 0*/ FLAG_ENTRY("PioWriteBadCtxt", + SEC_WRITE_DROPPED, + SEND_PIO_ERR_STATUS_PIO_WRITE_BAD_CTXT_ERR_SMASK), +/* 1*/ FLAG_ENTRY("PioWriteAddrParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_WRITE_ADDR_PARITY_ERR_SMASK), +/* 2*/ FLAG_ENTRY("PioCsrParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_CSR_PARITY_ERR_SMASK), +/* 3*/ FLAG_ENTRY("PioSbMemFifo0", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO0_ERR_SMASK), +/* 4*/ FLAG_ENTRY("PioSbMemFifo1", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO1_ERR_SMASK), +/* 5*/ FLAG_ENTRY("PioPccFifoParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_PCC_FIFO_PARITY_ERR_SMASK), +/* 6*/ FLAG_ENTRY("PioPecFifoParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_PEC_FIFO_PARITY_ERR_SMASK), +/* 7*/ FLAG_ENTRY("PioSbrdctlCrrelParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_SBRDCTL_CRREL_PARITY_ERR_SMASK), +/* 8*/ FLAG_ENTRY("PioSbrdctrlCrrelFifoParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR_SMASK), +/* 9*/ FLAG_ENTRY("PioPktEvictFifoParityErr", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_FIFO_PARITY_ERR_SMASK), +/*10*/ FLAG_ENTRY("PioSmPktResetParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_SM_PKT_RESET_PARITY_ERR_SMASK), +/*11*/ FLAG_ENTRY("PioVlLenMemBank0Unc", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_UNC_ERR_SMASK), +/*12*/ FLAG_ENTRY("PioVlLenMemBank1Unc", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_UNC_ERR_SMASK), +/*13*/ FLAG_ENTRY("PioVlLenMemBank0Cor", + 0, + SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_COR_ERR_SMASK), +/*14*/ FLAG_ENTRY("PioVlLenMemBank1Cor", + 0, + SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_COR_ERR_SMASK), +/*15*/ FLAG_ENTRY("PioCreditRetFifoParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_CREDIT_RET_FIFO_PARITY_ERR_SMASK), +/*16*/ FLAG_ENTRY("PioPpmcPblFifo", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_PPMC_PBL_FIFO_ERR_SMASK), +/*17*/ FLAG_ENTRY("PioInitSmIn", + 0, + SEND_PIO_ERR_STATUS_PIO_INIT_SM_IN_ERR_SMASK), +/*18*/ FLAG_ENTRY("PioPktEvictSmOrArbSm", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_SM_OR_ARB_SM_ERR_SMASK), +/*19*/ FLAG_ENTRY("PioHostAddrMemUnc", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_UNC_ERR_SMASK), +/*20*/ FLAG_ENTRY("PioHostAddrMemCor", + 0, + SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_COR_ERR_SMASK), +/*21*/ FLAG_ENTRY("PioWriteDataParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_WRITE_DATA_PARITY_ERR_SMASK), +/*22*/ FLAG_ENTRY("PioStateMachine", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_STATE_MACHINE_ERR_SMASK), +/*23*/ FLAG_ENTRY("PioWriteQwValidParity", + SEC_WRITE_DROPPED | SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_WRITE_QW_VALID_PARITY_ERR_SMASK), +/*24*/ FLAG_ENTRY("PioBlockQwCountParity", + SEC_WRITE_DROPPED | SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_BLOCK_QW_COUNT_PARITY_ERR_SMASK), +/*25*/ FLAG_ENTRY("PioVlfVlLenParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_VLF_VL_LEN_PARITY_ERR_SMASK), +/*26*/ FLAG_ENTRY("PioVlfSopParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_VLF_SOP_PARITY_ERR_SMASK), +/*27*/ FLAG_ENTRY("PioVlFifoParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_VL_FIFO_PARITY_ERR_SMASK), +/*28*/ FLAG_ENTRY("PioPpmcBqcMemParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_PPMC_BQC_MEM_PARITY_ERR_SMASK), +/*29*/ FLAG_ENTRY("PioPpmcSopLen", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_PPMC_SOP_LEN_ERR_SMASK), +/*30-31 reserved*/ +/*32*/ FLAG_ENTRY("PioCurrentFreeCntParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_CURRENT_FREE_CNT_PARITY_ERR_SMASK), +/*33*/ FLAG_ENTRY("PioLastReturnedCntParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_LAST_RETURNED_CNT_PARITY_ERR_SMASK), +/*34*/ FLAG_ENTRY("PioPccSopHeadParity", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_PCC_SOP_HEAD_PARITY_ERR_SMASK), +/*35*/ FLAG_ENTRY("PioPecSopHeadParityErr", + SEC_SPC_FREEZE, + SEND_PIO_ERR_STATUS_PIO_PEC_SOP_HEAD_PARITY_ERR_SMASK), +/*36-63 reserved*/ +}; + +/* TXE PIO errors that cause an SPC freeze */ +#define ALL_PIO_FREEZE_ERR \ + (SEND_PIO_ERR_STATUS_PIO_WRITE_ADDR_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_CSR_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO0_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO1_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_PCC_FIFO_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_PEC_FIFO_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_SBRDCTL_CRREL_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_FIFO_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_SM_PKT_RESET_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_UNC_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_UNC_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_CREDIT_RET_FIFO_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_PPMC_PBL_FIFO_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_SM_OR_ARB_SM_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_UNC_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_WRITE_DATA_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_STATE_MACHINE_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_WRITE_QW_VALID_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_BLOCK_QW_COUNT_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_VLF_VL_LEN_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_VLF_SOP_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_VL_FIFO_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_PPMC_BQC_MEM_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_PPMC_SOP_LEN_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_CURRENT_FREE_CNT_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_LAST_RETURNED_CNT_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_PCC_SOP_HEAD_PARITY_ERR_SMASK \ + | SEND_PIO_ERR_STATUS_PIO_PEC_SOP_HEAD_PARITY_ERR_SMASK) + +/* + * TXE SDMA Error flags + */ +static struct flag_table sdma_err_status_flags[] = { +/* 0*/ FLAG_ENTRY0("SDmaRpyTagErr", + SEND_DMA_ERR_STATUS_SDMA_RPY_TAG_ERR_SMASK), +/* 1*/ FLAG_ENTRY0("SDmaCsrParityErr", + SEND_DMA_ERR_STATUS_SDMA_CSR_PARITY_ERR_SMASK), +/* 2*/ FLAG_ENTRY0("SDmaPcieReqTrackingUncErr", + SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_UNC_ERR_SMASK), +/* 3*/ FLAG_ENTRY0("SDmaPcieReqTrackingCorErr", + SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_COR_ERR_SMASK), +/*04-63 reserved*/ +}; + +/* TXE SDMA errors that cause an SPC freeze */ +#define ALL_SDMA_FREEZE_ERR \ + (SEND_DMA_ERR_STATUS_SDMA_RPY_TAG_ERR_SMASK \ + | SEND_DMA_ERR_STATUS_SDMA_CSR_PARITY_ERR_SMASK \ + | SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_UNC_ERR_SMASK) + +/* SendEgressErrInfo bits that correspond to a PortXmitDiscard counter */ +#define PORT_DISCARD_EGRESS_ERRS \ + (SEND_EGRESS_ERR_INFO_TOO_LONG_IB_PACKET_ERR_SMASK \ + | SEND_EGRESS_ERR_INFO_VL_MAPPING_ERR_SMASK \ + | SEND_EGRESS_ERR_INFO_VL_ERR_SMASK) + +/* + * TXE Egress Error flags + */ +#define SEES(text) SEND_EGRESS_ERR_STATUS_##text##_ERR_SMASK +static struct flag_table egress_err_status_flags[] = { +/* 0*/ FLAG_ENTRY0("TxPktIntegrityMemCorErr", SEES(TX_PKT_INTEGRITY_MEM_COR)), +/* 1*/ FLAG_ENTRY0("TxPktIntegrityMemUncErr", SEES(TX_PKT_INTEGRITY_MEM_UNC)), +/* 2 reserved */ +/* 3*/ FLAG_ENTRY0("TxEgressFifoUnderrunOrParityErr", + SEES(TX_EGRESS_FIFO_UNDERRUN_OR_PARITY)), +/* 4*/ FLAG_ENTRY0("TxLinkdownErr", SEES(TX_LINKDOWN)), +/* 5*/ FLAG_ENTRY0("TxIncorrectLinkStateErr", SEES(TX_INCORRECT_LINK_STATE)), +/* 6 reserved */ +/* 7*/ FLAG_ENTRY0("TxPioLaunchIntfParityErr", + SEES(TX_PIO_LAUNCH_INTF_PARITY)), +/* 8*/ FLAG_ENTRY0("TxSdmaLaunchIntfParityErr", + SEES(TX_SDMA_LAUNCH_INTF_PARITY)), +/* 9-10 reserved */ +/*11*/ FLAG_ENTRY0("TxSbrdCtlStateMachineParityErr", + SEES(TX_SBRD_CTL_STATE_MACHINE_PARITY)), +/*12*/ FLAG_ENTRY0("TxIllegalVLErr", SEES(TX_ILLEGAL_VL)), +/*13*/ FLAG_ENTRY0("TxLaunchCsrParityErr", SEES(TX_LAUNCH_CSR_PARITY)), +/*14*/ FLAG_ENTRY0("TxSbrdCtlCsrParityErr", SEES(TX_SBRD_CTL_CSR_PARITY)), +/*15*/ FLAG_ENTRY0("TxConfigParityErr", SEES(TX_CONFIG_PARITY)), +/*16*/ FLAG_ENTRY0("TxSdma0DisallowedPacketErr", + SEES(TX_SDMA0_DISALLOWED_PACKET)), +/*17*/ FLAG_ENTRY0("TxSdma1DisallowedPacketErr", + SEES(TX_SDMA1_DISALLOWED_PACKET)), +/*18*/ FLAG_ENTRY0("TxSdma2DisallowedPacketErr", + SEES(TX_SDMA2_DISALLOWED_PACKET)), +/*19*/ FLAG_ENTRY0("TxSdma3DisallowedPacketErr", + SEES(TX_SDMA3_DISALLOWED_PACKET)), +/*20*/ FLAG_ENTRY0("TxSdma4DisallowedPacketErr", + SEES(TX_SDMA4_DISALLOWED_PACKET)), +/*21*/ FLAG_ENTRY0("TxSdma5DisallowedPacketErr", + SEES(TX_SDMA5_DISALLOWED_PACKET)), +/*22*/ FLAG_ENTRY0("TxSdma6DisallowedPacketErr", + SEES(TX_SDMA6_DISALLOWED_PACKET)), +/*23*/ FLAG_ENTRY0("TxSdma7DisallowedPacketErr", + SEES(TX_SDMA7_DISALLOWED_PACKET)), +/*24*/ FLAG_ENTRY0("TxSdma8DisallowedPacketErr", + SEES(TX_SDMA8_DISALLOWED_PACKET)), +/*25*/ FLAG_ENTRY0("TxSdma9DisallowedPacketErr", + SEES(TX_SDMA9_DISALLOWED_PACKET)), +/*26*/ FLAG_ENTRY0("TxSdma10DisallowedPacketErr", + SEES(TX_SDMA10_DISALLOWED_PACKET)), +/*27*/ FLAG_ENTRY0("TxSdma11DisallowedPacketErr", + SEES(TX_SDMA11_DISALLOWED_PACKET)), +/*28*/ FLAG_ENTRY0("TxSdma12DisallowedPacketErr", + SEES(TX_SDMA12_DISALLOWED_PACKET)), +/*29*/ FLAG_ENTRY0("TxSdma13DisallowedPacketErr", + SEES(TX_SDMA13_DISALLOWED_PACKET)), +/*30*/ FLAG_ENTRY0("TxSdma14DisallowedPacketErr", + SEES(TX_SDMA14_DISALLOWED_PACKET)), +/*31*/ FLAG_ENTRY0("TxSdma15DisallowedPacketErr", + SEES(TX_SDMA15_DISALLOWED_PACKET)), +/*32*/ FLAG_ENTRY0("TxLaunchFifo0UncOrParityErr", + SEES(TX_LAUNCH_FIFO0_UNC_OR_PARITY)), +/*33*/ FLAG_ENTRY0("TxLaunchFifo1UncOrParityErr", + SEES(TX_LAUNCH_FIFO1_UNC_OR_PARITY)), +/*34*/ FLAG_ENTRY0("TxLaunchFifo2UncOrParityErr", + SEES(TX_LAUNCH_FIFO2_UNC_OR_PARITY)), +/*35*/ FLAG_ENTRY0("TxLaunchFifo3UncOrParityErr", + SEES(TX_LAUNCH_FIFO3_UNC_OR_PARITY)), +/*36*/ FLAG_ENTRY0("TxLaunchFifo4UncOrParityErr", + SEES(TX_LAUNCH_FIFO4_UNC_OR_PARITY)), +/*37*/ FLAG_ENTRY0("TxLaunchFifo5UncOrParityErr", + SEES(TX_LAUNCH_FIFO5_UNC_OR_PARITY)), +/*38*/ FLAG_ENTRY0("TxLaunchFifo6UncOrParityErr", + SEES(TX_LAUNCH_FIFO6_UNC_OR_PARITY)), +/*39*/ FLAG_ENTRY0("TxLaunchFifo7UncOrParityErr", + SEES(TX_LAUNCH_FIFO7_UNC_OR_PARITY)), +/*40*/ FLAG_ENTRY0("TxLaunchFifo8UncOrParityErr", + SEES(TX_LAUNCH_FIFO8_UNC_OR_PARITY)), +/*41*/ FLAG_ENTRY0("TxCreditReturnParityErr", SEES(TX_CREDIT_RETURN_PARITY)), +/*42*/ FLAG_ENTRY0("TxSbHdrUncErr", SEES(TX_SB_HDR_UNC)), +/*43*/ FLAG_ENTRY0("TxReadSdmaMemoryUncErr", SEES(TX_READ_SDMA_MEMORY_UNC)), +/*44*/ FLAG_ENTRY0("TxReadPioMemoryUncErr", SEES(TX_READ_PIO_MEMORY_UNC)), +/*45*/ FLAG_ENTRY0("TxEgressFifoUncErr", SEES(TX_EGRESS_FIFO_UNC)), +/*46*/ FLAG_ENTRY0("TxHcrcInsertionErr", SEES(TX_HCRC_INSERTION)), +/*47*/ FLAG_ENTRY0("TxCreditReturnVLErr", SEES(TX_CREDIT_RETURN_VL)), +/*48*/ FLAG_ENTRY0("TxLaunchFifo0CorErr", SEES(TX_LAUNCH_FIFO0_COR)), +/*49*/ FLAG_ENTRY0("TxLaunchFifo1CorErr", SEES(TX_LAUNCH_FIFO1_COR)), +/*50*/ FLAG_ENTRY0("TxLaunchFifo2CorErr", SEES(TX_LAUNCH_FIFO2_COR)), +/*51*/ FLAG_ENTRY0("TxLaunchFifo3CorErr", SEES(TX_LAUNCH_FIFO3_COR)), +/*52*/ FLAG_ENTRY0("TxLaunchFifo4CorErr", SEES(TX_LAUNCH_FIFO4_COR)), +/*53*/ FLAG_ENTRY0("TxLaunchFifo5CorErr", SEES(TX_LAUNCH_FIFO5_COR)), +/*54*/ FLAG_ENTRY0("TxLaunchFifo6CorErr", SEES(TX_LAUNCH_FIFO6_COR)), +/*55*/ FLAG_ENTRY0("TxLaunchFifo7CorErr", SEES(TX_LAUNCH_FIFO7_COR)), +/*56*/ FLAG_ENTRY0("TxLaunchFifo8CorErr", SEES(TX_LAUNCH_FIFO8_COR)), +/*57*/ FLAG_ENTRY0("TxCreditOverrunErr", SEES(TX_CREDIT_OVERRUN)), +/*58*/ FLAG_ENTRY0("TxSbHdrCorErr", SEES(TX_SB_HDR_COR)), +/*59*/ FLAG_ENTRY0("TxReadSdmaMemoryCorErr", SEES(TX_READ_SDMA_MEMORY_COR)), +/*60*/ FLAG_ENTRY0("TxReadPioMemoryCorErr", SEES(TX_READ_PIO_MEMORY_COR)), +/*61*/ FLAG_ENTRY0("TxEgressFifoCorErr", SEES(TX_EGRESS_FIFO_COR)), +/*62*/ FLAG_ENTRY0("TxReadSdmaMemoryCsrUncErr", + SEES(TX_READ_SDMA_MEMORY_CSR_UNC)), +/*63*/ FLAG_ENTRY0("TxReadPioMemoryCsrUncErr", + SEES(TX_READ_PIO_MEMORY_CSR_UNC)), +}; + +/* + * TXE Egress Error Info flags + */ +#define SEEI(text) SEND_EGRESS_ERR_INFO_##text##_ERR_SMASK +static struct flag_table egress_err_info_flags[] = { +/* 0*/ FLAG_ENTRY0("Reserved", 0ull), +/* 1*/ FLAG_ENTRY0("VLErr", SEEI(VL)), +/* 2*/ FLAG_ENTRY0("JobKeyErr", SEEI(JOB_KEY)), +/* 3*/ FLAG_ENTRY0("JobKeyErr", SEEI(JOB_KEY)), +/* 4*/ FLAG_ENTRY0("PartitionKeyErr", SEEI(PARTITION_KEY)), +/* 5*/ FLAG_ENTRY0("SLIDErr", SEEI(SLID)), +/* 6*/ FLAG_ENTRY0("OpcodeErr", SEEI(OPCODE)), +/* 7*/ FLAG_ENTRY0("VLMappingErr", SEEI(VL_MAPPING)), +/* 8*/ FLAG_ENTRY0("RawErr", SEEI(RAW)), +/* 9*/ FLAG_ENTRY0("RawIPv6Err", SEEI(RAW_IPV6)), +/*10*/ FLAG_ENTRY0("GRHErr", SEEI(GRH)), +/*11*/ FLAG_ENTRY0("BypassErr", SEEI(BYPASS)), +/*12*/ FLAG_ENTRY0("KDETHPacketsErr", SEEI(KDETH_PACKETS)), +/*13*/ FLAG_ENTRY0("NonKDETHPacketsErr", SEEI(NON_KDETH_PACKETS)), +/*14*/ FLAG_ENTRY0("TooSmallIBPacketsErr", SEEI(TOO_SMALL_IB_PACKETS)), +/*15*/ FLAG_ENTRY0("TooSmallBypassPacketsErr", SEEI(TOO_SMALL_BYPASS_PACKETS)), +/*16*/ FLAG_ENTRY0("PbcTestErr", SEEI(PBC_TEST)), +/*17*/ FLAG_ENTRY0("BadPktLenErr", SEEI(BAD_PKT_LEN)), +/*18*/ FLAG_ENTRY0("TooLongIBPacketErr", SEEI(TOO_LONG_IB_PACKET)), +/*19*/ FLAG_ENTRY0("TooLongBypassPacketsErr", SEEI(TOO_LONG_BYPASS_PACKETS)), +/*20*/ FLAG_ENTRY0("PbcStaticRateControlErr", SEEI(PBC_STATIC_RATE_CONTROL)), +/*21*/ FLAG_ENTRY0("BypassBadPktLenErr", SEEI(BAD_PKT_LEN)), +}; + +/* TXE Egress errors that cause an SPC freeze */ +#define ALL_TXE_EGRESS_FREEZE_ERR \ + (SEES(TX_EGRESS_FIFO_UNDERRUN_OR_PARITY) \ + | SEES(TX_PIO_LAUNCH_INTF_PARITY) \ + | SEES(TX_SDMA_LAUNCH_INTF_PARITY) \ + | SEES(TX_SBRD_CTL_STATE_MACHINE_PARITY) \ + | SEES(TX_LAUNCH_CSR_PARITY) \ + | SEES(TX_SBRD_CTL_CSR_PARITY) \ + | SEES(TX_CONFIG_PARITY) \ + | SEES(TX_LAUNCH_FIFO0_UNC_OR_PARITY) \ + | SEES(TX_LAUNCH_FIFO1_UNC_OR_PARITY) \ + | SEES(TX_LAUNCH_FIFO2_UNC_OR_PARITY) \ + | SEES(TX_LAUNCH_FIFO3_UNC_OR_PARITY) \ + | SEES(TX_LAUNCH_FIFO4_UNC_OR_PARITY) \ + | SEES(TX_LAUNCH_FIFO5_UNC_OR_PARITY) \ + | SEES(TX_LAUNCH_FIFO6_UNC_OR_PARITY) \ + | SEES(TX_LAUNCH_FIFO7_UNC_OR_PARITY) \ + | SEES(TX_LAUNCH_FIFO8_UNC_OR_PARITY) \ + | SEES(TX_CREDIT_RETURN_PARITY)) + +/* + * TXE Send error flags + */ +#define SES(name) SEND_ERR_STATUS_SEND_##name##_ERR_SMASK +static struct flag_table send_err_status_flags[] = { +/* 0*/ FLAG_ENTRY0("SendCsrParityErr", SES(CSR_PARITY)), +/* 1*/ FLAG_ENTRY0("SendCsrReadBadAddrErr", SES(CSR_READ_BAD_ADDR)), +/* 2*/ FLAG_ENTRY0("SendCsrWriteBadAddrErr", SES(CSR_WRITE_BAD_ADDR)) +}; + +/* + * TXE Send Context Error flags and consequences + */ +static struct flag_table sc_err_status_flags[] = { +/* 0*/ FLAG_ENTRY("InconsistentSop", + SEC_PACKET_DROPPED | SEC_SC_HALTED, + SEND_CTXT_ERR_STATUS_PIO_INCONSISTENT_SOP_ERR_SMASK), +/* 1*/ FLAG_ENTRY("DisallowedPacket", + SEC_PACKET_DROPPED | SEC_SC_HALTED, + SEND_CTXT_ERR_STATUS_PIO_DISALLOWED_PACKET_ERR_SMASK), +/* 2*/ FLAG_ENTRY("WriteCrossesBoundary", + SEC_WRITE_DROPPED | SEC_SC_HALTED, + SEND_CTXT_ERR_STATUS_PIO_WRITE_CROSSES_BOUNDARY_ERR_SMASK), +/* 3*/ FLAG_ENTRY("WriteOverflow", + SEC_WRITE_DROPPED | SEC_SC_HALTED, + SEND_CTXT_ERR_STATUS_PIO_WRITE_OVERFLOW_ERR_SMASK), +/* 4*/ FLAG_ENTRY("WriteOutOfBounds", + SEC_WRITE_DROPPED | SEC_SC_HALTED, + SEND_CTXT_ERR_STATUS_PIO_WRITE_OUT_OF_BOUNDS_ERR_SMASK), +/* 5-63 reserved*/ +}; + +/* + * RXE Receive Error flags + */ +#define RXES(name) RCV_ERR_STATUS_RX_##name##_ERR_SMASK +static struct flag_table rxe_err_status_flags[] = { +/* 0*/ FLAG_ENTRY0("RxDmaCsrCorErr", RXES(DMA_CSR_COR)), +/* 1*/ FLAG_ENTRY0("RxDcIntfParityErr", RXES(DC_INTF_PARITY)), +/* 2*/ FLAG_ENTRY0("RxRcvHdrUncErr", RXES(RCV_HDR_UNC)), +/* 3*/ FLAG_ENTRY0("RxRcvHdrCorErr", RXES(RCV_HDR_COR)), +/* 4*/ FLAG_ENTRY0("RxRcvDataUncErr", RXES(RCV_DATA_UNC)), +/* 5*/ FLAG_ENTRY0("RxRcvDataCorErr", RXES(RCV_DATA_COR)), +/* 6*/ FLAG_ENTRY0("RxRcvQpMapTableUncErr", RXES(RCV_QP_MAP_TABLE_UNC)), +/* 7*/ FLAG_ENTRY0("RxRcvQpMapTableCorErr", RXES(RCV_QP_MAP_TABLE_COR)), +/* 8*/ FLAG_ENTRY0("RxRcvCsrParityErr", RXES(RCV_CSR_PARITY)), +/* 9*/ FLAG_ENTRY0("RxDcSopEopParityErr", RXES(DC_SOP_EOP_PARITY)), +/*10*/ FLAG_ENTRY0("RxDmaFlagUncErr", RXES(DMA_FLAG_UNC)), +/*11*/ FLAG_ENTRY0("RxDmaFlagCorErr", RXES(DMA_FLAG_COR)), +/*12*/ FLAG_ENTRY0("RxRcvFsmEncodingErr", RXES(RCV_FSM_ENCODING)), +/*13*/ FLAG_ENTRY0("RxRbufFreeListUncErr", RXES(RBUF_FREE_LIST_UNC)), +/*14*/ FLAG_ENTRY0("RxRbufFreeListCorErr", RXES(RBUF_FREE_LIST_COR)), +/*15*/ FLAG_ENTRY0("RxRbufLookupDesRegUncErr", RXES(RBUF_LOOKUP_DES_REG_UNC)), +/*16*/ FLAG_ENTRY0("RxRbufLookupDesRegUncCorErr", + RXES(RBUF_LOOKUP_DES_REG_UNC_COR)), +/*17*/ FLAG_ENTRY0("RxRbufLookupDesUncErr", RXES(RBUF_LOOKUP_DES_UNC)), +/*18*/ FLAG_ENTRY0("RxRbufLookupDesCorErr", RXES(RBUF_LOOKUP_DES_COR)), +/*19*/ FLAG_ENTRY0("RxRbufBlockListReadUncErr", + RXES(RBUF_BLOCK_LIST_READ_UNC)), +/*20*/ FLAG_ENTRY0("RxRbufBlockListReadCorErr", + RXES(RBUF_BLOCK_LIST_READ_COR)), +/*21*/ FLAG_ENTRY0("RxRbufCsrQHeadBufNumParityErr", + RXES(RBUF_CSR_QHEAD_BUF_NUM_PARITY)), +/*22*/ FLAG_ENTRY0("RxRbufCsrQEntCntParityErr", + RXES(RBUF_CSR_QENT_CNT_PARITY)), +/*23*/ FLAG_ENTRY0("RxRbufCsrQNextBufParityErr", + RXES(RBUF_CSR_QNEXT_BUF_PARITY)), +/*24*/ FLAG_ENTRY0("RxRbufCsrQVldBitParityErr", + RXES(RBUF_CSR_QVLD_BIT_PARITY)), +/*25*/ FLAG_ENTRY0("RxRbufCsrQHdPtrParityErr", RXES(RBUF_CSR_QHD_PTR_PARITY)), +/*26*/ FLAG_ENTRY0("RxRbufCsrQTlPtrParityErr", RXES(RBUF_CSR_QTL_PTR_PARITY)), +/*27*/ FLAG_ENTRY0("RxRbufCsrQNumOfPktParityErr", + RXES(RBUF_CSR_QNUM_OF_PKT_PARITY)), +/*28*/ FLAG_ENTRY0("RxRbufCsrQEOPDWParityErr", RXES(RBUF_CSR_QEOPDW_PARITY)), +/*29*/ FLAG_ENTRY0("RxRbufCtxIdParityErr", RXES(RBUF_CTX_ID_PARITY)), +/*30*/ FLAG_ENTRY0("RxRBufBadLookupErr", RXES(RBUF_BAD_LOOKUP)), +/*31*/ FLAG_ENTRY0("RxRbufFullErr", RXES(RBUF_FULL)), +/*32*/ FLAG_ENTRY0("RxRbufEmptyErr", RXES(RBUF_EMPTY)), +/*33*/ FLAG_ENTRY0("RxRbufFlRdAddrParityErr", RXES(RBUF_FL_RD_ADDR_PARITY)), +/*34*/ FLAG_ENTRY0("RxRbufFlWrAddrParityErr", RXES(RBUF_FL_WR_ADDR_PARITY)), +/*35*/ FLAG_ENTRY0("RxRbufFlInitdoneParityErr", + RXES(RBUF_FL_INITDONE_PARITY)), +/*36*/ FLAG_ENTRY0("RxRbufFlInitWrAddrParityErr", + RXES(RBUF_FL_INIT_WR_ADDR_PARITY)), +/*37*/ FLAG_ENTRY0("RxRbufNextFreeBufUncErr", RXES(RBUF_NEXT_FREE_BUF_UNC)), +/*38*/ FLAG_ENTRY0("RxRbufNextFreeBufCorErr", RXES(RBUF_NEXT_FREE_BUF_COR)), +/*39*/ FLAG_ENTRY0("RxLookupDesPart1UncErr", RXES(LOOKUP_DES_PART1_UNC)), +/*40*/ FLAG_ENTRY0("RxLookupDesPart1UncCorErr", + RXES(LOOKUP_DES_PART1_UNC_COR)), +/*41*/ FLAG_ENTRY0("RxLookupDesPart2ParityErr", + RXES(LOOKUP_DES_PART2_PARITY)), +/*42*/ FLAG_ENTRY0("RxLookupRcvArrayUncErr", RXES(LOOKUP_RCV_ARRAY_UNC)), +/*43*/ FLAG_ENTRY0("RxLookupRcvArrayCorErr", RXES(LOOKUP_RCV_ARRAY_COR)), +/*44*/ FLAG_ENTRY0("RxLookupCsrParityErr", RXES(LOOKUP_CSR_PARITY)), +/*45*/ FLAG_ENTRY0("RxHqIntrCsrParityErr", RXES(HQ_INTR_CSR_PARITY)), +/*46*/ FLAG_ENTRY0("RxHqIntrFsmErr", RXES(HQ_INTR_FSM)), +/*47*/ FLAG_ENTRY0("RxRbufDescPart1UncErr", RXES(RBUF_DESC_PART1_UNC)), +/*48*/ FLAG_ENTRY0("RxRbufDescPart1CorErr", RXES(RBUF_DESC_PART1_COR)), +/*49*/ FLAG_ENTRY0("RxRbufDescPart2UncErr", RXES(RBUF_DESC_PART2_UNC)), +/*50*/ FLAG_ENTRY0("RxRbufDescPart2CorErr", RXES(RBUF_DESC_PART2_COR)), +/*51*/ FLAG_ENTRY0("RxDmaHdrFifoRdUncErr", RXES(DMA_HDR_FIFO_RD_UNC)), +/*52*/ FLAG_ENTRY0("RxDmaHdrFifoRdCorErr", RXES(DMA_HDR_FIFO_RD_COR)), +/*53*/ FLAG_ENTRY0("RxDmaDataFifoRdUncErr", RXES(DMA_DATA_FIFO_RD_UNC)), +/*54*/ FLAG_ENTRY0("RxDmaDataFifoRdCorErr", RXES(DMA_DATA_FIFO_RD_COR)), +/*55*/ FLAG_ENTRY0("RxRbufDataUncErr", RXES(RBUF_DATA_UNC)), +/*56*/ FLAG_ENTRY0("RxRbufDataCorErr", RXES(RBUF_DATA_COR)), +/*57*/ FLAG_ENTRY0("RxDmaCsrParityErr", RXES(DMA_CSR_PARITY)), +/*58*/ FLAG_ENTRY0("RxDmaEqFsmEncodingErr", RXES(DMA_EQ_FSM_ENCODING)), +/*59*/ FLAG_ENTRY0("RxDmaDqFsmEncodingErr", RXES(DMA_DQ_FSM_ENCODING)), +/*60*/ FLAG_ENTRY0("RxDmaCsrUncErr", RXES(DMA_CSR_UNC)), +/*61*/ FLAG_ENTRY0("RxCsrReadBadAddrErr", RXES(CSR_READ_BAD_ADDR)), +/*62*/ FLAG_ENTRY0("RxCsrWriteBadAddrErr", RXES(CSR_WRITE_BAD_ADDR)), +/*63*/ FLAG_ENTRY0("RxCsrParityErr", RXES(CSR_PARITY)) +}; + +/* RXE errors that will trigger an SPC freeze */ +#define ALL_RXE_FREEZE_ERR \ + (RCV_ERR_STATUS_RX_RCV_QP_MAP_TABLE_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RCV_CSR_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_DMA_FLAG_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RCV_FSM_ENCODING_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_FREE_LIST_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_COR_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_BLOCK_LIST_READ_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_CSR_QHEAD_BUF_NUM_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_CSR_QENT_CNT_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_CSR_QNEXT_BUF_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_CSR_QVLD_BIT_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_CSR_QHD_PTR_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_CSR_QTL_PTR_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_CSR_QNUM_OF_PKT_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_CSR_QEOPDW_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_CTX_ID_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_BAD_LOOKUP_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_FULL_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_EMPTY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_FL_RD_ADDR_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_FL_WR_ADDR_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_FL_INITDONE_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_FL_INIT_WR_ADDR_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_NEXT_FREE_BUF_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_COR_ERR_SMASK \ + | RCV_ERR_STATUS_RX_LOOKUP_DES_PART2_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_LOOKUP_RCV_ARRAY_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_LOOKUP_CSR_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_HQ_INTR_CSR_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_HQ_INTR_FSM_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_DESC_PART1_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_DESC_PART1_COR_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_DESC_PART2_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_RBUF_DATA_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_DMA_CSR_PARITY_ERR_SMASK \ + | RCV_ERR_STATUS_RX_DMA_EQ_FSM_ENCODING_ERR_SMASK \ + | RCV_ERR_STATUS_RX_DMA_DQ_FSM_ENCODING_ERR_SMASK \ + | RCV_ERR_STATUS_RX_DMA_CSR_UNC_ERR_SMASK \ + | RCV_ERR_STATUS_RX_CSR_PARITY_ERR_SMASK) + +#define RXE_FREEZE_ABORT_MASK \ + (RCV_ERR_STATUS_RX_DMA_CSR_UNC_ERR_SMASK | \ + RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_UNC_ERR_SMASK | \ + RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_UNC_ERR_SMASK) + +/* + * DCC Error Flags + */ +#define DCCE(name) DCC_ERR_FLG_##name##_SMASK +static struct flag_table dcc_err_flags[] = { + FLAG_ENTRY0("bad_l2_err", DCCE(BAD_L2_ERR)), + FLAG_ENTRY0("bad_sc_err", DCCE(BAD_SC_ERR)), + FLAG_ENTRY0("bad_mid_tail_err", DCCE(BAD_MID_TAIL_ERR)), + FLAG_ENTRY0("bad_preemption_err", DCCE(BAD_PREEMPTION_ERR)), + FLAG_ENTRY0("preemption_err", DCCE(PREEMPTION_ERR)), + FLAG_ENTRY0("preemptionvl15_err", DCCE(PREEMPTIONVL15_ERR)), + FLAG_ENTRY0("bad_vl_marker_err", DCCE(BAD_VL_MARKER_ERR)), + FLAG_ENTRY0("bad_dlid_target_err", DCCE(BAD_DLID_TARGET_ERR)), + FLAG_ENTRY0("bad_lver_err", DCCE(BAD_LVER_ERR)), + FLAG_ENTRY0("uncorrectable_err", DCCE(UNCORRECTABLE_ERR)), + FLAG_ENTRY0("bad_crdt_ack_err", DCCE(BAD_CRDT_ACK_ERR)), + FLAG_ENTRY0("unsup_pkt_type", DCCE(UNSUP_PKT_TYPE)), + FLAG_ENTRY0("bad_ctrl_flit_err", DCCE(BAD_CTRL_FLIT_ERR)), + FLAG_ENTRY0("event_cntr_parity_err", DCCE(EVENT_CNTR_PARITY_ERR)), + FLAG_ENTRY0("event_cntr_rollover_err", DCCE(EVENT_CNTR_ROLLOVER_ERR)), + FLAG_ENTRY0("link_err", DCCE(LINK_ERR)), + FLAG_ENTRY0("misc_cntr_rollover_err", DCCE(MISC_CNTR_ROLLOVER_ERR)), + FLAG_ENTRY0("bad_ctrl_dist_err", DCCE(BAD_CTRL_DIST_ERR)), + FLAG_ENTRY0("bad_tail_dist_err", DCCE(BAD_TAIL_DIST_ERR)), + FLAG_ENTRY0("bad_head_dist_err", DCCE(BAD_HEAD_DIST_ERR)), + FLAG_ENTRY0("nonvl15_state_err", DCCE(NONVL15_STATE_ERR)), + FLAG_ENTRY0("vl15_multi_err", DCCE(VL15_MULTI_ERR)), + FLAG_ENTRY0("bad_pkt_length_err", DCCE(BAD_PKT_LENGTH_ERR)), + FLAG_ENTRY0("unsup_vl_err", DCCE(UNSUP_VL_ERR)), + FLAG_ENTRY0("perm_nvl15_err", DCCE(PERM_NVL15_ERR)), + FLAG_ENTRY0("slid_zero_err", DCCE(SLID_ZERO_ERR)), + FLAG_ENTRY0("dlid_zero_err", DCCE(DLID_ZERO_ERR)), + FLAG_ENTRY0("length_mtu_err", DCCE(LENGTH_MTU_ERR)), + FLAG_ENTRY0("rx_early_drop_err", DCCE(RX_EARLY_DROP_ERR)), + FLAG_ENTRY0("late_short_err", DCCE(LATE_SHORT_ERR)), + FLAG_ENTRY0("late_long_err", DCCE(LATE_LONG_ERR)), + FLAG_ENTRY0("late_ebp_err", DCCE(LATE_EBP_ERR)), + FLAG_ENTRY0("fpe_tx_fifo_ovflw_err", DCCE(FPE_TX_FIFO_OVFLW_ERR)), + FLAG_ENTRY0("fpe_tx_fifo_unflw_err", DCCE(FPE_TX_FIFO_UNFLW_ERR)), + FLAG_ENTRY0("csr_access_blocked_host", DCCE(CSR_ACCESS_BLOCKED_HOST)), + FLAG_ENTRY0("csr_access_blocked_uc", DCCE(CSR_ACCESS_BLOCKED_UC)), + FLAG_ENTRY0("tx_ctrl_parity_err", DCCE(TX_CTRL_PARITY_ERR)), + FLAG_ENTRY0("tx_ctrl_parity_mbe_err", DCCE(TX_CTRL_PARITY_MBE_ERR)), + FLAG_ENTRY0("tx_sc_parity_err", DCCE(TX_SC_PARITY_ERR)), + FLAG_ENTRY0("rx_ctrl_parity_mbe_err", DCCE(RX_CTRL_PARITY_MBE_ERR)), + FLAG_ENTRY0("csr_parity_err", DCCE(CSR_PARITY_ERR)), + FLAG_ENTRY0("csr_inval_addr", DCCE(CSR_INVAL_ADDR)), + FLAG_ENTRY0("tx_byte_shft_parity_err", DCCE(TX_BYTE_SHFT_PARITY_ERR)), + FLAG_ENTRY0("rx_byte_shft_parity_err", DCCE(RX_BYTE_SHFT_PARITY_ERR)), + FLAG_ENTRY0("fmconfig_err", DCCE(FMCONFIG_ERR)), + FLAG_ENTRY0("rcvport_err", DCCE(RCVPORT_ERR)), +}; + +/* + * LCB error flags + */ +#define LCBE(name) DC_LCB_ERR_FLG_##name##_SMASK +static struct flag_table lcb_err_flags[] = { +/* 0*/ FLAG_ENTRY0("CSR_PARITY_ERR", LCBE(CSR_PARITY_ERR)), +/* 1*/ FLAG_ENTRY0("INVALID_CSR_ADDR", LCBE(INVALID_CSR_ADDR)), +/* 2*/ FLAG_ENTRY0("RST_FOR_FAILED_DESKEW", LCBE(RST_FOR_FAILED_DESKEW)), +/* 3*/ FLAG_ENTRY0("ALL_LNS_FAILED_REINIT_TEST", + LCBE(ALL_LNS_FAILED_REINIT_TEST)), +/* 4*/ FLAG_ENTRY0("LOST_REINIT_STALL_OR_TOS", LCBE(LOST_REINIT_STALL_OR_TOS)), +/* 5*/ FLAG_ENTRY0("TX_LESS_THAN_FOUR_LNS", LCBE(TX_LESS_THAN_FOUR_LNS)), +/* 6*/ FLAG_ENTRY0("RX_LESS_THAN_FOUR_LNS", LCBE(RX_LESS_THAN_FOUR_LNS)), +/* 7*/ FLAG_ENTRY0("SEQ_CRC_ERR", LCBE(SEQ_CRC_ERR)), +/* 8*/ FLAG_ENTRY0("REINIT_FROM_PEER", LCBE(REINIT_FROM_PEER)), +/* 9*/ FLAG_ENTRY0("REINIT_FOR_LN_DEGRADE", LCBE(REINIT_FOR_LN_DEGRADE)), +/*10*/ FLAG_ENTRY0("CRC_ERR_CNT_HIT_LIMIT", LCBE(CRC_ERR_CNT_HIT_LIMIT)), +/*11*/ FLAG_ENTRY0("RCLK_STOPPED", LCBE(RCLK_STOPPED)), +/*12*/ FLAG_ENTRY0("UNEXPECTED_REPLAY_MARKER", LCBE(UNEXPECTED_REPLAY_MARKER)), +/*13*/ FLAG_ENTRY0("UNEXPECTED_ROUND_TRIP_MARKER", + LCBE(UNEXPECTED_ROUND_TRIP_MARKER)), +/*14*/ FLAG_ENTRY0("ILLEGAL_NULL_LTP", LCBE(ILLEGAL_NULL_LTP)), +/*15*/ FLAG_ENTRY0("ILLEGAL_FLIT_ENCODING", LCBE(ILLEGAL_FLIT_ENCODING)), +/*16*/ FLAG_ENTRY0("FLIT_INPUT_BUF_OFLW", LCBE(FLIT_INPUT_BUF_OFLW)), +/*17*/ FLAG_ENTRY0("VL_ACK_INPUT_BUF_OFLW", LCBE(VL_ACK_INPUT_BUF_OFLW)), +/*18*/ FLAG_ENTRY0("VL_ACK_INPUT_PARITY_ERR", LCBE(VL_ACK_INPUT_PARITY_ERR)), +/*19*/ FLAG_ENTRY0("VL_ACK_INPUT_WRONG_CRC_MODE", + LCBE(VL_ACK_INPUT_WRONG_CRC_MODE)), +/*20*/ FLAG_ENTRY0("FLIT_INPUT_BUF_MBE", LCBE(FLIT_INPUT_BUF_MBE)), +/*21*/ FLAG_ENTRY0("FLIT_INPUT_BUF_SBE", LCBE(FLIT_INPUT_BUF_SBE)), +/*22*/ FLAG_ENTRY0("REPLAY_BUF_MBE", LCBE(REPLAY_BUF_MBE)), +/*23*/ FLAG_ENTRY0("REPLAY_BUF_SBE", LCBE(REPLAY_BUF_SBE)), +/*24*/ FLAG_ENTRY0("CREDIT_RETURN_FLIT_MBE", LCBE(CREDIT_RETURN_FLIT_MBE)), +/*25*/ FLAG_ENTRY0("RST_FOR_LINK_TIMEOUT", LCBE(RST_FOR_LINK_TIMEOUT)), +/*26*/ FLAG_ENTRY0("RST_FOR_INCOMPLT_RND_TRIP", + LCBE(RST_FOR_INCOMPLT_RND_TRIP)), +/*27*/ FLAG_ENTRY0("HOLD_REINIT", LCBE(HOLD_REINIT)), +/*28*/ FLAG_ENTRY0("NEG_EDGE_LINK_TRANSFER_ACTIVE", + LCBE(NEG_EDGE_LINK_TRANSFER_ACTIVE)), +/*29*/ FLAG_ENTRY0("REDUNDANT_FLIT_PARITY_ERR", + LCBE(REDUNDANT_FLIT_PARITY_ERR)) +}; + +/* + * DC8051 Error Flags + */ +#define D8E(name) DC_DC8051_ERR_FLG_##name##_SMASK +static struct flag_table dc8051_err_flags[] = { + FLAG_ENTRY0("SET_BY_8051", D8E(SET_BY_8051)), + FLAG_ENTRY0("LOST_8051_HEART_BEAT", D8E(LOST_8051_HEART_BEAT)), + FLAG_ENTRY0("CRAM_MBE", D8E(CRAM_MBE)), + FLAG_ENTRY0("CRAM_SBE", D8E(CRAM_SBE)), + FLAG_ENTRY0("DRAM_MBE", D8E(DRAM_MBE)), + FLAG_ENTRY0("DRAM_SBE", D8E(DRAM_SBE)), + FLAG_ENTRY0("IRAM_MBE", D8E(IRAM_MBE)), + FLAG_ENTRY0("IRAM_SBE", D8E(IRAM_SBE)), + FLAG_ENTRY0("UNMATCHED_SECURE_MSG_ACROSS_BCC_LANES", + D8E(UNMATCHED_SECURE_MSG_ACROSS_BCC_LANES)), + FLAG_ENTRY0("INVALID_CSR_ADDR", D8E(INVALID_CSR_ADDR)), +}; + +/* + * DC8051 Information Error flags + * + * Flags in DC8051_DBG_ERR_INFO_SET_BY_8051.ERROR field. + */ +static struct flag_table dc8051_info_err_flags[] = { + FLAG_ENTRY0("Spico ROM check failed", SPICO_ROM_FAILED), + FLAG_ENTRY0("Unknown frame received", UNKNOWN_FRAME), + FLAG_ENTRY0("Target BER not met", TARGET_BER_NOT_MET), + FLAG_ENTRY0("Serdes internal loopback failure", + FAILED_SERDES_INTERNAL_LOOPBACK), + FLAG_ENTRY0("Failed SerDes init", FAILED_SERDES_INIT), + FLAG_ENTRY0("Failed LNI(Polling)", FAILED_LNI_POLLING), + FLAG_ENTRY0("Failed LNI(Debounce)", FAILED_LNI_DEBOUNCE), + FLAG_ENTRY0("Failed LNI(EstbComm)", FAILED_LNI_ESTBCOMM), + FLAG_ENTRY0("Failed LNI(OptEq)", FAILED_LNI_OPTEQ), + FLAG_ENTRY0("Failed LNI(VerifyCap_1)", FAILED_LNI_VERIFY_CAP1), + FLAG_ENTRY0("Failed LNI(VerifyCap_2)", FAILED_LNI_VERIFY_CAP2), + FLAG_ENTRY0("Failed LNI(ConfigLT)", FAILED_LNI_CONFIGLT), + FLAG_ENTRY0("Host Handshake Timeout", HOST_HANDSHAKE_TIMEOUT) +}; + +/* + * DC8051 Information Host Information flags + * + * Flags in DC8051_DBG_ERR_INFO_SET_BY_8051.HOST_MSG field. + */ +static struct flag_table dc8051_info_host_msg_flags[] = { + FLAG_ENTRY0("Host request done", 0x0001), + FLAG_ENTRY0("BC SMA message", 0x0002), + FLAG_ENTRY0("BC PWR_MGM message", 0x0004), + FLAG_ENTRY0("BC Unknown message (BCC)", 0x0008), + FLAG_ENTRY0("BC Unknown message (LCB)", 0x0010), + FLAG_ENTRY0("External device config request", 0x0020), + FLAG_ENTRY0("VerifyCap all frames received", 0x0040), + FLAG_ENTRY0("LinkUp achieved", 0x0080), + FLAG_ENTRY0("Link going down", 0x0100), +}; + +static u32 encoded_size(u32 size); +static u32 chip_to_opa_lstate(struct hfi1_devdata *dd, u32 chip_lstate); +static int set_physical_link_state(struct hfi1_devdata *dd, u64 state); +static void read_vc_remote_phy(struct hfi1_devdata *dd, u8 *power_management, + u8 *continuous); +static void read_vc_remote_fabric(struct hfi1_devdata *dd, u8 *vau, u8 *z, + u8 *vcu, u16 *vl15buf, u8 *crc_sizes); +static void read_vc_remote_link_width(struct hfi1_devdata *dd, + u8 *remote_tx_rate, u16 *link_widths); +static void read_vc_local_link_width(struct hfi1_devdata *dd, u8 *misc_bits, + u8 *flag_bits, u16 *link_widths); +static void read_remote_device_id(struct hfi1_devdata *dd, u16 *device_id, + u8 *device_rev); +static void read_mgmt_allowed(struct hfi1_devdata *dd, u8 *mgmt_allowed); +static void read_local_lni(struct hfi1_devdata *dd, u8 *enable_lane_rx); +static int read_tx_settings(struct hfi1_devdata *dd, u8 *enable_lane_tx, + u8 *tx_polarity_inversion, + u8 *rx_polarity_inversion, u8 *max_rate); +static void handle_sdma_eng_err(struct hfi1_devdata *dd, + unsigned int context, u64 err_status); +static void handle_qsfp_int(struct hfi1_devdata *dd, u32 source, u64 reg); +static void handle_dcc_err(struct hfi1_devdata *dd, + unsigned int context, u64 err_status); +static void handle_lcb_err(struct hfi1_devdata *dd, + unsigned int context, u64 err_status); +static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg); +static void handle_cce_err(struct hfi1_devdata *dd, u32 unused, u64 reg); +static void handle_rxe_err(struct hfi1_devdata *dd, u32 unused, u64 reg); +static void handle_misc_err(struct hfi1_devdata *dd, u32 unused, u64 reg); +static void handle_pio_err(struct hfi1_devdata *dd, u32 unused, u64 reg); +static void handle_sdma_err(struct hfi1_devdata *dd, u32 unused, u64 reg); +static void handle_egress_err(struct hfi1_devdata *dd, u32 unused, u64 reg); +static void handle_txe_err(struct hfi1_devdata *dd, u32 unused, u64 reg); +static void set_partition_keys(struct hfi1_pportdata *); +static const char *link_state_name(u32 state); +static const char *link_state_reason_name(struct hfi1_pportdata *ppd, + u32 state); +static int do_8051_command(struct hfi1_devdata *dd, u32 type, u64 in_data, + u64 *out_data); +static int read_idle_sma(struct hfi1_devdata *dd, u64 *data); +static int thermal_init(struct hfi1_devdata *dd); + +static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state, + int msecs); +static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc); +static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr); +static void handle_temp_err(struct hfi1_devdata *); +static void dc_shutdown(struct hfi1_devdata *); +static void dc_start(struct hfi1_devdata *); +static int qos_rmt_entries(struct hfi1_devdata *dd, unsigned int *mp, + unsigned int *np); +static void clear_full_mgmt_pkey(struct hfi1_pportdata *ppd); + +/* + * Error interrupt table entry. This is used as input to the interrupt + * "clear down" routine used for all second tier error interrupt register. + * Second tier interrupt registers have a single bit representing them + * in the top-level CceIntStatus. + */ +struct err_reg_info { + u32 status; /* status CSR offset */ + u32 clear; /* clear CSR offset */ + u32 mask; /* mask CSR offset */ + void (*handler)(struct hfi1_devdata *dd, u32 source, u64 reg); + const char *desc; +}; + +#define NUM_MISC_ERRS (IS_GENERAL_ERR_END - IS_GENERAL_ERR_START) +#define NUM_DC_ERRS (IS_DC_END - IS_DC_START) +#define NUM_VARIOUS (IS_VARIOUS_END - IS_VARIOUS_START) + +/* + * Helpers for building HFI and DC error interrupt table entries. Different + * helpers are needed because of inconsistent register names. + */ +#define EE(reg, handler, desc) \ + { reg##_STATUS, reg##_CLEAR, reg##_MASK, \ + handler, desc } +#define DC_EE1(reg, handler, desc) \ + { reg##_FLG, reg##_FLG_CLR, reg##_FLG_EN, handler, desc } +#define DC_EE2(reg, handler, desc) \ + { reg##_FLG, reg##_CLR, reg##_EN, handler, desc } + +/* + * Table of the "misc" grouping of error interrupts. Each entry refers to + * another register containing more information. + */ +static const struct err_reg_info misc_errs[NUM_MISC_ERRS] = { +/* 0*/ EE(CCE_ERR, handle_cce_err, "CceErr"), +/* 1*/ EE(RCV_ERR, handle_rxe_err, "RxeErr"), +/* 2*/ EE(MISC_ERR, handle_misc_err, "MiscErr"), +/* 3*/ { 0, 0, 0, NULL }, /* reserved */ +/* 4*/ EE(SEND_PIO_ERR, handle_pio_err, "PioErr"), +/* 5*/ EE(SEND_DMA_ERR, handle_sdma_err, "SDmaErr"), +/* 6*/ EE(SEND_EGRESS_ERR, handle_egress_err, "EgressErr"), +/* 7*/ EE(SEND_ERR, handle_txe_err, "TxeErr") + /* the rest are reserved */ +}; + +/* + * Index into the Various section of the interrupt sources + * corresponding to the Critical Temperature interrupt. + */ +#define TCRIT_INT_SOURCE 4 + +/* + * SDMA error interrupt entry - refers to another register containing more + * information. + */ +static const struct err_reg_info sdma_eng_err = + EE(SEND_DMA_ENG_ERR, handle_sdma_eng_err, "SDmaEngErr"); + +static const struct err_reg_info various_err[NUM_VARIOUS] = { +/* 0*/ { 0, 0, 0, NULL }, /* PbcInt */ +/* 1*/ { 0, 0, 0, NULL }, /* GpioAssertInt */ +/* 2*/ EE(ASIC_QSFP1, handle_qsfp_int, "QSFP1"), +/* 3*/ EE(ASIC_QSFP2, handle_qsfp_int, "QSFP2"), +/* 4*/ { 0, 0, 0, NULL }, /* TCritInt */ + /* rest are reserved */ +}; + +/* + * The DC encoding of mtu_cap for 10K MTU in the DCC_CFG_PORT_CONFIG + * register can not be derived from the MTU value because 10K is not + * a power of 2. Therefore, we need a constant. Everything else can + * be calculated. + */ +#define DCC_CFG_PORT_MTU_CAP_10240 7 + +/* + * Table of the DC grouping of error interrupts. Each entry refers to + * another register containing more information. + */ +static const struct err_reg_info dc_errs[NUM_DC_ERRS] = { +/* 0*/ DC_EE1(DCC_ERR, handle_dcc_err, "DCC Err"), +/* 1*/ DC_EE2(DC_LCB_ERR, handle_lcb_err, "LCB Err"), +/* 2*/ DC_EE2(DC_DC8051_ERR, handle_8051_interrupt, "DC8051 Interrupt"), +/* 3*/ /* dc_lbm_int - special, see is_dc_int() */ + /* the rest are reserved */ +}; + +struct cntr_entry { + /* + * counter name + */ + char *name; + + /* + * csr to read for name (if applicable) + */ + u64 csr; + + /* + * offset into dd or ppd to store the counter's value + */ + int offset; + + /* + * flags + */ + u8 flags; + + /* + * accessor for stat element, context either dd or ppd + */ + u64 (*rw_cntr)(const struct cntr_entry *, void *context, int vl, + int mode, u64 data); +}; + +#define C_RCV_HDR_OVF_FIRST C_RCV_HDR_OVF_0 +#define C_RCV_HDR_OVF_LAST C_RCV_HDR_OVF_159 + +#define CNTR_ELEM(name, csr, offset, flags, accessor) \ +{ \ + name, \ + csr, \ + offset, \ + flags, \ + accessor \ +} + +/* 32bit RXE */ +#define RXE32_PORT_CNTR_ELEM(name, counter, flags) \ +CNTR_ELEM(#name, \ + (counter * 8 + RCV_COUNTER_ARRAY32), \ + 0, flags | CNTR_32BIT, \ + port_access_u32_csr) + +#define RXE32_DEV_CNTR_ELEM(name, counter, flags) \ +CNTR_ELEM(#name, \ + (counter * 8 + RCV_COUNTER_ARRAY32), \ + 0, flags | CNTR_32BIT, \ + dev_access_u32_csr) + +/* 64bit RXE */ +#define RXE64_PORT_CNTR_ELEM(name, counter, flags) \ +CNTR_ELEM(#name, \ + (counter * 8 + RCV_COUNTER_ARRAY64), \ + 0, flags, \ + port_access_u64_csr) + +#define RXE64_DEV_CNTR_ELEM(name, counter, flags) \ +CNTR_ELEM(#name, \ + (counter * 8 + RCV_COUNTER_ARRAY64), \ + 0, flags, \ + dev_access_u64_csr) + +#define OVR_LBL(ctx) C_RCV_HDR_OVF_ ## ctx +#define OVR_ELM(ctx) \ +CNTR_ELEM("RcvHdrOvr" #ctx, \ + (RCV_HDR_OVFL_CNT + ctx * 0x100), \ + 0, CNTR_NORMAL, port_access_u64_csr) + +/* 32bit TXE */ +#define TXE32_PORT_CNTR_ELEM(name, counter, flags) \ +CNTR_ELEM(#name, \ + (counter * 8 + SEND_COUNTER_ARRAY32), \ + 0, flags | CNTR_32BIT, \ + port_access_u32_csr) + +/* 64bit TXE */ +#define TXE64_PORT_CNTR_ELEM(name, counter, flags) \ +CNTR_ELEM(#name, \ + (counter * 8 + SEND_COUNTER_ARRAY64), \ + 0, flags, \ + port_access_u64_csr) + +# define TX64_DEV_CNTR_ELEM(name, counter, flags) \ +CNTR_ELEM(#name,\ + counter * 8 + SEND_COUNTER_ARRAY64, \ + 0, \ + flags, \ + dev_access_u64_csr) + +/* CCE */ +#define CCE_PERF_DEV_CNTR_ELEM(name, counter, flags) \ +CNTR_ELEM(#name, \ + (counter * 8 + CCE_COUNTER_ARRAY32), \ + 0, flags | CNTR_32BIT, \ + dev_access_u32_csr) + +#define CCE_INT_DEV_CNTR_ELEM(name, counter, flags) \ +CNTR_ELEM(#name, \ + (counter * 8 + CCE_INT_COUNTER_ARRAY32), \ + 0, flags | CNTR_32BIT, \ + dev_access_u32_csr) + +/* DC */ +#define DC_PERF_CNTR(name, counter, flags) \ +CNTR_ELEM(#name, \ + counter, \ + 0, \ + flags, \ + dev_access_u64_csr) + +#define DC_PERF_CNTR_LCB(name, counter, flags) \ +CNTR_ELEM(#name, \ + counter, \ + 0, \ + flags, \ + dc_access_lcb_cntr) + +/* ibp counters */ +#define SW_IBP_CNTR(name, cntr) \ +CNTR_ELEM(#name, \ + 0, \ + 0, \ + CNTR_SYNTH, \ + access_ibp_##cntr) + +u64 read_csr(const struct hfi1_devdata *dd, u32 offset) +{ + if (dd->flags & HFI1_PRESENT) { + return readq((void __iomem *)dd->kregbase + offset); + } + return -1; +} + +void write_csr(const struct hfi1_devdata *dd, u32 offset, u64 value) +{ + if (dd->flags & HFI1_PRESENT) + writeq(value, (void __iomem *)dd->kregbase + offset); +} + +void __iomem *get_csr_addr( + struct hfi1_devdata *dd, + u32 offset) +{ + return (void __iomem *)dd->kregbase + offset; +} + +static inline u64 read_write_csr(const struct hfi1_devdata *dd, u32 csr, + int mode, u64 value) +{ + u64 ret; + + if (mode == CNTR_MODE_R) { + ret = read_csr(dd, csr); + } else if (mode == CNTR_MODE_W) { + write_csr(dd, csr, value); + ret = value; + } else { + dd_dev_err(dd, "Invalid cntr register access mode"); + return 0; + } + + hfi1_cdbg(CNTR, "csr 0x%x val 0x%llx mode %d", csr, ret, mode); + return ret; +} + +/* Dev Access */ +static u64 dev_access_u32_csr(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = context; + u64 csr = entry->csr; + + if (entry->flags & CNTR_SDMA) { + if (vl == CNTR_INVALID_VL) + return 0; + csr += 0x100 * vl; + } else { + if (vl != CNTR_INVALID_VL) + return 0; + } + return read_write_csr(dd, csr, mode, data); +} + +static u64 access_sde_err_cnt(const struct cntr_entry *entry, + void *context, int idx, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + if (dd->per_sdma && idx < dd->num_sdma) + return dd->per_sdma[idx].err_cnt; + return 0; +} + +static u64 access_sde_int_cnt(const struct cntr_entry *entry, + void *context, int idx, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + if (dd->per_sdma && idx < dd->num_sdma) + return dd->per_sdma[idx].sdma_int_cnt; + return 0; +} + +static u64 access_sde_idle_int_cnt(const struct cntr_entry *entry, + void *context, int idx, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + if (dd->per_sdma && idx < dd->num_sdma) + return dd->per_sdma[idx].idle_int_cnt; + return 0; +} + +static u64 access_sde_progress_int_cnt(const struct cntr_entry *entry, + void *context, int idx, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + if (dd->per_sdma && idx < dd->num_sdma) + return dd->per_sdma[idx].progress_int_cnt; + return 0; +} + +static u64 dev_access_u64_csr(const struct cntr_entry *entry, void *context, + int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = context; + + u64 val = 0; + u64 csr = entry->csr; + + if (entry->flags & CNTR_VL) { + if (vl == CNTR_INVALID_VL) + return 0; + csr += 8 * vl; + } else { + if (vl != CNTR_INVALID_VL) + return 0; + } + + val = read_write_csr(dd, csr, mode, data); + return val; +} + +static u64 dc_access_lcb_cntr(const struct cntr_entry *entry, void *context, + int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = context; + u32 csr = entry->csr; + int ret = 0; + + if (vl != CNTR_INVALID_VL) + return 0; + if (mode == CNTR_MODE_R) + ret = read_lcb_csr(dd, csr, &data); + else if (mode == CNTR_MODE_W) + ret = write_lcb_csr(dd, csr, data); + + if (ret) { + dd_dev_err(dd, "Could not acquire LCB for counter 0x%x", csr); + return 0; + } + + hfi1_cdbg(CNTR, "csr 0x%x val 0x%llx mode %d", csr, data, mode); + return data; +} + +/* Port Access */ +static u64 port_access_u32_csr(const struct cntr_entry *entry, void *context, + int vl, int mode, u64 data) +{ + struct hfi1_pportdata *ppd = context; + + if (vl != CNTR_INVALID_VL) + return 0; + return read_write_csr(ppd->dd, entry->csr, mode, data); +} + +static u64 port_access_u64_csr(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_pportdata *ppd = context; + u64 val; + u64 csr = entry->csr; + + if (entry->flags & CNTR_VL) { + if (vl == CNTR_INVALID_VL) + return 0; + csr += 8 * vl; + } else { + if (vl != CNTR_INVALID_VL) + return 0; + } + val = read_write_csr(ppd->dd, csr, mode, data); + return val; +} + +/* Software defined */ +static inline u64 read_write_sw(struct hfi1_devdata *dd, u64 *cntr, int mode, + u64 data) +{ + u64 ret; + + if (mode == CNTR_MODE_R) { + ret = *cntr; + } else if (mode == CNTR_MODE_W) { + *cntr = data; + ret = data; + } else { + dd_dev_err(dd, "Invalid cntr sw access mode"); + return 0; + } + + hfi1_cdbg(CNTR, "val 0x%llx mode %d", ret, mode); + + return ret; +} + +static u64 access_sw_link_dn_cnt(const struct cntr_entry *entry, void *context, + int vl, int mode, u64 data) +{ + struct hfi1_pportdata *ppd = context; + + if (vl != CNTR_INVALID_VL) + return 0; + return read_write_sw(ppd->dd, &ppd->link_downed, mode, data); +} + +static u64 access_sw_link_up_cnt(const struct cntr_entry *entry, void *context, + int vl, int mode, u64 data) +{ + struct hfi1_pportdata *ppd = context; + + if (vl != CNTR_INVALID_VL) + return 0; + return read_write_sw(ppd->dd, &ppd->link_up, mode, data); +} + +static u64 access_sw_unknown_frame_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context; + + if (vl != CNTR_INVALID_VL) + return 0; + return read_write_sw(ppd->dd, &ppd->unknown_frame_count, mode, data); +} + +static u64 access_sw_xmit_discards(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context; + u64 zero = 0; + u64 *counter; + + if (vl == CNTR_INVALID_VL) + counter = &ppd->port_xmit_discards; + else if (vl >= 0 && vl < C_VL_COUNT) + counter = &ppd->port_xmit_discards_vl[vl]; + else + counter = &zero; + + return read_write_sw(ppd->dd, counter, mode, data); +} + +static u64 access_xmit_constraint_errs(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_pportdata *ppd = context; + + if (vl != CNTR_INVALID_VL) + return 0; + + return read_write_sw(ppd->dd, &ppd->port_xmit_constraint_errors, + mode, data); +} + +static u64 access_rcv_constraint_errs(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_pportdata *ppd = context; + + if (vl != CNTR_INVALID_VL) + return 0; + + return read_write_sw(ppd->dd, &ppd->port_rcv_constraint_errors, + mode, data); +} + +u64 get_all_cpu_total(u64 __percpu *cntr) +{ + int cpu; + u64 counter = 0; + + for_each_possible_cpu(cpu) + counter += *per_cpu_ptr(cntr, cpu); + return counter; +} + +static u64 read_write_cpu(struct hfi1_devdata *dd, u64 *z_val, + u64 __percpu *cntr, + int vl, int mode, u64 data) +{ + u64 ret = 0; + + if (vl != CNTR_INVALID_VL) + return 0; + + if (mode == CNTR_MODE_R) { + ret = get_all_cpu_total(cntr) - *z_val; + } else if (mode == CNTR_MODE_W) { + /* A write can only zero the counter */ + if (data == 0) + *z_val = get_all_cpu_total(cntr); + else + dd_dev_err(dd, "Per CPU cntrs can only be zeroed"); + } else { + dd_dev_err(dd, "Invalid cntr sw cpu access mode"); + return 0; + } + + return ret; +} + +static u64 access_sw_cpu_intr(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = context; + + return read_write_cpu(dd, &dd->z_int_counter, dd->int_counter, vl, + mode, data); +} + +static u64 access_sw_cpu_rcv_limit(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = context; + + return read_write_cpu(dd, &dd->z_rcv_limit, dd->rcv_limit, vl, + mode, data); +} + +static u64 access_sw_pio_wait(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = context; + + return dd->verbs_dev.n_piowait; +} + +static u64 access_sw_pio_drain(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->verbs_dev.n_piodrain; +} + +static u64 access_sw_vtx_wait(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = context; + + return dd->verbs_dev.n_txwait; +} + +static u64 access_sw_kmem_wait(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = context; + + return dd->verbs_dev.n_kmem_wait; +} + +static u64 access_sw_send_schedule(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return read_write_cpu(dd, &dd->z_send_schedule, dd->send_schedule, vl, + mode, data); +} + +/* Software counters for the error status bits within MISC_ERR_STATUS */ +static u64 access_misc_pll_lock_fail_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->misc_err_status_cnt[12]; +} + +static u64 access_misc_mbist_fail_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->misc_err_status_cnt[11]; +} + +static u64 access_misc_invalid_eep_cmd_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->misc_err_status_cnt[10]; +} + +static u64 access_misc_efuse_done_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->misc_err_status_cnt[9]; +} + +static u64 access_misc_efuse_write_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->misc_err_status_cnt[8]; +} + +static u64 access_misc_efuse_read_bad_addr_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->misc_err_status_cnt[7]; +} + +static u64 access_misc_efuse_csr_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->misc_err_status_cnt[6]; +} + +static u64 access_misc_fw_auth_failed_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->misc_err_status_cnt[5]; +} + +static u64 access_misc_key_mismatch_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->misc_err_status_cnt[4]; +} + +static u64 access_misc_sbus_write_failed_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->misc_err_status_cnt[3]; +} + +static u64 access_misc_csr_write_bad_addr_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->misc_err_status_cnt[2]; +} + +static u64 access_misc_csr_read_bad_addr_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->misc_err_status_cnt[1]; +} + +static u64 access_misc_csr_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->misc_err_status_cnt[0]; +} + +/* + * Software counter for the aggregate of + * individual CceErrStatus counters + */ +static u64 access_sw_cce_err_status_aggregated_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_cce_err_status_aggregate; +} + +/* + * Software counters corresponding to each of the + * error status bits within CceErrStatus + */ +static u64 access_cce_msix_csr_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[40]; +} + +static u64 access_cce_int_map_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[39]; +} + +static u64 access_cce_int_map_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[38]; +} + +static u64 access_cce_msix_table_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[37]; +} + +static u64 access_cce_msix_table_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[36]; +} + +static u64 access_cce_rxdma_conv_fifo_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[35]; +} + +static u64 access_cce_rcpl_async_fifo_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[34]; +} + +static u64 access_cce_seg_write_bad_addr_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[33]; +} + +static u64 access_cce_seg_read_bad_addr_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[32]; +} + +static u64 access_la_triggered_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[31]; +} + +static u64 access_cce_trgt_cpl_timeout_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[30]; +} + +static u64 access_pcic_receive_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[29]; +} + +static u64 access_pcic_transmit_back_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[28]; +} + +static u64 access_pcic_transmit_front_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[27]; +} + +static u64 access_pcic_cpl_dat_q_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[26]; +} + +static u64 access_pcic_cpl_hd_q_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[25]; +} + +static u64 access_pcic_post_dat_q_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[24]; +} + +static u64 access_pcic_post_hd_q_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[23]; +} + +static u64 access_pcic_retry_sot_mem_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[22]; +} + +static u64 access_pcic_retry_mem_unc_err(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[21]; +} + +static u64 access_pcic_n_post_dat_q_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[20]; +} + +static u64 access_pcic_n_post_h_q_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[19]; +} + +static u64 access_pcic_cpl_dat_q_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[18]; +} + +static u64 access_pcic_cpl_hd_q_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[17]; +} + +static u64 access_pcic_post_dat_q_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[16]; +} + +static u64 access_pcic_post_hd_q_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[15]; +} + +static u64 access_pcic_retry_sot_mem_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[14]; +} + +static u64 access_pcic_retry_mem_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[13]; +} + +static u64 access_cce_cli1_async_fifo_dbg_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[12]; +} + +static u64 access_cce_cli1_async_fifo_rxdma_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[11]; +} + +static u64 access_cce_cli1_async_fifo_sdma_hd_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[10]; +} + +static u64 access_cce_cl1_async_fifo_pio_crdt_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[9]; +} + +static u64 access_cce_cli2_async_fifo_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[8]; +} + +static u64 access_cce_csr_cfg_bus_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[7]; +} + +static u64 access_cce_cli0_async_fifo_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[6]; +} + +static u64 access_cce_rspd_data_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[5]; +} + +static u64 access_cce_trgt_access_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[4]; +} + +static u64 access_cce_trgt_async_fifo_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[3]; +} + +static u64 access_cce_csr_write_bad_addr_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[2]; +} + +static u64 access_cce_csr_read_bad_addr_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[1]; +} + +static u64 access_ccs_csr_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->cce_err_status_cnt[0]; +} + +/* + * Software counters corresponding to each of the + * error status bits within RcvErrStatus + */ +static u64 access_rx_csr_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[63]; +} + +static u64 access_rx_csr_write_bad_addr_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[62]; +} + +static u64 access_rx_csr_read_bad_addr_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[61]; +} + +static u64 access_rx_dma_csr_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[60]; +} + +static u64 access_rx_dma_dq_fsm_encoding_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[59]; +} + +static u64 access_rx_dma_eq_fsm_encoding_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[58]; +} + +static u64 access_rx_dma_csr_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[57]; +} + +static u64 access_rx_rbuf_data_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[56]; +} + +static u64 access_rx_rbuf_data_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[55]; +} + +static u64 access_rx_dma_data_fifo_rd_cor_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[54]; +} + +static u64 access_rx_dma_data_fifo_rd_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[53]; +} + +static u64 access_rx_dma_hdr_fifo_rd_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[52]; +} + +static u64 access_rx_dma_hdr_fifo_rd_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[51]; +} + +static u64 access_rx_rbuf_desc_part2_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[50]; +} + +static u64 access_rx_rbuf_desc_part2_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[49]; +} + +static u64 access_rx_rbuf_desc_part1_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[48]; +} + +static u64 access_rx_rbuf_desc_part1_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[47]; +} + +static u64 access_rx_hq_intr_fsm_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[46]; +} + +static u64 access_rx_hq_intr_csr_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[45]; +} + +static u64 access_rx_lookup_csr_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[44]; +} + +static u64 access_rx_lookup_rcv_array_cor_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[43]; +} + +static u64 access_rx_lookup_rcv_array_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[42]; +} + +static u64 access_rx_lookup_des_part2_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[41]; +} + +static u64 access_rx_lookup_des_part1_unc_cor_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[40]; +} + +static u64 access_rx_lookup_des_part1_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[39]; +} + +static u64 access_rx_rbuf_next_free_buf_cor_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[38]; +} + +static u64 access_rx_rbuf_next_free_buf_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[37]; +} + +static u64 access_rbuf_fl_init_wr_addr_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[36]; +} + +static u64 access_rx_rbuf_fl_initdone_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[35]; +} + +static u64 access_rx_rbuf_fl_write_addr_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[34]; +} + +static u64 access_rx_rbuf_fl_rd_addr_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[33]; +} + +static u64 access_rx_rbuf_empty_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[32]; +} + +static u64 access_rx_rbuf_full_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[31]; +} + +static u64 access_rbuf_bad_lookup_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[30]; +} + +static u64 access_rbuf_ctx_id_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[29]; +} + +static u64 access_rbuf_csr_qeopdw_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[28]; +} + +static u64 access_rx_rbuf_csr_q_num_of_pkt_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[27]; +} + +static u64 access_rx_rbuf_csr_q_t1_ptr_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[26]; +} + +static u64 access_rx_rbuf_csr_q_hd_ptr_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[25]; +} + +static u64 access_rx_rbuf_csr_q_vld_bit_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[24]; +} + +static u64 access_rx_rbuf_csr_q_next_buf_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[23]; +} + +static u64 access_rx_rbuf_csr_q_ent_cnt_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[22]; +} + +static u64 access_rx_rbuf_csr_q_head_buf_num_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[21]; +} + +static u64 access_rx_rbuf_block_list_read_cor_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[20]; +} + +static u64 access_rx_rbuf_block_list_read_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[19]; +} + +static u64 access_rx_rbuf_lookup_des_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[18]; +} + +static u64 access_rx_rbuf_lookup_des_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[17]; +} + +static u64 access_rx_rbuf_lookup_des_reg_unc_cor_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[16]; +} + +static u64 access_rx_rbuf_lookup_des_reg_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[15]; +} + +static u64 access_rx_rbuf_free_list_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[14]; +} + +static u64 access_rx_rbuf_free_list_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[13]; +} + +static u64 access_rx_rcv_fsm_encoding_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[12]; +} + +static u64 access_rx_dma_flag_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[11]; +} + +static u64 access_rx_dma_flag_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[10]; +} + +static u64 access_rx_dc_sop_eop_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[9]; +} + +static u64 access_rx_rcv_csr_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[8]; +} + +static u64 access_rx_rcv_qp_map_table_cor_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[7]; +} + +static u64 access_rx_rcv_qp_map_table_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[6]; +} + +static u64 access_rx_rcv_data_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[5]; +} + +static u64 access_rx_rcv_data_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[4]; +} + +static u64 access_rx_rcv_hdr_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[3]; +} + +static u64 access_rx_rcv_hdr_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[2]; +} + +static u64 access_rx_dc_intf_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[1]; +} + +static u64 access_rx_dma_csr_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->rcv_err_status_cnt[0]; +} + +/* + * Software counters corresponding to each of the + * error status bits within SendPioErrStatus + */ +static u64 access_pio_pec_sop_head_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[35]; +} + +static u64 access_pio_pcc_sop_head_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[34]; +} + +static u64 access_pio_last_returned_cnt_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[33]; +} + +static u64 access_pio_current_free_cnt_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[32]; +} + +static u64 access_pio_reserved_31_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[31]; +} + +static u64 access_pio_reserved_30_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[30]; +} + +static u64 access_pio_ppmc_sop_len_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[29]; +} + +static u64 access_pio_ppmc_bqc_mem_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[28]; +} + +static u64 access_pio_vl_fifo_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[27]; +} + +static u64 access_pio_vlf_sop_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[26]; +} + +static u64 access_pio_vlf_v1_len_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[25]; +} + +static u64 access_pio_block_qw_count_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[24]; +} + +static u64 access_pio_write_qw_valid_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[23]; +} + +static u64 access_pio_state_machine_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[22]; +} + +static u64 access_pio_write_data_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[21]; +} + +static u64 access_pio_host_addr_mem_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[20]; +} + +static u64 access_pio_host_addr_mem_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[19]; +} + +static u64 access_pio_pkt_evict_sm_or_arb_sm_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[18]; +} + +static u64 access_pio_init_sm_in_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[17]; +} + +static u64 access_pio_ppmc_pbl_fifo_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[16]; +} + +static u64 access_pio_credit_ret_fifo_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[15]; +} + +static u64 access_pio_v1_len_mem_bank1_cor_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[14]; +} + +static u64 access_pio_v1_len_mem_bank0_cor_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[13]; +} + +static u64 access_pio_v1_len_mem_bank1_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[12]; +} + +static u64 access_pio_v1_len_mem_bank0_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[11]; +} + +static u64 access_pio_sm_pkt_reset_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[10]; +} + +static u64 access_pio_pkt_evict_fifo_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[9]; +} + +static u64 access_pio_sbrdctrl_crrel_fifo_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[8]; +} + +static u64 access_pio_sbrdctl_crrel_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[7]; +} + +static u64 access_pio_pec_fifo_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[6]; +} + +static u64 access_pio_pcc_fifo_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[5]; +} + +static u64 access_pio_sb_mem_fifo1_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[4]; +} + +static u64 access_pio_sb_mem_fifo0_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[3]; +} + +static u64 access_pio_csr_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[2]; +} + +static u64 access_pio_write_addr_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[1]; +} + +static u64 access_pio_write_bad_ctxt_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_pio_err_status_cnt[0]; +} + +/* + * Software counters corresponding to each of the + * error status bits within SendDmaErrStatus + */ +static u64 access_sdma_pcie_req_tracking_cor_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_dma_err_status_cnt[3]; +} + +static u64 access_sdma_pcie_req_tracking_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_dma_err_status_cnt[2]; +} + +static u64 access_sdma_csr_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_dma_err_status_cnt[1]; +} + +static u64 access_sdma_rpy_tag_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_dma_err_status_cnt[0]; +} + +/* + * Software counters corresponding to each of the + * error status bits within SendEgressErrStatus + */ +static u64 access_tx_read_pio_memory_csr_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[63]; +} + +static u64 access_tx_read_sdma_memory_csr_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[62]; +} + +static u64 access_tx_egress_fifo_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[61]; +} + +static u64 access_tx_read_pio_memory_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[60]; +} + +static u64 access_tx_read_sdma_memory_cor_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[59]; +} + +static u64 access_tx_sb_hdr_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[58]; +} + +static u64 access_tx_credit_overrun_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[57]; +} + +static u64 access_tx_launch_fifo8_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[56]; +} + +static u64 access_tx_launch_fifo7_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[55]; +} + +static u64 access_tx_launch_fifo6_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[54]; +} + +static u64 access_tx_launch_fifo5_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[53]; +} + +static u64 access_tx_launch_fifo4_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[52]; +} + +static u64 access_tx_launch_fifo3_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[51]; +} + +static u64 access_tx_launch_fifo2_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[50]; +} + +static u64 access_tx_launch_fifo1_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[49]; +} + +static u64 access_tx_launch_fifo0_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[48]; +} + +static u64 access_tx_credit_return_vl_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[47]; +} + +static u64 access_tx_hcrc_insertion_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[46]; +} + +static u64 access_tx_egress_fifo_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[45]; +} + +static u64 access_tx_read_pio_memory_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[44]; +} + +static u64 access_tx_read_sdma_memory_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[43]; +} + +static u64 access_tx_sb_hdr_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[42]; +} + +static u64 access_tx_credit_return_partiy_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[41]; +} + +static u64 access_tx_launch_fifo8_unc_or_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[40]; +} + +static u64 access_tx_launch_fifo7_unc_or_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[39]; +} + +static u64 access_tx_launch_fifo6_unc_or_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[38]; +} + +static u64 access_tx_launch_fifo5_unc_or_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[37]; +} + +static u64 access_tx_launch_fifo4_unc_or_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[36]; +} + +static u64 access_tx_launch_fifo3_unc_or_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[35]; +} + +static u64 access_tx_launch_fifo2_unc_or_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[34]; +} + +static u64 access_tx_launch_fifo1_unc_or_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[33]; +} + +static u64 access_tx_launch_fifo0_unc_or_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[32]; +} + +static u64 access_tx_sdma15_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[31]; +} + +static u64 access_tx_sdma14_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[30]; +} + +static u64 access_tx_sdma13_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[29]; +} + +static u64 access_tx_sdma12_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[28]; +} + +static u64 access_tx_sdma11_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[27]; +} + +static u64 access_tx_sdma10_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[26]; +} + +static u64 access_tx_sdma9_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[25]; +} + +static u64 access_tx_sdma8_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[24]; +} + +static u64 access_tx_sdma7_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[23]; +} + +static u64 access_tx_sdma6_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[22]; +} + +static u64 access_tx_sdma5_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[21]; +} + +static u64 access_tx_sdma4_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[20]; +} + +static u64 access_tx_sdma3_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[19]; +} + +static u64 access_tx_sdma2_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[18]; +} + +static u64 access_tx_sdma1_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[17]; +} + +static u64 access_tx_sdma0_disallowed_packet_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[16]; +} + +static u64 access_tx_config_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[15]; +} + +static u64 access_tx_sbrd_ctl_csr_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[14]; +} + +static u64 access_tx_launch_csr_parity_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[13]; +} + +static u64 access_tx_illegal_vl_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[12]; +} + +static u64 access_tx_sbrd_ctl_state_machine_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[11]; +} + +static u64 access_egress_reserved_10_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[10]; +} + +static u64 access_egress_reserved_9_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[9]; +} + +static u64 access_tx_sdma_launch_intf_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[8]; +} + +static u64 access_tx_pio_launch_intf_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[7]; +} + +static u64 access_egress_reserved_6_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[6]; +} + +static u64 access_tx_incorrect_link_state_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[5]; +} + +static u64 access_tx_linkdown_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[4]; +} + +static u64 access_tx_egress_fifi_underrun_or_parity_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[3]; +} + +static u64 access_egress_reserved_2_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[2]; +} + +static u64 access_tx_pkt_integrity_mem_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[1]; +} + +static u64 access_tx_pkt_integrity_mem_cor_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_egress_err_status_cnt[0]; +} + +/* + * Software counters corresponding to each of the + * error status bits within SendErrStatus + */ +static u64 access_send_csr_write_bad_addr_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_err_status_cnt[2]; +} + +static u64 access_send_csr_read_bad_addr_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_err_status_cnt[1]; +} + +static u64 access_send_csr_parity_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->send_err_status_cnt[0]; +} + +/* + * Software counters corresponding to each of the + * error status bits within SendCtxtErrStatus + */ +static u64 access_pio_write_out_of_bounds_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_ctxt_err_status_cnt[4]; +} + +static u64 access_pio_write_overflow_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_ctxt_err_status_cnt[3]; +} + +static u64 access_pio_write_crosses_boundary_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_ctxt_err_status_cnt[2]; +} + +static u64 access_pio_disallowed_packet_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_ctxt_err_status_cnt[1]; +} + +static u64 access_pio_inconsistent_sop_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_ctxt_err_status_cnt[0]; +} + +/* + * Software counters corresponding to each of the + * error status bits within SendDmaEngErrStatus + */ +static u64 access_sdma_header_request_fifo_cor_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[23]; +} + +static u64 access_sdma_header_storage_cor_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[22]; +} + +static u64 access_sdma_packet_tracking_cor_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[21]; +} + +static u64 access_sdma_assembly_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[20]; +} + +static u64 access_sdma_desc_table_cor_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[19]; +} + +static u64 access_sdma_header_request_fifo_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[18]; +} + +static u64 access_sdma_header_storage_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[17]; +} + +static u64 access_sdma_packet_tracking_unc_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[16]; +} + +static u64 access_sdma_assembly_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[15]; +} + +static u64 access_sdma_desc_table_unc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[14]; +} + +static u64 access_sdma_timeout_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[13]; +} + +static u64 access_sdma_header_length_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[12]; +} + +static u64 access_sdma_header_address_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[11]; +} + +static u64 access_sdma_header_select_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[10]; +} + +static u64 access_sdma_reserved_9_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[9]; +} + +static u64 access_sdma_packet_desc_overflow_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[8]; +} + +static u64 access_sdma_length_mismatch_err_cnt(const struct cntr_entry *entry, + void *context, int vl, + int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[7]; +} + +static u64 access_sdma_halt_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[6]; +} + +static u64 access_sdma_mem_read_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[5]; +} + +static u64 access_sdma_first_desc_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[4]; +} + +static u64 access_sdma_tail_out_of_bounds_err_cnt( + const struct cntr_entry *entry, + void *context, int vl, int mode, u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[3]; +} + +static u64 access_sdma_too_long_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[2]; +} + +static u64 access_sdma_gen_mismatch_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[1]; +} + +static u64 access_sdma_wrong_dw_err_cnt(const struct cntr_entry *entry, + void *context, int vl, int mode, + u64 data) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)context; + + return dd->sw_send_dma_eng_err_status_cnt[0]; +} + +#define def_access_sw_cpu(cntr) \ +static u64 access_sw_cpu_##cntr(const struct cntr_entry *entry, \ + void *context, int vl, int mode, u64 data) \ +{ \ + struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context; \ + return read_write_cpu(ppd->dd, &ppd->ibport_data.rvp.z_ ##cntr, \ + ppd->ibport_data.rvp.cntr, vl, \ + mode, data); \ +} + +def_access_sw_cpu(rc_acks); +def_access_sw_cpu(rc_qacks); +def_access_sw_cpu(rc_delayed_comp); + +#define def_access_ibp_counter(cntr) \ +static u64 access_ibp_##cntr(const struct cntr_entry *entry, \ + void *context, int vl, int mode, u64 data) \ +{ \ + struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)context; \ + \ + if (vl != CNTR_INVALID_VL) \ + return 0; \ + \ + return read_write_sw(ppd->dd, &ppd->ibport_data.rvp.n_ ##cntr, \ + mode, data); \ +} + +def_access_ibp_counter(loop_pkts); +def_access_ibp_counter(rc_resends); +def_access_ibp_counter(rnr_naks); +def_access_ibp_counter(other_naks); +def_access_ibp_counter(rc_timeouts); +def_access_ibp_counter(pkt_drops); +def_access_ibp_counter(dmawait); +def_access_ibp_counter(rc_seqnak); +def_access_ibp_counter(rc_dupreq); +def_access_ibp_counter(rdma_seq); +def_access_ibp_counter(unaligned); +def_access_ibp_counter(seq_naks); + +static struct cntr_entry dev_cntrs[DEV_CNTR_LAST] = { +[C_RCV_OVF] = RXE32_DEV_CNTR_ELEM(RcvOverflow, RCV_BUF_OVFL_CNT, CNTR_SYNTH), +[C_RX_TID_FULL] = RXE32_DEV_CNTR_ELEM(RxTIDFullEr, RCV_TID_FULL_ERR_CNT, + CNTR_NORMAL), +[C_RX_TID_INVALID] = RXE32_DEV_CNTR_ELEM(RxTIDInvalid, RCV_TID_VALID_ERR_CNT, + CNTR_NORMAL), +[C_RX_TID_FLGMS] = RXE32_DEV_CNTR_ELEM(RxTidFLGMs, + RCV_TID_FLOW_GEN_MISMATCH_CNT, + CNTR_NORMAL), +[C_RX_CTX_EGRS] = RXE32_DEV_CNTR_ELEM(RxCtxEgrS, RCV_CONTEXT_EGR_STALL, + CNTR_NORMAL), +[C_RCV_TID_FLSMS] = RXE32_DEV_CNTR_ELEM(RxTidFLSMs, + RCV_TID_FLOW_SEQ_MISMATCH_CNT, CNTR_NORMAL), +[C_CCE_PCI_CR_ST] = CCE_PERF_DEV_CNTR_ELEM(CcePciCrSt, + CCE_PCIE_POSTED_CRDT_STALL_CNT, CNTR_NORMAL), +[C_CCE_PCI_TR_ST] = CCE_PERF_DEV_CNTR_ELEM(CcePciTrSt, CCE_PCIE_TRGT_STALL_CNT, + CNTR_NORMAL), +[C_CCE_PIO_WR_ST] = CCE_PERF_DEV_CNTR_ELEM(CcePioWrSt, CCE_PIO_WR_STALL_CNT, + CNTR_NORMAL), +[C_CCE_ERR_INT] = CCE_INT_DEV_CNTR_ELEM(CceErrInt, CCE_ERR_INT_CNT, + CNTR_NORMAL), +[C_CCE_SDMA_INT] = CCE_INT_DEV_CNTR_ELEM(CceSdmaInt, CCE_SDMA_INT_CNT, + CNTR_NORMAL), +[C_CCE_MISC_INT] = CCE_INT_DEV_CNTR_ELEM(CceMiscInt, CCE_MISC_INT_CNT, + CNTR_NORMAL), +[C_CCE_RCV_AV_INT] = CCE_INT_DEV_CNTR_ELEM(CceRcvAvInt, CCE_RCV_AVAIL_INT_CNT, + CNTR_NORMAL), +[C_CCE_RCV_URG_INT] = CCE_INT_DEV_CNTR_ELEM(CceRcvUrgInt, + CCE_RCV_URGENT_INT_CNT, CNTR_NORMAL), +[C_CCE_SEND_CR_INT] = CCE_INT_DEV_CNTR_ELEM(CceSndCrInt, + CCE_SEND_CREDIT_INT_CNT, CNTR_NORMAL), +[C_DC_UNC_ERR] = DC_PERF_CNTR(DcUnctblErr, DCC_ERR_UNCORRECTABLE_CNT, + CNTR_SYNTH), +[C_DC_RCV_ERR] = DC_PERF_CNTR(DcRecvErr, DCC_ERR_PORTRCV_ERR_CNT, CNTR_SYNTH), +[C_DC_FM_CFG_ERR] = DC_PERF_CNTR(DcFmCfgErr, DCC_ERR_FMCONFIG_ERR_CNT, + CNTR_SYNTH), +[C_DC_RMT_PHY_ERR] = DC_PERF_CNTR(DcRmtPhyErr, DCC_ERR_RCVREMOTE_PHY_ERR_CNT, + CNTR_SYNTH), +[C_DC_DROPPED_PKT] = DC_PERF_CNTR(DcDroppedPkt, DCC_ERR_DROPPED_PKT_CNT, + CNTR_SYNTH), +[C_DC_MC_XMIT_PKTS] = DC_PERF_CNTR(DcMcXmitPkts, + DCC_PRF_PORT_XMIT_MULTICAST_CNT, CNTR_SYNTH), +[C_DC_MC_RCV_PKTS] = DC_PERF_CNTR(DcMcRcvPkts, + DCC_PRF_PORT_RCV_MULTICAST_PKT_CNT, + CNTR_SYNTH), +[C_DC_XMIT_CERR] = DC_PERF_CNTR(DcXmitCorr, + DCC_PRF_PORT_XMIT_CORRECTABLE_CNT, CNTR_SYNTH), +[C_DC_RCV_CERR] = DC_PERF_CNTR(DcRcvCorrCnt, DCC_PRF_PORT_RCV_CORRECTABLE_CNT, + CNTR_SYNTH), +[C_DC_RCV_FCC] = DC_PERF_CNTR(DcRxFCntl, DCC_PRF_RX_FLOW_CRTL_CNT, + CNTR_SYNTH), +[C_DC_XMIT_FCC] = DC_PERF_CNTR(DcXmitFCntl, DCC_PRF_TX_FLOW_CRTL_CNT, + CNTR_SYNTH), +[C_DC_XMIT_FLITS] = DC_PERF_CNTR(DcXmitFlits, DCC_PRF_PORT_XMIT_DATA_CNT, + CNTR_SYNTH), +[C_DC_RCV_FLITS] = DC_PERF_CNTR(DcRcvFlits, DCC_PRF_PORT_RCV_DATA_CNT, + CNTR_SYNTH), +[C_DC_XMIT_PKTS] = DC_PERF_CNTR(DcXmitPkts, DCC_PRF_PORT_XMIT_PKTS_CNT, + CNTR_SYNTH), +[C_DC_RCV_PKTS] = DC_PERF_CNTR(DcRcvPkts, DCC_PRF_PORT_RCV_PKTS_CNT, + CNTR_SYNTH), +[C_DC_RX_FLIT_VL] = DC_PERF_CNTR(DcRxFlitVl, DCC_PRF_PORT_VL_RCV_DATA_CNT, + CNTR_SYNTH | CNTR_VL), +[C_DC_RX_PKT_VL] = DC_PERF_CNTR(DcRxPktVl, DCC_PRF_PORT_VL_RCV_PKTS_CNT, + CNTR_SYNTH | CNTR_VL), +[C_DC_RCV_FCN] = DC_PERF_CNTR(DcRcvFcn, DCC_PRF_PORT_RCV_FECN_CNT, CNTR_SYNTH), +[C_DC_RCV_FCN_VL] = DC_PERF_CNTR(DcRcvFcnVl, DCC_PRF_PORT_VL_RCV_FECN_CNT, + CNTR_SYNTH | CNTR_VL), +[C_DC_RCV_BCN] = DC_PERF_CNTR(DcRcvBcn, DCC_PRF_PORT_RCV_BECN_CNT, CNTR_SYNTH), +[C_DC_RCV_BCN_VL] = DC_PERF_CNTR(DcRcvBcnVl, DCC_PRF_PORT_VL_RCV_BECN_CNT, + CNTR_SYNTH | CNTR_VL), +[C_DC_RCV_BBL] = DC_PERF_CNTR(DcRcvBbl, DCC_PRF_PORT_RCV_BUBBLE_CNT, + CNTR_SYNTH), +[C_DC_RCV_BBL_VL] = DC_PERF_CNTR(DcRcvBblVl, DCC_PRF_PORT_VL_RCV_BUBBLE_CNT, + CNTR_SYNTH | CNTR_VL), +[C_DC_MARK_FECN] = DC_PERF_CNTR(DcMarkFcn, DCC_PRF_PORT_MARK_FECN_CNT, + CNTR_SYNTH), +[C_DC_MARK_FECN_VL] = DC_PERF_CNTR(DcMarkFcnVl, DCC_PRF_PORT_VL_MARK_FECN_CNT, + CNTR_SYNTH | CNTR_VL), +[C_DC_TOTAL_CRC] = + DC_PERF_CNTR_LCB(DcTotCrc, DC_LCB_ERR_INFO_TOTAL_CRC_ERR, + CNTR_SYNTH), +[C_DC_CRC_LN0] = DC_PERF_CNTR_LCB(DcCrcLn0, DC_LCB_ERR_INFO_CRC_ERR_LN0, + CNTR_SYNTH), +[C_DC_CRC_LN1] = DC_PERF_CNTR_LCB(DcCrcLn1, DC_LCB_ERR_INFO_CRC_ERR_LN1, + CNTR_SYNTH), +[C_DC_CRC_LN2] = DC_PERF_CNTR_LCB(DcCrcLn2, DC_LCB_ERR_INFO_CRC_ERR_LN2, + CNTR_SYNTH), +[C_DC_CRC_LN3] = DC_PERF_CNTR_LCB(DcCrcLn3, DC_LCB_ERR_INFO_CRC_ERR_LN3, + CNTR_SYNTH), +[C_DC_CRC_MULT_LN] = + DC_PERF_CNTR_LCB(DcMultLn, DC_LCB_ERR_INFO_CRC_ERR_MULTI_LN, + CNTR_SYNTH), +[C_DC_TX_REPLAY] = DC_PERF_CNTR_LCB(DcTxReplay, DC_LCB_ERR_INFO_TX_REPLAY_CNT, + CNTR_SYNTH), +[C_DC_RX_REPLAY] = DC_PERF_CNTR_LCB(DcRxReplay, DC_LCB_ERR_INFO_RX_REPLAY_CNT, + CNTR_SYNTH), +[C_DC_SEQ_CRC_CNT] = + DC_PERF_CNTR_LCB(DcLinkSeqCrc, DC_LCB_ERR_INFO_SEQ_CRC_CNT, + CNTR_SYNTH), +[C_DC_ESC0_ONLY_CNT] = + DC_PERF_CNTR_LCB(DcEsc0, DC_LCB_ERR_INFO_ESCAPE_0_ONLY_CNT, + CNTR_SYNTH), +[C_DC_ESC0_PLUS1_CNT] = + DC_PERF_CNTR_LCB(DcEsc1, DC_LCB_ERR_INFO_ESCAPE_0_PLUS1_CNT, + CNTR_SYNTH), +[C_DC_ESC0_PLUS2_CNT] = + DC_PERF_CNTR_LCB(DcEsc0Plus2, DC_LCB_ERR_INFO_ESCAPE_0_PLUS2_CNT, + CNTR_SYNTH), +[C_DC_REINIT_FROM_PEER_CNT] = + DC_PERF_CNTR_LCB(DcReinitPeer, DC_LCB_ERR_INFO_REINIT_FROM_PEER_CNT, + CNTR_SYNTH), +[C_DC_SBE_CNT] = DC_PERF_CNTR_LCB(DcSbe, DC_LCB_ERR_INFO_SBE_CNT, + CNTR_SYNTH), +[C_DC_MISC_FLG_CNT] = + DC_PERF_CNTR_LCB(DcMiscFlg, DC_LCB_ERR_INFO_MISC_FLG_CNT, + CNTR_SYNTH), +[C_DC_PRF_GOOD_LTP_CNT] = + DC_PERF_CNTR_LCB(DcGoodLTP, DC_LCB_PRF_GOOD_LTP_CNT, CNTR_SYNTH), +[C_DC_PRF_ACCEPTED_LTP_CNT] = + DC_PERF_CNTR_LCB(DcAccLTP, DC_LCB_PRF_ACCEPTED_LTP_CNT, + CNTR_SYNTH), +[C_DC_PRF_RX_FLIT_CNT] = + DC_PERF_CNTR_LCB(DcPrfRxFlit, DC_LCB_PRF_RX_FLIT_CNT, CNTR_SYNTH), +[C_DC_PRF_TX_FLIT_CNT] = + DC_PERF_CNTR_LCB(DcPrfTxFlit, DC_LCB_PRF_TX_FLIT_CNT, CNTR_SYNTH), +[C_DC_PRF_CLK_CNTR] = + DC_PERF_CNTR_LCB(DcPrfClk, DC_LCB_PRF_CLK_CNTR, CNTR_SYNTH), +[C_DC_PG_DBG_FLIT_CRDTS_CNT] = + DC_PERF_CNTR_LCB(DcFltCrdts, DC_LCB_PG_DBG_FLIT_CRDTS_CNT, CNTR_SYNTH), +[C_DC_PG_STS_PAUSE_COMPLETE_CNT] = + DC_PERF_CNTR_LCB(DcPauseComp, DC_LCB_PG_STS_PAUSE_COMPLETE_CNT, + CNTR_SYNTH), +[C_DC_PG_STS_TX_SBE_CNT] = + DC_PERF_CNTR_LCB(DcStsTxSbe, DC_LCB_PG_STS_TX_SBE_CNT, CNTR_SYNTH), +[C_DC_PG_STS_TX_MBE_CNT] = + DC_PERF_CNTR_LCB(DcStsTxMbe, DC_LCB_PG_STS_TX_MBE_CNT, + CNTR_SYNTH), +[C_SW_CPU_INTR] = CNTR_ELEM("Intr", 0, 0, CNTR_NORMAL, + access_sw_cpu_intr), +[C_SW_CPU_RCV_LIM] = CNTR_ELEM("RcvLimit", 0, 0, CNTR_NORMAL, + access_sw_cpu_rcv_limit), +[C_SW_VTX_WAIT] = CNTR_ELEM("vTxWait", 0, 0, CNTR_NORMAL, + access_sw_vtx_wait), +[C_SW_PIO_WAIT] = CNTR_ELEM("PioWait", 0, 0, CNTR_NORMAL, + access_sw_pio_wait), +[C_SW_PIO_DRAIN] = CNTR_ELEM("PioDrain", 0, 0, CNTR_NORMAL, + access_sw_pio_drain), +[C_SW_KMEM_WAIT] = CNTR_ELEM("KmemWait", 0, 0, CNTR_NORMAL, + access_sw_kmem_wait), +[C_SW_SEND_SCHED] = CNTR_ELEM("SendSched", 0, 0, CNTR_NORMAL, + access_sw_send_schedule), +[C_SDMA_DESC_FETCHED_CNT] = CNTR_ELEM("SDEDscFdCn", + SEND_DMA_DESC_FETCHED_CNT, 0, + CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA, + dev_access_u32_csr), +[C_SDMA_INT_CNT] = CNTR_ELEM("SDMAInt", 0, 0, + CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA, + access_sde_int_cnt), +[C_SDMA_ERR_CNT] = CNTR_ELEM("SDMAErrCt", 0, 0, + CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA, + access_sde_err_cnt), +[C_SDMA_IDLE_INT_CNT] = CNTR_ELEM("SDMAIdInt", 0, 0, + CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA, + access_sde_idle_int_cnt), +[C_SDMA_PROGRESS_INT_CNT] = CNTR_ELEM("SDMAPrIntCn", 0, 0, + CNTR_NORMAL | CNTR_32BIT | CNTR_SDMA, + access_sde_progress_int_cnt), +/* MISC_ERR_STATUS */ +[C_MISC_PLL_LOCK_FAIL_ERR] = CNTR_ELEM("MISC_PLL_LOCK_FAIL_ERR", 0, 0, + CNTR_NORMAL, + access_misc_pll_lock_fail_err_cnt), +[C_MISC_MBIST_FAIL_ERR] = CNTR_ELEM("MISC_MBIST_FAIL_ERR", 0, 0, + CNTR_NORMAL, + access_misc_mbist_fail_err_cnt), +[C_MISC_INVALID_EEP_CMD_ERR] = CNTR_ELEM("MISC_INVALID_EEP_CMD_ERR", 0, 0, + CNTR_NORMAL, + access_misc_invalid_eep_cmd_err_cnt), +[C_MISC_EFUSE_DONE_PARITY_ERR] = CNTR_ELEM("MISC_EFUSE_DONE_PARITY_ERR", 0, 0, + CNTR_NORMAL, + access_misc_efuse_done_parity_err_cnt), +[C_MISC_EFUSE_WRITE_ERR] = CNTR_ELEM("MISC_EFUSE_WRITE_ERR", 0, 0, + CNTR_NORMAL, + access_misc_efuse_write_err_cnt), +[C_MISC_EFUSE_READ_BAD_ADDR_ERR] = CNTR_ELEM("MISC_EFUSE_READ_BAD_ADDR_ERR", 0, + 0, CNTR_NORMAL, + access_misc_efuse_read_bad_addr_err_cnt), +[C_MISC_EFUSE_CSR_PARITY_ERR] = CNTR_ELEM("MISC_EFUSE_CSR_PARITY_ERR", 0, 0, + CNTR_NORMAL, + access_misc_efuse_csr_parity_err_cnt), +[C_MISC_FW_AUTH_FAILED_ERR] = CNTR_ELEM("MISC_FW_AUTH_FAILED_ERR", 0, 0, + CNTR_NORMAL, + access_misc_fw_auth_failed_err_cnt), +[C_MISC_KEY_MISMATCH_ERR] = CNTR_ELEM("MISC_KEY_MISMATCH_ERR", 0, 0, + CNTR_NORMAL, + access_misc_key_mismatch_err_cnt), +[C_MISC_SBUS_WRITE_FAILED_ERR] = CNTR_ELEM("MISC_SBUS_WRITE_FAILED_ERR", 0, 0, + CNTR_NORMAL, + access_misc_sbus_write_failed_err_cnt), +[C_MISC_CSR_WRITE_BAD_ADDR_ERR] = CNTR_ELEM("MISC_CSR_WRITE_BAD_ADDR_ERR", 0, 0, + CNTR_NORMAL, + access_misc_csr_write_bad_addr_err_cnt), +[C_MISC_CSR_READ_BAD_ADDR_ERR] = CNTR_ELEM("MISC_CSR_READ_BAD_ADDR_ERR", 0, 0, + CNTR_NORMAL, + access_misc_csr_read_bad_addr_err_cnt), +[C_MISC_CSR_PARITY_ERR] = CNTR_ELEM("MISC_CSR_PARITY_ERR", 0, 0, + CNTR_NORMAL, + access_misc_csr_parity_err_cnt), +/* CceErrStatus */ +[C_CCE_ERR_STATUS_AGGREGATED_CNT] = CNTR_ELEM("CceErrStatusAggregatedCnt", 0, 0, + CNTR_NORMAL, + access_sw_cce_err_status_aggregated_cnt), +[C_CCE_MSIX_CSR_PARITY_ERR] = CNTR_ELEM("CceMsixCsrParityErr", 0, 0, + CNTR_NORMAL, + access_cce_msix_csr_parity_err_cnt), +[C_CCE_INT_MAP_UNC_ERR] = CNTR_ELEM("CceIntMapUncErr", 0, 0, + CNTR_NORMAL, + access_cce_int_map_unc_err_cnt), +[C_CCE_INT_MAP_COR_ERR] = CNTR_ELEM("CceIntMapCorErr", 0, 0, + CNTR_NORMAL, + access_cce_int_map_cor_err_cnt), +[C_CCE_MSIX_TABLE_UNC_ERR] = CNTR_ELEM("CceMsixTableUncErr", 0, 0, + CNTR_NORMAL, + access_cce_msix_table_unc_err_cnt), +[C_CCE_MSIX_TABLE_COR_ERR] = CNTR_ELEM("CceMsixTableCorErr", 0, 0, + CNTR_NORMAL, + access_cce_msix_table_cor_err_cnt), +[C_CCE_RXDMA_CONV_FIFO_PARITY_ERR] = CNTR_ELEM("CceRxdmaConvFifoParityErr", 0, + 0, CNTR_NORMAL, + access_cce_rxdma_conv_fifo_parity_err_cnt), +[C_CCE_RCPL_ASYNC_FIFO_PARITY_ERR] = CNTR_ELEM("CceRcplAsyncFifoParityErr", 0, + 0, CNTR_NORMAL, + access_cce_rcpl_async_fifo_parity_err_cnt), +[C_CCE_SEG_WRITE_BAD_ADDR_ERR] = CNTR_ELEM("CceSegWriteBadAddrErr", 0, 0, + CNTR_NORMAL, + access_cce_seg_write_bad_addr_err_cnt), +[C_CCE_SEG_READ_BAD_ADDR_ERR] = CNTR_ELEM("CceSegReadBadAddrErr", 0, 0, + CNTR_NORMAL, + access_cce_seg_read_bad_addr_err_cnt), +[C_LA_TRIGGERED] = CNTR_ELEM("Cce LATriggered", 0, 0, + CNTR_NORMAL, + access_la_triggered_cnt), +[C_CCE_TRGT_CPL_TIMEOUT_ERR] = CNTR_ELEM("CceTrgtCplTimeoutErr", 0, 0, + CNTR_NORMAL, + access_cce_trgt_cpl_timeout_err_cnt), +[C_PCIC_RECEIVE_PARITY_ERR] = CNTR_ELEM("PcicReceiveParityErr", 0, 0, + CNTR_NORMAL, + access_pcic_receive_parity_err_cnt), +[C_PCIC_TRANSMIT_BACK_PARITY_ERR] = CNTR_ELEM("PcicTransmitBackParityErr", 0, 0, + CNTR_NORMAL, + access_pcic_transmit_back_parity_err_cnt), +[C_PCIC_TRANSMIT_FRONT_PARITY_ERR] = CNTR_ELEM("PcicTransmitFrontParityErr", 0, + 0, CNTR_NORMAL, + access_pcic_transmit_front_parity_err_cnt), +[C_PCIC_CPL_DAT_Q_UNC_ERR] = CNTR_ELEM("PcicCplDatQUncErr", 0, 0, + CNTR_NORMAL, + access_pcic_cpl_dat_q_unc_err_cnt), +[C_PCIC_CPL_HD_Q_UNC_ERR] = CNTR_ELEM("PcicCplHdQUncErr", 0, 0, + CNTR_NORMAL, + access_pcic_cpl_hd_q_unc_err_cnt), +[C_PCIC_POST_DAT_Q_UNC_ERR] = CNTR_ELEM("PcicPostDatQUncErr", 0, 0, + CNTR_NORMAL, + access_pcic_post_dat_q_unc_err_cnt), +[C_PCIC_POST_HD_Q_UNC_ERR] = CNTR_ELEM("PcicPostHdQUncErr", 0, 0, + CNTR_NORMAL, + access_pcic_post_hd_q_unc_err_cnt), +[C_PCIC_RETRY_SOT_MEM_UNC_ERR] = CNTR_ELEM("PcicRetrySotMemUncErr", 0, 0, + CNTR_NORMAL, + access_pcic_retry_sot_mem_unc_err_cnt), +[C_PCIC_RETRY_MEM_UNC_ERR] = CNTR_ELEM("PcicRetryMemUncErr", 0, 0, + CNTR_NORMAL, + access_pcic_retry_mem_unc_err), +[C_PCIC_N_POST_DAT_Q_PARITY_ERR] = CNTR_ELEM("PcicNPostDatQParityErr", 0, 0, + CNTR_NORMAL, + access_pcic_n_post_dat_q_parity_err_cnt), +[C_PCIC_N_POST_H_Q_PARITY_ERR] = CNTR_ELEM("PcicNPostHQParityErr", 0, 0, + CNTR_NORMAL, + access_pcic_n_post_h_q_parity_err_cnt), +[C_PCIC_CPL_DAT_Q_COR_ERR] = CNTR_ELEM("PcicCplDatQCorErr", 0, 0, + CNTR_NORMAL, + access_pcic_cpl_dat_q_cor_err_cnt), +[C_PCIC_CPL_HD_Q_COR_ERR] = CNTR_ELEM("PcicCplHdQCorErr", 0, 0, + CNTR_NORMAL, + access_pcic_cpl_hd_q_cor_err_cnt), +[C_PCIC_POST_DAT_Q_COR_ERR] = CNTR_ELEM("PcicPostDatQCorErr", 0, 0, + CNTR_NORMAL, + access_pcic_post_dat_q_cor_err_cnt), +[C_PCIC_POST_HD_Q_COR_ERR] = CNTR_ELEM("PcicPostHdQCorErr", 0, 0, + CNTR_NORMAL, + access_pcic_post_hd_q_cor_err_cnt), +[C_PCIC_RETRY_SOT_MEM_COR_ERR] = CNTR_ELEM("PcicRetrySotMemCorErr", 0, 0, + CNTR_NORMAL, + access_pcic_retry_sot_mem_cor_err_cnt), +[C_PCIC_RETRY_MEM_COR_ERR] = CNTR_ELEM("PcicRetryMemCorErr", 0, 0, + CNTR_NORMAL, + access_pcic_retry_mem_cor_err_cnt), +[C_CCE_CLI1_ASYNC_FIFO_DBG_PARITY_ERR] = CNTR_ELEM( + "CceCli1AsyncFifoDbgParityError", 0, 0, + CNTR_NORMAL, + access_cce_cli1_async_fifo_dbg_parity_err_cnt), +[C_CCE_CLI1_ASYNC_FIFO_RXDMA_PARITY_ERR] = CNTR_ELEM( + "CceCli1AsyncFifoRxdmaParityError", 0, 0, + CNTR_NORMAL, + access_cce_cli1_async_fifo_rxdma_parity_err_cnt + ), +[C_CCE_CLI1_ASYNC_FIFO_SDMA_HD_PARITY_ERR] = CNTR_ELEM( + "CceCli1AsyncFifoSdmaHdParityErr", 0, 0, + CNTR_NORMAL, + access_cce_cli1_async_fifo_sdma_hd_parity_err_cnt), +[C_CCE_CLI1_ASYNC_FIFO_PIO_CRDT_PARITY_ERR] = CNTR_ELEM( + "CceCli1AsyncFifoPioCrdtParityErr", 0, 0, + CNTR_NORMAL, + access_cce_cl1_async_fifo_pio_crdt_parity_err_cnt), +[C_CCE_CLI2_ASYNC_FIFO_PARITY_ERR] = CNTR_ELEM("CceCli2AsyncFifoParityErr", 0, + 0, CNTR_NORMAL, + access_cce_cli2_async_fifo_parity_err_cnt), +[C_CCE_CSR_CFG_BUS_PARITY_ERR] = CNTR_ELEM("CceCsrCfgBusParityErr", 0, 0, + CNTR_NORMAL, + access_cce_csr_cfg_bus_parity_err_cnt), +[C_CCE_CLI0_ASYNC_FIFO_PARTIY_ERR] = CNTR_ELEM("CceCli0AsyncFifoParityErr", 0, + 0, CNTR_NORMAL, + access_cce_cli0_async_fifo_parity_err_cnt), +[C_CCE_RSPD_DATA_PARITY_ERR] = CNTR_ELEM("CceRspdDataParityErr", 0, 0, + CNTR_NORMAL, + access_cce_rspd_data_parity_err_cnt), +[C_CCE_TRGT_ACCESS_ERR] = CNTR_ELEM("CceTrgtAccessErr", 0, 0, + CNTR_NORMAL, + access_cce_trgt_access_err_cnt), +[C_CCE_TRGT_ASYNC_FIFO_PARITY_ERR] = CNTR_ELEM("CceTrgtAsyncFifoParityErr", 0, + 0, CNTR_NORMAL, + access_cce_trgt_async_fifo_parity_err_cnt), +[C_CCE_CSR_WRITE_BAD_ADDR_ERR] = CNTR_ELEM("CceCsrWriteBadAddrErr", 0, 0, + CNTR_NORMAL, + access_cce_csr_write_bad_addr_err_cnt), +[C_CCE_CSR_READ_BAD_ADDR_ERR] = CNTR_ELEM("CceCsrReadBadAddrErr", 0, 0, + CNTR_NORMAL, + access_cce_csr_read_bad_addr_err_cnt), +[C_CCE_CSR_PARITY_ERR] = CNTR_ELEM("CceCsrParityErr", 0, 0, + CNTR_NORMAL, + access_ccs_csr_parity_err_cnt), + +/* RcvErrStatus */ +[C_RX_CSR_PARITY_ERR] = CNTR_ELEM("RxCsrParityErr", 0, 0, + CNTR_NORMAL, + access_rx_csr_parity_err_cnt), +[C_RX_CSR_WRITE_BAD_ADDR_ERR] = CNTR_ELEM("RxCsrWriteBadAddrErr", 0, 0, + CNTR_NORMAL, + access_rx_csr_write_bad_addr_err_cnt), +[C_RX_CSR_READ_BAD_ADDR_ERR] = CNTR_ELEM("RxCsrReadBadAddrErr", 0, 0, + CNTR_NORMAL, + access_rx_csr_read_bad_addr_err_cnt), +[C_RX_DMA_CSR_UNC_ERR] = CNTR_ELEM("RxDmaCsrUncErr", 0, 0, + CNTR_NORMAL, + access_rx_dma_csr_unc_err_cnt), +[C_RX_DMA_DQ_FSM_ENCODING_ERR] = CNTR_ELEM("RxDmaDqFsmEncodingErr", 0, 0, + CNTR_NORMAL, + access_rx_dma_dq_fsm_encoding_err_cnt), +[C_RX_DMA_EQ_FSM_ENCODING_ERR] = CNTR_ELEM("RxDmaEqFsmEncodingErr", 0, 0, + CNTR_NORMAL, + access_rx_dma_eq_fsm_encoding_err_cnt), +[C_RX_DMA_CSR_PARITY_ERR] = CNTR_ELEM("RxDmaCsrParityErr", 0, 0, + CNTR_NORMAL, + access_rx_dma_csr_parity_err_cnt), +[C_RX_RBUF_DATA_COR_ERR] = CNTR_ELEM("RxRbufDataCorErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_data_cor_err_cnt), +[C_RX_RBUF_DATA_UNC_ERR] = CNTR_ELEM("RxRbufDataUncErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_data_unc_err_cnt), +[C_RX_DMA_DATA_FIFO_RD_COR_ERR] = CNTR_ELEM("RxDmaDataFifoRdCorErr", 0, 0, + CNTR_NORMAL, + access_rx_dma_data_fifo_rd_cor_err_cnt), +[C_RX_DMA_DATA_FIFO_RD_UNC_ERR] = CNTR_ELEM("RxDmaDataFifoRdUncErr", 0, 0, + CNTR_NORMAL, + access_rx_dma_data_fifo_rd_unc_err_cnt), +[C_RX_DMA_HDR_FIFO_RD_COR_ERR] = CNTR_ELEM("RxDmaHdrFifoRdCorErr", 0, 0, + CNTR_NORMAL, + access_rx_dma_hdr_fifo_rd_cor_err_cnt), +[C_RX_DMA_HDR_FIFO_RD_UNC_ERR] = CNTR_ELEM("RxDmaHdrFifoRdUncErr", 0, 0, + CNTR_NORMAL, + access_rx_dma_hdr_fifo_rd_unc_err_cnt), +[C_RX_RBUF_DESC_PART2_COR_ERR] = CNTR_ELEM("RxRbufDescPart2CorErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_desc_part2_cor_err_cnt), +[C_RX_RBUF_DESC_PART2_UNC_ERR] = CNTR_ELEM("RxRbufDescPart2UncErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_desc_part2_unc_err_cnt), +[C_RX_RBUF_DESC_PART1_COR_ERR] = CNTR_ELEM("RxRbufDescPart1CorErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_desc_part1_cor_err_cnt), +[C_RX_RBUF_DESC_PART1_UNC_ERR] = CNTR_ELEM("RxRbufDescPart1UncErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_desc_part1_unc_err_cnt), +[C_RX_HQ_INTR_FSM_ERR] = CNTR_ELEM("RxHqIntrFsmErr", 0, 0, + CNTR_NORMAL, + access_rx_hq_intr_fsm_err_cnt), +[C_RX_HQ_INTR_CSR_PARITY_ERR] = CNTR_ELEM("RxHqIntrCsrParityErr", 0, 0, + CNTR_NORMAL, + access_rx_hq_intr_csr_parity_err_cnt), +[C_RX_LOOKUP_CSR_PARITY_ERR] = CNTR_ELEM("RxLookupCsrParityErr", 0, 0, + CNTR_NORMAL, + access_rx_lookup_csr_parity_err_cnt), +[C_RX_LOOKUP_RCV_ARRAY_COR_ERR] = CNTR_ELEM("RxLookupRcvArrayCorErr", 0, 0, + CNTR_NORMAL, + access_rx_lookup_rcv_array_cor_err_cnt), +[C_RX_LOOKUP_RCV_ARRAY_UNC_ERR] = CNTR_ELEM("RxLookupRcvArrayUncErr", 0, 0, + CNTR_NORMAL, + access_rx_lookup_rcv_array_unc_err_cnt), +[C_RX_LOOKUP_DES_PART2_PARITY_ERR] = CNTR_ELEM("RxLookupDesPart2ParityErr", 0, + 0, CNTR_NORMAL, + access_rx_lookup_des_part2_parity_err_cnt), +[C_RX_LOOKUP_DES_PART1_UNC_COR_ERR] = CNTR_ELEM("RxLookupDesPart1UncCorErr", 0, + 0, CNTR_NORMAL, + access_rx_lookup_des_part1_unc_cor_err_cnt), +[C_RX_LOOKUP_DES_PART1_UNC_ERR] = CNTR_ELEM("RxLookupDesPart1UncErr", 0, 0, + CNTR_NORMAL, + access_rx_lookup_des_part1_unc_err_cnt), +[C_RX_RBUF_NEXT_FREE_BUF_COR_ERR] = CNTR_ELEM("RxRbufNextFreeBufCorErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_next_free_buf_cor_err_cnt), +[C_RX_RBUF_NEXT_FREE_BUF_UNC_ERR] = CNTR_ELEM("RxRbufNextFreeBufUncErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_next_free_buf_unc_err_cnt), +[C_RX_RBUF_FL_INIT_WR_ADDR_PARITY_ERR] = CNTR_ELEM( + "RxRbufFlInitWrAddrParityErr", 0, 0, + CNTR_NORMAL, + access_rbuf_fl_init_wr_addr_parity_err_cnt), +[C_RX_RBUF_FL_INITDONE_PARITY_ERR] = CNTR_ELEM("RxRbufFlInitdoneParityErr", 0, + 0, CNTR_NORMAL, + access_rx_rbuf_fl_initdone_parity_err_cnt), +[C_RX_RBUF_FL_WRITE_ADDR_PARITY_ERR] = CNTR_ELEM("RxRbufFlWrAddrParityErr", 0, + 0, CNTR_NORMAL, + access_rx_rbuf_fl_write_addr_parity_err_cnt), +[C_RX_RBUF_FL_RD_ADDR_PARITY_ERR] = CNTR_ELEM("RxRbufFlRdAddrParityErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_fl_rd_addr_parity_err_cnt), +[C_RX_RBUF_EMPTY_ERR] = CNTR_ELEM("RxRbufEmptyErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_empty_err_cnt), +[C_RX_RBUF_FULL_ERR] = CNTR_ELEM("RxRbufFullErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_full_err_cnt), +[C_RX_RBUF_BAD_LOOKUP_ERR] = CNTR_ELEM("RxRBufBadLookupErr", 0, 0, + CNTR_NORMAL, + access_rbuf_bad_lookup_err_cnt), +[C_RX_RBUF_CTX_ID_PARITY_ERR] = CNTR_ELEM("RxRbufCtxIdParityErr", 0, 0, + CNTR_NORMAL, + access_rbuf_ctx_id_parity_err_cnt), +[C_RX_RBUF_CSR_QEOPDW_PARITY_ERR] = CNTR_ELEM("RxRbufCsrQEOPDWParityErr", 0, 0, + CNTR_NORMAL, + access_rbuf_csr_qeopdw_parity_err_cnt), +[C_RX_RBUF_CSR_Q_NUM_OF_PKT_PARITY_ERR] = CNTR_ELEM( + "RxRbufCsrQNumOfPktParityErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_csr_q_num_of_pkt_parity_err_cnt), +[C_RX_RBUF_CSR_Q_T1_PTR_PARITY_ERR] = CNTR_ELEM( + "RxRbufCsrQTlPtrParityErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_csr_q_t1_ptr_parity_err_cnt), +[C_RX_RBUF_CSR_Q_HD_PTR_PARITY_ERR] = CNTR_ELEM("RxRbufCsrQHdPtrParityErr", 0, + 0, CNTR_NORMAL, + access_rx_rbuf_csr_q_hd_ptr_parity_err_cnt), +[C_RX_RBUF_CSR_Q_VLD_BIT_PARITY_ERR] = CNTR_ELEM("RxRbufCsrQVldBitParityErr", 0, + 0, CNTR_NORMAL, + access_rx_rbuf_csr_q_vld_bit_parity_err_cnt), +[C_RX_RBUF_CSR_Q_NEXT_BUF_PARITY_ERR] = CNTR_ELEM("RxRbufCsrQNextBufParityErr", + 0, 0, CNTR_NORMAL, + access_rx_rbuf_csr_q_next_buf_parity_err_cnt), +[C_RX_RBUF_CSR_Q_ENT_CNT_PARITY_ERR] = CNTR_ELEM("RxRbufCsrQEntCntParityErr", 0, + 0, CNTR_NORMAL, + access_rx_rbuf_csr_q_ent_cnt_parity_err_cnt), +[C_RX_RBUF_CSR_Q_HEAD_BUF_NUM_PARITY_ERR] = CNTR_ELEM( + "RxRbufCsrQHeadBufNumParityErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_csr_q_head_buf_num_parity_err_cnt), +[C_RX_RBUF_BLOCK_LIST_READ_COR_ERR] = CNTR_ELEM("RxRbufBlockListReadCorErr", 0, + 0, CNTR_NORMAL, + access_rx_rbuf_block_list_read_cor_err_cnt), +[C_RX_RBUF_BLOCK_LIST_READ_UNC_ERR] = CNTR_ELEM("RxRbufBlockListReadUncErr", 0, + 0, CNTR_NORMAL, + access_rx_rbuf_block_list_read_unc_err_cnt), +[C_RX_RBUF_LOOKUP_DES_COR_ERR] = CNTR_ELEM("RxRbufLookupDesCorErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_lookup_des_cor_err_cnt), +[C_RX_RBUF_LOOKUP_DES_UNC_ERR] = CNTR_ELEM("RxRbufLookupDesUncErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_lookup_des_unc_err_cnt), +[C_RX_RBUF_LOOKUP_DES_REG_UNC_COR_ERR] = CNTR_ELEM( + "RxRbufLookupDesRegUncCorErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_lookup_des_reg_unc_cor_err_cnt), +[C_RX_RBUF_LOOKUP_DES_REG_UNC_ERR] = CNTR_ELEM("RxRbufLookupDesRegUncErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_lookup_des_reg_unc_err_cnt), +[C_RX_RBUF_FREE_LIST_COR_ERR] = CNTR_ELEM("RxRbufFreeListCorErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_free_list_cor_err_cnt), +[C_RX_RBUF_FREE_LIST_UNC_ERR] = CNTR_ELEM("RxRbufFreeListUncErr", 0, 0, + CNTR_NORMAL, + access_rx_rbuf_free_list_unc_err_cnt), +[C_RX_RCV_FSM_ENCODING_ERR] = CNTR_ELEM("RxRcvFsmEncodingErr", 0, 0, + CNTR_NORMAL, + access_rx_rcv_fsm_encoding_err_cnt), +[C_RX_DMA_FLAG_COR_ERR] = CNTR_ELEM("RxDmaFlagCorErr", 0, 0, + CNTR_NORMAL, + access_rx_dma_flag_cor_err_cnt), +[C_RX_DMA_FLAG_UNC_ERR] = CNTR_ELEM("RxDmaFlagUncErr", 0, 0, + CNTR_NORMAL, + access_rx_dma_flag_unc_err_cnt), +[C_RX_DC_SOP_EOP_PARITY_ERR] = CNTR_ELEM("RxDcSopEopParityErr", 0, 0, + CNTR_NORMAL, + access_rx_dc_sop_eop_parity_err_cnt), +[C_RX_RCV_CSR_PARITY_ERR] = CNTR_ELEM("RxRcvCsrParityErr", 0, 0, + CNTR_NORMAL, + access_rx_rcv_csr_parity_err_cnt), +[C_RX_RCV_QP_MAP_TABLE_COR_ERR] = CNTR_ELEM("RxRcvQpMapTableCorErr", 0, 0, + CNTR_NORMAL, + access_rx_rcv_qp_map_table_cor_err_cnt), +[C_RX_RCV_QP_MAP_TABLE_UNC_ERR] = CNTR_ELEM("RxRcvQpMapTableUncErr", 0, 0, + CNTR_NORMAL, + access_rx_rcv_qp_map_table_unc_err_cnt), +[C_RX_RCV_DATA_COR_ERR] = CNTR_ELEM("RxRcvDataCorErr", 0, 0, + CNTR_NORMAL, + access_rx_rcv_data_cor_err_cnt), +[C_RX_RCV_DATA_UNC_ERR] = CNTR_ELEM("RxRcvDataUncErr", 0, 0, + CNTR_NORMAL, + access_rx_rcv_data_unc_err_cnt), +[C_RX_RCV_HDR_COR_ERR] = CNTR_ELEM("RxRcvHdrCorErr", 0, 0, + CNTR_NORMAL, + access_rx_rcv_hdr_cor_err_cnt), +[C_RX_RCV_HDR_UNC_ERR] = CNTR_ELEM("RxRcvHdrUncErr", 0, 0, + CNTR_NORMAL, + access_rx_rcv_hdr_unc_err_cnt), +[C_RX_DC_INTF_PARITY_ERR] = CNTR_ELEM("RxDcIntfParityErr", 0, 0, + CNTR_NORMAL, + access_rx_dc_intf_parity_err_cnt), +[C_RX_DMA_CSR_COR_ERR] = CNTR_ELEM("RxDmaCsrCorErr", 0, 0, + CNTR_NORMAL, + access_rx_dma_csr_cor_err_cnt), +/* SendPioErrStatus */ +[C_PIO_PEC_SOP_HEAD_PARITY_ERR] = CNTR_ELEM("PioPecSopHeadParityErr", 0, 0, + CNTR_NORMAL, + access_pio_pec_sop_head_parity_err_cnt), +[C_PIO_PCC_SOP_HEAD_PARITY_ERR] = CNTR_ELEM("PioPccSopHeadParityErr", 0, 0, + CNTR_NORMAL, + access_pio_pcc_sop_head_parity_err_cnt), +[C_PIO_LAST_RETURNED_CNT_PARITY_ERR] = CNTR_ELEM("PioLastReturnedCntParityErr", + 0, 0, CNTR_NORMAL, + access_pio_last_returned_cnt_parity_err_cnt), +[C_PIO_CURRENT_FREE_CNT_PARITY_ERR] = CNTR_ELEM("PioCurrentFreeCntParityErr", 0, + 0, CNTR_NORMAL, + access_pio_current_free_cnt_parity_err_cnt), +[C_PIO_RSVD_31_ERR] = CNTR_ELEM("Pio Reserved 31", 0, 0, + CNTR_NORMAL, + access_pio_reserved_31_err_cnt), +[C_PIO_RSVD_30_ERR] = CNTR_ELEM("Pio Reserved 30", 0, 0, + CNTR_NORMAL, + access_pio_reserved_30_err_cnt), +[C_PIO_PPMC_SOP_LEN_ERR] = CNTR_ELEM("PioPpmcSopLenErr", 0, 0, + CNTR_NORMAL, + access_pio_ppmc_sop_len_err_cnt), +[C_PIO_PPMC_BQC_MEM_PARITY_ERR] = CNTR_ELEM("PioPpmcBqcMemParityErr", 0, 0, + CNTR_NORMAL, + access_pio_ppmc_bqc_mem_parity_err_cnt), +[C_PIO_VL_FIFO_PARITY_ERR] = CNTR_ELEM("PioVlFifoParityErr", 0, 0, + CNTR_NORMAL, + access_pio_vl_fifo_parity_err_cnt), +[C_PIO_VLF_SOP_PARITY_ERR] = CNTR_ELEM("PioVlfSopParityErr", 0, 0, + CNTR_NORMAL, + access_pio_vlf_sop_parity_err_cnt), +[C_PIO_VLF_V1_LEN_PARITY_ERR] = CNTR_ELEM("PioVlfVlLenParityErr", 0, 0, + CNTR_NORMAL, + access_pio_vlf_v1_len_parity_err_cnt), +[C_PIO_BLOCK_QW_COUNT_PARITY_ERR] = CNTR_ELEM("PioBlockQwCountParityErr", 0, 0, + CNTR_NORMAL, + access_pio_block_qw_count_parity_err_cnt), +[C_PIO_WRITE_QW_VALID_PARITY_ERR] = CNTR_ELEM("PioWriteQwValidParityErr", 0, 0, + CNTR_NORMAL, + access_pio_write_qw_valid_parity_err_cnt), +[C_PIO_STATE_MACHINE_ERR] = CNTR_ELEM("PioStateMachineErr", 0, 0, + CNTR_NORMAL, + access_pio_state_machine_err_cnt), +[C_PIO_WRITE_DATA_PARITY_ERR] = CNTR_ELEM("PioWriteDataParityErr", 0, 0, + CNTR_NORMAL, + access_pio_write_data_parity_err_cnt), +[C_PIO_HOST_ADDR_MEM_COR_ERR] = CNTR_ELEM("PioHostAddrMemCorErr", 0, 0, + CNTR_NORMAL, + access_pio_host_addr_mem_cor_err_cnt), +[C_PIO_HOST_ADDR_MEM_UNC_ERR] = CNTR_ELEM("PioHostAddrMemUncErr", 0, 0, + CNTR_NORMAL, + access_pio_host_addr_mem_unc_err_cnt), +[C_PIO_PKT_EVICT_SM_OR_ARM_SM_ERR] = CNTR_ELEM("PioPktEvictSmOrArbSmErr", 0, 0, + CNTR_NORMAL, + access_pio_pkt_evict_sm_or_arb_sm_err_cnt), +[C_PIO_INIT_SM_IN_ERR] = CNTR_ELEM("PioInitSmInErr", 0, 0, + CNTR_NORMAL, + access_pio_init_sm_in_err_cnt), +[C_PIO_PPMC_PBL_FIFO_ERR] = CNTR_ELEM("PioPpmcPblFifoErr", 0, 0, + CNTR_NORMAL, + access_pio_ppmc_pbl_fifo_err_cnt), +[C_PIO_CREDIT_RET_FIFO_PARITY_ERR] = CNTR_ELEM("PioCreditRetFifoParityErr", 0, + 0, CNTR_NORMAL, + access_pio_credit_ret_fifo_parity_err_cnt), +[C_PIO_V1_LEN_MEM_BANK1_COR_ERR] = CNTR_ELEM("PioVlLenMemBank1CorErr", 0, 0, + CNTR_NORMAL, + access_pio_v1_len_mem_bank1_cor_err_cnt), +[C_PIO_V1_LEN_MEM_BANK0_COR_ERR] = CNTR_ELEM("PioVlLenMemBank0CorErr", 0, 0, + CNTR_NORMAL, + access_pio_v1_len_mem_bank0_cor_err_cnt), +[C_PIO_V1_LEN_MEM_BANK1_UNC_ERR] = CNTR_ELEM("PioVlLenMemBank1UncErr", 0, 0, + CNTR_NORMAL, + access_pio_v1_len_mem_bank1_unc_err_cnt), +[C_PIO_V1_LEN_MEM_BANK0_UNC_ERR] = CNTR_ELEM("PioVlLenMemBank0UncErr", 0, 0, + CNTR_NORMAL, + access_pio_v1_len_mem_bank0_unc_err_cnt), +[C_PIO_SM_PKT_RESET_PARITY_ERR] = CNTR_ELEM("PioSmPktResetParityErr", 0, 0, + CNTR_NORMAL, + access_pio_sm_pkt_reset_parity_err_cnt), +[C_PIO_PKT_EVICT_FIFO_PARITY_ERR] = CNTR_ELEM("PioPktEvictFifoParityErr", 0, 0, + CNTR_NORMAL, + access_pio_pkt_evict_fifo_parity_err_cnt), +[C_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR] = CNTR_ELEM( + "PioSbrdctrlCrrelFifoParityErr", 0, 0, + CNTR_NORMAL, + access_pio_sbrdctrl_crrel_fifo_parity_err_cnt), +[C_PIO_SBRDCTL_CRREL_PARITY_ERR] = CNTR_ELEM("PioSbrdctlCrrelParityErr", 0, 0, + CNTR_NORMAL, + access_pio_sbrdctl_crrel_parity_err_cnt), +[C_PIO_PEC_FIFO_PARITY_ERR] = CNTR_ELEM("PioPecFifoParityErr", 0, 0, + CNTR_NORMAL, + access_pio_pec_fifo_parity_err_cnt), +[C_PIO_PCC_FIFO_PARITY_ERR] = CNTR_ELEM("PioPccFifoParityErr", 0, 0, + CNTR_NORMAL, + access_pio_pcc_fifo_parity_err_cnt), +[C_PIO_SB_MEM_FIFO1_ERR] = CNTR_ELEM("PioSbMemFifo1Err", 0, 0, + CNTR_NORMAL, + access_pio_sb_mem_fifo1_err_cnt), +[C_PIO_SB_MEM_FIFO0_ERR] = CNTR_ELEM("PioSbMemFifo0Err", 0, 0, + CNTR_NORMAL, + access_pio_sb_mem_fifo0_err_cnt), +[C_PIO_CSR_PARITY_ERR] = CNTR_ELEM("PioCsrParityErr", 0, 0, + CNTR_NORMAL, + access_pio_csr_parity_err_cnt), +[C_PIO_WRITE_ADDR_PARITY_ERR] = CNTR_ELEM("PioWriteAddrParityErr", 0, 0, + CNTR_NORMAL, + access_pio_write_addr_parity_err_cnt), +[C_PIO_WRITE_BAD_CTXT_ERR] = CNTR_ELEM("PioWriteBadCtxtErr", 0, 0, + CNTR_NORMAL, + access_pio_write_bad_ctxt_err_cnt), +/* SendDmaErrStatus */ +[C_SDMA_PCIE_REQ_TRACKING_COR_ERR] = CNTR_ELEM("SDmaPcieReqTrackingCorErr", 0, + 0, CNTR_NORMAL, + access_sdma_pcie_req_tracking_cor_err_cnt), +[C_SDMA_PCIE_REQ_TRACKING_UNC_ERR] = CNTR_ELEM("SDmaPcieReqTrackingUncErr", 0, + 0, CNTR_NORMAL, + access_sdma_pcie_req_tracking_unc_err_cnt), +[C_SDMA_CSR_PARITY_ERR] = CNTR_ELEM("SDmaCsrParityErr", 0, 0, + CNTR_NORMAL, + access_sdma_csr_parity_err_cnt), +[C_SDMA_RPY_TAG_ERR] = CNTR_ELEM("SDmaRpyTagErr", 0, 0, + CNTR_NORMAL, + access_sdma_rpy_tag_err_cnt), +/* SendEgressErrStatus */ +[C_TX_READ_PIO_MEMORY_CSR_UNC_ERR] = CNTR_ELEM("TxReadPioMemoryCsrUncErr", 0, 0, + CNTR_NORMAL, + access_tx_read_pio_memory_csr_unc_err_cnt), +[C_TX_READ_SDMA_MEMORY_CSR_UNC_ERR] = CNTR_ELEM("TxReadSdmaMemoryCsrUncErr", 0, + 0, CNTR_NORMAL, + access_tx_read_sdma_memory_csr_err_cnt), +[C_TX_EGRESS_FIFO_COR_ERR] = CNTR_ELEM("TxEgressFifoCorErr", 0, 0, + CNTR_NORMAL, + access_tx_egress_fifo_cor_err_cnt), +[C_TX_READ_PIO_MEMORY_COR_ERR] = CNTR_ELEM("TxReadPioMemoryCorErr", 0, 0, + CNTR_NORMAL, + access_tx_read_pio_memory_cor_err_cnt), +[C_TX_READ_SDMA_MEMORY_COR_ERR] = CNTR_ELEM("TxReadSdmaMemoryCorErr", 0, 0, + CNTR_NORMAL, + access_tx_read_sdma_memory_cor_err_cnt), +[C_TX_SB_HDR_COR_ERR] = CNTR_ELEM("TxSbHdrCorErr", 0, 0, + CNTR_NORMAL, + access_tx_sb_hdr_cor_err_cnt), +[C_TX_CREDIT_OVERRUN_ERR] = CNTR_ELEM("TxCreditOverrunErr", 0, 0, + CNTR_NORMAL, + access_tx_credit_overrun_err_cnt), +[C_TX_LAUNCH_FIFO8_COR_ERR] = CNTR_ELEM("TxLaunchFifo8CorErr", 0, 0, + CNTR_NORMAL, + access_tx_launch_fifo8_cor_err_cnt), +[C_TX_LAUNCH_FIFO7_COR_ERR] = CNTR_ELEM("TxLaunchFifo7CorErr", 0, 0, + CNTR_NORMAL, + access_tx_launch_fifo7_cor_err_cnt), +[C_TX_LAUNCH_FIFO6_COR_ERR] = CNTR_ELEM("TxLaunchFifo6CorErr", 0, 0, + CNTR_NORMAL, + access_tx_launch_fifo6_cor_err_cnt), +[C_TX_LAUNCH_FIFO5_COR_ERR] = CNTR_ELEM("TxLaunchFifo5CorErr", 0, 0, + CNTR_NORMAL, + access_tx_launch_fifo5_cor_err_cnt), +[C_TX_LAUNCH_FIFO4_COR_ERR] = CNTR_ELEM("TxLaunchFifo4CorErr", 0, 0, + CNTR_NORMAL, + access_tx_launch_fifo4_cor_err_cnt), +[C_TX_LAUNCH_FIFO3_COR_ERR] = CNTR_ELEM("TxLaunchFifo3CorErr", 0, 0, + CNTR_NORMAL, + access_tx_launch_fifo3_cor_err_cnt), +[C_TX_LAUNCH_FIFO2_COR_ERR] = CNTR_ELEM("TxLaunchFifo2CorErr", 0, 0, + CNTR_NORMAL, + access_tx_launch_fifo2_cor_err_cnt), +[C_TX_LAUNCH_FIFO1_COR_ERR] = CNTR_ELEM("TxLaunchFifo1CorErr", 0, 0, + CNTR_NORMAL, + access_tx_launch_fifo1_cor_err_cnt), +[C_TX_LAUNCH_FIFO0_COR_ERR] = CNTR_ELEM("TxLaunchFifo0CorErr", 0, 0, + CNTR_NORMAL, + access_tx_launch_fifo0_cor_err_cnt), +[C_TX_CREDIT_RETURN_VL_ERR] = CNTR_ELEM("TxCreditReturnVLErr", 0, 0, + CNTR_NORMAL, + access_tx_credit_return_vl_err_cnt), +[C_TX_HCRC_INSERTION_ERR] = CNTR_ELEM("TxHcrcInsertionErr", 0, 0, + CNTR_NORMAL, + access_tx_hcrc_insertion_err_cnt), +[C_TX_EGRESS_FIFI_UNC_ERR] = CNTR_ELEM("TxEgressFifoUncErr", 0, 0, + CNTR_NORMAL, + access_tx_egress_fifo_unc_err_cnt), +[C_TX_READ_PIO_MEMORY_UNC_ERR] = CNTR_ELEM("TxReadPioMemoryUncErr", 0, 0, + CNTR_NORMAL, + access_tx_read_pio_memory_unc_err_cnt), +[C_TX_READ_SDMA_MEMORY_UNC_ERR] = CNTR_ELEM("TxReadSdmaMemoryUncErr", 0, 0, + CNTR_NORMAL, + access_tx_read_sdma_memory_unc_err_cnt), +[C_TX_SB_HDR_UNC_ERR] = CNTR_ELEM("TxSbHdrUncErr", 0, 0, + CNTR_NORMAL, + access_tx_sb_hdr_unc_err_cnt), +[C_TX_CREDIT_RETURN_PARITY_ERR] = CNTR_ELEM("TxCreditReturnParityErr", 0, 0, + CNTR_NORMAL, + access_tx_credit_return_partiy_err_cnt), +[C_TX_LAUNCH_FIFO8_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo8UncOrParityErr", + 0, 0, CNTR_NORMAL, + access_tx_launch_fifo8_unc_or_parity_err_cnt), +[C_TX_LAUNCH_FIFO7_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo7UncOrParityErr", + 0, 0, CNTR_NORMAL, + access_tx_launch_fifo7_unc_or_parity_err_cnt), +[C_TX_LAUNCH_FIFO6_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo6UncOrParityErr", + 0, 0, CNTR_NORMAL, + access_tx_launch_fifo6_unc_or_parity_err_cnt), +[C_TX_LAUNCH_FIFO5_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo5UncOrParityErr", + 0, 0, CNTR_NORMAL, + access_tx_launch_fifo5_unc_or_parity_err_cnt), +[C_TX_LAUNCH_FIFO4_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo4UncOrParityErr", + 0, 0, CNTR_NORMAL, + access_tx_launch_fifo4_unc_or_parity_err_cnt), +[C_TX_LAUNCH_FIFO3_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo3UncOrParityErr", + 0, 0, CNTR_NORMAL, + access_tx_launch_fifo3_unc_or_parity_err_cnt), +[C_TX_LAUNCH_FIFO2_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo2UncOrParityErr", + 0, 0, CNTR_NORMAL, + access_tx_launch_fifo2_unc_or_parity_err_cnt), +[C_TX_LAUNCH_FIFO1_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo1UncOrParityErr", + 0, 0, CNTR_NORMAL, + access_tx_launch_fifo1_unc_or_parity_err_cnt), +[C_TX_LAUNCH_FIFO0_UNC_OR_PARITY_ERR] = CNTR_ELEM("TxLaunchFifo0UncOrParityErr", + 0, 0, CNTR_NORMAL, + access_tx_launch_fifo0_unc_or_parity_err_cnt), +[C_TX_SDMA15_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma15DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma15_disallowed_packet_err_cnt), +[C_TX_SDMA14_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma14DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma14_disallowed_packet_err_cnt), +[C_TX_SDMA13_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma13DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma13_disallowed_packet_err_cnt), +[C_TX_SDMA12_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma12DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma12_disallowed_packet_err_cnt), +[C_TX_SDMA11_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma11DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma11_disallowed_packet_err_cnt), +[C_TX_SDMA10_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma10DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma10_disallowed_packet_err_cnt), +[C_TX_SDMA9_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma9DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma9_disallowed_packet_err_cnt), +[C_TX_SDMA8_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma8DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma8_disallowed_packet_err_cnt), +[C_TX_SDMA7_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma7DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma7_disallowed_packet_err_cnt), +[C_TX_SDMA6_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma6DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma6_disallowed_packet_err_cnt), +[C_TX_SDMA5_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma5DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma5_disallowed_packet_err_cnt), +[C_TX_SDMA4_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma4DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma4_disallowed_packet_err_cnt), +[C_TX_SDMA3_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma3DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma3_disallowed_packet_err_cnt), +[C_TX_SDMA2_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma2DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma2_disallowed_packet_err_cnt), +[C_TX_SDMA1_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma1DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma1_disallowed_packet_err_cnt), +[C_TX_SDMA0_DISALLOWED_PACKET_ERR] = CNTR_ELEM("TxSdma0DisallowedPacketErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma0_disallowed_packet_err_cnt), +[C_TX_CONFIG_PARITY_ERR] = CNTR_ELEM("TxConfigParityErr", 0, 0, + CNTR_NORMAL, + access_tx_config_parity_err_cnt), +[C_TX_SBRD_CTL_CSR_PARITY_ERR] = CNTR_ELEM("TxSbrdCtlCsrParityErr", 0, 0, + CNTR_NORMAL, + access_tx_sbrd_ctl_csr_parity_err_cnt), +[C_TX_LAUNCH_CSR_PARITY_ERR] = CNTR_ELEM("TxLaunchCsrParityErr", 0, 0, + CNTR_NORMAL, + access_tx_launch_csr_parity_err_cnt), +[C_TX_ILLEGAL_CL_ERR] = CNTR_ELEM("TxIllegalVLErr", 0, 0, + CNTR_NORMAL, + access_tx_illegal_vl_err_cnt), +[C_TX_SBRD_CTL_STATE_MACHINE_PARITY_ERR] = CNTR_ELEM( + "TxSbrdCtlStateMachineParityErr", 0, 0, + CNTR_NORMAL, + access_tx_sbrd_ctl_state_machine_parity_err_cnt), +[C_TX_RESERVED_10] = CNTR_ELEM("Tx Egress Reserved 10", 0, 0, + CNTR_NORMAL, + access_egress_reserved_10_err_cnt), +[C_TX_RESERVED_9] = CNTR_ELEM("Tx Egress Reserved 9", 0, 0, + CNTR_NORMAL, + access_egress_reserved_9_err_cnt), +[C_TX_SDMA_LAUNCH_INTF_PARITY_ERR] = CNTR_ELEM("TxSdmaLaunchIntfParityErr", + 0, 0, CNTR_NORMAL, + access_tx_sdma_launch_intf_parity_err_cnt), +[C_TX_PIO_LAUNCH_INTF_PARITY_ERR] = CNTR_ELEM("TxPioLaunchIntfParityErr", 0, 0, + CNTR_NORMAL, + access_tx_pio_launch_intf_parity_err_cnt), +[C_TX_RESERVED_6] = CNTR_ELEM("Tx Egress Reserved 6", 0, 0, + CNTR_NORMAL, + access_egress_reserved_6_err_cnt), +[C_TX_INCORRECT_LINK_STATE_ERR] = CNTR_ELEM("TxIncorrectLinkStateErr", 0, 0, + CNTR_NORMAL, + access_tx_incorrect_link_state_err_cnt), +[C_TX_LINK_DOWN_ERR] = CNTR_ELEM("TxLinkdownErr", 0, 0, + CNTR_NORMAL, + access_tx_linkdown_err_cnt), +[C_TX_EGRESS_FIFO_UNDERRUN_OR_PARITY_ERR] = CNTR_ELEM( + "EgressFifoUnderrunOrParityErr", 0, 0, + CNTR_NORMAL, + access_tx_egress_fifi_underrun_or_parity_err_cnt), +[C_TX_RESERVED_2] = CNTR_ELEM("Tx Egress Reserved 2", 0, 0, + CNTR_NORMAL, + access_egress_reserved_2_err_cnt), +[C_TX_PKT_INTEGRITY_MEM_UNC_ERR] = CNTR_ELEM("TxPktIntegrityMemUncErr", 0, 0, + CNTR_NORMAL, + access_tx_pkt_integrity_mem_unc_err_cnt), +[C_TX_PKT_INTEGRITY_MEM_COR_ERR] = CNTR_ELEM("TxPktIntegrityMemCorErr", 0, 0, + CNTR_NORMAL, + access_tx_pkt_integrity_mem_cor_err_cnt), +/* SendErrStatus */ +[C_SEND_CSR_WRITE_BAD_ADDR_ERR] = CNTR_ELEM("SendCsrWriteBadAddrErr", 0, 0, + CNTR_NORMAL, + access_send_csr_write_bad_addr_err_cnt), +[C_SEND_CSR_READ_BAD_ADD_ERR] = CNTR_ELEM("SendCsrReadBadAddrErr", 0, 0, + CNTR_NORMAL, + access_send_csr_read_bad_addr_err_cnt), +[C_SEND_CSR_PARITY_ERR] = CNTR_ELEM("SendCsrParityErr", 0, 0, + CNTR_NORMAL, + access_send_csr_parity_cnt), +/* SendCtxtErrStatus */ +[C_PIO_WRITE_OUT_OF_BOUNDS_ERR] = CNTR_ELEM("PioWriteOutOfBoundsErr", 0, 0, + CNTR_NORMAL, + access_pio_write_out_of_bounds_err_cnt), +[C_PIO_WRITE_OVERFLOW_ERR] = CNTR_ELEM("PioWriteOverflowErr", 0, 0, + CNTR_NORMAL, + access_pio_write_overflow_err_cnt), +[C_PIO_WRITE_CROSSES_BOUNDARY_ERR] = CNTR_ELEM("PioWriteCrossesBoundaryErr", + 0, 0, CNTR_NORMAL, + access_pio_write_crosses_boundary_err_cnt), +[C_PIO_DISALLOWED_PACKET_ERR] = CNTR_ELEM("PioDisallowedPacketErr", 0, 0, + CNTR_NORMAL, + access_pio_disallowed_packet_err_cnt), +[C_PIO_INCONSISTENT_SOP_ERR] = CNTR_ELEM("PioInconsistentSopErr", 0, 0, + CNTR_NORMAL, + access_pio_inconsistent_sop_err_cnt), +/* SendDmaEngErrStatus */ +[C_SDMA_HEADER_REQUEST_FIFO_COR_ERR] = CNTR_ELEM("SDmaHeaderRequestFifoCorErr", + 0, 0, CNTR_NORMAL, + access_sdma_header_request_fifo_cor_err_cnt), +[C_SDMA_HEADER_STORAGE_COR_ERR] = CNTR_ELEM("SDmaHeaderStorageCorErr", 0, 0, + CNTR_NORMAL, + access_sdma_header_storage_cor_err_cnt), +[C_SDMA_PACKET_TRACKING_COR_ERR] = CNTR_ELEM("SDmaPacketTrackingCorErr", 0, 0, + CNTR_NORMAL, + access_sdma_packet_tracking_cor_err_cnt), +[C_SDMA_ASSEMBLY_COR_ERR] = CNTR_ELEM("SDmaAssemblyCorErr", 0, 0, + CNTR_NORMAL, + access_sdma_assembly_cor_err_cnt), +[C_SDMA_DESC_TABLE_COR_ERR] = CNTR_ELEM("SDmaDescTableCorErr", 0, 0, + CNTR_NORMAL, + access_sdma_desc_table_cor_err_cnt), +[C_SDMA_HEADER_REQUEST_FIFO_UNC_ERR] = CNTR_ELEM("SDmaHeaderRequestFifoUncErr", + 0, 0, CNTR_NORMAL, + access_sdma_header_request_fifo_unc_err_cnt), +[C_SDMA_HEADER_STORAGE_UNC_ERR] = CNTR_ELEM("SDmaHeaderStorageUncErr", 0, 0, + CNTR_NORMAL, + access_sdma_header_storage_unc_err_cnt), +[C_SDMA_PACKET_TRACKING_UNC_ERR] = CNTR_ELEM("SDmaPacketTrackingUncErr", 0, 0, + CNTR_NORMAL, + access_sdma_packet_tracking_unc_err_cnt), +[C_SDMA_ASSEMBLY_UNC_ERR] = CNTR_ELEM("SDmaAssemblyUncErr", 0, 0, + CNTR_NORMAL, + access_sdma_assembly_unc_err_cnt), +[C_SDMA_DESC_TABLE_UNC_ERR] = CNTR_ELEM("SDmaDescTableUncErr", 0, 0, + CNTR_NORMAL, + access_sdma_desc_table_unc_err_cnt), +[C_SDMA_TIMEOUT_ERR] = CNTR_ELEM("SDmaTimeoutErr", 0, 0, + CNTR_NORMAL, + access_sdma_timeout_err_cnt), +[C_SDMA_HEADER_LENGTH_ERR] = CNTR_ELEM("SDmaHeaderLengthErr", 0, 0, + CNTR_NORMAL, + access_sdma_header_length_err_cnt), +[C_SDMA_HEADER_ADDRESS_ERR] = CNTR_ELEM("SDmaHeaderAddressErr", 0, 0, + CNTR_NORMAL, + access_sdma_header_address_err_cnt), +[C_SDMA_HEADER_SELECT_ERR] = CNTR_ELEM("SDmaHeaderSelectErr", 0, 0, + CNTR_NORMAL, + access_sdma_header_select_err_cnt), +[C_SMDA_RESERVED_9] = CNTR_ELEM("SDma Reserved 9", 0, 0, + CNTR_NORMAL, + access_sdma_reserved_9_err_cnt), +[C_SDMA_PACKET_DESC_OVERFLOW_ERR] = CNTR_ELEM("SDmaPacketDescOverflowErr", 0, 0, + CNTR_NORMAL, + access_sdma_packet_desc_overflow_err_cnt), +[C_SDMA_LENGTH_MISMATCH_ERR] = CNTR_ELEM("SDmaLengthMismatchErr", 0, 0, + CNTR_NORMAL, + access_sdma_length_mismatch_err_cnt), +[C_SDMA_HALT_ERR] = CNTR_ELEM("SDmaHaltErr", 0, 0, + CNTR_NORMAL, + access_sdma_halt_err_cnt), +[C_SDMA_MEM_READ_ERR] = CNTR_ELEM("SDmaMemReadErr", 0, 0, + CNTR_NORMAL, + access_sdma_mem_read_err_cnt), +[C_SDMA_FIRST_DESC_ERR] = CNTR_ELEM("SDmaFirstDescErr", 0, 0, + CNTR_NORMAL, + access_sdma_first_desc_err_cnt), +[C_SDMA_TAIL_OUT_OF_BOUNDS_ERR] = CNTR_ELEM("SDmaTailOutOfBoundsErr", 0, 0, + CNTR_NORMAL, + access_sdma_tail_out_of_bounds_err_cnt), +[C_SDMA_TOO_LONG_ERR] = CNTR_ELEM("SDmaTooLongErr", 0, 0, + CNTR_NORMAL, + access_sdma_too_long_err_cnt), +[C_SDMA_GEN_MISMATCH_ERR] = CNTR_ELEM("SDmaGenMismatchErr", 0, 0, + CNTR_NORMAL, + access_sdma_gen_mismatch_err_cnt), +[C_SDMA_WRONG_DW_ERR] = CNTR_ELEM("SDmaWrongDwErr", 0, 0, + CNTR_NORMAL, + access_sdma_wrong_dw_err_cnt), +}; + +static struct cntr_entry port_cntrs[PORT_CNTR_LAST] = { +[C_TX_UNSUP_VL] = TXE32_PORT_CNTR_ELEM(TxUnVLErr, SEND_UNSUP_VL_ERR_CNT, + CNTR_NORMAL), +[C_TX_INVAL_LEN] = TXE32_PORT_CNTR_ELEM(TxInvalLen, SEND_LEN_ERR_CNT, + CNTR_NORMAL), +[C_TX_MM_LEN_ERR] = TXE32_PORT_CNTR_ELEM(TxMMLenErr, SEND_MAX_MIN_LEN_ERR_CNT, + CNTR_NORMAL), +[C_TX_UNDERRUN] = TXE32_PORT_CNTR_ELEM(TxUnderrun, SEND_UNDERRUN_CNT, + CNTR_NORMAL), +[C_TX_FLOW_STALL] = TXE32_PORT_CNTR_ELEM(TxFlowStall, SEND_FLOW_STALL_CNT, + CNTR_NORMAL), +[C_TX_DROPPED] = TXE32_PORT_CNTR_ELEM(TxDropped, SEND_DROPPED_PKT_CNT, + CNTR_NORMAL), +[C_TX_HDR_ERR] = TXE32_PORT_CNTR_ELEM(TxHdrErr, SEND_HEADERS_ERR_CNT, + CNTR_NORMAL), +[C_TX_PKT] = TXE64_PORT_CNTR_ELEM(TxPkt, SEND_DATA_PKT_CNT, CNTR_NORMAL), +[C_TX_WORDS] = TXE64_PORT_CNTR_ELEM(TxWords, SEND_DWORD_CNT, CNTR_NORMAL), +[C_TX_WAIT] = TXE64_PORT_CNTR_ELEM(TxWait, SEND_WAIT_CNT, CNTR_SYNTH), +[C_TX_FLIT_VL] = TXE64_PORT_CNTR_ELEM(TxFlitVL, SEND_DATA_VL0_CNT, + CNTR_SYNTH | CNTR_VL), +[C_TX_PKT_VL] = TXE64_PORT_CNTR_ELEM(TxPktVL, SEND_DATA_PKT_VL0_CNT, + CNTR_SYNTH | CNTR_VL), +[C_TX_WAIT_VL] = TXE64_PORT_CNTR_ELEM(TxWaitVL, SEND_WAIT_VL0_CNT, + CNTR_SYNTH | CNTR_VL), +[C_RX_PKT] = RXE64_PORT_CNTR_ELEM(RxPkt, RCV_DATA_PKT_CNT, CNTR_NORMAL), +[C_RX_WORDS] = RXE64_PORT_CNTR_ELEM(RxWords, RCV_DWORD_CNT, CNTR_NORMAL), +[C_SW_LINK_DOWN] = CNTR_ELEM("SwLinkDown", 0, 0, CNTR_SYNTH | CNTR_32BIT, + access_sw_link_dn_cnt), +[C_SW_LINK_UP] = CNTR_ELEM("SwLinkUp", 0, 0, CNTR_SYNTH | CNTR_32BIT, + access_sw_link_up_cnt), +[C_SW_UNKNOWN_FRAME] = CNTR_ELEM("UnknownFrame", 0, 0, CNTR_NORMAL, + access_sw_unknown_frame_cnt), +[C_SW_XMIT_DSCD] = CNTR_ELEM("XmitDscd", 0, 0, CNTR_SYNTH | CNTR_32BIT, + access_sw_xmit_discards), +[C_SW_XMIT_DSCD_VL] = CNTR_ELEM("XmitDscdVl", 0, 0, + CNTR_SYNTH | CNTR_32BIT | CNTR_VL, + access_sw_xmit_discards), +[C_SW_XMIT_CSTR_ERR] = CNTR_ELEM("XmitCstrErr", 0, 0, CNTR_SYNTH, + access_xmit_constraint_errs), +[C_SW_RCV_CSTR_ERR] = CNTR_ELEM("RcvCstrErr", 0, 0, CNTR_SYNTH, + access_rcv_constraint_errs), +[C_SW_IBP_LOOP_PKTS] = SW_IBP_CNTR(LoopPkts, loop_pkts), +[C_SW_IBP_RC_RESENDS] = SW_IBP_CNTR(RcResend, rc_resends), +[C_SW_IBP_RNR_NAKS] = SW_IBP_CNTR(RnrNak, rnr_naks), +[C_SW_IBP_OTHER_NAKS] = SW_IBP_CNTR(OtherNak, other_naks), +[C_SW_IBP_RC_TIMEOUTS] = SW_IBP_CNTR(RcTimeOut, rc_timeouts), +[C_SW_IBP_PKT_DROPS] = SW_IBP_CNTR(PktDrop, pkt_drops), +[C_SW_IBP_DMA_WAIT] = SW_IBP_CNTR(DmaWait, dmawait), +[C_SW_IBP_RC_SEQNAK] = SW_IBP_CNTR(RcSeqNak, rc_seqnak), +[C_SW_IBP_RC_DUPREQ] = SW_IBP_CNTR(RcDupRew, rc_dupreq), +[C_SW_IBP_RDMA_SEQ] = SW_IBP_CNTR(RdmaSeq, rdma_seq), +[C_SW_IBP_UNALIGNED] = SW_IBP_CNTR(Unaligned, unaligned), +[C_SW_IBP_SEQ_NAK] = SW_IBP_CNTR(SeqNak, seq_naks), +[C_SW_CPU_RC_ACKS] = CNTR_ELEM("RcAcks", 0, 0, CNTR_NORMAL, + access_sw_cpu_rc_acks), +[C_SW_CPU_RC_QACKS] = CNTR_ELEM("RcQacks", 0, 0, CNTR_NORMAL, + access_sw_cpu_rc_qacks), +[C_SW_CPU_RC_DELAYED_COMP] = CNTR_ELEM("RcDelayComp", 0, 0, CNTR_NORMAL, + access_sw_cpu_rc_delayed_comp), +[OVR_LBL(0)] = OVR_ELM(0), [OVR_LBL(1)] = OVR_ELM(1), +[OVR_LBL(2)] = OVR_ELM(2), [OVR_LBL(3)] = OVR_ELM(3), +[OVR_LBL(4)] = OVR_ELM(4), [OVR_LBL(5)] = OVR_ELM(5), +[OVR_LBL(6)] = OVR_ELM(6), [OVR_LBL(7)] = OVR_ELM(7), +[OVR_LBL(8)] = OVR_ELM(8), [OVR_LBL(9)] = OVR_ELM(9), +[OVR_LBL(10)] = OVR_ELM(10), [OVR_LBL(11)] = OVR_ELM(11), +[OVR_LBL(12)] = OVR_ELM(12), [OVR_LBL(13)] = OVR_ELM(13), +[OVR_LBL(14)] = OVR_ELM(14), [OVR_LBL(15)] = OVR_ELM(15), +[OVR_LBL(16)] = OVR_ELM(16), [OVR_LBL(17)] = OVR_ELM(17), +[OVR_LBL(18)] = OVR_ELM(18), [OVR_LBL(19)] = OVR_ELM(19), +[OVR_LBL(20)] = OVR_ELM(20), [OVR_LBL(21)] = OVR_ELM(21), +[OVR_LBL(22)] = OVR_ELM(22), [OVR_LBL(23)] = OVR_ELM(23), +[OVR_LBL(24)] = OVR_ELM(24), [OVR_LBL(25)] = OVR_ELM(25), +[OVR_LBL(26)] = OVR_ELM(26), [OVR_LBL(27)] = OVR_ELM(27), +[OVR_LBL(28)] = OVR_ELM(28), [OVR_LBL(29)] = OVR_ELM(29), +[OVR_LBL(30)] = OVR_ELM(30), [OVR_LBL(31)] = OVR_ELM(31), +[OVR_LBL(32)] = OVR_ELM(32), [OVR_LBL(33)] = OVR_ELM(33), +[OVR_LBL(34)] = OVR_ELM(34), [OVR_LBL(35)] = OVR_ELM(35), +[OVR_LBL(36)] = OVR_ELM(36), [OVR_LBL(37)] = OVR_ELM(37), +[OVR_LBL(38)] = OVR_ELM(38), [OVR_LBL(39)] = OVR_ELM(39), +[OVR_LBL(40)] = OVR_ELM(40), [OVR_LBL(41)] = OVR_ELM(41), +[OVR_LBL(42)] = OVR_ELM(42), [OVR_LBL(43)] = OVR_ELM(43), +[OVR_LBL(44)] = OVR_ELM(44), [OVR_LBL(45)] = OVR_ELM(45), +[OVR_LBL(46)] = OVR_ELM(46), [OVR_LBL(47)] = OVR_ELM(47), +[OVR_LBL(48)] = OVR_ELM(48), [OVR_LBL(49)] = OVR_ELM(49), +[OVR_LBL(50)] = OVR_ELM(50), [OVR_LBL(51)] = OVR_ELM(51), +[OVR_LBL(52)] = OVR_ELM(52), [OVR_LBL(53)] = OVR_ELM(53), +[OVR_LBL(54)] = OVR_ELM(54), [OVR_LBL(55)] = OVR_ELM(55), +[OVR_LBL(56)] = OVR_ELM(56), [OVR_LBL(57)] = OVR_ELM(57), +[OVR_LBL(58)] = OVR_ELM(58), [OVR_LBL(59)] = OVR_ELM(59), +[OVR_LBL(60)] = OVR_ELM(60), [OVR_LBL(61)] = OVR_ELM(61), +[OVR_LBL(62)] = OVR_ELM(62), [OVR_LBL(63)] = OVR_ELM(63), +[OVR_LBL(64)] = OVR_ELM(64), [OVR_LBL(65)] = OVR_ELM(65), +[OVR_LBL(66)] = OVR_ELM(66), [OVR_LBL(67)] = OVR_ELM(67), +[OVR_LBL(68)] = OVR_ELM(68), [OVR_LBL(69)] = OVR_ELM(69), +[OVR_LBL(70)] = OVR_ELM(70), [OVR_LBL(71)] = OVR_ELM(71), +[OVR_LBL(72)] = OVR_ELM(72), [OVR_LBL(73)] = OVR_ELM(73), +[OVR_LBL(74)] = OVR_ELM(74), [OVR_LBL(75)] = OVR_ELM(75), +[OVR_LBL(76)] = OVR_ELM(76), [OVR_LBL(77)] = OVR_ELM(77), +[OVR_LBL(78)] = OVR_ELM(78), [OVR_LBL(79)] = OVR_ELM(79), +[OVR_LBL(80)] = OVR_ELM(80), [OVR_LBL(81)] = OVR_ELM(81), +[OVR_LBL(82)] = OVR_ELM(82), [OVR_LBL(83)] = OVR_ELM(83), +[OVR_LBL(84)] = OVR_ELM(84), [OVR_LBL(85)] = OVR_ELM(85), +[OVR_LBL(86)] = OVR_ELM(86), [OVR_LBL(87)] = OVR_ELM(87), +[OVR_LBL(88)] = OVR_ELM(88), [OVR_LBL(89)] = OVR_ELM(89), +[OVR_LBL(90)] = OVR_ELM(90), [OVR_LBL(91)] = OVR_ELM(91), +[OVR_LBL(92)] = OVR_ELM(92), [OVR_LBL(93)] = OVR_ELM(93), +[OVR_LBL(94)] = OVR_ELM(94), [OVR_LBL(95)] = OVR_ELM(95), +[OVR_LBL(96)] = OVR_ELM(96), [OVR_LBL(97)] = OVR_ELM(97), +[OVR_LBL(98)] = OVR_ELM(98), [OVR_LBL(99)] = OVR_ELM(99), +[OVR_LBL(100)] = OVR_ELM(100), [OVR_LBL(101)] = OVR_ELM(101), +[OVR_LBL(102)] = OVR_ELM(102), [OVR_LBL(103)] = OVR_ELM(103), +[OVR_LBL(104)] = OVR_ELM(104), [OVR_LBL(105)] = OVR_ELM(105), +[OVR_LBL(106)] = OVR_ELM(106), [OVR_LBL(107)] = OVR_ELM(107), +[OVR_LBL(108)] = OVR_ELM(108), [OVR_LBL(109)] = OVR_ELM(109), +[OVR_LBL(110)] = OVR_ELM(110), [OVR_LBL(111)] = OVR_ELM(111), +[OVR_LBL(112)] = OVR_ELM(112), [OVR_LBL(113)] = OVR_ELM(113), +[OVR_LBL(114)] = OVR_ELM(114), [OVR_LBL(115)] = OVR_ELM(115), +[OVR_LBL(116)] = OVR_ELM(116), [OVR_LBL(117)] = OVR_ELM(117), +[OVR_LBL(118)] = OVR_ELM(118), [OVR_LBL(119)] = OVR_ELM(119), +[OVR_LBL(120)] = OVR_ELM(120), [OVR_LBL(121)] = OVR_ELM(121), +[OVR_LBL(122)] = OVR_ELM(122), [OVR_LBL(123)] = OVR_ELM(123), +[OVR_LBL(124)] = OVR_ELM(124), [OVR_LBL(125)] = OVR_ELM(125), +[OVR_LBL(126)] = OVR_ELM(126), [OVR_LBL(127)] = OVR_ELM(127), +[OVR_LBL(128)] = OVR_ELM(128), [OVR_LBL(129)] = OVR_ELM(129), +[OVR_LBL(130)] = OVR_ELM(130), [OVR_LBL(131)] = OVR_ELM(131), +[OVR_LBL(132)] = OVR_ELM(132), [OVR_LBL(133)] = OVR_ELM(133), +[OVR_LBL(134)] = OVR_ELM(134), [OVR_LBL(135)] = OVR_ELM(135), +[OVR_LBL(136)] = OVR_ELM(136), [OVR_LBL(137)] = OVR_ELM(137), +[OVR_LBL(138)] = OVR_ELM(138), [OVR_LBL(139)] = OVR_ELM(139), +[OVR_LBL(140)] = OVR_ELM(140), [OVR_LBL(141)] = OVR_ELM(141), +[OVR_LBL(142)] = OVR_ELM(142), [OVR_LBL(143)] = OVR_ELM(143), +[OVR_LBL(144)] = OVR_ELM(144), [OVR_LBL(145)] = OVR_ELM(145), +[OVR_LBL(146)] = OVR_ELM(146), [OVR_LBL(147)] = OVR_ELM(147), +[OVR_LBL(148)] = OVR_ELM(148), [OVR_LBL(149)] = OVR_ELM(149), +[OVR_LBL(150)] = OVR_ELM(150), [OVR_LBL(151)] = OVR_ELM(151), +[OVR_LBL(152)] = OVR_ELM(152), [OVR_LBL(153)] = OVR_ELM(153), +[OVR_LBL(154)] = OVR_ELM(154), [OVR_LBL(155)] = OVR_ELM(155), +[OVR_LBL(156)] = OVR_ELM(156), [OVR_LBL(157)] = OVR_ELM(157), +[OVR_LBL(158)] = OVR_ELM(158), [OVR_LBL(159)] = OVR_ELM(159), +}; + +/* ======================================================================== */ + +/* return true if this is chip revision revision a */ +int is_ax(struct hfi1_devdata *dd) +{ + u8 chip_rev_minor = + dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT + & CCE_REVISION_CHIP_REV_MINOR_MASK; + return (chip_rev_minor & 0xf0) == 0; +} + +/* return true if this is chip revision revision b */ +int is_bx(struct hfi1_devdata *dd) +{ + u8 chip_rev_minor = + dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT + & CCE_REVISION_CHIP_REV_MINOR_MASK; + return (chip_rev_minor & 0xF0) == 0x10; +} + +/* + * Append string s to buffer buf. Arguments curp and len are the current + * position and remaining length, respectively. + * + * return 0 on success, 1 on out of room + */ +static int append_str(char *buf, char **curp, int *lenp, const char *s) +{ + char *p = *curp; + int len = *lenp; + int result = 0; /* success */ + char c; + + /* add a comma, if first in the buffer */ + if (p != buf) { + if (len == 0) { + result = 1; /* out of room */ + goto done; + } + *p++ = ','; + len--; + } + + /* copy the string */ + while ((c = *s++) != 0) { + if (len == 0) { + result = 1; /* out of room */ + goto done; + } + *p++ = c; + len--; + } + +done: + /* write return values */ + *curp = p; + *lenp = len; + + return result; +} + +/* + * Using the given flag table, print a comma separated string into + * the buffer. End in '*' if the buffer is too short. + */ +static char *flag_string(char *buf, int buf_len, u64 flags, + struct flag_table *table, int table_size) +{ + char extra[32]; + char *p = buf; + int len = buf_len; + int no_room = 0; + int i; + + /* make sure there is at least 2 so we can form "*" */ + if (len < 2) + return ""; + + len--; /* leave room for a nul */ + for (i = 0; i < table_size; i++) { + if (flags & table[i].flag) { + no_room = append_str(buf, &p, &len, table[i].str); + if (no_room) + break; + flags &= ~table[i].flag; + } + } + + /* any undocumented bits left? */ + if (!no_room && flags) { + snprintf(extra, sizeof(extra), "bits 0x%llx", flags); + no_room = append_str(buf, &p, &len, extra); + } + + /* add * if ran out of room */ + if (no_room) { + /* may need to back up to add space for a '*' */ + if (len == 0) + --p; + *p++ = '*'; + } + + /* add final nul - space already allocated above */ + *p = 0; + return buf; +} + +/* first 8 CCE error interrupt source names */ +static const char * const cce_misc_names[] = { + "CceErrInt", /* 0 */ + "RxeErrInt", /* 1 */ + "MiscErrInt", /* 2 */ + "Reserved3", /* 3 */ + "PioErrInt", /* 4 */ + "SDmaErrInt", /* 5 */ + "EgressErrInt", /* 6 */ + "TxeErrInt" /* 7 */ +}; + +/* + * Return the miscellaneous error interrupt name. + */ +static char *is_misc_err_name(char *buf, size_t bsize, unsigned int source) +{ + if (source < ARRAY_SIZE(cce_misc_names)) + strncpy(buf, cce_misc_names[source], bsize); + else + snprintf(buf, bsize, "Reserved%u", + source + IS_GENERAL_ERR_START); + + return buf; +} + +/* + * Return the SDMA engine error interrupt name. + */ +static char *is_sdma_eng_err_name(char *buf, size_t bsize, unsigned int source) +{ + snprintf(buf, bsize, "SDmaEngErrInt%u", source); + return buf; +} + +/* + * Return the send context error interrupt name. + */ +static char *is_sendctxt_err_name(char *buf, size_t bsize, unsigned int source) +{ + snprintf(buf, bsize, "SendCtxtErrInt%u", source); + return buf; +} + +static const char * const various_names[] = { + "PbcInt", + "GpioAssertInt", + "Qsfp1Int", + "Qsfp2Int", + "TCritInt" +}; + +/* + * Return the various interrupt name. + */ +static char *is_various_name(char *buf, size_t bsize, unsigned int source) +{ + if (source < ARRAY_SIZE(various_names)) + strncpy(buf, various_names[source], bsize); + else + snprintf(buf, bsize, "Reserved%u", source + IS_VARIOUS_START); + return buf; +} + +/* + * Return the DC interrupt name. + */ +static char *is_dc_name(char *buf, size_t bsize, unsigned int source) +{ + static const char * const dc_int_names[] = { + "common", + "lcb", + "8051", + "lbm" /* local block merge */ + }; + + if (source < ARRAY_SIZE(dc_int_names)) + snprintf(buf, bsize, "dc_%s_int", dc_int_names[source]); + else + snprintf(buf, bsize, "DCInt%u", source); + return buf; +} + +static const char * const sdma_int_names[] = { + "SDmaInt", + "SdmaIdleInt", + "SdmaProgressInt", +}; + +/* + * Return the SDMA engine interrupt name. + */ +static char *is_sdma_eng_name(char *buf, size_t bsize, unsigned int source) +{ + /* what interrupt */ + unsigned int what = source / TXE_NUM_SDMA_ENGINES; + /* which engine */ + unsigned int which = source % TXE_NUM_SDMA_ENGINES; + + if (likely(what < 3)) + snprintf(buf, bsize, "%s%u", sdma_int_names[what], which); + else + snprintf(buf, bsize, "Invalid SDMA interrupt %u", source); + return buf; +} + +/* + * Return the receive available interrupt name. + */ +static char *is_rcv_avail_name(char *buf, size_t bsize, unsigned int source) +{ + snprintf(buf, bsize, "RcvAvailInt%u", source); + return buf; +} + +/* + * Return the receive urgent interrupt name. + */ +static char *is_rcv_urgent_name(char *buf, size_t bsize, unsigned int source) +{ + snprintf(buf, bsize, "RcvUrgentInt%u", source); + return buf; +} + +/* + * Return the send credit interrupt name. + */ +static char *is_send_credit_name(char *buf, size_t bsize, unsigned int source) +{ + snprintf(buf, bsize, "SendCreditInt%u", source); + return buf; +} + +/* + * Return the reserved interrupt name. + */ +static char *is_reserved_name(char *buf, size_t bsize, unsigned int source) +{ + snprintf(buf, bsize, "Reserved%u", source + IS_RESERVED_START); + return buf; +} + +static char *cce_err_status_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, + cce_err_status_flags, + ARRAY_SIZE(cce_err_status_flags)); +} + +static char *rxe_err_status_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, + rxe_err_status_flags, + ARRAY_SIZE(rxe_err_status_flags)); +} + +static char *misc_err_status_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, misc_err_status_flags, + ARRAY_SIZE(misc_err_status_flags)); +} + +static char *pio_err_status_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, + pio_err_status_flags, + ARRAY_SIZE(pio_err_status_flags)); +} + +static char *sdma_err_status_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, + sdma_err_status_flags, + ARRAY_SIZE(sdma_err_status_flags)); +} + +static char *egress_err_status_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, + egress_err_status_flags, + ARRAY_SIZE(egress_err_status_flags)); +} + +static char *egress_err_info_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, + egress_err_info_flags, + ARRAY_SIZE(egress_err_info_flags)); +} + +static char *send_err_status_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, + send_err_status_flags, + ARRAY_SIZE(send_err_status_flags)); +} + +static void handle_cce_err(struct hfi1_devdata *dd, u32 unused, u64 reg) +{ + char buf[96]; + int i = 0; + + /* + * For most these errors, there is nothing that can be done except + * report or record it. + */ + dd_dev_info(dd, "CCE Error: %s\n", + cce_err_status_string(buf, sizeof(buf), reg)); + + if ((reg & CCE_ERR_STATUS_CCE_CLI2_ASYNC_FIFO_PARITY_ERR_SMASK) && + is_ax(dd) && (dd->icode != ICODE_FUNCTIONAL_SIMULATOR)) { + /* this error requires a manual drop into SPC freeze mode */ + /* then a fix up */ + start_freeze_handling(dd->pport, FREEZE_SELF); + } + + for (i = 0; i < NUM_CCE_ERR_STATUS_COUNTERS; i++) { + if (reg & (1ull << i)) { + incr_cntr64(&dd->cce_err_status_cnt[i]); + /* maintain a counter over all cce_err_status errors */ + incr_cntr64(&dd->sw_cce_err_status_aggregate); + } + } +} + +/* + * Check counters for receive errors that do not have an interrupt + * associated with them. + */ +#define RCVERR_CHECK_TIME 10 +static void update_rcverr_timer(unsigned long opaque) +{ + struct hfi1_devdata *dd = (struct hfi1_devdata *)opaque; + struct hfi1_pportdata *ppd = dd->pport; + u32 cur_ovfl_cnt = read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL); + + if (dd->rcv_ovfl_cnt < cur_ovfl_cnt && + ppd->port_error_action & OPA_PI_MASK_EX_BUFFER_OVERRUN) { + dd_dev_info(dd, "%s: PortErrorAction bounce\n", __func__); + set_link_down_reason( + ppd, OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN, 0, + OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN); + queue_work(ppd->hfi1_wq, &ppd->link_bounce_work); + } + dd->rcv_ovfl_cnt = (u32)cur_ovfl_cnt; + + mod_timer(&dd->rcverr_timer, jiffies + HZ * RCVERR_CHECK_TIME); +} + +static int init_rcverr(struct hfi1_devdata *dd) +{ + setup_timer(&dd->rcverr_timer, update_rcverr_timer, (unsigned long)dd); + /* Assume the hardware counter has been reset */ + dd->rcv_ovfl_cnt = 0; + return mod_timer(&dd->rcverr_timer, jiffies + HZ * RCVERR_CHECK_TIME); +} + +static void free_rcverr(struct hfi1_devdata *dd) +{ + if (dd->rcverr_timer.data) + del_timer_sync(&dd->rcverr_timer); + dd->rcverr_timer.data = 0; +} + +static void handle_rxe_err(struct hfi1_devdata *dd, u32 unused, u64 reg) +{ + char buf[96]; + int i = 0; + + dd_dev_info(dd, "Receive Error: %s\n", + rxe_err_status_string(buf, sizeof(buf), reg)); + + if (reg & ALL_RXE_FREEZE_ERR) { + int flags = 0; + + /* + * Freeze mode recovery is disabled for the errors + * in RXE_FREEZE_ABORT_MASK + */ + if (is_ax(dd) && (reg & RXE_FREEZE_ABORT_MASK)) + flags = FREEZE_ABORT; + + start_freeze_handling(dd->pport, flags); + } + + for (i = 0; i < NUM_RCV_ERR_STATUS_COUNTERS; i++) { + if (reg & (1ull << i)) + incr_cntr64(&dd->rcv_err_status_cnt[i]); + } +} + +static void handle_misc_err(struct hfi1_devdata *dd, u32 unused, u64 reg) +{ + char buf[96]; + int i = 0; + + dd_dev_info(dd, "Misc Error: %s", + misc_err_status_string(buf, sizeof(buf), reg)); + for (i = 0; i < NUM_MISC_ERR_STATUS_COUNTERS; i++) { + if (reg & (1ull << i)) + incr_cntr64(&dd->misc_err_status_cnt[i]); + } +} + +static void handle_pio_err(struct hfi1_devdata *dd, u32 unused, u64 reg) +{ + char buf[96]; + int i = 0; + + dd_dev_info(dd, "PIO Error: %s\n", + pio_err_status_string(buf, sizeof(buf), reg)); + + if (reg & ALL_PIO_FREEZE_ERR) + start_freeze_handling(dd->pport, 0); + + for (i = 0; i < NUM_SEND_PIO_ERR_STATUS_COUNTERS; i++) { + if (reg & (1ull << i)) + incr_cntr64(&dd->send_pio_err_status_cnt[i]); + } +} + +static void handle_sdma_err(struct hfi1_devdata *dd, u32 unused, u64 reg) +{ + char buf[96]; + int i = 0; + + dd_dev_info(dd, "SDMA Error: %s\n", + sdma_err_status_string(buf, sizeof(buf), reg)); + + if (reg & ALL_SDMA_FREEZE_ERR) + start_freeze_handling(dd->pport, 0); + + for (i = 0; i < NUM_SEND_DMA_ERR_STATUS_COUNTERS; i++) { + if (reg & (1ull << i)) + incr_cntr64(&dd->send_dma_err_status_cnt[i]); + } +} + +static inline void __count_port_discards(struct hfi1_pportdata *ppd) +{ + incr_cntr64(&ppd->port_xmit_discards); +} + +static void count_port_inactive(struct hfi1_devdata *dd) +{ + __count_port_discards(dd->pport); +} + +/* + * We have had a "disallowed packet" error during egress. Determine the + * integrity check which failed, and update relevant error counter, etc. + * + * Note that the SEND_EGRESS_ERR_INFO register has only a single + * bit of state per integrity check, and so we can miss the reason for an + * egress error if more than one packet fails the same integrity check + * since we cleared the corresponding bit in SEND_EGRESS_ERR_INFO. + */ +static void handle_send_egress_err_info(struct hfi1_devdata *dd, + int vl) +{ + struct hfi1_pportdata *ppd = dd->pport; + u64 src = read_csr(dd, SEND_EGRESS_ERR_SOURCE); /* read first */ + u64 info = read_csr(dd, SEND_EGRESS_ERR_INFO); + char buf[96]; + + /* clear down all observed info as quickly as possible after read */ + write_csr(dd, SEND_EGRESS_ERR_INFO, info); + + dd_dev_info(dd, + "Egress Error Info: 0x%llx, %s Egress Error Src 0x%llx\n", + info, egress_err_info_string(buf, sizeof(buf), info), src); + + /* Eventually add other counters for each bit */ + if (info & PORT_DISCARD_EGRESS_ERRS) { + int weight, i; + + /* + * Count all applicable bits as individual errors and + * attribute them to the packet that triggered this handler. + * This may not be completely accurate due to limitations + * on the available hardware error information. There is + * a single information register and any number of error + * packets may have occurred and contributed to it before + * this routine is called. This means that: + * a) If multiple packets with the same error occur before + * this routine is called, earlier packets are missed. + * There is only a single bit for each error type. + * b) Errors may not be attributed to the correct VL. + * The driver is attributing all bits in the info register + * to the packet that triggered this call, but bits + * could be an accumulation of different packets with + * different VLs. + * c) A single error packet may have multiple counts attached + * to it. There is no way for the driver to know if + * multiple bits set in the info register are due to a + * single packet or multiple packets. The driver assumes + * multiple packets. + */ + weight = hweight64(info & PORT_DISCARD_EGRESS_ERRS); + for (i = 0; i < weight; i++) { + __count_port_discards(ppd); + if (vl >= 0 && vl < TXE_NUM_DATA_VL) + incr_cntr64(&ppd->port_xmit_discards_vl[vl]); + else if (vl == 15) + incr_cntr64(&ppd->port_xmit_discards_vl + [C_VL_15]); + } + } +} + +/* + * Input value is a bit position within the SEND_EGRESS_ERR_STATUS + * register. Does it represent a 'port inactive' error? + */ +static inline int port_inactive_err(u64 posn) +{ + return (posn >= SEES(TX_LINKDOWN) && + posn <= SEES(TX_INCORRECT_LINK_STATE)); +} + +/* + * Input value is a bit position within the SEND_EGRESS_ERR_STATUS + * register. Does it represent a 'disallowed packet' error? + */ +static inline int disallowed_pkt_err(int posn) +{ + return (posn >= SEES(TX_SDMA0_DISALLOWED_PACKET) && + posn <= SEES(TX_SDMA15_DISALLOWED_PACKET)); +} + +/* + * Input value is a bit position of one of the SDMA engine disallowed + * packet errors. Return which engine. Use of this must be guarded by + * disallowed_pkt_err(). + */ +static inline int disallowed_pkt_engine(int posn) +{ + return posn - SEES(TX_SDMA0_DISALLOWED_PACKET); +} + +/* + * Translate an SDMA engine to a VL. Return -1 if the tranlation cannot + * be done. + */ +static int engine_to_vl(struct hfi1_devdata *dd, int engine) +{ + struct sdma_vl_map *m; + int vl; + + /* range check */ + if (engine < 0 || engine >= TXE_NUM_SDMA_ENGINES) + return -1; + + rcu_read_lock(); + m = rcu_dereference(dd->sdma_map); + vl = m->engine_to_vl[engine]; + rcu_read_unlock(); + + return vl; +} + +/* + * Translate the send context (sofware index) into a VL. Return -1 if the + * translation cannot be done. + */ +static int sc_to_vl(struct hfi1_devdata *dd, int sw_index) +{ + struct send_context_info *sci; + struct send_context *sc; + int i; + + sci = &dd->send_contexts[sw_index]; + + /* there is no information for user (PSM) and ack contexts */ + if ((sci->type != SC_KERNEL) && (sci->type != SC_VL15)) + return -1; + + sc = sci->sc; + if (!sc) + return -1; + if (dd->vld[15].sc == sc) + return 15; + for (i = 0; i < num_vls; i++) + if (dd->vld[i].sc == sc) + return i; + + return -1; +} + +static void handle_egress_err(struct hfi1_devdata *dd, u32 unused, u64 reg) +{ + u64 reg_copy = reg, handled = 0; + char buf[96]; + int i = 0; + + if (reg & ALL_TXE_EGRESS_FREEZE_ERR) + start_freeze_handling(dd->pport, 0); + else if (is_ax(dd) && + (reg & SEND_EGRESS_ERR_STATUS_TX_CREDIT_RETURN_VL_ERR_SMASK) && + (dd->icode != ICODE_FUNCTIONAL_SIMULATOR)) + start_freeze_handling(dd->pport, 0); + + while (reg_copy) { + int posn = fls64(reg_copy); + /* fls64() returns a 1-based offset, we want it zero based */ + int shift = posn - 1; + u64 mask = 1ULL << shift; + + if (port_inactive_err(shift)) { + count_port_inactive(dd); + handled |= mask; + } else if (disallowed_pkt_err(shift)) { + int vl = engine_to_vl(dd, disallowed_pkt_engine(shift)); + + handle_send_egress_err_info(dd, vl); + handled |= mask; + } + reg_copy &= ~mask; + } + + reg &= ~handled; + + if (reg) + dd_dev_info(dd, "Egress Error: %s\n", + egress_err_status_string(buf, sizeof(buf), reg)); + + for (i = 0; i < NUM_SEND_EGRESS_ERR_STATUS_COUNTERS; i++) { + if (reg & (1ull << i)) + incr_cntr64(&dd->send_egress_err_status_cnt[i]); + } +} + +static void handle_txe_err(struct hfi1_devdata *dd, u32 unused, u64 reg) +{ + char buf[96]; + int i = 0; + + dd_dev_info(dd, "Send Error: %s\n", + send_err_status_string(buf, sizeof(buf), reg)); + + for (i = 0; i < NUM_SEND_ERR_STATUS_COUNTERS; i++) { + if (reg & (1ull << i)) + incr_cntr64(&dd->send_err_status_cnt[i]); + } +} + +/* + * The maximum number of times the error clear down will loop before + * blocking a repeating error. This value is arbitrary. + */ +#define MAX_CLEAR_COUNT 20 + +/* + * Clear and handle an error register. All error interrupts are funneled + * through here to have a central location to correctly handle single- + * or multi-shot errors. + * + * For non per-context registers, call this routine with a context value + * of 0 so the per-context offset is zero. + * + * If the handler loops too many times, assume that something is wrong + * and can't be fixed, so mask the error bits. + */ +static void interrupt_clear_down(struct hfi1_devdata *dd, + u32 context, + const struct err_reg_info *eri) +{ + u64 reg; + u32 count; + + /* read in a loop until no more errors are seen */ + count = 0; + while (1) { + reg = read_kctxt_csr(dd, context, eri->status); + if (reg == 0) + break; + write_kctxt_csr(dd, context, eri->clear, reg); + if (likely(eri->handler)) + eri->handler(dd, context, reg); + count++; + if (count > MAX_CLEAR_COUNT) { + u64 mask; + + dd_dev_err(dd, "Repeating %s bits 0x%llx - masking\n", + eri->desc, reg); + /* + * Read-modify-write so any other masked bits + * remain masked. + */ + mask = read_kctxt_csr(dd, context, eri->mask); + mask &= ~reg; + write_kctxt_csr(dd, context, eri->mask, mask); + break; + } + } +} + +/* + * CCE block "misc" interrupt. Source is < 16. + */ +static void is_misc_err_int(struct hfi1_devdata *dd, unsigned int source) +{ + const struct err_reg_info *eri = &misc_errs[source]; + + if (eri->handler) { + interrupt_clear_down(dd, 0, eri); + } else { + dd_dev_err(dd, "Unexpected misc interrupt (%u) - reserved\n", + source); + } +} + +static char *send_context_err_status_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, + sc_err_status_flags, + ARRAY_SIZE(sc_err_status_flags)); +} + +/* + * Send context error interrupt. Source (hw_context) is < 160. + * + * All send context errors cause the send context to halt. The normal + * clear-down mechanism cannot be used because we cannot clear the + * error bits until several other long-running items are done first. + * This is OK because with the context halted, nothing else is going + * to happen on it anyway. + */ +static void is_sendctxt_err_int(struct hfi1_devdata *dd, + unsigned int hw_context) +{ + struct send_context_info *sci; + struct send_context *sc; + char flags[96]; + u64 status; + u32 sw_index; + int i = 0; + + sw_index = dd->hw_to_sw[hw_context]; + if (sw_index >= dd->num_send_contexts) { + dd_dev_err(dd, + "out of range sw index %u for send context %u\n", + sw_index, hw_context); + return; + } + sci = &dd->send_contexts[sw_index]; + sc = sci->sc; + if (!sc) { + dd_dev_err(dd, "%s: context %u(%u): no sc?\n", __func__, + sw_index, hw_context); + return; + } + + /* tell the software that a halt has begun */ + sc_stop(sc, SCF_HALTED); + + status = read_kctxt_csr(dd, hw_context, SEND_CTXT_ERR_STATUS); + + dd_dev_info(dd, "Send Context %u(%u) Error: %s\n", sw_index, hw_context, + send_context_err_status_string(flags, sizeof(flags), + status)); + + if (status & SEND_CTXT_ERR_STATUS_PIO_DISALLOWED_PACKET_ERR_SMASK) + handle_send_egress_err_info(dd, sc_to_vl(dd, sw_index)); + + /* + * Automatically restart halted kernel contexts out of interrupt + * context. User contexts must ask the driver to restart the context. + */ + if (sc->type != SC_USER) + queue_work(dd->pport->hfi1_wq, &sc->halt_work); + + /* + * Update the counters for the corresponding status bits. + * Note that these particular counters are aggregated over all + * 160 contexts. + */ + for (i = 0; i < NUM_SEND_CTXT_ERR_STATUS_COUNTERS; i++) { + if (status & (1ull << i)) + incr_cntr64(&dd->sw_ctxt_err_status_cnt[i]); + } +} + +static void handle_sdma_eng_err(struct hfi1_devdata *dd, + unsigned int source, u64 status) +{ + struct sdma_engine *sde; + int i = 0; + + sde = &dd->per_sdma[source]; +#ifdef CONFIG_SDMA_VERBOSITY + dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx, + slashstrip(__FILE__), __LINE__, __func__); + dd_dev_err(sde->dd, "CONFIG SDMA(%u) source: %u status 0x%llx\n", + sde->this_idx, source, (unsigned long long)status); +#endif + sde->err_cnt++; + sdma_engine_error(sde, status); + + /* + * Update the counters for the corresponding status bits. + * Note that these particular counters are aggregated over + * all 16 DMA engines. + */ + for (i = 0; i < NUM_SEND_DMA_ENG_ERR_STATUS_COUNTERS; i++) { + if (status & (1ull << i)) + incr_cntr64(&dd->sw_send_dma_eng_err_status_cnt[i]); + } +} + +/* + * CCE block SDMA error interrupt. Source is < 16. + */ +static void is_sdma_eng_err_int(struct hfi1_devdata *dd, unsigned int source) +{ +#ifdef CONFIG_SDMA_VERBOSITY + struct sdma_engine *sde = &dd->per_sdma[source]; + + dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx, + slashstrip(__FILE__), __LINE__, __func__); + dd_dev_err(dd, "CONFIG SDMA(%u) source: %u\n", sde->this_idx, + source); + sdma_dumpstate(sde); +#endif + interrupt_clear_down(dd, source, &sdma_eng_err); +} + +/* + * CCE block "various" interrupt. Source is < 8. + */ +static void is_various_int(struct hfi1_devdata *dd, unsigned int source) +{ + const struct err_reg_info *eri = &various_err[source]; + + /* + * TCritInt cannot go through interrupt_clear_down() + * because it is not a second tier interrupt. The handler + * should be called directly. + */ + if (source == TCRIT_INT_SOURCE) + handle_temp_err(dd); + else if (eri->handler) + interrupt_clear_down(dd, 0, eri); + else + dd_dev_info(dd, + "%s: Unimplemented/reserved interrupt %d\n", + __func__, source); +} + +static void handle_qsfp_int(struct hfi1_devdata *dd, u32 src_ctx, u64 reg) +{ + /* src_ctx is always zero */ + struct hfi1_pportdata *ppd = dd->pport; + unsigned long flags; + u64 qsfp_int_mgmt = (u64)(QSFP_HFI0_INT_N | QSFP_HFI0_MODPRST_N); + + if (reg & QSFP_HFI0_MODPRST_N) { + if (!qsfp_mod_present(ppd)) { + dd_dev_info(dd, "%s: QSFP module removed\n", + __func__); + + ppd->driver_link_ready = 0; + /* + * Cable removed, reset all our information about the + * cache and cable capabilities + */ + + spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags); + /* + * We don't set cache_refresh_required here as we expect + * an interrupt when a cable is inserted + */ + ppd->qsfp_info.cache_valid = 0; + ppd->qsfp_info.reset_needed = 0; + ppd->qsfp_info.limiting_active = 0; + spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, + flags); + /* Invert the ModPresent pin now to detect plug-in */ + write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_INVERT : + ASIC_QSFP1_INVERT, qsfp_int_mgmt); + + if ((ppd->offline_disabled_reason > + HFI1_ODR_MASK( + OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED)) || + (ppd->offline_disabled_reason == + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE))) + ppd->offline_disabled_reason = + HFI1_ODR_MASK( + OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED); + + if (ppd->host_link_state == HLS_DN_POLL) { + /* + * The link is still in POLL. This means + * that the normal link down processing + * will not happen. We have to do it here + * before turning the DC off. + */ + queue_work(ppd->hfi1_wq, &ppd->link_down_work); + } + } else { + dd_dev_info(dd, "%s: QSFP module inserted\n", + __func__); + + spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags); + ppd->qsfp_info.cache_valid = 0; + ppd->qsfp_info.cache_refresh_required = 1; + spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, + flags); + + /* + * Stop inversion of ModPresent pin to detect + * removal of the cable + */ + qsfp_int_mgmt &= ~(u64)QSFP_HFI0_MODPRST_N; + write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_INVERT : + ASIC_QSFP1_INVERT, qsfp_int_mgmt); + + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_TRANSIENT); + } + } + + if (reg & QSFP_HFI0_INT_N) { + dd_dev_info(dd, "%s: Interrupt received from QSFP module\n", + __func__); + spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags); + ppd->qsfp_info.check_interrupt_flags = 1; + spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, flags); + } + + /* Schedule the QSFP work only if there is a cable attached. */ + if (qsfp_mod_present(ppd)) + queue_work(ppd->hfi1_wq, &ppd->qsfp_info.qsfp_work); +} + +static int request_host_lcb_access(struct hfi1_devdata *dd) +{ + int ret; + + ret = do_8051_command(dd, HCMD_MISC, + (u64)HCMD_MISC_REQUEST_LCB_ACCESS << + LOAD_DATA_FIELD_ID_SHIFT, NULL); + if (ret != HCMD_SUCCESS) { + dd_dev_err(dd, "%s: command failed with error %d\n", + __func__, ret); + } + return ret == HCMD_SUCCESS ? 0 : -EBUSY; +} + +static int request_8051_lcb_access(struct hfi1_devdata *dd) +{ + int ret; + + ret = do_8051_command(dd, HCMD_MISC, + (u64)HCMD_MISC_GRANT_LCB_ACCESS << + LOAD_DATA_FIELD_ID_SHIFT, NULL); + if (ret != HCMD_SUCCESS) { + dd_dev_err(dd, "%s: command failed with error %d\n", + __func__, ret); + } + return ret == HCMD_SUCCESS ? 0 : -EBUSY; +} + +/* + * Set the LCB selector - allow host access. The DCC selector always + * points to the host. + */ +static inline void set_host_lcb_access(struct hfi1_devdata *dd) +{ + write_csr(dd, DC_DC8051_CFG_CSR_ACCESS_SEL, + DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK | + DC_DC8051_CFG_CSR_ACCESS_SEL_LCB_SMASK); +} + +/* + * Clear the LCB selector - allow 8051 access. The DCC selector always + * points to the host. + */ +static inline void set_8051_lcb_access(struct hfi1_devdata *dd) +{ + write_csr(dd, DC_DC8051_CFG_CSR_ACCESS_SEL, + DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK); +} + +/* + * Acquire LCB access from the 8051. If the host already has access, + * just increment a counter. Otherwise, inform the 8051 that the + * host is taking access. + * + * Returns: + * 0 on success + * -EBUSY if the 8051 has control and cannot be disturbed + * -errno if unable to acquire access from the 8051 + */ +int acquire_lcb_access(struct hfi1_devdata *dd, int sleep_ok) +{ + struct hfi1_pportdata *ppd = dd->pport; + int ret = 0; + + /* + * Use the host link state lock so the operation of this routine + * { link state check, selector change, count increment } can occur + * as a unit against a link state change. Otherwise there is a + * race between the state change and the count increment. + */ + if (sleep_ok) { + mutex_lock(&ppd->hls_lock); + } else { + while (!mutex_trylock(&ppd->hls_lock)) + udelay(1); + } + + /* this access is valid only when the link is up */ + if (ppd->host_link_state & HLS_DOWN) { + dd_dev_info(dd, "%s: link state %s not up\n", + __func__, link_state_name(ppd->host_link_state)); + ret = -EBUSY; + goto done; + } + + if (dd->lcb_access_count == 0) { + ret = request_host_lcb_access(dd); + if (ret) { + dd_dev_err(dd, + "%s: unable to acquire LCB access, err %d\n", + __func__, ret); + goto done; + } + set_host_lcb_access(dd); + } + dd->lcb_access_count++; +done: + mutex_unlock(&ppd->hls_lock); + return ret; +} + +/* + * Release LCB access by decrementing the use count. If the count is moving + * from 1 to 0, inform 8051 that it has control back. + * + * Returns: + * 0 on success + * -errno if unable to release access to the 8051 + */ +int release_lcb_access(struct hfi1_devdata *dd, int sleep_ok) +{ + int ret = 0; + + /* + * Use the host link state lock because the acquire needed it. + * Here, we only need to keep { selector change, count decrement } + * as a unit. + */ + if (sleep_ok) { + mutex_lock(&dd->pport->hls_lock); + } else { + while (!mutex_trylock(&dd->pport->hls_lock)) + udelay(1); + } + + if (dd->lcb_access_count == 0) { + dd_dev_err(dd, "%s: LCB access count is zero. Skipping.\n", + __func__); + goto done; + } + + if (dd->lcb_access_count == 1) { + set_8051_lcb_access(dd); + ret = request_8051_lcb_access(dd); + if (ret) { + dd_dev_err(dd, + "%s: unable to release LCB access, err %d\n", + __func__, ret); + /* restore host access if the grant didn't work */ + set_host_lcb_access(dd); + goto done; + } + } + dd->lcb_access_count--; +done: + mutex_unlock(&dd->pport->hls_lock); + return ret; +} + +/* + * Initialize LCB access variables and state. Called during driver load, + * after most of the initialization is finished. + * + * The DC default is LCB access on for the host. The driver defaults to + * leaving access to the 8051. Assign access now - this constrains the call + * to this routine to be after all LCB set-up is done. In particular, after + * hf1_init_dd() -> set_up_interrupts() -> clear_all_interrupts() + */ +static void init_lcb_access(struct hfi1_devdata *dd) +{ + dd->lcb_access_count = 0; +} + +/* + * Write a response back to a 8051 request. + */ +static void hreq_response(struct hfi1_devdata *dd, u8 return_code, u16 rsp_data) +{ + write_csr(dd, DC_DC8051_CFG_EXT_DEV_0, + DC_DC8051_CFG_EXT_DEV_0_COMPLETED_SMASK | + (u64)return_code << + DC_DC8051_CFG_EXT_DEV_0_RETURN_CODE_SHIFT | + (u64)rsp_data << DC_DC8051_CFG_EXT_DEV_0_RSP_DATA_SHIFT); +} + +/* + * Handle host requests from the 8051. + */ +static void handle_8051_request(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + u64 reg; + u16 data = 0; + u8 type; + + reg = read_csr(dd, DC_DC8051_CFG_EXT_DEV_1); + if ((reg & DC_DC8051_CFG_EXT_DEV_1_REQ_NEW_SMASK) == 0) + return; /* no request */ + + /* zero out COMPLETED so the response is seen */ + write_csr(dd, DC_DC8051_CFG_EXT_DEV_0, 0); + + /* extract request details */ + type = (reg >> DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_SHIFT) + & DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_MASK; + data = (reg >> DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SHIFT) + & DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_MASK; + + switch (type) { + case HREQ_LOAD_CONFIG: + case HREQ_SAVE_CONFIG: + case HREQ_READ_CONFIG: + case HREQ_SET_TX_EQ_ABS: + case HREQ_SET_TX_EQ_REL: + case HREQ_ENABLE: + dd_dev_info(dd, "8051 request: request 0x%x not supported\n", + type); + hreq_response(dd, HREQ_NOT_SUPPORTED, 0); + break; + case HREQ_CONFIG_DONE: + hreq_response(dd, HREQ_SUCCESS, 0); + break; + + case HREQ_INTERFACE_TEST: + hreq_response(dd, HREQ_SUCCESS, data); + break; + default: + dd_dev_err(dd, "8051 request: unknown request 0x%x\n", type); + hreq_response(dd, HREQ_NOT_SUPPORTED, 0); + break; + } +} + +static void write_global_credit(struct hfi1_devdata *dd, + u8 vau, u16 total, u16 shared) +{ + write_csr(dd, SEND_CM_GLOBAL_CREDIT, + ((u64)total << + SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT) | + ((u64)shared << + SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT) | + ((u64)vau << SEND_CM_GLOBAL_CREDIT_AU_SHIFT)); +} + +/* + * Set up initial VL15 credits of the remote. Assumes the rest of + * the CM credit registers are zero from a previous global or credit reset . + */ +void set_up_vl15(struct hfi1_devdata *dd, u8 vau, u16 vl15buf) +{ + /* leave shared count at zero for both global and VL15 */ + write_global_credit(dd, vau, vl15buf, 0); + + /* We may need some credits for another VL when sending packets + * with the snoop interface. Dividing it down the middle for VL15 + * and VL0 should suffice. + */ + if (unlikely(dd->hfi1_snoop.mode_flag == HFI1_PORT_SNOOP_MODE)) { + write_csr(dd, SEND_CM_CREDIT_VL15, (u64)(vl15buf >> 1) + << SEND_CM_CREDIT_VL15_DEDICATED_LIMIT_VL_SHIFT); + write_csr(dd, SEND_CM_CREDIT_VL, (u64)(vl15buf >> 1) + << SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT); + } else { + write_csr(dd, SEND_CM_CREDIT_VL15, (u64)vl15buf + << SEND_CM_CREDIT_VL15_DEDICATED_LIMIT_VL_SHIFT); + } +} + +/* + * Zero all credit details from the previous connection and + * reset the CM manager's internal counters. + */ +void reset_link_credits(struct hfi1_devdata *dd) +{ + int i; + + /* remove all previous VL credit limits */ + for (i = 0; i < TXE_NUM_DATA_VL; i++) + write_csr(dd, SEND_CM_CREDIT_VL + (8 * i), 0); + write_csr(dd, SEND_CM_CREDIT_VL15, 0); + write_global_credit(dd, 0, 0, 0); + /* reset the CM block */ + pio_send_control(dd, PSC_CM_RESET); +} + +/* convert a vCU to a CU */ +static u32 vcu_to_cu(u8 vcu) +{ + return 1 << vcu; +} + +/* convert a CU to a vCU */ +static u8 cu_to_vcu(u32 cu) +{ + return ilog2(cu); +} + +/* convert a vAU to an AU */ +static u32 vau_to_au(u8 vau) +{ + return 8 * (1 << vau); +} + +static void set_linkup_defaults(struct hfi1_pportdata *ppd) +{ + ppd->sm_trap_qp = 0x0; + ppd->sa_qp = 0x1; +} + +/* + * Graceful LCB shutdown. This leaves the LCB FIFOs in reset. + */ +static void lcb_shutdown(struct hfi1_devdata *dd, int abort) +{ + u64 reg; + + /* clear lcb run: LCB_CFG_RUN.EN = 0 */ + write_csr(dd, DC_LCB_CFG_RUN, 0); + /* set tx fifo reset: LCB_CFG_TX_FIFOS_RESET.VAL = 1 */ + write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, + 1ull << DC_LCB_CFG_TX_FIFOS_RESET_VAL_SHIFT); + /* set dcc reset csr: DCC_CFG_RESET.{reset_lcb,reset_rx_fpe} = 1 */ + dd->lcb_err_en = read_csr(dd, DC_LCB_ERR_EN); + reg = read_csr(dd, DCC_CFG_RESET); + write_csr(dd, DCC_CFG_RESET, reg | + (1ull << DCC_CFG_RESET_RESET_LCB_SHIFT) | + (1ull << DCC_CFG_RESET_RESET_RX_FPE_SHIFT)); + (void)read_csr(dd, DCC_CFG_RESET); /* make sure the write completed */ + if (!abort) { + udelay(1); /* must hold for the longer of 16cclks or 20ns */ + write_csr(dd, DCC_CFG_RESET, reg); + write_csr(dd, DC_LCB_ERR_EN, dd->lcb_err_en); + } +} + +/* + * This routine should be called after the link has been transitioned to + * OFFLINE (OFFLINE state has the side effect of putting the SerDes into + * reset). + * + * The expectation is that the caller of this routine would have taken + * care of properly transitioning the link into the correct state. + */ +static void dc_shutdown(struct hfi1_devdata *dd) +{ + unsigned long flags; + + spin_lock_irqsave(&dd->dc8051_lock, flags); + if (dd->dc_shutdown) { + spin_unlock_irqrestore(&dd->dc8051_lock, flags); + return; + } + dd->dc_shutdown = 1; + spin_unlock_irqrestore(&dd->dc8051_lock, flags); + /* Shutdown the LCB */ + lcb_shutdown(dd, 1); + /* + * Going to OFFLINE would have causes the 8051 to put the + * SerDes into reset already. Just need to shut down the 8051, + * itself. + */ + write_csr(dd, DC_DC8051_CFG_RST, 0x1); +} + +/* + * Calling this after the DC has been brought out of reset should not + * do any damage. + */ +static void dc_start(struct hfi1_devdata *dd) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&dd->dc8051_lock, flags); + if (!dd->dc_shutdown) + goto done; + spin_unlock_irqrestore(&dd->dc8051_lock, flags); + /* Take the 8051 out of reset */ + write_csr(dd, DC_DC8051_CFG_RST, 0ull); + /* Wait until 8051 is ready */ + ret = wait_fm_ready(dd, TIMEOUT_8051_START); + if (ret) { + dd_dev_err(dd, "%s: timeout starting 8051 firmware\n", + __func__); + } + /* Take away reset for LCB and RX FPE (set in lcb_shutdown). */ + write_csr(dd, DCC_CFG_RESET, 0x10); + /* lcb_shutdown() with abort=1 does not restore these */ + write_csr(dd, DC_LCB_ERR_EN, dd->lcb_err_en); + spin_lock_irqsave(&dd->dc8051_lock, flags); + dd->dc_shutdown = 0; +done: + spin_unlock_irqrestore(&dd->dc8051_lock, flags); +} + +/* + * These LCB adjustments are for the Aurora SerDes core in the FPGA. + */ +static void adjust_lcb_for_fpga_serdes(struct hfi1_devdata *dd) +{ + u64 rx_radr, tx_radr; + u32 version; + + if (dd->icode != ICODE_FPGA_EMULATION) + return; + + /* + * These LCB defaults on emulator _s are good, nothing to do here: + * LCB_CFG_TX_FIFOS_RADR + * LCB_CFG_RX_FIFOS_RADR + * LCB_CFG_LN_DCLK + * LCB_CFG_IGNORE_LOST_RCLK + */ + if (is_emulator_s(dd)) + return; + /* else this is _p */ + + version = emulator_rev(dd); + if (!is_ax(dd)) + version = 0x2d; /* all B0 use 0x2d or higher settings */ + + if (version <= 0x12) { + /* release 0x12 and below */ + + /* + * LCB_CFG_RX_FIFOS_RADR.RST_VAL = 0x9 + * LCB_CFG_RX_FIFOS_RADR.OK_TO_JUMP_VAL = 0x9 + * LCB_CFG_RX_FIFOS_RADR.DO_NOT_JUMP_VAL = 0xa + */ + rx_radr = + 0xaull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT + | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT + | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT; + /* + * LCB_CFG_TX_FIFOS_RADR.ON_REINIT = 0 (default) + * LCB_CFG_TX_FIFOS_RADR.RST_VAL = 6 + */ + tx_radr = 6ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT; + } else if (version <= 0x18) { + /* release 0x13 up to 0x18 */ + /* LCB_CFG_RX_FIFOS_RADR = 0x988 */ + rx_radr = + 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT + | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT + | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT; + tx_radr = 7ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT; + } else if (version == 0x19) { + /* release 0x19 */ + /* LCB_CFG_RX_FIFOS_RADR = 0xa99 */ + rx_radr = + 0xAull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT + | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT + | 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT; + tx_radr = 3ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT; + } else if (version == 0x1a) { + /* release 0x1a */ + /* LCB_CFG_RX_FIFOS_RADR = 0x988 */ + rx_radr = + 0x9ull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT + | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT + | 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT; + tx_radr = 7ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT; + write_csr(dd, DC_LCB_CFG_LN_DCLK, 1ull); + } else { + /* release 0x1b and higher */ + /* LCB_CFG_RX_FIFOS_RADR = 0x877 */ + rx_radr = + 0x8ull << DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT + | 0x7ull << DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT + | 0x7ull << DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT; + tx_radr = 3ull << DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT; + } + + write_csr(dd, DC_LCB_CFG_RX_FIFOS_RADR, rx_radr); + /* LCB_CFG_IGNORE_LOST_RCLK.EN = 1 */ + write_csr(dd, DC_LCB_CFG_IGNORE_LOST_RCLK, + DC_LCB_CFG_IGNORE_LOST_RCLK_EN_SMASK); + write_csr(dd, DC_LCB_CFG_TX_FIFOS_RADR, tx_radr); +} + +/* + * Handle a SMA idle message + * + * This is a work-queue function outside of the interrupt. + */ +void handle_sma_message(struct work_struct *work) +{ + struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata, + sma_message_work); + struct hfi1_devdata *dd = ppd->dd; + u64 msg; + int ret; + + /* + * msg is bytes 1-4 of the 40-bit idle message - the command code + * is stripped off + */ + ret = read_idle_sma(dd, &msg); + if (ret) + return; + dd_dev_info(dd, "%s: SMA message 0x%llx\n", __func__, msg); + /* + * React to the SMA message. Byte[1] (0 for us) is the command. + */ + switch (msg & 0xff) { + case SMA_IDLE_ARM: + /* + * See OPAv1 table 9-14 - HFI and External Switch Ports Key + * State Transitions + * + * Only expected in INIT or ARMED, discard otherwise. + */ + if (ppd->host_link_state & (HLS_UP_INIT | HLS_UP_ARMED)) + ppd->neighbor_normal = 1; + break; + case SMA_IDLE_ACTIVE: + /* + * See OPAv1 table 9-14 - HFI and External Switch Ports Key + * State Transitions + * + * Can activate the node. Discard otherwise. + */ + if (ppd->host_link_state == HLS_UP_ARMED && + ppd->is_active_optimize_enabled) { + ppd->neighbor_normal = 1; + ret = set_link_state(ppd, HLS_UP_ACTIVE); + if (ret) + dd_dev_err( + dd, + "%s: received Active SMA idle message, couldn't set link to Active\n", + __func__); + } + break; + default: + dd_dev_err(dd, + "%s: received unexpected SMA idle message 0x%llx\n", + __func__, msg); + break; + } +} + +static void adjust_rcvctrl(struct hfi1_devdata *dd, u64 add, u64 clear) +{ + u64 rcvctrl; + unsigned long flags; + + spin_lock_irqsave(&dd->rcvctrl_lock, flags); + rcvctrl = read_csr(dd, RCV_CTRL); + rcvctrl |= add; + rcvctrl &= ~clear; + write_csr(dd, RCV_CTRL, rcvctrl); + spin_unlock_irqrestore(&dd->rcvctrl_lock, flags); +} + +static inline void add_rcvctrl(struct hfi1_devdata *dd, u64 add) +{ + adjust_rcvctrl(dd, add, 0); +} + +static inline void clear_rcvctrl(struct hfi1_devdata *dd, u64 clear) +{ + adjust_rcvctrl(dd, 0, clear); +} + +/* + * Called from all interrupt handlers to start handling an SPC freeze. + */ +void start_freeze_handling(struct hfi1_pportdata *ppd, int flags) +{ + struct hfi1_devdata *dd = ppd->dd; + struct send_context *sc; + int i; + + if (flags & FREEZE_SELF) + write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_FREEZE_SMASK); + + /* enter frozen mode */ + dd->flags |= HFI1_FROZEN; + + /* notify all SDMA engines that they are going into a freeze */ + sdma_freeze_notify(dd, !!(flags & FREEZE_LINK_DOWN)); + + /* do halt pre-handling on all enabled send contexts */ + for (i = 0; i < dd->num_send_contexts; i++) { + sc = dd->send_contexts[i].sc; + if (sc && (sc->flags & SCF_ENABLED)) + sc_stop(sc, SCF_FROZEN | SCF_HALTED); + } + + /* Send context are frozen. Notify user space */ + hfi1_set_uevent_bits(ppd, _HFI1_EVENT_FROZEN_BIT); + + if (flags & FREEZE_ABORT) { + dd_dev_err(dd, + "Aborted freeze recovery. Please REBOOT system\n"); + return; + } + /* queue non-interrupt handler */ + queue_work(ppd->hfi1_wq, &ppd->freeze_work); +} + +/* + * Wait until all 4 sub-blocks indicate that they have frozen or unfrozen, + * depending on the "freeze" parameter. + * + * No need to return an error if it times out, our only option + * is to proceed anyway. + */ +static void wait_for_freeze_status(struct hfi1_devdata *dd, int freeze) +{ + unsigned long timeout; + u64 reg; + + timeout = jiffies + msecs_to_jiffies(FREEZE_STATUS_TIMEOUT); + while (1) { + reg = read_csr(dd, CCE_STATUS); + if (freeze) { + /* waiting until all indicators are set */ + if ((reg & ALL_FROZE) == ALL_FROZE) + return; /* all done */ + } else { + /* waiting until all indicators are clear */ + if ((reg & ALL_FROZE) == 0) + return; /* all done */ + } + + if (time_after(jiffies, timeout)) { + dd_dev_err(dd, + "Time out waiting for SPC %sfreeze, bits 0x%llx, expecting 0x%llx, continuing", + freeze ? "" : "un", reg & ALL_FROZE, + freeze ? ALL_FROZE : 0ull); + return; + } + usleep_range(80, 120); + } +} + +/* + * Do all freeze handling for the RXE block. + */ +static void rxe_freeze(struct hfi1_devdata *dd) +{ + int i; + + /* disable port */ + clear_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK); + + /* disable all receive contexts */ + for (i = 0; i < dd->num_rcv_contexts; i++) + hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS, i); +} + +/* + * Unfreeze handling for the RXE block - kernel contexts only. + * This will also enable the port. User contexts will do unfreeze + * handling on a per-context basis as they call into the driver. + * + */ +static void rxe_kernel_unfreeze(struct hfi1_devdata *dd) +{ + u32 rcvmask; + int i; + + /* enable all kernel contexts */ + for (i = 0; i < dd->n_krcv_queues; i++) { + rcvmask = HFI1_RCVCTRL_CTXT_ENB; + /* HFI1_RCVCTRL_TAILUPD_[ENB|DIS] needs to be set explicitly */ + rcvmask |= HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, DMA_RTAIL) ? + HFI1_RCVCTRL_TAILUPD_ENB : HFI1_RCVCTRL_TAILUPD_DIS; + hfi1_rcvctrl(dd, rcvmask, i); + } + + /* enable port */ + add_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK); +} + +/* + * Non-interrupt SPC freeze handling. + * + * This is a work-queue function outside of the triggering interrupt. + */ +void handle_freeze(struct work_struct *work) +{ + struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata, + freeze_work); + struct hfi1_devdata *dd = ppd->dd; + + /* wait for freeze indicators on all affected blocks */ + wait_for_freeze_status(dd, 1); + + /* SPC is now frozen */ + + /* do send PIO freeze steps */ + pio_freeze(dd); + + /* do send DMA freeze steps */ + sdma_freeze(dd); + + /* do send egress freeze steps - nothing to do */ + + /* do receive freeze steps */ + rxe_freeze(dd); + + /* + * Unfreeze the hardware - clear the freeze, wait for each + * block's frozen bit to clear, then clear the frozen flag. + */ + write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_UNFREEZE_SMASK); + wait_for_freeze_status(dd, 0); + + if (is_ax(dd)) { + write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_FREEZE_SMASK); + wait_for_freeze_status(dd, 1); + write_csr(dd, CCE_CTRL, CCE_CTRL_SPC_UNFREEZE_SMASK); + wait_for_freeze_status(dd, 0); + } + + /* do send PIO unfreeze steps for kernel contexts */ + pio_kernel_unfreeze(dd); + + /* do send DMA unfreeze steps */ + sdma_unfreeze(dd); + + /* do send egress unfreeze steps - nothing to do */ + + /* do receive unfreeze steps for kernel contexts */ + rxe_kernel_unfreeze(dd); + + /* + * The unfreeze procedure touches global device registers when + * it disables and re-enables RXE. Mark the device unfrozen + * after all that is done so other parts of the driver waiting + * for the device to unfreeze don't do things out of order. + * + * The above implies that the meaning of HFI1_FROZEN flag is + * "Device has gone into freeze mode and freeze mode handling + * is still in progress." + * + * The flag will be removed when freeze mode processing has + * completed. + */ + dd->flags &= ~HFI1_FROZEN; + wake_up(&dd->event_queue); + + /* no longer frozen */ +} + +/* + * Handle a link up interrupt from the 8051. + * + * This is a work-queue function outside of the interrupt. + */ +void handle_link_up(struct work_struct *work) +{ + struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata, + link_up_work); + set_link_state(ppd, HLS_UP_INIT); + + /* cache the read of DC_LCB_STS_ROUND_TRIP_LTP_CNT */ + read_ltp_rtt(ppd->dd); + /* + * OPA specifies that certain counters are cleared on a transition + * to link up, so do that. + */ + clear_linkup_counters(ppd->dd); + /* + * And (re)set link up default values. + */ + set_linkup_defaults(ppd); + + /* enforce link speed enabled */ + if ((ppd->link_speed_active & ppd->link_speed_enabled) == 0) { + /* oops - current speed is not enabled, bounce */ + dd_dev_err(ppd->dd, + "Link speed active 0x%x is outside enabled 0x%x, downing link\n", + ppd->link_speed_active, ppd->link_speed_enabled); + set_link_down_reason(ppd, OPA_LINKDOWN_REASON_SPEED_POLICY, 0, + OPA_LINKDOWN_REASON_SPEED_POLICY); + set_link_state(ppd, HLS_DN_OFFLINE); + tune_serdes(ppd); + start_link(ppd); + } +} + +/* + * Several pieces of LNI information were cached for SMA in ppd. + * Reset these on link down + */ +static void reset_neighbor_info(struct hfi1_pportdata *ppd) +{ + ppd->neighbor_guid = 0; + ppd->neighbor_port_number = 0; + ppd->neighbor_type = 0; + ppd->neighbor_fm_security = 0; +} + +static const char * const link_down_reason_strs[] = { + [OPA_LINKDOWN_REASON_NONE] = "None", + [OPA_LINKDOWN_REASON_RCV_ERROR_0] = "Recive error 0", + [OPA_LINKDOWN_REASON_BAD_PKT_LEN] = "Bad packet length", + [OPA_LINKDOWN_REASON_PKT_TOO_LONG] = "Packet too long", + [OPA_LINKDOWN_REASON_PKT_TOO_SHORT] = "Packet too short", + [OPA_LINKDOWN_REASON_BAD_SLID] = "Bad SLID", + [OPA_LINKDOWN_REASON_BAD_DLID] = "Bad DLID", + [OPA_LINKDOWN_REASON_BAD_L2] = "Bad L2", + [OPA_LINKDOWN_REASON_BAD_SC] = "Bad SC", + [OPA_LINKDOWN_REASON_RCV_ERROR_8] = "Receive error 8", + [OPA_LINKDOWN_REASON_BAD_MID_TAIL] = "Bad mid tail", + [OPA_LINKDOWN_REASON_RCV_ERROR_10] = "Receive error 10", + [OPA_LINKDOWN_REASON_PREEMPT_ERROR] = "Preempt error", + [OPA_LINKDOWN_REASON_PREEMPT_VL15] = "Preempt vl15", + [OPA_LINKDOWN_REASON_BAD_VL_MARKER] = "Bad VL marker", + [OPA_LINKDOWN_REASON_RCV_ERROR_14] = "Receive error 14", + [OPA_LINKDOWN_REASON_RCV_ERROR_15] = "Receive error 15", + [OPA_LINKDOWN_REASON_BAD_HEAD_DIST] = "Bad head distance", + [OPA_LINKDOWN_REASON_BAD_TAIL_DIST] = "Bad tail distance", + [OPA_LINKDOWN_REASON_BAD_CTRL_DIST] = "Bad control distance", + [OPA_LINKDOWN_REASON_BAD_CREDIT_ACK] = "Bad credit ack", + [OPA_LINKDOWN_REASON_UNSUPPORTED_VL_MARKER] = "Unsupported VL marker", + [OPA_LINKDOWN_REASON_BAD_PREEMPT] = "Bad preempt", + [OPA_LINKDOWN_REASON_BAD_CONTROL_FLIT] = "Bad control flit", + [OPA_LINKDOWN_REASON_EXCEED_MULTICAST_LIMIT] = "Exceed multicast limit", + [OPA_LINKDOWN_REASON_RCV_ERROR_24] = "Receive error 24", + [OPA_LINKDOWN_REASON_RCV_ERROR_25] = "Receive error 25", + [OPA_LINKDOWN_REASON_RCV_ERROR_26] = "Receive error 26", + [OPA_LINKDOWN_REASON_RCV_ERROR_27] = "Receive error 27", + [OPA_LINKDOWN_REASON_RCV_ERROR_28] = "Receive error 28", + [OPA_LINKDOWN_REASON_RCV_ERROR_29] = "Receive error 29", + [OPA_LINKDOWN_REASON_RCV_ERROR_30] = "Receive error 30", + [OPA_LINKDOWN_REASON_EXCESSIVE_BUFFER_OVERRUN] = + "Excessive buffer overrun", + [OPA_LINKDOWN_REASON_UNKNOWN] = "Unknown", + [OPA_LINKDOWN_REASON_REBOOT] = "Reboot", + [OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN] = "Neighbor unknown", + [OPA_LINKDOWN_REASON_FM_BOUNCE] = "FM bounce", + [OPA_LINKDOWN_REASON_SPEED_POLICY] = "Speed policy", + [OPA_LINKDOWN_REASON_WIDTH_POLICY] = "Width policy", + [OPA_LINKDOWN_REASON_DISCONNECTED] = "Disconnected", + [OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED] = + "Local media not installed", + [OPA_LINKDOWN_REASON_NOT_INSTALLED] = "Not installed", + [OPA_LINKDOWN_REASON_CHASSIS_CONFIG] = "Chassis config", + [OPA_LINKDOWN_REASON_END_TO_END_NOT_INSTALLED] = + "End to end not installed", + [OPA_LINKDOWN_REASON_POWER_POLICY] = "Power policy", + [OPA_LINKDOWN_REASON_LINKSPEED_POLICY] = "Link speed policy", + [OPA_LINKDOWN_REASON_LINKWIDTH_POLICY] = "Link width policy", + [OPA_LINKDOWN_REASON_SWITCH_MGMT] = "Switch management", + [OPA_LINKDOWN_REASON_SMA_DISABLED] = "SMA disabled", + [OPA_LINKDOWN_REASON_TRANSIENT] = "Transient" +}; + +/* return the neighbor link down reason string */ +static const char *link_down_reason_str(u8 reason) +{ + const char *str = NULL; + + if (reason < ARRAY_SIZE(link_down_reason_strs)) + str = link_down_reason_strs[reason]; + if (!str) + str = "(invalid)"; + + return str; +} + +/* + * Handle a link down interrupt from the 8051. + * + * This is a work-queue function outside of the interrupt. + */ +void handle_link_down(struct work_struct *work) +{ + u8 lcl_reason, neigh_reason = 0; + u8 link_down_reason; + struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata, + link_down_work); + int was_up; + static const char ldr_str[] = "Link down reason: "; + + if ((ppd->host_link_state & + (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) && + ppd->port_type == PORT_TYPE_FIXED) + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NOT_INSTALLED); + + /* Go offline first, then deal with reading/writing through 8051 */ + was_up = !!(ppd->host_link_state & HLS_UP); + set_link_state(ppd, HLS_DN_OFFLINE); + + if (was_up) { + lcl_reason = 0; + /* link down reason is only valid if the link was up */ + read_link_down_reason(ppd->dd, &link_down_reason); + switch (link_down_reason) { + case LDR_LINK_TRANSFER_ACTIVE_LOW: + /* the link went down, no idle message reason */ + dd_dev_info(ppd->dd, "%sUnexpected link down\n", + ldr_str); + break; + case LDR_RECEIVED_LINKDOWN_IDLE_MSG: + /* + * The neighbor reason is only valid if an idle message + * was received for it. + */ + read_planned_down_reason_code(ppd->dd, &neigh_reason); + dd_dev_info(ppd->dd, + "%sNeighbor link down message %d, %s\n", + ldr_str, neigh_reason, + link_down_reason_str(neigh_reason)); + break; + case LDR_RECEIVED_HOST_OFFLINE_REQ: + dd_dev_info(ppd->dd, + "%sHost requested link to go offline\n", + ldr_str); + break; + default: + dd_dev_info(ppd->dd, "%sUnknown reason 0x%x\n", + ldr_str, link_down_reason); + break; + } + + /* + * If no reason, assume peer-initiated but missed + * LinkGoingDown idle flits. + */ + if (neigh_reason == 0) + lcl_reason = OPA_LINKDOWN_REASON_NEIGHBOR_UNKNOWN; + } else { + /* went down while polling or going up */ + lcl_reason = OPA_LINKDOWN_REASON_TRANSIENT; + } + + set_link_down_reason(ppd, lcl_reason, neigh_reason, 0); + + /* inform the SMA when the link transitions from up to down */ + if (was_up && ppd->local_link_down_reason.sma == 0 && + ppd->neigh_link_down_reason.sma == 0) { + ppd->local_link_down_reason.sma = + ppd->local_link_down_reason.latest; + ppd->neigh_link_down_reason.sma = + ppd->neigh_link_down_reason.latest; + } + + reset_neighbor_info(ppd); + + /* disable the port */ + clear_rcvctrl(ppd->dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK); + + /* + * If there is no cable attached, turn the DC off. Otherwise, + * start the link bring up. + */ + if (ppd->port_type == PORT_TYPE_QSFP && !qsfp_mod_present(ppd)) { + dc_shutdown(ppd->dd); + } else { + tune_serdes(ppd); + start_link(ppd); + } +} + +void handle_link_bounce(struct work_struct *work) +{ + struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata, + link_bounce_work); + + /* + * Only do something if the link is currently up. + */ + if (ppd->host_link_state & HLS_UP) { + set_link_state(ppd, HLS_DN_OFFLINE); + tune_serdes(ppd); + start_link(ppd); + } else { + dd_dev_info(ppd->dd, "%s: link not up (%s), nothing to do\n", + __func__, link_state_name(ppd->host_link_state)); + } +} + +/* + * Mask conversion: Capability exchange to Port LTP. The capability + * exchange has an implicit 16b CRC that is mandatory. + */ +static int cap_to_port_ltp(int cap) +{ + int port_ltp = PORT_LTP_CRC_MODE_16; /* this mode is mandatory */ + + if (cap & CAP_CRC_14B) + port_ltp |= PORT_LTP_CRC_MODE_14; + if (cap & CAP_CRC_48B) + port_ltp |= PORT_LTP_CRC_MODE_48; + if (cap & CAP_CRC_12B_16B_PER_LANE) + port_ltp |= PORT_LTP_CRC_MODE_PER_LANE; + + return port_ltp; +} + +/* + * Convert an OPA Port LTP mask to capability mask + */ +int port_ltp_to_cap(int port_ltp) +{ + int cap_mask = 0; + + if (port_ltp & PORT_LTP_CRC_MODE_14) + cap_mask |= CAP_CRC_14B; + if (port_ltp & PORT_LTP_CRC_MODE_48) + cap_mask |= CAP_CRC_48B; + if (port_ltp & PORT_LTP_CRC_MODE_PER_LANE) + cap_mask |= CAP_CRC_12B_16B_PER_LANE; + + return cap_mask; +} + +/* + * Convert a single DC LCB CRC mode to an OPA Port LTP mask. + */ +static int lcb_to_port_ltp(int lcb_crc) +{ + int port_ltp = 0; + + if (lcb_crc == LCB_CRC_12B_16B_PER_LANE) + port_ltp = PORT_LTP_CRC_MODE_PER_LANE; + else if (lcb_crc == LCB_CRC_48B) + port_ltp = PORT_LTP_CRC_MODE_48; + else if (lcb_crc == LCB_CRC_14B) + port_ltp = PORT_LTP_CRC_MODE_14; + else + port_ltp = PORT_LTP_CRC_MODE_16; + + return port_ltp; +} + +/* + * Our neighbor has indicated that we are allowed to act as a fabric + * manager, so place the full management partition key in the second + * (0-based) pkey array position (see OPAv1, section 20.2.2.6.8). Note + * that we should already have the limited management partition key in + * array element 1, and also that the port is not yet up when + * add_full_mgmt_pkey() is invoked. + */ +static void add_full_mgmt_pkey(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + + /* Sanity check - ppd->pkeys[2] should be 0, or already initalized */ + if (!((ppd->pkeys[2] == 0) || (ppd->pkeys[2] == FULL_MGMT_P_KEY))) + dd_dev_warn(dd, "%s pkey[2] already set to 0x%x, resetting it to 0x%x\n", + __func__, ppd->pkeys[2], FULL_MGMT_P_KEY); + ppd->pkeys[2] = FULL_MGMT_P_KEY; + (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_PKEYS, 0); + hfi1_event_pkey_change(ppd->dd, ppd->port); +} + +static void clear_full_mgmt_pkey(struct hfi1_pportdata *ppd) +{ + if (ppd->pkeys[2] != 0) { + ppd->pkeys[2] = 0; + (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_PKEYS, 0); + hfi1_event_pkey_change(ppd->dd, ppd->port); + } +} + +/* + * Convert the given link width to the OPA link width bitmask. + */ +static u16 link_width_to_bits(struct hfi1_devdata *dd, u16 width) +{ + switch (width) { + case 0: + /* + * Simulator and quick linkup do not set the width. + * Just set it to 4x without complaint. + */ + if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR || quick_linkup) + return OPA_LINK_WIDTH_4X; + return 0; /* no lanes up */ + case 1: return OPA_LINK_WIDTH_1X; + case 2: return OPA_LINK_WIDTH_2X; + case 3: return OPA_LINK_WIDTH_3X; + default: + dd_dev_info(dd, "%s: invalid width %d, using 4\n", + __func__, width); + /* fall through */ + case 4: return OPA_LINK_WIDTH_4X; + } +} + +/* + * Do a population count on the bottom nibble. + */ +static const u8 bit_counts[16] = { + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 +}; + +static inline u8 nibble_to_count(u8 nibble) +{ + return bit_counts[nibble & 0xf]; +} + +/* + * Read the active lane information from the 8051 registers and return + * their widths. + * + * Active lane information is found in these 8051 registers: + * enable_lane_tx + * enable_lane_rx + */ +static void get_link_widths(struct hfi1_devdata *dd, u16 *tx_width, + u16 *rx_width) +{ + u16 tx, rx; + u8 enable_lane_rx; + u8 enable_lane_tx; + u8 tx_polarity_inversion; + u8 rx_polarity_inversion; + u8 max_rate; + + /* read the active lanes */ + read_tx_settings(dd, &enable_lane_tx, &tx_polarity_inversion, + &rx_polarity_inversion, &max_rate); + read_local_lni(dd, &enable_lane_rx); + + /* convert to counts */ + tx = nibble_to_count(enable_lane_tx); + rx = nibble_to_count(enable_lane_rx); + + /* + * Set link_speed_active here, overriding what was set in + * handle_verify_cap(). The ASIC 8051 firmware does not correctly + * set the max_rate field in handle_verify_cap until v0.19. + */ + if ((dd->icode == ICODE_RTL_SILICON) && + (dd->dc8051_ver < dc8051_ver(0, 19))) { + /* max_rate: 0 = 12.5G, 1 = 25G */ + switch (max_rate) { + case 0: + dd->pport[0].link_speed_active = OPA_LINK_SPEED_12_5G; + break; + default: + dd_dev_err(dd, + "%s: unexpected max rate %d, using 25Gb\n", + __func__, (int)max_rate); + /* fall through */ + case 1: + dd->pport[0].link_speed_active = OPA_LINK_SPEED_25G; + break; + } + } + + dd_dev_info(dd, + "Fabric active lanes (width): tx 0x%x (%d), rx 0x%x (%d)\n", + enable_lane_tx, tx, enable_lane_rx, rx); + *tx_width = link_width_to_bits(dd, tx); + *rx_width = link_width_to_bits(dd, rx); +} + +/* + * Read verify_cap_local_fm_link_width[1] to obtain the link widths. + * Valid after the end of VerifyCap and during LinkUp. Does not change + * after link up. I.e. look elsewhere for downgrade information. + * + * Bits are: + * + bits [7:4] contain the number of active transmitters + * + bits [3:0] contain the number of active receivers + * These are numbers 1 through 4 and can be different values if the + * link is asymmetric. + * + * verify_cap_local_fm_link_width[0] retains its original value. + */ +static void get_linkup_widths(struct hfi1_devdata *dd, u16 *tx_width, + u16 *rx_width) +{ + u16 widths, tx, rx; + u8 misc_bits, local_flags; + u16 active_tx, active_rx; + + read_vc_local_link_width(dd, &misc_bits, &local_flags, &widths); + tx = widths >> 12; + rx = (widths >> 8) & 0xf; + + *tx_width = link_width_to_bits(dd, tx); + *rx_width = link_width_to_bits(dd, rx); + + /* print the active widths */ + get_link_widths(dd, &active_tx, &active_rx); +} + +/* + * Set ppd->link_width_active and ppd->link_width_downgrade_active using + * hardware information when the link first comes up. + * + * The link width is not available until after VerifyCap.AllFramesReceived + * (the trigger for handle_verify_cap), so this is outside that routine + * and should be called when the 8051 signals linkup. + */ +void get_linkup_link_widths(struct hfi1_pportdata *ppd) +{ + u16 tx_width, rx_width; + + /* get end-of-LNI link widths */ + get_linkup_widths(ppd->dd, &tx_width, &rx_width); + + /* use tx_width as the link is supposed to be symmetric on link up */ + ppd->link_width_active = tx_width; + /* link width downgrade active (LWD.A) starts out matching LW.A */ + ppd->link_width_downgrade_tx_active = ppd->link_width_active; + ppd->link_width_downgrade_rx_active = ppd->link_width_active; + /* per OPA spec, on link up LWD.E resets to LWD.S */ + ppd->link_width_downgrade_enabled = ppd->link_width_downgrade_supported; + /* cache the active egress rate (units {10^6 bits/sec]) */ + ppd->current_egress_rate = active_egress_rate(ppd); +} + +/* + * Handle a verify capabilities interrupt from the 8051. + * + * This is a work-queue function outside of the interrupt. + */ +void handle_verify_cap(struct work_struct *work) +{ + struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata, + link_vc_work); + struct hfi1_devdata *dd = ppd->dd; + u64 reg; + u8 power_management; + u8 continious; + u8 vcu; + u8 vau; + u8 z; + u16 vl15buf; + u16 link_widths; + u16 crc_mask; + u16 crc_val; + u16 device_id; + u16 active_tx, active_rx; + u8 partner_supported_crc; + u8 remote_tx_rate; + u8 device_rev; + + set_link_state(ppd, HLS_VERIFY_CAP); + + lcb_shutdown(dd, 0); + adjust_lcb_for_fpga_serdes(dd); + + /* + * These are now valid: + * remote VerifyCap fields in the general LNI config + * CSR DC8051_STS_REMOTE_GUID + * CSR DC8051_STS_REMOTE_NODE_TYPE + * CSR DC8051_STS_REMOTE_FM_SECURITY + * CSR DC8051_STS_REMOTE_PORT_NO + */ + + read_vc_remote_phy(dd, &power_management, &continious); + read_vc_remote_fabric(dd, &vau, &z, &vcu, &vl15buf, + &partner_supported_crc); + read_vc_remote_link_width(dd, &remote_tx_rate, &link_widths); + read_remote_device_id(dd, &device_id, &device_rev); + /* + * And the 'MgmtAllowed' information, which is exchanged during + * LNI, is also be available at this point. + */ + read_mgmt_allowed(dd, &ppd->mgmt_allowed); + /* print the active widths */ + get_link_widths(dd, &active_tx, &active_rx); + dd_dev_info(dd, + "Peer PHY: power management 0x%x, continuous updates 0x%x\n", + (int)power_management, (int)continious); + dd_dev_info(dd, + "Peer Fabric: vAU %d, Z %d, vCU %d, vl15 credits 0x%x, CRC sizes 0x%x\n", + (int)vau, (int)z, (int)vcu, (int)vl15buf, + (int)partner_supported_crc); + dd_dev_info(dd, "Peer Link Width: tx rate 0x%x, widths 0x%x\n", + (u32)remote_tx_rate, (u32)link_widths); + dd_dev_info(dd, "Peer Device ID: 0x%04x, Revision 0x%02x\n", + (u32)device_id, (u32)device_rev); + /* + * The peer vAU value just read is the peer receiver value. HFI does + * not support a transmit vAU of 0 (AU == 8). We advertised that + * with Z=1 in the fabric capabilities sent to the peer. The peer + * will see our Z=1, and, if it advertised a vAU of 0, will move its + * receive to vAU of 1 (AU == 16). Do the same here. We do not care + * about the peer Z value - our sent vAU is 3 (hardwired) and is not + * subject to the Z value exception. + */ + if (vau == 0) + vau = 1; + set_up_vl15(dd, vau, vl15buf); + + /* set up the LCB CRC mode */ + crc_mask = ppd->port_crc_mode_enabled & partner_supported_crc; + + /* order is important: use the lowest bit in common */ + if (crc_mask & CAP_CRC_14B) + crc_val = LCB_CRC_14B; + else if (crc_mask & CAP_CRC_48B) + crc_val = LCB_CRC_48B; + else if (crc_mask & CAP_CRC_12B_16B_PER_LANE) + crc_val = LCB_CRC_12B_16B_PER_LANE; + else + crc_val = LCB_CRC_16B; + + dd_dev_info(dd, "Final LCB CRC mode: %d\n", (int)crc_val); + write_csr(dd, DC_LCB_CFG_CRC_MODE, + (u64)crc_val << DC_LCB_CFG_CRC_MODE_TX_VAL_SHIFT); + + /* set (14b only) or clear sideband credit */ + reg = read_csr(dd, SEND_CM_CTRL); + if (crc_val == LCB_CRC_14B && crc_14b_sideband) { + write_csr(dd, SEND_CM_CTRL, + reg | SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK); + } else { + write_csr(dd, SEND_CM_CTRL, + reg & ~SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK); + } + + ppd->link_speed_active = 0; /* invalid value */ + if (dd->dc8051_ver < dc8051_ver(0, 20)) { + /* remote_tx_rate: 0 = 12.5G, 1 = 25G */ + switch (remote_tx_rate) { + case 0: + ppd->link_speed_active = OPA_LINK_SPEED_12_5G; + break; + case 1: + ppd->link_speed_active = OPA_LINK_SPEED_25G; + break; + } + } else { + /* actual rate is highest bit of the ANDed rates */ + u8 rate = remote_tx_rate & ppd->local_tx_rate; + + if (rate & 2) + ppd->link_speed_active = OPA_LINK_SPEED_25G; + else if (rate & 1) + ppd->link_speed_active = OPA_LINK_SPEED_12_5G; + } + if (ppd->link_speed_active == 0) { + dd_dev_err(dd, "%s: unexpected remote tx rate %d, using 25Gb\n", + __func__, (int)remote_tx_rate); + ppd->link_speed_active = OPA_LINK_SPEED_25G; + } + + /* + * Cache the values of the supported, enabled, and active + * LTP CRC modes to return in 'portinfo' queries. But the bit + * flags that are returned in the portinfo query differ from + * what's in the link_crc_mask, crc_sizes, and crc_val + * variables. Convert these here. + */ + ppd->port_ltp_crc_mode = cap_to_port_ltp(link_crc_mask) << 8; + /* supported crc modes */ + ppd->port_ltp_crc_mode |= + cap_to_port_ltp(ppd->port_crc_mode_enabled) << 4; + /* enabled crc modes */ + ppd->port_ltp_crc_mode |= lcb_to_port_ltp(crc_val); + /* active crc mode */ + + /* set up the remote credit return table */ + assign_remote_cm_au_table(dd, vcu); + + /* + * The LCB is reset on entry to handle_verify_cap(), so this must + * be applied on every link up. + * + * Adjust LCB error kill enable to kill the link if + * these RBUF errors are seen: + * REPLAY_BUF_MBE_SMASK + * FLIT_INPUT_BUF_MBE_SMASK + */ + if (is_ax(dd)) { /* fixed in B0 */ + reg = read_csr(dd, DC_LCB_CFG_LINK_KILL_EN); + reg |= DC_LCB_CFG_LINK_KILL_EN_REPLAY_BUF_MBE_SMASK + | DC_LCB_CFG_LINK_KILL_EN_FLIT_INPUT_BUF_MBE_SMASK; + write_csr(dd, DC_LCB_CFG_LINK_KILL_EN, reg); + } + + /* pull LCB fifos out of reset - all fifo clocks must be stable */ + write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 0); + + /* give 8051 access to the LCB CSRs */ + write_csr(dd, DC_LCB_ERR_EN, 0); /* mask LCB errors */ + set_8051_lcb_access(dd); + + ppd->neighbor_guid = + read_csr(dd, DC_DC8051_STS_REMOTE_GUID); + ppd->neighbor_port_number = read_csr(dd, DC_DC8051_STS_REMOTE_PORT_NO) & + DC_DC8051_STS_REMOTE_PORT_NO_VAL_SMASK; + ppd->neighbor_type = + read_csr(dd, DC_DC8051_STS_REMOTE_NODE_TYPE) & + DC_DC8051_STS_REMOTE_NODE_TYPE_VAL_MASK; + ppd->neighbor_fm_security = + read_csr(dd, DC_DC8051_STS_REMOTE_FM_SECURITY) & + DC_DC8051_STS_LOCAL_FM_SECURITY_DISABLED_MASK; + dd_dev_info(dd, + "Neighbor Guid: %llx Neighbor type %d MgmtAllowed %d FM security bypass %d\n", + ppd->neighbor_guid, ppd->neighbor_type, + ppd->mgmt_allowed, ppd->neighbor_fm_security); + if (ppd->mgmt_allowed) + add_full_mgmt_pkey(ppd); + + /* tell the 8051 to go to LinkUp */ + set_link_state(ppd, HLS_GOING_UP); +} + +/* + * Apply the link width downgrade enabled policy against the current active + * link widths. + * + * Called when the enabled policy changes or the active link widths change. + */ +void apply_link_downgrade_policy(struct hfi1_pportdata *ppd, int refresh_widths) +{ + int do_bounce = 0; + int tries; + u16 lwde; + u16 tx, rx; + + /* use the hls lock to avoid a race with actual link up */ + tries = 0; +retry: + mutex_lock(&ppd->hls_lock); + /* only apply if the link is up */ + if (ppd->host_link_state & HLS_DOWN) { + /* still going up..wait and retry */ + if (ppd->host_link_state & HLS_GOING_UP) { + if (++tries < 1000) { + mutex_unlock(&ppd->hls_lock); + usleep_range(100, 120); /* arbitrary */ + goto retry; + } + dd_dev_err(ppd->dd, + "%s: giving up waiting for link state change\n", + __func__); + } + goto done; + } + + lwde = ppd->link_width_downgrade_enabled; + + if (refresh_widths) { + get_link_widths(ppd->dd, &tx, &rx); + ppd->link_width_downgrade_tx_active = tx; + ppd->link_width_downgrade_rx_active = rx; + } + + if (ppd->link_width_downgrade_tx_active == 0 || + ppd->link_width_downgrade_rx_active == 0) { + /* the 8051 reported a dead link as a downgrade */ + dd_dev_err(ppd->dd, "Link downgrade is really a link down, ignoring\n"); + } else if (lwde == 0) { + /* downgrade is disabled */ + + /* bounce if not at starting active width */ + if ((ppd->link_width_active != + ppd->link_width_downgrade_tx_active) || + (ppd->link_width_active != + ppd->link_width_downgrade_rx_active)) { + dd_dev_err(ppd->dd, + "Link downgrade is disabled and link has downgraded, downing link\n"); + dd_dev_err(ppd->dd, + " original 0x%x, tx active 0x%x, rx active 0x%x\n", + ppd->link_width_active, + ppd->link_width_downgrade_tx_active, + ppd->link_width_downgrade_rx_active); + do_bounce = 1; + } + } else if ((lwde & ppd->link_width_downgrade_tx_active) == 0 || + (lwde & ppd->link_width_downgrade_rx_active) == 0) { + /* Tx or Rx is outside the enabled policy */ + dd_dev_err(ppd->dd, + "Link is outside of downgrade allowed, downing link\n"); + dd_dev_err(ppd->dd, + " enabled 0x%x, tx active 0x%x, rx active 0x%x\n", + lwde, ppd->link_width_downgrade_tx_active, + ppd->link_width_downgrade_rx_active); + do_bounce = 1; + } + +done: + mutex_unlock(&ppd->hls_lock); + + if (do_bounce) { + set_link_down_reason(ppd, OPA_LINKDOWN_REASON_WIDTH_POLICY, 0, + OPA_LINKDOWN_REASON_WIDTH_POLICY); + set_link_state(ppd, HLS_DN_OFFLINE); + tune_serdes(ppd); + start_link(ppd); + } +} + +/* + * Handle a link downgrade interrupt from the 8051. + * + * This is a work-queue function outside of the interrupt. + */ +void handle_link_downgrade(struct work_struct *work) +{ + struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata, + link_downgrade_work); + + dd_dev_info(ppd->dd, "8051: Link width downgrade\n"); + apply_link_downgrade_policy(ppd, 1); +} + +static char *dcc_err_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, dcc_err_flags, + ARRAY_SIZE(dcc_err_flags)); +} + +static char *lcb_err_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, lcb_err_flags, + ARRAY_SIZE(lcb_err_flags)); +} + +static char *dc8051_err_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, dc8051_err_flags, + ARRAY_SIZE(dc8051_err_flags)); +} + +static char *dc8051_info_err_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, dc8051_info_err_flags, + ARRAY_SIZE(dc8051_info_err_flags)); +} + +static char *dc8051_info_host_msg_string(char *buf, int buf_len, u64 flags) +{ + return flag_string(buf, buf_len, flags, dc8051_info_host_msg_flags, + ARRAY_SIZE(dc8051_info_host_msg_flags)); +} + +static void handle_8051_interrupt(struct hfi1_devdata *dd, u32 unused, u64 reg) +{ + struct hfi1_pportdata *ppd = dd->pport; + u64 info, err, host_msg; + int queue_link_down = 0; + char buf[96]; + + /* look at the flags */ + if (reg & DC_DC8051_ERR_FLG_SET_BY_8051_SMASK) { + /* 8051 information set by firmware */ + /* read DC8051_DBG_ERR_INFO_SET_BY_8051 for details */ + info = read_csr(dd, DC_DC8051_DBG_ERR_INFO_SET_BY_8051); + err = (info >> DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_SHIFT) + & DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_MASK; + host_msg = (info >> + DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_SHIFT) + & DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_MASK; + + /* + * Handle error flags. + */ + if (err & FAILED_LNI) { + /* + * LNI error indications are cleared by the 8051 + * only when starting polling. Only pay attention + * to them when in the states that occur during + * LNI. + */ + if (ppd->host_link_state + & (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) { + queue_link_down = 1; + dd_dev_info(dd, "Link error: %s\n", + dc8051_info_err_string(buf, + sizeof(buf), + err & + FAILED_LNI)); + } + err &= ~(u64)FAILED_LNI; + } + /* unknown frames can happen durning LNI, just count */ + if (err & UNKNOWN_FRAME) { + ppd->unknown_frame_count++; + err &= ~(u64)UNKNOWN_FRAME; + } + if (err) { + /* report remaining errors, but do not do anything */ + dd_dev_err(dd, "8051 info error: %s\n", + dc8051_info_err_string(buf, sizeof(buf), + err)); + } + + /* + * Handle host message flags. + */ + if (host_msg & HOST_REQ_DONE) { + /* + * Presently, the driver does a busy wait for + * host requests to complete. This is only an + * informational message. + * NOTE: The 8051 clears the host message + * information *on the next 8051 command*. + * Therefore, when linkup is achieved, + * this flag will still be set. + */ + host_msg &= ~(u64)HOST_REQ_DONE; + } + if (host_msg & BC_SMA_MSG) { + queue_work(ppd->hfi1_wq, &ppd->sma_message_work); + host_msg &= ~(u64)BC_SMA_MSG; + } + if (host_msg & LINKUP_ACHIEVED) { + dd_dev_info(dd, "8051: Link up\n"); + queue_work(ppd->hfi1_wq, &ppd->link_up_work); + host_msg &= ~(u64)LINKUP_ACHIEVED; + } + if (host_msg & EXT_DEVICE_CFG_REQ) { + handle_8051_request(ppd); + host_msg &= ~(u64)EXT_DEVICE_CFG_REQ; + } + if (host_msg & VERIFY_CAP_FRAME) { + queue_work(ppd->hfi1_wq, &ppd->link_vc_work); + host_msg &= ~(u64)VERIFY_CAP_FRAME; + } + if (host_msg & LINK_GOING_DOWN) { + const char *extra = ""; + /* no downgrade action needed if going down */ + if (host_msg & LINK_WIDTH_DOWNGRADED) { + host_msg &= ~(u64)LINK_WIDTH_DOWNGRADED; + extra = " (ignoring downgrade)"; + } + dd_dev_info(dd, "8051: Link down%s\n", extra); + queue_link_down = 1; + host_msg &= ~(u64)LINK_GOING_DOWN; + } + if (host_msg & LINK_WIDTH_DOWNGRADED) { + queue_work(ppd->hfi1_wq, &ppd->link_downgrade_work); + host_msg &= ~(u64)LINK_WIDTH_DOWNGRADED; + } + if (host_msg) { + /* report remaining messages, but do not do anything */ + dd_dev_info(dd, "8051 info host message: %s\n", + dc8051_info_host_msg_string(buf, + sizeof(buf), + host_msg)); + } + + reg &= ~DC_DC8051_ERR_FLG_SET_BY_8051_SMASK; + } + if (reg & DC_DC8051_ERR_FLG_LOST_8051_HEART_BEAT_SMASK) { + /* + * Lost the 8051 heartbeat. If this happens, we + * receive constant interrupts about it. Disable + * the interrupt after the first. + */ + dd_dev_err(dd, "Lost 8051 heartbeat\n"); + write_csr(dd, DC_DC8051_ERR_EN, + read_csr(dd, DC_DC8051_ERR_EN) & + ~DC_DC8051_ERR_EN_LOST_8051_HEART_BEAT_SMASK); + + reg &= ~DC_DC8051_ERR_FLG_LOST_8051_HEART_BEAT_SMASK; + } + if (reg) { + /* report the error, but do not do anything */ + dd_dev_err(dd, "8051 error: %s\n", + dc8051_err_string(buf, sizeof(buf), reg)); + } + + if (queue_link_down) { + /* + * if the link is already going down or disabled, do not + * queue another + */ + if ((ppd->host_link_state & + (HLS_GOING_OFFLINE | HLS_LINK_COOLDOWN)) || + ppd->link_enabled == 0) { + dd_dev_info(dd, "%s: not queuing link down\n", + __func__); + } else { + queue_work(ppd->hfi1_wq, &ppd->link_down_work); + } + } +} + +static const char * const fm_config_txt[] = { +[0] = + "BadHeadDist: Distance violation between two head flits", +[1] = + "BadTailDist: Distance violation between two tail flits", +[2] = + "BadCtrlDist: Distance violation between two credit control flits", +[3] = + "BadCrdAck: Credits return for unsupported VL", +[4] = + "UnsupportedVLMarker: Received VL Marker", +[5] = + "BadPreempt: Exceeded the preemption nesting level", +[6] = + "BadControlFlit: Received unsupported control flit", +/* no 7 */ +[8] = + "UnsupportedVLMarker: Received VL Marker for unconfigured or disabled VL", +}; + +static const char * const port_rcv_txt[] = { +[1] = + "BadPktLen: Illegal PktLen", +[2] = + "PktLenTooLong: Packet longer than PktLen", +[3] = + "PktLenTooShort: Packet shorter than PktLen", +[4] = + "BadSLID: Illegal SLID (0, using multicast as SLID, does not include security validation of SLID)", +[5] = + "BadDLID: Illegal DLID (0, doesn't match HFI)", +[6] = + "BadL2: Illegal L2 opcode", +[7] = + "BadSC: Unsupported SC", +[9] = + "BadRC: Illegal RC", +[11] = + "PreemptError: Preempting with same VL", +[12] = + "PreemptVL15: Preempting a VL15 packet", +}; + +#define OPA_LDR_FMCONFIG_OFFSET 16 +#define OPA_LDR_PORTRCV_OFFSET 0 +static void handle_dcc_err(struct hfi1_devdata *dd, u32 unused, u64 reg) +{ + u64 info, hdr0, hdr1; + const char *extra; + char buf[96]; + struct hfi1_pportdata *ppd = dd->pport; + u8 lcl_reason = 0; + int do_bounce = 0; + + if (reg & DCC_ERR_FLG_UNCORRECTABLE_ERR_SMASK) { + if (!(dd->err_info_uncorrectable & OPA_EI_STATUS_SMASK)) { + info = read_csr(dd, DCC_ERR_INFO_UNCORRECTABLE); + dd->err_info_uncorrectable = info & OPA_EI_CODE_SMASK; + /* set status bit */ + dd->err_info_uncorrectable |= OPA_EI_STATUS_SMASK; + } + reg &= ~DCC_ERR_FLG_UNCORRECTABLE_ERR_SMASK; + } + + if (reg & DCC_ERR_FLG_LINK_ERR_SMASK) { + struct hfi1_pportdata *ppd = dd->pport; + /* this counter saturates at (2^32) - 1 */ + if (ppd->link_downed < (u32)UINT_MAX) + ppd->link_downed++; + reg &= ~DCC_ERR_FLG_LINK_ERR_SMASK; + } + + if (reg & DCC_ERR_FLG_FMCONFIG_ERR_SMASK) { + u8 reason_valid = 1; + + info = read_csr(dd, DCC_ERR_INFO_FMCONFIG); + if (!(dd->err_info_fmconfig & OPA_EI_STATUS_SMASK)) { + dd->err_info_fmconfig = info & OPA_EI_CODE_SMASK; + /* set status bit */ + dd->err_info_fmconfig |= OPA_EI_STATUS_SMASK; + } + switch (info) { + case 0: + case 1: + case 2: + case 3: + case 4: + case 5: + case 6: + extra = fm_config_txt[info]; + break; + case 8: + extra = fm_config_txt[info]; + if (ppd->port_error_action & + OPA_PI_MASK_FM_CFG_UNSUPPORTED_VL_MARKER) { + do_bounce = 1; + /* + * lcl_reason cannot be derived from info + * for this error + */ + lcl_reason = + OPA_LINKDOWN_REASON_UNSUPPORTED_VL_MARKER; + } + break; + default: + reason_valid = 0; + snprintf(buf, sizeof(buf), "reserved%lld", info); + extra = buf; + break; + } + + if (reason_valid && !do_bounce) { + do_bounce = ppd->port_error_action & + (1 << (OPA_LDR_FMCONFIG_OFFSET + info)); + lcl_reason = info + OPA_LINKDOWN_REASON_BAD_HEAD_DIST; + } + + /* just report this */ + dd_dev_info(dd, "DCC Error: fmconfig error: %s\n", extra); + reg &= ~DCC_ERR_FLG_FMCONFIG_ERR_SMASK; + } + + if (reg & DCC_ERR_FLG_RCVPORT_ERR_SMASK) { + u8 reason_valid = 1; + + info = read_csr(dd, DCC_ERR_INFO_PORTRCV); + hdr0 = read_csr(dd, DCC_ERR_INFO_PORTRCV_HDR0); + hdr1 = read_csr(dd, DCC_ERR_INFO_PORTRCV_HDR1); + if (!(dd->err_info_rcvport.status_and_code & + OPA_EI_STATUS_SMASK)) { + dd->err_info_rcvport.status_and_code = + info & OPA_EI_CODE_SMASK; + /* set status bit */ + dd->err_info_rcvport.status_and_code |= + OPA_EI_STATUS_SMASK; + /* + * save first 2 flits in the packet that caused + * the error + */ + dd->err_info_rcvport.packet_flit1 = hdr0; + dd->err_info_rcvport.packet_flit2 = hdr1; + } + switch (info) { + case 1: + case 2: + case 3: + case 4: + case 5: + case 6: + case 7: + case 9: + case 11: + case 12: + extra = port_rcv_txt[info]; + break; + default: + reason_valid = 0; + snprintf(buf, sizeof(buf), "reserved%lld", info); + extra = buf; + break; + } + + if (reason_valid && !do_bounce) { + do_bounce = ppd->port_error_action & + (1 << (OPA_LDR_PORTRCV_OFFSET + info)); + lcl_reason = info + OPA_LINKDOWN_REASON_RCV_ERROR_0; + } + + /* just report this */ + dd_dev_info(dd, "DCC Error: PortRcv error: %s\n", extra); + dd_dev_info(dd, " hdr0 0x%llx, hdr1 0x%llx\n", + hdr0, hdr1); + + reg &= ~DCC_ERR_FLG_RCVPORT_ERR_SMASK; + } + + if (reg & DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_UC_SMASK) { + /* informative only */ + dd_dev_info(dd, "8051 access to LCB blocked\n"); + reg &= ~DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_UC_SMASK; + } + if (reg & DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_HOST_SMASK) { + /* informative only */ + dd_dev_info(dd, "host access to LCB blocked\n"); + reg &= ~DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_HOST_SMASK; + } + + /* report any remaining errors */ + if (reg) + dd_dev_info(dd, "DCC Error: %s\n", + dcc_err_string(buf, sizeof(buf), reg)); + + if (lcl_reason == 0) + lcl_reason = OPA_LINKDOWN_REASON_UNKNOWN; + + if (do_bounce) { + dd_dev_info(dd, "%s: PortErrorAction bounce\n", __func__); + set_link_down_reason(ppd, lcl_reason, 0, lcl_reason); + queue_work(ppd->hfi1_wq, &ppd->link_bounce_work); + } +} + +static void handle_lcb_err(struct hfi1_devdata *dd, u32 unused, u64 reg) +{ + char buf[96]; + + dd_dev_info(dd, "LCB Error: %s\n", + lcb_err_string(buf, sizeof(buf), reg)); +} + +/* + * CCE block DC interrupt. Source is < 8. + */ +static void is_dc_int(struct hfi1_devdata *dd, unsigned int source) +{ + const struct err_reg_info *eri = &dc_errs[source]; + + if (eri->handler) { + interrupt_clear_down(dd, 0, eri); + } else if (source == 3 /* dc_lbm_int */) { + /* + * This indicates that a parity error has occurred on the + * address/control lines presented to the LBM. The error + * is a single pulse, there is no associated error flag, + * and it is non-maskable. This is because if a parity + * error occurs on the request the request is dropped. + * This should never occur, but it is nice to know if it + * ever does. + */ + dd_dev_err(dd, "Parity error in DC LBM block\n"); + } else { + dd_dev_err(dd, "Invalid DC interrupt %u\n", source); + } +} + +/* + * TX block send credit interrupt. Source is < 160. + */ +static void is_send_credit_int(struct hfi1_devdata *dd, unsigned int source) +{ + sc_group_release_update(dd, source); +} + +/* + * TX block SDMA interrupt. Source is < 48. + * + * SDMA interrupts are grouped by type: + * + * 0 - N-1 = SDma + * N - 2N-1 = SDmaProgress + * 2N - 3N-1 = SDmaIdle + */ +static void is_sdma_eng_int(struct hfi1_devdata *dd, unsigned int source) +{ + /* what interrupt */ + unsigned int what = source / TXE_NUM_SDMA_ENGINES; + /* which engine */ + unsigned int which = source % TXE_NUM_SDMA_ENGINES; + +#ifdef CONFIG_SDMA_VERBOSITY + dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n", which, + slashstrip(__FILE__), __LINE__, __func__); + sdma_dumpstate(&dd->per_sdma[which]); +#endif + + if (likely(what < 3 && which < dd->num_sdma)) { + sdma_engine_interrupt(&dd->per_sdma[which], 1ull << source); + } else { + /* should not happen */ + dd_dev_err(dd, "Invalid SDMA interrupt 0x%x\n", source); + } +} + +/* + * RX block receive available interrupt. Source is < 160. + */ +static void is_rcv_avail_int(struct hfi1_devdata *dd, unsigned int source) +{ + struct hfi1_ctxtdata *rcd; + char *err_detail; + + if (likely(source < dd->num_rcv_contexts)) { + rcd = dd->rcd[source]; + if (rcd) { + if (source < dd->first_user_ctxt) + rcd->do_interrupt(rcd, 0); + else + handle_user_interrupt(rcd); + return; /* OK */ + } + /* received an interrupt, but no rcd */ + err_detail = "dataless"; + } else { + /* received an interrupt, but are not using that context */ + err_detail = "out of range"; + } + dd_dev_err(dd, "unexpected %s receive available context interrupt %u\n", + err_detail, source); +} + +/* + * RX block receive urgent interrupt. Source is < 160. + */ +static void is_rcv_urgent_int(struct hfi1_devdata *dd, unsigned int source) +{ + struct hfi1_ctxtdata *rcd; + char *err_detail; + + if (likely(source < dd->num_rcv_contexts)) { + rcd = dd->rcd[source]; + if (rcd) { + /* only pay attention to user urgent interrupts */ + if (source >= dd->first_user_ctxt) + handle_user_interrupt(rcd); + return; /* OK */ + } + /* received an interrupt, but no rcd */ + err_detail = "dataless"; + } else { + /* received an interrupt, but are not using that context */ + err_detail = "out of range"; + } + dd_dev_err(dd, "unexpected %s receive urgent context interrupt %u\n", + err_detail, source); +} + +/* + * Reserved range interrupt. Should not be called in normal operation. + */ +static void is_reserved_int(struct hfi1_devdata *dd, unsigned int source) +{ + char name[64]; + + dd_dev_err(dd, "unexpected %s interrupt\n", + is_reserved_name(name, sizeof(name), source)); +} + +static const struct is_table is_table[] = { +/* + * start end + * name func interrupt func + */ +{ IS_GENERAL_ERR_START, IS_GENERAL_ERR_END, + is_misc_err_name, is_misc_err_int }, +{ IS_SDMAENG_ERR_START, IS_SDMAENG_ERR_END, + is_sdma_eng_err_name, is_sdma_eng_err_int }, +{ IS_SENDCTXT_ERR_START, IS_SENDCTXT_ERR_END, + is_sendctxt_err_name, is_sendctxt_err_int }, +{ IS_SDMA_START, IS_SDMA_END, + is_sdma_eng_name, is_sdma_eng_int }, +{ IS_VARIOUS_START, IS_VARIOUS_END, + is_various_name, is_various_int }, +{ IS_DC_START, IS_DC_END, + is_dc_name, is_dc_int }, +{ IS_RCVAVAIL_START, IS_RCVAVAIL_END, + is_rcv_avail_name, is_rcv_avail_int }, +{ IS_RCVURGENT_START, IS_RCVURGENT_END, + is_rcv_urgent_name, is_rcv_urgent_int }, +{ IS_SENDCREDIT_START, IS_SENDCREDIT_END, + is_send_credit_name, is_send_credit_int}, +{ IS_RESERVED_START, IS_RESERVED_END, + is_reserved_name, is_reserved_int}, +}; + +/* + * Interrupt source interrupt - called when the given source has an interrupt. + * Source is a bit index into an array of 64-bit integers. + */ +static void is_interrupt(struct hfi1_devdata *dd, unsigned int source) +{ + const struct is_table *entry; + + /* avoids a double compare by walking the table in-order */ + for (entry = &is_table[0]; entry->is_name; entry++) { + if (source < entry->end) { + trace_hfi1_interrupt(dd, entry, source); + entry->is_int(dd, source - entry->start); + return; + } + } + /* fell off the end */ + dd_dev_err(dd, "invalid interrupt source %u\n", source); +} + +/* + * General interrupt handler. This is able to correctly handle + * all interrupts in case INTx is used. + */ +static irqreturn_t general_interrupt(int irq, void *data) +{ + struct hfi1_devdata *dd = data; + u64 regs[CCE_NUM_INT_CSRS]; + u32 bit; + int i; + + this_cpu_inc(*dd->int_counter); + + /* phase 1: scan and clear all handled interrupts */ + for (i = 0; i < CCE_NUM_INT_CSRS; i++) { + if (dd->gi_mask[i] == 0) { + regs[i] = 0; /* used later */ + continue; + } + regs[i] = read_csr(dd, CCE_INT_STATUS + (8 * i)) & + dd->gi_mask[i]; + /* only clear if anything is set */ + if (regs[i]) + write_csr(dd, CCE_INT_CLEAR + (8 * i), regs[i]); + } + + /* phase 2: call the appropriate handler */ + for_each_set_bit(bit, (unsigned long *)®s[0], + CCE_NUM_INT_CSRS * 64) { + is_interrupt(dd, bit); + } + + return IRQ_HANDLED; +} + +static irqreturn_t sdma_interrupt(int irq, void *data) +{ + struct sdma_engine *sde = data; + struct hfi1_devdata *dd = sde->dd; + u64 status; + +#ifdef CONFIG_SDMA_VERBOSITY + dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx, + slashstrip(__FILE__), __LINE__, __func__); + sdma_dumpstate(sde); +#endif + + this_cpu_inc(*dd->int_counter); + + /* This read_csr is really bad in the hot path */ + status = read_csr(dd, + CCE_INT_STATUS + (8 * (IS_SDMA_START / 64))) + & sde->imask; + if (likely(status)) { + /* clear the interrupt(s) */ + write_csr(dd, + CCE_INT_CLEAR + (8 * (IS_SDMA_START / 64)), + status); + + /* handle the interrupt(s) */ + sdma_engine_interrupt(sde, status); + } else + dd_dev_err(dd, "SDMA engine %u interrupt, but no status bits set\n", + sde->this_idx); + + return IRQ_HANDLED; +} + +/* + * Clear the receive interrupt. Use a read of the interrupt clear CSR + * to insure that the write completed. This does NOT guarantee that + * queued DMA writes to memory from the chip are pushed. + */ +static inline void clear_recv_intr(struct hfi1_ctxtdata *rcd) +{ + struct hfi1_devdata *dd = rcd->dd; + u32 addr = CCE_INT_CLEAR + (8 * rcd->ireg); + + mmiowb(); /* make sure everything before is written */ + write_csr(dd, addr, rcd->imask); + /* force the above write on the chip and get a value back */ + (void)read_csr(dd, addr); +} + +/* force the receive interrupt */ +void force_recv_intr(struct hfi1_ctxtdata *rcd) +{ + write_csr(rcd->dd, CCE_INT_FORCE + (8 * rcd->ireg), rcd->imask); +} + +/* + * Return non-zero if a packet is present. + * + * This routine is called when rechecking for packets after the RcvAvail + * interrupt has been cleared down. First, do a quick check of memory for + * a packet present. If not found, use an expensive CSR read of the context + * tail to determine the actual tail. The CSR read is necessary because there + * is no method to push pending DMAs to memory other than an interrupt and we + * are trying to determine if we need to force an interrupt. + */ +static inline int check_packet_present(struct hfi1_ctxtdata *rcd) +{ + u32 tail; + int present; + + if (!HFI1_CAP_IS_KSET(DMA_RTAIL)) + present = (rcd->seq_cnt == + rhf_rcv_seq(rhf_to_cpu(get_rhf_addr(rcd)))); + else /* is RDMA rtail */ + present = (rcd->head != get_rcvhdrtail(rcd)); + + if (present) + return 1; + + /* fall back to a CSR read, correct indpendent of DMA_RTAIL */ + tail = (u32)read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_TAIL); + return rcd->head != tail; +} + +/* + * Receive packet IRQ handler. This routine expects to be on its own IRQ. + * This routine will try to handle packets immediately (latency), but if + * it finds too many, it will invoke the thread handler (bandwitdh). The + * chip receive interrupt is *not* cleared down until this or the thread (if + * invoked) is finished. The intent is to avoid extra interrupts while we + * are processing packets anyway. + */ +static irqreturn_t receive_context_interrupt(int irq, void *data) +{ + struct hfi1_ctxtdata *rcd = data; + struct hfi1_devdata *dd = rcd->dd; + int disposition; + int present; + + trace_hfi1_receive_interrupt(dd, rcd->ctxt); + this_cpu_inc(*dd->int_counter); + aspm_ctx_disable(rcd); + + /* receive interrupt remains blocked while processing packets */ + disposition = rcd->do_interrupt(rcd, 0); + + /* + * Too many packets were seen while processing packets in this + * IRQ handler. Invoke the handler thread. The receive interrupt + * remains blocked. + */ + if (disposition == RCV_PKT_LIMIT) + return IRQ_WAKE_THREAD; + + /* + * The packet processor detected no more packets. Clear the receive + * interrupt and recheck for a packet packet that may have arrived + * after the previous check and interrupt clear. If a packet arrived, + * force another interrupt. + */ + clear_recv_intr(rcd); + present = check_packet_present(rcd); + if (present) + force_recv_intr(rcd); + + return IRQ_HANDLED; +} + +/* + * Receive packet thread handler. This expects to be invoked with the + * receive interrupt still blocked. + */ +static irqreturn_t receive_context_thread(int irq, void *data) +{ + struct hfi1_ctxtdata *rcd = data; + int present; + + /* receive interrupt is still blocked from the IRQ handler */ + (void)rcd->do_interrupt(rcd, 1); + + /* + * The packet processor will only return if it detected no more + * packets. Hold IRQs here so we can safely clear the interrupt and + * recheck for a packet that may have arrived after the previous + * check and the interrupt clear. If a packet arrived, force another + * interrupt. + */ + local_irq_disable(); + clear_recv_intr(rcd); + present = check_packet_present(rcd); + if (present) + force_recv_intr(rcd); + local_irq_enable(); + + return IRQ_HANDLED; +} + +/* ========================================================================= */ + +u32 read_physical_state(struct hfi1_devdata *dd) +{ + u64 reg; + + reg = read_csr(dd, DC_DC8051_STS_CUR_STATE); + return (reg >> DC_DC8051_STS_CUR_STATE_PORT_SHIFT) + & DC_DC8051_STS_CUR_STATE_PORT_MASK; +} + +u32 read_logical_state(struct hfi1_devdata *dd) +{ + u64 reg; + + reg = read_csr(dd, DCC_CFG_PORT_CONFIG); + return (reg >> DCC_CFG_PORT_CONFIG_LINK_STATE_SHIFT) + & DCC_CFG_PORT_CONFIG_LINK_STATE_MASK; +} + +static void set_logical_state(struct hfi1_devdata *dd, u32 chip_lstate) +{ + u64 reg; + + reg = read_csr(dd, DCC_CFG_PORT_CONFIG); + /* clear current state, set new state */ + reg &= ~DCC_CFG_PORT_CONFIG_LINK_STATE_SMASK; + reg |= (u64)chip_lstate << DCC_CFG_PORT_CONFIG_LINK_STATE_SHIFT; + write_csr(dd, DCC_CFG_PORT_CONFIG, reg); +} + +/* + * Use the 8051 to read a LCB CSR. + */ +static int read_lcb_via_8051(struct hfi1_devdata *dd, u32 addr, u64 *data) +{ + u32 regno; + int ret; + + if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR) { + if (acquire_lcb_access(dd, 0) == 0) { + *data = read_csr(dd, addr); + release_lcb_access(dd, 0); + return 0; + } + return -EBUSY; + } + + /* register is an index of LCB registers: (offset - base) / 8 */ + regno = (addr - DC_LCB_CFG_RUN) >> 3; + ret = do_8051_command(dd, HCMD_READ_LCB_CSR, regno, data); + if (ret != HCMD_SUCCESS) + return -EBUSY; + return 0; +} + +/* + * Read an LCB CSR. Access may not be in host control, so check. + * Return 0 on success, -EBUSY on failure. + */ +int read_lcb_csr(struct hfi1_devdata *dd, u32 addr, u64 *data) +{ + struct hfi1_pportdata *ppd = dd->pport; + + /* if up, go through the 8051 for the value */ + if (ppd->host_link_state & HLS_UP) + return read_lcb_via_8051(dd, addr, data); + /* if going up or down, no access */ + if (ppd->host_link_state & (HLS_GOING_UP | HLS_GOING_OFFLINE)) + return -EBUSY; + /* otherwise, host has access */ + *data = read_csr(dd, addr); + return 0; +} + +/* + * Use the 8051 to write a LCB CSR. + */ +static int write_lcb_via_8051(struct hfi1_devdata *dd, u32 addr, u64 data) +{ + u32 regno; + int ret; + + if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR || + (dd->dc8051_ver < dc8051_ver(0, 20))) { + if (acquire_lcb_access(dd, 0) == 0) { + write_csr(dd, addr, data); + release_lcb_access(dd, 0); + return 0; + } + return -EBUSY; + } + + /* register is an index of LCB registers: (offset - base) / 8 */ + regno = (addr - DC_LCB_CFG_RUN) >> 3; + ret = do_8051_command(dd, HCMD_WRITE_LCB_CSR, regno, &data); + if (ret != HCMD_SUCCESS) + return -EBUSY; + return 0; +} + +/* + * Write an LCB CSR. Access may not be in host control, so check. + * Return 0 on success, -EBUSY on failure. + */ +int write_lcb_csr(struct hfi1_devdata *dd, u32 addr, u64 data) +{ + struct hfi1_pportdata *ppd = dd->pport; + + /* if up, go through the 8051 for the value */ + if (ppd->host_link_state & HLS_UP) + return write_lcb_via_8051(dd, addr, data); + /* if going up or down, no access */ + if (ppd->host_link_state & (HLS_GOING_UP | HLS_GOING_OFFLINE)) + return -EBUSY; + /* otherwise, host has access */ + write_csr(dd, addr, data); + return 0; +} + +/* + * Returns: + * < 0 = Linux error, not able to get access + * > 0 = 8051 command RETURN_CODE + */ +static int do_8051_command( + struct hfi1_devdata *dd, + u32 type, + u64 in_data, + u64 *out_data) +{ + u64 reg, completed; + int return_code; + unsigned long flags; + unsigned long timeout; + + hfi1_cdbg(DC8051, "type %d, data 0x%012llx", type, in_data); + + /* + * Alternative to holding the lock for a long time: + * - keep busy wait - have other users bounce off + */ + spin_lock_irqsave(&dd->dc8051_lock, flags); + + /* We can't send any commands to the 8051 if it's in reset */ + if (dd->dc_shutdown) { + return_code = -ENODEV; + goto fail; + } + + /* + * If an 8051 host command timed out previously, then the 8051 is + * stuck. + * + * On first timeout, attempt to reset and restart the entire DC + * block (including 8051). (Is this too big of a hammer?) + * + * If the 8051 times out a second time, the reset did not bring it + * back to healthy life. In that case, fail any subsequent commands. + */ + if (dd->dc8051_timed_out) { + if (dd->dc8051_timed_out > 1) { + dd_dev_err(dd, + "Previous 8051 host command timed out, skipping command %u\n", + type); + return_code = -ENXIO; + goto fail; + } + spin_unlock_irqrestore(&dd->dc8051_lock, flags); + dc_shutdown(dd); + dc_start(dd); + spin_lock_irqsave(&dd->dc8051_lock, flags); + } + + /* + * If there is no timeout, then the 8051 command interface is + * waiting for a command. + */ + + /* + * When writing a LCB CSR, out_data contains the full value to + * to be written, while in_data contains the relative LCB + * address in 7:0. Do the work here, rather than the caller, + * of distrubting the write data to where it needs to go: + * + * Write data + * 39:00 -> in_data[47:8] + * 47:40 -> DC8051_CFG_EXT_DEV_0.RETURN_CODE + * 63:48 -> DC8051_CFG_EXT_DEV_0.RSP_DATA + */ + if (type == HCMD_WRITE_LCB_CSR) { + in_data |= ((*out_data) & 0xffffffffffull) << 8; + reg = ((((*out_data) >> 40) & 0xff) << + DC_DC8051_CFG_EXT_DEV_0_RETURN_CODE_SHIFT) + | ((((*out_data) >> 48) & 0xffff) << + DC_DC8051_CFG_EXT_DEV_0_RSP_DATA_SHIFT); + write_csr(dd, DC_DC8051_CFG_EXT_DEV_0, reg); + } + + /* + * Do two writes: the first to stabilize the type and req_data, the + * second to activate. + */ + reg = ((u64)type & DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_MASK) + << DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_SHIFT + | (in_data & DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_MASK) + << DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_SHIFT; + write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, reg); + reg |= DC_DC8051_CFG_HOST_CMD_0_REQ_NEW_SMASK; + write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, reg); + + /* wait for completion, alternate: interrupt */ + timeout = jiffies + msecs_to_jiffies(DC8051_COMMAND_TIMEOUT); + while (1) { + reg = read_csr(dd, DC_DC8051_CFG_HOST_CMD_1); + completed = reg & DC_DC8051_CFG_HOST_CMD_1_COMPLETED_SMASK; + if (completed) + break; + if (time_after(jiffies, timeout)) { + dd->dc8051_timed_out++; + dd_dev_err(dd, "8051 host command %u timeout\n", type); + if (out_data) + *out_data = 0; + return_code = -ETIMEDOUT; + goto fail; + } + udelay(2); + } + + if (out_data) { + *out_data = (reg >> DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_SHIFT) + & DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_MASK; + if (type == HCMD_READ_LCB_CSR) { + /* top 16 bits are in a different register */ + *out_data |= (read_csr(dd, DC_DC8051_CFG_EXT_DEV_1) + & DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SMASK) + << (48 + - DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SHIFT); + } + } + return_code = (reg >> DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_SHIFT) + & DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_MASK; + dd->dc8051_timed_out = 0; + /* + * Clear command for next user. + */ + write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, 0); + +fail: + spin_unlock_irqrestore(&dd->dc8051_lock, flags); + + return return_code; +} + +static int set_physical_link_state(struct hfi1_devdata *dd, u64 state) +{ + return do_8051_command(dd, HCMD_CHANGE_PHY_STATE, state, NULL); +} + +int load_8051_config(struct hfi1_devdata *dd, u8 field_id, + u8 lane_id, u32 config_data) +{ + u64 data; + int ret; + + data = (u64)field_id << LOAD_DATA_FIELD_ID_SHIFT + | (u64)lane_id << LOAD_DATA_LANE_ID_SHIFT + | (u64)config_data << LOAD_DATA_DATA_SHIFT; + ret = do_8051_command(dd, HCMD_LOAD_CONFIG_DATA, data, NULL); + if (ret != HCMD_SUCCESS) { + dd_dev_err(dd, + "load 8051 config: field id %d, lane %d, err %d\n", + (int)field_id, (int)lane_id, ret); + } + return ret; +} + +/* + * Read the 8051 firmware "registers". Use the RAM directly. Always + * set the result, even on error. + * Return 0 on success, -errno on failure + */ +int read_8051_config(struct hfi1_devdata *dd, u8 field_id, u8 lane_id, + u32 *result) +{ + u64 big_data; + u32 addr; + int ret; + + /* address start depends on the lane_id */ + if (lane_id < 4) + addr = (4 * NUM_GENERAL_FIELDS) + + (lane_id * 4 * NUM_LANE_FIELDS); + else + addr = 0; + addr += field_id * 4; + + /* read is in 8-byte chunks, hardware will truncate the address down */ + ret = read_8051_data(dd, addr, 8, &big_data); + + if (ret == 0) { + /* extract the 4 bytes we want */ + if (addr & 0x4) + *result = (u32)(big_data >> 32); + else + *result = (u32)big_data; + } else { + *result = 0; + dd_dev_err(dd, "%s: direct read failed, lane %d, field %d!\n", + __func__, lane_id, field_id); + } + + return ret; +} + +static int write_vc_local_phy(struct hfi1_devdata *dd, u8 power_management, + u8 continuous) +{ + u32 frame; + + frame = continuous << CONTINIOUS_REMOTE_UPDATE_SUPPORT_SHIFT + | power_management << POWER_MANAGEMENT_SHIFT; + return load_8051_config(dd, VERIFY_CAP_LOCAL_PHY, + GENERAL_CONFIG, frame); +} + +static int write_vc_local_fabric(struct hfi1_devdata *dd, u8 vau, u8 z, u8 vcu, + u16 vl15buf, u8 crc_sizes) +{ + u32 frame; + + frame = (u32)vau << VAU_SHIFT + | (u32)z << Z_SHIFT + | (u32)vcu << VCU_SHIFT + | (u32)vl15buf << VL15BUF_SHIFT + | (u32)crc_sizes << CRC_SIZES_SHIFT; + return load_8051_config(dd, VERIFY_CAP_LOCAL_FABRIC, + GENERAL_CONFIG, frame); +} + +static void read_vc_local_link_width(struct hfi1_devdata *dd, u8 *misc_bits, + u8 *flag_bits, u16 *link_widths) +{ + u32 frame; + + read_8051_config(dd, VERIFY_CAP_LOCAL_LINK_WIDTH, GENERAL_CONFIG, + &frame); + *misc_bits = (frame >> MISC_CONFIG_BITS_SHIFT) & MISC_CONFIG_BITS_MASK; + *flag_bits = (frame >> LOCAL_FLAG_BITS_SHIFT) & LOCAL_FLAG_BITS_MASK; + *link_widths = (frame >> LINK_WIDTH_SHIFT) & LINK_WIDTH_MASK; +} + +static int write_vc_local_link_width(struct hfi1_devdata *dd, + u8 misc_bits, + u8 flag_bits, + u16 link_widths) +{ + u32 frame; + + frame = (u32)misc_bits << MISC_CONFIG_BITS_SHIFT + | (u32)flag_bits << LOCAL_FLAG_BITS_SHIFT + | (u32)link_widths << LINK_WIDTH_SHIFT; + return load_8051_config(dd, VERIFY_CAP_LOCAL_LINK_WIDTH, GENERAL_CONFIG, + frame); +} + +static int write_local_device_id(struct hfi1_devdata *dd, u16 device_id, + u8 device_rev) +{ + u32 frame; + + frame = ((u32)device_id << LOCAL_DEVICE_ID_SHIFT) + | ((u32)device_rev << LOCAL_DEVICE_REV_SHIFT); + return load_8051_config(dd, LOCAL_DEVICE_ID, GENERAL_CONFIG, frame); +} + +static void read_remote_device_id(struct hfi1_devdata *dd, u16 *device_id, + u8 *device_rev) +{ + u32 frame; + + read_8051_config(dd, REMOTE_DEVICE_ID, GENERAL_CONFIG, &frame); + *device_id = (frame >> REMOTE_DEVICE_ID_SHIFT) & REMOTE_DEVICE_ID_MASK; + *device_rev = (frame >> REMOTE_DEVICE_REV_SHIFT) + & REMOTE_DEVICE_REV_MASK; +} + +void read_misc_status(struct hfi1_devdata *dd, u8 *ver_a, u8 *ver_b) +{ + u32 frame; + + read_8051_config(dd, MISC_STATUS, GENERAL_CONFIG, &frame); + *ver_a = (frame >> STS_FM_VERSION_A_SHIFT) & STS_FM_VERSION_A_MASK; + *ver_b = (frame >> STS_FM_VERSION_B_SHIFT) & STS_FM_VERSION_B_MASK; +} + +static void read_vc_remote_phy(struct hfi1_devdata *dd, u8 *power_management, + u8 *continuous) +{ + u32 frame; + + read_8051_config(dd, VERIFY_CAP_REMOTE_PHY, GENERAL_CONFIG, &frame); + *power_management = (frame >> POWER_MANAGEMENT_SHIFT) + & POWER_MANAGEMENT_MASK; + *continuous = (frame >> CONTINIOUS_REMOTE_UPDATE_SUPPORT_SHIFT) + & CONTINIOUS_REMOTE_UPDATE_SUPPORT_MASK; +} + +static void read_vc_remote_fabric(struct hfi1_devdata *dd, u8 *vau, u8 *z, + u8 *vcu, u16 *vl15buf, u8 *crc_sizes) +{ + u32 frame; + + read_8051_config(dd, VERIFY_CAP_REMOTE_FABRIC, GENERAL_CONFIG, &frame); + *vau = (frame >> VAU_SHIFT) & VAU_MASK; + *z = (frame >> Z_SHIFT) & Z_MASK; + *vcu = (frame >> VCU_SHIFT) & VCU_MASK; + *vl15buf = (frame >> VL15BUF_SHIFT) & VL15BUF_MASK; + *crc_sizes = (frame >> CRC_SIZES_SHIFT) & CRC_SIZES_MASK; +} + +static void read_vc_remote_link_width(struct hfi1_devdata *dd, + u8 *remote_tx_rate, + u16 *link_widths) +{ + u32 frame; + + read_8051_config(dd, VERIFY_CAP_REMOTE_LINK_WIDTH, GENERAL_CONFIG, + &frame); + *remote_tx_rate = (frame >> REMOTE_TX_RATE_SHIFT) + & REMOTE_TX_RATE_MASK; + *link_widths = (frame >> LINK_WIDTH_SHIFT) & LINK_WIDTH_MASK; +} + +static void read_local_lni(struct hfi1_devdata *dd, u8 *enable_lane_rx) +{ + u32 frame; + + read_8051_config(dd, LOCAL_LNI_INFO, GENERAL_CONFIG, &frame); + *enable_lane_rx = (frame >> ENABLE_LANE_RX_SHIFT) & ENABLE_LANE_RX_MASK; +} + +static void read_mgmt_allowed(struct hfi1_devdata *dd, u8 *mgmt_allowed) +{ + u32 frame; + + read_8051_config(dd, REMOTE_LNI_INFO, GENERAL_CONFIG, &frame); + *mgmt_allowed = (frame >> MGMT_ALLOWED_SHIFT) & MGMT_ALLOWED_MASK; +} + +static void read_last_local_state(struct hfi1_devdata *dd, u32 *lls) +{ + read_8051_config(dd, LAST_LOCAL_STATE_COMPLETE, GENERAL_CONFIG, lls); +} + +static void read_last_remote_state(struct hfi1_devdata *dd, u32 *lrs) +{ + read_8051_config(dd, LAST_REMOTE_STATE_COMPLETE, GENERAL_CONFIG, lrs); +} + +void hfi1_read_link_quality(struct hfi1_devdata *dd, u8 *link_quality) +{ + u32 frame; + int ret; + + *link_quality = 0; + if (dd->pport->host_link_state & HLS_UP) { + ret = read_8051_config(dd, LINK_QUALITY_INFO, GENERAL_CONFIG, + &frame); + if (ret == 0) + *link_quality = (frame >> LINK_QUALITY_SHIFT) + & LINK_QUALITY_MASK; + } +} + +static void read_planned_down_reason_code(struct hfi1_devdata *dd, u8 *pdrrc) +{ + u32 frame; + + read_8051_config(dd, LINK_QUALITY_INFO, GENERAL_CONFIG, &frame); + *pdrrc = (frame >> DOWN_REMOTE_REASON_SHIFT) & DOWN_REMOTE_REASON_MASK; +} + +static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr) +{ + u32 frame; + + read_8051_config(dd, LINK_DOWN_REASON, GENERAL_CONFIG, &frame); + *ldr = (frame & 0xff); +} + +static int read_tx_settings(struct hfi1_devdata *dd, + u8 *enable_lane_tx, + u8 *tx_polarity_inversion, + u8 *rx_polarity_inversion, + u8 *max_rate) +{ + u32 frame; + int ret; + + ret = read_8051_config(dd, TX_SETTINGS, GENERAL_CONFIG, &frame); + *enable_lane_tx = (frame >> ENABLE_LANE_TX_SHIFT) + & ENABLE_LANE_TX_MASK; + *tx_polarity_inversion = (frame >> TX_POLARITY_INVERSION_SHIFT) + & TX_POLARITY_INVERSION_MASK; + *rx_polarity_inversion = (frame >> RX_POLARITY_INVERSION_SHIFT) + & RX_POLARITY_INVERSION_MASK; + *max_rate = (frame >> MAX_RATE_SHIFT) & MAX_RATE_MASK; + return ret; +} + +static int write_tx_settings(struct hfi1_devdata *dd, + u8 enable_lane_tx, + u8 tx_polarity_inversion, + u8 rx_polarity_inversion, + u8 max_rate) +{ + u32 frame; + + /* no need to mask, all variable sizes match field widths */ + frame = enable_lane_tx << ENABLE_LANE_TX_SHIFT + | tx_polarity_inversion << TX_POLARITY_INVERSION_SHIFT + | rx_polarity_inversion << RX_POLARITY_INVERSION_SHIFT + | max_rate << MAX_RATE_SHIFT; + return load_8051_config(dd, TX_SETTINGS, GENERAL_CONFIG, frame); +} + +static void check_fabric_firmware_versions(struct hfi1_devdata *dd) +{ + u32 frame, version, prod_id; + int ret, lane; + + /* 4 lanes */ + for (lane = 0; lane < 4; lane++) { + ret = read_8051_config(dd, SPICO_FW_VERSION, lane, &frame); + if (ret) { + dd_dev_err(dd, + "Unable to read lane %d firmware details\n", + lane); + continue; + } + version = (frame >> SPICO_ROM_VERSION_SHIFT) + & SPICO_ROM_VERSION_MASK; + prod_id = (frame >> SPICO_ROM_PROD_ID_SHIFT) + & SPICO_ROM_PROD_ID_MASK; + dd_dev_info(dd, + "Lane %d firmware: version 0x%04x, prod_id 0x%04x\n", + lane, version, prod_id); + } +} + +/* + * Read an idle LCB message. + * + * Returns 0 on success, -EINVAL on error + */ +static int read_idle_message(struct hfi1_devdata *dd, u64 type, u64 *data_out) +{ + int ret; + + ret = do_8051_command(dd, HCMD_READ_LCB_IDLE_MSG, type, data_out); + if (ret != HCMD_SUCCESS) { + dd_dev_err(dd, "read idle message: type %d, err %d\n", + (u32)type, ret); + return -EINVAL; + } + dd_dev_info(dd, "%s: read idle message 0x%llx\n", __func__, *data_out); + /* return only the payload as we already know the type */ + *data_out >>= IDLE_PAYLOAD_SHIFT; + return 0; +} + +/* + * Read an idle SMA message. To be done in response to a notification from + * the 8051. + * + * Returns 0 on success, -EINVAL on error + */ +static int read_idle_sma(struct hfi1_devdata *dd, u64 *data) +{ + return read_idle_message(dd, (u64)IDLE_SMA << IDLE_MSG_TYPE_SHIFT, + data); +} + +/* + * Send an idle LCB message. + * + * Returns 0 on success, -EINVAL on error + */ +static int send_idle_message(struct hfi1_devdata *dd, u64 data) +{ + int ret; + + dd_dev_info(dd, "%s: sending idle message 0x%llx\n", __func__, data); + ret = do_8051_command(dd, HCMD_SEND_LCB_IDLE_MSG, data, NULL); + if (ret != HCMD_SUCCESS) { + dd_dev_err(dd, "send idle message: data 0x%llx, err %d\n", + data, ret); + return -EINVAL; + } + return 0; +} + +/* + * Send an idle SMA message. + * + * Returns 0 on success, -EINVAL on error + */ +int send_idle_sma(struct hfi1_devdata *dd, u64 message) +{ + u64 data; + + data = ((message & IDLE_PAYLOAD_MASK) << IDLE_PAYLOAD_SHIFT) | + ((u64)IDLE_SMA << IDLE_MSG_TYPE_SHIFT); + return send_idle_message(dd, data); +} + +/* + * Initialize the LCB then do a quick link up. This may or may not be + * in loopback. + * + * return 0 on success, -errno on error + */ +static int do_quick_linkup(struct hfi1_devdata *dd) +{ + u64 reg; + unsigned long timeout; + int ret; + + lcb_shutdown(dd, 0); + + if (loopback) { + /* LCB_CFG_LOOPBACK.VAL = 2 */ + /* LCB_CFG_LANE_WIDTH.VAL = 0 */ + write_csr(dd, DC_LCB_CFG_LOOPBACK, + IB_PACKET_TYPE << DC_LCB_CFG_LOOPBACK_VAL_SHIFT); + write_csr(dd, DC_LCB_CFG_LANE_WIDTH, 0); + } + + /* start the LCBs */ + /* LCB_CFG_TX_FIFOS_RESET.VAL = 0 */ + write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 0); + + /* simulator only loopback steps */ + if (loopback && dd->icode == ICODE_FUNCTIONAL_SIMULATOR) { + /* LCB_CFG_RUN.EN = 1 */ + write_csr(dd, DC_LCB_CFG_RUN, + 1ull << DC_LCB_CFG_RUN_EN_SHIFT); + + /* watch LCB_STS_LINK_TRANSFER_ACTIVE */ + timeout = jiffies + msecs_to_jiffies(10); + while (1) { + reg = read_csr(dd, DC_LCB_STS_LINK_TRANSFER_ACTIVE); + if (reg) + break; + if (time_after(jiffies, timeout)) { + dd_dev_err(dd, + "timeout waiting for LINK_TRANSFER_ACTIVE\n"); + return -ETIMEDOUT; + } + udelay(2); + } + + write_csr(dd, DC_LCB_CFG_ALLOW_LINK_UP, + 1ull << DC_LCB_CFG_ALLOW_LINK_UP_VAL_SHIFT); + } + + if (!loopback) { + /* + * When doing quick linkup and not in loopback, both + * sides must be done with LCB set-up before either + * starts the quick linkup. Put a delay here so that + * both sides can be started and have a chance to be + * done with LCB set up before resuming. + */ + dd_dev_err(dd, + "Pausing for peer to be finished with LCB set up\n"); + msleep(5000); + dd_dev_err(dd, "Continuing with quick linkup\n"); + } + + write_csr(dd, DC_LCB_ERR_EN, 0); /* mask LCB errors */ + set_8051_lcb_access(dd); + + /* + * State "quick" LinkUp request sets the physical link state to + * LinkUp without a verify capability sequence. + * This state is in simulator v37 and later. + */ + ret = set_physical_link_state(dd, PLS_QUICK_LINKUP); + if (ret != HCMD_SUCCESS) { + dd_dev_err(dd, + "%s: set physical link state to quick LinkUp failed with return %d\n", + __func__, ret); + + set_host_lcb_access(dd); + write_csr(dd, DC_LCB_ERR_EN, ~0ull); /* watch LCB errors */ + + if (ret >= 0) + ret = -EINVAL; + return ret; + } + + return 0; /* success */ +} + +/* + * Set the SerDes to internal loopback mode. + * Returns 0 on success, -errno on error. + */ +static int set_serdes_loopback_mode(struct hfi1_devdata *dd) +{ + int ret; + + ret = set_physical_link_state(dd, PLS_INTERNAL_SERDES_LOOPBACK); + if (ret == HCMD_SUCCESS) + return 0; + dd_dev_err(dd, + "Set physical link state to SerDes Loopback failed with return %d\n", + ret); + if (ret >= 0) + ret = -EINVAL; + return ret; +} + +/* + * Do all special steps to set up loopback. + */ +static int init_loopback(struct hfi1_devdata *dd) +{ + dd_dev_info(dd, "Entering loopback mode\n"); + + /* all loopbacks should disable self GUID check */ + write_csr(dd, DC_DC8051_CFG_MODE, + (read_csr(dd, DC_DC8051_CFG_MODE) | DISABLE_SELF_GUID_CHECK)); + + /* + * The simulator has only one loopback option - LCB. Switch + * to that option, which includes quick link up. + * + * Accept all valid loopback values. + */ + if ((dd->icode == ICODE_FUNCTIONAL_SIMULATOR) && + (loopback == LOOPBACK_SERDES || loopback == LOOPBACK_LCB || + loopback == LOOPBACK_CABLE)) { + loopback = LOOPBACK_LCB; + quick_linkup = 1; + return 0; + } + + /* handle serdes loopback */ + if (loopback == LOOPBACK_SERDES) { + /* internal serdes loopack needs quick linkup on RTL */ + if (dd->icode == ICODE_RTL_SILICON) + quick_linkup = 1; + return set_serdes_loopback_mode(dd); + } + + /* LCB loopback - handled at poll time */ + if (loopback == LOOPBACK_LCB) { + quick_linkup = 1; /* LCB is always quick linkup */ + + /* not supported in emulation due to emulation RTL changes */ + if (dd->icode == ICODE_FPGA_EMULATION) { + dd_dev_err(dd, + "LCB loopback not supported in emulation\n"); + return -EINVAL; + } + return 0; + } + + /* external cable loopback requires no extra steps */ + if (loopback == LOOPBACK_CABLE) + return 0; + + dd_dev_err(dd, "Invalid loopback mode %d\n", loopback); + return -EINVAL; +} + +/* + * Translate from the OPA_LINK_WIDTH handed to us by the FM to bits + * used in the Verify Capability link width attribute. + */ +static u16 opa_to_vc_link_widths(u16 opa_widths) +{ + int i; + u16 result = 0; + + static const struct link_bits { + u16 from; + u16 to; + } opa_link_xlate[] = { + { OPA_LINK_WIDTH_1X, 1 << (1 - 1) }, + { OPA_LINK_WIDTH_2X, 1 << (2 - 1) }, + { OPA_LINK_WIDTH_3X, 1 << (3 - 1) }, + { OPA_LINK_WIDTH_4X, 1 << (4 - 1) }, + }; + + for (i = 0; i < ARRAY_SIZE(opa_link_xlate); i++) { + if (opa_widths & opa_link_xlate[i].from) + result |= opa_link_xlate[i].to; + } + return result; +} + +/* + * Set link attributes before moving to polling. + */ +static int set_local_link_attributes(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + u8 enable_lane_tx; + u8 tx_polarity_inversion; + u8 rx_polarity_inversion; + int ret; + + /* reset our fabric serdes to clear any lingering problems */ + fabric_serdes_reset(dd); + + /* set the local tx rate - need to read-modify-write */ + ret = read_tx_settings(dd, &enable_lane_tx, &tx_polarity_inversion, + &rx_polarity_inversion, &ppd->local_tx_rate); + if (ret) + goto set_local_link_attributes_fail; + + if (dd->dc8051_ver < dc8051_ver(0, 20)) { + /* set the tx rate to the fastest enabled */ + if (ppd->link_speed_enabled & OPA_LINK_SPEED_25G) + ppd->local_tx_rate = 1; + else + ppd->local_tx_rate = 0; + } else { + /* set the tx rate to all enabled */ + ppd->local_tx_rate = 0; + if (ppd->link_speed_enabled & OPA_LINK_SPEED_25G) + ppd->local_tx_rate |= 2; + if (ppd->link_speed_enabled & OPA_LINK_SPEED_12_5G) + ppd->local_tx_rate |= 1; + } + + enable_lane_tx = 0xF; /* enable all four lanes */ + ret = write_tx_settings(dd, enable_lane_tx, tx_polarity_inversion, + rx_polarity_inversion, ppd->local_tx_rate); + if (ret != HCMD_SUCCESS) + goto set_local_link_attributes_fail; + + /* + * DC supports continuous updates. + */ + ret = write_vc_local_phy(dd, + 0 /* no power management */, + 1 /* continuous updates */); + if (ret != HCMD_SUCCESS) + goto set_local_link_attributes_fail; + + /* z=1 in the next call: AU of 0 is not supported by the hardware */ + ret = write_vc_local_fabric(dd, dd->vau, 1, dd->vcu, dd->vl15_init, + ppd->port_crc_mode_enabled); + if (ret != HCMD_SUCCESS) + goto set_local_link_attributes_fail; + + ret = write_vc_local_link_width(dd, 0, 0, + opa_to_vc_link_widths( + ppd->link_width_enabled)); + if (ret != HCMD_SUCCESS) + goto set_local_link_attributes_fail; + + /* let peer know who we are */ + ret = write_local_device_id(dd, dd->pcidev->device, dd->minrev); + if (ret == HCMD_SUCCESS) + return 0; + +set_local_link_attributes_fail: + dd_dev_err(dd, + "Failed to set local link attributes, return 0x%x\n", + ret); + return ret; +} + +/* + * Call this to start the link. + * Do not do anything if the link is disabled. + * Returns 0 if link is disabled, moved to polling, or the driver is not ready. + */ +int start_link(struct hfi1_pportdata *ppd) +{ + if (!ppd->link_enabled) { + dd_dev_info(ppd->dd, + "%s: stopping link start because link is disabled\n", + __func__); + return 0; + } + if (!ppd->driver_link_ready) { + dd_dev_info(ppd->dd, + "%s: stopping link start because driver is not ready\n", + __func__); + return 0; + } + + /* + * FULL_MGMT_P_KEY is cleared from the pkey table, so that the + * pkey table can be configured properly if the HFI unit is connected + * to switch port with MgmtAllowed=NO + */ + clear_full_mgmt_pkey(ppd); + + return set_link_state(ppd, HLS_DN_POLL); +} + +static void wait_for_qsfp_init(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + u64 mask; + unsigned long timeout; + + /* + * Check for QSFP interrupt for t_init (SFF 8679) + */ + timeout = jiffies + msecs_to_jiffies(2000); + while (1) { + mask = read_csr(dd, dd->hfi1_id ? + ASIC_QSFP2_IN : ASIC_QSFP1_IN); + if (!(mask & QSFP_HFI0_INT_N)) { + write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_CLEAR : + ASIC_QSFP1_CLEAR, QSFP_HFI0_INT_N); + break; + } + if (time_after(jiffies, timeout)) { + dd_dev_info(dd, "%s: No IntN detected, reset complete\n", + __func__); + break; + } + udelay(2); + } +} + +static void set_qsfp_int_n(struct hfi1_pportdata *ppd, u8 enable) +{ + struct hfi1_devdata *dd = ppd->dd; + u64 mask; + + mask = read_csr(dd, dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK); + if (enable) + mask |= (u64)QSFP_HFI0_INT_N; + else + mask &= ~(u64)QSFP_HFI0_INT_N; + write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK, mask); +} + +void reset_qsfp(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + u64 mask, qsfp_mask; + + /* Disable INT_N from triggering QSFP interrupts */ + set_qsfp_int_n(ppd, 0); + + /* Reset the QSFP */ + mask = (u64)QSFP_HFI0_RESET_N; + + qsfp_mask = read_csr(dd, + dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT); + qsfp_mask &= ~mask; + write_csr(dd, + dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT, qsfp_mask); + + udelay(10); + + qsfp_mask |= mask; + write_csr(dd, + dd->hfi1_id ? ASIC_QSFP2_OUT : ASIC_QSFP1_OUT, qsfp_mask); + + wait_for_qsfp_init(ppd); + + /* + * Allow INT_N to trigger the QSFP interrupt to watch + * for alarms and warnings + */ + set_qsfp_int_n(ppd, 1); +} + +static int handle_qsfp_error_conditions(struct hfi1_pportdata *ppd, + u8 *qsfp_interrupt_status) +{ + struct hfi1_devdata *dd = ppd->dd; + + if ((qsfp_interrupt_status[0] & QSFP_HIGH_TEMP_ALARM) || + (qsfp_interrupt_status[0] & QSFP_HIGH_TEMP_WARNING)) + dd_dev_info(dd, "%s: QSFP cable on fire\n", + __func__); + + if ((qsfp_interrupt_status[0] & QSFP_LOW_TEMP_ALARM) || + (qsfp_interrupt_status[0] & QSFP_LOW_TEMP_WARNING)) + dd_dev_info(dd, "%s: QSFP cable temperature too low\n", + __func__); + + /* + * The remaining alarms/warnings don't matter if the link is down. + */ + if (ppd->host_link_state & HLS_DOWN) + return 0; + + if ((qsfp_interrupt_status[1] & QSFP_HIGH_VCC_ALARM) || + (qsfp_interrupt_status[1] & QSFP_HIGH_VCC_WARNING)) + dd_dev_info(dd, "%s: QSFP supply voltage too high\n", + __func__); + + if ((qsfp_interrupt_status[1] & QSFP_LOW_VCC_ALARM) || + (qsfp_interrupt_status[1] & QSFP_LOW_VCC_WARNING)) + dd_dev_info(dd, "%s: QSFP supply voltage too low\n", + __func__); + + /* Byte 2 is vendor specific */ + + if ((qsfp_interrupt_status[3] & QSFP_HIGH_POWER_ALARM) || + (qsfp_interrupt_status[3] & QSFP_HIGH_POWER_WARNING)) + dd_dev_info(dd, "%s: Cable RX channel 1/2 power too high\n", + __func__); + + if ((qsfp_interrupt_status[3] & QSFP_LOW_POWER_ALARM) || + (qsfp_interrupt_status[3] & QSFP_LOW_POWER_WARNING)) + dd_dev_info(dd, "%s: Cable RX channel 1/2 power too low\n", + __func__); + + if ((qsfp_interrupt_status[4] & QSFP_HIGH_POWER_ALARM) || + (qsfp_interrupt_status[4] & QSFP_HIGH_POWER_WARNING)) + dd_dev_info(dd, "%s: Cable RX channel 3/4 power too high\n", + __func__); + + if ((qsfp_interrupt_status[4] & QSFP_LOW_POWER_ALARM) || + (qsfp_interrupt_status[4] & QSFP_LOW_POWER_WARNING)) + dd_dev_info(dd, "%s: Cable RX channel 3/4 power too low\n", + __func__); + + if ((qsfp_interrupt_status[5] & QSFP_HIGH_BIAS_ALARM) || + (qsfp_interrupt_status[5] & QSFP_HIGH_BIAS_WARNING)) + dd_dev_info(dd, "%s: Cable TX channel 1/2 bias too high\n", + __func__); + + if ((qsfp_interrupt_status[5] & QSFP_LOW_BIAS_ALARM) || + (qsfp_interrupt_status[5] & QSFP_LOW_BIAS_WARNING)) + dd_dev_info(dd, "%s: Cable TX channel 1/2 bias too low\n", + __func__); + + if ((qsfp_interrupt_status[6] & QSFP_HIGH_BIAS_ALARM) || + (qsfp_interrupt_status[6] & QSFP_HIGH_BIAS_WARNING)) + dd_dev_info(dd, "%s: Cable TX channel 3/4 bias too high\n", + __func__); + + if ((qsfp_interrupt_status[6] & QSFP_LOW_BIAS_ALARM) || + (qsfp_interrupt_status[6] & QSFP_LOW_BIAS_WARNING)) + dd_dev_info(dd, "%s: Cable TX channel 3/4 bias too low\n", + __func__); + + if ((qsfp_interrupt_status[7] & QSFP_HIGH_POWER_ALARM) || + (qsfp_interrupt_status[7] & QSFP_HIGH_POWER_WARNING)) + dd_dev_info(dd, "%s: Cable TX channel 1/2 power too high\n", + __func__); + + if ((qsfp_interrupt_status[7] & QSFP_LOW_POWER_ALARM) || + (qsfp_interrupt_status[7] & QSFP_LOW_POWER_WARNING)) + dd_dev_info(dd, "%s: Cable TX channel 1/2 power too low\n", + __func__); + + if ((qsfp_interrupt_status[8] & QSFP_HIGH_POWER_ALARM) || + (qsfp_interrupt_status[8] & QSFP_HIGH_POWER_WARNING)) + dd_dev_info(dd, "%s: Cable TX channel 3/4 power too high\n", + __func__); + + if ((qsfp_interrupt_status[8] & QSFP_LOW_POWER_ALARM) || + (qsfp_interrupt_status[8] & QSFP_LOW_POWER_WARNING)) + dd_dev_info(dd, "%s: Cable TX channel 3/4 power too low\n", + __func__); + + /* Bytes 9-10 and 11-12 are reserved */ + /* Bytes 13-15 are vendor specific */ + + return 0; +} + +/* This routine will only be scheduled if the QSFP module present is asserted */ +void qsfp_event(struct work_struct *work) +{ + struct qsfp_data *qd; + struct hfi1_pportdata *ppd; + struct hfi1_devdata *dd; + + qd = container_of(work, struct qsfp_data, qsfp_work); + ppd = qd->ppd; + dd = ppd->dd; + + /* Sanity check */ + if (!qsfp_mod_present(ppd)) + return; + + /* + * Turn DC back on after cable has been re-inserted. Up until + * now, the DC has been in reset to save power. + */ + dc_start(dd); + + if (qd->cache_refresh_required) { + set_qsfp_int_n(ppd, 0); + + wait_for_qsfp_init(ppd); + + /* + * Allow INT_N to trigger the QSFP interrupt to watch + * for alarms and warnings + */ + set_qsfp_int_n(ppd, 1); + + tune_serdes(ppd); + + start_link(ppd); + } + + if (qd->check_interrupt_flags) { + u8 qsfp_interrupt_status[16] = {0,}; + + if (one_qsfp_read(ppd, dd->hfi1_id, 6, + &qsfp_interrupt_status[0], 16) != 16) { + dd_dev_info(dd, + "%s: Failed to read status of QSFP module\n", + __func__); + } else { + unsigned long flags; + + handle_qsfp_error_conditions( + ppd, qsfp_interrupt_status); + spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags); + ppd->qsfp_info.check_interrupt_flags = 0; + spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, + flags); + } + } +} + +static void init_qsfp_int(struct hfi1_devdata *dd) +{ + struct hfi1_pportdata *ppd = dd->pport; + u64 qsfp_mask, cce_int_mask; + const int qsfp1_int_smask = QSFP1_INT % 64; + const int qsfp2_int_smask = QSFP2_INT % 64; + + /* + * disable QSFP1 interrupts for HFI1, QSFP2 interrupts for HFI0 + * Qsfp1Int and Qsfp2Int are adjacent bits in the same CSR, + * therefore just one of QSFP1_INT/QSFP2_INT can be used to find + * the index of the appropriate CSR in the CCEIntMask CSR array + */ + cce_int_mask = read_csr(dd, CCE_INT_MASK + + (8 * (QSFP1_INT / 64))); + if (dd->hfi1_id) { + cce_int_mask &= ~((u64)1 << qsfp1_int_smask); + write_csr(dd, CCE_INT_MASK + (8 * (QSFP1_INT / 64)), + cce_int_mask); + } else { + cce_int_mask &= ~((u64)1 << qsfp2_int_smask); + write_csr(dd, CCE_INT_MASK + (8 * (QSFP2_INT / 64)), + cce_int_mask); + } + + qsfp_mask = (u64)(QSFP_HFI0_INT_N | QSFP_HFI0_MODPRST_N); + /* Clear current status to avoid spurious interrupts */ + write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_CLEAR : ASIC_QSFP1_CLEAR, + qsfp_mask); + write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_MASK : ASIC_QSFP1_MASK, + qsfp_mask); + + set_qsfp_int_n(ppd, 0); + + /* Handle active low nature of INT_N and MODPRST_N pins */ + if (qsfp_mod_present(ppd)) + qsfp_mask &= ~(u64)QSFP_HFI0_MODPRST_N; + write_csr(dd, + dd->hfi1_id ? ASIC_QSFP2_INVERT : ASIC_QSFP1_INVERT, + qsfp_mask); +} + +/* + * Do a one-time initialize of the LCB block. + */ +static void init_lcb(struct hfi1_devdata *dd) +{ + /* simulator does not correctly handle LCB cclk loopback, skip */ + if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR) + return; + + /* the DC has been reset earlier in the driver load */ + + /* set LCB for cclk loopback on the port */ + write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 0x01); + write_csr(dd, DC_LCB_CFG_LANE_WIDTH, 0x00); + write_csr(dd, DC_LCB_CFG_REINIT_AS_SLAVE, 0x00); + write_csr(dd, DC_LCB_CFG_CNT_FOR_SKIP_STALL, 0x110); + write_csr(dd, DC_LCB_CFG_CLK_CNTR, 0x08); + write_csr(dd, DC_LCB_CFG_LOOPBACK, 0x02); + write_csr(dd, DC_LCB_CFG_TX_FIFOS_RESET, 0x00); +} + +int bringup_serdes(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + u64 guid; + int ret; + + if (HFI1_CAP_IS_KSET(EXTENDED_PSN)) + add_rcvctrl(dd, RCV_CTRL_RCV_EXTENDED_PSN_ENABLE_SMASK); + + guid = ppd->guid; + if (!guid) { + if (dd->base_guid) + guid = dd->base_guid + ppd->port - 1; + ppd->guid = guid; + } + + /* Set linkinit_reason on power up per OPA spec */ + ppd->linkinit_reason = OPA_LINKINIT_REASON_LINKUP; + + /* one-time init of the LCB */ + init_lcb(dd); + + if (loopback) { + ret = init_loopback(dd); + if (ret < 0) + return ret; + } + + get_port_type(ppd); + if (ppd->port_type == PORT_TYPE_QSFP) { + set_qsfp_int_n(ppd, 0); + wait_for_qsfp_init(ppd); + set_qsfp_int_n(ppd, 1); + } + + /* + * Tune the SerDes to a ballpark setting for + * optimal signal and bit error rate + * Needs to be done before starting the link + */ + tune_serdes(ppd); + + return start_link(ppd); +} + +void hfi1_quiet_serdes(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + + /* + * Shut down the link and keep it down. First turn off that the + * driver wants to allow the link to be up (driver_link_ready). + * Then make sure the link is not automatically restarted + * (link_enabled). Cancel any pending restart. And finally + * go offline. + */ + ppd->driver_link_ready = 0; + ppd->link_enabled = 0; + + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_SMA_DISABLED); + set_link_down_reason(ppd, OPA_LINKDOWN_REASON_SMA_DISABLED, 0, + OPA_LINKDOWN_REASON_SMA_DISABLED); + set_link_state(ppd, HLS_DN_OFFLINE); + + /* disable the port */ + clear_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK); +} + +static inline int init_cpu_counters(struct hfi1_devdata *dd) +{ + struct hfi1_pportdata *ppd; + int i; + + ppd = (struct hfi1_pportdata *)(dd + 1); + for (i = 0; i < dd->num_pports; i++, ppd++) { + ppd->ibport_data.rvp.rc_acks = NULL; + ppd->ibport_data.rvp.rc_qacks = NULL; + ppd->ibport_data.rvp.rc_acks = alloc_percpu(u64); + ppd->ibport_data.rvp.rc_qacks = alloc_percpu(u64); + ppd->ibport_data.rvp.rc_delayed_comp = alloc_percpu(u64); + if (!ppd->ibport_data.rvp.rc_acks || + !ppd->ibport_data.rvp.rc_delayed_comp || + !ppd->ibport_data.rvp.rc_qacks) + return -ENOMEM; + } + + return 0; +} + +static const char * const pt_names[] = { + "expected", + "eager", + "invalid" +}; + +static const char *pt_name(u32 type) +{ + return type >= ARRAY_SIZE(pt_names) ? "unknown" : pt_names[type]; +} + +/* + * index is the index into the receive array + */ +void hfi1_put_tid(struct hfi1_devdata *dd, u32 index, + u32 type, unsigned long pa, u16 order) +{ + u64 reg; + void __iomem *base = (dd->rcvarray_wc ? dd->rcvarray_wc : + (dd->kregbase + RCV_ARRAY)); + + if (!(dd->flags & HFI1_PRESENT)) + goto done; + + if (type == PT_INVALID) { + pa = 0; + } else if (type > PT_INVALID) { + dd_dev_err(dd, + "unexpected receive array type %u for index %u, not handled\n", + type, index); + goto done; + } + + hfi1_cdbg(TID, "type %s, index 0x%x, pa 0x%lx, bsize 0x%lx", + pt_name(type), index, pa, (unsigned long)order); + +#define RT_ADDR_SHIFT 12 /* 4KB kernel address boundary */ + reg = RCV_ARRAY_RT_WRITE_ENABLE_SMASK + | (u64)order << RCV_ARRAY_RT_BUF_SIZE_SHIFT + | ((pa >> RT_ADDR_SHIFT) & RCV_ARRAY_RT_ADDR_MASK) + << RCV_ARRAY_RT_ADDR_SHIFT; + writeq(reg, base + (index * 8)); + + if (type == PT_EAGER) + /* + * Eager entries are written one-by-one so we have to push them + * after we write the entry. + */ + flush_wc(); +done: + return; +} + +void hfi1_clear_tids(struct hfi1_ctxtdata *rcd) +{ + struct hfi1_devdata *dd = rcd->dd; + u32 i; + + /* this could be optimized */ + for (i = rcd->eager_base; i < rcd->eager_base + + rcd->egrbufs.alloced; i++) + hfi1_put_tid(dd, i, PT_INVALID, 0, 0); + + for (i = rcd->expected_base; + i < rcd->expected_base + rcd->expected_count; i++) + hfi1_put_tid(dd, i, PT_INVALID, 0, 0); +} + +int hfi1_get_base_kinfo(struct hfi1_ctxtdata *rcd, + struct hfi1_ctxt_info *kinfo) +{ + kinfo->runtime_flags = (HFI1_MISC_GET() << HFI1_CAP_USER_SHIFT) | + HFI1_CAP_UGET(MASK) | HFI1_CAP_KGET(K2U); + return 0; +} + +struct hfi1_message_header *hfi1_get_msgheader( + struct hfi1_devdata *dd, __le32 *rhf_addr) +{ + u32 offset = rhf_hdrq_offset(rhf_to_cpu(rhf_addr)); + + return (struct hfi1_message_header *) + (rhf_addr - dd->rhf_offset + offset); +} + +static const char * const ib_cfg_name_strings[] = { + "HFI1_IB_CFG_LIDLMC", + "HFI1_IB_CFG_LWID_DG_ENB", + "HFI1_IB_CFG_LWID_ENB", + "HFI1_IB_CFG_LWID", + "HFI1_IB_CFG_SPD_ENB", + "HFI1_IB_CFG_SPD", + "HFI1_IB_CFG_RXPOL_ENB", + "HFI1_IB_CFG_LREV_ENB", + "HFI1_IB_CFG_LINKLATENCY", + "HFI1_IB_CFG_HRTBT", + "HFI1_IB_CFG_OP_VLS", + "HFI1_IB_CFG_VL_HIGH_CAP", + "HFI1_IB_CFG_VL_LOW_CAP", + "HFI1_IB_CFG_OVERRUN_THRESH", + "HFI1_IB_CFG_PHYERR_THRESH", + "HFI1_IB_CFG_LINKDEFAULT", + "HFI1_IB_CFG_PKEYS", + "HFI1_IB_CFG_MTU", + "HFI1_IB_CFG_LSTATE", + "HFI1_IB_CFG_VL_HIGH_LIMIT", + "HFI1_IB_CFG_PMA_TICKS", + "HFI1_IB_CFG_PORT" +}; + +static const char *ib_cfg_name(int which) +{ + if (which < 0 || which >= ARRAY_SIZE(ib_cfg_name_strings)) + return "invalid"; + return ib_cfg_name_strings[which]; +} + +int hfi1_get_ib_cfg(struct hfi1_pportdata *ppd, int which) +{ + struct hfi1_devdata *dd = ppd->dd; + int val = 0; + + switch (which) { + case HFI1_IB_CFG_LWID_ENB: /* allowed Link-width */ + val = ppd->link_width_enabled; + break; + case HFI1_IB_CFG_LWID: /* currently active Link-width */ + val = ppd->link_width_active; + break; + case HFI1_IB_CFG_SPD_ENB: /* allowed Link speeds */ + val = ppd->link_speed_enabled; + break; + case HFI1_IB_CFG_SPD: /* current Link speed */ + val = ppd->link_speed_active; + break; + + case HFI1_IB_CFG_RXPOL_ENB: /* Auto-RX-polarity enable */ + case HFI1_IB_CFG_LREV_ENB: /* Auto-Lane-reversal enable */ + case HFI1_IB_CFG_LINKLATENCY: + goto unimplemented; + + case HFI1_IB_CFG_OP_VLS: + val = ppd->vls_operational; + break; + case HFI1_IB_CFG_VL_HIGH_CAP: /* VL arb high priority table size */ + val = VL_ARB_HIGH_PRIO_TABLE_SIZE; + break; + case HFI1_IB_CFG_VL_LOW_CAP: /* VL arb low priority table size */ + val = VL_ARB_LOW_PRIO_TABLE_SIZE; + break; + case HFI1_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */ + val = ppd->overrun_threshold; + break; + case HFI1_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */ + val = ppd->phy_error_threshold; + break; + case HFI1_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */ + val = dd->link_default; + break; + + case HFI1_IB_CFG_HRTBT: /* Heartbeat off/enable/auto */ + case HFI1_IB_CFG_PMA_TICKS: + default: +unimplemented: + if (HFI1_CAP_IS_KSET(PRINT_UNIMPL)) + dd_dev_info( + dd, + "%s: which %s: not implemented\n", + __func__, + ib_cfg_name(which)); + break; + } + + return val; +} + +/* + * The largest MAD packet size. + */ +#define MAX_MAD_PACKET 2048 + +/* + * Return the maximum header bytes that can go on the _wire_ + * for this device. This count includes the ICRC which is + * not part of the packet held in memory but it is appended + * by the HW. + * This is dependent on the device's receive header entry size. + * HFI allows this to be set per-receive context, but the + * driver presently enforces a global value. + */ +u32 lrh_max_header_bytes(struct hfi1_devdata *dd) +{ + /* + * The maximum non-payload (MTU) bytes in LRH.PktLen are + * the Receive Header Entry Size minus the PBC (or RHF) size + * plus one DW for the ICRC appended by HW. + * + * dd->rcd[0].rcvhdrqentsize is in DW. + * We use rcd[0] as all context will have the same value. Also, + * the first kernel context would have been allocated by now so + * we are guaranteed a valid value. + */ + return (dd->rcd[0]->rcvhdrqentsize - 2/*PBC/RHF*/ + 1/*ICRC*/) << 2; +} + +/* + * Set Send Length + * @ppd - per port data + * + * Set the MTU by limiting how many DWs may be sent. The SendLenCheck* + * registers compare against LRH.PktLen, so use the max bytes included + * in the LRH. + * + * This routine changes all VL values except VL15, which it maintains at + * the same value. + */ +static void set_send_length(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + u32 max_hb = lrh_max_header_bytes(dd), dcmtu; + u32 maxvlmtu = dd->vld[15].mtu; + u64 len1 = 0, len2 = (((dd->vld[15].mtu + max_hb) >> 2) + & SEND_LEN_CHECK1_LEN_VL15_MASK) << + SEND_LEN_CHECK1_LEN_VL15_SHIFT; + int i, j; + u32 thres; + + for (i = 0; i < ppd->vls_supported; i++) { + if (dd->vld[i].mtu > maxvlmtu) + maxvlmtu = dd->vld[i].mtu; + if (i <= 3) + len1 |= (((dd->vld[i].mtu + max_hb) >> 2) + & SEND_LEN_CHECK0_LEN_VL0_MASK) << + ((i % 4) * SEND_LEN_CHECK0_LEN_VL1_SHIFT); + else + len2 |= (((dd->vld[i].mtu + max_hb) >> 2) + & SEND_LEN_CHECK1_LEN_VL4_MASK) << + ((i % 4) * SEND_LEN_CHECK1_LEN_VL5_SHIFT); + } + write_csr(dd, SEND_LEN_CHECK0, len1); + write_csr(dd, SEND_LEN_CHECK1, len2); + /* adjust kernel credit return thresholds based on new MTUs */ + /* all kernel receive contexts have the same hdrqentsize */ + for (i = 0; i < ppd->vls_supported; i++) { + thres = min(sc_percent_to_threshold(dd->vld[i].sc, 50), + sc_mtu_to_threshold(dd->vld[i].sc, + dd->vld[i].mtu, + dd->rcd[0]->rcvhdrqentsize)); + for (j = 0; j < INIT_SC_PER_VL; j++) + sc_set_cr_threshold( + pio_select_send_context_vl(dd, j, i), + thres); + } + thres = min(sc_percent_to_threshold(dd->vld[15].sc, 50), + sc_mtu_to_threshold(dd->vld[15].sc, + dd->vld[15].mtu, + dd->rcd[0]->rcvhdrqentsize)); + sc_set_cr_threshold(dd->vld[15].sc, thres); + + /* Adjust maximum MTU for the port in DC */ + dcmtu = maxvlmtu == 10240 ? DCC_CFG_PORT_MTU_CAP_10240 : + (ilog2(maxvlmtu >> 8) + 1); + len1 = read_csr(ppd->dd, DCC_CFG_PORT_CONFIG); + len1 &= ~DCC_CFG_PORT_CONFIG_MTU_CAP_SMASK; + len1 |= ((u64)dcmtu & DCC_CFG_PORT_CONFIG_MTU_CAP_MASK) << + DCC_CFG_PORT_CONFIG_MTU_CAP_SHIFT; + write_csr(ppd->dd, DCC_CFG_PORT_CONFIG, len1); +} + +static void set_lidlmc(struct hfi1_pportdata *ppd) +{ + int i; + u64 sreg = 0; + struct hfi1_devdata *dd = ppd->dd; + u32 mask = ~((1U << ppd->lmc) - 1); + u64 c1 = read_csr(ppd->dd, DCC_CFG_PORT_CONFIG1); + + if (dd->hfi1_snoop.mode_flag) + dd_dev_info(dd, "Set lid/lmc while snooping"); + + c1 &= ~(DCC_CFG_PORT_CONFIG1_TARGET_DLID_SMASK + | DCC_CFG_PORT_CONFIG1_DLID_MASK_SMASK); + c1 |= ((ppd->lid & DCC_CFG_PORT_CONFIG1_TARGET_DLID_MASK) + << DCC_CFG_PORT_CONFIG1_TARGET_DLID_SHIFT) | + ((mask & DCC_CFG_PORT_CONFIG1_DLID_MASK_MASK) + << DCC_CFG_PORT_CONFIG1_DLID_MASK_SHIFT); + write_csr(ppd->dd, DCC_CFG_PORT_CONFIG1, c1); + + /* + * Iterate over all the send contexts and set their SLID check + */ + sreg = ((mask & SEND_CTXT_CHECK_SLID_MASK_MASK) << + SEND_CTXT_CHECK_SLID_MASK_SHIFT) | + (((ppd->lid & mask) & SEND_CTXT_CHECK_SLID_VALUE_MASK) << + SEND_CTXT_CHECK_SLID_VALUE_SHIFT); + + for (i = 0; i < dd->chip_send_contexts; i++) { + hfi1_cdbg(LINKVERB, "SendContext[%d].SLID_CHECK = 0x%x", + i, (u32)sreg); + write_kctxt_csr(dd, i, SEND_CTXT_CHECK_SLID, sreg); + } + + /* Now we have to do the same thing for the sdma engines */ + sdma_update_lmc(dd, mask, ppd->lid); +} + +static int wait_phy_linkstate(struct hfi1_devdata *dd, u32 state, u32 msecs) +{ + unsigned long timeout; + u32 curr_state; + + timeout = jiffies + msecs_to_jiffies(msecs); + while (1) { + curr_state = read_physical_state(dd); + if (curr_state == state) + break; + if (time_after(jiffies, timeout)) { + dd_dev_err(dd, + "timeout waiting for phy link state 0x%x, current state is 0x%x\n", + state, curr_state); + return -ETIMEDOUT; + } + usleep_range(1950, 2050); /* sleep 2ms-ish */ + } + + return 0; +} + +/* + * Helper for set_link_state(). Do not call except from that routine. + * Expects ppd->hls_mutex to be held. + * + * @rem_reason value to be sent to the neighbor + * + * LinkDownReasons only set if transition succeeds. + */ +static int goto_offline(struct hfi1_pportdata *ppd, u8 rem_reason) +{ + struct hfi1_devdata *dd = ppd->dd; + u32 pstate, previous_state; + u32 last_local_state; + u32 last_remote_state; + int ret; + int do_transition; + int do_wait; + + previous_state = ppd->host_link_state; + ppd->host_link_state = HLS_GOING_OFFLINE; + pstate = read_physical_state(dd); + if (pstate == PLS_OFFLINE) { + do_transition = 0; /* in right state */ + do_wait = 0; /* ...no need to wait */ + } else if ((pstate & 0xff) == PLS_OFFLINE) { + do_transition = 0; /* in an offline transient state */ + do_wait = 1; /* ...wait for it to settle */ + } else { + do_transition = 1; /* need to move to offline */ + do_wait = 1; /* ...will need to wait */ + } + + if (do_transition) { + ret = set_physical_link_state(dd, + (rem_reason << 8) | PLS_OFFLINE); + + if (ret != HCMD_SUCCESS) { + dd_dev_err(dd, + "Failed to transition to Offline link state, return %d\n", + ret); + return -EINVAL; + } + if (ppd->offline_disabled_reason == + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE)) + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_TRANSIENT); + } + + if (do_wait) { + /* it can take a while for the link to go down */ + ret = wait_phy_linkstate(dd, PLS_OFFLINE, 10000); + if (ret < 0) + return ret; + } + + /* make sure the logical state is also down */ + wait_logical_linkstate(ppd, IB_PORT_DOWN, 1000); + + /* + * Now in charge of LCB - must be after the physical state is + * offline.quiet and before host_link_state is changed. + */ + set_host_lcb_access(dd); + write_csr(dd, DC_LCB_ERR_EN, ~0ull); /* watch LCB errors */ + ppd->host_link_state = HLS_LINK_COOLDOWN; /* LCB access allowed */ + + if (ppd->port_type == PORT_TYPE_QSFP && + ppd->qsfp_info.limiting_active && + qsfp_mod_present(ppd)) { + int ret; + + ret = acquire_chip_resource(dd, qsfp_resource(dd), QSFP_WAIT); + if (ret == 0) { + set_qsfp_tx(ppd, 0); + release_chip_resource(dd, qsfp_resource(dd)); + } else { + /* not fatal, but should warn */ + dd_dev_err(dd, + "Unable to acquire lock to turn off QSFP TX\n"); + } + } + + /* + * The LNI has a mandatory wait time after the physical state + * moves to Offline.Quiet. The wait time may be different + * depending on how the link went down. The 8051 firmware + * will observe the needed wait time and only move to ready + * when that is completed. The largest of the quiet timeouts + * is 6s, so wait that long and then at least 0.5s more for + * other transitions, and another 0.5s for a buffer. + */ + ret = wait_fm_ready(dd, 7000); + if (ret) { + dd_dev_err(dd, + "After going offline, timed out waiting for the 8051 to become ready to accept host requests\n"); + /* state is really offline, so make it so */ + ppd->host_link_state = HLS_DN_OFFLINE; + return ret; + } + + /* + * The state is now offline and the 8051 is ready to accept host + * requests. + * - change our state + * - notify others if we were previously in a linkup state + */ + ppd->host_link_state = HLS_DN_OFFLINE; + if (previous_state & HLS_UP) { + /* went down while link was up */ + handle_linkup_change(dd, 0); + } else if (previous_state + & (HLS_DN_POLL | HLS_VERIFY_CAP | HLS_GOING_UP)) { + /* went down while attempting link up */ + /* byte 1 of last_*_state is the failure reason */ + read_last_local_state(dd, &last_local_state); + read_last_remote_state(dd, &last_remote_state); + dd_dev_err(dd, + "LNI failure last states: local 0x%08x, remote 0x%08x\n", + last_local_state, last_remote_state); + } + + /* the active link width (downgrade) is 0 on link down */ + ppd->link_width_active = 0; + ppd->link_width_downgrade_tx_active = 0; + ppd->link_width_downgrade_rx_active = 0; + ppd->current_egress_rate = 0; + return 0; +} + +/* return the link state name */ +static const char *link_state_name(u32 state) +{ + const char *name; + int n = ilog2(state); + static const char * const names[] = { + [__HLS_UP_INIT_BP] = "INIT", + [__HLS_UP_ARMED_BP] = "ARMED", + [__HLS_UP_ACTIVE_BP] = "ACTIVE", + [__HLS_DN_DOWNDEF_BP] = "DOWNDEF", + [__HLS_DN_POLL_BP] = "POLL", + [__HLS_DN_DISABLE_BP] = "DISABLE", + [__HLS_DN_OFFLINE_BP] = "OFFLINE", + [__HLS_VERIFY_CAP_BP] = "VERIFY_CAP", + [__HLS_GOING_UP_BP] = "GOING_UP", + [__HLS_GOING_OFFLINE_BP] = "GOING_OFFLINE", + [__HLS_LINK_COOLDOWN_BP] = "LINK_COOLDOWN" + }; + + name = n < ARRAY_SIZE(names) ? names[n] : NULL; + return name ? name : "unknown"; +} + +/* return the link state reason name */ +static const char *link_state_reason_name(struct hfi1_pportdata *ppd, u32 state) +{ + if (state == HLS_UP_INIT) { + switch (ppd->linkinit_reason) { + case OPA_LINKINIT_REASON_LINKUP: + return "(LINKUP)"; + case OPA_LINKINIT_REASON_FLAPPING: + return "(FLAPPING)"; + case OPA_LINKINIT_OUTSIDE_POLICY: + return "(OUTSIDE_POLICY)"; + case OPA_LINKINIT_QUARANTINED: + return "(QUARANTINED)"; + case OPA_LINKINIT_INSUFIC_CAPABILITY: + return "(INSUFIC_CAPABILITY)"; + default: + break; + } + } + return ""; +} + +/* + * driver_physical_state - convert the driver's notion of a port's + * state (an HLS_*) into a physical state (a {IB,OPA}_PORTPHYSSTATE_*). + * Return -1 (converted to a u32) to indicate error. + */ +u32 driver_physical_state(struct hfi1_pportdata *ppd) +{ + switch (ppd->host_link_state) { + case HLS_UP_INIT: + case HLS_UP_ARMED: + case HLS_UP_ACTIVE: + return IB_PORTPHYSSTATE_LINKUP; + case HLS_DN_POLL: + return IB_PORTPHYSSTATE_POLLING; + case HLS_DN_DISABLE: + return IB_PORTPHYSSTATE_DISABLED; + case HLS_DN_OFFLINE: + return OPA_PORTPHYSSTATE_OFFLINE; + case HLS_VERIFY_CAP: + return IB_PORTPHYSSTATE_POLLING; + case HLS_GOING_UP: + return IB_PORTPHYSSTATE_POLLING; + case HLS_GOING_OFFLINE: + return OPA_PORTPHYSSTATE_OFFLINE; + case HLS_LINK_COOLDOWN: + return OPA_PORTPHYSSTATE_OFFLINE; + case HLS_DN_DOWNDEF: + default: + dd_dev_err(ppd->dd, "invalid host_link_state 0x%x\n", + ppd->host_link_state); + return -1; + } +} + +/* + * driver_logical_state - convert the driver's notion of a port's + * state (an HLS_*) into a logical state (a IB_PORT_*). Return -1 + * (converted to a u32) to indicate error. + */ +u32 driver_logical_state(struct hfi1_pportdata *ppd) +{ + if (ppd->host_link_state && (ppd->host_link_state & HLS_DOWN)) + return IB_PORT_DOWN; + + switch (ppd->host_link_state & HLS_UP) { + case HLS_UP_INIT: + return IB_PORT_INIT; + case HLS_UP_ARMED: + return IB_PORT_ARMED; + case HLS_UP_ACTIVE: + return IB_PORT_ACTIVE; + default: + dd_dev_err(ppd->dd, "invalid host_link_state 0x%x\n", + ppd->host_link_state); + return -1; + } +} + +void set_link_down_reason(struct hfi1_pportdata *ppd, u8 lcl_reason, + u8 neigh_reason, u8 rem_reason) +{ + if (ppd->local_link_down_reason.latest == 0 && + ppd->neigh_link_down_reason.latest == 0) { + ppd->local_link_down_reason.latest = lcl_reason; + ppd->neigh_link_down_reason.latest = neigh_reason; + ppd->remote_link_down_reason = rem_reason; + } +} + +/* + * Change the physical and/or logical link state. + * + * Do not call this routine while inside an interrupt. It contains + * calls to routines that can take multiple seconds to finish. + * + * Returns 0 on success, -errno on failure. + */ +int set_link_state(struct hfi1_pportdata *ppd, u32 state) +{ + struct hfi1_devdata *dd = ppd->dd; + struct ib_event event = {.device = NULL}; + int ret1, ret = 0; + int orig_new_state, poll_bounce; + + mutex_lock(&ppd->hls_lock); + + orig_new_state = state; + if (state == HLS_DN_DOWNDEF) + state = dd->link_default; + + /* interpret poll -> poll as a link bounce */ + poll_bounce = ppd->host_link_state == HLS_DN_POLL && + state == HLS_DN_POLL; + + dd_dev_info(dd, "%s: current %s, new %s %s%s\n", __func__, + link_state_name(ppd->host_link_state), + link_state_name(orig_new_state), + poll_bounce ? "(bounce) " : "", + link_state_reason_name(ppd, state)); + + /* + * If we're going to a (HLS_*) link state that implies the logical + * link state is neither of (IB_PORT_ARMED, IB_PORT_ACTIVE), then + * reset is_sm_config_started to 0. + */ + if (!(state & (HLS_UP_ARMED | HLS_UP_ACTIVE))) + ppd->is_sm_config_started = 0; + + /* + * Do nothing if the states match. Let a poll to poll link bounce + * go through. + */ + if (ppd->host_link_state == state && !poll_bounce) + goto done; + + switch (state) { + case HLS_UP_INIT: + if (ppd->host_link_state == HLS_DN_POLL && + (quick_linkup || dd->icode == ICODE_FUNCTIONAL_SIMULATOR)) { + /* + * Quick link up jumps from polling to here. + * + * Whether in normal or loopback mode, the + * simulator jumps from polling to link up. + * Accept that here. + */ + /* OK */ + } else if (ppd->host_link_state != HLS_GOING_UP) { + goto unexpected; + } + + ppd->host_link_state = HLS_UP_INIT; + ret = wait_logical_linkstate(ppd, IB_PORT_INIT, 1000); + if (ret) { + /* logical state didn't change, stay at going_up */ + ppd->host_link_state = HLS_GOING_UP; + dd_dev_err(dd, + "%s: logical state did not change to INIT\n", + __func__); + } else { + /* clear old transient LINKINIT_REASON code */ + if (ppd->linkinit_reason >= OPA_LINKINIT_REASON_CLEAR) + ppd->linkinit_reason = + OPA_LINKINIT_REASON_LINKUP; + + /* enable the port */ + add_rcvctrl(dd, RCV_CTRL_RCV_PORT_ENABLE_SMASK); + + handle_linkup_change(dd, 1); + } + break; + case HLS_UP_ARMED: + if (ppd->host_link_state != HLS_UP_INIT) + goto unexpected; + + ppd->host_link_state = HLS_UP_ARMED; + set_logical_state(dd, LSTATE_ARMED); + ret = wait_logical_linkstate(ppd, IB_PORT_ARMED, 1000); + if (ret) { + /* logical state didn't change, stay at init */ + ppd->host_link_state = HLS_UP_INIT; + dd_dev_err(dd, + "%s: logical state did not change to ARMED\n", + __func__); + } + /* + * The simulator does not currently implement SMA messages, + * so neighbor_normal is not set. Set it here when we first + * move to Armed. + */ + if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR) + ppd->neighbor_normal = 1; + break; + case HLS_UP_ACTIVE: + if (ppd->host_link_state != HLS_UP_ARMED) + goto unexpected; + + ppd->host_link_state = HLS_UP_ACTIVE; + set_logical_state(dd, LSTATE_ACTIVE); + ret = wait_logical_linkstate(ppd, IB_PORT_ACTIVE, 1000); + if (ret) { + /* logical state didn't change, stay at armed */ + ppd->host_link_state = HLS_UP_ARMED; + dd_dev_err(dd, + "%s: logical state did not change to ACTIVE\n", + __func__); + } else { + /* tell all engines to go running */ + sdma_all_running(dd); + + /* Signal the IB layer that the port has went active */ + event.device = &dd->verbs_dev.rdi.ibdev; + event.element.port_num = ppd->port; + event.event = IB_EVENT_PORT_ACTIVE; + } + break; + case HLS_DN_POLL: + if ((ppd->host_link_state == HLS_DN_DISABLE || + ppd->host_link_state == HLS_DN_OFFLINE) && + dd->dc_shutdown) + dc_start(dd); + /* Hand LED control to the DC */ + write_csr(dd, DCC_CFG_LED_CNTRL, 0); + + if (ppd->host_link_state != HLS_DN_OFFLINE) { + u8 tmp = ppd->link_enabled; + + ret = goto_offline(ppd, ppd->remote_link_down_reason); + if (ret) { + ppd->link_enabled = tmp; + break; + } + ppd->remote_link_down_reason = 0; + + if (ppd->driver_link_ready) + ppd->link_enabled = 1; + } + + set_all_slowpath(ppd->dd); + ret = set_local_link_attributes(ppd); + if (ret) + break; + + ppd->port_error_action = 0; + ppd->host_link_state = HLS_DN_POLL; + + if (quick_linkup) { + /* quick linkup does not go into polling */ + ret = do_quick_linkup(dd); + } else { + ret1 = set_physical_link_state(dd, PLS_POLLING); + if (ret1 != HCMD_SUCCESS) { + dd_dev_err(dd, + "Failed to transition to Polling link state, return 0x%x\n", + ret1); + ret = -EINVAL; + } + } + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE); + /* + * If an error occurred above, go back to offline. The + * caller may reschedule another attempt. + */ + if (ret) + goto_offline(ppd, 0); + break; + case HLS_DN_DISABLE: + /* link is disabled */ + ppd->link_enabled = 0; + + /* allow any state to transition to disabled */ + + /* must transition to offline first */ + if (ppd->host_link_state != HLS_DN_OFFLINE) { + ret = goto_offline(ppd, ppd->remote_link_down_reason); + if (ret) + break; + ppd->remote_link_down_reason = 0; + } + + ret1 = set_physical_link_state(dd, PLS_DISABLED); + if (ret1 != HCMD_SUCCESS) { + dd_dev_err(dd, + "Failed to transition to Disabled link state, return 0x%x\n", + ret1); + ret = -EINVAL; + break; + } + ppd->host_link_state = HLS_DN_DISABLE; + dc_shutdown(dd); + break; + case HLS_DN_OFFLINE: + if (ppd->host_link_state == HLS_DN_DISABLE) + dc_start(dd); + + /* allow any state to transition to offline */ + ret = goto_offline(ppd, ppd->remote_link_down_reason); + if (!ret) + ppd->remote_link_down_reason = 0; + break; + case HLS_VERIFY_CAP: + if (ppd->host_link_state != HLS_DN_POLL) + goto unexpected; + ppd->host_link_state = HLS_VERIFY_CAP; + break; + case HLS_GOING_UP: + if (ppd->host_link_state != HLS_VERIFY_CAP) + goto unexpected; + + ret1 = set_physical_link_state(dd, PLS_LINKUP); + if (ret1 != HCMD_SUCCESS) { + dd_dev_err(dd, + "Failed to transition to link up state, return 0x%x\n", + ret1); + ret = -EINVAL; + break; + } + ppd->host_link_state = HLS_GOING_UP; + break; + + case HLS_GOING_OFFLINE: /* transient within goto_offline() */ + case HLS_LINK_COOLDOWN: /* transient within goto_offline() */ + default: + dd_dev_info(dd, "%s: state 0x%x: not supported\n", + __func__, state); + ret = -EINVAL; + break; + } + + goto done; + +unexpected: + dd_dev_err(dd, "%s: unexpected state transition from %s to %s\n", + __func__, link_state_name(ppd->host_link_state), + link_state_name(state)); + ret = -EINVAL; + +done: + mutex_unlock(&ppd->hls_lock); + + if (event.device) + ib_dispatch_event(&event); + + return ret; +} + +int hfi1_set_ib_cfg(struct hfi1_pportdata *ppd, int which, u32 val) +{ + u64 reg; + int ret = 0; + + switch (which) { + case HFI1_IB_CFG_LIDLMC: + set_lidlmc(ppd); + break; + case HFI1_IB_CFG_VL_HIGH_LIMIT: + /* + * The VL Arbitrator high limit is sent in units of 4k + * bytes, while HFI stores it in units of 64 bytes. + */ + val *= 4096 / 64; + reg = ((u64)val & SEND_HIGH_PRIORITY_LIMIT_LIMIT_MASK) + << SEND_HIGH_PRIORITY_LIMIT_LIMIT_SHIFT; + write_csr(ppd->dd, SEND_HIGH_PRIORITY_LIMIT, reg); + break; + case HFI1_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */ + /* HFI only supports POLL as the default link down state */ + if (val != HLS_DN_POLL) + ret = -EINVAL; + break; + case HFI1_IB_CFG_OP_VLS: + if (ppd->vls_operational != val) { + ppd->vls_operational = val; + if (!ppd->port) + ret = -EINVAL; + } + break; + /* + * For link width, link width downgrade, and speed enable, always AND + * the setting with what is actually supported. This has two benefits. + * First, enabled can't have unsupported values, no matter what the + * SM or FM might want. Second, the ALL_SUPPORTED wildcards that mean + * "fill in with your supported value" have all the bits in the + * field set, so simply ANDing with supported has the desired result. + */ + case HFI1_IB_CFG_LWID_ENB: /* set allowed Link-width */ + ppd->link_width_enabled = val & ppd->link_width_supported; + break; + case HFI1_IB_CFG_LWID_DG_ENB: /* set allowed link width downgrade */ + ppd->link_width_downgrade_enabled = + val & ppd->link_width_downgrade_supported; + break; + case HFI1_IB_CFG_SPD_ENB: /* allowed Link speeds */ + ppd->link_speed_enabled = val & ppd->link_speed_supported; + break; + case HFI1_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */ + /* + * HFI does not follow IB specs, save this value + * so we can report it, if asked. + */ + ppd->overrun_threshold = val; + break; + case HFI1_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */ + /* + * HFI does not follow IB specs, save this value + * so we can report it, if asked. + */ + ppd->phy_error_threshold = val; + break; + + case HFI1_IB_CFG_MTU: + set_send_length(ppd); + break; + + case HFI1_IB_CFG_PKEYS: + if (HFI1_CAP_IS_KSET(PKEY_CHECK)) + set_partition_keys(ppd); + break; + + default: + if (HFI1_CAP_IS_KSET(PRINT_UNIMPL)) + dd_dev_info(ppd->dd, + "%s: which %s, val 0x%x: not implemented\n", + __func__, ib_cfg_name(which), val); + break; + } + return ret; +} + +/* begin functions related to vl arbitration table caching */ +static void init_vl_arb_caches(struct hfi1_pportdata *ppd) +{ + int i; + + BUILD_BUG_ON(VL_ARB_TABLE_SIZE != + VL_ARB_LOW_PRIO_TABLE_SIZE); + BUILD_BUG_ON(VL_ARB_TABLE_SIZE != + VL_ARB_HIGH_PRIO_TABLE_SIZE); + + /* + * Note that we always return values directly from the + * 'vl_arb_cache' (and do no CSR reads) in response to a + * 'Get(VLArbTable)'. This is obviously correct after a + * 'Set(VLArbTable)', since the cache will then be up to + * date. But it's also correct prior to any 'Set(VLArbTable)' + * since then both the cache, and the relevant h/w registers + * will be zeroed. + */ + + for (i = 0; i < MAX_PRIO_TABLE; i++) + spin_lock_init(&ppd->vl_arb_cache[i].lock); +} + +/* + * vl_arb_lock_cache + * + * All other vl_arb_* functions should be called only after locking + * the cache. + */ +static inline struct vl_arb_cache * +vl_arb_lock_cache(struct hfi1_pportdata *ppd, int idx) +{ + if (idx != LO_PRIO_TABLE && idx != HI_PRIO_TABLE) + return NULL; + spin_lock(&ppd->vl_arb_cache[idx].lock); + return &ppd->vl_arb_cache[idx]; +} + +static inline void vl_arb_unlock_cache(struct hfi1_pportdata *ppd, int idx) +{ + spin_unlock(&ppd->vl_arb_cache[idx].lock); +} + +static void vl_arb_get_cache(struct vl_arb_cache *cache, + struct ib_vl_weight_elem *vl) +{ + memcpy(vl, cache->table, VL_ARB_TABLE_SIZE * sizeof(*vl)); +} + +static void vl_arb_set_cache(struct vl_arb_cache *cache, + struct ib_vl_weight_elem *vl) +{ + memcpy(cache->table, vl, VL_ARB_TABLE_SIZE * sizeof(*vl)); +} + +static int vl_arb_match_cache(struct vl_arb_cache *cache, + struct ib_vl_weight_elem *vl) +{ + return !memcmp(cache->table, vl, VL_ARB_TABLE_SIZE * sizeof(*vl)); +} + +/* end functions related to vl arbitration table caching */ + +static int set_vl_weights(struct hfi1_pportdata *ppd, u32 target, + u32 size, struct ib_vl_weight_elem *vl) +{ + struct hfi1_devdata *dd = ppd->dd; + u64 reg; + unsigned int i, is_up = 0; + int drain, ret = 0; + + mutex_lock(&ppd->hls_lock); + + if (ppd->host_link_state & HLS_UP) + is_up = 1; + + drain = !is_ax(dd) && is_up; + + if (drain) + /* + * Before adjusting VL arbitration weights, empty per-VL + * FIFOs, otherwise a packet whose VL weight is being + * set to 0 could get stuck in a FIFO with no chance to + * egress. + */ + ret = stop_drain_data_vls(dd); + + if (ret) { + dd_dev_err( + dd, + "%s: cannot stop/drain VLs - refusing to change VL arbitration weights\n", + __func__); + goto err; + } + + for (i = 0; i < size; i++, vl++) { + /* + * NOTE: The low priority shift and mask are used here, but + * they are the same for both the low and high registers. + */ + reg = (((u64)vl->vl & SEND_LOW_PRIORITY_LIST_VL_MASK) + << SEND_LOW_PRIORITY_LIST_VL_SHIFT) + | (((u64)vl->weight + & SEND_LOW_PRIORITY_LIST_WEIGHT_MASK) + << SEND_LOW_PRIORITY_LIST_WEIGHT_SHIFT); + write_csr(dd, target + (i * 8), reg); + } + pio_send_control(dd, PSC_GLOBAL_VLARB_ENABLE); + + if (drain) + open_fill_data_vls(dd); /* reopen all VLs */ + +err: + mutex_unlock(&ppd->hls_lock); + + return ret; +} + +/* + * Read one credit merge VL register. + */ +static void read_one_cm_vl(struct hfi1_devdata *dd, u32 csr, + struct vl_limit *vll) +{ + u64 reg = read_csr(dd, csr); + + vll->dedicated = cpu_to_be16( + (reg >> SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT) + & SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_MASK); + vll->shared = cpu_to_be16( + (reg >> SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SHIFT) + & SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_MASK); +} + +/* + * Read the current credit merge limits. + */ +static int get_buffer_control(struct hfi1_devdata *dd, + struct buffer_control *bc, u16 *overall_limit) +{ + u64 reg; + int i; + + /* not all entries are filled in */ + memset(bc, 0, sizeof(*bc)); + + /* OPA and HFI have a 1-1 mapping */ + for (i = 0; i < TXE_NUM_DATA_VL; i++) + read_one_cm_vl(dd, SEND_CM_CREDIT_VL + (8 * i), &bc->vl[i]); + + /* NOTE: assumes that VL* and VL15 CSRs are bit-wise identical */ + read_one_cm_vl(dd, SEND_CM_CREDIT_VL15, &bc->vl[15]); + + reg = read_csr(dd, SEND_CM_GLOBAL_CREDIT); + bc->overall_shared_limit = cpu_to_be16( + (reg >> SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT) + & SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_MASK); + if (overall_limit) + *overall_limit = (reg + >> SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT) + & SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_MASK; + return sizeof(struct buffer_control); +} + +static int get_sc2vlnt(struct hfi1_devdata *dd, struct sc2vlnt *dp) +{ + u64 reg; + int i; + + /* each register contains 16 SC->VLnt mappings, 4 bits each */ + reg = read_csr(dd, DCC_CFG_SC_VL_TABLE_15_0); + for (i = 0; i < sizeof(u64); i++) { + u8 byte = *(((u8 *)®) + i); + + dp->vlnt[2 * i] = byte & 0xf; + dp->vlnt[(2 * i) + 1] = (byte & 0xf0) >> 4; + } + + reg = read_csr(dd, DCC_CFG_SC_VL_TABLE_31_16); + for (i = 0; i < sizeof(u64); i++) { + u8 byte = *(((u8 *)®) + i); + + dp->vlnt[16 + (2 * i)] = byte & 0xf; + dp->vlnt[16 + (2 * i) + 1] = (byte & 0xf0) >> 4; + } + return sizeof(struct sc2vlnt); +} + +static void get_vlarb_preempt(struct hfi1_devdata *dd, u32 nelems, + struct ib_vl_weight_elem *vl) +{ + unsigned int i; + + for (i = 0; i < nelems; i++, vl++) { + vl->vl = 0xf; + vl->weight = 0; + } +} + +static void set_sc2vlnt(struct hfi1_devdata *dd, struct sc2vlnt *dp) +{ + write_csr(dd, DCC_CFG_SC_VL_TABLE_15_0, + DC_SC_VL_VAL(15_0, + 0, dp->vlnt[0] & 0xf, + 1, dp->vlnt[1] & 0xf, + 2, dp->vlnt[2] & 0xf, + 3, dp->vlnt[3] & 0xf, + 4, dp->vlnt[4] & 0xf, + 5, dp->vlnt[5] & 0xf, + 6, dp->vlnt[6] & 0xf, + 7, dp->vlnt[7] & 0xf, + 8, dp->vlnt[8] & 0xf, + 9, dp->vlnt[9] & 0xf, + 10, dp->vlnt[10] & 0xf, + 11, dp->vlnt[11] & 0xf, + 12, dp->vlnt[12] & 0xf, + 13, dp->vlnt[13] & 0xf, + 14, dp->vlnt[14] & 0xf, + 15, dp->vlnt[15] & 0xf)); + write_csr(dd, DCC_CFG_SC_VL_TABLE_31_16, + DC_SC_VL_VAL(31_16, + 16, dp->vlnt[16] & 0xf, + 17, dp->vlnt[17] & 0xf, + 18, dp->vlnt[18] & 0xf, + 19, dp->vlnt[19] & 0xf, + 20, dp->vlnt[20] & 0xf, + 21, dp->vlnt[21] & 0xf, + 22, dp->vlnt[22] & 0xf, + 23, dp->vlnt[23] & 0xf, + 24, dp->vlnt[24] & 0xf, + 25, dp->vlnt[25] & 0xf, + 26, dp->vlnt[26] & 0xf, + 27, dp->vlnt[27] & 0xf, + 28, dp->vlnt[28] & 0xf, + 29, dp->vlnt[29] & 0xf, + 30, dp->vlnt[30] & 0xf, + 31, dp->vlnt[31] & 0xf)); +} + +static void nonzero_msg(struct hfi1_devdata *dd, int idx, const char *what, + u16 limit) +{ + if (limit != 0) + dd_dev_info(dd, "Invalid %s limit %d on VL %d, ignoring\n", + what, (int)limit, idx); +} + +/* change only the shared limit portion of SendCmGLobalCredit */ +static void set_global_shared(struct hfi1_devdata *dd, u16 limit) +{ + u64 reg; + + reg = read_csr(dd, SEND_CM_GLOBAL_CREDIT); + reg &= ~SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SMASK; + reg |= (u64)limit << SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT; + write_csr(dd, SEND_CM_GLOBAL_CREDIT, reg); +} + +/* change only the total credit limit portion of SendCmGLobalCredit */ +static void set_global_limit(struct hfi1_devdata *dd, u16 limit) +{ + u64 reg; + + reg = read_csr(dd, SEND_CM_GLOBAL_CREDIT); + reg &= ~SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SMASK; + reg |= (u64)limit << SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT; + write_csr(dd, SEND_CM_GLOBAL_CREDIT, reg); +} + +/* set the given per-VL shared limit */ +static void set_vl_shared(struct hfi1_devdata *dd, int vl, u16 limit) +{ + u64 reg; + u32 addr; + + if (vl < TXE_NUM_DATA_VL) + addr = SEND_CM_CREDIT_VL + (8 * vl); + else + addr = SEND_CM_CREDIT_VL15; + + reg = read_csr(dd, addr); + reg &= ~SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SMASK; + reg |= (u64)limit << SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SHIFT; + write_csr(dd, addr, reg); +} + +/* set the given per-VL dedicated limit */ +static void set_vl_dedicated(struct hfi1_devdata *dd, int vl, u16 limit) +{ + u64 reg; + u32 addr; + + if (vl < TXE_NUM_DATA_VL) + addr = SEND_CM_CREDIT_VL + (8 * vl); + else + addr = SEND_CM_CREDIT_VL15; + + reg = read_csr(dd, addr); + reg &= ~SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SMASK; + reg |= (u64)limit << SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT; + write_csr(dd, addr, reg); +} + +/* spin until the given per-VL status mask bits clear */ +static void wait_for_vl_status_clear(struct hfi1_devdata *dd, u64 mask, + const char *which) +{ + unsigned long timeout; + u64 reg; + + timeout = jiffies + msecs_to_jiffies(VL_STATUS_CLEAR_TIMEOUT); + while (1) { + reg = read_csr(dd, SEND_CM_CREDIT_USED_STATUS) & mask; + + if (reg == 0) + return; /* success */ + if (time_after(jiffies, timeout)) + break; /* timed out */ + udelay(1); + } + + dd_dev_err(dd, + "%s credit change status not clearing after %dms, mask 0x%llx, not clear 0x%llx\n", + which, VL_STATUS_CLEAR_TIMEOUT, mask, reg); + /* + * If this occurs, it is likely there was a credit loss on the link. + * The only recovery from that is a link bounce. + */ + dd_dev_err(dd, + "Continuing anyway. A credit loss may occur. Suggest a link bounce\n"); +} + +/* + * The number of credits on the VLs may be changed while everything + * is "live", but the following algorithm must be followed due to + * how the hardware is actually implemented. In particular, + * Return_Credit_Status[] is the only correct status check. + * + * if (reducing Global_Shared_Credit_Limit or any shared limit changing) + * set Global_Shared_Credit_Limit = 0 + * use_all_vl = 1 + * mask0 = all VLs that are changing either dedicated or shared limits + * set Shared_Limit[mask0] = 0 + * spin until Return_Credit_Status[use_all_vl ? all VL : mask0] == 0 + * if (changing any dedicated limit) + * mask1 = all VLs that are lowering dedicated limits + * lower Dedicated_Limit[mask1] + * spin until Return_Credit_Status[mask1] == 0 + * raise Dedicated_Limits + * raise Shared_Limits + * raise Global_Shared_Credit_Limit + * + * lower = if the new limit is lower, set the limit to the new value + * raise = if the new limit is higher than the current value (may be changed + * earlier in the algorithm), set the new limit to the new value + */ +int set_buffer_control(struct hfi1_pportdata *ppd, + struct buffer_control *new_bc) +{ + struct hfi1_devdata *dd = ppd->dd; + u64 changing_mask, ld_mask, stat_mask; + int change_count; + int i, use_all_mask; + int this_shared_changing; + int vl_count = 0, ret; + /* + * A0: add the variable any_shared_limit_changing below and in the + * algorithm above. If removing A0 support, it can be removed. + */ + int any_shared_limit_changing; + struct buffer_control cur_bc; + u8 changing[OPA_MAX_VLS]; + u8 lowering_dedicated[OPA_MAX_VLS]; + u16 cur_total; + u32 new_total = 0; + const u64 all_mask = + SEND_CM_CREDIT_USED_STATUS_VL0_RETURN_CREDIT_STATUS_SMASK + | SEND_CM_CREDIT_USED_STATUS_VL1_RETURN_CREDIT_STATUS_SMASK + | SEND_CM_CREDIT_USED_STATUS_VL2_RETURN_CREDIT_STATUS_SMASK + | SEND_CM_CREDIT_USED_STATUS_VL3_RETURN_CREDIT_STATUS_SMASK + | SEND_CM_CREDIT_USED_STATUS_VL4_RETURN_CREDIT_STATUS_SMASK + | SEND_CM_CREDIT_USED_STATUS_VL5_RETURN_CREDIT_STATUS_SMASK + | SEND_CM_CREDIT_USED_STATUS_VL6_RETURN_CREDIT_STATUS_SMASK + | SEND_CM_CREDIT_USED_STATUS_VL7_RETURN_CREDIT_STATUS_SMASK + | SEND_CM_CREDIT_USED_STATUS_VL15_RETURN_CREDIT_STATUS_SMASK; + +#define valid_vl(idx) ((idx) < TXE_NUM_DATA_VL || (idx) == 15) +#define NUM_USABLE_VLS 16 /* look at VL15 and less */ + + /* find the new total credits, do sanity check on unused VLs */ + for (i = 0; i < OPA_MAX_VLS; i++) { + if (valid_vl(i)) { + new_total += be16_to_cpu(new_bc->vl[i].dedicated); + continue; + } + nonzero_msg(dd, i, "dedicated", + be16_to_cpu(new_bc->vl[i].dedicated)); + nonzero_msg(dd, i, "shared", + be16_to_cpu(new_bc->vl[i].shared)); + new_bc->vl[i].dedicated = 0; + new_bc->vl[i].shared = 0; + } + new_total += be16_to_cpu(new_bc->overall_shared_limit); + + /* fetch the current values */ + get_buffer_control(dd, &cur_bc, &cur_total); + + /* + * Create the masks we will use. + */ + memset(changing, 0, sizeof(changing)); + memset(lowering_dedicated, 0, sizeof(lowering_dedicated)); + /* + * NOTE: Assumes that the individual VL bits are adjacent and in + * increasing order + */ + stat_mask = + SEND_CM_CREDIT_USED_STATUS_VL0_RETURN_CREDIT_STATUS_SMASK; + changing_mask = 0; + ld_mask = 0; + change_count = 0; + any_shared_limit_changing = 0; + for (i = 0; i < NUM_USABLE_VLS; i++, stat_mask <<= 1) { + if (!valid_vl(i)) + continue; + this_shared_changing = new_bc->vl[i].shared + != cur_bc.vl[i].shared; + if (this_shared_changing) + any_shared_limit_changing = 1; + if (new_bc->vl[i].dedicated != cur_bc.vl[i].dedicated || + this_shared_changing) { + changing[i] = 1; + changing_mask |= stat_mask; + change_count++; + } + if (be16_to_cpu(new_bc->vl[i].dedicated) < + be16_to_cpu(cur_bc.vl[i].dedicated)) { + lowering_dedicated[i] = 1; + ld_mask |= stat_mask; + } + } + + /* bracket the credit change with a total adjustment */ + if (new_total > cur_total) + set_global_limit(dd, new_total); + + /* + * Start the credit change algorithm. + */ + use_all_mask = 0; + if ((be16_to_cpu(new_bc->overall_shared_limit) < + be16_to_cpu(cur_bc.overall_shared_limit)) || + (is_ax(dd) && any_shared_limit_changing)) { + set_global_shared(dd, 0); + cur_bc.overall_shared_limit = 0; + use_all_mask = 1; + } + + for (i = 0; i < NUM_USABLE_VLS; i++) { + if (!valid_vl(i)) + continue; + + if (changing[i]) { + set_vl_shared(dd, i, 0); + cur_bc.vl[i].shared = 0; + } + } + + wait_for_vl_status_clear(dd, use_all_mask ? all_mask : changing_mask, + "shared"); + + if (change_count > 0) { + for (i = 0; i < NUM_USABLE_VLS; i++) { + if (!valid_vl(i)) + continue; + + if (lowering_dedicated[i]) { + set_vl_dedicated(dd, i, + be16_to_cpu(new_bc-> + vl[i].dedicated)); + cur_bc.vl[i].dedicated = + new_bc->vl[i].dedicated; + } + } + + wait_for_vl_status_clear(dd, ld_mask, "dedicated"); + + /* now raise all dedicated that are going up */ + for (i = 0; i < NUM_USABLE_VLS; i++) { + if (!valid_vl(i)) + continue; + + if (be16_to_cpu(new_bc->vl[i].dedicated) > + be16_to_cpu(cur_bc.vl[i].dedicated)) + set_vl_dedicated(dd, i, + be16_to_cpu(new_bc-> + vl[i].dedicated)); + } + } + + /* next raise all shared that are going up */ + for (i = 0; i < NUM_USABLE_VLS; i++) { + if (!valid_vl(i)) + continue; + + if (be16_to_cpu(new_bc->vl[i].shared) > + be16_to_cpu(cur_bc.vl[i].shared)) + set_vl_shared(dd, i, be16_to_cpu(new_bc->vl[i].shared)); + } + + /* finally raise the global shared */ + if (be16_to_cpu(new_bc->overall_shared_limit) > + be16_to_cpu(cur_bc.overall_shared_limit)) + set_global_shared(dd, + be16_to_cpu(new_bc->overall_shared_limit)); + + /* bracket the credit change with a total adjustment */ + if (new_total < cur_total) + set_global_limit(dd, new_total); + + /* + * Determine the actual number of operational VLS using the number of + * dedicated and shared credits for each VL. + */ + if (change_count > 0) { + for (i = 0; i < TXE_NUM_DATA_VL; i++) + if (be16_to_cpu(new_bc->vl[i].dedicated) > 0 || + be16_to_cpu(new_bc->vl[i].shared) > 0) + vl_count++; + ppd->actual_vls_operational = vl_count; + ret = sdma_map_init(dd, ppd->port - 1, vl_count ? + ppd->actual_vls_operational : + ppd->vls_operational, + NULL); + if (ret == 0) + ret = pio_map_init(dd, ppd->port - 1, vl_count ? + ppd->actual_vls_operational : + ppd->vls_operational, NULL); + if (ret) + return ret; + } + return 0; +} + +/* + * Read the given fabric manager table. Return the size of the + * table (in bytes) on success, and a negative error code on + * failure. + */ +int fm_get_table(struct hfi1_pportdata *ppd, int which, void *t) + +{ + int size; + struct vl_arb_cache *vlc; + + switch (which) { + case FM_TBL_VL_HIGH_ARB: + size = 256; + /* + * OPA specifies 128 elements (of 2 bytes each), though + * HFI supports only 16 elements in h/w. + */ + vlc = vl_arb_lock_cache(ppd, HI_PRIO_TABLE); + vl_arb_get_cache(vlc, t); + vl_arb_unlock_cache(ppd, HI_PRIO_TABLE); + break; + case FM_TBL_VL_LOW_ARB: + size = 256; + /* + * OPA specifies 128 elements (of 2 bytes each), though + * HFI supports only 16 elements in h/w. + */ + vlc = vl_arb_lock_cache(ppd, LO_PRIO_TABLE); + vl_arb_get_cache(vlc, t); + vl_arb_unlock_cache(ppd, LO_PRIO_TABLE); + break; + case FM_TBL_BUFFER_CONTROL: + size = get_buffer_control(ppd->dd, t, NULL); + break; + case FM_TBL_SC2VLNT: + size = get_sc2vlnt(ppd->dd, t); + break; + case FM_TBL_VL_PREEMPT_ELEMS: + size = 256; + /* OPA specifies 128 elements, of 2 bytes each */ + get_vlarb_preempt(ppd->dd, OPA_MAX_VLS, t); + break; + case FM_TBL_VL_PREEMPT_MATRIX: + size = 256; + /* + * OPA specifies that this is the same size as the VL + * arbitration tables (i.e., 256 bytes). + */ + break; + default: + return -EINVAL; + } + return size; +} + +/* + * Write the given fabric manager table. + */ +int fm_set_table(struct hfi1_pportdata *ppd, int which, void *t) +{ + int ret = 0; + struct vl_arb_cache *vlc; + + switch (which) { + case FM_TBL_VL_HIGH_ARB: + vlc = vl_arb_lock_cache(ppd, HI_PRIO_TABLE); + if (vl_arb_match_cache(vlc, t)) { + vl_arb_unlock_cache(ppd, HI_PRIO_TABLE); + break; + } + vl_arb_set_cache(vlc, t); + vl_arb_unlock_cache(ppd, HI_PRIO_TABLE); + ret = set_vl_weights(ppd, SEND_HIGH_PRIORITY_LIST, + VL_ARB_HIGH_PRIO_TABLE_SIZE, t); + break; + case FM_TBL_VL_LOW_ARB: + vlc = vl_arb_lock_cache(ppd, LO_PRIO_TABLE); + if (vl_arb_match_cache(vlc, t)) { + vl_arb_unlock_cache(ppd, LO_PRIO_TABLE); + break; + } + vl_arb_set_cache(vlc, t); + vl_arb_unlock_cache(ppd, LO_PRIO_TABLE); + ret = set_vl_weights(ppd, SEND_LOW_PRIORITY_LIST, + VL_ARB_LOW_PRIO_TABLE_SIZE, t); + break; + case FM_TBL_BUFFER_CONTROL: + ret = set_buffer_control(ppd, t); + break; + case FM_TBL_SC2VLNT: + set_sc2vlnt(ppd->dd, t); + break; + default: + ret = -EINVAL; + } + return ret; +} + +/* + * Disable all data VLs. + * + * Return 0 if disabled, non-zero if the VLs cannot be disabled. + */ +static int disable_data_vls(struct hfi1_devdata *dd) +{ + if (is_ax(dd)) + return 1; + + pio_send_control(dd, PSC_DATA_VL_DISABLE); + + return 0; +} + +/* + * open_fill_data_vls() - the counterpart to stop_drain_data_vls(). + * Just re-enables all data VLs (the "fill" part happens + * automatically - the name was chosen for symmetry with + * stop_drain_data_vls()). + * + * Return 0 if successful, non-zero if the VLs cannot be enabled. + */ +int open_fill_data_vls(struct hfi1_devdata *dd) +{ + if (is_ax(dd)) + return 1; + + pio_send_control(dd, PSC_DATA_VL_ENABLE); + + return 0; +} + +/* + * drain_data_vls() - assumes that disable_data_vls() has been called, + * wait for occupancy (of per-VL FIFOs) for all contexts, and SDMA + * engines to drop to 0. + */ +static void drain_data_vls(struct hfi1_devdata *dd) +{ + sc_wait(dd); + sdma_wait(dd); + pause_for_credit_return(dd); +} + +/* + * stop_drain_data_vls() - disable, then drain all per-VL fifos. + * + * Use open_fill_data_vls() to resume using data VLs. This pair is + * meant to be used like this: + * + * stop_drain_data_vls(dd); + * // do things with per-VL resources + * open_fill_data_vls(dd); + */ +int stop_drain_data_vls(struct hfi1_devdata *dd) +{ + int ret; + + ret = disable_data_vls(dd); + if (ret == 0) + drain_data_vls(dd); + + return ret; +} + +/* + * Convert a nanosecond time to a cclock count. No matter how slow + * the cclock, a non-zero ns will always have a non-zero result. + */ +u32 ns_to_cclock(struct hfi1_devdata *dd, u32 ns) +{ + u32 cclocks; + + if (dd->icode == ICODE_FPGA_EMULATION) + cclocks = (ns * 1000) / FPGA_CCLOCK_PS; + else /* simulation pretends to be ASIC */ + cclocks = (ns * 1000) / ASIC_CCLOCK_PS; + if (ns && !cclocks) /* if ns nonzero, must be at least 1 */ + cclocks = 1; + return cclocks; +} + +/* + * Convert a cclock count to nanoseconds. Not matter how slow + * the cclock, a non-zero cclocks will always have a non-zero result. + */ +u32 cclock_to_ns(struct hfi1_devdata *dd, u32 cclocks) +{ + u32 ns; + + if (dd->icode == ICODE_FPGA_EMULATION) + ns = (cclocks * FPGA_CCLOCK_PS) / 1000; + else /* simulation pretends to be ASIC */ + ns = (cclocks * ASIC_CCLOCK_PS) / 1000; + if (cclocks && !ns) + ns = 1; + return ns; +} + +/* + * Dynamically adjust the receive interrupt timeout for a context based on + * incoming packet rate. + * + * NOTE: Dynamic adjustment does not allow rcv_intr_count to be zero. + */ +static void adjust_rcv_timeout(struct hfi1_ctxtdata *rcd, u32 npkts) +{ + struct hfi1_devdata *dd = rcd->dd; + u32 timeout = rcd->rcvavail_timeout; + + /* + * This algorithm doubles or halves the timeout depending on whether + * the number of packets received in this interrupt were less than or + * greater equal the interrupt count. + * + * The calculations below do not allow a steady state to be achieved. + * Only at the endpoints it is possible to have an unchanging + * timeout. + */ + if (npkts < rcv_intr_count) { + /* + * Not enough packets arrived before the timeout, adjust + * timeout downward. + */ + if (timeout < 2) /* already at minimum? */ + return; + timeout >>= 1; + } else { + /* + * More than enough packets arrived before the timeout, adjust + * timeout upward. + */ + if (timeout >= dd->rcv_intr_timeout_csr) /* already at max? */ + return; + timeout = min(timeout << 1, dd->rcv_intr_timeout_csr); + } + + rcd->rcvavail_timeout = timeout; + /* + * timeout cannot be larger than rcv_intr_timeout_csr which has already + * been verified to be in range + */ + write_kctxt_csr(dd, rcd->ctxt, RCV_AVAIL_TIME_OUT, + (u64)timeout << + RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT); +} + +void update_usrhead(struct hfi1_ctxtdata *rcd, u32 hd, u32 updegr, u32 egrhd, + u32 intr_adjust, u32 npkts) +{ + struct hfi1_devdata *dd = rcd->dd; + u64 reg; + u32 ctxt = rcd->ctxt; + + /* + * Need to write timeout register before updating RcvHdrHead to ensure + * that a new value is used when the HW decides to restart counting. + */ + if (intr_adjust) + adjust_rcv_timeout(rcd, npkts); + if (updegr) { + reg = (egrhd & RCV_EGR_INDEX_HEAD_HEAD_MASK) + << RCV_EGR_INDEX_HEAD_HEAD_SHIFT; + write_uctxt_csr(dd, ctxt, RCV_EGR_INDEX_HEAD, reg); + } + mmiowb(); + reg = ((u64)rcv_intr_count << RCV_HDR_HEAD_COUNTER_SHIFT) | + (((u64)hd & RCV_HDR_HEAD_HEAD_MASK) + << RCV_HDR_HEAD_HEAD_SHIFT); + write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, reg); + mmiowb(); +} + +u32 hdrqempty(struct hfi1_ctxtdata *rcd) +{ + u32 head, tail; + + head = (read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_HEAD) + & RCV_HDR_HEAD_HEAD_SMASK) >> RCV_HDR_HEAD_HEAD_SHIFT; + + if (rcd->rcvhdrtail_kvaddr) + tail = get_rcvhdrtail(rcd); + else + tail = read_uctxt_csr(rcd->dd, rcd->ctxt, RCV_HDR_TAIL); + + return head == tail; +} + +/* + * Context Control and Receive Array encoding for buffer size: + * 0x0 invalid + * 0x1 4 KB + * 0x2 8 KB + * 0x3 16 KB + * 0x4 32 KB + * 0x5 64 KB + * 0x6 128 KB + * 0x7 256 KB + * 0x8 512 KB (Receive Array only) + * 0x9 1 MB (Receive Array only) + * 0xa 2 MB (Receive Array only) + * + * 0xB-0xF - reserved (Receive Array only) + * + * + * This routine assumes that the value has already been sanity checked. + */ +static u32 encoded_size(u32 size) +{ + switch (size) { + case 4 * 1024: return 0x1; + case 8 * 1024: return 0x2; + case 16 * 1024: return 0x3; + case 32 * 1024: return 0x4; + case 64 * 1024: return 0x5; + case 128 * 1024: return 0x6; + case 256 * 1024: return 0x7; + case 512 * 1024: return 0x8; + case 1 * 1024 * 1024: return 0x9; + case 2 * 1024 * 1024: return 0xa; + } + return 0x1; /* if invalid, go with the minimum size */ +} + +void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, int ctxt) +{ + struct hfi1_ctxtdata *rcd; + u64 rcvctrl, reg; + int did_enable = 0; + + rcd = dd->rcd[ctxt]; + if (!rcd) + return; + + hfi1_cdbg(RCVCTRL, "ctxt %d op 0x%x", ctxt, op); + + rcvctrl = read_kctxt_csr(dd, ctxt, RCV_CTXT_CTRL); + /* if the context already enabled, don't do the extra steps */ + if ((op & HFI1_RCVCTRL_CTXT_ENB) && + !(rcvctrl & RCV_CTXT_CTRL_ENABLE_SMASK)) { + /* reset the tail and hdr addresses, and sequence count */ + write_kctxt_csr(dd, ctxt, RCV_HDR_ADDR, + rcd->rcvhdrq_phys); + if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) + write_kctxt_csr(dd, ctxt, RCV_HDR_TAIL_ADDR, + rcd->rcvhdrqtailaddr_phys); + rcd->seq_cnt = 1; + + /* reset the cached receive header queue head value */ + rcd->head = 0; + + /* + * Zero the receive header queue so we don't get false + * positives when checking the sequence number. The + * sequence numbers could land exactly on the same spot. + * E.g. a rcd restart before the receive header wrapped. + */ + memset(rcd->rcvhdrq, 0, rcd->rcvhdrq_size); + + /* starting timeout */ + rcd->rcvavail_timeout = dd->rcv_intr_timeout_csr; + + /* enable the context */ + rcvctrl |= RCV_CTXT_CTRL_ENABLE_SMASK; + + /* clean the egr buffer size first */ + rcvctrl &= ~RCV_CTXT_CTRL_EGR_BUF_SIZE_SMASK; + rcvctrl |= ((u64)encoded_size(rcd->egrbufs.rcvtid_size) + & RCV_CTXT_CTRL_EGR_BUF_SIZE_MASK) + << RCV_CTXT_CTRL_EGR_BUF_SIZE_SHIFT; + + /* zero RcvHdrHead - set RcvHdrHead.Counter after enable */ + write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, 0); + did_enable = 1; + + /* zero RcvEgrIndexHead */ + write_uctxt_csr(dd, ctxt, RCV_EGR_INDEX_HEAD, 0); + + /* set eager count and base index */ + reg = (((u64)(rcd->egrbufs.alloced >> RCV_SHIFT) + & RCV_EGR_CTRL_EGR_CNT_MASK) + << RCV_EGR_CTRL_EGR_CNT_SHIFT) | + (((rcd->eager_base >> RCV_SHIFT) + & RCV_EGR_CTRL_EGR_BASE_INDEX_MASK) + << RCV_EGR_CTRL_EGR_BASE_INDEX_SHIFT); + write_kctxt_csr(dd, ctxt, RCV_EGR_CTRL, reg); + + /* + * Set TID (expected) count and base index. + * rcd->expected_count is set to individual RcvArray entries, + * not pairs, and the CSR takes a pair-count in groups of + * four, so divide by 8. + */ + reg = (((rcd->expected_count >> RCV_SHIFT) + & RCV_TID_CTRL_TID_PAIR_CNT_MASK) + << RCV_TID_CTRL_TID_PAIR_CNT_SHIFT) | + (((rcd->expected_base >> RCV_SHIFT) + & RCV_TID_CTRL_TID_BASE_INDEX_MASK) + << RCV_TID_CTRL_TID_BASE_INDEX_SHIFT); + write_kctxt_csr(dd, ctxt, RCV_TID_CTRL, reg); + if (ctxt == HFI1_CTRL_CTXT) + write_csr(dd, RCV_VL15, HFI1_CTRL_CTXT); + } + if (op & HFI1_RCVCTRL_CTXT_DIS) { + write_csr(dd, RCV_VL15, 0); + /* + * When receive context is being disabled turn on tail + * update with a dummy tail address and then disable + * receive context. + */ + if (dd->rcvhdrtail_dummy_physaddr) { + write_kctxt_csr(dd, ctxt, RCV_HDR_TAIL_ADDR, + dd->rcvhdrtail_dummy_physaddr); + /* Enabling RcvCtxtCtrl.TailUpd is intentional. */ + rcvctrl |= RCV_CTXT_CTRL_TAIL_UPD_SMASK; + } + + rcvctrl &= ~RCV_CTXT_CTRL_ENABLE_SMASK; + } + if (op & HFI1_RCVCTRL_INTRAVAIL_ENB) + rcvctrl |= RCV_CTXT_CTRL_INTR_AVAIL_SMASK; + if (op & HFI1_RCVCTRL_INTRAVAIL_DIS) + rcvctrl &= ~RCV_CTXT_CTRL_INTR_AVAIL_SMASK; + if (op & HFI1_RCVCTRL_TAILUPD_ENB && rcd->rcvhdrqtailaddr_phys) + rcvctrl |= RCV_CTXT_CTRL_TAIL_UPD_SMASK; + if (op & HFI1_RCVCTRL_TAILUPD_DIS) { + /* See comment on RcvCtxtCtrl.TailUpd above */ + if (!(op & HFI1_RCVCTRL_CTXT_DIS)) + rcvctrl &= ~RCV_CTXT_CTRL_TAIL_UPD_SMASK; + } + if (op & HFI1_RCVCTRL_TIDFLOW_ENB) + rcvctrl |= RCV_CTXT_CTRL_TID_FLOW_ENABLE_SMASK; + if (op & HFI1_RCVCTRL_TIDFLOW_DIS) + rcvctrl &= ~RCV_CTXT_CTRL_TID_FLOW_ENABLE_SMASK; + if (op & HFI1_RCVCTRL_ONE_PKT_EGR_ENB) { + /* + * In one-packet-per-eager mode, the size comes from + * the RcvArray entry. + */ + rcvctrl &= ~RCV_CTXT_CTRL_EGR_BUF_SIZE_SMASK; + rcvctrl |= RCV_CTXT_CTRL_ONE_PACKET_PER_EGR_BUFFER_SMASK; + } + if (op & HFI1_RCVCTRL_ONE_PKT_EGR_DIS) + rcvctrl &= ~RCV_CTXT_CTRL_ONE_PACKET_PER_EGR_BUFFER_SMASK; + if (op & HFI1_RCVCTRL_NO_RHQ_DROP_ENB) + rcvctrl |= RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK; + if (op & HFI1_RCVCTRL_NO_RHQ_DROP_DIS) + rcvctrl &= ~RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK; + if (op & HFI1_RCVCTRL_NO_EGR_DROP_ENB) + rcvctrl |= RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK; + if (op & HFI1_RCVCTRL_NO_EGR_DROP_DIS) + rcvctrl &= ~RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK; + rcd->rcvctrl = rcvctrl; + hfi1_cdbg(RCVCTRL, "ctxt %d rcvctrl 0x%llx\n", ctxt, rcvctrl); + write_kctxt_csr(dd, ctxt, RCV_CTXT_CTRL, rcd->rcvctrl); + + /* work around sticky RcvCtxtStatus.BlockedRHQFull */ + if (did_enable && + (rcvctrl & RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK)) { + reg = read_kctxt_csr(dd, ctxt, RCV_CTXT_STATUS); + if (reg != 0) { + dd_dev_info(dd, "ctxt %d status %lld (blocked)\n", + ctxt, reg); + read_uctxt_csr(dd, ctxt, RCV_HDR_HEAD); + write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, 0x10); + write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, 0x00); + read_uctxt_csr(dd, ctxt, RCV_HDR_HEAD); + reg = read_kctxt_csr(dd, ctxt, RCV_CTXT_STATUS); + dd_dev_info(dd, "ctxt %d status %lld (%s blocked)\n", + ctxt, reg, reg == 0 ? "not" : "still"); + } + } + + if (did_enable) { + /* + * The interrupt timeout and count must be set after + * the context is enabled to take effect. + */ + /* set interrupt timeout */ + write_kctxt_csr(dd, ctxt, RCV_AVAIL_TIME_OUT, + (u64)rcd->rcvavail_timeout << + RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT); + + /* set RcvHdrHead.Counter, zero RcvHdrHead.Head (again) */ + reg = (u64)rcv_intr_count << RCV_HDR_HEAD_COUNTER_SHIFT; + write_uctxt_csr(dd, ctxt, RCV_HDR_HEAD, reg); + } + + if (op & (HFI1_RCVCTRL_TAILUPD_DIS | HFI1_RCVCTRL_CTXT_DIS)) + /* + * If the context has been disabled and the Tail Update has + * been cleared, set the RCV_HDR_TAIL_ADDR CSR to dummy address + * so it doesn't contain an address that is invalid. + */ + write_kctxt_csr(dd, ctxt, RCV_HDR_TAIL_ADDR, + dd->rcvhdrtail_dummy_physaddr); +} + +u32 hfi1_read_cntrs(struct hfi1_devdata *dd, char **namep, u64 **cntrp) +{ + int ret; + u64 val = 0; + + if (namep) { + ret = dd->cntrnameslen; + *namep = dd->cntrnames; + } else { + const struct cntr_entry *entry; + int i, j; + + ret = (dd->ndevcntrs) * sizeof(u64); + + /* Get the start of the block of counters */ + *cntrp = dd->cntrs; + + /* + * Now go and fill in each counter in the block. + */ + for (i = 0; i < DEV_CNTR_LAST; i++) { + entry = &dev_cntrs[i]; + hfi1_cdbg(CNTR, "reading %s", entry->name); + if (entry->flags & CNTR_DISABLED) { + /* Nothing */ + hfi1_cdbg(CNTR, "\tDisabled\n"); + } else { + if (entry->flags & CNTR_VL) { + hfi1_cdbg(CNTR, "\tPer VL\n"); + for (j = 0; j < C_VL_COUNT; j++) { + val = entry->rw_cntr(entry, + dd, j, + CNTR_MODE_R, + 0); + hfi1_cdbg( + CNTR, + "\t\tRead 0x%llx for %d\n", + val, j); + dd->cntrs[entry->offset + j] = + val; + } + } else if (entry->flags & CNTR_SDMA) { + hfi1_cdbg(CNTR, + "\t Per SDMA Engine\n"); + for (j = 0; j < dd->chip_sdma_engines; + j++) { + val = + entry->rw_cntr(entry, dd, j, + CNTR_MODE_R, 0); + hfi1_cdbg(CNTR, + "\t\tRead 0x%llx for %d\n", + val, j); + dd->cntrs[entry->offset + j] = + val; + } + } else { + val = entry->rw_cntr(entry, dd, + CNTR_INVALID_VL, + CNTR_MODE_R, 0); + dd->cntrs[entry->offset] = val; + hfi1_cdbg(CNTR, "\tRead 0x%llx", val); + } + } + } + } + return ret; +} + +/* + * Used by sysfs to create files for hfi stats to read + */ +u32 hfi1_read_portcntrs(struct hfi1_pportdata *ppd, char **namep, u64 **cntrp) +{ + int ret; + u64 val = 0; + + if (namep) { + ret = ppd->dd->portcntrnameslen; + *namep = ppd->dd->portcntrnames; + } else { + const struct cntr_entry *entry; + int i, j; + + ret = ppd->dd->nportcntrs * sizeof(u64); + *cntrp = ppd->cntrs; + + for (i = 0; i < PORT_CNTR_LAST; i++) { + entry = &port_cntrs[i]; + hfi1_cdbg(CNTR, "reading %s", entry->name); + if (entry->flags & CNTR_DISABLED) { + /* Nothing */ + hfi1_cdbg(CNTR, "\tDisabled\n"); + continue; + } + + if (entry->flags & CNTR_VL) { + hfi1_cdbg(CNTR, "\tPer VL"); + for (j = 0; j < C_VL_COUNT; j++) { + val = entry->rw_cntr(entry, ppd, j, + CNTR_MODE_R, + 0); + hfi1_cdbg( + CNTR, + "\t\tRead 0x%llx for %d", + val, j); + ppd->cntrs[entry->offset + j] = val; + } + } else { + val = entry->rw_cntr(entry, ppd, + CNTR_INVALID_VL, + CNTR_MODE_R, + 0); + ppd->cntrs[entry->offset] = val; + hfi1_cdbg(CNTR, "\tRead 0x%llx", val); + } + } + } + return ret; +} + +static void free_cntrs(struct hfi1_devdata *dd) +{ + struct hfi1_pportdata *ppd; + int i; + + if (dd->synth_stats_timer.data) + del_timer_sync(&dd->synth_stats_timer); + dd->synth_stats_timer.data = 0; + ppd = (struct hfi1_pportdata *)(dd + 1); + for (i = 0; i < dd->num_pports; i++, ppd++) { + kfree(ppd->cntrs); + kfree(ppd->scntrs); + free_percpu(ppd->ibport_data.rvp.rc_acks); + free_percpu(ppd->ibport_data.rvp.rc_qacks); + free_percpu(ppd->ibport_data.rvp.rc_delayed_comp); + ppd->cntrs = NULL; + ppd->scntrs = NULL; + ppd->ibport_data.rvp.rc_acks = NULL; + ppd->ibport_data.rvp.rc_qacks = NULL; + ppd->ibport_data.rvp.rc_delayed_comp = NULL; + } + kfree(dd->portcntrnames); + dd->portcntrnames = NULL; + kfree(dd->cntrs); + dd->cntrs = NULL; + kfree(dd->scntrs); + dd->scntrs = NULL; + kfree(dd->cntrnames); + dd->cntrnames = NULL; +} + +#define CNTR_MAX 0xFFFFFFFFFFFFFFFFULL +#define CNTR_32BIT_MAX 0x00000000FFFFFFFF + +static u64 read_dev_port_cntr(struct hfi1_devdata *dd, struct cntr_entry *entry, + u64 *psval, void *context, int vl) +{ + u64 val; + u64 sval = *psval; + + if (entry->flags & CNTR_DISABLED) { + dd_dev_err(dd, "Counter %s not enabled", entry->name); + return 0; + } + + hfi1_cdbg(CNTR, "cntr: %s vl %d psval 0x%llx", entry->name, vl, *psval); + + val = entry->rw_cntr(entry, context, vl, CNTR_MODE_R, 0); + + /* If its a synthetic counter there is more work we need to do */ + if (entry->flags & CNTR_SYNTH) { + if (sval == CNTR_MAX) { + /* No need to read already saturated */ + return CNTR_MAX; + } + + if (entry->flags & CNTR_32BIT) { + /* 32bit counters can wrap multiple times */ + u64 upper = sval >> 32; + u64 lower = (sval << 32) >> 32; + + if (lower > val) { /* hw wrapped */ + if (upper == CNTR_32BIT_MAX) + val = CNTR_MAX; + else + upper++; + } + + if (val != CNTR_MAX) + val = (upper << 32) | val; + + } else { + /* If we rolled we are saturated */ + if ((val < sval) || (val > CNTR_MAX)) + val = CNTR_MAX; + } + } + + *psval = val; + + hfi1_cdbg(CNTR, "\tNew val=0x%llx", val); + + return val; +} + +static u64 write_dev_port_cntr(struct hfi1_devdata *dd, + struct cntr_entry *entry, + u64 *psval, void *context, int vl, u64 data) +{ + u64 val; + + if (entry->flags & CNTR_DISABLED) { + dd_dev_err(dd, "Counter %s not enabled", entry->name); + return 0; + } + + hfi1_cdbg(CNTR, "cntr: %s vl %d psval 0x%llx", entry->name, vl, *psval); + + if (entry->flags & CNTR_SYNTH) { + *psval = data; + if (entry->flags & CNTR_32BIT) { + val = entry->rw_cntr(entry, context, vl, CNTR_MODE_W, + (data << 32) >> 32); + val = data; /* return the full 64bit value */ + } else { + val = entry->rw_cntr(entry, context, vl, CNTR_MODE_W, + data); + } + } else { + val = entry->rw_cntr(entry, context, vl, CNTR_MODE_W, data); + } + + *psval = val; + + hfi1_cdbg(CNTR, "\tNew val=0x%llx", val); + + return val; +} + +u64 read_dev_cntr(struct hfi1_devdata *dd, int index, int vl) +{ + struct cntr_entry *entry; + u64 *sval; + + entry = &dev_cntrs[index]; + sval = dd->scntrs + entry->offset; + + if (vl != CNTR_INVALID_VL) + sval += vl; + + return read_dev_port_cntr(dd, entry, sval, dd, vl); +} + +u64 write_dev_cntr(struct hfi1_devdata *dd, int index, int vl, u64 data) +{ + struct cntr_entry *entry; + u64 *sval; + + entry = &dev_cntrs[index]; + sval = dd->scntrs + entry->offset; + + if (vl != CNTR_INVALID_VL) + sval += vl; + + return write_dev_port_cntr(dd, entry, sval, dd, vl, data); +} + +u64 read_port_cntr(struct hfi1_pportdata *ppd, int index, int vl) +{ + struct cntr_entry *entry; + u64 *sval; + + entry = &port_cntrs[index]; + sval = ppd->scntrs + entry->offset; + + if (vl != CNTR_INVALID_VL) + sval += vl; + + if ((index >= C_RCV_HDR_OVF_FIRST + ppd->dd->num_rcv_contexts) && + (index <= C_RCV_HDR_OVF_LAST)) { + /* We do not want to bother for disabled contexts */ + return 0; + } + + return read_dev_port_cntr(ppd->dd, entry, sval, ppd, vl); +} + +u64 write_port_cntr(struct hfi1_pportdata *ppd, int index, int vl, u64 data) +{ + struct cntr_entry *entry; + u64 *sval; + + entry = &port_cntrs[index]; + sval = ppd->scntrs + entry->offset; + + if (vl != CNTR_INVALID_VL) + sval += vl; + + if ((index >= C_RCV_HDR_OVF_FIRST + ppd->dd->num_rcv_contexts) && + (index <= C_RCV_HDR_OVF_LAST)) { + /* We do not want to bother for disabled contexts */ + return 0; + } + + return write_dev_port_cntr(ppd->dd, entry, sval, ppd, vl, data); +} + +static void update_synth_timer(unsigned long opaque) +{ + u64 cur_tx; + u64 cur_rx; + u64 total_flits; + u8 update = 0; + int i, j, vl; + struct hfi1_pportdata *ppd; + struct cntr_entry *entry; + + struct hfi1_devdata *dd = (struct hfi1_devdata *)opaque; + + /* + * Rather than keep beating on the CSRs pick a minimal set that we can + * check to watch for potential roll over. We can do this by looking at + * the number of flits sent/recv. If the total flits exceeds 32bits then + * we have to iterate all the counters and update. + */ + entry = &dev_cntrs[C_DC_RCV_FLITS]; + cur_rx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL, CNTR_MODE_R, 0); + + entry = &dev_cntrs[C_DC_XMIT_FLITS]; + cur_tx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL, CNTR_MODE_R, 0); + + hfi1_cdbg( + CNTR, + "[%d] curr tx=0x%llx rx=0x%llx :: last tx=0x%llx rx=0x%llx\n", + dd->unit, cur_tx, cur_rx, dd->last_tx, dd->last_rx); + + if ((cur_tx < dd->last_tx) || (cur_rx < dd->last_rx)) { + /* + * May not be strictly necessary to update but it won't hurt and + * simplifies the logic here. + */ + update = 1; + hfi1_cdbg(CNTR, "[%d] Tripwire counter rolled, updating", + dd->unit); + } else { + total_flits = (cur_tx - dd->last_tx) + (cur_rx - dd->last_rx); + hfi1_cdbg(CNTR, + "[%d] total flits 0x%llx limit 0x%llx\n", dd->unit, + total_flits, (u64)CNTR_32BIT_MAX); + if (total_flits >= CNTR_32BIT_MAX) { + hfi1_cdbg(CNTR, "[%d] 32bit limit hit, updating", + dd->unit); + update = 1; + } + } + + if (update) { + hfi1_cdbg(CNTR, "[%d] Updating dd and ppd counters", dd->unit); + for (i = 0; i < DEV_CNTR_LAST; i++) { + entry = &dev_cntrs[i]; + if (entry->flags & CNTR_VL) { + for (vl = 0; vl < C_VL_COUNT; vl++) + read_dev_cntr(dd, i, vl); + } else { + read_dev_cntr(dd, i, CNTR_INVALID_VL); + } + } + ppd = (struct hfi1_pportdata *)(dd + 1); + for (i = 0; i < dd->num_pports; i++, ppd++) { + for (j = 0; j < PORT_CNTR_LAST; j++) { + entry = &port_cntrs[j]; + if (entry->flags & CNTR_VL) { + for (vl = 0; vl < C_VL_COUNT; vl++) + read_port_cntr(ppd, j, vl); + } else { + read_port_cntr(ppd, j, CNTR_INVALID_VL); + } + } + } + + /* + * We want the value in the register. The goal is to keep track + * of the number of "ticks" not the counter value. In other + * words if the register rolls we want to notice it and go ahead + * and force an update. + */ + entry = &dev_cntrs[C_DC_XMIT_FLITS]; + dd->last_tx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL, + CNTR_MODE_R, 0); + + entry = &dev_cntrs[C_DC_RCV_FLITS]; + dd->last_rx = entry->rw_cntr(entry, dd, CNTR_INVALID_VL, + CNTR_MODE_R, 0); + + hfi1_cdbg(CNTR, "[%d] setting last tx/rx to 0x%llx 0x%llx", + dd->unit, dd->last_tx, dd->last_rx); + + } else { + hfi1_cdbg(CNTR, "[%d] No update necessary", dd->unit); + } + + mod_timer(&dd->synth_stats_timer, jiffies + HZ * SYNTH_CNT_TIME); +} + +#define C_MAX_NAME 13 /* 12 chars + one for /0 */ +static int init_cntrs(struct hfi1_devdata *dd) +{ + int i, rcv_ctxts, j; + size_t sz; + char *p; + char name[C_MAX_NAME]; + struct hfi1_pportdata *ppd; + const char *bit_type_32 = ",32"; + const int bit_type_32_sz = strlen(bit_type_32); + + /* set up the stats timer; the add_timer is done at the end */ + setup_timer(&dd->synth_stats_timer, update_synth_timer, + (unsigned long)dd); + + /***********************/ + /* per device counters */ + /***********************/ + + /* size names and determine how many we have*/ + dd->ndevcntrs = 0; + sz = 0; + + for (i = 0; i < DEV_CNTR_LAST; i++) { + if (dev_cntrs[i].flags & CNTR_DISABLED) { + hfi1_dbg_early("\tSkipping %s\n", dev_cntrs[i].name); + continue; + } + + if (dev_cntrs[i].flags & CNTR_VL) { + dev_cntrs[i].offset = dd->ndevcntrs; + for (j = 0; j < C_VL_COUNT; j++) { + snprintf(name, C_MAX_NAME, "%s%d", + dev_cntrs[i].name, vl_from_idx(j)); + sz += strlen(name); + /* Add ",32" for 32-bit counters */ + if (dev_cntrs[i].flags & CNTR_32BIT) + sz += bit_type_32_sz; + sz++; + dd->ndevcntrs++; + } + } else if (dev_cntrs[i].flags & CNTR_SDMA) { + dev_cntrs[i].offset = dd->ndevcntrs; + for (j = 0; j < dd->chip_sdma_engines; j++) { + snprintf(name, C_MAX_NAME, "%s%d", + dev_cntrs[i].name, j); + sz += strlen(name); + /* Add ",32" for 32-bit counters */ + if (dev_cntrs[i].flags & CNTR_32BIT) + sz += bit_type_32_sz; + sz++; + dd->ndevcntrs++; + } + } else { + /* +1 for newline. */ + sz += strlen(dev_cntrs[i].name) + 1; + /* Add ",32" for 32-bit counters */ + if (dev_cntrs[i].flags & CNTR_32BIT) + sz += bit_type_32_sz; + dev_cntrs[i].offset = dd->ndevcntrs; + dd->ndevcntrs++; + } + } + + /* allocate space for the counter values */ + dd->cntrs = kcalloc(dd->ndevcntrs, sizeof(u64), GFP_KERNEL); + if (!dd->cntrs) + goto bail; + + dd->scntrs = kcalloc(dd->ndevcntrs, sizeof(u64), GFP_KERNEL); + if (!dd->scntrs) + goto bail; + + /* allocate space for the counter names */ + dd->cntrnameslen = sz; + dd->cntrnames = kmalloc(sz, GFP_KERNEL); + if (!dd->cntrnames) + goto bail; + + /* fill in the names */ + for (p = dd->cntrnames, i = 0; i < DEV_CNTR_LAST; i++) { + if (dev_cntrs[i].flags & CNTR_DISABLED) { + /* Nothing */ + } else if (dev_cntrs[i].flags & CNTR_VL) { + for (j = 0; j < C_VL_COUNT; j++) { + snprintf(name, C_MAX_NAME, "%s%d", + dev_cntrs[i].name, + vl_from_idx(j)); + memcpy(p, name, strlen(name)); + p += strlen(name); + + /* Counter is 32 bits */ + if (dev_cntrs[i].flags & CNTR_32BIT) { + memcpy(p, bit_type_32, bit_type_32_sz); + p += bit_type_32_sz; + } + + *p++ = '\n'; + } + } else if (dev_cntrs[i].flags & CNTR_SDMA) { + for (j = 0; j < dd->chip_sdma_engines; j++) { + snprintf(name, C_MAX_NAME, "%s%d", + dev_cntrs[i].name, j); + memcpy(p, name, strlen(name)); + p += strlen(name); + + /* Counter is 32 bits */ + if (dev_cntrs[i].flags & CNTR_32BIT) { + memcpy(p, bit_type_32, bit_type_32_sz); + p += bit_type_32_sz; + } + + *p++ = '\n'; + } + } else { + memcpy(p, dev_cntrs[i].name, strlen(dev_cntrs[i].name)); + p += strlen(dev_cntrs[i].name); + + /* Counter is 32 bits */ + if (dev_cntrs[i].flags & CNTR_32BIT) { + memcpy(p, bit_type_32, bit_type_32_sz); + p += bit_type_32_sz; + } + + *p++ = '\n'; + } + } + + /*********************/ + /* per port counters */ + /*********************/ + + /* + * Go through the counters for the overflows and disable the ones we + * don't need. This varies based on platform so we need to do it + * dynamically here. + */ + rcv_ctxts = dd->num_rcv_contexts; + for (i = C_RCV_HDR_OVF_FIRST + rcv_ctxts; + i <= C_RCV_HDR_OVF_LAST; i++) { + port_cntrs[i].flags |= CNTR_DISABLED; + } + + /* size port counter names and determine how many we have*/ + sz = 0; + dd->nportcntrs = 0; + for (i = 0; i < PORT_CNTR_LAST; i++) { + if (port_cntrs[i].flags & CNTR_DISABLED) { + hfi1_dbg_early("\tSkipping %s\n", port_cntrs[i].name); + continue; + } + + if (port_cntrs[i].flags & CNTR_VL) { + port_cntrs[i].offset = dd->nportcntrs; + for (j = 0; j < C_VL_COUNT; j++) { + snprintf(name, C_MAX_NAME, "%s%d", + port_cntrs[i].name, vl_from_idx(j)); + sz += strlen(name); + /* Add ",32" for 32-bit counters */ + if (port_cntrs[i].flags & CNTR_32BIT) + sz += bit_type_32_sz; + sz++; + dd->nportcntrs++; + } + } else { + /* +1 for newline */ + sz += strlen(port_cntrs[i].name) + 1; + /* Add ",32" for 32-bit counters */ + if (port_cntrs[i].flags & CNTR_32BIT) + sz += bit_type_32_sz; + port_cntrs[i].offset = dd->nportcntrs; + dd->nportcntrs++; + } + } + + /* allocate space for the counter names */ + dd->portcntrnameslen = sz; + dd->portcntrnames = kmalloc(sz, GFP_KERNEL); + if (!dd->portcntrnames) + goto bail; + + /* fill in port cntr names */ + for (p = dd->portcntrnames, i = 0; i < PORT_CNTR_LAST; i++) { + if (port_cntrs[i].flags & CNTR_DISABLED) + continue; + + if (port_cntrs[i].flags & CNTR_VL) { + for (j = 0; j < C_VL_COUNT; j++) { + snprintf(name, C_MAX_NAME, "%s%d", + port_cntrs[i].name, vl_from_idx(j)); + memcpy(p, name, strlen(name)); + p += strlen(name); + + /* Counter is 32 bits */ + if (port_cntrs[i].flags & CNTR_32BIT) { + memcpy(p, bit_type_32, bit_type_32_sz); + p += bit_type_32_sz; + } + + *p++ = '\n'; + } + } else { + memcpy(p, port_cntrs[i].name, + strlen(port_cntrs[i].name)); + p += strlen(port_cntrs[i].name); + + /* Counter is 32 bits */ + if (port_cntrs[i].flags & CNTR_32BIT) { + memcpy(p, bit_type_32, bit_type_32_sz); + p += bit_type_32_sz; + } + + *p++ = '\n'; + } + } + + /* allocate per port storage for counter values */ + ppd = (struct hfi1_pportdata *)(dd + 1); + for (i = 0; i < dd->num_pports; i++, ppd++) { + ppd->cntrs = kcalloc(dd->nportcntrs, sizeof(u64), GFP_KERNEL); + if (!ppd->cntrs) + goto bail; + + ppd->scntrs = kcalloc(dd->nportcntrs, sizeof(u64), GFP_KERNEL); + if (!ppd->scntrs) + goto bail; + } + + /* CPU counters need to be allocated and zeroed */ + if (init_cpu_counters(dd)) + goto bail; + + mod_timer(&dd->synth_stats_timer, jiffies + HZ * SYNTH_CNT_TIME); + return 0; +bail: + free_cntrs(dd); + return -ENOMEM; +} + +static u32 chip_to_opa_lstate(struct hfi1_devdata *dd, u32 chip_lstate) +{ + switch (chip_lstate) { + default: + dd_dev_err(dd, + "Unknown logical state 0x%x, reporting IB_PORT_DOWN\n", + chip_lstate); + /* fall through */ + case LSTATE_DOWN: + return IB_PORT_DOWN; + case LSTATE_INIT: + return IB_PORT_INIT; + case LSTATE_ARMED: + return IB_PORT_ARMED; + case LSTATE_ACTIVE: + return IB_PORT_ACTIVE; + } +} + +u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate) +{ + /* look at the HFI meta-states only */ + switch (chip_pstate & 0xf0) { + default: + dd_dev_err(dd, "Unexpected chip physical state of 0x%x\n", + chip_pstate); + /* fall through */ + case PLS_DISABLED: + return IB_PORTPHYSSTATE_DISABLED; + case PLS_OFFLINE: + return OPA_PORTPHYSSTATE_OFFLINE; + case PLS_POLLING: + return IB_PORTPHYSSTATE_POLLING; + case PLS_CONFIGPHY: + return IB_PORTPHYSSTATE_TRAINING; + case PLS_LINKUP: + return IB_PORTPHYSSTATE_LINKUP; + case PLS_PHYTEST: + return IB_PORTPHYSSTATE_PHY_TEST; + } +} + +/* return the OPA port logical state name */ +const char *opa_lstate_name(u32 lstate) +{ + static const char * const port_logical_names[] = { + "PORT_NOP", + "PORT_DOWN", + "PORT_INIT", + "PORT_ARMED", + "PORT_ACTIVE", + "PORT_ACTIVE_DEFER", + }; + if (lstate < ARRAY_SIZE(port_logical_names)) + return port_logical_names[lstate]; + return "unknown"; +} + +/* return the OPA port physical state name */ +const char *opa_pstate_name(u32 pstate) +{ + static const char * const port_physical_names[] = { + "PHYS_NOP", + "reserved1", + "PHYS_POLL", + "PHYS_DISABLED", + "PHYS_TRAINING", + "PHYS_LINKUP", + "PHYS_LINK_ERR_RECOVER", + "PHYS_PHY_TEST", + "reserved8", + "PHYS_OFFLINE", + "PHYS_GANGED", + "PHYS_TEST", + }; + if (pstate < ARRAY_SIZE(port_physical_names)) + return port_physical_names[pstate]; + return "unknown"; +} + +/* + * Read the hardware link state and set the driver's cached value of it. + * Return the (new) current value. + */ +u32 get_logical_state(struct hfi1_pportdata *ppd) +{ + u32 new_state; + + new_state = chip_to_opa_lstate(ppd->dd, read_logical_state(ppd->dd)); + if (new_state != ppd->lstate) { + dd_dev_info(ppd->dd, "logical state changed to %s (0x%x)\n", + opa_lstate_name(new_state), new_state); + ppd->lstate = new_state; + } + /* + * Set port status flags in the page mapped into userspace + * memory. Do it here to ensure a reliable state - this is + * the only function called by all state handling code. + * Always set the flags due to the fact that the cache value + * might have been changed explicitly outside of this + * function. + */ + if (ppd->statusp) { + switch (ppd->lstate) { + case IB_PORT_DOWN: + case IB_PORT_INIT: + *ppd->statusp &= ~(HFI1_STATUS_IB_CONF | + HFI1_STATUS_IB_READY); + break; + case IB_PORT_ARMED: + *ppd->statusp |= HFI1_STATUS_IB_CONF; + break; + case IB_PORT_ACTIVE: + *ppd->statusp |= HFI1_STATUS_IB_READY; + break; + } + } + return ppd->lstate; +} + +/** + * wait_logical_linkstate - wait for an IB link state change to occur + * @ppd: port device + * @state: the state to wait for + * @msecs: the number of milliseconds to wait + * + * Wait up to msecs milliseconds for IB link state change to occur. + * For now, take the easy polling route. + * Returns 0 if state reached, otherwise -ETIMEDOUT. + */ +static int wait_logical_linkstate(struct hfi1_pportdata *ppd, u32 state, + int msecs) +{ + unsigned long timeout; + + timeout = jiffies + msecs_to_jiffies(msecs); + while (1) { + if (get_logical_state(ppd) == state) + return 0; + if (time_after(jiffies, timeout)) + break; + msleep(20); + } + dd_dev_err(ppd->dd, "timeout waiting for link state 0x%x\n", state); + + return -ETIMEDOUT; +} + +u8 hfi1_ibphys_portstate(struct hfi1_pportdata *ppd) +{ + u32 pstate; + u32 ib_pstate; + + pstate = read_physical_state(ppd->dd); + ib_pstate = chip_to_opa_pstate(ppd->dd, pstate); + if (ppd->last_pstate != ib_pstate) { + dd_dev_info(ppd->dd, + "%s: physical state changed to %s (0x%x), phy 0x%x\n", + __func__, opa_pstate_name(ib_pstate), ib_pstate, + pstate); + ppd->last_pstate = ib_pstate; + } + return ib_pstate; +} + +/* + * Read/modify/write ASIC_QSFP register bits as selected by mask + * data: 0 or 1 in the positions depending on what needs to be written + * dir: 0 for read, 1 for write + * mask: select by setting + * I2CCLK (bit 0) + * I2CDATA (bit 1) + */ +u64 hfi1_gpio_mod(struct hfi1_devdata *dd, u32 target, u32 data, u32 dir, + u32 mask) +{ + u64 qsfp_oe, target_oe; + + target_oe = target ? ASIC_QSFP2_OE : ASIC_QSFP1_OE; + if (mask) { + /* We are writing register bits, so lock access */ + dir &= mask; + data &= mask; + + qsfp_oe = read_csr(dd, target_oe); + qsfp_oe = (qsfp_oe & ~(u64)mask) | (u64)dir; + write_csr(dd, target_oe, qsfp_oe); + } + /* We are exclusively reading bits here, but it is unlikely + * we'll get valid data when we set the direction of the pin + * in the same call, so read should call this function again + * to get valid data + */ + return read_csr(dd, target ? ASIC_QSFP2_IN : ASIC_QSFP1_IN); +} + +#define CLEAR_STATIC_RATE_CONTROL_SMASK(r) \ +(r &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK) + +#define SET_STATIC_RATE_CONTROL_SMASK(r) \ +(r |= SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK) + +int hfi1_init_ctxt(struct send_context *sc) +{ + if (sc) { + struct hfi1_devdata *dd = sc->dd; + u64 reg; + u8 set = (sc->type == SC_USER ? + HFI1_CAP_IS_USET(STATIC_RATE_CTRL) : + HFI1_CAP_IS_KSET(STATIC_RATE_CTRL)); + reg = read_kctxt_csr(dd, sc->hw_context, + SEND_CTXT_CHECK_ENABLE); + if (set) + CLEAR_STATIC_RATE_CONTROL_SMASK(reg); + else + SET_STATIC_RATE_CONTROL_SMASK(reg); + write_kctxt_csr(dd, sc->hw_context, + SEND_CTXT_CHECK_ENABLE, reg); + } + return 0; +} + +int hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp) +{ + int ret = 0; + u64 reg; + + if (dd->icode != ICODE_RTL_SILICON) { + if (HFI1_CAP_IS_KSET(PRINT_UNIMPL)) + dd_dev_info(dd, "%s: tempsense not supported by HW\n", + __func__); + return -EINVAL; + } + reg = read_csr(dd, ASIC_STS_THERM); + temp->curr = ((reg >> ASIC_STS_THERM_CURR_TEMP_SHIFT) & + ASIC_STS_THERM_CURR_TEMP_MASK); + temp->lo_lim = ((reg >> ASIC_STS_THERM_LO_TEMP_SHIFT) & + ASIC_STS_THERM_LO_TEMP_MASK); + temp->hi_lim = ((reg >> ASIC_STS_THERM_HI_TEMP_SHIFT) & + ASIC_STS_THERM_HI_TEMP_MASK); + temp->crit_lim = ((reg >> ASIC_STS_THERM_CRIT_TEMP_SHIFT) & + ASIC_STS_THERM_CRIT_TEMP_MASK); + /* triggers is a 3-bit value - 1 bit per trigger. */ + temp->triggers = (u8)((reg >> ASIC_STS_THERM_LOW_SHIFT) & 0x7); + + return ret; +} + +/* ========================================================================= */ + +/* + * Enable/disable chip from delivering interrupts. + */ +void set_intr_state(struct hfi1_devdata *dd, u32 enable) +{ + int i; + + /* + * In HFI, the mask needs to be 1 to allow interrupts. + */ + if (enable) { + /* enable all interrupts */ + for (i = 0; i < CCE_NUM_INT_CSRS; i++) + write_csr(dd, CCE_INT_MASK + (8 * i), ~(u64)0); + + init_qsfp_int(dd); + } else { + for (i = 0; i < CCE_NUM_INT_CSRS; i++) + write_csr(dd, CCE_INT_MASK + (8 * i), 0ull); + } +} + +/* + * Clear all interrupt sources on the chip. + */ +static void clear_all_interrupts(struct hfi1_devdata *dd) +{ + int i; + + for (i = 0; i < CCE_NUM_INT_CSRS; i++) + write_csr(dd, CCE_INT_CLEAR + (8 * i), ~(u64)0); + + write_csr(dd, CCE_ERR_CLEAR, ~(u64)0); + write_csr(dd, MISC_ERR_CLEAR, ~(u64)0); + write_csr(dd, RCV_ERR_CLEAR, ~(u64)0); + write_csr(dd, SEND_ERR_CLEAR, ~(u64)0); + write_csr(dd, SEND_PIO_ERR_CLEAR, ~(u64)0); + write_csr(dd, SEND_DMA_ERR_CLEAR, ~(u64)0); + write_csr(dd, SEND_EGRESS_ERR_CLEAR, ~(u64)0); + for (i = 0; i < dd->chip_send_contexts; i++) + write_kctxt_csr(dd, i, SEND_CTXT_ERR_CLEAR, ~(u64)0); + for (i = 0; i < dd->chip_sdma_engines; i++) + write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_CLEAR, ~(u64)0); + + write_csr(dd, DCC_ERR_FLG_CLR, ~(u64)0); + write_csr(dd, DC_LCB_ERR_CLR, ~(u64)0); + write_csr(dd, DC_DC8051_ERR_CLR, ~(u64)0); +} + +/* Move to pcie.c? */ +static void disable_intx(struct pci_dev *pdev) +{ + pci_intx(pdev, 0); +} + +static void clean_up_interrupts(struct hfi1_devdata *dd) +{ + int i; + + /* remove irqs - must happen before disabling/turning off */ + if (dd->num_msix_entries) { + /* MSI-X */ + struct hfi1_msix_entry *me = dd->msix_entries; + + for (i = 0; i < dd->num_msix_entries; i++, me++) { + if (!me->arg) /* => no irq, no affinity */ + continue; + hfi1_put_irq_affinity(dd, &dd->msix_entries[i]); + free_irq(me->msix.vector, me->arg); + } + } else { + /* INTx */ + if (dd->requested_intx_irq) { + free_irq(dd->pcidev->irq, dd); + dd->requested_intx_irq = 0; + } + } + + /* turn off interrupts */ + if (dd->num_msix_entries) { + /* MSI-X */ + pci_disable_msix(dd->pcidev); + } else { + /* INTx */ + disable_intx(dd->pcidev); + } + + /* clean structures */ + kfree(dd->msix_entries); + dd->msix_entries = NULL; + dd->num_msix_entries = 0; +} + +/* + * Remap the interrupt source from the general handler to the given MSI-X + * interrupt. + */ +static void remap_intr(struct hfi1_devdata *dd, int isrc, int msix_intr) +{ + u64 reg; + int m, n; + + /* clear from the handled mask of the general interrupt */ + m = isrc / 64; + n = isrc % 64; + dd->gi_mask[m] &= ~((u64)1 << n); + + /* direct the chip source to the given MSI-X interrupt */ + m = isrc / 8; + n = isrc % 8; + reg = read_csr(dd, CCE_INT_MAP + (8 * m)); + reg &= ~((u64)0xff << (8 * n)); + reg |= ((u64)msix_intr & 0xff) << (8 * n); + write_csr(dd, CCE_INT_MAP + (8 * m), reg); +} + +static void remap_sdma_interrupts(struct hfi1_devdata *dd, + int engine, int msix_intr) +{ + /* + * SDMA engine interrupt sources grouped by type, rather than + * engine. Per-engine interrupts are as follows: + * SDMA + * SDMAProgress + * SDMAIdle + */ + remap_intr(dd, IS_SDMA_START + 0 * TXE_NUM_SDMA_ENGINES + engine, + msix_intr); + remap_intr(dd, IS_SDMA_START + 1 * TXE_NUM_SDMA_ENGINES + engine, + msix_intr); + remap_intr(dd, IS_SDMA_START + 2 * TXE_NUM_SDMA_ENGINES + engine, + msix_intr); +} + +static int request_intx_irq(struct hfi1_devdata *dd) +{ + int ret; + + snprintf(dd->intx_name, sizeof(dd->intx_name), DRIVER_NAME "_%d", + dd->unit); + ret = request_irq(dd->pcidev->irq, general_interrupt, + IRQF_SHARED, dd->intx_name, dd); + if (ret) + dd_dev_err(dd, "unable to request INTx interrupt, err %d\n", + ret); + else + dd->requested_intx_irq = 1; + return ret; +} + +static int request_msix_irqs(struct hfi1_devdata *dd) +{ + int first_general, last_general; + int first_sdma, last_sdma; + int first_rx, last_rx; + int i, ret = 0; + + /* calculate the ranges we are going to use */ + first_general = 0; + last_general = first_general + 1; + first_sdma = last_general; + last_sdma = first_sdma + dd->num_sdma; + first_rx = last_sdma; + last_rx = first_rx + dd->n_krcv_queues; + + /* + * Sanity check - the code expects all SDMA chip source + * interrupts to be in the same CSR, starting at bit 0. Verify + * that this is true by checking the bit location of the start. + */ + BUILD_BUG_ON(IS_SDMA_START % 64); + + for (i = 0; i < dd->num_msix_entries; i++) { + struct hfi1_msix_entry *me = &dd->msix_entries[i]; + const char *err_info; + irq_handler_t handler; + irq_handler_t thread = NULL; + void *arg; + int idx; + struct hfi1_ctxtdata *rcd = NULL; + struct sdma_engine *sde = NULL; + + /* obtain the arguments to request_irq */ + if (first_general <= i && i < last_general) { + idx = i - first_general; + handler = general_interrupt; + arg = dd; + snprintf(me->name, sizeof(me->name), + DRIVER_NAME "_%d", dd->unit); + err_info = "general"; + me->type = IRQ_GENERAL; + } else if (first_sdma <= i && i < last_sdma) { + idx = i - first_sdma; + sde = &dd->per_sdma[idx]; + handler = sdma_interrupt; + arg = sde; + snprintf(me->name, sizeof(me->name), + DRIVER_NAME "_%d sdma%d", dd->unit, idx); + err_info = "sdma"; + remap_sdma_interrupts(dd, idx, i); + me->type = IRQ_SDMA; + } else if (first_rx <= i && i < last_rx) { + idx = i - first_rx; + rcd = dd->rcd[idx]; + /* no interrupt if no rcd */ + if (!rcd) + continue; + /* + * Set the interrupt register and mask for this + * context's interrupt. + */ + rcd->ireg = (IS_RCVAVAIL_START + idx) / 64; + rcd->imask = ((u64)1) << + ((IS_RCVAVAIL_START + idx) % 64); + handler = receive_context_interrupt; + thread = receive_context_thread; + arg = rcd; + snprintf(me->name, sizeof(me->name), + DRIVER_NAME "_%d kctxt%d", dd->unit, idx); + err_info = "receive context"; + remap_intr(dd, IS_RCVAVAIL_START + idx, i); + me->type = IRQ_RCVCTXT; + } else { + /* not in our expected range - complain, then + * ignore it + */ + dd_dev_err(dd, + "Unexpected extra MSI-X interrupt %d\n", i); + continue; + } + /* no argument, no interrupt */ + if (!arg) + continue; + /* make sure the name is terminated */ + me->name[sizeof(me->name) - 1] = 0; + + ret = request_threaded_irq(me->msix.vector, handler, thread, 0, + me->name, arg); + if (ret) { + dd_dev_err(dd, + "unable to allocate %s interrupt, vector %d, index %d, err %d\n", + err_info, me->msix.vector, idx, ret); + return ret; + } + /* + * assign arg after request_irq call, so it will be + * cleaned up + */ + me->arg = arg; + + ret = hfi1_get_irq_affinity(dd, me); + if (ret) + dd_dev_err(dd, + "unable to pin IRQ %d\n", ret); + } + + return ret; +} + +/* + * Set the general handler to accept all interrupts, remap all + * chip interrupts back to MSI-X 0. + */ +static void reset_interrupts(struct hfi1_devdata *dd) +{ + int i; + + /* all interrupts handled by the general handler */ + for (i = 0; i < CCE_NUM_INT_CSRS; i++) + dd->gi_mask[i] = ~(u64)0; + + /* all chip interrupts map to MSI-X 0 */ + for (i = 0; i < CCE_NUM_INT_MAP_CSRS; i++) + write_csr(dd, CCE_INT_MAP + (8 * i), 0); +} + +static int set_up_interrupts(struct hfi1_devdata *dd) +{ + struct hfi1_msix_entry *entries; + u32 total, request; + int i, ret; + int single_interrupt = 0; /* we expect to have all the interrupts */ + + /* + * Interrupt count: + * 1 general, "slow path" interrupt (includes the SDMA engines + * slow source, SDMACleanupDone) + * N interrupts - one per used SDMA engine + * M interrupt - one per kernel receive context + */ + total = 1 + dd->num_sdma + dd->n_krcv_queues; + + entries = kcalloc(total, sizeof(*entries), GFP_KERNEL); + if (!entries) { + ret = -ENOMEM; + goto fail; + } + /* 1-1 MSI-X entry assignment */ + for (i = 0; i < total; i++) + entries[i].msix.entry = i; + + /* ask for MSI-X interrupts */ + request = total; + request_msix(dd, &request, entries); + + if (request == 0) { + /* using INTx */ + /* dd->num_msix_entries already zero */ + kfree(entries); + single_interrupt = 1; + dd_dev_err(dd, "MSI-X failed, using INTx interrupts\n"); + } else { + /* using MSI-X */ + dd->num_msix_entries = request; + dd->msix_entries = entries; + + if (request != total) { + /* using MSI-X, with reduced interrupts */ + dd_dev_err( + dd, + "cannot handle reduced interrupt case, want %u, got %u\n", + total, request); + ret = -EINVAL; + goto fail; + } + dd_dev_info(dd, "%u MSI-X interrupts allocated\n", total); + } + + /* mask all interrupts */ + set_intr_state(dd, 0); + /* clear all pending interrupts */ + clear_all_interrupts(dd); + + /* reset general handler mask, chip MSI-X mappings */ + reset_interrupts(dd); + + if (single_interrupt) + ret = request_intx_irq(dd); + else + ret = request_msix_irqs(dd); + if (ret) + goto fail; + + return 0; + +fail: + clean_up_interrupts(dd); + return ret; +} + +/* + * Set up context values in dd. Sets: + * + * num_rcv_contexts - number of contexts being used + * n_krcv_queues - number of kernel contexts + * first_user_ctxt - first non-kernel context in array of contexts + * freectxts - number of free user contexts + * num_send_contexts - number of PIO send contexts being used + */ +static int set_up_context_variables(struct hfi1_devdata *dd) +{ + int num_kernel_contexts; + int total_contexts; + int ret; + unsigned ngroups; + int qos_rmt_count; + int user_rmt_reduced; + + /* + * Kernel receive contexts: + * - min of 2 or 1 context/numa (excluding control context) + * - Context 0 - control context (VL15/multicast/error) + * - Context 1 - first kernel context + * - Context 2 - second kernel context + * ... + */ + if (n_krcvqs) + /* + * n_krcvqs is the sum of module parameter kernel receive + * contexts, krcvqs[]. It does not include the control + * context, so add that. + */ + num_kernel_contexts = n_krcvqs + 1; + else + num_kernel_contexts = num_online_nodes() + 1; + num_kernel_contexts = + max_t(int, MIN_KERNEL_KCTXTS, num_kernel_contexts); + /* + * Every kernel receive context needs an ACK send context. + * one send context is allocated for each VL{0-7} and VL15 + */ + if (num_kernel_contexts > (dd->chip_send_contexts - num_vls - 1)) { + dd_dev_err(dd, + "Reducing # kernel rcv contexts to: %d, from %d\n", + (int)(dd->chip_send_contexts - num_vls - 1), + (int)num_kernel_contexts); + num_kernel_contexts = dd->chip_send_contexts - num_vls - 1; + } + /* + * User contexts: + * - default to 1 user context per real (non-HT) CPU core if + * num_user_contexts is negative + */ + if (num_user_contexts < 0) + num_user_contexts = + cpumask_weight(&dd->affinity->real_cpu_mask); + + total_contexts = num_kernel_contexts + num_user_contexts; + + /* + * Adjust the counts given a global max. + */ + if (total_contexts > dd->chip_rcv_contexts) { + dd_dev_err(dd, + "Reducing # user receive contexts to: %d, from %d\n", + (int)(dd->chip_rcv_contexts - num_kernel_contexts), + (int)num_user_contexts); + num_user_contexts = dd->chip_rcv_contexts - num_kernel_contexts; + /* recalculate */ + total_contexts = num_kernel_contexts + num_user_contexts; + } + + /* each user context requires an entry in the RMT */ + qos_rmt_count = qos_rmt_entries(dd, NULL, NULL); + if (qos_rmt_count + num_user_contexts > NUM_MAP_ENTRIES) { + user_rmt_reduced = NUM_MAP_ENTRIES - qos_rmt_count; + dd_dev_err(dd, + "RMT size is reducing the number of user receive contexts from %d to %d\n", + (int)num_user_contexts, + user_rmt_reduced); + /* recalculate */ + num_user_contexts = user_rmt_reduced; + total_contexts = num_kernel_contexts + num_user_contexts; + } + + /* the first N are kernel contexts, the rest are user contexts */ + dd->num_rcv_contexts = total_contexts; + dd->n_krcv_queues = num_kernel_contexts; + dd->first_user_ctxt = num_kernel_contexts; + dd->num_user_contexts = num_user_contexts; + dd->freectxts = num_user_contexts; + dd_dev_info(dd, + "rcv contexts: chip %d, used %d (kernel %d, user %d)\n", + (int)dd->chip_rcv_contexts, + (int)dd->num_rcv_contexts, + (int)dd->n_krcv_queues, + (int)dd->num_rcv_contexts - dd->n_krcv_queues); + + /* + * Receive array allocation: + * All RcvArray entries are divided into groups of 8. This + * is required by the hardware and will speed up writes to + * consecutive entries by using write-combining of the entire + * cacheline. + * + * The number of groups are evenly divided among all contexts. + * any left over groups will be given to the first N user + * contexts. + */ + dd->rcv_entries.group_size = RCV_INCREMENT; + ngroups = dd->chip_rcv_array_count / dd->rcv_entries.group_size; + dd->rcv_entries.ngroups = ngroups / dd->num_rcv_contexts; + dd->rcv_entries.nctxt_extra = ngroups - + (dd->num_rcv_contexts * dd->rcv_entries.ngroups); + dd_dev_info(dd, "RcvArray groups %u, ctxts extra %u\n", + dd->rcv_entries.ngroups, + dd->rcv_entries.nctxt_extra); + if (dd->rcv_entries.ngroups * dd->rcv_entries.group_size > + MAX_EAGER_ENTRIES * 2) { + dd->rcv_entries.ngroups = (MAX_EAGER_ENTRIES * 2) / + dd->rcv_entries.group_size; + dd_dev_info(dd, + "RcvArray group count too high, change to %u\n", + dd->rcv_entries.ngroups); + dd->rcv_entries.nctxt_extra = 0; + } + /* + * PIO send contexts + */ + ret = init_sc_pools_and_sizes(dd); + if (ret >= 0) { /* success */ + dd->num_send_contexts = ret; + dd_dev_info( + dd, + "send contexts: chip %d, used %d (kernel %d, ack %d, user %d, vl15 %d)\n", + dd->chip_send_contexts, + dd->num_send_contexts, + dd->sc_sizes[SC_KERNEL].count, + dd->sc_sizes[SC_ACK].count, + dd->sc_sizes[SC_USER].count, + dd->sc_sizes[SC_VL15].count); + ret = 0; /* success */ + } + + return ret; +} + +/* + * Set the device/port partition key table. The MAD code + * will ensure that, at least, the partial management + * partition key is present in the table. + */ +static void set_partition_keys(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + u64 reg = 0; + int i; + + dd_dev_info(dd, "Setting partition keys\n"); + for (i = 0; i < hfi1_get_npkeys(dd); i++) { + reg |= (ppd->pkeys[i] & + RCV_PARTITION_KEY_PARTITION_KEY_A_MASK) << + ((i % 4) * + RCV_PARTITION_KEY_PARTITION_KEY_B_SHIFT); + /* Each register holds 4 PKey values. */ + if ((i % 4) == 3) { + write_csr(dd, RCV_PARTITION_KEY + + ((i - 3) * 2), reg); + reg = 0; + } + } + + /* Always enable HW pkeys check when pkeys table is set */ + add_rcvctrl(dd, RCV_CTRL_RCV_PARTITION_KEY_ENABLE_SMASK); +} + +/* + * These CSRs and memories are uninitialized on reset and must be + * written before reading to set the ECC/parity bits. + * + * NOTE: All user context CSRs that are not mmaped write-only + * (e.g. the TID flows) must be initialized even if the driver never + * reads them. + */ +static void write_uninitialized_csrs_and_memories(struct hfi1_devdata *dd) +{ + int i, j; + + /* CceIntMap */ + for (i = 0; i < CCE_NUM_INT_MAP_CSRS; i++) + write_csr(dd, CCE_INT_MAP + (8 * i), 0); + + /* SendCtxtCreditReturnAddr */ + for (i = 0; i < dd->chip_send_contexts; i++) + write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_RETURN_ADDR, 0); + + /* PIO Send buffers */ + /* SDMA Send buffers */ + /* + * These are not normally read, and (presently) have no method + * to be read, so are not pre-initialized + */ + + /* RcvHdrAddr */ + /* RcvHdrTailAddr */ + /* RcvTidFlowTable */ + for (i = 0; i < dd->chip_rcv_contexts; i++) { + write_kctxt_csr(dd, i, RCV_HDR_ADDR, 0); + write_kctxt_csr(dd, i, RCV_HDR_TAIL_ADDR, 0); + for (j = 0; j < RXE_NUM_TID_FLOWS; j++) + write_uctxt_csr(dd, i, RCV_TID_FLOW_TABLE + (8 * j), 0); + } + + /* RcvArray */ + for (i = 0; i < dd->chip_rcv_array_count; i++) + write_csr(dd, RCV_ARRAY + (8 * i), + RCV_ARRAY_RT_WRITE_ENABLE_SMASK); + + /* RcvQPMapTable */ + for (i = 0; i < 32; i++) + write_csr(dd, RCV_QP_MAP_TABLE + (8 * i), 0); +} + +/* + * Use the ctrl_bits in CceCtrl to clear the status_bits in CceStatus. + */ +static void clear_cce_status(struct hfi1_devdata *dd, u64 status_bits, + u64 ctrl_bits) +{ + unsigned long timeout; + u64 reg; + + /* is the condition present? */ + reg = read_csr(dd, CCE_STATUS); + if ((reg & status_bits) == 0) + return; + + /* clear the condition */ + write_csr(dd, CCE_CTRL, ctrl_bits); + + /* wait for the condition to clear */ + timeout = jiffies + msecs_to_jiffies(CCE_STATUS_TIMEOUT); + while (1) { + reg = read_csr(dd, CCE_STATUS); + if ((reg & status_bits) == 0) + return; + if (time_after(jiffies, timeout)) { + dd_dev_err(dd, + "Timeout waiting for CceStatus to clear bits 0x%llx, remaining 0x%llx\n", + status_bits, reg & status_bits); + return; + } + udelay(1); + } +} + +/* set CCE CSRs to chip reset defaults */ +static void reset_cce_csrs(struct hfi1_devdata *dd) +{ + int i; + + /* CCE_REVISION read-only */ + /* CCE_REVISION2 read-only */ + /* CCE_CTRL - bits clear automatically */ + /* CCE_STATUS read-only, use CceCtrl to clear */ + clear_cce_status(dd, ALL_FROZE, CCE_CTRL_SPC_UNFREEZE_SMASK); + clear_cce_status(dd, ALL_TXE_PAUSE, CCE_CTRL_TXE_RESUME_SMASK); + clear_cce_status(dd, ALL_RXE_PAUSE, CCE_CTRL_RXE_RESUME_SMASK); + for (i = 0; i < CCE_NUM_SCRATCH; i++) + write_csr(dd, CCE_SCRATCH + (8 * i), 0); + /* CCE_ERR_STATUS read-only */ + write_csr(dd, CCE_ERR_MASK, 0); + write_csr(dd, CCE_ERR_CLEAR, ~0ull); + /* CCE_ERR_FORCE leave alone */ + for (i = 0; i < CCE_NUM_32_BIT_COUNTERS; i++) + write_csr(dd, CCE_COUNTER_ARRAY32 + (8 * i), 0); + write_csr(dd, CCE_DC_CTRL, CCE_DC_CTRL_RESETCSR); + /* CCE_PCIE_CTRL leave alone */ + for (i = 0; i < CCE_NUM_MSIX_VECTORS; i++) { + write_csr(dd, CCE_MSIX_TABLE_LOWER + (8 * i), 0); + write_csr(dd, CCE_MSIX_TABLE_UPPER + (8 * i), + CCE_MSIX_TABLE_UPPER_RESETCSR); + } + for (i = 0; i < CCE_NUM_MSIX_PBAS; i++) { + /* CCE_MSIX_PBA read-only */ + write_csr(dd, CCE_MSIX_INT_GRANTED, ~0ull); + write_csr(dd, CCE_MSIX_VEC_CLR_WITHOUT_INT, ~0ull); + } + for (i = 0; i < CCE_NUM_INT_MAP_CSRS; i++) + write_csr(dd, CCE_INT_MAP, 0); + for (i = 0; i < CCE_NUM_INT_CSRS; i++) { + /* CCE_INT_STATUS read-only */ + write_csr(dd, CCE_INT_MASK + (8 * i), 0); + write_csr(dd, CCE_INT_CLEAR + (8 * i), ~0ull); + /* CCE_INT_FORCE leave alone */ + /* CCE_INT_BLOCKED read-only */ + } + for (i = 0; i < CCE_NUM_32_BIT_INT_COUNTERS; i++) + write_csr(dd, CCE_INT_COUNTER_ARRAY32 + (8 * i), 0); +} + +/* set MISC CSRs to chip reset defaults */ +static void reset_misc_csrs(struct hfi1_devdata *dd) +{ + int i; + + for (i = 0; i < 32; i++) { + write_csr(dd, MISC_CFG_RSA_R2 + (8 * i), 0); + write_csr(dd, MISC_CFG_RSA_SIGNATURE + (8 * i), 0); + write_csr(dd, MISC_CFG_RSA_MODULUS + (8 * i), 0); + } + /* + * MISC_CFG_SHA_PRELOAD leave alone - always reads 0 and can + * only be written 128-byte chunks + */ + /* init RSA engine to clear lingering errors */ + write_csr(dd, MISC_CFG_RSA_CMD, 1); + write_csr(dd, MISC_CFG_RSA_MU, 0); + write_csr(dd, MISC_CFG_FW_CTRL, 0); + /* MISC_STS_8051_DIGEST read-only */ + /* MISC_STS_SBM_DIGEST read-only */ + /* MISC_STS_PCIE_DIGEST read-only */ + /* MISC_STS_FAB_DIGEST read-only */ + /* MISC_ERR_STATUS read-only */ + write_csr(dd, MISC_ERR_MASK, 0); + write_csr(dd, MISC_ERR_CLEAR, ~0ull); + /* MISC_ERR_FORCE leave alone */ +} + +/* set TXE CSRs to chip reset defaults */ +static void reset_txe_csrs(struct hfi1_devdata *dd) +{ + int i; + + /* + * TXE Kernel CSRs + */ + write_csr(dd, SEND_CTRL, 0); + __cm_reset(dd, 0); /* reset CM internal state */ + /* SEND_CONTEXTS read-only */ + /* SEND_DMA_ENGINES read-only */ + /* SEND_PIO_MEM_SIZE read-only */ + /* SEND_DMA_MEM_SIZE read-only */ + write_csr(dd, SEND_HIGH_PRIORITY_LIMIT, 0); + pio_reset_all(dd); /* SEND_PIO_INIT_CTXT */ + /* SEND_PIO_ERR_STATUS read-only */ + write_csr(dd, SEND_PIO_ERR_MASK, 0); + write_csr(dd, SEND_PIO_ERR_CLEAR, ~0ull); + /* SEND_PIO_ERR_FORCE leave alone */ + /* SEND_DMA_ERR_STATUS read-only */ + write_csr(dd, SEND_DMA_ERR_MASK, 0); + write_csr(dd, SEND_DMA_ERR_CLEAR, ~0ull); + /* SEND_DMA_ERR_FORCE leave alone */ + /* SEND_EGRESS_ERR_STATUS read-only */ + write_csr(dd, SEND_EGRESS_ERR_MASK, 0); + write_csr(dd, SEND_EGRESS_ERR_CLEAR, ~0ull); + /* SEND_EGRESS_ERR_FORCE leave alone */ + write_csr(dd, SEND_BTH_QP, 0); + write_csr(dd, SEND_STATIC_RATE_CONTROL, 0); + write_csr(dd, SEND_SC2VLT0, 0); + write_csr(dd, SEND_SC2VLT1, 0); + write_csr(dd, SEND_SC2VLT2, 0); + write_csr(dd, SEND_SC2VLT3, 0); + write_csr(dd, SEND_LEN_CHECK0, 0); + write_csr(dd, SEND_LEN_CHECK1, 0); + /* SEND_ERR_STATUS read-only */ + write_csr(dd, SEND_ERR_MASK, 0); + write_csr(dd, SEND_ERR_CLEAR, ~0ull); + /* SEND_ERR_FORCE read-only */ + for (i = 0; i < VL_ARB_LOW_PRIO_TABLE_SIZE; i++) + write_csr(dd, SEND_LOW_PRIORITY_LIST + (8 * i), 0); + for (i = 0; i < VL_ARB_HIGH_PRIO_TABLE_SIZE; i++) + write_csr(dd, SEND_HIGH_PRIORITY_LIST + (8 * i), 0); + for (i = 0; i < dd->chip_send_contexts / NUM_CONTEXTS_PER_SET; i++) + write_csr(dd, SEND_CONTEXT_SET_CTRL + (8 * i), 0); + for (i = 0; i < TXE_NUM_32_BIT_COUNTER; i++) + write_csr(dd, SEND_COUNTER_ARRAY32 + (8 * i), 0); + for (i = 0; i < TXE_NUM_64_BIT_COUNTER; i++) + write_csr(dd, SEND_COUNTER_ARRAY64 + (8 * i), 0); + write_csr(dd, SEND_CM_CTRL, SEND_CM_CTRL_RESETCSR); + write_csr(dd, SEND_CM_GLOBAL_CREDIT, SEND_CM_GLOBAL_CREDIT_RESETCSR); + /* SEND_CM_CREDIT_USED_STATUS read-only */ + write_csr(dd, SEND_CM_TIMER_CTRL, 0); + write_csr(dd, SEND_CM_LOCAL_AU_TABLE0_TO3, 0); + write_csr(dd, SEND_CM_LOCAL_AU_TABLE4_TO7, 0); + write_csr(dd, SEND_CM_REMOTE_AU_TABLE0_TO3, 0); + write_csr(dd, SEND_CM_REMOTE_AU_TABLE4_TO7, 0); + for (i = 0; i < TXE_NUM_DATA_VL; i++) + write_csr(dd, SEND_CM_CREDIT_VL + (8 * i), 0); + write_csr(dd, SEND_CM_CREDIT_VL15, 0); + /* SEND_CM_CREDIT_USED_VL read-only */ + /* SEND_CM_CREDIT_USED_VL15 read-only */ + /* SEND_EGRESS_CTXT_STATUS read-only */ + /* SEND_EGRESS_SEND_DMA_STATUS read-only */ + write_csr(dd, SEND_EGRESS_ERR_INFO, ~0ull); + /* SEND_EGRESS_ERR_INFO read-only */ + /* SEND_EGRESS_ERR_SOURCE read-only */ + + /* + * TXE Per-Context CSRs + */ + for (i = 0; i < dd->chip_send_contexts; i++) { + write_kctxt_csr(dd, i, SEND_CTXT_CTRL, 0); + write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_CTRL, 0); + write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_RETURN_ADDR, 0); + write_kctxt_csr(dd, i, SEND_CTXT_CREDIT_FORCE, 0); + write_kctxt_csr(dd, i, SEND_CTXT_ERR_MASK, 0); + write_kctxt_csr(dd, i, SEND_CTXT_ERR_CLEAR, ~0ull); + write_kctxt_csr(dd, i, SEND_CTXT_CHECK_ENABLE, 0); + write_kctxt_csr(dd, i, SEND_CTXT_CHECK_VL, 0); + write_kctxt_csr(dd, i, SEND_CTXT_CHECK_JOB_KEY, 0); + write_kctxt_csr(dd, i, SEND_CTXT_CHECK_PARTITION_KEY, 0); + write_kctxt_csr(dd, i, SEND_CTXT_CHECK_SLID, 0); + write_kctxt_csr(dd, i, SEND_CTXT_CHECK_OPCODE, 0); + } + + /* + * TXE Per-SDMA CSRs + */ + for (i = 0; i < dd->chip_sdma_engines; i++) { + write_kctxt_csr(dd, i, SEND_DMA_CTRL, 0); + /* SEND_DMA_STATUS read-only */ + write_kctxt_csr(dd, i, SEND_DMA_BASE_ADDR, 0); + write_kctxt_csr(dd, i, SEND_DMA_LEN_GEN, 0); + write_kctxt_csr(dd, i, SEND_DMA_TAIL, 0); + /* SEND_DMA_HEAD read-only */ + write_kctxt_csr(dd, i, SEND_DMA_HEAD_ADDR, 0); + write_kctxt_csr(dd, i, SEND_DMA_PRIORITY_THLD, 0); + /* SEND_DMA_IDLE_CNT read-only */ + write_kctxt_csr(dd, i, SEND_DMA_RELOAD_CNT, 0); + write_kctxt_csr(dd, i, SEND_DMA_DESC_CNT, 0); + /* SEND_DMA_DESC_FETCHED_CNT read-only */ + /* SEND_DMA_ENG_ERR_STATUS read-only */ + write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_MASK, 0); + write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_CLEAR, ~0ull); + /* SEND_DMA_ENG_ERR_FORCE leave alone */ + write_kctxt_csr(dd, i, SEND_DMA_CHECK_ENABLE, 0); + write_kctxt_csr(dd, i, SEND_DMA_CHECK_VL, 0); + write_kctxt_csr(dd, i, SEND_DMA_CHECK_JOB_KEY, 0); + write_kctxt_csr(dd, i, SEND_DMA_CHECK_PARTITION_KEY, 0); + write_kctxt_csr(dd, i, SEND_DMA_CHECK_SLID, 0); + write_kctxt_csr(dd, i, SEND_DMA_CHECK_OPCODE, 0); + write_kctxt_csr(dd, i, SEND_DMA_MEMORY, 0); + } +} + +/* + * Expect on entry: + * o Packet ingress is disabled, i.e. RcvCtrl.RcvPortEnable == 0 + */ +static void init_rbufs(struct hfi1_devdata *dd) +{ + u64 reg; + int count; + + /* + * Wait for DMA to stop: RxRbufPktPending and RxPktInProgress are + * clear. + */ + count = 0; + while (1) { + reg = read_csr(dd, RCV_STATUS); + if ((reg & (RCV_STATUS_RX_RBUF_PKT_PENDING_SMASK + | RCV_STATUS_RX_PKT_IN_PROGRESS_SMASK)) == 0) + break; + /* + * Give up after 1ms - maximum wait time. + * + * RBuf size is 148KiB. Slowest possible is PCIe Gen1 x1 at + * 250MB/s bandwidth. Lower rate to 66% for overhead to get: + * 148 KB / (66% * 250MB/s) = 920us + */ + if (count++ > 500) { + dd_dev_err(dd, + "%s: in-progress DMA not clearing: RcvStatus 0x%llx, continuing\n", + __func__, reg); + break; + } + udelay(2); /* do not busy-wait the CSR */ + } + + /* start the init - expect RcvCtrl to be 0 */ + write_csr(dd, RCV_CTRL, RCV_CTRL_RX_RBUF_INIT_SMASK); + + /* + * Read to force the write of Rcvtrl.RxRbufInit. There is a brief + * period after the write before RcvStatus.RxRbufInitDone is valid. + * The delay in the first run through the loop below is sufficient and + * required before the first read of RcvStatus.RxRbufInintDone. + */ + read_csr(dd, RCV_CTRL); + + /* wait for the init to finish */ + count = 0; + while (1) { + /* delay is required first time through - see above */ + udelay(2); /* do not busy-wait the CSR */ + reg = read_csr(dd, RCV_STATUS); + if (reg & (RCV_STATUS_RX_RBUF_INIT_DONE_SMASK)) + break; + + /* give up after 100us - slowest possible at 33MHz is 73us */ + if (count++ > 50) { + dd_dev_err(dd, + "%s: RcvStatus.RxRbufInit not set, continuing\n", + __func__); + break; + } + } +} + +/* set RXE CSRs to chip reset defaults */ +static void reset_rxe_csrs(struct hfi1_devdata *dd) +{ + int i, j; + + /* + * RXE Kernel CSRs + */ + write_csr(dd, RCV_CTRL, 0); + init_rbufs(dd); + /* RCV_STATUS read-only */ + /* RCV_CONTEXTS read-only */ + /* RCV_ARRAY_CNT read-only */ + /* RCV_BUF_SIZE read-only */ + write_csr(dd, RCV_BTH_QP, 0); + write_csr(dd, RCV_MULTICAST, 0); + write_csr(dd, RCV_BYPASS, 0); + write_csr(dd, RCV_VL15, 0); + /* this is a clear-down */ + write_csr(dd, RCV_ERR_INFO, + RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK); + /* RCV_ERR_STATUS read-only */ + write_csr(dd, RCV_ERR_MASK, 0); + write_csr(dd, RCV_ERR_CLEAR, ~0ull); + /* RCV_ERR_FORCE leave alone */ + for (i = 0; i < 32; i++) + write_csr(dd, RCV_QP_MAP_TABLE + (8 * i), 0); + for (i = 0; i < 4; i++) + write_csr(dd, RCV_PARTITION_KEY + (8 * i), 0); + for (i = 0; i < RXE_NUM_32_BIT_COUNTERS; i++) + write_csr(dd, RCV_COUNTER_ARRAY32 + (8 * i), 0); + for (i = 0; i < RXE_NUM_64_BIT_COUNTERS; i++) + write_csr(dd, RCV_COUNTER_ARRAY64 + (8 * i), 0); + for (i = 0; i < RXE_NUM_RSM_INSTANCES; i++) { + write_csr(dd, RCV_RSM_CFG + (8 * i), 0); + write_csr(dd, RCV_RSM_SELECT + (8 * i), 0); + write_csr(dd, RCV_RSM_MATCH + (8 * i), 0); + } + for (i = 0; i < 32; i++) + write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), 0); + + /* + * RXE Kernel and User Per-Context CSRs + */ + for (i = 0; i < dd->chip_rcv_contexts; i++) { + /* kernel */ + write_kctxt_csr(dd, i, RCV_CTXT_CTRL, 0); + /* RCV_CTXT_STATUS read-only */ + write_kctxt_csr(dd, i, RCV_EGR_CTRL, 0); + write_kctxt_csr(dd, i, RCV_TID_CTRL, 0); + write_kctxt_csr(dd, i, RCV_KEY_CTRL, 0); + write_kctxt_csr(dd, i, RCV_HDR_ADDR, 0); + write_kctxt_csr(dd, i, RCV_HDR_CNT, 0); + write_kctxt_csr(dd, i, RCV_HDR_ENT_SIZE, 0); + write_kctxt_csr(dd, i, RCV_HDR_SIZE, 0); + write_kctxt_csr(dd, i, RCV_HDR_TAIL_ADDR, 0); + write_kctxt_csr(dd, i, RCV_AVAIL_TIME_OUT, 0); + write_kctxt_csr(dd, i, RCV_HDR_OVFL_CNT, 0); + + /* user */ + /* RCV_HDR_TAIL read-only */ + write_uctxt_csr(dd, i, RCV_HDR_HEAD, 0); + /* RCV_EGR_INDEX_TAIL read-only */ + write_uctxt_csr(dd, i, RCV_EGR_INDEX_HEAD, 0); + /* RCV_EGR_OFFSET_TAIL read-only */ + for (j = 0; j < RXE_NUM_TID_FLOWS; j++) { + write_uctxt_csr(dd, i, + RCV_TID_FLOW_TABLE + (8 * j), 0); + } + } +} + +/* + * Set sc2vl tables. + * + * They power on to zeros, so to avoid send context errors + * they need to be set: + * + * SC 0-7 -> VL 0-7 (respectively) + * SC 15 -> VL 15 + * otherwise + * -> VL 0 + */ +static void init_sc2vl_tables(struct hfi1_devdata *dd) +{ + int i; + /* init per architecture spec, constrained by hardware capability */ + + /* HFI maps sent packets */ + write_csr(dd, SEND_SC2VLT0, SC2VL_VAL( + 0, + 0, 0, 1, 1, + 2, 2, 3, 3, + 4, 4, 5, 5, + 6, 6, 7, 7)); + write_csr(dd, SEND_SC2VLT1, SC2VL_VAL( + 1, + 8, 0, 9, 0, + 10, 0, 11, 0, + 12, 0, 13, 0, + 14, 0, 15, 15)); + write_csr(dd, SEND_SC2VLT2, SC2VL_VAL( + 2, + 16, 0, 17, 0, + 18, 0, 19, 0, + 20, 0, 21, 0, + 22, 0, 23, 0)); + write_csr(dd, SEND_SC2VLT3, SC2VL_VAL( + 3, + 24, 0, 25, 0, + 26, 0, 27, 0, + 28, 0, 29, 0, + 30, 0, 31, 0)); + + /* DC maps received packets */ + write_csr(dd, DCC_CFG_SC_VL_TABLE_15_0, DC_SC_VL_VAL( + 15_0, + 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, + 8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15, 15)); + write_csr(dd, DCC_CFG_SC_VL_TABLE_31_16, DC_SC_VL_VAL( + 31_16, + 16, 0, 17, 0, 18, 0, 19, 0, 20, 0, 21, 0, 22, 0, 23, 0, + 24, 0, 25, 0, 26, 0, 27, 0, 28, 0, 29, 0, 30, 0, 31, 0)); + + /* initialize the cached sc2vl values consistently with h/w */ + for (i = 0; i < 32; i++) { + if (i < 8 || i == 15) + *((u8 *)(dd->sc2vl) + i) = (u8)i; + else + *((u8 *)(dd->sc2vl) + i) = 0; + } +} + +/* + * Read chip sizes and then reset parts to sane, disabled, values. We cannot + * depend on the chip going through a power-on reset - a driver may be loaded + * and unloaded many times. + * + * Do not write any CSR values to the chip in this routine - there may be + * a reset following the (possible) FLR in this routine. + * + */ +static void init_chip(struct hfi1_devdata *dd) +{ + int i; + + /* + * Put the HFI CSRs in a known state. + * Combine this with a DC reset. + * + * Stop the device from doing anything while we do a + * reset. We know there are no other active users of + * the device since we are now in charge. Turn off + * off all outbound and inbound traffic and make sure + * the device does not generate any interrupts. + */ + + /* disable send contexts and SDMA engines */ + write_csr(dd, SEND_CTRL, 0); + for (i = 0; i < dd->chip_send_contexts; i++) + write_kctxt_csr(dd, i, SEND_CTXT_CTRL, 0); + for (i = 0; i < dd->chip_sdma_engines; i++) + write_kctxt_csr(dd, i, SEND_DMA_CTRL, 0); + /* disable port (turn off RXE inbound traffic) and contexts */ + write_csr(dd, RCV_CTRL, 0); + for (i = 0; i < dd->chip_rcv_contexts; i++) + write_csr(dd, RCV_CTXT_CTRL, 0); + /* mask all interrupt sources */ + for (i = 0; i < CCE_NUM_INT_CSRS; i++) + write_csr(dd, CCE_INT_MASK + (8 * i), 0ull); + + /* + * DC Reset: do a full DC reset before the register clear. + * A recommended length of time to hold is one CSR read, + * so reread the CceDcCtrl. Then, hold the DC in reset + * across the clear. + */ + write_csr(dd, CCE_DC_CTRL, CCE_DC_CTRL_DC_RESET_SMASK); + (void)read_csr(dd, CCE_DC_CTRL); + + if (use_flr) { + /* + * A FLR will reset the SPC core and part of the PCIe. + * The parts that need to be restored have already been + * saved. + */ + dd_dev_info(dd, "Resetting CSRs with FLR\n"); + + /* do the FLR, the DC reset will remain */ + hfi1_pcie_flr(dd); + + /* restore command and BARs */ + restore_pci_variables(dd); + + if (is_ax(dd)) { + dd_dev_info(dd, "Resetting CSRs with FLR\n"); + hfi1_pcie_flr(dd); + restore_pci_variables(dd); + } + } else { + dd_dev_info(dd, "Resetting CSRs with writes\n"); + reset_cce_csrs(dd); + reset_txe_csrs(dd); + reset_rxe_csrs(dd); + reset_misc_csrs(dd); + } + /* clear the DC reset */ + write_csr(dd, CCE_DC_CTRL, 0); + + /* Set the LED off */ + setextled(dd, 0); + + /* + * Clear the QSFP reset. + * An FLR enforces a 0 on all out pins. The driver does not touch + * ASIC_QSFPn_OUT otherwise. This leaves RESET_N low and + * anything plugged constantly in reset, if it pays attention + * to RESET_N. + * Prime examples of this are optical cables. Set all pins high. + * I2CCLK and I2CDAT will change per direction, and INT_N and + * MODPRS_N are input only and their value is ignored. + */ + write_csr(dd, ASIC_QSFP1_OUT, 0x1f); + write_csr(dd, ASIC_QSFP2_OUT, 0x1f); + init_chip_resources(dd); +} + +static void init_early_variables(struct hfi1_devdata *dd) +{ + int i; + + /* assign link credit variables */ + dd->vau = CM_VAU; + dd->link_credits = CM_GLOBAL_CREDITS; + if (is_ax(dd)) + dd->link_credits--; + dd->vcu = cu_to_vcu(hfi1_cu); + /* enough room for 8 MAD packets plus header - 17K */ + dd->vl15_init = (8 * (2048 + 128)) / vau_to_au(dd->vau); + if (dd->vl15_init > dd->link_credits) + dd->vl15_init = dd->link_credits; + + write_uninitialized_csrs_and_memories(dd); + + if (HFI1_CAP_IS_KSET(PKEY_CHECK)) + for (i = 0; i < dd->num_pports; i++) { + struct hfi1_pportdata *ppd = &dd->pport[i]; + + set_partition_keys(ppd); + } + init_sc2vl_tables(dd); +} + +static void init_kdeth_qp(struct hfi1_devdata *dd) +{ + /* user changed the KDETH_QP */ + if (kdeth_qp != 0 && kdeth_qp >= 0xff) { + /* out of range or illegal value */ + dd_dev_err(dd, "Invalid KDETH queue pair prefix, ignoring"); + kdeth_qp = 0; + } + if (kdeth_qp == 0) /* not set, or failed range check */ + kdeth_qp = DEFAULT_KDETH_QP; + + write_csr(dd, SEND_BTH_QP, + (kdeth_qp & SEND_BTH_QP_KDETH_QP_MASK) << + SEND_BTH_QP_KDETH_QP_SHIFT); + + write_csr(dd, RCV_BTH_QP, + (kdeth_qp & RCV_BTH_QP_KDETH_QP_MASK) << + RCV_BTH_QP_KDETH_QP_SHIFT); +} + +/** + * init_qpmap_table + * @dd - device data + * @first_ctxt - first context + * @last_ctxt - first context + * + * This return sets the qpn mapping table that + * is indexed by qpn[8:1]. + * + * The routine will round robin the 256 settings + * from first_ctxt to last_ctxt. + * + * The first/last looks ahead to having specialized + * receive contexts for mgmt and bypass. Normal + * verbs traffic will assumed to be on a range + * of receive contexts. + */ +static void init_qpmap_table(struct hfi1_devdata *dd, + u32 first_ctxt, + u32 last_ctxt) +{ + u64 reg = 0; + u64 regno = RCV_QP_MAP_TABLE; + int i; + u64 ctxt = first_ctxt; + + for (i = 0; i < 256; i++) { + reg |= ctxt << (8 * (i % 8)); + ctxt++; + if (ctxt > last_ctxt) + ctxt = first_ctxt; + if (i % 8 == 7) { + write_csr(dd, regno, reg); + reg = 0; + regno += 8; + } + } + + add_rcvctrl(dd, RCV_CTRL_RCV_QP_MAP_ENABLE_SMASK + | RCV_CTRL_RCV_BYPASS_ENABLE_SMASK); +} + +struct rsm_map_table { + u64 map[NUM_MAP_REGS]; + unsigned int used; +}; + +struct rsm_rule_data { + u8 offset; + u8 pkt_type; + u32 field1_off; + u32 field2_off; + u32 index1_off; + u32 index1_width; + u32 index2_off; + u32 index2_width; + u32 mask1; + u32 value1; + u32 mask2; + u32 value2; +}; + +/* + * Return an initialized RMT map table for users to fill in. OK if it + * returns NULL, indicating no table. + */ +static struct rsm_map_table *alloc_rsm_map_table(struct hfi1_devdata *dd) +{ + struct rsm_map_table *rmt; + u8 rxcontext = is_ax(dd) ? 0 : 0xff; /* 0 is default if a0 ver. */ + + rmt = kmalloc(sizeof(*rmt), GFP_KERNEL); + if (rmt) { + memset(rmt->map, rxcontext, sizeof(rmt->map)); + rmt->used = 0; + } + + return rmt; +} + +/* + * Write the final RMT map table to the chip and free the table. OK if + * table is NULL. + */ +static void complete_rsm_map_table(struct hfi1_devdata *dd, + struct rsm_map_table *rmt) +{ + int i; + + if (rmt) { + /* write table to chip */ + for (i = 0; i < NUM_MAP_REGS; i++) + write_csr(dd, RCV_RSM_MAP_TABLE + (8 * i), rmt->map[i]); + + /* enable RSM */ + add_rcvctrl(dd, RCV_CTRL_RCV_RSM_ENABLE_SMASK); + } +} + +/* + * Add a receive side mapping rule. + */ +static void add_rsm_rule(struct hfi1_devdata *dd, u8 rule_index, + struct rsm_rule_data *rrd) +{ + write_csr(dd, RCV_RSM_CFG + (8 * rule_index), + (u64)rrd->offset << RCV_RSM_CFG_OFFSET_SHIFT | + 1ull << rule_index | /* enable bit */ + (u64)rrd->pkt_type << RCV_RSM_CFG_PACKET_TYPE_SHIFT); + write_csr(dd, RCV_RSM_SELECT + (8 * rule_index), + (u64)rrd->field1_off << RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT | + (u64)rrd->field2_off << RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT | + (u64)rrd->index1_off << RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT | + (u64)rrd->index1_width << RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT | + (u64)rrd->index2_off << RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT | + (u64)rrd->index2_width << RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT); + write_csr(dd, RCV_RSM_MATCH + (8 * rule_index), + (u64)rrd->mask1 << RCV_RSM_MATCH_MASK1_SHIFT | + (u64)rrd->value1 << RCV_RSM_MATCH_VALUE1_SHIFT | + (u64)rrd->mask2 << RCV_RSM_MATCH_MASK2_SHIFT | + (u64)rrd->value2 << RCV_RSM_MATCH_VALUE2_SHIFT); +} + +/* return the number of RSM map table entries that will be used for QOS */ +static int qos_rmt_entries(struct hfi1_devdata *dd, unsigned int *mp, + unsigned int *np) +{ + int i; + unsigned int m, n; + u8 max_by_vl = 0; + + /* is QOS active at all? */ + if (dd->n_krcv_queues <= MIN_KERNEL_KCTXTS || + num_vls == 1 || + krcvqsset <= 1) + goto no_qos; + + /* determine bits for qpn */ + for (i = 0; i < min_t(unsigned int, num_vls, krcvqsset); i++) + if (krcvqs[i] > max_by_vl) + max_by_vl = krcvqs[i]; + if (max_by_vl > 32) + goto no_qos; + m = ilog2(__roundup_pow_of_two(max_by_vl)); + + /* determine bits for vl */ + n = ilog2(__roundup_pow_of_two(num_vls)); + + /* reject if too much is used */ + if ((m + n) > 7) + goto no_qos; + + if (mp) + *mp = m; + if (np) + *np = n; + + return 1 << (m + n); + +no_qos: + if (mp) + *mp = 0; + if (np) + *np = 0; + return 0; +} + +/** + * init_qos - init RX qos + * @dd - device data + * @rmt - RSM map table + * + * This routine initializes Rule 0 and the RSM map table to implement + * quality of service (qos). + * + * If all of the limit tests succeed, qos is applied based on the array + * interpretation of krcvqs where entry 0 is VL0. + * + * The number of vl bits (n) and the number of qpn bits (m) are computed to + * feed both the RSM map table and the single rule. + */ +static void init_qos(struct hfi1_devdata *dd, struct rsm_map_table *rmt) +{ + struct rsm_rule_data rrd; + unsigned qpns_per_vl, ctxt, i, qpn, n = 1, m; + unsigned int rmt_entries; + u64 reg; + + if (!rmt) + goto bail; + rmt_entries = qos_rmt_entries(dd, &m, &n); + if (rmt_entries == 0) + goto bail; + qpns_per_vl = 1 << m; + + /* enough room in the map table? */ + rmt_entries = 1 << (m + n); + if (rmt->used + rmt_entries >= NUM_MAP_ENTRIES) + goto bail; + + /* add qos entries to the the RSM map table */ + for (i = 0, ctxt = FIRST_KERNEL_KCTXT; i < num_vls; i++) { + unsigned tctxt; + + for (qpn = 0, tctxt = ctxt; + krcvqs[i] && qpn < qpns_per_vl; qpn++) { + unsigned idx, regoff, regidx; + + /* generate the index the hardware will produce */ + idx = rmt->used + ((qpn << n) ^ i); + regoff = (idx % 8) * 8; + regidx = idx / 8; + /* replace default with context number */ + reg = rmt->map[regidx]; + reg &= ~(RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK + << regoff); + reg |= (u64)(tctxt++) << regoff; + rmt->map[regidx] = reg; + if (tctxt == ctxt + krcvqs[i]) + tctxt = ctxt; + } + ctxt += krcvqs[i]; + } + + rrd.offset = rmt->used; + rrd.pkt_type = 2; + rrd.field1_off = LRH_BTH_MATCH_OFFSET; + rrd.field2_off = LRH_SC_MATCH_OFFSET; + rrd.index1_off = LRH_SC_SELECT_OFFSET; + rrd.index1_width = n; + rrd.index2_off = QPN_SELECT_OFFSET; + rrd.index2_width = m + n; + rrd.mask1 = LRH_BTH_MASK; + rrd.value1 = LRH_BTH_VALUE; + rrd.mask2 = LRH_SC_MASK; + rrd.value2 = LRH_SC_VALUE; + + /* add rule 0 */ + add_rsm_rule(dd, 0, &rrd); + + /* mark RSM map entries as used */ + rmt->used += rmt_entries; + /* map everything else to the mcast/err/vl15 context */ + init_qpmap_table(dd, HFI1_CTRL_CTXT, HFI1_CTRL_CTXT); + dd->qos_shift = n + 1; + return; +bail: + dd->qos_shift = 1; + init_qpmap_table(dd, FIRST_KERNEL_KCTXT, dd->n_krcv_queues - 1); +} + +static void init_user_fecn_handling(struct hfi1_devdata *dd, + struct rsm_map_table *rmt) +{ + struct rsm_rule_data rrd; + u64 reg; + int i, idx, regoff, regidx; + u8 offset; + + /* there needs to be enough room in the map table */ + if (rmt->used + dd->num_user_contexts >= NUM_MAP_ENTRIES) { + dd_dev_err(dd, "User FECN handling disabled - too many user contexts allocated\n"); + return; + } + + /* + * RSM will extract the destination context as an index into the + * map table. The destination contexts are a sequential block + * in the range first_user_ctxt...num_rcv_contexts-1 (inclusive). + * Map entries are accessed as offset + extracted value. Adjust + * the added offset so this sequence can be placed anywhere in + * the table - as long as the entries themselves do not wrap. + * There are only enough bits in offset for the table size, so + * start with that to allow for a "negative" offset. + */ + offset = (u8)(NUM_MAP_ENTRIES + (int)rmt->used - + (int)dd->first_user_ctxt); + + for (i = dd->first_user_ctxt, idx = rmt->used; + i < dd->num_rcv_contexts; i++, idx++) { + /* replace with identity mapping */ + regoff = (idx % 8) * 8; + regidx = idx / 8; + reg = rmt->map[regidx]; + reg &= ~(RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK << regoff); + reg |= (u64)i << regoff; + rmt->map[regidx] = reg; + } + + /* + * For RSM intercept of Expected FECN packets: + * o packet type 0 - expected + * o match on F (bit 95), using select/match 1, and + * o match on SH (bit 133), using select/match 2. + * + * Use index 1 to extract the 8-bit receive context from DestQP + * (start at bit 64). Use that as the RSM map table index. + */ + rrd.offset = offset; + rrd.pkt_type = 0; + rrd.field1_off = 95; + rrd.field2_off = 133; + rrd.index1_off = 64; + rrd.index1_width = 8; + rrd.index2_off = 0; + rrd.index2_width = 0; + rrd.mask1 = 1; + rrd.value1 = 1; + rrd.mask2 = 1; + rrd.value2 = 1; + + /* add rule 1 */ + add_rsm_rule(dd, 1, &rrd); + + rmt->used += dd->num_user_contexts; +} + +static void init_rxe(struct hfi1_devdata *dd) +{ + struct rsm_map_table *rmt; + + /* enable all receive errors */ + write_csr(dd, RCV_ERR_MASK, ~0ull); + + rmt = alloc_rsm_map_table(dd); + /* set up QOS, including the QPN map table */ + init_qos(dd, rmt); + init_user_fecn_handling(dd, rmt); + complete_rsm_map_table(dd, rmt); + kfree(rmt); + + /* + * make sure RcvCtrl.RcvWcb <= PCIe Device Control + * Register Max_Payload_Size (PCI_EXP_DEVCTL in Linux PCIe config + * space, PciCfgCap2.MaxPayloadSize in HFI). There is only one + * invalid configuration: RcvCtrl.RcvWcb set to its max of 256 and + * Max_PayLoad_Size set to its minimum of 128. + * + * Presently, RcvCtrl.RcvWcb is not modified from its default of 0 + * (64 bytes). Max_Payload_Size is possibly modified upward in + * tune_pcie_caps() which is called after this routine. + */ +} + +static void init_other(struct hfi1_devdata *dd) +{ + /* enable all CCE errors */ + write_csr(dd, CCE_ERR_MASK, ~0ull); + /* enable *some* Misc errors */ + write_csr(dd, MISC_ERR_MASK, DRIVER_MISC_MASK); + /* enable all DC errors, except LCB */ + write_csr(dd, DCC_ERR_FLG_EN, ~0ull); + write_csr(dd, DC_DC8051_ERR_EN, ~0ull); +} + +/* + * Fill out the given AU table using the given CU. A CU is defined in terms + * AUs. The table is a an encoding: given the index, how many AUs does that + * represent? + * + * NOTE: Assumes that the register layout is the same for the + * local and remote tables. + */ +static void assign_cm_au_table(struct hfi1_devdata *dd, u32 cu, + u32 csr0to3, u32 csr4to7) +{ + write_csr(dd, csr0to3, + 0ull << SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE0_SHIFT | + 1ull << SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE1_SHIFT | + 2ull * cu << + SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE2_SHIFT | + 4ull * cu << + SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE3_SHIFT); + write_csr(dd, csr4to7, + 8ull * cu << + SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE4_SHIFT | + 16ull * cu << + SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE5_SHIFT | + 32ull * cu << + SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE6_SHIFT | + 64ull * cu << + SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE7_SHIFT); +} + +static void assign_local_cm_au_table(struct hfi1_devdata *dd, u8 vcu) +{ + assign_cm_au_table(dd, vcu_to_cu(vcu), SEND_CM_LOCAL_AU_TABLE0_TO3, + SEND_CM_LOCAL_AU_TABLE4_TO7); +} + +void assign_remote_cm_au_table(struct hfi1_devdata *dd, u8 vcu) +{ + assign_cm_au_table(dd, vcu_to_cu(vcu), SEND_CM_REMOTE_AU_TABLE0_TO3, + SEND_CM_REMOTE_AU_TABLE4_TO7); +} + +static void init_txe(struct hfi1_devdata *dd) +{ + int i; + + /* enable all PIO, SDMA, general, and Egress errors */ + write_csr(dd, SEND_PIO_ERR_MASK, ~0ull); + write_csr(dd, SEND_DMA_ERR_MASK, ~0ull); + write_csr(dd, SEND_ERR_MASK, ~0ull); + write_csr(dd, SEND_EGRESS_ERR_MASK, ~0ull); + + /* enable all per-context and per-SDMA engine errors */ + for (i = 0; i < dd->chip_send_contexts; i++) + write_kctxt_csr(dd, i, SEND_CTXT_ERR_MASK, ~0ull); + for (i = 0; i < dd->chip_sdma_engines; i++) + write_kctxt_csr(dd, i, SEND_DMA_ENG_ERR_MASK, ~0ull); + + /* set the local CU to AU mapping */ + assign_local_cm_au_table(dd, dd->vcu); + + /* + * Set reasonable default for Credit Return Timer + * Don't set on Simulator - causes it to choke. + */ + if (dd->icode != ICODE_FUNCTIONAL_SIMULATOR) + write_csr(dd, SEND_CM_TIMER_CTRL, HFI1_CREDIT_RETURN_RATE); +} + +int hfi1_set_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt, u16 jkey) +{ + struct hfi1_ctxtdata *rcd = dd->rcd[ctxt]; + unsigned sctxt; + int ret = 0; + u64 reg; + + if (!rcd || !rcd->sc) { + ret = -EINVAL; + goto done; + } + sctxt = rcd->sc->hw_context; + reg = SEND_CTXT_CHECK_JOB_KEY_MASK_SMASK | /* mask is always 1's */ + ((jkey & SEND_CTXT_CHECK_JOB_KEY_VALUE_MASK) << + SEND_CTXT_CHECK_JOB_KEY_VALUE_SHIFT); + /* JOB_KEY_ALLOW_PERMISSIVE is not allowed by default */ + if (HFI1_CAP_KGET_MASK(rcd->flags, ALLOW_PERM_JKEY)) + reg |= SEND_CTXT_CHECK_JOB_KEY_ALLOW_PERMISSIVE_SMASK; + write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_JOB_KEY, reg); + /* + * Enable send-side J_KEY integrity check, unless this is A0 h/w + */ + if (!is_ax(dd)) { + reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE); + reg |= SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK; + write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg); + } + + /* Enable J_KEY check on receive context. */ + reg = RCV_KEY_CTRL_JOB_KEY_ENABLE_SMASK | + ((jkey & RCV_KEY_CTRL_JOB_KEY_VALUE_MASK) << + RCV_KEY_CTRL_JOB_KEY_VALUE_SHIFT); + write_kctxt_csr(dd, ctxt, RCV_KEY_CTRL, reg); +done: + return ret; +} + +int hfi1_clear_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt) +{ + struct hfi1_ctxtdata *rcd = dd->rcd[ctxt]; + unsigned sctxt; + int ret = 0; + u64 reg; + + if (!rcd || !rcd->sc) { + ret = -EINVAL; + goto done; + } + sctxt = rcd->sc->hw_context; + write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_JOB_KEY, 0); + /* + * Disable send-side J_KEY integrity check, unless this is A0 h/w. + * This check would not have been enabled for A0 h/w, see + * set_ctxt_jkey(). + */ + if (!is_ax(dd)) { + reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE); + reg &= ~SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK; + write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg); + } + /* Turn off the J_KEY on the receive side */ + write_kctxt_csr(dd, ctxt, RCV_KEY_CTRL, 0); +done: + return ret; +} + +int hfi1_set_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt, u16 pkey) +{ + struct hfi1_ctxtdata *rcd; + unsigned sctxt; + int ret = 0; + u64 reg; + + if (ctxt < dd->num_rcv_contexts) { + rcd = dd->rcd[ctxt]; + } else { + ret = -EINVAL; + goto done; + } + if (!rcd || !rcd->sc) { + ret = -EINVAL; + goto done; + } + sctxt = rcd->sc->hw_context; + reg = ((u64)pkey & SEND_CTXT_CHECK_PARTITION_KEY_VALUE_MASK) << + SEND_CTXT_CHECK_PARTITION_KEY_VALUE_SHIFT; + write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_PARTITION_KEY, reg); + reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE); + reg |= SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK; + reg &= ~SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK; + write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg); +done: + return ret; +} + +int hfi1_clear_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt) +{ + struct hfi1_ctxtdata *rcd; + unsigned sctxt; + int ret = 0; + u64 reg; + + if (ctxt < dd->num_rcv_contexts) { + rcd = dd->rcd[ctxt]; + } else { + ret = -EINVAL; + goto done; + } + if (!rcd || !rcd->sc) { + ret = -EINVAL; + goto done; + } + sctxt = rcd->sc->hw_context; + reg = read_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE); + reg &= ~SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK; + write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_ENABLE, reg); + write_kctxt_csr(dd, sctxt, SEND_CTXT_CHECK_PARTITION_KEY, 0); +done: + return ret; +} + +/* + * Start doing the clean up the the chip. Our clean up happens in multiple + * stages and this is just the first. + */ +void hfi1_start_cleanup(struct hfi1_devdata *dd) +{ + aspm_exit(dd); + free_cntrs(dd); + free_rcverr(dd); + clean_up_interrupts(dd); + finish_chip_resources(dd); +} + +#define HFI_BASE_GUID(dev) \ + ((dev)->base_guid & ~(1ULL << GUID_HFI_INDEX_SHIFT)) + +/* + * Information can be shared between the two HFIs on the same ASIC + * in the same OS. This function finds the peer device and sets + * up a shared structure. + */ +static int init_asic_data(struct hfi1_devdata *dd) +{ + unsigned long flags; + struct hfi1_devdata *tmp, *peer = NULL; + struct hfi1_asic_data *asic_data; + int ret = 0; + + /* pre-allocate the asic structure in case we are the first device */ + asic_data = kzalloc(sizeof(*dd->asic_data), GFP_KERNEL); + if (!asic_data) + return -ENOMEM; + + spin_lock_irqsave(&hfi1_devs_lock, flags); + /* Find our peer device */ + list_for_each_entry(tmp, &hfi1_dev_list, list) { + if ((HFI_BASE_GUID(dd) == HFI_BASE_GUID(tmp)) && + dd->unit != tmp->unit) { + peer = tmp; + break; + } + } + + if (peer) { + /* use already allocated structure */ + dd->asic_data = peer->asic_data; + kfree(asic_data); + } else { + dd->asic_data = asic_data; + mutex_init(&dd->asic_data->asic_resource_mutex); + } + dd->asic_data->dds[dd->hfi1_id] = dd; /* self back-pointer */ + spin_unlock_irqrestore(&hfi1_devs_lock, flags); + return ret; +} + +/* + * Set dd->boardname. Use a generic name if a name is not returned from + * EFI variable space. + * + * Return 0 on success, -ENOMEM if space could not be allocated. + */ +static int obtain_boardname(struct hfi1_devdata *dd) +{ + /* generic board description */ + const char generic[] = + "Intel Omni-Path Host Fabric Interface Adapter 100 Series"; + unsigned long size; + int ret; + + ret = read_hfi1_efi_var(dd, "description", &size, + (void **)&dd->boardname); + if (ret) { + dd_dev_info(dd, "Board description not found\n"); + /* use generic description */ + dd->boardname = kstrdup(generic, GFP_KERNEL); + if (!dd->boardname) + return -ENOMEM; + } + return 0; +} + +/* + * Check the interrupt registers to make sure that they are mapped correctly. + * It is intended to help user identify any mismapping by VMM when the driver + * is running in a VM. This function should only be called before interrupt + * is set up properly. + * + * Return 0 on success, -EINVAL on failure. + */ +static int check_int_registers(struct hfi1_devdata *dd) +{ + u64 reg; + u64 all_bits = ~(u64)0; + u64 mask; + + /* Clear CceIntMask[0] to avoid raising any interrupts */ + mask = read_csr(dd, CCE_INT_MASK); + write_csr(dd, CCE_INT_MASK, 0ull); + reg = read_csr(dd, CCE_INT_MASK); + if (reg) + goto err_exit; + + /* Clear all interrupt status bits */ + write_csr(dd, CCE_INT_CLEAR, all_bits); + reg = read_csr(dd, CCE_INT_STATUS); + if (reg) + goto err_exit; + + /* Set all interrupt status bits */ + write_csr(dd, CCE_INT_FORCE, all_bits); + reg = read_csr(dd, CCE_INT_STATUS); + if (reg != all_bits) + goto err_exit; + + /* Restore the interrupt mask */ + write_csr(dd, CCE_INT_CLEAR, all_bits); + write_csr(dd, CCE_INT_MASK, mask); + + return 0; +err_exit: + write_csr(dd, CCE_INT_MASK, mask); + dd_dev_err(dd, "Interrupt registers not properly mapped by VMM\n"); + return -EINVAL; +} + +/** + * Allocate and initialize the device structure for the hfi. + * @dev: the pci_dev for hfi1_ib device + * @ent: pci_device_id struct for this dev + * + * Also allocates, initializes, and returns the devdata struct for this + * device instance + * + * This is global, and is called directly at init to set up the + * chip-specific function pointers for later use. + */ +struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, + const struct pci_device_id *ent) +{ + struct hfi1_devdata *dd; + struct hfi1_pportdata *ppd; + u64 reg; + int i, ret; + static const char * const inames[] = { /* implementation names */ + "RTL silicon", + "RTL VCS simulation", + "RTL FPGA emulation", + "Functional simulator" + }; + struct pci_dev *parent = pdev->bus->self; + + dd = hfi1_alloc_devdata(pdev, NUM_IB_PORTS * + sizeof(struct hfi1_pportdata)); + if (IS_ERR(dd)) + goto bail; + ppd = dd->pport; + for (i = 0; i < dd->num_pports; i++, ppd++) { + int vl; + /* init common fields */ + hfi1_init_pportdata(pdev, ppd, dd, 0, 1); + /* DC supports 4 link widths */ + ppd->link_width_supported = + OPA_LINK_WIDTH_1X | OPA_LINK_WIDTH_2X | + OPA_LINK_WIDTH_3X | OPA_LINK_WIDTH_4X; + ppd->link_width_downgrade_supported = + ppd->link_width_supported; + /* start out enabling only 4X */ + ppd->link_width_enabled = OPA_LINK_WIDTH_4X; + ppd->link_width_downgrade_enabled = + ppd->link_width_downgrade_supported; + /* link width active is 0 when link is down */ + /* link width downgrade active is 0 when link is down */ + + if (num_vls < HFI1_MIN_VLS_SUPPORTED || + num_vls > HFI1_MAX_VLS_SUPPORTED) { + hfi1_early_err(&pdev->dev, + "Invalid num_vls %u, using %u VLs\n", + num_vls, HFI1_MAX_VLS_SUPPORTED); + num_vls = HFI1_MAX_VLS_SUPPORTED; + } + ppd->vls_supported = num_vls; + ppd->vls_operational = ppd->vls_supported; + ppd->actual_vls_operational = ppd->vls_supported; + /* Set the default MTU. */ + for (vl = 0; vl < num_vls; vl++) + dd->vld[vl].mtu = hfi1_max_mtu; + dd->vld[15].mtu = MAX_MAD_PACKET; + /* + * Set the initial values to reasonable default, will be set + * for real when link is up. + */ + ppd->lstate = IB_PORT_DOWN; + ppd->overrun_threshold = 0x4; + ppd->phy_error_threshold = 0xf; + ppd->port_crc_mode_enabled = link_crc_mask; + /* initialize supported LTP CRC mode */ + ppd->port_ltp_crc_mode = cap_to_port_ltp(link_crc_mask) << 8; + /* initialize enabled LTP CRC mode */ + ppd->port_ltp_crc_mode |= cap_to_port_ltp(link_crc_mask) << 4; + /* start in offline */ + ppd->host_link_state = HLS_DN_OFFLINE; + init_vl_arb_caches(ppd); + ppd->last_pstate = 0xff; /* invalid value */ + } + + dd->link_default = HLS_DN_POLL; + + /* + * Do remaining PCIe setup and save PCIe values in dd. + * Any error printing is already done by the init code. + * On return, we have the chip mapped. + */ + ret = hfi1_pcie_ddinit(dd, pdev, ent); + if (ret < 0) + goto bail_free; + + /* verify that reads actually work, save revision for reset check */ + dd->revision = read_csr(dd, CCE_REVISION); + if (dd->revision == ~(u64)0) { + dd_dev_err(dd, "cannot read chip CSRs\n"); + ret = -EINVAL; + goto bail_cleanup; + } + dd->majrev = (dd->revision >> CCE_REVISION_CHIP_REV_MAJOR_SHIFT) + & CCE_REVISION_CHIP_REV_MAJOR_MASK; + dd->minrev = (dd->revision >> CCE_REVISION_CHIP_REV_MINOR_SHIFT) + & CCE_REVISION_CHIP_REV_MINOR_MASK; + + /* + * Check interrupt registers mapping if the driver has no access to + * the upstream component. In this case, it is likely that the driver + * is running in a VM. + */ + if (!parent) { + ret = check_int_registers(dd); + if (ret) + goto bail_cleanup; + } + + /* + * obtain the hardware ID - NOT related to unit, which is a + * software enumeration + */ + reg = read_csr(dd, CCE_REVISION2); + dd->hfi1_id = (reg >> CCE_REVISION2_HFI_ID_SHIFT) + & CCE_REVISION2_HFI_ID_MASK; + /* the variable size will remove unwanted bits */ + dd->icode = reg >> CCE_REVISION2_IMPL_CODE_SHIFT; + dd->irev = reg >> CCE_REVISION2_IMPL_REVISION_SHIFT; + dd_dev_info(dd, "Implementation: %s, revision 0x%x\n", + dd->icode < ARRAY_SIZE(inames) ? + inames[dd->icode] : "unknown", (int)dd->irev); + + /* speeds the hardware can support */ + dd->pport->link_speed_supported = OPA_LINK_SPEED_25G; + /* speeds allowed to run at */ + dd->pport->link_speed_enabled = dd->pport->link_speed_supported; + /* give a reasonable active value, will be set on link up */ + dd->pport->link_speed_active = OPA_LINK_SPEED_25G; + + dd->chip_rcv_contexts = read_csr(dd, RCV_CONTEXTS); + dd->chip_send_contexts = read_csr(dd, SEND_CONTEXTS); + dd->chip_sdma_engines = read_csr(dd, SEND_DMA_ENGINES); + dd->chip_pio_mem_size = read_csr(dd, SEND_PIO_MEM_SIZE); + dd->chip_sdma_mem_size = read_csr(dd, SEND_DMA_MEM_SIZE); + /* fix up link widths for emulation _p */ + ppd = dd->pport; + if (dd->icode == ICODE_FPGA_EMULATION && is_emulator_p(dd)) { + ppd->link_width_supported = + ppd->link_width_enabled = + ppd->link_width_downgrade_supported = + ppd->link_width_downgrade_enabled = + OPA_LINK_WIDTH_1X; + } + /* insure num_vls isn't larger than number of sdma engines */ + if (HFI1_CAP_IS_KSET(SDMA) && num_vls > dd->chip_sdma_engines) { + dd_dev_err(dd, "num_vls %u too large, using %u VLs\n", + num_vls, dd->chip_sdma_engines); + num_vls = dd->chip_sdma_engines; + ppd->vls_supported = dd->chip_sdma_engines; + ppd->vls_operational = ppd->vls_supported; + } + + /* + * Convert the ns parameter to the 64 * cclocks used in the CSR. + * Limit the max if larger than the field holds. If timeout is + * non-zero, then the calculated field will be at least 1. + * + * Must be after icode is set up - the cclock rate depends + * on knowing the hardware being used. + */ + dd->rcv_intr_timeout_csr = ns_to_cclock(dd, rcv_intr_timeout) / 64; + if (dd->rcv_intr_timeout_csr > + RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_MASK) + dd->rcv_intr_timeout_csr = + RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_MASK; + else if (dd->rcv_intr_timeout_csr == 0 && rcv_intr_timeout) + dd->rcv_intr_timeout_csr = 1; + + /* needs to be done before we look for the peer device */ + read_guid(dd); + + /* set up shared ASIC data with peer device */ + ret = init_asic_data(dd); + if (ret) + goto bail_cleanup; + + /* obtain chip sizes, reset chip CSRs */ + init_chip(dd); + + /* read in the PCIe link speed information */ + ret = pcie_speeds(dd); + if (ret) + goto bail_cleanup; + + /* Needs to be called before hfi1_firmware_init */ + get_platform_config(dd); + + /* read in firmware */ + ret = hfi1_firmware_init(dd); + if (ret) + goto bail_cleanup; + + /* + * In general, the PCIe Gen3 transition must occur after the + * chip has been idled (so it won't initiate any PCIe transactions + * e.g. an interrupt) and before the driver changes any registers + * (the transition will reset the registers). + * + * In particular, place this call after: + * - init_chip() - the chip will not initiate any PCIe transactions + * - pcie_speeds() - reads the current link speed + * - hfi1_firmware_init() - the needed firmware is ready to be + * downloaded + */ + ret = do_pcie_gen3_transition(dd); + if (ret) + goto bail_cleanup; + + /* start setting dd values and adjusting CSRs */ + init_early_variables(dd); + + parse_platform_config(dd); + + ret = obtain_boardname(dd); + if (ret) + goto bail_cleanup; + + snprintf(dd->boardversion, BOARD_VERS_MAX, + "ChipABI %u.%u, ChipRev %u.%u, SW Compat %llu\n", + HFI1_CHIP_VERS_MAJ, HFI1_CHIP_VERS_MIN, + (u32)dd->majrev, + (u32)dd->minrev, + (dd->revision >> CCE_REVISION_SW_SHIFT) + & CCE_REVISION_SW_MASK); + + /* + * The real cpu mask is part of the affinity struct but has to be + * initialized earlier than the rest of the affinity struct because it + * is needed to calculate the number of user contexts in + * set_up_context_variables(). However, hfi1_dev_affinity_init(), + * which initializes the rest of the affinity struct members, + * depends on set_up_context_variables() for the number of kernel + * contexts, so it cannot be called before set_up_context_variables(). + */ + ret = init_real_cpu_mask(dd); + if (ret) + goto bail_cleanup; + + ret = set_up_context_variables(dd); + if (ret) + goto bail_cleanup; + + /* set initial RXE CSRs */ + init_rxe(dd); + /* set initial TXE CSRs */ + init_txe(dd); + /* set initial non-RXE, non-TXE CSRs */ + init_other(dd); + /* set up KDETH QP prefix in both RX and TX CSRs */ + init_kdeth_qp(dd); + + hfi1_dev_affinity_init(dd); + + /* send contexts must be set up before receive contexts */ + ret = init_send_contexts(dd); + if (ret) + goto bail_cleanup; + + ret = hfi1_create_ctxts(dd); + if (ret) + goto bail_cleanup; + + dd->rcvhdrsize = DEFAULT_RCVHDRSIZE; + /* + * rcd[0] is guaranteed to be valid by this point. Also, all + * context are using the same value, as per the module parameter. + */ + dd->rhf_offset = dd->rcd[0]->rcvhdrqentsize - sizeof(u64) / sizeof(u32); + + ret = init_pervl_scs(dd); + if (ret) + goto bail_cleanup; + + /* sdma init */ + for (i = 0; i < dd->num_pports; ++i) { + ret = sdma_init(dd, i); + if (ret) + goto bail_cleanup; + } + + /* use contexts created by hfi1_create_ctxts */ + ret = set_up_interrupts(dd); + if (ret) + goto bail_cleanup; + + /* set up LCB access - must be after set_up_interrupts() */ + init_lcb_access(dd); + + snprintf(dd->serial, SERIAL_MAX, "0x%08llx\n", + dd->base_guid & 0xFFFFFF); + + dd->oui1 = dd->base_guid >> 56 & 0xFF; + dd->oui2 = dd->base_guid >> 48 & 0xFF; + dd->oui3 = dd->base_guid >> 40 & 0xFF; + + ret = load_firmware(dd); /* asymmetric with dispose_firmware() */ + if (ret) + goto bail_clear_intr; + check_fabric_firmware_versions(dd); + + thermal_init(dd); + + ret = init_cntrs(dd); + if (ret) + goto bail_clear_intr; + + ret = init_rcverr(dd); + if (ret) + goto bail_free_cntrs; + + ret = eprom_init(dd); + if (ret) + goto bail_free_rcverr; + + goto bail; + +bail_free_rcverr: + free_rcverr(dd); +bail_free_cntrs: + free_cntrs(dd); +bail_clear_intr: + clean_up_interrupts(dd); +bail_cleanup: + hfi1_pcie_ddcleanup(dd); +bail_free: + hfi1_free_devdata(dd); + dd = ERR_PTR(ret); +bail: + return dd; +} + +static u16 delay_cycles(struct hfi1_pportdata *ppd, u32 desired_egress_rate, + u32 dw_len) +{ + u32 delta_cycles; + u32 current_egress_rate = ppd->current_egress_rate; + /* rates here are in units of 10^6 bits/sec */ + + if (desired_egress_rate == -1) + return 0; /* shouldn't happen */ + + if (desired_egress_rate >= current_egress_rate) + return 0; /* we can't help go faster, only slower */ + + delta_cycles = egress_cycles(dw_len * 4, desired_egress_rate) - + egress_cycles(dw_len * 4, current_egress_rate); + + return (u16)delta_cycles; +} + +/** + * create_pbc - build a pbc for transmission + * @flags: special case flags or-ed in built pbc + * @srate: static rate + * @vl: vl + * @dwlen: dword length (header words + data words + pbc words) + * + * Create a PBC with the given flags, rate, VL, and length. + * + * NOTE: The PBC created will not insert any HCRC - all callers but one are + * for verbs, which does not use this PSM feature. The lone other caller + * is for the diagnostic interface which calls this if the user does not + * supply their own PBC. + */ +u64 create_pbc(struct hfi1_pportdata *ppd, u64 flags, int srate_mbs, u32 vl, + u32 dw_len) +{ + u64 pbc, delay = 0; + + if (unlikely(srate_mbs)) + delay = delay_cycles(ppd, srate_mbs, dw_len); + + pbc = flags + | (delay << PBC_STATIC_RATE_CONTROL_COUNT_SHIFT) + | ((u64)PBC_IHCRC_NONE << PBC_INSERT_HCRC_SHIFT) + | (vl & PBC_VL_MASK) << PBC_VL_SHIFT + | (dw_len & PBC_LENGTH_DWS_MASK) + << PBC_LENGTH_DWS_SHIFT; + + return pbc; +} + +#define SBUS_THERMAL 0x4f +#define SBUS_THERM_MONITOR_MODE 0x1 + +#define THERM_FAILURE(dev, ret, reason) \ + dd_dev_err((dd), \ + "Thermal sensor initialization failed: %s (%d)\n", \ + (reason), (ret)) + +/* + * Initialize the thermal sensor. + * + * After initialization, enable polling of thermal sensor through + * SBus interface. In order for this to work, the SBus Master + * firmware has to be loaded due to the fact that the HW polling + * logic uses SBus interrupts, which are not supported with + * default firmware. Otherwise, no data will be returned through + * the ASIC_STS_THERM CSR. + */ +static int thermal_init(struct hfi1_devdata *dd) +{ + int ret = 0; + + if (dd->icode != ICODE_RTL_SILICON || + check_chip_resource(dd, CR_THERM_INIT, NULL)) + return ret; + + ret = acquire_chip_resource(dd, CR_SBUS, SBUS_TIMEOUT); + if (ret) { + THERM_FAILURE(dd, ret, "Acquire SBus"); + return ret; + } + + dd_dev_info(dd, "Initializing thermal sensor\n"); + /* Disable polling of thermal readings */ + write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x0); + msleep(100); + /* Thermal Sensor Initialization */ + /* Step 1: Reset the Thermal SBus Receiver */ + ret = sbus_request_slow(dd, SBUS_THERMAL, 0x0, + RESET_SBUS_RECEIVER, 0); + if (ret) { + THERM_FAILURE(dd, ret, "Bus Reset"); + goto done; + } + /* Step 2: Set Reset bit in Thermal block */ + ret = sbus_request_slow(dd, SBUS_THERMAL, 0x0, + WRITE_SBUS_RECEIVER, 0x1); + if (ret) { + THERM_FAILURE(dd, ret, "Therm Block Reset"); + goto done; + } + /* Step 3: Write clock divider value (100MHz -> 2MHz) */ + ret = sbus_request_slow(dd, SBUS_THERMAL, 0x1, + WRITE_SBUS_RECEIVER, 0x32); + if (ret) { + THERM_FAILURE(dd, ret, "Write Clock Div"); + goto done; + } + /* Step 4: Select temperature mode */ + ret = sbus_request_slow(dd, SBUS_THERMAL, 0x3, + WRITE_SBUS_RECEIVER, + SBUS_THERM_MONITOR_MODE); + if (ret) { + THERM_FAILURE(dd, ret, "Write Mode Sel"); + goto done; + } + /* Step 5: De-assert block reset and start conversion */ + ret = sbus_request_slow(dd, SBUS_THERMAL, 0x0, + WRITE_SBUS_RECEIVER, 0x2); + if (ret) { + THERM_FAILURE(dd, ret, "Write Reset Deassert"); + goto done; + } + /* Step 5.1: Wait for first conversion (21.5ms per spec) */ + msleep(22); + + /* Enable polling of thermal readings */ + write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x1); + + /* Set initialized flag */ + ret = acquire_chip_resource(dd, CR_THERM_INIT, 0); + if (ret) + THERM_FAILURE(dd, ret, "Unable to set thermal init flag"); + +done: + release_chip_resource(dd, CR_SBUS); + return ret; +} + +static void handle_temp_err(struct hfi1_devdata *dd) +{ + struct hfi1_pportdata *ppd = &dd->pport[0]; + /* + * Thermal Critical Interrupt + * Put the device into forced freeze mode, take link down to + * offline, and put DC into reset. + */ + dd_dev_emerg(dd, + "Critical temperature reached! Forcing device into freeze mode!\n"); + dd->flags |= HFI1_FORCED_FREEZE; + start_freeze_handling(ppd, FREEZE_SELF | FREEZE_ABORT); + /* + * Shut DC down as much and as quickly as possible. + * + * Step 1: Take the link down to OFFLINE. This will cause the + * 8051 to put the Serdes in reset. However, we don't want to + * go through the entire link state machine since we want to + * shutdown ASAP. Furthermore, this is not a graceful shutdown + * but rather an attempt to save the chip. + * Code below is almost the same as quiet_serdes() but avoids + * all the extra work and the sleeps. + */ + ppd->driver_link_ready = 0; + ppd->link_enabled = 0; + set_physical_link_state(dd, (OPA_LINKDOWN_REASON_SMA_DISABLED << 8) | + PLS_OFFLINE); + /* + * Step 2: Shutdown LCB and 8051 + * After shutdown, do not restore DC_CFG_RESET value. + */ + dc_shutdown(dd); +} diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h new file mode 100644 index 000000000..66a327978 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/chip.h @@ -0,0 +1,1374 @@ +#ifndef _CHIP_H +#define _CHIP_H +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * This file contains all of the defines that is specific to the HFI chip + */ + +/* sizes */ +#define CCE_NUM_MSIX_VECTORS 256 +#define CCE_NUM_INT_CSRS 12 +#define CCE_NUM_INT_MAP_CSRS 96 +#define NUM_INTERRUPT_SOURCES 768 +#define RXE_NUM_CONTEXTS 160 +#define RXE_PER_CONTEXT_SIZE 0x1000 /* 4k */ +#define RXE_NUM_TID_FLOWS 32 +#define RXE_NUM_DATA_VL 8 +#define TXE_NUM_CONTEXTS 160 +#define TXE_NUM_SDMA_ENGINES 16 +#define NUM_CONTEXTS_PER_SET 8 +#define VL_ARB_HIGH_PRIO_TABLE_SIZE 16 +#define VL_ARB_LOW_PRIO_TABLE_SIZE 16 +#define VL_ARB_TABLE_SIZE 16 +#define TXE_NUM_32_BIT_COUNTER 7 +#define TXE_NUM_64_BIT_COUNTER 30 +#define TXE_NUM_DATA_VL 8 +#define TXE_PIO_SIZE (32 * 0x100000) /* 32 MB */ +#define PIO_BLOCK_SIZE 64 /* bytes */ +#define SDMA_BLOCK_SIZE 64 /* bytes */ +#define RCV_BUF_BLOCK_SIZE 64 /* bytes */ +#define PIO_CMASK 0x7ff /* counter mask for free and fill counters */ +#define MAX_EAGER_ENTRIES 2048 /* max receive eager entries */ +#define MAX_TID_PAIR_ENTRIES 1024 /* max receive expected pairs */ +/* + * Virtual? Allocation Unit, defined as AU = 8*2^vAU, 64 bytes, AU is fixed + * at 64 bytes for all generation one devices + */ +#define CM_VAU 3 +/* HFI link credit count, AKA receive buffer depth (RBUF_DEPTH) */ +#define CM_GLOBAL_CREDITS 0x940 +/* Number of PKey entries in the HW */ +#define MAX_PKEY_VALUES 16 + +#include "chip_registers.h" + +#define RXE_PER_CONTEXT_USER (RXE + RXE_PER_CONTEXT_OFFSET) +#define TXE_PIO_SEND (TXE + TXE_PIO_SEND_OFFSET) + +/* PBC flags */ +#define PBC_INTR BIT_ULL(31) +#define PBC_DC_INFO_SHIFT (30) +#define PBC_DC_INFO BIT_ULL(PBC_DC_INFO_SHIFT) +#define PBC_TEST_EBP BIT_ULL(29) +#define PBC_PACKET_BYPASS BIT_ULL(28) +#define PBC_CREDIT_RETURN BIT_ULL(25) +#define PBC_INSERT_BYPASS_ICRC BIT_ULL(24) +#define PBC_TEST_BAD_ICRC BIT_ULL(23) +#define PBC_FECN BIT_ULL(22) + +/* PbcInsertHcrc field settings */ +#define PBC_IHCRC_LKDETH 0x0 /* insert @ local KDETH offset */ +#define PBC_IHCRC_GKDETH 0x1 /* insert @ global KDETH offset */ +#define PBC_IHCRC_NONE 0x2 /* no HCRC inserted */ + +/* PBC fields */ +#define PBC_STATIC_RATE_CONTROL_COUNT_SHIFT 32 +#define PBC_STATIC_RATE_CONTROL_COUNT_MASK 0xffffull +#define PBC_STATIC_RATE_CONTROL_COUNT_SMASK \ + (PBC_STATIC_RATE_CONTROL_COUNT_MASK << \ + PBC_STATIC_RATE_CONTROL_COUNT_SHIFT) + +#define PBC_INSERT_HCRC_SHIFT 26 +#define PBC_INSERT_HCRC_MASK 0x3ull +#define PBC_INSERT_HCRC_SMASK \ + (PBC_INSERT_HCRC_MASK << PBC_INSERT_HCRC_SHIFT) + +#define PBC_VL_SHIFT 12 +#define PBC_VL_MASK 0xfull +#define PBC_VL_SMASK (PBC_VL_MASK << PBC_VL_SHIFT) + +#define PBC_LENGTH_DWS_SHIFT 0 +#define PBC_LENGTH_DWS_MASK 0xfffull +#define PBC_LENGTH_DWS_SMASK \ + (PBC_LENGTH_DWS_MASK << PBC_LENGTH_DWS_SHIFT) + +/* Credit Return Fields */ +#define CR_COUNTER_SHIFT 0 +#define CR_COUNTER_MASK 0x7ffull +#define CR_COUNTER_SMASK (CR_COUNTER_MASK << CR_COUNTER_SHIFT) + +#define CR_STATUS_SHIFT 11 +#define CR_STATUS_MASK 0x1ull +#define CR_STATUS_SMASK (CR_STATUS_MASK << CR_STATUS_SHIFT) + +#define CR_CREDIT_RETURN_DUE_TO_PBC_SHIFT 12 +#define CR_CREDIT_RETURN_DUE_TO_PBC_MASK 0x1ull +#define CR_CREDIT_RETURN_DUE_TO_PBC_SMASK \ + (CR_CREDIT_RETURN_DUE_TO_PBC_MASK << \ + CR_CREDIT_RETURN_DUE_TO_PBC_SHIFT) + +#define CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SHIFT 13 +#define CR_CREDIT_RETURN_DUE_TO_THRESHOLD_MASK 0x1ull +#define CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SMASK \ + (CR_CREDIT_RETURN_DUE_TO_THRESHOLD_MASK << \ + CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SHIFT) + +#define CR_CREDIT_RETURN_DUE_TO_ERR_SHIFT 14 +#define CR_CREDIT_RETURN_DUE_TO_ERR_MASK 0x1ull +#define CR_CREDIT_RETURN_DUE_TO_ERR_SMASK \ + (CR_CREDIT_RETURN_DUE_TO_ERR_MASK << \ + CR_CREDIT_RETURN_DUE_TO_ERR_SHIFT) + +#define CR_CREDIT_RETURN_DUE_TO_FORCE_SHIFT 15 +#define CR_CREDIT_RETURN_DUE_TO_FORCE_MASK 0x1ull +#define CR_CREDIT_RETURN_DUE_TO_FORCE_SMASK \ + (CR_CREDIT_RETURN_DUE_TO_FORCE_MASK << \ + CR_CREDIT_RETURN_DUE_TO_FORCE_SHIFT) + +/* interrupt source numbers */ +#define IS_GENERAL_ERR_START 0 +#define IS_SDMAENG_ERR_START 16 +#define IS_SENDCTXT_ERR_START 32 +#define IS_SDMA_START 192 /* includes SDmaProgress,SDmaIdle */ +#define IS_VARIOUS_START 240 +#define IS_DC_START 248 +#define IS_RCVAVAIL_START 256 +#define IS_RCVURGENT_START 416 +#define IS_SENDCREDIT_START 576 +#define IS_RESERVED_START 736 +#define IS_MAX_SOURCES 768 + +/* derived interrupt source values */ +#define IS_GENERAL_ERR_END IS_SDMAENG_ERR_START +#define IS_SDMAENG_ERR_END IS_SENDCTXT_ERR_START +#define IS_SENDCTXT_ERR_END IS_SDMA_START +#define IS_SDMA_END IS_VARIOUS_START +#define IS_VARIOUS_END IS_DC_START +#define IS_DC_END IS_RCVAVAIL_START +#define IS_RCVAVAIL_END IS_RCVURGENT_START +#define IS_RCVURGENT_END IS_SENDCREDIT_START +#define IS_SENDCREDIT_END IS_RESERVED_START +#define IS_RESERVED_END IS_MAX_SOURCES + +/* absolute interrupt numbers for QSFP1Int and QSFP2Int */ +#define QSFP1_INT 242 +#define QSFP2_INT 243 + +/* DCC_CFG_PORT_CONFIG logical link states */ +#define LSTATE_DOWN 0x1 +#define LSTATE_INIT 0x2 +#define LSTATE_ARMED 0x3 +#define LSTATE_ACTIVE 0x4 + +/* DC8051_STS_CUR_STATE port values (physical link states) */ +#define PLS_DISABLED 0x30 +#define PLS_OFFLINE 0x90 +#define PLS_OFFLINE_QUIET 0x90 +#define PLS_OFFLINE_PLANNED_DOWN_INFORM 0x91 +#define PLS_OFFLINE_READY_TO_QUIET_LT 0x92 +#define PLS_OFFLINE_REPORT_FAILURE 0x93 +#define PLS_OFFLINE_READY_TO_QUIET_BCC 0x94 +#define PLS_POLLING 0x20 +#define PLS_POLLING_QUIET 0x20 +#define PLS_POLLING_ACTIVE 0x21 +#define PLS_CONFIGPHY 0x40 +#define PLS_CONFIGPHY_DEBOUCE 0x40 +#define PLS_CONFIGPHY_ESTCOMM 0x41 +#define PLS_CONFIGPHY_ESTCOMM_TXRX_HUNT 0x42 +#define PLS_CONFIGPHY_ESTCOMM_LOCAL_COMPLETE 0x43 +#define PLS_CONFIGPHY_OPTEQ 0x44 +#define PLS_CONFIGPHY_OPTEQ_OPTIMIZING 0x44 +#define PLS_CONFIGPHY_OPTEQ_LOCAL_COMPLETE 0x45 +#define PLS_CONFIGPHY_VERIFYCAP 0x46 +#define PLS_CONFIGPHY_VERIFYCAP_EXCHANGE 0x46 +#define PLS_CONFIGPHY_VERIFYCAP_LOCAL_COMPLETE 0x47 +#define PLS_CONFIGLT 0x48 +#define PLS_CONFIGLT_CONFIGURE 0x48 +#define PLS_CONFIGLT_LINK_TRANSFER_ACTIVE 0x49 +#define PLS_LINKUP 0x50 +#define PLS_PHYTEST 0xB0 +#define PLS_INTERNAL_SERDES_LOOPBACK 0xe1 +#define PLS_QUICK_LINKUP 0xe2 + +/* DC_DC8051_CFG_HOST_CMD_0.REQ_TYPE - 8051 host commands */ +#define HCMD_LOAD_CONFIG_DATA 0x01 +#define HCMD_READ_CONFIG_DATA 0x02 +#define HCMD_CHANGE_PHY_STATE 0x03 +#define HCMD_SEND_LCB_IDLE_MSG 0x04 +#define HCMD_MISC 0x05 +#define HCMD_READ_LCB_IDLE_MSG 0x06 +#define HCMD_READ_LCB_CSR 0x07 +#define HCMD_WRITE_LCB_CSR 0x08 +#define HCMD_INTERFACE_TEST 0xff + +/* DC_DC8051_CFG_HOST_CMD_1.RETURN_CODE - 8051 host command return */ +#define HCMD_SUCCESS 2 + +/* DC_DC8051_DBG_ERR_INFO_SET_BY_8051.ERROR - error flags */ +#define SPICO_ROM_FAILED BIT(0) +#define UNKNOWN_FRAME BIT(1) +#define TARGET_BER_NOT_MET BIT(2) +#define FAILED_SERDES_INTERNAL_LOOPBACK BIT(3) +#define FAILED_SERDES_INIT BIT(4) +#define FAILED_LNI_POLLING BIT(5) +#define FAILED_LNI_DEBOUNCE BIT(6) +#define FAILED_LNI_ESTBCOMM BIT(7) +#define FAILED_LNI_OPTEQ BIT(8) +#define FAILED_LNI_VERIFY_CAP1 BIT(9) +#define FAILED_LNI_VERIFY_CAP2 BIT(10) +#define FAILED_LNI_CONFIGLT BIT(11) +#define HOST_HANDSHAKE_TIMEOUT BIT(12) + +#define FAILED_LNI (FAILED_LNI_POLLING | FAILED_LNI_DEBOUNCE \ + | FAILED_LNI_ESTBCOMM | FAILED_LNI_OPTEQ \ + | FAILED_LNI_VERIFY_CAP1 \ + | FAILED_LNI_VERIFY_CAP2 \ + | FAILED_LNI_CONFIGLT | HOST_HANDSHAKE_TIMEOUT) + +/* DC_DC8051_DBG_ERR_INFO_SET_BY_8051.HOST_MSG - host message flags */ +#define HOST_REQ_DONE BIT(0) +#define BC_PWR_MGM_MSG BIT(1) +#define BC_SMA_MSG BIT(2) +#define BC_BCC_UNKNOWN_MSG BIT(3) +#define BC_IDLE_UNKNOWN_MSG BIT(4) +#define EXT_DEVICE_CFG_REQ BIT(5) +#define VERIFY_CAP_FRAME BIT(6) +#define LINKUP_ACHIEVED BIT(7) +#define LINK_GOING_DOWN BIT(8) +#define LINK_WIDTH_DOWNGRADED BIT(9) + +/* DC_DC8051_CFG_EXT_DEV_1.REQ_TYPE - 8051 host requests */ +#define HREQ_LOAD_CONFIG 0x01 +#define HREQ_SAVE_CONFIG 0x02 +#define HREQ_READ_CONFIG 0x03 +#define HREQ_SET_TX_EQ_ABS 0x04 +#define HREQ_SET_TX_EQ_REL 0x05 +#define HREQ_ENABLE 0x06 +#define HREQ_CONFIG_DONE 0xfe +#define HREQ_INTERFACE_TEST 0xff + +/* DC_DC8051_CFG_EXT_DEV_0.RETURN_CODE - 8051 host request return codes */ +#define HREQ_INVALID 0x01 +#define HREQ_SUCCESS 0x02 +#define HREQ_NOT_SUPPORTED 0x03 +#define HREQ_FEATURE_NOT_SUPPORTED 0x04 /* request specific feature */ +#define HREQ_REQUEST_REJECTED 0xfe +#define HREQ_EXECUTION_ONGOING 0xff + +/* MISC host command functions */ +#define HCMD_MISC_REQUEST_LCB_ACCESS 0x1 +#define HCMD_MISC_GRANT_LCB_ACCESS 0x2 + +/* idle flit message types */ +#define IDLE_PHYSICAL_LINK_MGMT 0x1 +#define IDLE_CRU 0x2 +#define IDLE_SMA 0x3 +#define IDLE_POWER_MGMT 0x4 + +/* idle flit message send fields (both send and read) */ +#define IDLE_PAYLOAD_MASK 0xffffffffffull /* 40 bits */ +#define IDLE_PAYLOAD_SHIFT 8 +#define IDLE_MSG_TYPE_MASK 0xf +#define IDLE_MSG_TYPE_SHIFT 0 + +/* idle flit message read fields */ +#define READ_IDLE_MSG_TYPE_MASK 0xf +#define READ_IDLE_MSG_TYPE_SHIFT 0 + +/* SMA idle flit payload commands */ +#define SMA_IDLE_ARM 1 +#define SMA_IDLE_ACTIVE 2 + +/* DC_DC8051_CFG_MODE.GENERAL bits */ +#define DISABLE_SELF_GUID_CHECK 0x2 + +/* + * Eager buffer minimum and maximum sizes supported by the hardware. + * All power-of-two sizes in between are supported as well. + * MAX_EAGER_BUFFER_TOTAL is the maximum size of memory + * allocatable for Eager buffer to a single context. All others + * are limits for the RcvArray entries. + */ +#define MIN_EAGER_BUFFER (4 * 1024) +#define MAX_EAGER_BUFFER (256 * 1024) +#define MAX_EAGER_BUFFER_TOTAL (64 * (1 << 20)) /* max per ctxt 64MB */ +#define MAX_EXPECTED_BUFFER (2048 * 1024) + +/* + * Receive expected base and count and eager base and count increment - + * the CSR fields hold multiples of this value. + */ +#define RCV_SHIFT 3 +#define RCV_INCREMENT BIT(RCV_SHIFT) + +/* + * Receive header queue entry increment - the CSR holds multiples of + * this value. + */ +#define HDRQ_SIZE_SHIFT 5 +#define HDRQ_INCREMENT BIT(HDRQ_SIZE_SHIFT) + +/* + * Freeze handling flags + */ +#define FREEZE_ABORT 0x01 /* do not do recovery */ +#define FREEZE_SELF 0x02 /* initiate the freeze */ +#define FREEZE_LINK_DOWN 0x04 /* link is down */ + +/* + * Chip implementation codes. + */ +#define ICODE_RTL_SILICON 0x00 +#define ICODE_RTL_VCS_SIMULATION 0x01 +#define ICODE_FPGA_EMULATION 0x02 +#define ICODE_FUNCTIONAL_SIMULATOR 0x03 + +/* + * 8051 data memory size. + */ +#define DC8051_DATA_MEM_SIZE 0x1000 + +/* + * 8051 firmware registers + */ +#define NUM_GENERAL_FIELDS 0x17 +#define NUM_LANE_FIELDS 0x8 + +/* 8051 general register Field IDs */ +#define LINK_OPTIMIZATION_SETTINGS 0x00 +#define LINK_TUNING_PARAMETERS 0x02 +#define DC_HOST_COMM_SETTINGS 0x03 +#define TX_SETTINGS 0x06 +#define VERIFY_CAP_LOCAL_PHY 0x07 +#define VERIFY_CAP_LOCAL_FABRIC 0x08 +#define VERIFY_CAP_LOCAL_LINK_WIDTH 0x09 +#define LOCAL_DEVICE_ID 0x0a +#define LOCAL_LNI_INFO 0x0c +#define REMOTE_LNI_INFO 0x0d +#define MISC_STATUS 0x0e +#define VERIFY_CAP_REMOTE_PHY 0x0f +#define VERIFY_CAP_REMOTE_FABRIC 0x10 +#define VERIFY_CAP_REMOTE_LINK_WIDTH 0x11 +#define LAST_LOCAL_STATE_COMPLETE 0x12 +#define LAST_REMOTE_STATE_COMPLETE 0x13 +#define LINK_QUALITY_INFO 0x14 +#define REMOTE_DEVICE_ID 0x15 +#define LINK_DOWN_REASON 0x16 + +/* 8051 lane specific register field IDs */ +#define TX_EQ_SETTINGS 0x00 +#define CHANNEL_LOSS_SETTINGS 0x05 + +/* Lane ID for general configuration registers */ +#define GENERAL_CONFIG 4 + +/* LINK_TUNING_PARAMETERS fields */ +#define TUNING_METHOD_SHIFT 24 + +/* LINK_OPTIMIZATION_SETTINGS fields */ +#define ENABLE_EXT_DEV_CONFIG_SHIFT 24 + +/* LOAD_DATA 8051 command shifts and fields */ +#define LOAD_DATA_FIELD_ID_SHIFT 40 +#define LOAD_DATA_FIELD_ID_MASK 0xfull +#define LOAD_DATA_LANE_ID_SHIFT 32 +#define LOAD_DATA_LANE_ID_MASK 0xfull +#define LOAD_DATA_DATA_SHIFT 0x0 +#define LOAD_DATA_DATA_MASK 0xffffffffull + +/* READ_DATA 8051 command shifts and fields */ +#define READ_DATA_FIELD_ID_SHIFT 40 +#define READ_DATA_FIELD_ID_MASK 0xffull +#define READ_DATA_LANE_ID_SHIFT 32 +#define READ_DATA_LANE_ID_MASK 0xffull +#define READ_DATA_DATA_SHIFT 0x0 +#define READ_DATA_DATA_MASK 0xffffffffull + +/* TX settings fields */ +#define ENABLE_LANE_TX_SHIFT 0 +#define ENABLE_LANE_TX_MASK 0xff +#define TX_POLARITY_INVERSION_SHIFT 8 +#define TX_POLARITY_INVERSION_MASK 0xff +#define RX_POLARITY_INVERSION_SHIFT 16 +#define RX_POLARITY_INVERSION_MASK 0xff +#define MAX_RATE_SHIFT 24 +#define MAX_RATE_MASK 0xff + +/* verify capability PHY fields */ +#define CONTINIOUS_REMOTE_UPDATE_SUPPORT_SHIFT 0x4 +#define CONTINIOUS_REMOTE_UPDATE_SUPPORT_MASK 0x1 +#define POWER_MANAGEMENT_SHIFT 0x0 +#define POWER_MANAGEMENT_MASK 0xf + +/* 8051 lane register Field IDs */ +#define SPICO_FW_VERSION 0x7 /* SPICO firmware version */ + +/* SPICO firmware version fields */ +#define SPICO_ROM_VERSION_SHIFT 0 +#define SPICO_ROM_VERSION_MASK 0xffff +#define SPICO_ROM_PROD_ID_SHIFT 16 +#define SPICO_ROM_PROD_ID_MASK 0xffff + +/* verify capability fabric fields */ +#define VAU_SHIFT 0 +#define VAU_MASK 0x0007 +#define Z_SHIFT 3 +#define Z_MASK 0x0001 +#define VCU_SHIFT 4 +#define VCU_MASK 0x0007 +#define VL15BUF_SHIFT 8 +#define VL15BUF_MASK 0x0fff +#define CRC_SIZES_SHIFT 20 +#define CRC_SIZES_MASK 0x7 + +/* verify capability local link width fields */ +#define LINK_WIDTH_SHIFT 0 /* also for remote link width */ +#define LINK_WIDTH_MASK 0xffff /* also for remote link width */ +#define LOCAL_FLAG_BITS_SHIFT 16 +#define LOCAL_FLAG_BITS_MASK 0xff +#define MISC_CONFIG_BITS_SHIFT 24 +#define MISC_CONFIG_BITS_MASK 0xff + +/* verify capability remote link width fields */ +#define REMOTE_TX_RATE_SHIFT 16 +#define REMOTE_TX_RATE_MASK 0xff + +/* LOCAL_DEVICE_ID fields */ +#define LOCAL_DEVICE_REV_SHIFT 0 +#define LOCAL_DEVICE_REV_MASK 0xff +#define LOCAL_DEVICE_ID_SHIFT 8 +#define LOCAL_DEVICE_ID_MASK 0xffff + +/* REMOTE_DEVICE_ID fields */ +#define REMOTE_DEVICE_REV_SHIFT 0 +#define REMOTE_DEVICE_REV_MASK 0xff +#define REMOTE_DEVICE_ID_SHIFT 8 +#define REMOTE_DEVICE_ID_MASK 0xffff + +/* local LNI link width fields */ +#define ENABLE_LANE_RX_SHIFT 16 +#define ENABLE_LANE_RX_MASK 0xff + +/* mask, shift for reading 'mgmt_enabled' value from REMOTE_LNI_INFO field */ +#define MGMT_ALLOWED_SHIFT 23 +#define MGMT_ALLOWED_MASK 0x1 + +/* mask, shift for 'link_quality' within LINK_QUALITY_INFO field */ +#define LINK_QUALITY_SHIFT 24 +#define LINK_QUALITY_MASK 0x7 + +/* + * mask, shift for reading 'planned_down_remote_reason_code' + * from LINK_QUALITY_INFO field + */ +#define DOWN_REMOTE_REASON_SHIFT 16 +#define DOWN_REMOTE_REASON_MASK 0xff + +/* verify capability PHY power management bits */ +#define PWRM_BER_CONTROL 0x1 +#define PWRM_BANDWIDTH_CONTROL 0x2 + +/* 8051 link down reasons */ +#define LDR_LINK_TRANSFER_ACTIVE_LOW 0xa +#define LDR_RECEIVED_LINKDOWN_IDLE_MSG 0xb +#define LDR_RECEIVED_HOST_OFFLINE_REQ 0xc + +/* verify capability fabric CRC size bits */ +enum { + CAP_CRC_14B = (1 << 0), /* 14b CRC */ + CAP_CRC_48B = (1 << 1), /* 48b CRC */ + CAP_CRC_12B_16B_PER_LANE = (1 << 2) /* 12b-16b per lane CRC */ +}; + +#define SUPPORTED_CRCS (CAP_CRC_14B | CAP_CRC_48B) + +/* misc status version fields */ +#define STS_FM_VERSION_A_SHIFT 16 +#define STS_FM_VERSION_A_MASK 0xff +#define STS_FM_VERSION_B_SHIFT 24 +#define STS_FM_VERSION_B_MASK 0xff + +/* LCB_CFG_CRC_MODE TX_VAL and RX_VAL CRC mode values */ +#define LCB_CRC_16B 0x0 /* 16b CRC */ +#define LCB_CRC_14B 0x1 /* 14b CRC */ +#define LCB_CRC_48B 0x2 /* 48b CRC */ +#define LCB_CRC_12B_16B_PER_LANE 0x3 /* 12b-16b per lane CRC */ + +/* + * the following enum is (almost) a copy/paste of the definition + * in the OPA spec, section 20.2.2.6.8 (PortInfo) + */ +enum { + PORT_LTP_CRC_MODE_NONE = 0, + PORT_LTP_CRC_MODE_14 = 1, /* 14-bit LTP CRC mode (optional) */ + PORT_LTP_CRC_MODE_16 = 2, /* 16-bit LTP CRC mode */ + PORT_LTP_CRC_MODE_48 = 4, + /* 48-bit overlapping LTP CRC mode (optional) */ + PORT_LTP_CRC_MODE_PER_LANE = 8 + /* 12 to 16 bit per lane LTP CRC mode (optional) */ +}; + +/* timeouts */ +#define LINK_RESTART_DELAY 1000 /* link restart delay, in ms */ +#define TIMEOUT_8051_START 5000 /* 8051 start timeout, in ms */ +#define DC8051_COMMAND_TIMEOUT 20000 /* DC8051 command timeout, in ms */ +#define FREEZE_STATUS_TIMEOUT 20 /* wait for freeze indicators, in ms */ +#define VL_STATUS_CLEAR_TIMEOUT 5000 /* per-VL status clear, in ms */ +#define CCE_STATUS_TIMEOUT 10 /* time to clear CCE Status, in ms */ + +/* cclock tick time, in picoseconds per tick: 1/speed * 10^12 */ +#define ASIC_CCLOCK_PS 1242 /* 805 MHz */ +#define FPGA_CCLOCK_PS 30300 /* 33 MHz */ + +/* + * Mask of enabled MISC errors. Do not enable the two RSA engine errors - + * see firmware.c:run_rsa() for details. + */ +#define DRIVER_MISC_MASK \ + (~(MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK \ + | MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK)) + +/* valid values for the loopback module parameter */ +#define LOOPBACK_NONE 0 /* no loopback - default */ +#define LOOPBACK_SERDES 1 +#define LOOPBACK_LCB 2 +#define LOOPBACK_CABLE 3 /* external cable */ + +/* read and write hardware registers */ +u64 read_csr(const struct hfi1_devdata *dd, u32 offset); +void write_csr(const struct hfi1_devdata *dd, u32 offset, u64 value); + +/* + * The *_kctxt_* flavor of the CSR read/write functions are for + * per-context or per-SDMA CSRs that are not mappable to user-space. + * Their spacing is not a PAGE_SIZE multiple. + */ +static inline u64 read_kctxt_csr(const struct hfi1_devdata *dd, int ctxt, + u32 offset0) +{ + /* kernel per-context CSRs are separated by 0x100 */ + return read_csr(dd, offset0 + (0x100 * ctxt)); +} + +static inline void write_kctxt_csr(struct hfi1_devdata *dd, int ctxt, + u32 offset0, u64 value) +{ + /* kernel per-context CSRs are separated by 0x100 */ + write_csr(dd, offset0 + (0x100 * ctxt), value); +} + +int read_lcb_csr(struct hfi1_devdata *dd, u32 offset, u64 *data); +int write_lcb_csr(struct hfi1_devdata *dd, u32 offset, u64 data); + +void __iomem *get_csr_addr( + struct hfi1_devdata *dd, + u32 offset); + +static inline void __iomem *get_kctxt_csr_addr( + struct hfi1_devdata *dd, + int ctxt, + u32 offset0) +{ + return get_csr_addr(dd, offset0 + (0x100 * ctxt)); +} + +/* + * The *_uctxt_* flavor of the CSR read/write functions are for + * per-context CSRs that are mappable to user space. All these CSRs + * are spaced by a PAGE_SIZE multiple in order to be mappable to + * different processes without exposing other contexts' CSRs + */ +static inline u64 read_uctxt_csr(const struct hfi1_devdata *dd, int ctxt, + u32 offset0) +{ + /* user per-context CSRs are separated by 0x1000 */ + return read_csr(dd, offset0 + (0x1000 * ctxt)); +} + +static inline void write_uctxt_csr(struct hfi1_devdata *dd, int ctxt, + u32 offset0, u64 value) +{ + /* user per-context CSRs are separated by 0x1000 */ + write_csr(dd, offset0 + (0x1000 * ctxt), value); +} + +u64 create_pbc(struct hfi1_pportdata *ppd, u64, int, u32, u32); + +/* firmware.c */ +#define SBUS_MASTER_BROADCAST 0xfd +#define NUM_PCIE_SERDES 16 /* number of PCIe serdes on the SBus */ +extern const u8 pcie_serdes_broadcast[]; +extern const u8 pcie_pcs_addrs[2][NUM_PCIE_SERDES]; +extern uint platform_config_load; + +/* SBus commands */ +#define RESET_SBUS_RECEIVER 0x20 +#define WRITE_SBUS_RECEIVER 0x21 +void sbus_request(struct hfi1_devdata *dd, + u8 receiver_addr, u8 data_addr, u8 command, u32 data_in); +int sbus_request_slow(struct hfi1_devdata *dd, + u8 receiver_addr, u8 data_addr, u8 command, u32 data_in); +void set_sbus_fast_mode(struct hfi1_devdata *dd); +void clear_sbus_fast_mode(struct hfi1_devdata *dd); +int hfi1_firmware_init(struct hfi1_devdata *dd); +int load_pcie_firmware(struct hfi1_devdata *dd); +int load_firmware(struct hfi1_devdata *dd); +void dispose_firmware(void); +int acquire_hw_mutex(struct hfi1_devdata *dd); +void release_hw_mutex(struct hfi1_devdata *dd); + +/* + * Bitmask of dynamic access for ASIC block chip resources. Each HFI has its + * own range of bits for the resource so it can clear its own bits on + * starting and exiting. If either HFI has the resource bit set, the + * resource is in use. The separate bit ranges are: + * HFI0 bits 7:0 + * HFI1 bits 15:8 + */ +#define CR_SBUS 0x01 /* SBUS, THERM, and PCIE registers */ +#define CR_EPROM 0x02 /* EEP, GPIO registers */ +#define CR_I2C1 0x04 /* QSFP1_OE register */ +#define CR_I2C2 0x08 /* QSFP2_OE register */ +#define CR_DYN_SHIFT 8 /* dynamic flag shift */ +#define CR_DYN_MASK ((1ull << CR_DYN_SHIFT) - 1) + +/* + * Bitmask of static ASIC states these are outside of the dynamic ASIC + * block chip resources above. These are to be set once and never cleared. + * Must be holding the SBus dynamic flag when setting. + */ +#define CR_THERM_INIT 0x010000 + +int acquire_chip_resource(struct hfi1_devdata *dd, u32 resource, u32 mswait); +void release_chip_resource(struct hfi1_devdata *dd, u32 resource); +bool check_chip_resource(struct hfi1_devdata *dd, u32 resource, + const char *func); +void init_chip_resources(struct hfi1_devdata *dd); +void finish_chip_resources(struct hfi1_devdata *dd); + +/* ms wait time for access to an SBus resoure */ +#define SBUS_TIMEOUT 4000 /* long enough for a FW download and SBR */ + +/* ms wait time for a qsfp (i2c) chain to become available */ +#define QSFP_WAIT 20000 /* long enough for FW update to the F4 uc */ + +void fabric_serdes_reset(struct hfi1_devdata *dd); +int read_8051_data(struct hfi1_devdata *dd, u32 addr, u32 len, u64 *result); + +/* chip.c */ +void read_misc_status(struct hfi1_devdata *dd, u8 *ver_a, u8 *ver_b); +void read_guid(struct hfi1_devdata *dd); +int wait_fm_ready(struct hfi1_devdata *dd, u32 mstimeout); +void set_link_down_reason(struct hfi1_pportdata *ppd, u8 lcl_reason, + u8 neigh_reason, u8 rem_reason); +int set_link_state(struct hfi1_pportdata *, u32 state); +int port_ltp_to_cap(int port_ltp); +void handle_verify_cap(struct work_struct *work); +void handle_freeze(struct work_struct *work); +void handle_link_up(struct work_struct *work); +void handle_link_down(struct work_struct *work); +void handle_link_downgrade(struct work_struct *work); +void handle_link_bounce(struct work_struct *work); +void handle_sma_message(struct work_struct *work); +void reset_qsfp(struct hfi1_pportdata *ppd); +void qsfp_event(struct work_struct *work); +void start_freeze_handling(struct hfi1_pportdata *ppd, int flags); +int send_idle_sma(struct hfi1_devdata *dd, u64 message); +int load_8051_config(struct hfi1_devdata *, u8, u8, u32); +int read_8051_config(struct hfi1_devdata *, u8, u8, u32 *); +int start_link(struct hfi1_pportdata *ppd); +int bringup_serdes(struct hfi1_pportdata *ppd); +void set_intr_state(struct hfi1_devdata *dd, u32 enable); +void apply_link_downgrade_policy(struct hfi1_pportdata *ppd, + int refresh_widths); +void update_usrhead(struct hfi1_ctxtdata *, u32, u32, u32, u32, u32); +int stop_drain_data_vls(struct hfi1_devdata *dd); +int open_fill_data_vls(struct hfi1_devdata *dd); +u32 ns_to_cclock(struct hfi1_devdata *dd, u32 ns); +u32 cclock_to_ns(struct hfi1_devdata *dd, u32 cclock); +void get_linkup_link_widths(struct hfi1_pportdata *ppd); +void read_ltp_rtt(struct hfi1_devdata *dd); +void clear_linkup_counters(struct hfi1_devdata *dd); +u32 hdrqempty(struct hfi1_ctxtdata *rcd); +int is_ax(struct hfi1_devdata *dd); +int is_bx(struct hfi1_devdata *dd); +u32 read_physical_state(struct hfi1_devdata *dd); +u32 chip_to_opa_pstate(struct hfi1_devdata *dd, u32 chip_pstate); +u32 get_logical_state(struct hfi1_pportdata *ppd); +const char *opa_lstate_name(u32 lstate); +const char *opa_pstate_name(u32 pstate); +u32 driver_physical_state(struct hfi1_pportdata *ppd); +u32 driver_logical_state(struct hfi1_pportdata *ppd); + +int acquire_lcb_access(struct hfi1_devdata *dd, int sleep_ok); +int release_lcb_access(struct hfi1_devdata *dd, int sleep_ok); +#define LCB_START DC_LCB_CSRS +#define LCB_END DC_8051_CSRS /* next block is 8051 */ +static inline int is_lcb_offset(u32 offset) +{ + return (offset >= LCB_START && offset < LCB_END); +} + +extern uint num_vls; + +extern uint disable_integrity; +u64 read_dev_cntr(struct hfi1_devdata *dd, int index, int vl); +u64 write_dev_cntr(struct hfi1_devdata *dd, int index, int vl, u64 data); +u64 read_port_cntr(struct hfi1_pportdata *ppd, int index, int vl); +u64 write_port_cntr(struct hfi1_pportdata *ppd, int index, int vl, u64 data); +u32 read_logical_state(struct hfi1_devdata *dd); +void force_recv_intr(struct hfi1_ctxtdata *rcd); + +/* Per VL indexes */ +enum { + C_VL_0 = 0, + C_VL_1, + C_VL_2, + C_VL_3, + C_VL_4, + C_VL_5, + C_VL_6, + C_VL_7, + C_VL_15, + C_VL_COUNT +}; + +static inline int vl_from_idx(int idx) +{ + return (idx == C_VL_15 ? 15 : idx); +} + +static inline int idx_from_vl(int vl) +{ + return (vl == 15 ? C_VL_15 : vl); +} + +/* Per device counter indexes */ +enum { + C_RCV_OVF = 0, + C_RX_TID_FULL, + C_RX_TID_INVALID, + C_RX_TID_FLGMS, + C_RX_CTX_EGRS, + C_RCV_TID_FLSMS, + C_CCE_PCI_CR_ST, + C_CCE_PCI_TR_ST, + C_CCE_PIO_WR_ST, + C_CCE_ERR_INT, + C_CCE_SDMA_INT, + C_CCE_MISC_INT, + C_CCE_RCV_AV_INT, + C_CCE_RCV_URG_INT, + C_CCE_SEND_CR_INT, + C_DC_UNC_ERR, + C_DC_RCV_ERR, + C_DC_FM_CFG_ERR, + C_DC_RMT_PHY_ERR, + C_DC_DROPPED_PKT, + C_DC_MC_XMIT_PKTS, + C_DC_MC_RCV_PKTS, + C_DC_XMIT_CERR, + C_DC_RCV_CERR, + C_DC_RCV_FCC, + C_DC_XMIT_FCC, + C_DC_XMIT_FLITS, + C_DC_RCV_FLITS, + C_DC_XMIT_PKTS, + C_DC_RCV_PKTS, + C_DC_RX_FLIT_VL, + C_DC_RX_PKT_VL, + C_DC_RCV_FCN, + C_DC_RCV_FCN_VL, + C_DC_RCV_BCN, + C_DC_RCV_BCN_VL, + C_DC_RCV_BBL, + C_DC_RCV_BBL_VL, + C_DC_MARK_FECN, + C_DC_MARK_FECN_VL, + C_DC_TOTAL_CRC, + C_DC_CRC_LN0, + C_DC_CRC_LN1, + C_DC_CRC_LN2, + C_DC_CRC_LN3, + C_DC_CRC_MULT_LN, + C_DC_TX_REPLAY, + C_DC_RX_REPLAY, + C_DC_SEQ_CRC_CNT, + C_DC_ESC0_ONLY_CNT, + C_DC_ESC0_PLUS1_CNT, + C_DC_ESC0_PLUS2_CNT, + C_DC_REINIT_FROM_PEER_CNT, + C_DC_SBE_CNT, + C_DC_MISC_FLG_CNT, + C_DC_PRF_GOOD_LTP_CNT, + C_DC_PRF_ACCEPTED_LTP_CNT, + C_DC_PRF_RX_FLIT_CNT, + C_DC_PRF_TX_FLIT_CNT, + C_DC_PRF_CLK_CNTR, + C_DC_PG_DBG_FLIT_CRDTS_CNT, + C_DC_PG_STS_PAUSE_COMPLETE_CNT, + C_DC_PG_STS_TX_SBE_CNT, + C_DC_PG_STS_TX_MBE_CNT, + C_SW_CPU_INTR, + C_SW_CPU_RCV_LIM, + C_SW_VTX_WAIT, + C_SW_PIO_WAIT, + C_SW_PIO_DRAIN, + C_SW_KMEM_WAIT, + C_SW_SEND_SCHED, + C_SDMA_DESC_FETCHED_CNT, + C_SDMA_INT_CNT, + C_SDMA_ERR_CNT, + C_SDMA_IDLE_INT_CNT, + C_SDMA_PROGRESS_INT_CNT, +/* MISC_ERR_STATUS */ + C_MISC_PLL_LOCK_FAIL_ERR, + C_MISC_MBIST_FAIL_ERR, + C_MISC_INVALID_EEP_CMD_ERR, + C_MISC_EFUSE_DONE_PARITY_ERR, + C_MISC_EFUSE_WRITE_ERR, + C_MISC_EFUSE_READ_BAD_ADDR_ERR, + C_MISC_EFUSE_CSR_PARITY_ERR, + C_MISC_FW_AUTH_FAILED_ERR, + C_MISC_KEY_MISMATCH_ERR, + C_MISC_SBUS_WRITE_FAILED_ERR, + C_MISC_CSR_WRITE_BAD_ADDR_ERR, + C_MISC_CSR_READ_BAD_ADDR_ERR, + C_MISC_CSR_PARITY_ERR, +/* CceErrStatus */ + /* + * A special counter that is the aggregate count + * of all the cce_err_status errors. The remainder + * are actual bits in the CceErrStatus register. + */ + C_CCE_ERR_STATUS_AGGREGATED_CNT, + C_CCE_MSIX_CSR_PARITY_ERR, + C_CCE_INT_MAP_UNC_ERR, + C_CCE_INT_MAP_COR_ERR, + C_CCE_MSIX_TABLE_UNC_ERR, + C_CCE_MSIX_TABLE_COR_ERR, + C_CCE_RXDMA_CONV_FIFO_PARITY_ERR, + C_CCE_RCPL_ASYNC_FIFO_PARITY_ERR, + C_CCE_SEG_WRITE_BAD_ADDR_ERR, + C_CCE_SEG_READ_BAD_ADDR_ERR, + C_LA_TRIGGERED, + C_CCE_TRGT_CPL_TIMEOUT_ERR, + C_PCIC_RECEIVE_PARITY_ERR, + C_PCIC_TRANSMIT_BACK_PARITY_ERR, + C_PCIC_TRANSMIT_FRONT_PARITY_ERR, + C_PCIC_CPL_DAT_Q_UNC_ERR, + C_PCIC_CPL_HD_Q_UNC_ERR, + C_PCIC_POST_DAT_Q_UNC_ERR, + C_PCIC_POST_HD_Q_UNC_ERR, + C_PCIC_RETRY_SOT_MEM_UNC_ERR, + C_PCIC_RETRY_MEM_UNC_ERR, + C_PCIC_N_POST_DAT_Q_PARITY_ERR, + C_PCIC_N_POST_H_Q_PARITY_ERR, + C_PCIC_CPL_DAT_Q_COR_ERR, + C_PCIC_CPL_HD_Q_COR_ERR, + C_PCIC_POST_DAT_Q_COR_ERR, + C_PCIC_POST_HD_Q_COR_ERR, + C_PCIC_RETRY_SOT_MEM_COR_ERR, + C_PCIC_RETRY_MEM_COR_ERR, + C_CCE_CLI1_ASYNC_FIFO_DBG_PARITY_ERR, + C_CCE_CLI1_ASYNC_FIFO_RXDMA_PARITY_ERR, + C_CCE_CLI1_ASYNC_FIFO_SDMA_HD_PARITY_ERR, + C_CCE_CLI1_ASYNC_FIFO_PIO_CRDT_PARITY_ERR, + C_CCE_CLI2_ASYNC_FIFO_PARITY_ERR, + C_CCE_CSR_CFG_BUS_PARITY_ERR, + C_CCE_CLI0_ASYNC_FIFO_PARTIY_ERR, + C_CCE_RSPD_DATA_PARITY_ERR, + C_CCE_TRGT_ACCESS_ERR, + C_CCE_TRGT_ASYNC_FIFO_PARITY_ERR, + C_CCE_CSR_WRITE_BAD_ADDR_ERR, + C_CCE_CSR_READ_BAD_ADDR_ERR, + C_CCE_CSR_PARITY_ERR, +/* RcvErrStatus */ + C_RX_CSR_PARITY_ERR, + C_RX_CSR_WRITE_BAD_ADDR_ERR, + C_RX_CSR_READ_BAD_ADDR_ERR, + C_RX_DMA_CSR_UNC_ERR, + C_RX_DMA_DQ_FSM_ENCODING_ERR, + C_RX_DMA_EQ_FSM_ENCODING_ERR, + C_RX_DMA_CSR_PARITY_ERR, + C_RX_RBUF_DATA_COR_ERR, + C_RX_RBUF_DATA_UNC_ERR, + C_RX_DMA_DATA_FIFO_RD_COR_ERR, + C_RX_DMA_DATA_FIFO_RD_UNC_ERR, + C_RX_DMA_HDR_FIFO_RD_COR_ERR, + C_RX_DMA_HDR_FIFO_RD_UNC_ERR, + C_RX_RBUF_DESC_PART2_COR_ERR, + C_RX_RBUF_DESC_PART2_UNC_ERR, + C_RX_RBUF_DESC_PART1_COR_ERR, + C_RX_RBUF_DESC_PART1_UNC_ERR, + C_RX_HQ_INTR_FSM_ERR, + C_RX_HQ_INTR_CSR_PARITY_ERR, + C_RX_LOOKUP_CSR_PARITY_ERR, + C_RX_LOOKUP_RCV_ARRAY_COR_ERR, + C_RX_LOOKUP_RCV_ARRAY_UNC_ERR, + C_RX_LOOKUP_DES_PART2_PARITY_ERR, + C_RX_LOOKUP_DES_PART1_UNC_COR_ERR, + C_RX_LOOKUP_DES_PART1_UNC_ERR, + C_RX_RBUF_NEXT_FREE_BUF_COR_ERR, + C_RX_RBUF_NEXT_FREE_BUF_UNC_ERR, + C_RX_RBUF_FL_INIT_WR_ADDR_PARITY_ERR, + C_RX_RBUF_FL_INITDONE_PARITY_ERR, + C_RX_RBUF_FL_WRITE_ADDR_PARITY_ERR, + C_RX_RBUF_FL_RD_ADDR_PARITY_ERR, + C_RX_RBUF_EMPTY_ERR, + C_RX_RBUF_FULL_ERR, + C_RX_RBUF_BAD_LOOKUP_ERR, + C_RX_RBUF_CTX_ID_PARITY_ERR, + C_RX_RBUF_CSR_QEOPDW_PARITY_ERR, + C_RX_RBUF_CSR_Q_NUM_OF_PKT_PARITY_ERR, + C_RX_RBUF_CSR_Q_T1_PTR_PARITY_ERR, + C_RX_RBUF_CSR_Q_HD_PTR_PARITY_ERR, + C_RX_RBUF_CSR_Q_VLD_BIT_PARITY_ERR, + C_RX_RBUF_CSR_Q_NEXT_BUF_PARITY_ERR, + C_RX_RBUF_CSR_Q_ENT_CNT_PARITY_ERR, + C_RX_RBUF_CSR_Q_HEAD_BUF_NUM_PARITY_ERR, + C_RX_RBUF_BLOCK_LIST_READ_COR_ERR, + C_RX_RBUF_BLOCK_LIST_READ_UNC_ERR, + C_RX_RBUF_LOOKUP_DES_COR_ERR, + C_RX_RBUF_LOOKUP_DES_UNC_ERR, + C_RX_RBUF_LOOKUP_DES_REG_UNC_COR_ERR, + C_RX_RBUF_LOOKUP_DES_REG_UNC_ERR, + C_RX_RBUF_FREE_LIST_COR_ERR, + C_RX_RBUF_FREE_LIST_UNC_ERR, + C_RX_RCV_FSM_ENCODING_ERR, + C_RX_DMA_FLAG_COR_ERR, + C_RX_DMA_FLAG_UNC_ERR, + C_RX_DC_SOP_EOP_PARITY_ERR, + C_RX_RCV_CSR_PARITY_ERR, + C_RX_RCV_QP_MAP_TABLE_COR_ERR, + C_RX_RCV_QP_MAP_TABLE_UNC_ERR, + C_RX_RCV_DATA_COR_ERR, + C_RX_RCV_DATA_UNC_ERR, + C_RX_RCV_HDR_COR_ERR, + C_RX_RCV_HDR_UNC_ERR, + C_RX_DC_INTF_PARITY_ERR, + C_RX_DMA_CSR_COR_ERR, +/* SendPioErrStatus */ + C_PIO_PEC_SOP_HEAD_PARITY_ERR, + C_PIO_PCC_SOP_HEAD_PARITY_ERR, + C_PIO_LAST_RETURNED_CNT_PARITY_ERR, + C_PIO_CURRENT_FREE_CNT_PARITY_ERR, + C_PIO_RSVD_31_ERR, + C_PIO_RSVD_30_ERR, + C_PIO_PPMC_SOP_LEN_ERR, + C_PIO_PPMC_BQC_MEM_PARITY_ERR, + C_PIO_VL_FIFO_PARITY_ERR, + C_PIO_VLF_SOP_PARITY_ERR, + C_PIO_VLF_V1_LEN_PARITY_ERR, + C_PIO_BLOCK_QW_COUNT_PARITY_ERR, + C_PIO_WRITE_QW_VALID_PARITY_ERR, + C_PIO_STATE_MACHINE_ERR, + C_PIO_WRITE_DATA_PARITY_ERR, + C_PIO_HOST_ADDR_MEM_COR_ERR, + C_PIO_HOST_ADDR_MEM_UNC_ERR, + C_PIO_PKT_EVICT_SM_OR_ARM_SM_ERR, + C_PIO_INIT_SM_IN_ERR, + C_PIO_PPMC_PBL_FIFO_ERR, + C_PIO_CREDIT_RET_FIFO_PARITY_ERR, + C_PIO_V1_LEN_MEM_BANK1_COR_ERR, + C_PIO_V1_LEN_MEM_BANK0_COR_ERR, + C_PIO_V1_LEN_MEM_BANK1_UNC_ERR, + C_PIO_V1_LEN_MEM_BANK0_UNC_ERR, + C_PIO_SM_PKT_RESET_PARITY_ERR, + C_PIO_PKT_EVICT_FIFO_PARITY_ERR, + C_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR, + C_PIO_SBRDCTL_CRREL_PARITY_ERR, + C_PIO_PEC_FIFO_PARITY_ERR, + C_PIO_PCC_FIFO_PARITY_ERR, + C_PIO_SB_MEM_FIFO1_ERR, + C_PIO_SB_MEM_FIFO0_ERR, + C_PIO_CSR_PARITY_ERR, + C_PIO_WRITE_ADDR_PARITY_ERR, + C_PIO_WRITE_BAD_CTXT_ERR, +/* SendDmaErrStatus */ + C_SDMA_PCIE_REQ_TRACKING_COR_ERR, + C_SDMA_PCIE_REQ_TRACKING_UNC_ERR, + C_SDMA_CSR_PARITY_ERR, + C_SDMA_RPY_TAG_ERR, +/* SendEgressErrStatus */ + C_TX_READ_PIO_MEMORY_CSR_UNC_ERR, + C_TX_READ_SDMA_MEMORY_CSR_UNC_ERR, + C_TX_EGRESS_FIFO_COR_ERR, + C_TX_READ_PIO_MEMORY_COR_ERR, + C_TX_READ_SDMA_MEMORY_COR_ERR, + C_TX_SB_HDR_COR_ERR, + C_TX_CREDIT_OVERRUN_ERR, + C_TX_LAUNCH_FIFO8_COR_ERR, + C_TX_LAUNCH_FIFO7_COR_ERR, + C_TX_LAUNCH_FIFO6_COR_ERR, + C_TX_LAUNCH_FIFO5_COR_ERR, + C_TX_LAUNCH_FIFO4_COR_ERR, + C_TX_LAUNCH_FIFO3_COR_ERR, + C_TX_LAUNCH_FIFO2_COR_ERR, + C_TX_LAUNCH_FIFO1_COR_ERR, + C_TX_LAUNCH_FIFO0_COR_ERR, + C_TX_CREDIT_RETURN_VL_ERR, + C_TX_HCRC_INSERTION_ERR, + C_TX_EGRESS_FIFI_UNC_ERR, + C_TX_READ_PIO_MEMORY_UNC_ERR, + C_TX_READ_SDMA_MEMORY_UNC_ERR, + C_TX_SB_HDR_UNC_ERR, + C_TX_CREDIT_RETURN_PARITY_ERR, + C_TX_LAUNCH_FIFO8_UNC_OR_PARITY_ERR, + C_TX_LAUNCH_FIFO7_UNC_OR_PARITY_ERR, + C_TX_LAUNCH_FIFO6_UNC_OR_PARITY_ERR, + C_TX_LAUNCH_FIFO5_UNC_OR_PARITY_ERR, + C_TX_LAUNCH_FIFO4_UNC_OR_PARITY_ERR, + C_TX_LAUNCH_FIFO3_UNC_OR_PARITY_ERR, + C_TX_LAUNCH_FIFO2_UNC_OR_PARITY_ERR, + C_TX_LAUNCH_FIFO1_UNC_OR_PARITY_ERR, + C_TX_LAUNCH_FIFO0_UNC_OR_PARITY_ERR, + C_TX_SDMA15_DISALLOWED_PACKET_ERR, + C_TX_SDMA14_DISALLOWED_PACKET_ERR, + C_TX_SDMA13_DISALLOWED_PACKET_ERR, + C_TX_SDMA12_DISALLOWED_PACKET_ERR, + C_TX_SDMA11_DISALLOWED_PACKET_ERR, + C_TX_SDMA10_DISALLOWED_PACKET_ERR, + C_TX_SDMA9_DISALLOWED_PACKET_ERR, + C_TX_SDMA8_DISALLOWED_PACKET_ERR, + C_TX_SDMA7_DISALLOWED_PACKET_ERR, + C_TX_SDMA6_DISALLOWED_PACKET_ERR, + C_TX_SDMA5_DISALLOWED_PACKET_ERR, + C_TX_SDMA4_DISALLOWED_PACKET_ERR, + C_TX_SDMA3_DISALLOWED_PACKET_ERR, + C_TX_SDMA2_DISALLOWED_PACKET_ERR, + C_TX_SDMA1_DISALLOWED_PACKET_ERR, + C_TX_SDMA0_DISALLOWED_PACKET_ERR, + C_TX_CONFIG_PARITY_ERR, + C_TX_SBRD_CTL_CSR_PARITY_ERR, + C_TX_LAUNCH_CSR_PARITY_ERR, + C_TX_ILLEGAL_CL_ERR, + C_TX_SBRD_CTL_STATE_MACHINE_PARITY_ERR, + C_TX_RESERVED_10, + C_TX_RESERVED_9, + C_TX_SDMA_LAUNCH_INTF_PARITY_ERR, + C_TX_PIO_LAUNCH_INTF_PARITY_ERR, + C_TX_RESERVED_6, + C_TX_INCORRECT_LINK_STATE_ERR, + C_TX_LINK_DOWN_ERR, + C_TX_EGRESS_FIFO_UNDERRUN_OR_PARITY_ERR, + C_TX_RESERVED_2, + C_TX_PKT_INTEGRITY_MEM_UNC_ERR, + C_TX_PKT_INTEGRITY_MEM_COR_ERR, +/* SendErrStatus */ + C_SEND_CSR_WRITE_BAD_ADDR_ERR, + C_SEND_CSR_READ_BAD_ADD_ERR, + C_SEND_CSR_PARITY_ERR, +/* SendCtxtErrStatus */ + C_PIO_WRITE_OUT_OF_BOUNDS_ERR, + C_PIO_WRITE_OVERFLOW_ERR, + C_PIO_WRITE_CROSSES_BOUNDARY_ERR, + C_PIO_DISALLOWED_PACKET_ERR, + C_PIO_INCONSISTENT_SOP_ERR, +/*SendDmaEngErrStatus */ + C_SDMA_HEADER_REQUEST_FIFO_COR_ERR, + C_SDMA_HEADER_STORAGE_COR_ERR, + C_SDMA_PACKET_TRACKING_COR_ERR, + C_SDMA_ASSEMBLY_COR_ERR, + C_SDMA_DESC_TABLE_COR_ERR, + C_SDMA_HEADER_REQUEST_FIFO_UNC_ERR, + C_SDMA_HEADER_STORAGE_UNC_ERR, + C_SDMA_PACKET_TRACKING_UNC_ERR, + C_SDMA_ASSEMBLY_UNC_ERR, + C_SDMA_DESC_TABLE_UNC_ERR, + C_SDMA_TIMEOUT_ERR, + C_SDMA_HEADER_LENGTH_ERR, + C_SDMA_HEADER_ADDRESS_ERR, + C_SDMA_HEADER_SELECT_ERR, + C_SMDA_RESERVED_9, + C_SDMA_PACKET_DESC_OVERFLOW_ERR, + C_SDMA_LENGTH_MISMATCH_ERR, + C_SDMA_HALT_ERR, + C_SDMA_MEM_READ_ERR, + C_SDMA_FIRST_DESC_ERR, + C_SDMA_TAIL_OUT_OF_BOUNDS_ERR, + C_SDMA_TOO_LONG_ERR, + C_SDMA_GEN_MISMATCH_ERR, + C_SDMA_WRONG_DW_ERR, + DEV_CNTR_LAST /* Must be kept last */ +}; + +/* Per port counter indexes */ +enum { + C_TX_UNSUP_VL = 0, + C_TX_INVAL_LEN, + C_TX_MM_LEN_ERR, + C_TX_UNDERRUN, + C_TX_FLOW_STALL, + C_TX_DROPPED, + C_TX_HDR_ERR, + C_TX_PKT, + C_TX_WORDS, + C_TX_WAIT, + C_TX_FLIT_VL, + C_TX_PKT_VL, + C_TX_WAIT_VL, + C_RX_PKT, + C_RX_WORDS, + C_SW_LINK_DOWN, + C_SW_LINK_UP, + C_SW_UNKNOWN_FRAME, + C_SW_XMIT_DSCD, + C_SW_XMIT_DSCD_VL, + C_SW_XMIT_CSTR_ERR, + C_SW_RCV_CSTR_ERR, + C_SW_IBP_LOOP_PKTS, + C_SW_IBP_RC_RESENDS, + C_SW_IBP_RNR_NAKS, + C_SW_IBP_OTHER_NAKS, + C_SW_IBP_RC_TIMEOUTS, + C_SW_IBP_PKT_DROPS, + C_SW_IBP_DMA_WAIT, + C_SW_IBP_RC_SEQNAK, + C_SW_IBP_RC_DUPREQ, + C_SW_IBP_RDMA_SEQ, + C_SW_IBP_UNALIGNED, + C_SW_IBP_SEQ_NAK, + C_SW_CPU_RC_ACKS, + C_SW_CPU_RC_QACKS, + C_SW_CPU_RC_DELAYED_COMP, + C_RCV_HDR_OVF_0, + C_RCV_HDR_OVF_1, + C_RCV_HDR_OVF_2, + C_RCV_HDR_OVF_3, + C_RCV_HDR_OVF_4, + C_RCV_HDR_OVF_5, + C_RCV_HDR_OVF_6, + C_RCV_HDR_OVF_7, + C_RCV_HDR_OVF_8, + C_RCV_HDR_OVF_9, + C_RCV_HDR_OVF_10, + C_RCV_HDR_OVF_11, + C_RCV_HDR_OVF_12, + C_RCV_HDR_OVF_13, + C_RCV_HDR_OVF_14, + C_RCV_HDR_OVF_15, + C_RCV_HDR_OVF_16, + C_RCV_HDR_OVF_17, + C_RCV_HDR_OVF_18, + C_RCV_HDR_OVF_19, + C_RCV_HDR_OVF_20, + C_RCV_HDR_OVF_21, + C_RCV_HDR_OVF_22, + C_RCV_HDR_OVF_23, + C_RCV_HDR_OVF_24, + C_RCV_HDR_OVF_25, + C_RCV_HDR_OVF_26, + C_RCV_HDR_OVF_27, + C_RCV_HDR_OVF_28, + C_RCV_HDR_OVF_29, + C_RCV_HDR_OVF_30, + C_RCV_HDR_OVF_31, + C_RCV_HDR_OVF_32, + C_RCV_HDR_OVF_33, + C_RCV_HDR_OVF_34, + C_RCV_HDR_OVF_35, + C_RCV_HDR_OVF_36, + C_RCV_HDR_OVF_37, + C_RCV_HDR_OVF_38, + C_RCV_HDR_OVF_39, + C_RCV_HDR_OVF_40, + C_RCV_HDR_OVF_41, + C_RCV_HDR_OVF_42, + C_RCV_HDR_OVF_43, + C_RCV_HDR_OVF_44, + C_RCV_HDR_OVF_45, + C_RCV_HDR_OVF_46, + C_RCV_HDR_OVF_47, + C_RCV_HDR_OVF_48, + C_RCV_HDR_OVF_49, + C_RCV_HDR_OVF_50, + C_RCV_HDR_OVF_51, + C_RCV_HDR_OVF_52, + C_RCV_HDR_OVF_53, + C_RCV_HDR_OVF_54, + C_RCV_HDR_OVF_55, + C_RCV_HDR_OVF_56, + C_RCV_HDR_OVF_57, + C_RCV_HDR_OVF_58, + C_RCV_HDR_OVF_59, + C_RCV_HDR_OVF_60, + C_RCV_HDR_OVF_61, + C_RCV_HDR_OVF_62, + C_RCV_HDR_OVF_63, + C_RCV_HDR_OVF_64, + C_RCV_HDR_OVF_65, + C_RCV_HDR_OVF_66, + C_RCV_HDR_OVF_67, + C_RCV_HDR_OVF_68, + C_RCV_HDR_OVF_69, + C_RCV_HDR_OVF_70, + C_RCV_HDR_OVF_71, + C_RCV_HDR_OVF_72, + C_RCV_HDR_OVF_73, + C_RCV_HDR_OVF_74, + C_RCV_HDR_OVF_75, + C_RCV_HDR_OVF_76, + C_RCV_HDR_OVF_77, + C_RCV_HDR_OVF_78, + C_RCV_HDR_OVF_79, + C_RCV_HDR_OVF_80, + C_RCV_HDR_OVF_81, + C_RCV_HDR_OVF_82, + C_RCV_HDR_OVF_83, + C_RCV_HDR_OVF_84, + C_RCV_HDR_OVF_85, + C_RCV_HDR_OVF_86, + C_RCV_HDR_OVF_87, + C_RCV_HDR_OVF_88, + C_RCV_HDR_OVF_89, + C_RCV_HDR_OVF_90, + C_RCV_HDR_OVF_91, + C_RCV_HDR_OVF_92, + C_RCV_HDR_OVF_93, + C_RCV_HDR_OVF_94, + C_RCV_HDR_OVF_95, + C_RCV_HDR_OVF_96, + C_RCV_HDR_OVF_97, + C_RCV_HDR_OVF_98, + C_RCV_HDR_OVF_99, + C_RCV_HDR_OVF_100, + C_RCV_HDR_OVF_101, + C_RCV_HDR_OVF_102, + C_RCV_HDR_OVF_103, + C_RCV_HDR_OVF_104, + C_RCV_HDR_OVF_105, + C_RCV_HDR_OVF_106, + C_RCV_HDR_OVF_107, + C_RCV_HDR_OVF_108, + C_RCV_HDR_OVF_109, + C_RCV_HDR_OVF_110, + C_RCV_HDR_OVF_111, + C_RCV_HDR_OVF_112, + C_RCV_HDR_OVF_113, + C_RCV_HDR_OVF_114, + C_RCV_HDR_OVF_115, + C_RCV_HDR_OVF_116, + C_RCV_HDR_OVF_117, + C_RCV_HDR_OVF_118, + C_RCV_HDR_OVF_119, + C_RCV_HDR_OVF_120, + C_RCV_HDR_OVF_121, + C_RCV_HDR_OVF_122, + C_RCV_HDR_OVF_123, + C_RCV_HDR_OVF_124, + C_RCV_HDR_OVF_125, + C_RCV_HDR_OVF_126, + C_RCV_HDR_OVF_127, + C_RCV_HDR_OVF_128, + C_RCV_HDR_OVF_129, + C_RCV_HDR_OVF_130, + C_RCV_HDR_OVF_131, + C_RCV_HDR_OVF_132, + C_RCV_HDR_OVF_133, + C_RCV_HDR_OVF_134, + C_RCV_HDR_OVF_135, + C_RCV_HDR_OVF_136, + C_RCV_HDR_OVF_137, + C_RCV_HDR_OVF_138, + C_RCV_HDR_OVF_139, + C_RCV_HDR_OVF_140, + C_RCV_HDR_OVF_141, + C_RCV_HDR_OVF_142, + C_RCV_HDR_OVF_143, + C_RCV_HDR_OVF_144, + C_RCV_HDR_OVF_145, + C_RCV_HDR_OVF_146, + C_RCV_HDR_OVF_147, + C_RCV_HDR_OVF_148, + C_RCV_HDR_OVF_149, + C_RCV_HDR_OVF_150, + C_RCV_HDR_OVF_151, + C_RCV_HDR_OVF_152, + C_RCV_HDR_OVF_153, + C_RCV_HDR_OVF_154, + C_RCV_HDR_OVF_155, + C_RCV_HDR_OVF_156, + C_RCV_HDR_OVF_157, + C_RCV_HDR_OVF_158, + C_RCV_HDR_OVF_159, + PORT_CNTR_LAST /* Must be kept last */ +}; + +u64 get_all_cpu_total(u64 __percpu *cntr); +void hfi1_start_cleanup(struct hfi1_devdata *dd); +void hfi1_clear_tids(struct hfi1_ctxtdata *rcd); +struct hfi1_message_header *hfi1_get_msgheader( + struct hfi1_devdata *dd, __le32 *rhf_addr); +int hfi1_get_base_kinfo(struct hfi1_ctxtdata *rcd, + struct hfi1_ctxt_info *kinfo); +u64 hfi1_gpio_mod(struct hfi1_devdata *dd, u32 target, u32 data, u32 dir, + u32 mask); +int hfi1_init_ctxt(struct send_context *sc); +void hfi1_put_tid(struct hfi1_devdata *dd, u32 index, + u32 type, unsigned long pa, u16 order); +void hfi1_quiet_serdes(struct hfi1_pportdata *ppd); +void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, int ctxt); +u32 hfi1_read_cntrs(struct hfi1_devdata *dd, char **namep, u64 **cntrp); +u32 hfi1_read_portcntrs(struct hfi1_pportdata *ppd, char **namep, u64 **cntrp); +u8 hfi1_ibphys_portstate(struct hfi1_pportdata *ppd); +int hfi1_get_ib_cfg(struct hfi1_pportdata *ppd, int which); +int hfi1_set_ib_cfg(struct hfi1_pportdata *ppd, int which, u32 val); +int hfi1_set_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt, u16 jkey); +int hfi1_clear_ctxt_jkey(struct hfi1_devdata *dd, unsigned ctxt); +int hfi1_set_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt, u16 pkey); +int hfi1_clear_ctxt_pkey(struct hfi1_devdata *dd, unsigned ctxt); +void hfi1_read_link_quality(struct hfi1_devdata *dd, u8 *link_quality); + +/* + * Interrupt source table. + * + * Each entry is an interrupt source "type". It is ordered by increasing + * number. + */ +struct is_table { + int start; /* interrupt source type start */ + int end; /* interrupt source type end */ + /* routine that returns the name of the interrupt source */ + char *(*is_name)(char *name, size_t size, unsigned int source); + /* routine to call when receiving an interrupt */ + void (*is_int)(struct hfi1_devdata *dd, unsigned int source); +}; + +#endif /* _CHIP_H */ diff --git a/drivers/infiniband/hw/hfi1/chip_registers.h b/drivers/infiniband/hw/hfi1/chip_registers.h new file mode 100644 index 000000000..8744de666 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/chip_registers.h @@ -0,0 +1,1307 @@ +#ifndef DEF_CHIP_REG +#define DEF_CHIP_REG + +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#define CORE 0x000000000000 +#define CCE (CORE + 0x000000000000) +#define ASIC (CORE + 0x000000400000) +#define MISC (CORE + 0x000000500000) +#define DC_TOP_CSRS (CORE + 0x000000600000) +#define CHIP_DEBUG (CORE + 0x000000700000) +#define RXE (CORE + 0x000001000000) +#define TXE (CORE + 0x000001800000) +#define DCC_CSRS (DC_TOP_CSRS + 0x000000000000) +#define DC_LCB_CSRS (DC_TOP_CSRS + 0x000000001000) +#define DC_8051_CSRS (DC_TOP_CSRS + 0x000000002000) +#define PCIE 0 + +#define ASIC_NUM_SCRATCH 4 +#define CCE_ERR_INT_CNT 0 +#define CCE_MISC_INT_CNT 2 +#define CCE_NUM_32_BIT_COUNTERS 3 +#define CCE_NUM_32_BIT_INT_COUNTERS 6 +#define CCE_NUM_INT_CSRS 12 +#define CCE_NUM_INT_MAP_CSRS 96 +#define CCE_NUM_MSIX_PBAS 4 +#define CCE_NUM_MSIX_VECTORS 256 +#define CCE_NUM_SCRATCH 4 +#define CCE_PCIE_POSTED_CRDT_STALL_CNT 2 +#define CCE_PCIE_TRGT_STALL_CNT 0 +#define CCE_PIO_WR_STALL_CNT 1 +#define CCE_RCV_AVAIL_INT_CNT 3 +#define CCE_RCV_URGENT_INT_CNT 4 +#define CCE_SDMA_INT_CNT 1 +#define CCE_SEND_CREDIT_INT_CNT 5 +#define DCC_CFG_LED_CNTRL (DCC_CSRS + 0x000000000040) +#define DCC_CFG_LED_CNTRL_LED_CNTRL_SMASK 0x10ull +#define DCC_CFG_LED_CNTRL_LED_SW_BLINK_RATE_SHIFT 0 +#define DCC_CFG_LED_CNTRL_LED_SW_BLINK_RATE_SMASK 0xFull +#define DCC_CFG_PORT_CONFIG (DCC_CSRS + 0x000000000008) +#define DCC_CFG_PORT_CONFIG1 (DCC_CSRS + 0x000000000010) +#define DCC_CFG_PORT_CONFIG1_DLID_MASK_MASK 0xFFFFull +#define DCC_CFG_PORT_CONFIG1_DLID_MASK_SHIFT 16 +#define DCC_CFG_PORT_CONFIG1_DLID_MASK_SMASK 0xFFFF0000ull +#define DCC_CFG_PORT_CONFIG1_TARGET_DLID_MASK 0xFFFFull +#define DCC_CFG_PORT_CONFIG1_TARGET_DLID_SHIFT 0 +#define DCC_CFG_PORT_CONFIG1_TARGET_DLID_SMASK 0xFFFFull +#define DCC_CFG_PORT_CONFIG_LINK_STATE_MASK 0x7ull +#define DCC_CFG_PORT_CONFIG_LINK_STATE_SHIFT 48 +#define DCC_CFG_PORT_CONFIG_LINK_STATE_SMASK 0x7000000000000ull +#define DCC_CFG_PORT_CONFIG_MTU_CAP_MASK 0x7ull +#define DCC_CFG_PORT_CONFIG_MTU_CAP_SHIFT 32 +#define DCC_CFG_PORT_CONFIG_MTU_CAP_SMASK 0x700000000ull +#define DCC_CFG_RESET (DCC_CSRS + 0x000000000000) +#define DCC_CFG_RESET_RESET_LCB_SHIFT 0 +#define DCC_CFG_RESET_RESET_RX_FPE_SHIFT 2 +#define DCC_CFG_SC_VL_TABLE_15_0 (DCC_CSRS + 0x000000000028) +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY0_SHIFT 0 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY10_SHIFT 40 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY11_SHIFT 44 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY12_SHIFT 48 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY13_SHIFT 52 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY14_SHIFT 56 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY15_SHIFT 60 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY1_SHIFT 4 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY2_SHIFT 8 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY3_SHIFT 12 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY4_SHIFT 16 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY5_SHIFT 20 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY6_SHIFT 24 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY7_SHIFT 28 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY8_SHIFT 32 +#define DCC_CFG_SC_VL_TABLE_15_0_ENTRY9_SHIFT 36 +#define DCC_CFG_SC_VL_TABLE_31_16 (DCC_CSRS + 0x000000000030) +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY16_SHIFT 0 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY17_SHIFT 4 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY18_SHIFT 8 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY19_SHIFT 12 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY20_SHIFT 16 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY21_SHIFT 20 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY22_SHIFT 24 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY23_SHIFT 28 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY24_SHIFT 32 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY25_SHIFT 36 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY26_SHIFT 40 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY27_SHIFT 44 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY28_SHIFT 48 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY29_SHIFT 52 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY30_SHIFT 56 +#define DCC_CFG_SC_VL_TABLE_31_16_ENTRY31_SHIFT 60 +#define DCC_ERR_DROPPED_PKT_CNT (DCC_CSRS + 0x000000000120) +#define DCC_ERR_FLG (DCC_CSRS + 0x000000000050) +#define DCC_ERR_FLG_BAD_CRDT_ACK_ERR_SMASK 0x4000ull +#define DCC_ERR_FLG_BAD_CTRL_DIST_ERR_SMASK 0x200000ull +#define DCC_ERR_FLG_BAD_CTRL_FLIT_ERR_SMASK 0x10000ull +#define DCC_ERR_FLG_BAD_DLID_TARGET_ERR_SMASK 0x200ull +#define DCC_ERR_FLG_BAD_HEAD_DIST_ERR_SMASK 0x800000ull +#define DCC_ERR_FLG_BAD_L2_ERR_SMASK 0x2ull +#define DCC_ERR_FLG_BAD_LVER_ERR_SMASK 0x400ull +#define DCC_ERR_FLG_BAD_MID_TAIL_ERR_SMASK 0x8ull +#define DCC_ERR_FLG_BAD_PKT_LENGTH_ERR_SMASK 0x4000000ull +#define DCC_ERR_FLG_BAD_PREEMPTION_ERR_SMASK 0x10ull +#define DCC_ERR_FLG_BAD_SC_ERR_SMASK 0x4ull +#define DCC_ERR_FLG_BAD_TAIL_DIST_ERR_SMASK 0x400000ull +#define DCC_ERR_FLG_BAD_VL_MARKER_ERR_SMASK 0x80ull +#define DCC_ERR_FLG_CLR (DCC_CSRS + 0x000000000060) +#define DCC_ERR_FLG_CSR_ACCESS_BLOCKED_HOST_SMASK 0x8000000000ull +#define DCC_ERR_FLG_CSR_ACCESS_BLOCKED_UC_SMASK 0x10000000000ull +#define DCC_ERR_FLG_CSR_INVAL_ADDR_SMASK 0x400000000000ull +#define DCC_ERR_FLG_CSR_PARITY_ERR_SMASK 0x200000000000ull +#define DCC_ERR_FLG_DLID_ZERO_ERR_SMASK 0x40000000ull +#define DCC_ERR_FLG_EN (DCC_CSRS + 0x000000000058) +#define DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_HOST_SMASK 0x8000000000ull +#define DCC_ERR_FLG_EN_CSR_ACCESS_BLOCKED_UC_SMASK 0x10000000000ull +#define DCC_ERR_FLG_EVENT_CNTR_PARITY_ERR_SMASK 0x20000ull +#define DCC_ERR_FLG_EVENT_CNTR_ROLLOVER_ERR_SMASK 0x40000ull +#define DCC_ERR_FLG_FMCONFIG_ERR_SMASK 0x40000000000000ull +#define DCC_ERR_FLG_FPE_TX_FIFO_OVFLW_ERR_SMASK 0x2000000000ull +#define DCC_ERR_FLG_FPE_TX_FIFO_UNFLW_ERR_SMASK 0x4000000000ull +#define DCC_ERR_FLG_LATE_EBP_ERR_SMASK 0x1000000000ull +#define DCC_ERR_FLG_LATE_LONG_ERR_SMASK 0x800000000ull +#define DCC_ERR_FLG_LATE_SHORT_ERR_SMASK 0x400000000ull +#define DCC_ERR_FLG_LENGTH_MTU_ERR_SMASK 0x80000000ull +#define DCC_ERR_FLG_LINK_ERR_SMASK 0x80000ull +#define DCC_ERR_FLG_MISC_CNTR_ROLLOVER_ERR_SMASK 0x100000ull +#define DCC_ERR_FLG_NONVL15_STATE_ERR_SMASK 0x1000000ull +#define DCC_ERR_FLG_PERM_NVL15_ERR_SMASK 0x10000000ull +#define DCC_ERR_FLG_PREEMPTION_ERR_SMASK 0x20ull +#define DCC_ERR_FLG_PREEMPTIONVL15_ERR_SMASK 0x40ull +#define DCC_ERR_FLG_RCVPORT_ERR_SMASK 0x80000000000000ull +#define DCC_ERR_FLG_RX_BYTE_SHFT_PARITY_ERR_SMASK 0x1000000000000ull +#define DCC_ERR_FLG_RX_CTRL_PARITY_MBE_ERR_SMASK 0x100000000000ull +#define DCC_ERR_FLG_RX_EARLY_DROP_ERR_SMASK 0x200000000ull +#define DCC_ERR_FLG_SLID_ZERO_ERR_SMASK 0x20000000ull +#define DCC_ERR_FLG_TX_BYTE_SHFT_PARITY_ERR_SMASK 0x800000000000ull +#define DCC_ERR_FLG_TX_CTRL_PARITY_ERR_SMASK 0x20000000000ull +#define DCC_ERR_FLG_TX_CTRL_PARITY_MBE_ERR_SMASK 0x40000000000ull +#define DCC_ERR_FLG_TX_SC_PARITY_ERR_SMASK 0x80000000000ull +#define DCC_ERR_FLG_UNCORRECTABLE_ERR_SMASK 0x2000ull +#define DCC_ERR_FLG_UNSUP_PKT_TYPE_SMASK 0x8000ull +#define DCC_ERR_FLG_UNSUP_VL_ERR_SMASK 0x8000000ull +#define DCC_ERR_FLG_VL15_MULTI_ERR_SMASK 0x2000000ull +#define DCC_ERR_FMCONFIG_ERR_CNT (DCC_CSRS + 0x000000000110) +#define DCC_ERR_INFO_FMCONFIG (DCC_CSRS + 0x000000000090) +#define DCC_ERR_INFO_PORTRCV (DCC_CSRS + 0x000000000078) +#define DCC_ERR_INFO_PORTRCV_HDR0 (DCC_CSRS + 0x000000000080) +#define DCC_ERR_INFO_PORTRCV_HDR1 (DCC_CSRS + 0x000000000088) +#define DCC_ERR_INFO_UNCORRECTABLE (DCC_CSRS + 0x000000000098) +#define DCC_ERR_PORTRCV_ERR_CNT (DCC_CSRS + 0x000000000108) +#define DCC_ERR_RCVREMOTE_PHY_ERR_CNT (DCC_CSRS + 0x000000000118) +#define DCC_ERR_UNCORRECTABLE_CNT (DCC_CSRS + 0x000000000100) +#define DCC_PRF_PORT_MARK_FECN_CNT (DCC_CSRS + 0x000000000330) +#define DCC_PRF_PORT_RCV_BECN_CNT (DCC_CSRS + 0x000000000290) +#define DCC_PRF_PORT_RCV_BUBBLE_CNT (DCC_CSRS + 0x0000000002E0) +#define DCC_PRF_PORT_RCV_CORRECTABLE_CNT (DCC_CSRS + 0x000000000140) +#define DCC_PRF_PORT_RCV_DATA_CNT (DCC_CSRS + 0x000000000198) +#define DCC_PRF_PORT_RCV_FECN_CNT (DCC_CSRS + 0x000000000240) +#define DCC_PRF_PORT_RCV_MULTICAST_PKT_CNT (DCC_CSRS + 0x000000000130) +#define DCC_PRF_PORT_RCV_PKTS_CNT (DCC_CSRS + 0x0000000001A8) +#define DCC_PRF_PORT_VL_MARK_FECN_CNT (DCC_CSRS + 0x000000000338) +#define DCC_PRF_PORT_VL_RCV_BECN_CNT (DCC_CSRS + 0x000000000298) +#define DCC_PRF_PORT_VL_RCV_BUBBLE_CNT (DCC_CSRS + 0x0000000002E8) +#define DCC_PRF_PORT_VL_RCV_DATA_CNT (DCC_CSRS + 0x0000000001B0) +#define DCC_PRF_PORT_VL_RCV_FECN_CNT (DCC_CSRS + 0x000000000248) +#define DCC_PRF_PORT_VL_RCV_PKTS_CNT (DCC_CSRS + 0x0000000001F8) +#define DCC_PRF_PORT_XMIT_CORRECTABLE_CNT (DCC_CSRS + 0x000000000138) +#define DCC_PRF_PORT_XMIT_DATA_CNT (DCC_CSRS + 0x000000000190) +#define DCC_PRF_PORT_XMIT_MULTICAST_CNT (DCC_CSRS + 0x000000000128) +#define DCC_PRF_PORT_XMIT_PKTS_CNT (DCC_CSRS + 0x0000000001A0) +#define DCC_PRF_RX_FLOW_CRTL_CNT (DCC_CSRS + 0x000000000180) +#define DCC_PRF_TX_FLOW_CRTL_CNT (DCC_CSRS + 0x000000000188) +#define DC_DC8051_CFG_CSR_ACCESS_SEL (DC_8051_CSRS + 0x000000000110) +#define DC_DC8051_CFG_CSR_ACCESS_SEL_DCC_SMASK 0x2ull +#define DC_DC8051_CFG_CSR_ACCESS_SEL_LCB_SMASK 0x1ull +#define DC_DC8051_CFG_EXT_DEV_0 (DC_8051_CSRS + 0x000000000118) +#define DC_DC8051_CFG_EXT_DEV_0_COMPLETED_SMASK 0x1ull +#define DC_DC8051_CFG_EXT_DEV_0_RETURN_CODE_SHIFT 8 +#define DC_DC8051_CFG_EXT_DEV_0_RSP_DATA_SHIFT 16 +#define DC_DC8051_CFG_EXT_DEV_1 (DC_8051_CSRS + 0x000000000120) +#define DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_MASK 0xFFFFull +#define DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SHIFT 16 +#define DC_DC8051_CFG_EXT_DEV_1_REQ_DATA_SMASK 0xFFFF0000ull +#define DC_DC8051_CFG_EXT_DEV_1_REQ_NEW_SMASK 0x1ull +#define DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_MASK 0xFFull +#define DC_DC8051_CFG_EXT_DEV_1_REQ_TYPE_SHIFT 8 +#define DC_DC8051_CFG_HOST_CMD_0 (DC_8051_CSRS + 0x000000000028) +#define DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_MASK 0xFFFFFFFFFFFFull +#define DC_DC8051_CFG_HOST_CMD_0_REQ_DATA_SHIFT 16 +#define DC_DC8051_CFG_HOST_CMD_0_REQ_NEW_SMASK 0x1ull +#define DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_MASK 0xFFull +#define DC_DC8051_CFG_HOST_CMD_0_REQ_TYPE_SHIFT 8 +#define DC_DC8051_CFG_HOST_CMD_1 (DC_8051_CSRS + 0x000000000030) +#define DC_DC8051_CFG_HOST_CMD_1_COMPLETED_SMASK 0x1ull +#define DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_MASK 0xFFull +#define DC_DC8051_CFG_HOST_CMD_1_RETURN_CODE_SHIFT 8 +#define DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_MASK 0xFFFFFFFFFFFFull +#define DC_DC8051_CFG_HOST_CMD_1_RSP_DATA_SHIFT 16 +#define DC_DC8051_CFG_LOCAL_GUID (DC_8051_CSRS + 0x000000000038) +#define DC_DC8051_CFG_MODE (DC_8051_CSRS + 0x000000000070) +#define DC_DC8051_CFG_RAM_ACCESS_CTRL (DC_8051_CSRS + 0x000000000008) +#define DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_MASK 0x7FFFull +#define DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_SHIFT 0 +#define DC_DC8051_CFG_RAM_ACCESS_CTRL_WRITE_ENA_SMASK 0x1000000ull +#define DC_DC8051_CFG_RAM_ACCESS_CTRL_READ_ENA_SMASK 0x10000ull +#define DC_DC8051_CFG_RAM_ACCESS_SETUP (DC_8051_CSRS + 0x000000000000) +#define DC_DC8051_CFG_RAM_ACCESS_SETUP_AUTO_INCR_ADDR_SMASK 0x100ull +#define DC_DC8051_CFG_RAM_ACCESS_SETUP_RAM_SEL_SMASK 0x1ull +#define DC_DC8051_CFG_RAM_ACCESS_STATUS (DC_8051_CSRS + 0x000000000018) +#define DC_DC8051_CFG_RAM_ACCESS_STATUS_ACCESS_COMPLETED_SMASK 0x10000ull +#define DC_DC8051_CFG_RAM_ACCESS_WR_DATA (DC_8051_CSRS + 0x000000000010) +#define DC_DC8051_CFG_RAM_ACCESS_RD_DATA (DC_8051_CSRS + 0x000000000020) +#define DC_DC8051_CFG_RST (DC_8051_CSRS + 0x000000000068) +#define DC_DC8051_CFG_RST_CRAM_SMASK 0x2ull +#define DC_DC8051_CFG_RST_DRAM_SMASK 0x4ull +#define DC_DC8051_CFG_RST_IRAM_SMASK 0x8ull +#define DC_DC8051_CFG_RST_M8051W_SMASK 0x1ull +#define DC_DC8051_CFG_RST_SFR_SMASK 0x10ull +#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051 (DC_8051_CSRS + 0x0000000000D8) +#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_MASK 0xFFFFFFFFull +#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_ERROR_SHIFT 16 +#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_MASK 0xFFFFull +#define DC_DC8051_DBG_ERR_INFO_SET_BY_8051_HOST_MSG_SHIFT 0 +#define DC_DC8051_ERR_CLR (DC_8051_CSRS + 0x0000000000E8) +#define DC_DC8051_ERR_EN (DC_8051_CSRS + 0x0000000000F0) +#define DC_DC8051_ERR_EN_LOST_8051_HEART_BEAT_SMASK 0x2ull +#define DC_DC8051_ERR_FLG (DC_8051_CSRS + 0x0000000000E0) +#define DC_DC8051_ERR_FLG_CRAM_MBE_SMASK 0x4ull +#define DC_DC8051_ERR_FLG_CRAM_SBE_SMASK 0x8ull +#define DC_DC8051_ERR_FLG_DRAM_MBE_SMASK 0x10ull +#define DC_DC8051_ERR_FLG_DRAM_SBE_SMASK 0x20ull +#define DC_DC8051_ERR_FLG_INVALID_CSR_ADDR_SMASK 0x400ull +#define DC_DC8051_ERR_FLG_IRAM_MBE_SMASK 0x40ull +#define DC_DC8051_ERR_FLG_IRAM_SBE_SMASK 0x80ull +#define DC_DC8051_ERR_FLG_LOST_8051_HEART_BEAT_SMASK 0x2ull +#define DC_DC8051_ERR_FLG_SET_BY_8051_SMASK 0x1ull +#define DC_DC8051_ERR_FLG_UNMATCHED_SECURE_MSG_ACROSS_BCC_LANES_SMASK 0x100ull +#define DC_DC8051_STS_CUR_STATE (DC_8051_CSRS + 0x000000000060) +#define DC_DC8051_STS_CUR_STATE_FIRMWARE_MASK 0xFFull +#define DC_DC8051_STS_CUR_STATE_FIRMWARE_SHIFT 16 +#define DC_DC8051_STS_CUR_STATE_PORT_MASK 0xFFull +#define DC_DC8051_STS_CUR_STATE_PORT_SHIFT 0 +#define DC_DC8051_STS_LOCAL_FM_SECURITY (DC_8051_CSRS + 0x000000000050) +#define DC_DC8051_STS_LOCAL_FM_SECURITY_DISABLED_MASK 0x1ull +#define DC_DC8051_STS_REMOTE_FM_SECURITY (DC_8051_CSRS + 0x000000000058) +#define DC_DC8051_STS_REMOTE_GUID (DC_8051_CSRS + 0x000000000040) +#define DC_DC8051_STS_REMOTE_NODE_TYPE (DC_8051_CSRS + 0x000000000048) +#define DC_DC8051_STS_REMOTE_NODE_TYPE_VAL_MASK 0x3ull +#define DC_DC8051_STS_REMOTE_PORT_NO (DC_8051_CSRS + 0x000000000130) +#define DC_DC8051_STS_REMOTE_PORT_NO_VAL_SMASK 0xFFull +#define DC_LCB_CFG_ALLOW_LINK_UP (DC_LCB_CSRS + 0x000000000128) +#define DC_LCB_CFG_ALLOW_LINK_UP_VAL_SHIFT 0 +#define DC_LCB_CFG_CRC_MODE (DC_LCB_CSRS + 0x000000000058) +#define DC_LCB_CFG_CRC_MODE_TX_VAL_SHIFT 0 +#define DC_LCB_CFG_IGNORE_LOST_RCLK (DC_LCB_CSRS + 0x000000000020) +#define DC_LCB_CFG_IGNORE_LOST_RCLK_EN_SMASK 0x1ull +#define DC_LCB_CFG_LANE_WIDTH (DC_LCB_CSRS + 0x000000000100) +#define DC_LCB_CFG_LINK_KILL_EN (DC_LCB_CSRS + 0x000000000120) +#define DC_LCB_CFG_LINK_KILL_EN_FLIT_INPUT_BUF_MBE_SMASK 0x100000ull +#define DC_LCB_CFG_LINK_KILL_EN_REPLAY_BUF_MBE_SMASK 0x400000ull +#define DC_LCB_CFG_LN_DCLK (DC_LCB_CSRS + 0x000000000060) +#define DC_LCB_CFG_LOOPBACK (DC_LCB_CSRS + 0x0000000000F8) +#define DC_LCB_CFG_LOOPBACK_VAL_SHIFT 0 +#define DC_LCB_CFG_RUN (DC_LCB_CSRS + 0x000000000000) +#define DC_LCB_CFG_RUN_EN_SHIFT 0 +#define DC_LCB_CFG_RX_FIFOS_RADR (DC_LCB_CSRS + 0x000000000018) +#define DC_LCB_CFG_RX_FIFOS_RADR_DO_NOT_JUMP_VAL_SHIFT 8 +#define DC_LCB_CFG_RX_FIFOS_RADR_OK_TO_JUMP_VAL_SHIFT 4 +#define DC_LCB_CFG_RX_FIFOS_RADR_RST_VAL_SHIFT 0 +#define DC_LCB_CFG_TX_FIFOS_RADR (DC_LCB_CSRS + 0x000000000010) +#define DC_LCB_CFG_TX_FIFOS_RADR_RST_VAL_SHIFT 0 +#define DC_LCB_CFG_TX_FIFOS_RESET (DC_LCB_CSRS + 0x000000000008) +#define DC_LCB_CFG_TX_FIFOS_RESET_VAL_SHIFT 0 +#define DC_LCB_CFG_REINIT_AS_SLAVE (DC_LCB_CSRS + 0x000000000030) +#define DC_LCB_CFG_CNT_FOR_SKIP_STALL (DC_LCB_CSRS + 0x000000000040) +#define DC_LCB_CFG_CLK_CNTR (DC_LCB_CSRS + 0x000000000110) +#define DC_LCB_ERR_CLR (DC_LCB_CSRS + 0x000000000308) +#define DC_LCB_ERR_EN (DC_LCB_CSRS + 0x000000000310) +#define DC_LCB_ERR_FLG (DC_LCB_CSRS + 0x000000000300) +#define DC_LCB_ERR_FLG_REDUNDANT_FLIT_PARITY_ERR_SMASK 0x20000000ull +#define DC_LCB_ERR_FLG_NEG_EDGE_LINK_TRANSFER_ACTIVE_SMASK 0x10000000ull +#define DC_LCB_ERR_FLG_HOLD_REINIT_SMASK 0x8000000ull +#define DC_LCB_ERR_FLG_RST_FOR_INCOMPLT_RND_TRIP_SMASK 0x4000000ull +#define DC_LCB_ERR_FLG_RST_FOR_LINK_TIMEOUT_SMASK 0x2000000ull +#define DC_LCB_ERR_FLG_CREDIT_RETURN_FLIT_MBE_SMASK 0x1000000ull +#define DC_LCB_ERR_FLG_REPLAY_BUF_SBE_SMASK 0x800000ull +#define DC_LCB_ERR_FLG_REPLAY_BUF_MBE_SMASK 0x400000ull +#define DC_LCB_ERR_FLG_FLIT_INPUT_BUF_SBE_SMASK 0x200000ull +#define DC_LCB_ERR_FLG_FLIT_INPUT_BUF_MBE_SMASK 0x100000ull +#define DC_LCB_ERR_FLG_VL_ACK_INPUT_WRONG_CRC_MODE_SMASK 0x80000ull +#define DC_LCB_ERR_FLG_VL_ACK_INPUT_PARITY_ERR_SMASK 0x40000ull +#define DC_LCB_ERR_FLG_VL_ACK_INPUT_BUF_OFLW_SMASK 0x20000ull +#define DC_LCB_ERR_FLG_FLIT_INPUT_BUF_OFLW_SMASK 0x10000ull +#define DC_LCB_ERR_FLG_ILLEGAL_FLIT_ENCODING_SMASK 0x8000ull +#define DC_LCB_ERR_FLG_ILLEGAL_NULL_LTP_SMASK 0x4000ull +#define DC_LCB_ERR_FLG_UNEXPECTED_ROUND_TRIP_MARKER_SMASK 0x2000ull +#define DC_LCB_ERR_FLG_UNEXPECTED_REPLAY_MARKER_SMASK 0x1000ull +#define DC_LCB_ERR_FLG_RCLK_STOPPED_SMASK 0x800ull +#define DC_LCB_ERR_FLG_CRC_ERR_CNT_HIT_LIMIT_SMASK 0x400ull +#define DC_LCB_ERR_FLG_REINIT_FOR_LN_DEGRADE_SMASK 0x200ull +#define DC_LCB_ERR_FLG_REINIT_FROM_PEER_SMASK 0x100ull +#define DC_LCB_ERR_FLG_SEQ_CRC_ERR_SMASK 0x80ull +#define DC_LCB_ERR_FLG_RX_LESS_THAN_FOUR_LNS_SMASK 0x40ull +#define DC_LCB_ERR_FLG_TX_LESS_THAN_FOUR_LNS_SMASK 0x20ull +#define DC_LCB_ERR_FLG_LOST_REINIT_STALL_OR_TOS_SMASK 0x10ull +#define DC_LCB_ERR_FLG_ALL_LNS_FAILED_REINIT_TEST_SMASK 0x8ull +#define DC_LCB_ERR_FLG_RST_FOR_FAILED_DESKEW_SMASK 0x4ull +#define DC_LCB_ERR_FLG_INVALID_CSR_ADDR_SMASK 0x2ull +#define DC_LCB_ERR_FLG_CSR_PARITY_ERR_SMASK 0x1ull +#define DC_LCB_ERR_INFO_CRC_ERR_LN0 (DC_LCB_CSRS + 0x000000000328) +#define DC_LCB_ERR_INFO_CRC_ERR_LN1 (DC_LCB_CSRS + 0x000000000330) +#define DC_LCB_ERR_INFO_CRC_ERR_LN2 (DC_LCB_CSRS + 0x000000000338) +#define DC_LCB_ERR_INFO_CRC_ERR_LN3 (DC_LCB_CSRS + 0x000000000340) +#define DC_LCB_ERR_INFO_CRC_ERR_MULTI_LN (DC_LCB_CSRS + 0x000000000348) +#define DC_LCB_ERR_INFO_ESCAPE_0_ONLY_CNT (DC_LCB_CSRS + 0x000000000368) +#define DC_LCB_ERR_INFO_ESCAPE_0_PLUS1_CNT (DC_LCB_CSRS + 0x000000000370) +#define DC_LCB_ERR_INFO_ESCAPE_0_PLUS2_CNT (DC_LCB_CSRS + 0x000000000378) +#define DC_LCB_ERR_INFO_MISC_FLG_CNT (DC_LCB_CSRS + 0x000000000390) +#define DC_LCB_ERR_INFO_REINIT_FROM_PEER_CNT (DC_LCB_CSRS + 0x000000000380) +#define DC_LCB_ERR_INFO_RX_REPLAY_CNT (DC_LCB_CSRS + 0x000000000358) +#define DC_LCB_ERR_INFO_SBE_CNT (DC_LCB_CSRS + 0x000000000388) +#define DC_LCB_ERR_INFO_SEQ_CRC_CNT (DC_LCB_CSRS + 0x000000000360) +#define DC_LCB_ERR_INFO_TOTAL_CRC_ERR (DC_LCB_CSRS + 0x000000000320) +#define DC_LCB_ERR_INFO_TX_REPLAY_CNT (DC_LCB_CSRS + 0x000000000350) +#define DC_LCB_PG_DBG_FLIT_CRDTS_CNT (DC_LCB_CSRS + 0x000000000580) +#define DC_LCB_PG_STS_PAUSE_COMPLETE_CNT (DC_LCB_CSRS + 0x0000000005F8) +#define DC_LCB_PG_STS_TX_MBE_CNT (DC_LCB_CSRS + 0x000000000608) +#define DC_LCB_PG_STS_TX_SBE_CNT (DC_LCB_CSRS + 0x000000000600) +#define DC_LCB_PRF_ACCEPTED_LTP_CNT (DC_LCB_CSRS + 0x000000000408) +#define DC_LCB_PRF_CLK_CNTR (DC_LCB_CSRS + 0x000000000420) +#define DC_LCB_PRF_GOOD_LTP_CNT (DC_LCB_CSRS + 0x000000000400) +#define DC_LCB_PRF_RX_FLIT_CNT (DC_LCB_CSRS + 0x000000000410) +#define DC_LCB_PRF_TX_FLIT_CNT (DC_LCB_CSRS + 0x000000000418) +#define DC_LCB_STS_LINK_TRANSFER_ACTIVE (DC_LCB_CSRS + 0x000000000468) +#define DC_LCB_STS_ROUND_TRIP_LTP_CNT (DC_LCB_CSRS + 0x0000000004B0) +#define RCV_BUF_OVFL_CNT 10 +#define RCV_CONTEXT_EGR_STALL 22 +#define RCV_DATA_PKT_CNT 0 +#define RCV_DWORD_CNT 1 +#define RCV_TID_FLOW_GEN_MISMATCH_CNT 20 +#define RCV_TID_FLOW_SEQ_MISMATCH_CNT 23 +#define RCV_TID_FULL_ERR_CNT 18 +#define RCV_TID_VALID_ERR_CNT 19 +#define RXE_NUM_32_BIT_COUNTERS 24 +#define RXE_NUM_64_BIT_COUNTERS 2 +#define RXE_NUM_RSM_INSTANCES 4 +#define RXE_NUM_TID_FLOWS 32 +#define RXE_PER_CONTEXT_OFFSET 0x0300000 +#define SEND_DATA_PKT_CNT 0 +#define SEND_DATA_PKT_VL0_CNT 12 +#define SEND_DATA_VL0_CNT 3 +#define SEND_DROPPED_PKT_CNT 5 +#define SEND_DWORD_CNT 1 +#define SEND_FLOW_STALL_CNT 4 +#define SEND_HEADERS_ERR_CNT 6 +#define SEND_LEN_ERR_CNT 1 +#define SEND_MAX_MIN_LEN_ERR_CNT 2 +#define SEND_UNDERRUN_CNT 3 +#define SEND_UNSUP_VL_ERR_CNT 0 +#define SEND_WAIT_CNT 2 +#define SEND_WAIT_VL0_CNT 21 +#define TXE_PIO_SEND_OFFSET 0x0800000 +#define ASIC_CFG_DRV_STR (ASIC + 0x000000000048) +#define ASIC_CFG_MUTEX (ASIC + 0x000000000040) +#define ASIC_CFG_SBUS_EXECUTE (ASIC + 0x000000000008) +#define ASIC_CFG_SBUS_EXECUTE_EXECUTE_SMASK 0x1ull +#define ASIC_CFG_SBUS_EXECUTE_FAST_MODE_SMASK 0x2ull +#define ASIC_CFG_SBUS_REQUEST (ASIC + 0x000000000000) +#define ASIC_CFG_SBUS_REQUEST_COMMAND_SHIFT 16 +#define ASIC_CFG_SBUS_REQUEST_DATA_ADDR_SHIFT 8 +#define ASIC_CFG_SBUS_REQUEST_DATA_IN_SHIFT 32 +#define ASIC_CFG_SBUS_REQUEST_RECEIVER_ADDR_SHIFT 0 +#define ASIC_CFG_SCRATCH (ASIC + 0x000000000020) +#define ASIC_CFG_THERM_POLL_EN (ASIC + 0x000000000050) +#define ASIC_EEP_ADDR_CMD (ASIC + 0x000000000308) +#define ASIC_EEP_ADDR_CMD_EP_ADDR_MASK 0xFFFFFFull +#define ASIC_EEP_CTL_STAT (ASIC + 0x000000000300) +#define ASIC_EEP_CTL_STAT_EP_RESET_SMASK 0x4ull +#define ASIC_EEP_CTL_STAT_RATE_SPI_SHIFT 8 +#define ASIC_EEP_CTL_STAT_RESETCSR 0x0000000083818000ull +#define ASIC_EEP_DATA (ASIC + 0x000000000310) +#define ASIC_GPIO_CLEAR (ASIC + 0x000000000230) +#define ASIC_GPIO_FORCE (ASIC + 0x000000000238) +#define ASIC_GPIO_IN (ASIC + 0x000000000200) +#define ASIC_GPIO_INVERT (ASIC + 0x000000000210) +#define ASIC_GPIO_MASK (ASIC + 0x000000000220) +#define ASIC_GPIO_OE (ASIC + 0x000000000208) +#define ASIC_GPIO_OUT (ASIC + 0x000000000218) +#define ASIC_PCIE_SD_HOST_CMD (ASIC + 0x000000000100) +#define ASIC_PCIE_SD_HOST_CMD_INTRPT_CMD_SHIFT 0 +#define ASIC_PCIE_SD_HOST_CMD_SBR_MODE_SMASK 0x400ull +#define ASIC_PCIE_SD_HOST_CMD_SBUS_RCVR_ADDR_SHIFT 2 +#define ASIC_PCIE_SD_HOST_CMD_TIMER_MASK 0xFFFFFull +#define ASIC_PCIE_SD_HOST_CMD_TIMER_SHIFT 12 +#define ASIC_PCIE_SD_HOST_STATUS (ASIC + 0x000000000108) +#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_MASK 0x7ull +#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_SHIFT 2 +#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_MASK 0x3ull +#define ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_SHIFT 0 +#define ASIC_PCIE_SD_INTRPT_DATA_CODE (ASIC + 0x000000000110) +#define ASIC_PCIE_SD_INTRPT_ENABLE (ASIC + 0x000000000118) +#define ASIC_PCIE_SD_INTRPT_LIST (ASIC + 0x000000000180) +#define ASIC_PCIE_SD_INTRPT_LIST_INTRPT_CODE_SHIFT 16 +#define ASIC_PCIE_SD_INTRPT_LIST_INTRPT_DATA_SHIFT 0 +#define ASIC_PCIE_SD_INTRPT_STATUS (ASIC + 0x000000000128) +#define ASIC_QSFP1_CLEAR (ASIC + 0x000000000270) +#define ASIC_QSFP1_FORCE (ASIC + 0x000000000278) +#define ASIC_QSFP1_IN (ASIC + 0x000000000240) +#define ASIC_QSFP1_INVERT (ASIC + 0x000000000250) +#define ASIC_QSFP1_MASK (ASIC + 0x000000000260) +#define ASIC_QSFP1_OE (ASIC + 0x000000000248) +#define ASIC_QSFP1_OUT (ASIC + 0x000000000258) +#define ASIC_QSFP1_STATUS (ASIC + 0x000000000268) +#define ASIC_QSFP2_CLEAR (ASIC + 0x0000000002B0) +#define ASIC_QSFP2_FORCE (ASIC + 0x0000000002B8) +#define ASIC_QSFP2_IN (ASIC + 0x000000000280) +#define ASIC_QSFP2_INVERT (ASIC + 0x000000000290) +#define ASIC_QSFP2_MASK (ASIC + 0x0000000002A0) +#define ASIC_QSFP2_OE (ASIC + 0x000000000288) +#define ASIC_QSFP2_OUT (ASIC + 0x000000000298) +#define ASIC_QSFP2_STATUS (ASIC + 0x0000000002A8) +#define ASIC_STS_SBUS_COUNTERS (ASIC + 0x000000000018) +#define ASIC_STS_SBUS_COUNTERS_EXECUTE_CNT_MASK 0xFFFFull +#define ASIC_STS_SBUS_COUNTERS_EXECUTE_CNT_SHIFT 0 +#define ASIC_STS_SBUS_COUNTERS_RCV_DATA_VALID_CNT_MASK 0xFFFFull +#define ASIC_STS_SBUS_COUNTERS_RCV_DATA_VALID_CNT_SHIFT 16 +#define ASIC_STS_SBUS_RESULT (ASIC + 0x000000000010) +#define ASIC_STS_SBUS_RESULT_DONE_SMASK 0x1ull +#define ASIC_STS_SBUS_RESULT_RCV_DATA_VALID_SMASK 0x2ull +#define ASIC_STS_THERM (ASIC + 0x000000000058) +#define ASIC_STS_THERM_CRIT_TEMP_MASK 0x7FFull +#define ASIC_STS_THERM_CRIT_TEMP_SHIFT 18 +#define ASIC_STS_THERM_CURR_TEMP_MASK 0x7FFull +#define ASIC_STS_THERM_CURR_TEMP_SHIFT 2 +#define ASIC_STS_THERM_HI_TEMP_MASK 0x7FFull +#define ASIC_STS_THERM_HI_TEMP_SHIFT 50 +#define ASIC_STS_THERM_LO_TEMP_MASK 0x7FFull +#define ASIC_STS_THERM_LO_TEMP_SHIFT 34 +#define ASIC_STS_THERM_LOW_SHIFT 13 +#define CCE_COUNTER_ARRAY32 (CCE + 0x000000000060) +#define CCE_CTRL (CCE + 0x000000000010) +#define CCE_CTRL_RXE_RESUME_SMASK 0x800ull +#define CCE_CTRL_SPC_FREEZE_SMASK 0x100ull +#define CCE_CTRL_SPC_UNFREEZE_SMASK 0x200ull +#define CCE_CTRL_TXE_RESUME_SMASK 0x2000ull +#define CCE_DC_CTRL (CCE + 0x0000000000B8) +#define CCE_DC_CTRL_DC_RESET_SMASK 0x1ull +#define CCE_DC_CTRL_RESETCSR 0x0000000000000001ull +#define CCE_ERR_CLEAR (CCE + 0x000000000050) +#define CCE_ERR_MASK (CCE + 0x000000000048) +#define CCE_ERR_STATUS (CCE + 0x000000000040) +#define CCE_ERR_STATUS_CCE_CLI0_ASYNC_FIFO_PARITY_ERR_SMASK 0x40ull +#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_DBG_PARITY_ERROR_SMASK 0x1000ull +#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_PIO_CRDT_PARITY_ERR_SMASK \ + 0x200ull +#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_RXDMA_PARITY_ERROR_SMASK \ + 0x800ull +#define CCE_ERR_STATUS_CCE_CLI1_ASYNC_FIFO_SDMA_HD_PARITY_ERR_SMASK \ + 0x400ull +#define CCE_ERR_STATUS_CCE_CLI2_ASYNC_FIFO_PARITY_ERR_SMASK 0x100ull +#define CCE_ERR_STATUS_CCE_CSR_CFG_BUS_PARITY_ERR_SMASK 0x80ull +#define CCE_ERR_STATUS_CCE_CSR_PARITY_ERR_SMASK 0x1ull +#define CCE_ERR_STATUS_CCE_CSR_READ_BAD_ADDR_ERR_SMASK 0x2ull +#define CCE_ERR_STATUS_CCE_CSR_WRITE_BAD_ADDR_ERR_SMASK 0x4ull +#define CCE_ERR_STATUS_CCE_INT_MAP_COR_ERR_SMASK 0x4000000000ull +#define CCE_ERR_STATUS_CCE_INT_MAP_UNC_ERR_SMASK 0x8000000000ull +#define CCE_ERR_STATUS_CCE_MSIX_CSR_PARITY_ERR_SMASK 0x10000000000ull +#define CCE_ERR_STATUS_CCE_MSIX_TABLE_COR_ERR_SMASK 0x1000000000ull +#define CCE_ERR_STATUS_CCE_MSIX_TABLE_UNC_ERR_SMASK 0x2000000000ull +#define CCE_ERR_STATUS_CCE_RCPL_ASYNC_FIFO_PARITY_ERR_SMASK 0x400000000ull +#define CCE_ERR_STATUS_CCE_RSPD_DATA_PARITY_ERR_SMASK 0x20ull +#define CCE_ERR_STATUS_CCE_RXDMA_CONV_FIFO_PARITY_ERR_SMASK 0x800000000ull +#define CCE_ERR_STATUS_CCE_SEG_READ_BAD_ADDR_ERR_SMASK 0x100000000ull +#define CCE_ERR_STATUS_CCE_SEG_WRITE_BAD_ADDR_ERR_SMASK 0x200000000ull +#define CCE_ERR_STATUS_CCE_TRGT_ACCESS_ERR_SMASK 0x10ull +#define CCE_ERR_STATUS_CCE_TRGT_ASYNC_FIFO_PARITY_ERR_SMASK 0x8ull +#define CCE_ERR_STATUS_CCE_TRGT_CPL_TIMEOUT_ERR_SMASK 0x40000000ull +#define CCE_ERR_STATUS_LA_TRIGGERED_SMASK 0x80000000ull +#define CCE_ERR_STATUS_PCIC_CPL_DAT_QCOR_ERR_SMASK 0x40000ull +#define CCE_ERR_STATUS_PCIC_CPL_DAT_QUNC_ERR_SMASK 0x4000000ull +#define CCE_ERR_STATUS_PCIC_CPL_HD_QCOR_ERR_SMASK 0x20000ull +#define CCE_ERR_STATUS_PCIC_CPL_HD_QUNC_ERR_SMASK 0x2000000ull +#define CCE_ERR_STATUS_PCIC_NPOST_DAT_QPARITY_ERR_SMASK 0x100000ull +#define CCE_ERR_STATUS_PCIC_NPOST_HQ_PARITY_ERR_SMASK 0x80000ull +#define CCE_ERR_STATUS_PCIC_POST_DAT_QCOR_ERR_SMASK 0x10000ull +#define CCE_ERR_STATUS_PCIC_POST_DAT_QUNC_ERR_SMASK 0x1000000ull +#define CCE_ERR_STATUS_PCIC_POST_HD_QCOR_ERR_SMASK 0x8000ull +#define CCE_ERR_STATUS_PCIC_POST_HD_QUNC_ERR_SMASK 0x800000ull +#define CCE_ERR_STATUS_PCIC_RECEIVE_PARITY_ERR_SMASK 0x20000000ull +#define CCE_ERR_STATUS_PCIC_RETRY_MEM_COR_ERR_SMASK 0x2000ull +#define CCE_ERR_STATUS_PCIC_RETRY_MEM_UNC_ERR_SMASK 0x200000ull +#define CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_COR_ERR_SMASK 0x4000ull +#define CCE_ERR_STATUS_PCIC_RETRY_SOT_MEM_UNC_ERR_SMASK 0x400000ull +#define CCE_ERR_STATUS_PCIC_TRANSMIT_BACK_PARITY_ERR_SMASK 0x10000000ull +#define CCE_ERR_STATUS_PCIC_TRANSMIT_FRONT_PARITY_ERR_SMASK 0x8000000ull +#define CCE_INT_CLEAR (CCE + 0x000000110A00) +#define CCE_INT_COUNTER_ARRAY32 (CCE + 0x000000110D00) +#define CCE_INT_FORCE (CCE + 0x000000110B00) +#define CCE_INT_MAP (CCE + 0x000000110500) +#define CCE_INT_MASK (CCE + 0x000000110900) +#define CCE_INT_STATUS (CCE + 0x000000110800) +#define CCE_MSIX_INT_GRANTED (CCE + 0x000000110200) +#define CCE_MSIX_TABLE_LOWER (CCE + 0x000000100000) +#define CCE_MSIX_TABLE_UPPER (CCE + 0x000000100008) +#define CCE_MSIX_TABLE_UPPER_RESETCSR 0x0000000100000000ull +#define CCE_MSIX_VEC_CLR_WITHOUT_INT (CCE + 0x000000110400) +#define CCE_PCIE_CTRL (CCE + 0x0000000000C0) +#define CCE_PCIE_CTRL_PCIE_LANE_BUNDLE_MASK 0x3ull +#define CCE_PCIE_CTRL_PCIE_LANE_BUNDLE_SHIFT 0 +#define CCE_PCIE_CTRL_PCIE_LANE_DELAY_MASK 0xFull +#define CCE_PCIE_CTRL_PCIE_LANE_DELAY_SHIFT 2 +#define CCE_PCIE_CTRL_XMT_MARGIN_OVERWRITE_ENABLE_SHIFT 8 +#define CCE_PCIE_CTRL_XMT_MARGIN_SHIFT 9 +#define CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_OVERWRITE_ENABLE_MASK 0x1ull +#define CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_OVERWRITE_ENABLE_SHIFT 12 +#define CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_MASK 0x7ull +#define CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_SHIFT 13 +#define CCE_REVISION (CCE + 0x000000000000) +#define CCE_REVISION2 (CCE + 0x000000000008) +#define CCE_REVISION2_HFI_ID_MASK 0x1ull +#define CCE_REVISION2_HFI_ID_SHIFT 0 +#define CCE_REVISION2_IMPL_CODE_SHIFT 8 +#define CCE_REVISION2_IMPL_REVISION_SHIFT 16 +#define CCE_REVISION_BOARD_ID_LOWER_NIBBLE_MASK 0xFull +#define CCE_REVISION_BOARD_ID_LOWER_NIBBLE_SHIFT 32 +#define CCE_REVISION_CHIP_REV_MAJOR_MASK 0xFFull +#define CCE_REVISION_CHIP_REV_MAJOR_SHIFT 8 +#define CCE_REVISION_CHIP_REV_MINOR_MASK 0xFFull +#define CCE_REVISION_CHIP_REV_MINOR_SHIFT 0 +#define CCE_REVISION_SW_MASK 0xFFull +#define CCE_REVISION_SW_SHIFT 24 +#define CCE_SCRATCH (CCE + 0x000000000020) +#define CCE_STATUS (CCE + 0x000000000018) +#define CCE_STATUS_RXE_FROZE_SMASK 0x2ull +#define CCE_STATUS_RXE_PAUSED_SMASK 0x20ull +#define CCE_STATUS_SDMA_FROZE_SMASK 0x1ull +#define CCE_STATUS_SDMA_PAUSED_SMASK 0x10ull +#define CCE_STATUS_TXE_FROZE_SMASK 0x4ull +#define CCE_STATUS_TXE_PAUSED_SMASK 0x40ull +#define CCE_STATUS_TXE_PIO_FROZE_SMASK 0x8ull +#define CCE_STATUS_TXE_PIO_PAUSED_SMASK 0x80ull +#define MISC_CFG_FW_CTRL (MISC + 0x000000001000) +#define MISC_CFG_FW_CTRL_FW_8051_LOADED_SMASK 0x2ull +#define MISC_CFG_FW_CTRL_RSA_STATUS_SHIFT 2 +#define MISC_CFG_FW_CTRL_RSA_STATUS_SMASK 0xCull +#define MISC_CFG_RSA_CMD (MISC + 0x000000000A08) +#define MISC_CFG_RSA_MODULUS (MISC + 0x000000000400) +#define MISC_CFG_RSA_MU (MISC + 0x000000000A10) +#define MISC_CFG_RSA_R2 (MISC + 0x000000000000) +#define MISC_CFG_RSA_SIGNATURE (MISC + 0x000000000200) +#define MISC_CFG_SHA_PRELOAD (MISC + 0x000000000A00) +#define MISC_ERR_CLEAR (MISC + 0x000000002010) +#define MISC_ERR_MASK (MISC + 0x000000002008) +#define MISC_ERR_STATUS (MISC + 0x000000002000) +#define MISC_ERR_STATUS_MISC_PLL_LOCK_FAIL_ERR_SMASK 0x1000ull +#define MISC_ERR_STATUS_MISC_MBIST_FAIL_ERR_SMASK 0x800ull +#define MISC_ERR_STATUS_MISC_INVALID_EEP_CMD_ERR_SMASK 0x400ull +#define MISC_ERR_STATUS_MISC_EFUSE_DONE_PARITY_ERR_SMASK 0x200ull +#define MISC_ERR_STATUS_MISC_EFUSE_WRITE_ERR_SMASK 0x100ull +#define MISC_ERR_STATUS_MISC_EFUSE_READ_BAD_ADDR_ERR_SMASK 0x80ull +#define MISC_ERR_STATUS_MISC_EFUSE_CSR_PARITY_ERR_SMASK 0x40ull +#define MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK 0x20ull +#define MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK 0x10ull +#define MISC_ERR_STATUS_MISC_SBUS_WRITE_FAILED_ERR_SMASK 0x8ull +#define MISC_ERR_STATUS_MISC_CSR_WRITE_BAD_ADDR_ERR_SMASK 0x4ull +#define MISC_ERR_STATUS_MISC_CSR_READ_BAD_ADDR_ERR_SMASK 0x2ull +#define MISC_ERR_STATUS_MISC_CSR_PARITY_ERR_SMASK 0x1ull +#define PCI_CFG_MSIX0 (PCIE + 0x0000000000B0) +#define PCI_CFG_REG1 (PCIE + 0x000000000004) +#define PCI_CFG_REG11 (PCIE + 0x00000000002C) +#define PCIE_CFG_SPCIE1 (PCIE + 0x00000000014C) +#define PCIE_CFG_SPCIE2 (PCIE + 0x000000000150) +#define PCIE_CFG_TPH2 (PCIE + 0x000000000180) +#define RCV_ARRAY (RXE + 0x000000200000) +#define RCV_ARRAY_CNT (RXE + 0x000000000018) +#define RCV_ARRAY_RT_ADDR_MASK 0xFFFFFFFFFull +#define RCV_ARRAY_RT_ADDR_SHIFT 0 +#define RCV_ARRAY_RT_BUF_SIZE_SHIFT 36 +#define RCV_ARRAY_RT_WRITE_ENABLE_SMASK 0x8000000000000000ull +#define RCV_AVAIL_TIME_OUT (RXE + 0x000000100050) +#define RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_MASK 0xFFull +#define RCV_AVAIL_TIME_OUT_TIME_OUT_RELOAD_SHIFT 0 +#define RCV_BTH_QP (RXE + 0x000000000028) +#define RCV_BTH_QP_KDETH_QP_MASK 0xFFull +#define RCV_BTH_QP_KDETH_QP_SHIFT 16 +#define RCV_BYPASS (RXE + 0x000000000038) +#define RCV_CONTEXTS (RXE + 0x000000000010) +#define RCV_COUNTER_ARRAY32 (RXE + 0x000000000400) +#define RCV_COUNTER_ARRAY64 (RXE + 0x000000000500) +#define RCV_CTRL (RXE + 0x000000000000) +#define RCV_CTRL_RCV_BYPASS_ENABLE_SMASK 0x10ull +#define RCV_CTRL_RCV_EXTENDED_PSN_ENABLE_SMASK 0x40ull +#define RCV_CTRL_RCV_PARTITION_KEY_ENABLE_SMASK 0x4ull +#define RCV_CTRL_RCV_PORT_ENABLE_SMASK 0x1ull +#define RCV_CTRL_RCV_QP_MAP_ENABLE_SMASK 0x2ull +#define RCV_CTRL_RCV_RSM_ENABLE_SMASK 0x20ull +#define RCV_CTRL_RX_RBUF_INIT_SMASK 0x200ull +#define RCV_CTXT_CTRL (RXE + 0x000000100000) +#define RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK 0x4ull +#define RCV_CTXT_CTRL_DONT_DROP_RHQ_FULL_SMASK 0x8ull +#define RCV_CTXT_CTRL_EGR_BUF_SIZE_MASK 0x7ull +#define RCV_CTXT_CTRL_EGR_BUF_SIZE_SHIFT 8 +#define RCV_CTXT_CTRL_EGR_BUF_SIZE_SMASK 0x700ull +#define RCV_CTXT_CTRL_ENABLE_SMASK 0x1ull +#define RCV_CTXT_CTRL_INTR_AVAIL_SMASK 0x20ull +#define RCV_CTXT_CTRL_ONE_PACKET_PER_EGR_BUFFER_SMASK 0x2ull +#define RCV_CTXT_CTRL_TAIL_UPD_SMASK 0x40ull +#define RCV_CTXT_CTRL_TID_FLOW_ENABLE_SMASK 0x10ull +#define RCV_CTXT_STATUS (RXE + 0x000000100008) +#define RCV_EGR_CTRL (RXE + 0x000000100010) +#define RCV_EGR_CTRL_EGR_BASE_INDEX_MASK 0x1FFFull +#define RCV_EGR_CTRL_EGR_BASE_INDEX_SHIFT 0 +#define RCV_EGR_CTRL_EGR_CNT_MASK 0x1FFull +#define RCV_EGR_CTRL_EGR_CNT_SHIFT 32 +#define RCV_EGR_INDEX_HEAD (RXE + 0x000000300018) +#define RCV_EGR_INDEX_HEAD_HEAD_MASK 0x7FFull +#define RCV_EGR_INDEX_HEAD_HEAD_SHIFT 0 +#define RCV_ERR_CLEAR (RXE + 0x000000000070) +#define RCV_ERR_INFO (RXE + 0x000000000050) +#define RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SC_SMASK 0x1Full +#define RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK 0x20ull +#define RCV_ERR_MASK (RXE + 0x000000000068) +#define RCV_ERR_STATUS (RXE + 0x000000000060) +#define RCV_ERR_STATUS_RX_CSR_PARITY_ERR_SMASK 0x8000000000000000ull +#define RCV_ERR_STATUS_RX_CSR_READ_BAD_ADDR_ERR_SMASK 0x2000000000000000ull +#define RCV_ERR_STATUS_RX_CSR_WRITE_BAD_ADDR_ERR_SMASK \ + 0x4000000000000000ull +#define RCV_ERR_STATUS_RX_DC_INTF_PARITY_ERR_SMASK 0x2ull +#define RCV_ERR_STATUS_RX_DC_SOP_EOP_PARITY_ERR_SMASK 0x200ull +#define RCV_ERR_STATUS_RX_DMA_CSR_COR_ERR_SMASK 0x1ull +#define RCV_ERR_STATUS_RX_DMA_CSR_PARITY_ERR_SMASK 0x200000000000000ull +#define RCV_ERR_STATUS_RX_DMA_CSR_UNC_ERR_SMASK 0x1000000000000000ull +#define RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_COR_ERR_SMASK \ + 0x40000000000000ull +#define RCV_ERR_STATUS_RX_DMA_DATA_FIFO_RD_UNC_ERR_SMASK \ + 0x20000000000000ull +#define RCV_ERR_STATUS_RX_DMA_DQ_FSM_ENCODING_ERR_SMASK \ + 0x800000000000000ull +#define RCV_ERR_STATUS_RX_DMA_EQ_FSM_ENCODING_ERR_SMASK \ + 0x400000000000000ull +#define RCV_ERR_STATUS_RX_DMA_FLAG_COR_ERR_SMASK 0x800ull +#define RCV_ERR_STATUS_RX_DMA_FLAG_UNC_ERR_SMASK 0x400ull +#define RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_COR_ERR_SMASK 0x10000000000000ull +#define RCV_ERR_STATUS_RX_DMA_HDR_FIFO_RD_UNC_ERR_SMASK 0x8000000000000ull +#define RCV_ERR_STATUS_RX_HQ_INTR_CSR_PARITY_ERR_SMASK 0x200000000000ull +#define RCV_ERR_STATUS_RX_HQ_INTR_FSM_ERR_SMASK 0x400000000000ull +#define RCV_ERR_STATUS_RX_LOOKUP_CSR_PARITY_ERR_SMASK 0x100000000000ull +#define RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_COR_ERR_SMASK \ + 0x10000000000ull +#define RCV_ERR_STATUS_RX_LOOKUP_DES_PART1_UNC_ERR_SMASK 0x8000000000ull +#define RCV_ERR_STATUS_RX_LOOKUP_DES_PART2_PARITY_ERR_SMASK \ + 0x20000000000ull +#define RCV_ERR_STATUS_RX_LOOKUP_RCV_ARRAY_COR_ERR_SMASK 0x80000000000ull +#define RCV_ERR_STATUS_RX_LOOKUP_RCV_ARRAY_UNC_ERR_SMASK 0x40000000000ull +#define RCV_ERR_STATUS_RX_RBUF_BAD_LOOKUP_ERR_SMASK 0x40000000ull +#define RCV_ERR_STATUS_RX_RBUF_BLOCK_LIST_READ_COR_ERR_SMASK 0x100000ull +#define RCV_ERR_STATUS_RX_RBUF_BLOCK_LIST_READ_UNC_ERR_SMASK 0x80000ull +#define RCV_ERR_STATUS_RX_RBUF_CSR_QENT_CNT_PARITY_ERR_SMASK 0x400000ull +#define RCV_ERR_STATUS_RX_RBUF_CSR_QEOPDW_PARITY_ERR_SMASK 0x10000000ull +#define RCV_ERR_STATUS_RX_RBUF_CSR_QHD_PTR_PARITY_ERR_SMASK 0x2000000ull +#define RCV_ERR_STATUS_RX_RBUF_CSR_QHEAD_BUF_NUM_PARITY_ERR_SMASK \ + 0x200000ull +#define RCV_ERR_STATUS_RX_RBUF_CSR_QNEXT_BUF_PARITY_ERR_SMASK 0x800000ull +#define RCV_ERR_STATUS_RX_RBUF_CSR_QNUM_OF_PKT_PARITY_ERR_SMASK \ + 0x8000000ull +#define RCV_ERR_STATUS_RX_RBUF_CSR_QTL_PTR_PARITY_ERR_SMASK 0x4000000ull +#define RCV_ERR_STATUS_RX_RBUF_CSR_QVLD_BIT_PARITY_ERR_SMASK 0x1000000ull +#define RCV_ERR_STATUS_RX_RBUF_CTX_ID_PARITY_ERR_SMASK 0x20000000ull +#define RCV_ERR_STATUS_RX_RBUF_DATA_COR_ERR_SMASK 0x100000000000000ull +#define RCV_ERR_STATUS_RX_RBUF_DATA_UNC_ERR_SMASK 0x80000000000000ull +#define RCV_ERR_STATUS_RX_RBUF_DESC_PART1_COR_ERR_SMASK 0x1000000000000ull +#define RCV_ERR_STATUS_RX_RBUF_DESC_PART1_UNC_ERR_SMASK 0x800000000000ull +#define RCV_ERR_STATUS_RX_RBUF_DESC_PART2_COR_ERR_SMASK 0x4000000000000ull +#define RCV_ERR_STATUS_RX_RBUF_DESC_PART2_UNC_ERR_SMASK 0x2000000000000ull +#define RCV_ERR_STATUS_RX_RBUF_EMPTY_ERR_SMASK 0x100000000ull +#define RCV_ERR_STATUS_RX_RBUF_FL_INITDONE_PARITY_ERR_SMASK 0x800000000ull +#define RCV_ERR_STATUS_RX_RBUF_FL_INIT_WR_ADDR_PARITY_ERR_SMASK \ + 0x1000000000ull +#define RCV_ERR_STATUS_RX_RBUF_FL_RD_ADDR_PARITY_ERR_SMASK 0x200000000ull +#define RCV_ERR_STATUS_RX_RBUF_FL_WR_ADDR_PARITY_ERR_SMASK 0x400000000ull +#define RCV_ERR_STATUS_RX_RBUF_FREE_LIST_COR_ERR_SMASK 0x4000ull +#define RCV_ERR_STATUS_RX_RBUF_FREE_LIST_UNC_ERR_SMASK 0x2000ull +#define RCV_ERR_STATUS_RX_RBUF_FULL_ERR_SMASK 0x80000000ull +#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_COR_ERR_SMASK 0x40000ull +#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_COR_ERR_SMASK 0x10000ull +#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_REG_UNC_ERR_SMASK 0x8000ull +#define RCV_ERR_STATUS_RX_RBUF_LOOKUP_DES_UNC_ERR_SMASK 0x20000ull +#define RCV_ERR_STATUS_RX_RBUF_NEXT_FREE_BUF_COR_ERR_SMASK 0x4000000000ull +#define RCV_ERR_STATUS_RX_RBUF_NEXT_FREE_BUF_UNC_ERR_SMASK 0x2000000000ull +#define RCV_ERR_STATUS_RX_RCV_CSR_PARITY_ERR_SMASK 0x100ull +#define RCV_ERR_STATUS_RX_RCV_DATA_COR_ERR_SMASK 0x20ull +#define RCV_ERR_STATUS_RX_RCV_DATA_UNC_ERR_SMASK 0x10ull +#define RCV_ERR_STATUS_RX_RCV_FSM_ENCODING_ERR_SMASK 0x1000ull +#define RCV_ERR_STATUS_RX_RCV_HDR_COR_ERR_SMASK 0x8ull +#define RCV_ERR_STATUS_RX_RCV_HDR_UNC_ERR_SMASK 0x4ull +#define RCV_ERR_STATUS_RX_RCV_QP_MAP_TABLE_COR_ERR_SMASK 0x80ull +#define RCV_ERR_STATUS_RX_RCV_QP_MAP_TABLE_UNC_ERR_SMASK 0x40ull +#define RCV_HDR_ADDR (RXE + 0x000000100028) +#define RCV_HDR_CNT (RXE + 0x000000100030) +#define RCV_HDR_CNT_CNT_MASK 0x1FFull +#define RCV_HDR_CNT_CNT_SHIFT 0 +#define RCV_HDR_ENT_SIZE (RXE + 0x000000100038) +#define RCV_HDR_ENT_SIZE_ENT_SIZE_MASK 0x7ull +#define RCV_HDR_ENT_SIZE_ENT_SIZE_SHIFT 0 +#define RCV_HDR_HEAD (RXE + 0x000000300008) +#define RCV_HDR_HEAD_COUNTER_MASK 0xFFull +#define RCV_HDR_HEAD_COUNTER_SHIFT 32 +#define RCV_HDR_HEAD_HEAD_MASK 0x7FFFFull +#define RCV_HDR_HEAD_HEAD_SHIFT 0 +#define RCV_HDR_HEAD_HEAD_SMASK 0x7FFFFull +#define RCV_HDR_OVFL_CNT (RXE + 0x000000100058) +#define RCV_HDR_SIZE (RXE + 0x000000100040) +#define RCV_HDR_SIZE_HDR_SIZE_MASK 0x1Full +#define RCV_HDR_SIZE_HDR_SIZE_SHIFT 0 +#define RCV_HDR_TAIL (RXE + 0x000000300000) +#define RCV_HDR_TAIL_ADDR (RXE + 0x000000100048) +#define RCV_KEY_CTRL (RXE + 0x000000100020) +#define RCV_KEY_CTRL_JOB_KEY_ENABLE_SMASK 0x200000000ull +#define RCV_KEY_CTRL_JOB_KEY_VALUE_MASK 0xFFFFull +#define RCV_KEY_CTRL_JOB_KEY_VALUE_SHIFT 0 +#define RCV_MULTICAST (RXE + 0x000000000030) +#define RCV_PARTITION_KEY (RXE + 0x000000000200) +#define RCV_PARTITION_KEY_PARTITION_KEY_A_MASK 0xFFFFull +#define RCV_PARTITION_KEY_PARTITION_KEY_B_SHIFT 16 +#define RCV_QP_MAP_TABLE (RXE + 0x000000000100) +#define RCV_RSM_CFG (RXE + 0x000000000600) +#define RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_MASK 0x1ull +#define RCV_RSM_CFG_ENABLE_OR_CHAIN_RSM0_SHIFT 0 +#define RCV_RSM_CFG_PACKET_TYPE_SHIFT 60 +#define RCV_RSM_CFG_OFFSET_SHIFT 32 +#define RCV_RSM_MAP_TABLE (RXE + 0x000000000900) +#define RCV_RSM_MAP_TABLE_RCV_CONTEXT_A_MASK 0xFFull +#define RCV_RSM_MATCH (RXE + 0x000000000800) +#define RCV_RSM_MATCH_MASK1_SHIFT 0 +#define RCV_RSM_MATCH_MASK2_SHIFT 16 +#define RCV_RSM_MATCH_VALUE1_SHIFT 8 +#define RCV_RSM_MATCH_VALUE2_SHIFT 24 +#define RCV_RSM_SELECT (RXE + 0x000000000700) +#define RCV_RSM_SELECT_FIELD1_OFFSET_SHIFT 0 +#define RCV_RSM_SELECT_FIELD2_OFFSET_SHIFT 16 +#define RCV_RSM_SELECT_INDEX1_OFFSET_SHIFT 32 +#define RCV_RSM_SELECT_INDEX1_WIDTH_SHIFT 44 +#define RCV_RSM_SELECT_INDEX2_OFFSET_SHIFT 48 +#define RCV_RSM_SELECT_INDEX2_WIDTH_SHIFT 60 +#define RCV_STATUS (RXE + 0x000000000008) +#define RCV_STATUS_RX_PKT_IN_PROGRESS_SMASK 0x1ull +#define RCV_STATUS_RX_RBUF_INIT_DONE_SMASK 0x200ull +#define RCV_STATUS_RX_RBUF_PKT_PENDING_SMASK 0x40ull +#define RCV_TID_CTRL (RXE + 0x000000100018) +#define RCV_TID_CTRL_TID_BASE_INDEX_MASK 0x1FFFull +#define RCV_TID_CTRL_TID_BASE_INDEX_SHIFT 0 +#define RCV_TID_CTRL_TID_PAIR_CNT_MASK 0x1FFull +#define RCV_TID_CTRL_TID_PAIR_CNT_SHIFT 32 +#define RCV_TID_FLOW_TABLE (RXE + 0x000000300800) +#define RCV_VL15 (RXE + 0x000000000048) +#define SEND_BTH_QP (TXE + 0x0000000000A0) +#define SEND_BTH_QP_KDETH_QP_MASK 0xFFull +#define SEND_BTH_QP_KDETH_QP_SHIFT 16 +#define SEND_CM_CREDIT_USED_STATUS (TXE + 0x000000000510) +#define SEND_CM_CREDIT_USED_STATUS_VL0_RETURN_CREDIT_STATUS_SMASK \ + 0x1000000000000ull +#define SEND_CM_CREDIT_USED_STATUS_VL15_RETURN_CREDIT_STATUS_SMASK \ + 0x8000000000000000ull +#define SEND_CM_CREDIT_USED_STATUS_VL1_RETURN_CREDIT_STATUS_SMASK \ + 0x2000000000000ull +#define SEND_CM_CREDIT_USED_STATUS_VL2_RETURN_CREDIT_STATUS_SMASK \ + 0x4000000000000ull +#define SEND_CM_CREDIT_USED_STATUS_VL3_RETURN_CREDIT_STATUS_SMASK \ + 0x8000000000000ull +#define SEND_CM_CREDIT_USED_STATUS_VL4_RETURN_CREDIT_STATUS_SMASK \ + 0x10000000000000ull +#define SEND_CM_CREDIT_USED_STATUS_VL5_RETURN_CREDIT_STATUS_SMASK \ + 0x20000000000000ull +#define SEND_CM_CREDIT_USED_STATUS_VL6_RETURN_CREDIT_STATUS_SMASK \ + 0x40000000000000ull +#define SEND_CM_CREDIT_USED_STATUS_VL7_RETURN_CREDIT_STATUS_SMASK \ + 0x80000000000000ull +#define SEND_CM_CREDIT_VL (TXE + 0x000000000600) +#define SEND_CM_CREDIT_VL15 (TXE + 0x000000000678) +#define SEND_CM_CREDIT_VL15_DEDICATED_LIMIT_VL_SHIFT 0 +#define SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_MASK 0xFFFFull +#define SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SHIFT 0 +#define SEND_CM_CREDIT_VL_DEDICATED_LIMIT_VL_SMASK 0xFFFFull +#define SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_MASK 0xFFFFull +#define SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SHIFT 16 +#define SEND_CM_CREDIT_VL_SHARED_LIMIT_VL_SMASK 0xFFFF0000ull +#define SEND_CM_CTRL (TXE + 0x000000000500) +#define SEND_CM_CTRL_FORCE_CREDIT_MODE_SMASK 0x8ull +#define SEND_CM_CTRL_RESETCSR 0x0000000000000020ull +#define SEND_CM_GLOBAL_CREDIT (TXE + 0x000000000508) +#define SEND_CM_GLOBAL_CREDIT_AU_SHIFT 16 +#define SEND_CM_GLOBAL_CREDIT_RESETCSR 0x0000094000030000ull +#define SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_MASK 0xFFFFull +#define SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SHIFT 0 +#define SEND_CM_GLOBAL_CREDIT_SHARED_LIMIT_SMASK 0xFFFFull +#define SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_MASK 0xFFFFull +#define SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SHIFT 32 +#define SEND_CM_GLOBAL_CREDIT_TOTAL_CREDIT_LIMIT_SMASK 0xFFFF00000000ull +#define SEND_CM_LOCAL_AU_TABLE0_TO3 (TXE + 0x000000000520) +#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE0_SHIFT 0 +#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE1_SHIFT 16 +#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE2_SHIFT 32 +#define SEND_CM_LOCAL_AU_TABLE0_TO3_LOCAL_AU_TABLE3_SHIFT 48 +#define SEND_CM_LOCAL_AU_TABLE4_TO7 (TXE + 0x000000000528) +#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE4_SHIFT 0 +#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE5_SHIFT 16 +#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE6_SHIFT 32 +#define SEND_CM_LOCAL_AU_TABLE4_TO7_LOCAL_AU_TABLE7_SHIFT 48 +#define SEND_CM_REMOTE_AU_TABLE0_TO3 (TXE + 0x000000000530) +#define SEND_CM_REMOTE_AU_TABLE4_TO7 (TXE + 0x000000000538) +#define SEND_CM_TIMER_CTRL (TXE + 0x000000000518) +#define SEND_CONTEXTS (TXE + 0x000000000010) +#define SEND_CONTEXT_SET_CTRL (TXE + 0x000000000200) +#define SEND_COUNTER_ARRAY32 (TXE + 0x000000000300) +#define SEND_COUNTER_ARRAY64 (TXE + 0x000000000400) +#define SEND_CTRL (TXE + 0x000000000000) +#define SEND_CTRL_CM_RESET_SMASK 0x4ull +#define SEND_CTRL_SEND_ENABLE_SMASK 0x1ull +#define SEND_CTRL_VL_ARBITER_ENABLE_SMASK 0x2ull +#define SEND_CTXT_CHECK_ENABLE (TXE + 0x000000100080) +#define SEND_CTXT_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK 0x80ull +#define SEND_CTXT_CHECK_ENABLE_CHECK_ENABLE_SMASK 0x1ull +#define SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK 0x4ull +#define SEND_CTXT_CHECK_ENABLE_CHECK_OPCODE_SMASK 0x20ull +#define SEND_CTXT_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK 0x8ull +#define SEND_CTXT_CHECK_ENABLE_CHECK_SLID_SMASK 0x10ull +#define SEND_CTXT_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK 0x40ull +#define SEND_CTXT_CHECK_ENABLE_CHECK_VL_SMASK 0x2ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK 0x20000ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK \ + 0x200000ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_SMASK 0x800ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_GRH_SMASK 0x400ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK 0x1000ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_NON_KDETH_PACKETS_SMASK 0x2000ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK \ + 0x100000ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_TEST_SMASK 0x10000ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK 0x200ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_SMASK 0x100ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK \ + 0x80000ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK \ + 0x40000ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK \ + 0x8000ull +#define SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK \ + 0x4000ull +#define SEND_CTXT_CHECK_JOB_KEY (TXE + 0x000000100090) +#define SEND_CTXT_CHECK_JOB_KEY_ALLOW_PERMISSIVE_SMASK 0x100000000ull +#define SEND_CTXT_CHECK_JOB_KEY_MASK_SMASK 0xFFFF0000ull +#define SEND_CTXT_CHECK_JOB_KEY_VALUE_MASK 0xFFFFull +#define SEND_CTXT_CHECK_JOB_KEY_VALUE_SHIFT 0 +#define SEND_CTXT_CHECK_OPCODE (TXE + 0x0000001000A8) +#define SEND_CTXT_CHECK_OPCODE_MASK_SHIFT 8 +#define SEND_CTXT_CHECK_OPCODE_VALUE_SHIFT 0 +#define SEND_CTXT_CHECK_PARTITION_KEY (TXE + 0x000000100098) +#define SEND_CTXT_CHECK_PARTITION_KEY_VALUE_MASK 0xFFFFull +#define SEND_CTXT_CHECK_PARTITION_KEY_VALUE_SHIFT 0 +#define SEND_CTXT_CHECK_SLID (TXE + 0x0000001000A0) +#define SEND_CTXT_CHECK_SLID_MASK_MASK 0xFFFFull +#define SEND_CTXT_CHECK_SLID_MASK_SHIFT 16 +#define SEND_CTXT_CHECK_SLID_VALUE_MASK 0xFFFFull +#define SEND_CTXT_CHECK_SLID_VALUE_SHIFT 0 +#define SEND_CTXT_CHECK_VL (TXE + 0x000000100088) +#define SEND_CTXT_CREDIT_CTRL (TXE + 0x000000100010) +#define SEND_CTXT_CREDIT_CTRL_CREDIT_INTR_SMASK 0x20000ull +#define SEND_CTXT_CREDIT_CTRL_EARLY_RETURN_SMASK 0x10000ull +#define SEND_CTXT_CREDIT_CTRL_THRESHOLD_MASK 0x7FFull +#define SEND_CTXT_CREDIT_CTRL_THRESHOLD_SHIFT 0 +#define SEND_CTXT_CREDIT_CTRL_THRESHOLD_SMASK 0x7FFull +#define SEND_CTXT_CREDIT_FORCE (TXE + 0x000000100028) +#define SEND_CTXT_CREDIT_FORCE_FORCE_RETURN_SMASK 0x1ull +#define SEND_CTXT_CREDIT_RETURN_ADDR (TXE + 0x000000100020) +#define SEND_CTXT_CREDIT_RETURN_ADDR_ADDRESS_SMASK 0xFFFFFFFFFFC0ull +#define SEND_CTXT_CTRL (TXE + 0x000000100000) +#define SEND_CTXT_CTRL_CTXT_BASE_MASK 0x3FFFull +#define SEND_CTXT_CTRL_CTXT_BASE_SHIFT 32 +#define SEND_CTXT_CTRL_CTXT_DEPTH_MASK 0x7FFull +#define SEND_CTXT_CTRL_CTXT_DEPTH_SHIFT 48 +#define SEND_CTXT_CTRL_CTXT_ENABLE_SMASK 0x1ull +#define SEND_CTXT_ERR_CLEAR (TXE + 0x000000100050) +#define SEND_CTXT_ERR_MASK (TXE + 0x000000100048) +#define SEND_CTXT_ERR_STATUS (TXE + 0x000000100040) +#define SEND_CTXT_ERR_STATUS_PIO_DISALLOWED_PACKET_ERR_SMASK 0x2ull +#define SEND_CTXT_ERR_STATUS_PIO_INCONSISTENT_SOP_ERR_SMASK 0x1ull +#define SEND_CTXT_ERR_STATUS_PIO_WRITE_CROSSES_BOUNDARY_ERR_SMASK 0x4ull +#define SEND_CTXT_ERR_STATUS_PIO_WRITE_OUT_OF_BOUNDS_ERR_SMASK 0x10ull +#define SEND_CTXT_ERR_STATUS_PIO_WRITE_OVERFLOW_ERR_SMASK 0x8ull +#define SEND_CTXT_STATUS (TXE + 0x000000100008) +#define SEND_CTXT_STATUS_CTXT_HALTED_SMASK 0x1ull +#define SEND_DMA_BASE_ADDR (TXE + 0x000000200010) +#define SEND_DMA_CHECK_ENABLE (TXE + 0x000000200080) +#define SEND_DMA_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK 0x80ull +#define SEND_DMA_CHECK_ENABLE_CHECK_ENABLE_SMASK 0x1ull +#define SEND_DMA_CHECK_ENABLE_CHECK_JOB_KEY_SMASK 0x4ull +#define SEND_DMA_CHECK_ENABLE_CHECK_OPCODE_SMASK 0x20ull +#define SEND_DMA_CHECK_ENABLE_CHECK_PARTITION_KEY_SMASK 0x8ull +#define SEND_DMA_CHECK_ENABLE_CHECK_SLID_SMASK 0x10ull +#define SEND_DMA_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK 0x40ull +#define SEND_DMA_CHECK_ENABLE_CHECK_VL_SMASK 0x2ull +#define SEND_DMA_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK 0x20000ull +#define SEND_DMA_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK 0x200000ull +#define SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK \ + 0x100000ull +#define SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK 0x200ull +#define SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_SMASK 0x100ull +#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK \ + 0x80000ull +#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK 0x40000ull +#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK \ + 0x8000ull +#define SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK 0x4000ull +#define SEND_DMA_CHECK_JOB_KEY (TXE + 0x000000200090) +#define SEND_DMA_CHECK_OPCODE (TXE + 0x0000002000A8) +#define SEND_DMA_CHECK_PARTITION_KEY (TXE + 0x000000200098) +#define SEND_DMA_CHECK_SLID (TXE + 0x0000002000A0) +#define SEND_DMA_CHECK_SLID_MASK_MASK 0xFFFFull +#define SEND_DMA_CHECK_SLID_MASK_SHIFT 16 +#define SEND_DMA_CHECK_SLID_VALUE_MASK 0xFFFFull +#define SEND_DMA_CHECK_SLID_VALUE_SHIFT 0 +#define SEND_DMA_CHECK_VL (TXE + 0x000000200088) +#define SEND_DMA_CTRL (TXE + 0x000000200000) +#define SEND_DMA_CTRL_SDMA_CLEANUP_SMASK 0x4ull +#define SEND_DMA_CTRL_SDMA_ENABLE_SMASK 0x1ull +#define SEND_DMA_CTRL_SDMA_HALT_SMASK 0x2ull +#define SEND_DMA_CTRL_SDMA_INT_ENABLE_SMASK 0x8ull +#define SEND_DMA_DESC_CNT (TXE + 0x000000200050) +#define SEND_DMA_DESC_CNT_CNT_MASK 0xFFFFull +#define SEND_DMA_DESC_CNT_CNT_SHIFT 0 +#define SEND_DMA_ENG_ERR_CLEAR (TXE + 0x000000200070) +#define SEND_DMA_ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_MASK 0x1ull +#define SEND_DMA_ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SHIFT 18 +#define SEND_DMA_ENG_ERR_MASK (TXE + 0x000000200068) +#define SEND_DMA_ENG_ERR_STATUS (TXE + 0x000000200060) +#define SEND_DMA_ENG_ERR_STATUS_SDMA_ASSEMBLY_UNC_ERR_SMASK 0x8000ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_DESC_TABLE_UNC_ERR_SMASK 0x4000ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_FIRST_DESC_ERR_SMASK 0x10ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_GEN_MISMATCH_ERR_SMASK 0x2ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_HALT_ERR_SMASK 0x40ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_ADDRESS_ERR_SMASK 0x800ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_LENGTH_ERR_SMASK 0x1000ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SMASK \ + 0x40000ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_SELECT_ERR_SMASK 0x400ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_HEADER_STORAGE_UNC_ERR_SMASK \ + 0x20000ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_LENGTH_MISMATCH_ERR_SMASK 0x80ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_MEM_READ_ERR_SMASK 0x20ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_PACKET_DESC_OVERFLOW_ERR_SMASK \ + 0x100ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_PACKET_TRACKING_UNC_ERR_SMASK \ + 0x10000ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_TAIL_OUT_OF_BOUNDS_ERR_SMASK 0x8ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_TIMEOUT_ERR_SMASK 0x2000ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_TOO_LONG_ERR_SMASK 0x4ull +#define SEND_DMA_ENG_ERR_STATUS_SDMA_WRONG_DW_ERR_SMASK 0x1ull +#define SEND_DMA_ENGINES (TXE + 0x000000000018) +#define SEND_DMA_ERR_CLEAR (TXE + 0x000000000070) +#define SEND_DMA_ERR_MASK (TXE + 0x000000000068) +#define SEND_DMA_ERR_STATUS (TXE + 0x000000000060) +#define SEND_DMA_ERR_STATUS_SDMA_CSR_PARITY_ERR_SMASK 0x2ull +#define SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_COR_ERR_SMASK 0x8ull +#define SEND_DMA_ERR_STATUS_SDMA_PCIE_REQ_TRACKING_UNC_ERR_SMASK 0x4ull +#define SEND_DMA_ERR_STATUS_SDMA_RPY_TAG_ERR_SMASK 0x1ull +#define SEND_DMA_HEAD (TXE + 0x000000200028) +#define SEND_DMA_HEAD_ADDR (TXE + 0x000000200030) +#define SEND_DMA_LEN_GEN (TXE + 0x000000200018) +#define SEND_DMA_LEN_GEN_GENERATION_SHIFT 16 +#define SEND_DMA_LEN_GEN_LENGTH_SHIFT 6 +#define SEND_DMA_MEMORY (TXE + 0x0000002000B0) +#define SEND_DMA_MEMORY_SDMA_MEMORY_CNT_SHIFT 16 +#define SEND_DMA_MEMORY_SDMA_MEMORY_INDEX_SHIFT 0 +#define SEND_DMA_MEM_SIZE (TXE + 0x000000000028) +#define SEND_DMA_PRIORITY_THLD (TXE + 0x000000200038) +#define SEND_DMA_RELOAD_CNT (TXE + 0x000000200048) +#define SEND_DMA_STATUS (TXE + 0x000000200008) +#define SEND_DMA_STATUS_ENG_CLEANED_UP_SMASK 0x200000000000000ull +#define SEND_DMA_STATUS_ENG_HALTED_SMASK 0x100000000000000ull +#define SEND_DMA_TAIL (TXE + 0x000000200020) +#define SEND_EGRESS_CTXT_STATUS (TXE + 0x000000000800) +#define SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_HALT_STATUS_SMASK 0x10000ull +#define SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SHIFT 0 +#define SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SMASK \ + 0x3FFFull +#define SEND_EGRESS_ERR_CLEAR (TXE + 0x000000000090) +#define SEND_EGRESS_ERR_INFO (TXE + 0x000000000F00) +#define SEND_EGRESS_ERR_INFO_BAD_PKT_LEN_ERR_SMASK 0x20000ull +#define SEND_EGRESS_ERR_INFO_BYPASS_ERR_SMASK 0x800ull +#define SEND_EGRESS_ERR_INFO_GRH_ERR_SMASK 0x400ull +#define SEND_EGRESS_ERR_INFO_JOB_KEY_ERR_SMASK 0x4ull +#define SEND_EGRESS_ERR_INFO_KDETH_PACKETS_ERR_SMASK 0x1000ull +#define SEND_EGRESS_ERR_INFO_NON_KDETH_PACKETS_ERR_SMASK 0x2000ull +#define SEND_EGRESS_ERR_INFO_OPCODE_ERR_SMASK 0x20ull +#define SEND_EGRESS_ERR_INFO_PARTITION_KEY_ERR_SMASK 0x8ull +#define SEND_EGRESS_ERR_INFO_PBC_STATIC_RATE_CONTROL_ERR_SMASK 0x100000ull +#define SEND_EGRESS_ERR_INFO_PBC_TEST_ERR_SMASK 0x10000ull +#define SEND_EGRESS_ERR_INFO_RAW_ERR_SMASK 0x100ull +#define SEND_EGRESS_ERR_INFO_RAW_IPV6_ERR_SMASK 0x200ull +#define SEND_EGRESS_ERR_INFO_SLID_ERR_SMASK 0x10ull +#define SEND_EGRESS_ERR_INFO_TOO_LONG_BYPASS_PACKETS_ERR_SMASK 0x80000ull +#define SEND_EGRESS_ERR_INFO_TOO_LONG_IB_PACKET_ERR_SMASK 0x40000ull +#define SEND_EGRESS_ERR_INFO_TOO_SMALL_BYPASS_PACKETS_ERR_SMASK 0x8000ull +#define SEND_EGRESS_ERR_INFO_TOO_SMALL_IB_PACKETS_ERR_SMASK 0x4000ull +#define SEND_EGRESS_ERR_INFO_VL_ERR_SMASK 0x2ull +#define SEND_EGRESS_ERR_INFO_VL_MAPPING_ERR_SMASK 0x40ull +#define SEND_EGRESS_ERR_MASK (TXE + 0x000000000088) +#define SEND_EGRESS_ERR_SOURCE (TXE + 0x000000000F08) +#define SEND_EGRESS_ERR_STATUS (TXE + 0x000000000080) +#define SEND_EGRESS_ERR_STATUS_TX_CONFIG_PARITY_ERR_SMASK 0x8000ull +#define SEND_EGRESS_ERR_STATUS_TX_CREDIT_OVERRUN_ERR_SMASK \ + 0x200000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_CREDIT_RETURN_PARITY_ERR_SMASK \ + 0x20000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_CREDIT_RETURN_VL_ERR_SMASK \ + 0x800000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_EGRESS_FIFO_COR_ERR_SMASK \ + 0x2000000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_EGRESS_FIFO_UNC_ERR_SMASK \ + 0x200000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_EGRESS_FIFO_UNDERRUN_OR_PARITY_ERR_SMASK \ + 0x8ull +#define SEND_EGRESS_ERR_STATUS_TX_HCRC_INSERTION_ERR_SMASK \ + 0x400000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_ILLEGAL_VL_ERR_SMASK 0x1000ull +#define SEND_EGRESS_ERR_STATUS_TX_INCORRECT_LINK_STATE_ERR_SMASK 0x20ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_CSR_PARITY_ERR_SMASK 0x2000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO0_COR_ERR_SMASK \ + 0x1000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO0_UNC_OR_PARITY_ERR_SMASK \ + 0x100000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO1_COR_ERR_SMASK \ + 0x2000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO1_UNC_OR_PARITY_ERR_SMASK \ + 0x200000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO2_COR_ERR_SMASK \ + 0x4000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO2_UNC_OR_PARITY_ERR_SMASK \ + 0x400000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO3_COR_ERR_SMASK \ + 0x8000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO3_UNC_OR_PARITY_ERR_SMASK \ + 0x800000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO4_COR_ERR_SMASK \ + 0x10000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO4_UNC_OR_PARITY_ERR_SMASK \ + 0x1000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO5_COR_ERR_SMASK \ + 0x20000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO5_UNC_OR_PARITY_ERR_SMASK \ + 0x2000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO6_COR_ERR_SMASK \ + 0x40000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO6_UNC_OR_PARITY_ERR_SMASK \ + 0x4000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO7_COR_ERR_SMASK \ + 0x80000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO7_UNC_OR_PARITY_ERR_SMASK \ + 0x8000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO8_COR_ERR_SMASK \ + 0x100000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LAUNCH_FIFO8_UNC_OR_PARITY_ERR_SMASK \ + 0x10000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_LINKDOWN_ERR_SMASK 0x10ull +#define SEND_EGRESS_ERR_STATUS_TX_PIO_LAUNCH_INTF_PARITY_ERR_SMASK 0x80ull +#define SEND_EGRESS_ERR_STATUS_TX_PKT_INTEGRITY_MEM_COR_ERR_SMASK 0x1ull +#define SEND_EGRESS_ERR_STATUS_TX_PKT_INTEGRITY_MEM_UNC_ERR_SMASK 0x2ull +#define SEND_EGRESS_ERR_STATUS_TX_READ_PIO_MEMORY_COR_ERR_SMASK \ + 0x1000000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_READ_PIO_MEMORY_CSR_UNC_ERR_SMASK \ + 0x8000000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_READ_PIO_MEMORY_UNC_ERR_SMASK \ + 0x100000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_READ_SDMA_MEMORY_COR_ERR_SMASK \ + 0x800000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_READ_SDMA_MEMORY_CSR_UNC_ERR_SMASK \ + 0x4000000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_READ_SDMA_MEMORY_UNC_ERR_SMASK \ + 0x80000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_SB_HDR_COR_ERR_SMASK 0x400000000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_SB_HDR_UNC_ERR_SMASK 0x40000000000ull +#define SEND_EGRESS_ERR_STATUS_TX_SBRD_CTL_CSR_PARITY_ERR_SMASK 0x4000ull +#define SEND_EGRESS_ERR_STATUS_TX_SBRD_CTL_STATE_MACHINE_PARITY_ERR_SMASK \ + 0x800ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA0_DISALLOWED_PACKET_ERR_SMASK \ + 0x10000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA10_DISALLOWED_PACKET_ERR_SMASK \ + 0x4000000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA11_DISALLOWED_PACKET_ERR_SMASK \ + 0x8000000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA12_DISALLOWED_PACKET_ERR_SMASK \ + 0x10000000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA13_DISALLOWED_PACKET_ERR_SMASK \ + 0x20000000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA14_DISALLOWED_PACKET_ERR_SMASK \ + 0x40000000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA15_DISALLOWED_PACKET_ERR_SMASK \ + 0x80000000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA1_DISALLOWED_PACKET_ERR_SMASK \ + 0x20000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA2_DISALLOWED_PACKET_ERR_SMASK \ + 0x40000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA3_DISALLOWED_PACKET_ERR_SMASK \ + 0x80000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA4_DISALLOWED_PACKET_ERR_SMASK \ + 0x100000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA5_DISALLOWED_PACKET_ERR_SMASK \ + 0x200000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA6_DISALLOWED_PACKET_ERR_SMASK \ + 0x400000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA7_DISALLOWED_PACKET_ERR_SMASK \ + 0x800000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA8_DISALLOWED_PACKET_ERR_SMASK \ + 0x1000000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA9_DISALLOWED_PACKET_ERR_SMASK \ + 0x2000000ull +#define SEND_EGRESS_ERR_STATUS_TX_SDMA_LAUNCH_INTF_PARITY_ERR_SMASK \ + 0x100ull +#define SEND_EGRESS_SEND_DMA_STATUS (TXE + 0x000000000E00) +#define SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT 0 +#define SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SMASK \ + 0x3FFFull +#define SEND_ERR_CLEAR (TXE + 0x0000000000F0) +#define SEND_ERR_MASK (TXE + 0x0000000000E8) +#define SEND_ERR_STATUS (TXE + 0x0000000000E0) +#define SEND_ERR_STATUS_SEND_CSR_PARITY_ERR_SMASK 0x1ull +#define SEND_ERR_STATUS_SEND_CSR_READ_BAD_ADDR_ERR_SMASK 0x2ull +#define SEND_ERR_STATUS_SEND_CSR_WRITE_BAD_ADDR_ERR_SMASK 0x4ull +#define SEND_HIGH_PRIORITY_LIMIT (TXE + 0x000000000030) +#define SEND_HIGH_PRIORITY_LIMIT_LIMIT_MASK 0x3FFFull +#define SEND_HIGH_PRIORITY_LIMIT_LIMIT_SHIFT 0 +#define SEND_HIGH_PRIORITY_LIST (TXE + 0x000000000180) +#define SEND_LEN_CHECK0 (TXE + 0x0000000000D0) +#define SEND_LEN_CHECK0_LEN_VL0_MASK 0xFFFull +#define SEND_LEN_CHECK0_LEN_VL1_SHIFT 12 +#define SEND_LEN_CHECK1 (TXE + 0x0000000000D8) +#define SEND_LEN_CHECK1_LEN_VL15_MASK 0xFFFull +#define SEND_LEN_CHECK1_LEN_VL15_SHIFT 48 +#define SEND_LEN_CHECK1_LEN_VL4_MASK 0xFFFull +#define SEND_LEN_CHECK1_LEN_VL5_SHIFT 12 +#define SEND_LOW_PRIORITY_LIST (TXE + 0x000000000100) +#define SEND_LOW_PRIORITY_LIST_VL_MASK 0x7ull +#define SEND_LOW_PRIORITY_LIST_VL_SHIFT 16 +#define SEND_LOW_PRIORITY_LIST_WEIGHT_MASK 0xFFull +#define SEND_LOW_PRIORITY_LIST_WEIGHT_SHIFT 0 +#define SEND_PIO_ERR_CLEAR (TXE + 0x000000000050) +#define SEND_PIO_ERR_CLEAR_PIO_INIT_SM_IN_ERR_SMASK 0x20000ull +#define SEND_PIO_ERR_MASK (TXE + 0x000000000048) +#define SEND_PIO_ERR_STATUS (TXE + 0x000000000040) +#define SEND_PIO_ERR_STATUS_PIO_BLOCK_QW_COUNT_PARITY_ERR_SMASK \ + 0x1000000ull +#define SEND_PIO_ERR_STATUS_PIO_CREDIT_RET_FIFO_PARITY_ERR_SMASK 0x8000ull +#define SEND_PIO_ERR_STATUS_PIO_CSR_PARITY_ERR_SMASK 0x4ull +#define SEND_PIO_ERR_STATUS_PIO_CURRENT_FREE_CNT_PARITY_ERR_SMASK \ + 0x100000000ull +#define SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_COR_ERR_SMASK 0x100000ull +#define SEND_PIO_ERR_STATUS_PIO_HOST_ADDR_MEM_UNC_ERR_SMASK 0x80000ull +#define SEND_PIO_ERR_STATUS_PIO_INIT_SM_IN_ERR_SMASK 0x20000ull +#define SEND_PIO_ERR_STATUS_PIO_LAST_RETURNED_CNT_PARITY_ERR_SMASK \ + 0x200000000ull +#define SEND_PIO_ERR_STATUS_PIO_PCC_FIFO_PARITY_ERR_SMASK 0x20ull +#define SEND_PIO_ERR_STATUS_PIO_PCC_SOP_HEAD_PARITY_ERR_SMASK \ + 0x400000000ull +#define SEND_PIO_ERR_STATUS_PIO_PEC_FIFO_PARITY_ERR_SMASK 0x40ull +#define SEND_PIO_ERR_STATUS_PIO_PEC_SOP_HEAD_PARITY_ERR_SMASK \ + 0x800000000ull +#define SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_FIFO_PARITY_ERR_SMASK 0x200ull +#define SEND_PIO_ERR_STATUS_PIO_PKT_EVICT_SM_OR_ARB_SM_ERR_SMASK 0x40000ull +#define SEND_PIO_ERR_STATUS_PIO_PPMC_BQC_MEM_PARITY_ERR_SMASK 0x10000000ull +#define SEND_PIO_ERR_STATUS_PIO_PPMC_PBL_FIFO_ERR_SMASK 0x10000ull +#define SEND_PIO_ERR_STATUS_PIO_PPMC_SOP_LEN_ERR_SMASK 0x20000000ull +#define SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO0_ERR_SMASK 0x8ull +#define SEND_PIO_ERR_STATUS_PIO_SB_MEM_FIFO1_ERR_SMASK 0x10ull +#define SEND_PIO_ERR_STATUS_PIO_SBRDCTL_CRREL_PARITY_ERR_SMASK 0x80ull +#define SEND_PIO_ERR_STATUS_PIO_SBRDCTRL_CRREL_FIFO_PARITY_ERR_SMASK \ + 0x100ull +#define SEND_PIO_ERR_STATUS_PIO_SM_PKT_RESET_PARITY_ERR_SMASK 0x400ull +#define SEND_PIO_ERR_STATUS_PIO_STATE_MACHINE_ERR_SMASK 0x400000ull +#define SEND_PIO_ERR_STATUS_PIO_VL_FIFO_PARITY_ERR_SMASK 0x8000000ull +#define SEND_PIO_ERR_STATUS_PIO_VLF_SOP_PARITY_ERR_SMASK 0x4000000ull +#define SEND_PIO_ERR_STATUS_PIO_VLF_VL_LEN_PARITY_ERR_SMASK 0x2000000ull +#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_COR_ERR_SMASK 0x2000ull +#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK0_UNC_ERR_SMASK 0x800ull +#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_COR_ERR_SMASK 0x4000ull +#define SEND_PIO_ERR_STATUS_PIO_VL_LEN_MEM_BANK1_UNC_ERR_SMASK 0x1000ull +#define SEND_PIO_ERR_STATUS_PIO_WRITE_ADDR_PARITY_ERR_SMASK 0x2ull +#define SEND_PIO_ERR_STATUS_PIO_WRITE_BAD_CTXT_ERR_SMASK 0x1ull +#define SEND_PIO_ERR_STATUS_PIO_WRITE_DATA_PARITY_ERR_SMASK 0x200000ull +#define SEND_PIO_ERR_STATUS_PIO_WRITE_QW_VALID_PARITY_ERR_SMASK 0x800000ull +#define SEND_PIO_INIT_CTXT (TXE + 0x000000000038) +#define SEND_PIO_INIT_CTXT_PIO_ALL_CTXT_INIT_SMASK 0x1ull +#define SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_MASK 0xFFull +#define SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_SHIFT 8 +#define SEND_PIO_INIT_CTXT_PIO_INIT_ERR_SMASK 0x8ull +#define SEND_PIO_INIT_CTXT_PIO_INIT_IN_PROGRESS_SMASK 0x4ull +#define SEND_PIO_INIT_CTXT_PIO_SINGLE_CTXT_INIT_SMASK 0x2ull +#define SEND_PIO_MEM_SIZE (TXE + 0x000000000020) +#define SEND_SC2VLT0 (TXE + 0x0000000000B0) +#define SEND_SC2VLT0_SC0_SHIFT 0 +#define SEND_SC2VLT0_SC1_SHIFT 8 +#define SEND_SC2VLT0_SC2_SHIFT 16 +#define SEND_SC2VLT0_SC3_SHIFT 24 +#define SEND_SC2VLT0_SC4_SHIFT 32 +#define SEND_SC2VLT0_SC5_SHIFT 40 +#define SEND_SC2VLT0_SC6_SHIFT 48 +#define SEND_SC2VLT0_SC7_SHIFT 56 +#define SEND_SC2VLT1 (TXE + 0x0000000000B8) +#define SEND_SC2VLT1_SC10_SHIFT 16 +#define SEND_SC2VLT1_SC11_SHIFT 24 +#define SEND_SC2VLT1_SC12_SHIFT 32 +#define SEND_SC2VLT1_SC13_SHIFT 40 +#define SEND_SC2VLT1_SC14_SHIFT 48 +#define SEND_SC2VLT1_SC15_SHIFT 56 +#define SEND_SC2VLT1_SC8_SHIFT 0 +#define SEND_SC2VLT1_SC9_SHIFT 8 +#define SEND_SC2VLT2 (TXE + 0x0000000000C0) +#define SEND_SC2VLT2_SC16_SHIFT 0 +#define SEND_SC2VLT2_SC17_SHIFT 8 +#define SEND_SC2VLT2_SC18_SHIFT 16 +#define SEND_SC2VLT2_SC19_SHIFT 24 +#define SEND_SC2VLT2_SC20_SHIFT 32 +#define SEND_SC2VLT2_SC21_SHIFT 40 +#define SEND_SC2VLT2_SC22_SHIFT 48 +#define SEND_SC2VLT2_SC23_SHIFT 56 +#define SEND_SC2VLT3 (TXE + 0x0000000000C8) +#define SEND_SC2VLT3_SC24_SHIFT 0 +#define SEND_SC2VLT3_SC25_SHIFT 8 +#define SEND_SC2VLT3_SC26_SHIFT 16 +#define SEND_SC2VLT3_SC27_SHIFT 24 +#define SEND_SC2VLT3_SC28_SHIFT 32 +#define SEND_SC2VLT3_SC29_SHIFT 40 +#define SEND_SC2VLT3_SC30_SHIFT 48 +#define SEND_SC2VLT3_SC31_SHIFT 56 +#define SEND_STATIC_RATE_CONTROL (TXE + 0x0000000000A8) +#define SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SHIFT 0 +#define SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SMASK 0xFFFFull +#define PCIE_CFG_REG_PL2 (PCIE + 0x000000000708) +#define PCIE_CFG_REG_PL3 (PCIE + 0x00000000070C) +#define PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SHIFT 27 +#define PCIE_CFG_REG_PL3_L1_ENT_LATENCY_SMASK 0x38000000 +#define PCIE_CFG_REG_PL102 (PCIE + 0x000000000898) +#define PCIE_CFG_REG_PL102_GEN3_EQ_POST_CURSOR_PSET_SHIFT 12 +#define PCIE_CFG_REG_PL102_GEN3_EQ_CURSOR_PSET_SHIFT 6 +#define PCIE_CFG_REG_PL102_GEN3_EQ_PRE_CURSOR_PSET_SHIFT 0 +#define PCIE_CFG_REG_PL103 (PCIE + 0x00000000089C) +#define PCIE_CFG_REG_PL105 (PCIE + 0x0000000008A4) +#define PCIE_CFG_REG_PL105_GEN3_EQ_VIOLATE_COEF_RULES_SMASK 0x1ull +#define PCIE_CFG_REG_PL2_LOW_PWR_ENT_CNT_SHIFT 24 +#define PCIE_CFG_REG_PL100 (PCIE + 0x000000000890) +#define PCIE_CFG_REG_PL100_EQ_EIEOS_CNT_SMASK 0x400ull +#define PCIE_CFG_REG_PL101 (PCIE + 0x000000000894) +#define PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_FS_SHIFT 6 +#define PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_LF_SHIFT 0 +#define PCIE_CFG_REG_PL106 (PCIE + 0x0000000008A8) +#define PCIE_CFG_REG_PL106_GEN3_EQ_PSET_REQ_VEC_SHIFT 8 +#define PCIE_CFG_REG_PL106_GEN3_EQ_EVAL2MS_DISABLE_SMASK 0x20ull +#define PCIE_CFG_REG_PL106_GEN3_EQ_PHASE23_EXIT_MODE_SMASK 0x10ull +#define CCE_INT_BLOCKED (CCE + 0x000000110C00) +#define SEND_DMA_IDLE_CNT (TXE + 0x000000200040) +#define SEND_DMA_DESC_FETCHED_CNT (TXE + 0x000000200058) +#define CCE_MSIX_PBA_OFFSET 0X0110000 + +#endif /* DEF_CHIP_REG */ diff --git a/drivers/infiniband/hw/hfi1/common.h b/drivers/infiniband/hw/hfi1/common.h new file mode 100644 index 000000000..fcc9c217a --- /dev/null +++ b/drivers/infiniband/hw/hfi1/common.h @@ -0,0 +1,411 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef _COMMON_H +#define _COMMON_H + +#include <rdma/hfi/hfi1_user.h> + +/* + * This file contains defines, structures, etc. that are used + * to communicate between kernel and user code. + */ + +/* version of protocol header (known to chip also). In the long run, + * we should be able to generate and accept a range of version numbers; + * for now we only accept one, and it's compiled in. + */ +#define IPS_PROTO_VERSION 2 + +/* + * These are compile time constants that you may want to enable or disable + * if you are trying to debug problems with code or performance. + * HFI1_VERBOSE_TRACING define as 1 if you want additional tracing in + * fast path code + * HFI1_TRACE_REGWRITES define as 1 if you want register writes to be + * traced in fast path code + * _HFI1_TRACING define as 0 if you want to remove all tracing in a + * compilation unit + */ + +/* + * If a packet's QP[23:16] bits match this value, then it is + * a PSM packet and the hardware will expect a KDETH header + * following the BTH. + */ +#define DEFAULT_KDETH_QP 0x80 + +/* driver/hw feature set bitmask */ +#define HFI1_CAP_USER_SHIFT 24 +#define HFI1_CAP_MASK ((1UL << HFI1_CAP_USER_SHIFT) - 1) +/* locked flag - if set, only HFI1_CAP_WRITABLE_MASK bits can be set */ +#define HFI1_CAP_LOCKED_SHIFT 63 +#define HFI1_CAP_LOCKED_MASK 0x1ULL +#define HFI1_CAP_LOCKED_SMASK (HFI1_CAP_LOCKED_MASK << HFI1_CAP_LOCKED_SHIFT) +/* extra bits used between kernel and user processes */ +#define HFI1_CAP_MISC_SHIFT (HFI1_CAP_USER_SHIFT * 2) +#define HFI1_CAP_MISC_MASK ((1ULL << (HFI1_CAP_LOCKED_SHIFT - \ + HFI1_CAP_MISC_SHIFT)) - 1) + +#define HFI1_CAP_KSET(cap) ({ hfi1_cap_mask |= HFI1_CAP_##cap; hfi1_cap_mask; }) +#define HFI1_CAP_KCLEAR(cap) \ + ({ \ + hfi1_cap_mask &= ~HFI1_CAP_##cap; \ + hfi1_cap_mask; \ + }) +#define HFI1_CAP_USET(cap) \ + ({ \ + hfi1_cap_mask |= (HFI1_CAP_##cap << HFI1_CAP_USER_SHIFT); \ + hfi1_cap_mask; \ + }) +#define HFI1_CAP_UCLEAR(cap) \ + ({ \ + hfi1_cap_mask &= ~(HFI1_CAP_##cap << HFI1_CAP_USER_SHIFT); \ + hfi1_cap_mask; \ + }) +#define HFI1_CAP_SET(cap) \ + ({ \ + hfi1_cap_mask |= (HFI1_CAP_##cap | (HFI1_CAP_##cap << \ + HFI1_CAP_USER_SHIFT)); \ + hfi1_cap_mask; \ + }) +#define HFI1_CAP_CLEAR(cap) \ + ({ \ + hfi1_cap_mask &= ~(HFI1_CAP_##cap | \ + (HFI1_CAP_##cap << HFI1_CAP_USER_SHIFT)); \ + hfi1_cap_mask; \ + }) +#define HFI1_CAP_LOCK() \ + ({ hfi1_cap_mask |= HFI1_CAP_LOCKED_SMASK; hfi1_cap_mask; }) +#define HFI1_CAP_LOCKED() (!!(hfi1_cap_mask & HFI1_CAP_LOCKED_SMASK)) +/* + * The set of capability bits that can be changed after initial load + * This set is the same for kernel and user contexts. However, for + * user contexts, the set can be further filtered by using the + * HFI1_CAP_RESERVED_MASK bits. + */ +#define HFI1_CAP_WRITABLE_MASK (HFI1_CAP_SDMA_AHG | \ + HFI1_CAP_HDRSUPP | \ + HFI1_CAP_MULTI_PKT_EGR | \ + HFI1_CAP_NODROP_RHQ_FULL | \ + HFI1_CAP_NODROP_EGR_FULL | \ + HFI1_CAP_ALLOW_PERM_JKEY | \ + HFI1_CAP_STATIC_RATE_CTRL | \ + HFI1_CAP_PRINT_UNIMPL | \ + HFI1_CAP_TID_UNMAP) +/* + * A set of capability bits that are "global" and are not allowed to be + * set in the user bitmask. + */ +#define HFI1_CAP_RESERVED_MASK ((HFI1_CAP_SDMA | \ + HFI1_CAP_USE_SDMA_HEAD | \ + HFI1_CAP_EXTENDED_PSN | \ + HFI1_CAP_PRINT_UNIMPL | \ + HFI1_CAP_NO_INTEGRITY | \ + HFI1_CAP_PKEY_CHECK) << \ + HFI1_CAP_USER_SHIFT) +/* + * Set of capabilities that need to be enabled for kernel context in + * order to be allowed for user contexts, as well. + */ +#define HFI1_CAP_MUST_HAVE_KERN (HFI1_CAP_STATIC_RATE_CTRL) +/* Default enabled capabilities (both kernel and user) */ +#define HFI1_CAP_MASK_DEFAULT (HFI1_CAP_HDRSUPP | \ + HFI1_CAP_NODROP_RHQ_FULL | \ + HFI1_CAP_NODROP_EGR_FULL | \ + HFI1_CAP_SDMA | \ + HFI1_CAP_PRINT_UNIMPL | \ + HFI1_CAP_STATIC_RATE_CTRL | \ + HFI1_CAP_PKEY_CHECK | \ + HFI1_CAP_MULTI_PKT_EGR | \ + HFI1_CAP_EXTENDED_PSN | \ + ((HFI1_CAP_HDRSUPP | \ + HFI1_CAP_MULTI_PKT_EGR | \ + HFI1_CAP_STATIC_RATE_CTRL | \ + HFI1_CAP_PKEY_CHECK | \ + HFI1_CAP_EARLY_CREDIT_RETURN) << \ + HFI1_CAP_USER_SHIFT)) +/* + * A bitmask of kernel/global capabilities that should be communicated + * to user level processes. + */ +#define HFI1_CAP_K2U (HFI1_CAP_SDMA | \ + HFI1_CAP_EXTENDED_PSN | \ + HFI1_CAP_PKEY_CHECK | \ + HFI1_CAP_NO_INTEGRITY) + +#define HFI1_USER_SWVERSION ((HFI1_USER_SWMAJOR << HFI1_SWMAJOR_SHIFT) | \ + HFI1_USER_SWMINOR) + +#ifndef HFI1_KERN_TYPE +#define HFI1_KERN_TYPE 0 +#endif + +/* + * Similarly, this is the kernel version going back to the user. It's + * slightly different, in that we want to tell if the driver was built as + * part of a Intel release, or from the driver from openfabrics.org, + * kernel.org, or a standard distribution, for support reasons. + * The high bit is 0 for non-Intel and 1 for Intel-built/supplied. + * + * It's returned by the driver to the user code during initialization in the + * spi_sw_version field of hfi1_base_info, so the user code can in turn + * check for compatibility with the kernel. +*/ +#define HFI1_KERN_SWVERSION ((HFI1_KERN_TYPE << 31) | HFI1_USER_SWVERSION) + +/* + * Define the driver version number. This is something that refers only + * to the driver itself, not the software interfaces it supports. + */ +#ifndef HFI1_DRIVER_VERSION_BASE +#define HFI1_DRIVER_VERSION_BASE "0.9-294" +#endif + +/* create the final driver version string */ +#ifdef HFI1_IDSTR +#define HFI1_DRIVER_VERSION HFI1_DRIVER_VERSION_BASE " " HFI1_IDSTR +#else +#define HFI1_DRIVER_VERSION HFI1_DRIVER_VERSION_BASE +#endif + +/* + * Diagnostics can send a packet by writing the following + * struct to the diag packet special file. + * + * This allows a custom PBC qword, so that special modes and deliberate + * changes to CRCs can be used. + */ +#define _DIAG_PKT_VERS 1 +struct diag_pkt { + __u16 version; /* structure version */ + __u16 unit; /* which device */ + __u16 sw_index; /* send sw index to use */ + __u16 len; /* data length, in bytes */ + __u16 port; /* port number */ + __u16 unused; + __u32 flags; /* call flags */ + __u64 data; /* user data pointer */ + __u64 pbc; /* PBC for the packet */ +}; + +/* diag_pkt flags */ +#define F_DIAGPKT_WAIT 0x1 /* wait until packet is sent */ + +/* + * The next set of defines are for packet headers, and chip register + * and memory bits that are visible to and/or used by user-mode software. + */ + +/* + * Receive Header Flags + */ +#define RHF_PKT_LEN_SHIFT 0 +#define RHF_PKT_LEN_MASK 0xfffull +#define RHF_PKT_LEN_SMASK (RHF_PKT_LEN_MASK << RHF_PKT_LEN_SHIFT) + +#define RHF_RCV_TYPE_SHIFT 12 +#define RHF_RCV_TYPE_MASK 0x7ull +#define RHF_RCV_TYPE_SMASK (RHF_RCV_TYPE_MASK << RHF_RCV_TYPE_SHIFT) + +#define RHF_USE_EGR_BFR_SHIFT 15 +#define RHF_USE_EGR_BFR_MASK 0x1ull +#define RHF_USE_EGR_BFR_SMASK (RHF_USE_EGR_BFR_MASK << RHF_USE_EGR_BFR_SHIFT) + +#define RHF_EGR_INDEX_SHIFT 16 +#define RHF_EGR_INDEX_MASK 0x7ffull +#define RHF_EGR_INDEX_SMASK (RHF_EGR_INDEX_MASK << RHF_EGR_INDEX_SHIFT) + +#define RHF_DC_INFO_SHIFT 27 +#define RHF_DC_INFO_MASK 0x1ull +#define RHF_DC_INFO_SMASK (RHF_DC_INFO_MASK << RHF_DC_INFO_SHIFT) + +#define RHF_RCV_SEQ_SHIFT 28 +#define RHF_RCV_SEQ_MASK 0xfull +#define RHF_RCV_SEQ_SMASK (RHF_RCV_SEQ_MASK << RHF_RCV_SEQ_SHIFT) + +#define RHF_EGR_OFFSET_SHIFT 32 +#define RHF_EGR_OFFSET_MASK 0xfffull +#define RHF_EGR_OFFSET_SMASK (RHF_EGR_OFFSET_MASK << RHF_EGR_OFFSET_SHIFT) +#define RHF_HDRQ_OFFSET_SHIFT 44 +#define RHF_HDRQ_OFFSET_MASK 0x1ffull +#define RHF_HDRQ_OFFSET_SMASK (RHF_HDRQ_OFFSET_MASK << RHF_HDRQ_OFFSET_SHIFT) +#define RHF_K_HDR_LEN_ERR (0x1ull << 53) +#define RHF_DC_UNC_ERR (0x1ull << 54) +#define RHF_DC_ERR (0x1ull << 55) +#define RHF_RCV_TYPE_ERR_SHIFT 56 +#define RHF_RCV_TYPE_ERR_MASK 0x7ul +#define RHF_RCV_TYPE_ERR_SMASK (RHF_RCV_TYPE_ERR_MASK << RHF_RCV_TYPE_ERR_SHIFT) +#define RHF_TID_ERR (0x1ull << 59) +#define RHF_LEN_ERR (0x1ull << 60) +#define RHF_ECC_ERR (0x1ull << 61) +#define RHF_VCRC_ERR (0x1ull << 62) +#define RHF_ICRC_ERR (0x1ull << 63) + +#define RHF_ERROR_SMASK 0xffe0000000000000ull /* bits 63:53 */ + +/* RHF receive types */ +#define RHF_RCV_TYPE_EXPECTED 0 +#define RHF_RCV_TYPE_EAGER 1 +#define RHF_RCV_TYPE_IB 2 /* normal IB, IB Raw, or IPv6 */ +#define RHF_RCV_TYPE_ERROR 3 +#define RHF_RCV_TYPE_BYPASS 4 +#define RHF_RCV_TYPE_INVALID5 5 +#define RHF_RCV_TYPE_INVALID6 6 +#define RHF_RCV_TYPE_INVALID7 7 + +/* RHF receive type error - expected packet errors */ +#define RHF_RTE_EXPECTED_FLOW_SEQ_ERR 0x2 +#define RHF_RTE_EXPECTED_FLOW_GEN_ERR 0x4 + +/* RHF receive type error - eager packet errors */ +#define RHF_RTE_EAGER_NO_ERR 0x0 + +/* RHF receive type error - IB packet errors */ +#define RHF_RTE_IB_NO_ERR 0x0 + +/* RHF receive type error - error packet errors */ +#define RHF_RTE_ERROR_NO_ERR 0x0 +#define RHF_RTE_ERROR_OP_CODE_ERR 0x1 +#define RHF_RTE_ERROR_KHDR_MIN_LEN_ERR 0x2 +#define RHF_RTE_ERROR_KHDR_HCRC_ERR 0x3 +#define RHF_RTE_ERROR_KHDR_KVER_ERR 0x4 +#define RHF_RTE_ERROR_CONTEXT_ERR 0x5 +#define RHF_RTE_ERROR_KHDR_TID_ERR 0x6 + +/* RHF receive type error - bypass packet errors */ +#define RHF_RTE_BYPASS_NO_ERR 0x0 + +/* + * This structure contains the first field common to all protocols + * that employ this chip. + */ +struct hfi1_message_header { + __be16 lrh[4]; +}; + +/* IB - LRH header constants */ +#define HFI1_LRH_GRH 0x0003 /* 1. word of IB LRH - next header: GRH */ +#define HFI1_LRH_BTH 0x0002 /* 1. word of IB LRH - next header: BTH */ + +/* misc. */ +#define SIZE_OF_CRC 1 + +#define LIM_MGMT_P_KEY 0x7FFF +#define FULL_MGMT_P_KEY 0xFFFF + +#define DEFAULT_P_KEY LIM_MGMT_P_KEY +#define HFI1_AETH_CREDIT_SHIFT 24 +#define HFI1_AETH_CREDIT_MASK 0x1F +#define HFI1_AETH_CREDIT_INVAL 0x1F +#define HFI1_MSN_MASK 0xFFFFFF +#define HFI1_FECN_SHIFT 31 +#define HFI1_FECN_MASK 1 +#define HFI1_FECN_SMASK BIT(HFI1_FECN_SHIFT) +#define HFI1_BECN_SHIFT 30 +#define HFI1_BECN_MASK 1 +#define HFI1_BECN_SMASK BIT(HFI1_BECN_SHIFT) + +#define HFI1_PSM_IOC_BASE_SEQ 0x0 + +static inline __u64 rhf_to_cpu(const __le32 *rbuf) +{ + return __le64_to_cpu(*((__le64 *)rbuf)); +} + +static inline u64 rhf_err_flags(u64 rhf) +{ + return rhf & RHF_ERROR_SMASK; +} + +static inline u32 rhf_rcv_type(u64 rhf) +{ + return (rhf >> RHF_RCV_TYPE_SHIFT) & RHF_RCV_TYPE_MASK; +} + +static inline u32 rhf_rcv_type_err(u64 rhf) +{ + return (rhf >> RHF_RCV_TYPE_ERR_SHIFT) & RHF_RCV_TYPE_ERR_MASK; +} + +/* return size is in bytes, not DWORDs */ +static inline u32 rhf_pkt_len(u64 rhf) +{ + return ((rhf & RHF_PKT_LEN_SMASK) >> RHF_PKT_LEN_SHIFT) << 2; +} + +static inline u32 rhf_egr_index(u64 rhf) +{ + return (rhf >> RHF_EGR_INDEX_SHIFT) & RHF_EGR_INDEX_MASK; +} + +static inline u32 rhf_rcv_seq(u64 rhf) +{ + return (rhf >> RHF_RCV_SEQ_SHIFT) & RHF_RCV_SEQ_MASK; +} + +/* returned offset is in DWORDS */ +static inline u32 rhf_hdrq_offset(u64 rhf) +{ + return (rhf >> RHF_HDRQ_OFFSET_SHIFT) & RHF_HDRQ_OFFSET_MASK; +} + +static inline u64 rhf_use_egr_bfr(u64 rhf) +{ + return rhf & RHF_USE_EGR_BFR_SMASK; +} + +static inline u64 rhf_dc_info(u64 rhf) +{ + return rhf & RHF_DC_INFO_SMASK; +} + +static inline u32 rhf_egr_buf_offset(u64 rhf) +{ + return (rhf >> RHF_EGR_OFFSET_SHIFT) & RHF_EGR_OFFSET_MASK; +} +#endif /* _COMMON_H */ diff --git a/drivers/infiniband/hw/hfi1/debugfs.c b/drivers/infiniband/hw/hfi1/debugfs.c new file mode 100644 index 000000000..dbab9d9cc --- /dev/null +++ b/drivers/infiniband/hw/hfi1/debugfs.c @@ -0,0 +1,1145 @@ +#ifdef CONFIG_DEBUG_FS +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#include <linux/debugfs.h> +#include <linux/seq_file.h> +#include <linux/kernel.h> +#include <linux/export.h> +#include <linux/module.h> + +#include "hfi.h" +#include "debugfs.h" +#include "device.h" +#include "qp.h" +#include "sdma.h" + +static struct dentry *hfi1_dbg_root; + +#define private2dd(file) (file_inode(file)->i_private) +#define private2ppd(file) (file_inode(file)->i_private) + +#define DEBUGFS_SEQ_FILE_OPS(name) \ +static const struct seq_operations _##name##_seq_ops = { \ + .start = _##name##_seq_start, \ + .next = _##name##_seq_next, \ + .stop = _##name##_seq_stop, \ + .show = _##name##_seq_show \ +} + +#define DEBUGFS_SEQ_FILE_OPEN(name) \ +static int _##name##_open(struct inode *inode, struct file *s) \ +{ \ + struct seq_file *seq; \ + int ret; \ + ret = seq_open(s, &_##name##_seq_ops); \ + if (ret) \ + return ret; \ + seq = s->private_data; \ + seq->private = inode->i_private; \ + return 0; \ +} + +#define DEBUGFS_FILE_OPS(name) \ +static const struct file_operations _##name##_file_ops = { \ + .owner = THIS_MODULE, \ + .open = _##name##_open, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = seq_release \ +} + +#define DEBUGFS_FILE_CREATE(name, parent, data, ops, mode) \ +do { \ + struct dentry *ent; \ + ent = debugfs_create_file(name, mode, parent, \ + data, ops); \ + if (!ent) \ + pr_warn("create of %s failed\n", name); \ +} while (0) + +#define DEBUGFS_SEQ_FILE_CREATE(name, parent, data) \ + DEBUGFS_FILE_CREATE(#name, parent, data, &_##name##_file_ops, S_IRUGO) + +static void *_opcode_stats_seq_start(struct seq_file *s, loff_t *pos) +__acquires(RCU) +{ + struct hfi1_opcode_stats_perctx *opstats; + + rcu_read_lock(); + if (*pos >= ARRAY_SIZE(opstats->stats)) + return NULL; + return pos; +} + +static void *_opcode_stats_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct hfi1_opcode_stats_perctx *opstats; + + ++*pos; + if (*pos >= ARRAY_SIZE(opstats->stats)) + return NULL; + return pos; +} + +static void _opcode_stats_seq_stop(struct seq_file *s, void *v) +__releases(RCU) +{ + rcu_read_unlock(); +} + +static int _opcode_stats_seq_show(struct seq_file *s, void *v) +{ + loff_t *spos = v; + loff_t i = *spos, j; + u64 n_packets = 0, n_bytes = 0; + struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private; + struct hfi1_devdata *dd = dd_from_dev(ibd); + + for (j = 0; j < dd->first_user_ctxt; j++) { + if (!dd->rcd[j]) + continue; + n_packets += dd->rcd[j]->opstats->stats[i].n_packets; + n_bytes += dd->rcd[j]->opstats->stats[i].n_bytes; + } + if (!n_packets && !n_bytes) + return SEQ_SKIP; + seq_printf(s, "%02llx %llu/%llu\n", i, + (unsigned long long)n_packets, + (unsigned long long)n_bytes); + + return 0; +} + +DEBUGFS_SEQ_FILE_OPS(opcode_stats); +DEBUGFS_SEQ_FILE_OPEN(opcode_stats) +DEBUGFS_FILE_OPS(opcode_stats); + +static void *_ctx_stats_seq_start(struct seq_file *s, loff_t *pos) +{ + struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private; + struct hfi1_devdata *dd = dd_from_dev(ibd); + + if (!*pos) + return SEQ_START_TOKEN; + if (*pos >= dd->first_user_ctxt) + return NULL; + return pos; +} + +static void *_ctx_stats_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private; + struct hfi1_devdata *dd = dd_from_dev(ibd); + + if (v == SEQ_START_TOKEN) + return pos; + + ++*pos; + if (*pos >= dd->first_user_ctxt) + return NULL; + return pos; +} + +static void _ctx_stats_seq_stop(struct seq_file *s, void *v) +{ + /* nothing allocated */ +} + +static int _ctx_stats_seq_show(struct seq_file *s, void *v) +{ + loff_t *spos; + loff_t i, j; + u64 n_packets = 0; + struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private; + struct hfi1_devdata *dd = dd_from_dev(ibd); + + if (v == SEQ_START_TOKEN) { + seq_puts(s, "Ctx:npkts\n"); + return 0; + } + + spos = v; + i = *spos; + + if (!dd->rcd[i]) + return SEQ_SKIP; + + for (j = 0; j < ARRAY_SIZE(dd->rcd[i]->opstats->stats); j++) + n_packets += dd->rcd[i]->opstats->stats[j].n_packets; + + if (!n_packets) + return SEQ_SKIP; + + seq_printf(s, " %llu:%llu\n", i, n_packets); + return 0; +} + +DEBUGFS_SEQ_FILE_OPS(ctx_stats); +DEBUGFS_SEQ_FILE_OPEN(ctx_stats) +DEBUGFS_FILE_OPS(ctx_stats); + +static void *_qp_stats_seq_start(struct seq_file *s, loff_t *pos) +__acquires(RCU) +{ + struct qp_iter *iter; + loff_t n = *pos; + + rcu_read_lock(); + iter = qp_iter_init(s->private); + if (!iter) + return NULL; + + while (n--) { + if (qp_iter_next(iter)) { + kfree(iter); + return NULL; + } + } + + return iter; +} + +static void *_qp_stats_seq_next(struct seq_file *s, void *iter_ptr, + loff_t *pos) +{ + struct qp_iter *iter = iter_ptr; + + (*pos)++; + + if (qp_iter_next(iter)) { + kfree(iter); + return NULL; + } + + return iter; +} + +static void _qp_stats_seq_stop(struct seq_file *s, void *iter_ptr) +__releases(RCU) +{ + rcu_read_unlock(); +} + +static int _qp_stats_seq_show(struct seq_file *s, void *iter_ptr) +{ + struct qp_iter *iter = iter_ptr; + + if (!iter) + return 0; + + qp_iter_print(s, iter); + + return 0; +} + +DEBUGFS_SEQ_FILE_OPS(qp_stats); +DEBUGFS_SEQ_FILE_OPEN(qp_stats) +DEBUGFS_FILE_OPS(qp_stats); + +static void *_sdes_seq_start(struct seq_file *s, loff_t *pos) +__acquires(RCU) +{ + struct hfi1_ibdev *ibd; + struct hfi1_devdata *dd; + + rcu_read_lock(); + ibd = (struct hfi1_ibdev *)s->private; + dd = dd_from_dev(ibd); + if (!dd->per_sdma || *pos >= dd->num_sdma) + return NULL; + return pos; +} + +static void *_sdes_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private; + struct hfi1_devdata *dd = dd_from_dev(ibd); + + ++*pos; + if (!dd->per_sdma || *pos >= dd->num_sdma) + return NULL; + return pos; +} + +static void _sdes_seq_stop(struct seq_file *s, void *v) +__releases(RCU) +{ + rcu_read_unlock(); +} + +static int _sdes_seq_show(struct seq_file *s, void *v) +{ + struct hfi1_ibdev *ibd = (struct hfi1_ibdev *)s->private; + struct hfi1_devdata *dd = dd_from_dev(ibd); + loff_t *spos = v; + loff_t i = *spos; + + sdma_seqfile_dump_sde(s, &dd->per_sdma[i]); + return 0; +} + +DEBUGFS_SEQ_FILE_OPS(sdes); +DEBUGFS_SEQ_FILE_OPEN(sdes) +DEBUGFS_FILE_OPS(sdes); + +/* read the per-device counters */ +static ssize_t dev_counters_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + u64 *counters; + size_t avail; + struct hfi1_devdata *dd; + ssize_t rval; + + rcu_read_lock(); + dd = private2dd(file); + avail = hfi1_read_cntrs(dd, NULL, &counters); + rval = simple_read_from_buffer(buf, count, ppos, counters, avail); + rcu_read_unlock(); + return rval; +} + +/* read the per-device counters */ +static ssize_t dev_names_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + char *names; + size_t avail; + struct hfi1_devdata *dd; + ssize_t rval; + + rcu_read_lock(); + dd = private2dd(file); + avail = hfi1_read_cntrs(dd, &names, NULL); + rval = simple_read_from_buffer(buf, count, ppos, names, avail); + rcu_read_unlock(); + return rval; +} + +struct counter_info { + char *name; + const struct file_operations ops; +}; + +/* + * Could use file_inode(file)->i_ino to figure out which file, + * instead of separate routine for each, but for now, this works... + */ + +/* read the per-port names (same for each port) */ +static ssize_t portnames_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + char *names; + size_t avail; + struct hfi1_devdata *dd; + ssize_t rval; + + rcu_read_lock(); + dd = private2dd(file); + avail = hfi1_read_portcntrs(dd->pport, &names, NULL); + rval = simple_read_from_buffer(buf, count, ppos, names, avail); + rcu_read_unlock(); + return rval; +} + +/* read the per-port counters */ +static ssize_t portcntrs_debugfs_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + u64 *counters; + size_t avail; + struct hfi1_pportdata *ppd; + ssize_t rval; + + rcu_read_lock(); + ppd = private2ppd(file); + avail = hfi1_read_portcntrs(ppd, NULL, &counters); + rval = simple_read_from_buffer(buf, count, ppos, counters, avail); + rcu_read_unlock(); + return rval; +} + +static void check_dyn_flag(u64 scratch0, char *p, int size, int *used, + int this_hfi, int hfi, u32 flag, const char *what) +{ + u32 mask; + + mask = flag << (hfi ? CR_DYN_SHIFT : 0); + if (scratch0 & mask) { + *used += scnprintf(p + *used, size - *used, + " 0x%08x - HFI%d %s in use, %s device\n", + mask, hfi, what, + this_hfi == hfi ? "this" : "other"); + } +} + +static ssize_t asic_flags_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct hfi1_pportdata *ppd; + struct hfi1_devdata *dd; + u64 scratch0; + char *tmp; + int ret = 0; + int size; + int used; + int i; + + rcu_read_lock(); + ppd = private2ppd(file); + dd = ppd->dd; + size = PAGE_SIZE; + used = 0; + tmp = kmalloc(size, GFP_KERNEL); + if (!tmp) { + rcu_read_unlock(); + return -ENOMEM; + } + + scratch0 = read_csr(dd, ASIC_CFG_SCRATCH); + used += scnprintf(tmp + used, size - used, + "Resource flags: 0x%016llx\n", scratch0); + + /* check permanent flag */ + if (scratch0 & CR_THERM_INIT) { + used += scnprintf(tmp + used, size - used, + " 0x%08x - thermal monitoring initialized\n", + (u32)CR_THERM_INIT); + } + + /* check each dynamic flag on each HFI */ + for (i = 0; i < 2; i++) { + check_dyn_flag(scratch0, tmp, size, &used, dd->hfi1_id, i, + CR_SBUS, "SBus"); + check_dyn_flag(scratch0, tmp, size, &used, dd->hfi1_id, i, + CR_EPROM, "EPROM"); + check_dyn_flag(scratch0, tmp, size, &used, dd->hfi1_id, i, + CR_I2C1, "i2c chain 1"); + check_dyn_flag(scratch0, tmp, size, &used, dd->hfi1_id, i, + CR_I2C2, "i2c chain 2"); + } + used += scnprintf(tmp + used, size - used, "Write bits to clear\n"); + + ret = simple_read_from_buffer(buf, count, ppos, tmp, used); + rcu_read_unlock(); + kfree(tmp); + return ret; +} + +static ssize_t asic_flags_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct hfi1_pportdata *ppd; + struct hfi1_devdata *dd; + char *buff; + int ret; + unsigned long long value; + u64 scratch0; + u64 clear; + + rcu_read_lock(); + ppd = private2ppd(file); + dd = ppd->dd; + + buff = kmalloc(count + 1, GFP_KERNEL); + if (!buff) { + ret = -ENOMEM; + goto do_return; + } + + ret = copy_from_user(buff, buf, count); + if (ret > 0) { + ret = -EFAULT; + goto do_free; + } + + /* zero terminate and read the expected integer */ + buff[count] = 0; + ret = kstrtoull(buff, 0, &value); + if (ret) + goto do_free; + clear = value; + + /* obtain exclusive access */ + mutex_lock(&dd->asic_data->asic_resource_mutex); + acquire_hw_mutex(dd); + + scratch0 = read_csr(dd, ASIC_CFG_SCRATCH); + scratch0 &= ~clear; + write_csr(dd, ASIC_CFG_SCRATCH, scratch0); + /* force write to be visible to other HFI on another OS */ + (void)read_csr(dd, ASIC_CFG_SCRATCH); + + release_hw_mutex(dd); + mutex_unlock(&dd->asic_data->asic_resource_mutex); + + /* return the number of bytes written */ + ret = count; + + do_free: + kfree(buff); + do_return: + rcu_read_unlock(); + return ret; +} + +/* + * read the per-port QSFP data for ppd + */ +static ssize_t qsfp_debugfs_dump(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct hfi1_pportdata *ppd; + char *tmp; + int ret; + + rcu_read_lock(); + ppd = private2ppd(file); + tmp = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!tmp) { + rcu_read_unlock(); + return -ENOMEM; + } + + ret = qsfp_dump(ppd, tmp, PAGE_SIZE); + if (ret > 0) + ret = simple_read_from_buffer(buf, count, ppos, tmp, ret); + rcu_read_unlock(); + kfree(tmp); + return ret; +} + +/* Do an i2c write operation on the chain for the given HFI. */ +static ssize_t __i2c_debugfs_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos, u32 target) +{ + struct hfi1_pportdata *ppd; + char *buff; + int ret; + int i2c_addr; + int offset; + int total_written; + + rcu_read_lock(); + ppd = private2ppd(file); + + /* byte offset format: [offsetSize][i2cAddr][offsetHigh][offsetLow] */ + i2c_addr = (*ppos >> 16) & 0xffff; + offset = *ppos & 0xffff; + + /* explicitly reject invalid address 0 to catch cp and cat */ + if (i2c_addr == 0) { + ret = -EINVAL; + goto _return; + } + + buff = kmalloc(count, GFP_KERNEL); + if (!buff) { + ret = -ENOMEM; + goto _return; + } + + ret = copy_from_user(buff, buf, count); + if (ret > 0) { + ret = -EFAULT; + goto _free; + } + + total_written = i2c_write(ppd, target, i2c_addr, offset, buff, count); + if (total_written < 0) { + ret = total_written; + goto _free; + } + + *ppos += total_written; + + ret = total_written; + + _free: + kfree(buff); + _return: + rcu_read_unlock(); + return ret; +} + +/* Do an i2c write operation on chain for HFI 0. */ +static ssize_t i2c1_debugfs_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + return __i2c_debugfs_write(file, buf, count, ppos, 0); +} + +/* Do an i2c write operation on chain for HFI 1. */ +static ssize_t i2c2_debugfs_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + return __i2c_debugfs_write(file, buf, count, ppos, 1); +} + +/* Do an i2c read operation on the chain for the given HFI. */ +static ssize_t __i2c_debugfs_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos, u32 target) +{ + struct hfi1_pportdata *ppd; + char *buff; + int ret; + int i2c_addr; + int offset; + int total_read; + + rcu_read_lock(); + ppd = private2ppd(file); + + /* byte offset format: [offsetSize][i2cAddr][offsetHigh][offsetLow] */ + i2c_addr = (*ppos >> 16) & 0xffff; + offset = *ppos & 0xffff; + + /* explicitly reject invalid address 0 to catch cp and cat */ + if (i2c_addr == 0) { + ret = -EINVAL; + goto _return; + } + + buff = kmalloc(count, GFP_KERNEL); + if (!buff) { + ret = -ENOMEM; + goto _return; + } + + total_read = i2c_read(ppd, target, i2c_addr, offset, buff, count); + if (total_read < 0) { + ret = total_read; + goto _free; + } + + *ppos += total_read; + + ret = copy_to_user(buf, buff, total_read); + if (ret > 0) { + ret = -EFAULT; + goto _free; + } + + ret = total_read; + + _free: + kfree(buff); + _return: + rcu_read_unlock(); + return ret; +} + +/* Do an i2c read operation on chain for HFI 0. */ +static ssize_t i2c1_debugfs_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return __i2c_debugfs_read(file, buf, count, ppos, 0); +} + +/* Do an i2c read operation on chain for HFI 1. */ +static ssize_t i2c2_debugfs_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return __i2c_debugfs_read(file, buf, count, ppos, 1); +} + +/* Do a QSFP write operation on the i2c chain for the given HFI. */ +static ssize_t __qsfp_debugfs_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos, u32 target) +{ + struct hfi1_pportdata *ppd; + char *buff; + int ret; + int total_written; + + rcu_read_lock(); + if (*ppos + count > QSFP_PAGESIZE * 4) { /* base page + page00-page03 */ + ret = -EINVAL; + goto _return; + } + + ppd = private2ppd(file); + + buff = kmalloc(count, GFP_KERNEL); + if (!buff) { + ret = -ENOMEM; + goto _return; + } + + ret = copy_from_user(buff, buf, count); + if (ret > 0) { + ret = -EFAULT; + goto _free; + } + + total_written = qsfp_write(ppd, target, *ppos, buff, count); + if (total_written < 0) { + ret = total_written; + goto _free; + } + + *ppos += total_written; + + ret = total_written; + + _free: + kfree(buff); + _return: + rcu_read_unlock(); + return ret; +} + +/* Do a QSFP write operation on i2c chain for HFI 0. */ +static ssize_t qsfp1_debugfs_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + return __qsfp_debugfs_write(file, buf, count, ppos, 0); +} + +/* Do a QSFP write operation on i2c chain for HFI 1. */ +static ssize_t qsfp2_debugfs_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + return __qsfp_debugfs_write(file, buf, count, ppos, 1); +} + +/* Do a QSFP read operation on the i2c chain for the given HFI. */ +static ssize_t __qsfp_debugfs_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos, u32 target) +{ + struct hfi1_pportdata *ppd; + char *buff; + int ret; + int total_read; + + rcu_read_lock(); + if (*ppos + count > QSFP_PAGESIZE * 4) { /* base page + page00-page03 */ + ret = -EINVAL; + goto _return; + } + + ppd = private2ppd(file); + + buff = kmalloc(count, GFP_KERNEL); + if (!buff) { + ret = -ENOMEM; + goto _return; + } + + total_read = qsfp_read(ppd, target, *ppos, buff, count); + if (total_read < 0) { + ret = total_read; + goto _free; + } + + *ppos += total_read; + + ret = copy_to_user(buf, buff, total_read); + if (ret > 0) { + ret = -EFAULT; + goto _free; + } + + ret = total_read; + + _free: + kfree(buff); + _return: + rcu_read_unlock(); + return ret; +} + +/* Do a QSFP read operation on i2c chain for HFI 0. */ +static ssize_t qsfp1_debugfs_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return __qsfp_debugfs_read(file, buf, count, ppos, 0); +} + +/* Do a QSFP read operation on i2c chain for HFI 1. */ +static ssize_t qsfp2_debugfs_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return __qsfp_debugfs_read(file, buf, count, ppos, 1); +} + +static int __i2c_debugfs_open(struct inode *in, struct file *fp, u32 target) +{ + struct hfi1_pportdata *ppd; + int ret; + + if (!try_module_get(THIS_MODULE)) + return -ENODEV; + + ppd = private2ppd(fp); + + ret = acquire_chip_resource(ppd->dd, i2c_target(target), 0); + if (ret) /* failed - release the module */ + module_put(THIS_MODULE); + + return ret; +} + +static int i2c1_debugfs_open(struct inode *in, struct file *fp) +{ + return __i2c_debugfs_open(in, fp, 0); +} + +static int i2c2_debugfs_open(struct inode *in, struct file *fp) +{ + return __i2c_debugfs_open(in, fp, 1); +} + +static int __i2c_debugfs_release(struct inode *in, struct file *fp, u32 target) +{ + struct hfi1_pportdata *ppd; + + ppd = private2ppd(fp); + + release_chip_resource(ppd->dd, i2c_target(target)); + module_put(THIS_MODULE); + + return 0; +} + +static int i2c1_debugfs_release(struct inode *in, struct file *fp) +{ + return __i2c_debugfs_release(in, fp, 0); +} + +static int i2c2_debugfs_release(struct inode *in, struct file *fp) +{ + return __i2c_debugfs_release(in, fp, 1); +} + +static int __qsfp_debugfs_open(struct inode *in, struct file *fp, u32 target) +{ + struct hfi1_pportdata *ppd; + int ret; + + if (!try_module_get(THIS_MODULE)) + return -ENODEV; + + ppd = private2ppd(fp); + + ret = acquire_chip_resource(ppd->dd, i2c_target(target), 0); + if (ret) /* failed - release the module */ + module_put(THIS_MODULE); + + return ret; +} + +static int qsfp1_debugfs_open(struct inode *in, struct file *fp) +{ + return __qsfp_debugfs_open(in, fp, 0); +} + +static int qsfp2_debugfs_open(struct inode *in, struct file *fp) +{ + return __qsfp_debugfs_open(in, fp, 1); +} + +static int __qsfp_debugfs_release(struct inode *in, struct file *fp, u32 target) +{ + struct hfi1_pportdata *ppd; + + ppd = private2ppd(fp); + + release_chip_resource(ppd->dd, i2c_target(target)); + module_put(THIS_MODULE); + + return 0; +} + +static int qsfp1_debugfs_release(struct inode *in, struct file *fp) +{ + return __qsfp_debugfs_release(in, fp, 0); +} + +static int qsfp2_debugfs_release(struct inode *in, struct file *fp) +{ + return __qsfp_debugfs_release(in, fp, 1); +} + +#define DEBUGFS_OPS(nm, readroutine, writeroutine) \ +{ \ + .name = nm, \ + .ops = { \ + .read = readroutine, \ + .write = writeroutine, \ + .llseek = generic_file_llseek, \ + }, \ +} + +#define DEBUGFS_XOPS(nm, readf, writef, openf, releasef) \ +{ \ + .name = nm, \ + .ops = { \ + .read = readf, \ + .write = writef, \ + .llseek = generic_file_llseek, \ + .open = openf, \ + .release = releasef \ + }, \ +} + +static const struct counter_info cntr_ops[] = { + DEBUGFS_OPS("counter_names", dev_names_read, NULL), + DEBUGFS_OPS("counters", dev_counters_read, NULL), + DEBUGFS_OPS("portcounter_names", portnames_read, NULL), +}; + +static const struct counter_info port_cntr_ops[] = { + DEBUGFS_OPS("port%dcounters", portcntrs_debugfs_read, NULL), + DEBUGFS_XOPS("i2c1", i2c1_debugfs_read, i2c1_debugfs_write, + i2c1_debugfs_open, i2c1_debugfs_release), + DEBUGFS_XOPS("i2c2", i2c2_debugfs_read, i2c2_debugfs_write, + i2c2_debugfs_open, i2c2_debugfs_release), + DEBUGFS_OPS("qsfp_dump%d", qsfp_debugfs_dump, NULL), + DEBUGFS_XOPS("qsfp1", qsfp1_debugfs_read, qsfp1_debugfs_write, + qsfp1_debugfs_open, qsfp1_debugfs_release), + DEBUGFS_XOPS("qsfp2", qsfp2_debugfs_read, qsfp2_debugfs_write, + qsfp2_debugfs_open, qsfp2_debugfs_release), + DEBUGFS_OPS("asic_flags", asic_flags_read, asic_flags_write), +}; + +void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd) +{ + char name[sizeof("port0counters") + 1]; + char link[10]; + struct hfi1_devdata *dd = dd_from_dev(ibd); + struct hfi1_pportdata *ppd; + int unit = dd->unit; + int i, j; + + if (!hfi1_dbg_root) + return; + snprintf(name, sizeof(name), "%s_%d", class_name(), unit); + snprintf(link, sizeof(link), "%d", unit); + ibd->hfi1_ibdev_dbg = debugfs_create_dir(name, hfi1_dbg_root); + if (!ibd->hfi1_ibdev_dbg) { + pr_warn("create of %s failed\n", name); + return; + } + ibd->hfi1_ibdev_link = + debugfs_create_symlink(link, hfi1_dbg_root, name); + if (!ibd->hfi1_ibdev_link) { + pr_warn("create of %s symlink failed\n", name); + return; + } + DEBUGFS_SEQ_FILE_CREATE(opcode_stats, ibd->hfi1_ibdev_dbg, ibd); + DEBUGFS_SEQ_FILE_CREATE(ctx_stats, ibd->hfi1_ibdev_dbg, ibd); + DEBUGFS_SEQ_FILE_CREATE(qp_stats, ibd->hfi1_ibdev_dbg, ibd); + DEBUGFS_SEQ_FILE_CREATE(sdes, ibd->hfi1_ibdev_dbg, ibd); + /* dev counter files */ + for (i = 0; i < ARRAY_SIZE(cntr_ops); i++) + DEBUGFS_FILE_CREATE(cntr_ops[i].name, + ibd->hfi1_ibdev_dbg, + dd, + &cntr_ops[i].ops, S_IRUGO); + /* per port files */ + for (ppd = dd->pport, j = 0; j < dd->num_pports; j++, ppd++) + for (i = 0; i < ARRAY_SIZE(port_cntr_ops); i++) { + snprintf(name, + sizeof(name), + port_cntr_ops[i].name, + j + 1); + DEBUGFS_FILE_CREATE(name, + ibd->hfi1_ibdev_dbg, + ppd, + &port_cntr_ops[i].ops, + !port_cntr_ops[i].ops.write ? + S_IRUGO : S_IRUGO | S_IWUSR); + } +} + +void hfi1_dbg_ibdev_exit(struct hfi1_ibdev *ibd) +{ + if (!hfi1_dbg_root) + goto out; + debugfs_remove(ibd->hfi1_ibdev_link); + debugfs_remove_recursive(ibd->hfi1_ibdev_dbg); +out: + ibd->hfi1_ibdev_dbg = NULL; + synchronize_rcu(); +} + +/* + * driver stats field names, one line per stat, single string. Used by + * programs like hfistats to print the stats in a way which works for + * different versions of drivers, without changing program source. + * if hfi1_ib_stats changes, this needs to change. Names need to be + * 12 chars or less (w/o newline), for proper display by hfistats utility. + */ +static const char * const hfi1_statnames[] = { + /* must be element 0*/ + "KernIntr", + "ErrorIntr", + "Tx_Errs", + "Rcv_Errs", + "H/W_Errs", + "NoPIOBufs", + "CtxtsOpen", + "RcvLen_Errs", + "EgrBufFull", + "EgrHdrFull" +}; + +static void *_driver_stats_names_seq_start(struct seq_file *s, loff_t *pos) +__acquires(RCU) +{ + rcu_read_lock(); + if (*pos >= ARRAY_SIZE(hfi1_statnames)) + return NULL; + return pos; +} + +static void *_driver_stats_names_seq_next( + struct seq_file *s, + void *v, + loff_t *pos) +{ + ++*pos; + if (*pos >= ARRAY_SIZE(hfi1_statnames)) + return NULL; + return pos; +} + +static void _driver_stats_names_seq_stop(struct seq_file *s, void *v) +__releases(RCU) +{ + rcu_read_unlock(); +} + +static int _driver_stats_names_seq_show(struct seq_file *s, void *v) +{ + loff_t *spos = v; + + seq_printf(s, "%s\n", hfi1_statnames[*spos]); + return 0; +} + +DEBUGFS_SEQ_FILE_OPS(driver_stats_names); +DEBUGFS_SEQ_FILE_OPEN(driver_stats_names) +DEBUGFS_FILE_OPS(driver_stats_names); + +static void *_driver_stats_seq_start(struct seq_file *s, loff_t *pos) +__acquires(RCU) +{ + rcu_read_lock(); + if (*pos >= ARRAY_SIZE(hfi1_statnames)) + return NULL; + return pos; +} + +static void *_driver_stats_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + ++*pos; + if (*pos >= ARRAY_SIZE(hfi1_statnames)) + return NULL; + return pos; +} + +static void _driver_stats_seq_stop(struct seq_file *s, void *v) +__releases(RCU) +{ + rcu_read_unlock(); +} + +static u64 hfi1_sps_ints(void) +{ + unsigned long flags; + struct hfi1_devdata *dd; + u64 sps_ints = 0; + + spin_lock_irqsave(&hfi1_devs_lock, flags); + list_for_each_entry(dd, &hfi1_dev_list, list) { + sps_ints += get_all_cpu_total(dd->int_counter); + } + spin_unlock_irqrestore(&hfi1_devs_lock, flags); + return sps_ints; +} + +static int _driver_stats_seq_show(struct seq_file *s, void *v) +{ + loff_t *spos = v; + char *buffer; + u64 *stats = (u64 *)&hfi1_stats; + size_t sz = seq_get_buf(s, &buffer); + + if (sz < sizeof(u64)) + return SEQ_SKIP; + /* special case for interrupts */ + if (*spos == 0) + *(u64 *)buffer = hfi1_sps_ints(); + else + *(u64 *)buffer = stats[*spos]; + seq_commit(s, sizeof(u64)); + return 0; +} + +DEBUGFS_SEQ_FILE_OPS(driver_stats); +DEBUGFS_SEQ_FILE_OPEN(driver_stats) +DEBUGFS_FILE_OPS(driver_stats); + +void hfi1_dbg_init(void) +{ + hfi1_dbg_root = debugfs_create_dir(DRIVER_NAME, NULL); + if (!hfi1_dbg_root) + pr_warn("init of debugfs failed\n"); + DEBUGFS_SEQ_FILE_CREATE(driver_stats_names, hfi1_dbg_root, NULL); + DEBUGFS_SEQ_FILE_CREATE(driver_stats, hfi1_dbg_root, NULL); +} + +void hfi1_dbg_exit(void) +{ + debugfs_remove_recursive(hfi1_dbg_root); + hfi1_dbg_root = NULL; +} + +#endif diff --git a/drivers/infiniband/hw/hfi1/debugfs.h b/drivers/infiniband/hw/hfi1/debugfs.h new file mode 100644 index 000000000..b6fb6814f --- /dev/null +++ b/drivers/infiniband/hw/hfi1/debugfs.h @@ -0,0 +1,75 @@ +#ifndef _HFI1_DEBUGFS_H +#define _HFI1_DEBUGFS_H +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +struct hfi1_ibdev; +#ifdef CONFIG_DEBUG_FS +void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd); +void hfi1_dbg_ibdev_exit(struct hfi1_ibdev *ibd); +void hfi1_dbg_init(void); +void hfi1_dbg_exit(void); +#else +static inline void hfi1_dbg_ibdev_init(struct hfi1_ibdev *ibd) +{ +} + +void hfi1_dbg_ibdev_exit(struct hfi1_ibdev *ibd) +{ +} + +void hfi1_dbg_init(void) +{ +} + +void hfi1_dbg_exit(void) +{ +} + +#endif + +#endif /* _HFI1_DEBUGFS_H */ diff --git a/drivers/infiniband/hw/hfi1/device.c b/drivers/infiniband/hw/hfi1/device.c new file mode 100644 index 000000000..bf64b5a7b --- /dev/null +++ b/drivers/infiniband/hw/hfi1/device.c @@ -0,0 +1,183 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <linux/cdev.h> +#include <linux/module.h> +#include <linux/device.h> +#include <linux/fs.h> + +#include "hfi.h" +#include "device.h" + +static struct class *class; +static struct class *user_class; +static dev_t hfi1_dev; + +int hfi1_cdev_init(int minor, const char *name, + const struct file_operations *fops, + struct cdev *cdev, struct device **devp, + bool user_accessible, + struct kobject *parent) +{ + const dev_t dev = MKDEV(MAJOR(hfi1_dev), minor); + struct device *device = NULL; + int ret; + + cdev_init(cdev, fops); + cdev->owner = THIS_MODULE; + cdev->kobj.parent = parent; + kobject_set_name(&cdev->kobj, name); + + ret = cdev_add(cdev, dev, 1); + if (ret < 0) { + pr_err("Could not add cdev for minor %d, %s (err %d)\n", + minor, name, -ret); + goto done; + } + + if (user_accessible) + device = device_create(user_class, NULL, dev, NULL, "%s", name); + else + device = device_create(class, NULL, dev, NULL, "%s", name); + + if (IS_ERR(device)) { + ret = PTR_ERR(device); + device = NULL; + pr_err("Could not create device for minor %d, %s (err %d)\n", + minor, name, -ret); + cdev_del(cdev); + } +done: + *devp = device; + return ret; +} + +void hfi1_cdev_cleanup(struct cdev *cdev, struct device **devp) +{ + struct device *device = *devp; + + if (device) { + device_unregister(device); + *devp = NULL; + + cdev_del(cdev); + } +} + +static const char *hfi1_class_name = "hfi1"; + +const char *class_name(void) +{ + return hfi1_class_name; +} + +static char *hfi1_devnode(struct device *dev, umode_t *mode) +{ + if (mode) + *mode = 0600; + return kasprintf(GFP_KERNEL, "%s", dev_name(dev)); +} + +static const char *hfi1_class_name_user = "hfi1_user"; +static const char *class_name_user(void) +{ + return hfi1_class_name_user; +} + +static char *hfi1_user_devnode(struct device *dev, umode_t *mode) +{ + if (mode) + *mode = 0666; + return kasprintf(GFP_KERNEL, "%s", dev_name(dev)); +} + +int __init dev_init(void) +{ + int ret; + + ret = alloc_chrdev_region(&hfi1_dev, 0, HFI1_NMINORS, DRIVER_NAME); + if (ret < 0) { + pr_err("Could not allocate chrdev region (err %d)\n", -ret); + goto done; + } + + class = class_create(THIS_MODULE, class_name()); + if (IS_ERR(class)) { + ret = PTR_ERR(class); + pr_err("Could not create device class (err %d)\n", -ret); + unregister_chrdev_region(hfi1_dev, HFI1_NMINORS); + goto done; + } + class->devnode = hfi1_devnode; + + user_class = class_create(THIS_MODULE, class_name_user()); + if (IS_ERR(user_class)) { + ret = PTR_ERR(user_class); + pr_err("Could not create device class for user accessible files (err %d)\n", + -ret); + class_destroy(class); + class = NULL; + user_class = NULL; + unregister_chrdev_region(hfi1_dev, HFI1_NMINORS); + goto done; + } + user_class->devnode = hfi1_user_devnode; + +done: + return ret; +} + +void dev_cleanup(void) +{ + class_destroy(class); + class = NULL; + + class_destroy(user_class); + user_class = NULL; + + unregister_chrdev_region(hfi1_dev, HFI1_NMINORS); +} diff --git a/drivers/infiniband/hw/hfi1/device.h b/drivers/infiniband/hw/hfi1/device.h new file mode 100644 index 000000000..c3ec19cb0 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/device.h @@ -0,0 +1,60 @@ +#ifndef _HFI1_DEVICE_H +#define _HFI1_DEVICE_H +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +int hfi1_cdev_init(int minor, const char *name, + const struct file_operations *fops, + struct cdev *cdev, struct device **devp, + bool user_accessible, + struct kobject *parent); +void hfi1_cdev_cleanup(struct cdev *cdev, struct device **devp); +const char *class_name(void); +int __init dev_init(void); +void dev_cleanup(void); + +#endif /* _HFI1_DEVICE_H */ diff --git a/drivers/infiniband/hw/hfi1/dma.c b/drivers/infiniband/hw/hfi1/dma.c new file mode 100644 index 000000000..7e8dab892 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/dma.c @@ -0,0 +1,183 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#include <linux/types.h> +#include <linux/scatterlist.h> + +#include "verbs.h" + +#define BAD_DMA_ADDRESS ((u64)0) + +/* + * The following functions implement driver specific replacements + * for the ib_dma_*() functions. + * + * These functions return kernel virtual addresses instead of + * device bus addresses since the driver uses the CPU to copy + * data instead of using hardware DMA. + */ + +static int hfi1_mapping_error(struct ib_device *dev, u64 dma_addr) +{ + return dma_addr == BAD_DMA_ADDRESS; +} + +static u64 hfi1_dma_map_single(struct ib_device *dev, void *cpu_addr, + size_t size, enum dma_data_direction direction) +{ + if (WARN_ON(!valid_dma_direction(direction))) + return BAD_DMA_ADDRESS; + + return (u64)cpu_addr; +} + +static void hfi1_dma_unmap_single(struct ib_device *dev, u64 addr, size_t size, + enum dma_data_direction direction) +{ + /* This is a stub, nothing to be done here */ +} + +static u64 hfi1_dma_map_page(struct ib_device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction direction) +{ + u64 addr; + + if (WARN_ON(!valid_dma_direction(direction))) + return BAD_DMA_ADDRESS; + + if (offset + size > PAGE_SIZE) + return BAD_DMA_ADDRESS; + + addr = (u64)page_address(page); + if (addr) + addr += offset; + + return addr; +} + +static void hfi1_dma_unmap_page(struct ib_device *dev, u64 addr, size_t size, + enum dma_data_direction direction) +{ + /* This is a stub, nothing to be done here */ +} + +static int hfi1_map_sg(struct ib_device *dev, struct scatterlist *sgl, + int nents, enum dma_data_direction direction) +{ + struct scatterlist *sg; + u64 addr; + int i; + int ret = nents; + + if (WARN_ON(!valid_dma_direction(direction))) + return BAD_DMA_ADDRESS; + + for_each_sg(sgl, sg, nents, i) { + addr = (u64)page_address(sg_page(sg)); + if (!addr) { + ret = 0; + break; + } + sg->dma_address = addr + sg->offset; +#ifdef CONFIG_NEED_SG_DMA_LENGTH + sg->dma_length = sg->length; +#endif + } + return ret; +} + +static void hfi1_unmap_sg(struct ib_device *dev, + struct scatterlist *sg, int nents, + enum dma_data_direction direction) +{ + /* This is a stub, nothing to be done here */ +} + +static void hfi1_sync_single_for_cpu(struct ib_device *dev, u64 addr, + size_t size, enum dma_data_direction dir) +{ +} + +static void hfi1_sync_single_for_device(struct ib_device *dev, u64 addr, + size_t size, + enum dma_data_direction dir) +{ +} + +static void *hfi1_dma_alloc_coherent(struct ib_device *dev, size_t size, + u64 *dma_handle, gfp_t flag) +{ + struct page *p; + void *addr = NULL; + + p = alloc_pages(flag, get_order(size)); + if (p) + addr = page_address(p); + if (dma_handle) + *dma_handle = (u64)addr; + return addr; +} + +static void hfi1_dma_free_coherent(struct ib_device *dev, size_t size, + void *cpu_addr, u64 dma_handle) +{ + free_pages((unsigned long)cpu_addr, get_order(size)); +} + +struct ib_dma_mapping_ops hfi1_dma_mapping_ops = { + .mapping_error = hfi1_mapping_error, + .map_single = hfi1_dma_map_single, + .unmap_single = hfi1_dma_unmap_single, + .map_page = hfi1_dma_map_page, + .unmap_page = hfi1_dma_unmap_page, + .map_sg = hfi1_map_sg, + .unmap_sg = hfi1_unmap_sg, + .sync_single_for_cpu = hfi1_sync_single_for_cpu, + .sync_single_for_device = hfi1_sync_single_for_device, + .alloc_coherent = hfi1_dma_alloc_coherent, + .free_coherent = hfi1_dma_free_coherent +}; diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c new file mode 100644 index 000000000..c75b0ae68 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/driver.c @@ -0,0 +1,1404 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <linux/spinlock.h> +#include <linux/pci.h> +#include <linux/io.h> +#include <linux/delay.h> +#include <linux/netdevice.h> +#include <linux/vmalloc.h> +#include <linux/module.h> +#include <linux/prefetch.h> +#include <rdma/ib_verbs.h> + +#include "hfi.h" +#include "trace.h" +#include "qp.h" +#include "sdma.h" + +#undef pr_fmt +#define pr_fmt(fmt) DRIVER_NAME ": " fmt + +/* + * The size has to be longer than this string, so we can append + * board/chip information to it in the initialization code. + */ +const char ib_hfi1_version[] = HFI1_DRIVER_VERSION "\n"; + +DEFINE_SPINLOCK(hfi1_devs_lock); +LIST_HEAD(hfi1_dev_list); +DEFINE_MUTEX(hfi1_mutex); /* general driver use */ + +unsigned int hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU; +module_param_named(max_mtu, hfi1_max_mtu, uint, S_IRUGO); +MODULE_PARM_DESC(max_mtu, "Set max MTU bytes, default is " __stringify( + HFI1_DEFAULT_MAX_MTU)); + +unsigned int hfi1_cu = 1; +module_param_named(cu, hfi1_cu, uint, S_IRUGO); +MODULE_PARM_DESC(cu, "Credit return units"); + +unsigned long hfi1_cap_mask = HFI1_CAP_MASK_DEFAULT; +static int hfi1_caps_set(const char *, const struct kernel_param *); +static int hfi1_caps_get(char *, const struct kernel_param *); +static const struct kernel_param_ops cap_ops = { + .set = hfi1_caps_set, + .get = hfi1_caps_get +}; +module_param_cb(cap_mask, &cap_ops, &hfi1_cap_mask, S_IWUSR | S_IRUGO); +MODULE_PARM_DESC(cap_mask, "Bit mask of enabled/disabled HW features"); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("Intel Omni-Path Architecture driver"); +MODULE_VERSION(HFI1_DRIVER_VERSION); + +/* + * MAX_PKT_RCV is the max # if packets processed per receive interrupt. + */ +#define MAX_PKT_RECV 64 +#define EGR_HEAD_UPDATE_THRESHOLD 16 + +struct hfi1_ib_stats hfi1_stats; + +static int hfi1_caps_set(const char *val, const struct kernel_param *kp) +{ + int ret = 0; + unsigned long *cap_mask_ptr = (unsigned long *)kp->arg, + cap_mask = *cap_mask_ptr, value, diff, + write_mask = ((HFI1_CAP_WRITABLE_MASK << HFI1_CAP_USER_SHIFT) | + HFI1_CAP_WRITABLE_MASK); + + ret = kstrtoul(val, 0, &value); + if (ret) { + pr_warn("Invalid module parameter value for 'cap_mask'\n"); + goto done; + } + /* Get the changed bits (except the locked bit) */ + diff = value ^ (cap_mask & ~HFI1_CAP_LOCKED_SMASK); + + /* Remove any bits that are not allowed to change after driver load */ + if (HFI1_CAP_LOCKED() && (diff & ~write_mask)) { + pr_warn("Ignoring non-writable capability bits %#lx\n", + diff & ~write_mask); + diff &= write_mask; + } + + /* Mask off any reserved bits */ + diff &= ~HFI1_CAP_RESERVED_MASK; + /* Clear any previously set and changing bits */ + cap_mask &= ~diff; + /* Update the bits with the new capability */ + cap_mask |= (value & diff); + /* Check for any kernel/user restrictions */ + diff = (cap_mask & (HFI1_CAP_MUST_HAVE_KERN << HFI1_CAP_USER_SHIFT)) ^ + ((cap_mask & HFI1_CAP_MUST_HAVE_KERN) << HFI1_CAP_USER_SHIFT); + cap_mask &= ~diff; + /* Set the bitmask to the final set */ + *cap_mask_ptr = cap_mask; +done: + return ret; +} + +static int hfi1_caps_get(char *buffer, const struct kernel_param *kp) +{ + unsigned long cap_mask = *(unsigned long *)kp->arg; + + cap_mask &= ~HFI1_CAP_LOCKED_SMASK; + cap_mask |= ((cap_mask & HFI1_CAP_K2U) << HFI1_CAP_USER_SHIFT); + + return scnprintf(buffer, PAGE_SIZE, "0x%lx", cap_mask); +} + +const char *get_unit_name(int unit) +{ + static char iname[16]; + + snprintf(iname, sizeof(iname), DRIVER_NAME "_%u", unit); + return iname; +} + +const char *get_card_name(struct rvt_dev_info *rdi) +{ + struct hfi1_ibdev *ibdev = container_of(rdi, struct hfi1_ibdev, rdi); + struct hfi1_devdata *dd = container_of(ibdev, + struct hfi1_devdata, verbs_dev); + return get_unit_name(dd->unit); +} + +struct pci_dev *get_pci_dev(struct rvt_dev_info *rdi) +{ + struct hfi1_ibdev *ibdev = container_of(rdi, struct hfi1_ibdev, rdi); + struct hfi1_devdata *dd = container_of(ibdev, + struct hfi1_devdata, verbs_dev); + return dd->pcidev; +} + +/* + * Return count of units with at least one port ACTIVE. + */ +int hfi1_count_active_units(void) +{ + struct hfi1_devdata *dd; + struct hfi1_pportdata *ppd; + unsigned long flags; + int pidx, nunits_active = 0; + + spin_lock_irqsave(&hfi1_devs_lock, flags); + list_for_each_entry(dd, &hfi1_dev_list, list) { + if (!(dd->flags & HFI1_PRESENT) || !dd->kregbase) + continue; + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + if (ppd->lid && ppd->linkup) { + nunits_active++; + break; + } + } + } + spin_unlock_irqrestore(&hfi1_devs_lock, flags); + return nunits_active; +} + +/* + * Return count of all units, optionally return in arguments + * the number of usable (present) units, and the number of + * ports that are up. + */ +int hfi1_count_units(int *npresentp, int *nupp) +{ + int nunits = 0, npresent = 0, nup = 0; + struct hfi1_devdata *dd; + unsigned long flags; + int pidx; + struct hfi1_pportdata *ppd; + + spin_lock_irqsave(&hfi1_devs_lock, flags); + + list_for_each_entry(dd, &hfi1_dev_list, list) { + nunits++; + if ((dd->flags & HFI1_PRESENT) && dd->kregbase) + npresent++; + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + if (ppd->lid && ppd->linkup) + nup++; + } + } + + spin_unlock_irqrestore(&hfi1_devs_lock, flags); + + if (npresentp) + *npresentp = npresent; + if (nupp) + *nupp = nup; + + return nunits; +} + +/* + * Get address of eager buffer from it's index (allocated in chunks, not + * contiguous). + */ +static inline void *get_egrbuf(const struct hfi1_ctxtdata *rcd, u64 rhf, + u8 *update) +{ + u32 idx = rhf_egr_index(rhf), offset = rhf_egr_buf_offset(rhf); + + *update |= !(idx & (rcd->egrbufs.threshold - 1)) && !offset; + return (void *)(((u64)(rcd->egrbufs.rcvtids[idx].addr)) + + (offset * RCV_BUF_BLOCK_SIZE)); +} + +/* + * Validate and encode the a given RcvArray Buffer size. + * The function will check whether the given size falls within + * allowed size ranges for the respective type and, optionally, + * return the proper encoding. + */ +inline int hfi1_rcvbuf_validate(u32 size, u8 type, u16 *encoded) +{ + if (unlikely(!PAGE_ALIGNED(size))) + return 0; + if (unlikely(size < MIN_EAGER_BUFFER)) + return 0; + if (size > + (type == PT_EAGER ? MAX_EAGER_BUFFER : MAX_EXPECTED_BUFFER)) + return 0; + if (encoded) + *encoded = ilog2(size / PAGE_SIZE) + 1; + return 1; +} + +static void rcv_hdrerr(struct hfi1_ctxtdata *rcd, struct hfi1_pportdata *ppd, + struct hfi1_packet *packet) +{ + struct hfi1_message_header *rhdr = packet->hdr; + u32 rte = rhf_rcv_type_err(packet->rhf); + int lnh = be16_to_cpu(rhdr->lrh[0]) & 3; + struct hfi1_ibport *ibp = &ppd->ibport_data; + struct hfi1_devdata *dd = ppd->dd; + struct rvt_dev_info *rdi = &dd->verbs_dev.rdi; + + if (packet->rhf & (RHF_VCRC_ERR | RHF_ICRC_ERR)) + return; + + if (packet->rhf & RHF_TID_ERR) { + /* For TIDERR and RC QPs preemptively schedule a NAK */ + struct hfi1_ib_header *hdr = (struct hfi1_ib_header *)rhdr; + struct hfi1_other_headers *ohdr = NULL; + u32 tlen = rhf_pkt_len(packet->rhf); /* in bytes */ + u16 lid = be16_to_cpu(hdr->lrh[1]); + u32 qp_num; + u32 rcv_flags = 0; + + /* Sanity check packet */ + if (tlen < 24) + goto drop; + + /* Check for GRH */ + if (lnh == HFI1_LRH_BTH) { + ohdr = &hdr->u.oth; + } else if (lnh == HFI1_LRH_GRH) { + u32 vtf; + + ohdr = &hdr->u.l.oth; + if (hdr->u.l.grh.next_hdr != IB_GRH_NEXT_HDR) + goto drop; + vtf = be32_to_cpu(hdr->u.l.grh.version_tclass_flow); + if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION) + goto drop; + rcv_flags |= HFI1_HAS_GRH; + } else { + goto drop; + } + /* Get the destination QP number. */ + qp_num = be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK; + if (lid < be16_to_cpu(IB_MULTICAST_LID_BASE)) { + struct rvt_qp *qp; + unsigned long flags; + + rcu_read_lock(); + qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num); + if (!qp) { + rcu_read_unlock(); + goto drop; + } + + /* + * Handle only RC QPs - for other QP types drop error + * packet. + */ + spin_lock_irqsave(&qp->r_lock, flags); + + /* Check for valid receive state. */ + if (!(ib_rvt_state_ops[qp->state] & + RVT_PROCESS_RECV_OK)) { + ibp->rvp.n_pkt_drops++; + } + + switch (qp->ibqp.qp_type) { + case IB_QPT_RC: + hfi1_rc_hdrerr( + rcd, + hdr, + rcv_flags, + qp); + break; + default: + /* For now don't handle any other QP types */ + break; + } + + spin_unlock_irqrestore(&qp->r_lock, flags); + rcu_read_unlock(); + } /* Unicast QP */ + } /* Valid packet with TIDErr */ + + /* handle "RcvTypeErr" flags */ + switch (rte) { + case RHF_RTE_ERROR_OP_CODE_ERR: + { + u32 opcode; + void *ebuf = NULL; + __be32 *bth = NULL; + + if (rhf_use_egr_bfr(packet->rhf)) + ebuf = packet->ebuf; + + if (!ebuf) + goto drop; /* this should never happen */ + + if (lnh == HFI1_LRH_BTH) + bth = (__be32 *)ebuf; + else if (lnh == HFI1_LRH_GRH) + bth = (__be32 *)((char *)ebuf + sizeof(struct ib_grh)); + else + goto drop; + + opcode = be32_to_cpu(bth[0]) >> 24; + opcode &= 0xff; + + if (opcode == IB_OPCODE_CNP) { + /* + * Only in pre-B0 h/w is the CNP_OPCODE handled + * via this code path. + */ + struct rvt_qp *qp = NULL; + u32 lqpn, rqpn; + u16 rlid; + u8 svc_type, sl, sc5; + + sc5 = (be16_to_cpu(rhdr->lrh[0]) >> 12) & 0xf; + if (rhf_dc_info(packet->rhf)) + sc5 |= 0x10; + sl = ibp->sc_to_sl[sc5]; + + lqpn = be32_to_cpu(bth[1]) & RVT_QPN_MASK; + rcu_read_lock(); + qp = rvt_lookup_qpn(rdi, &ibp->rvp, lqpn); + if (!qp) { + rcu_read_unlock(); + goto drop; + } + + switch (qp->ibqp.qp_type) { + case IB_QPT_UD: + rlid = 0; + rqpn = 0; + svc_type = IB_CC_SVCTYPE_UD; + break; + case IB_QPT_UC: + rlid = be16_to_cpu(rhdr->lrh[3]); + rqpn = qp->remote_qpn; + svc_type = IB_CC_SVCTYPE_UC; + break; + default: + goto drop; + } + + process_becn(ppd, sl, rlid, lqpn, rqpn, svc_type); + rcu_read_unlock(); + } + + packet->rhf &= ~RHF_RCV_TYPE_ERR_SMASK; + break; + } + default: + break; + } + +drop: + return; +} + +static inline void init_packet(struct hfi1_ctxtdata *rcd, + struct hfi1_packet *packet) +{ + packet->rsize = rcd->rcvhdrqentsize; /* words */ + packet->maxcnt = rcd->rcvhdrq_cnt * packet->rsize; /* words */ + packet->rcd = rcd; + packet->updegr = 0; + packet->etail = -1; + packet->rhf_addr = get_rhf_addr(rcd); + packet->rhf = rhf_to_cpu(packet->rhf_addr); + packet->rhqoff = rcd->head; + packet->numpkt = 0; + packet->rcv_flags = 0; +} + +static void process_ecn(struct rvt_qp *qp, struct hfi1_ib_header *hdr, + struct hfi1_other_headers *ohdr, + u64 rhf, u32 bth1, struct ib_grh *grh) +{ + struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + u32 rqpn = 0; + u16 rlid; + u8 sc5, svc_type; + + switch (qp->ibqp.qp_type) { + case IB_QPT_SMI: + case IB_QPT_GSI: + case IB_QPT_UD: + rlid = be16_to_cpu(hdr->lrh[3]); + rqpn = be32_to_cpu(ohdr->u.ud.deth[1]) & RVT_QPN_MASK; + svc_type = IB_CC_SVCTYPE_UD; + break; + case IB_QPT_UC: + rlid = qp->remote_ah_attr.dlid; + rqpn = qp->remote_qpn; + svc_type = IB_CC_SVCTYPE_UC; + break; + case IB_QPT_RC: + rlid = qp->remote_ah_attr.dlid; + rqpn = qp->remote_qpn; + svc_type = IB_CC_SVCTYPE_RC; + break; + default: + return; + } + + sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf; + if (rhf_dc_info(rhf)) + sc5 |= 0x10; + + if (bth1 & HFI1_FECN_SMASK) { + u16 pkey = (u16)be32_to_cpu(ohdr->bth[0]); + u16 dlid = be16_to_cpu(hdr->lrh[1]); + + return_cnp(ibp, qp, rqpn, pkey, dlid, rlid, sc5, grh); + } + + if (bth1 & HFI1_BECN_SMASK) { + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u32 lqpn = bth1 & RVT_QPN_MASK; + u8 sl = ibp->sc_to_sl[sc5]; + + process_becn(ppd, sl, rlid, lqpn, rqpn, svc_type); + } +} + +struct ps_mdata { + struct hfi1_ctxtdata *rcd; + u32 rsize; + u32 maxcnt; + u32 ps_head; + u32 ps_tail; + u32 ps_seq; +}; + +static inline void init_ps_mdata(struct ps_mdata *mdata, + struct hfi1_packet *packet) +{ + struct hfi1_ctxtdata *rcd = packet->rcd; + + mdata->rcd = rcd; + mdata->rsize = packet->rsize; + mdata->maxcnt = packet->maxcnt; + mdata->ps_head = packet->rhqoff; + + if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) { + mdata->ps_tail = get_rcvhdrtail(rcd); + if (rcd->ctxt == HFI1_CTRL_CTXT) + mdata->ps_seq = rcd->seq_cnt; + else + mdata->ps_seq = 0; /* not used with DMA_RTAIL */ + } else { + mdata->ps_tail = 0; /* used only with DMA_RTAIL*/ + mdata->ps_seq = rcd->seq_cnt; + } +} + +static inline int ps_done(struct ps_mdata *mdata, u64 rhf, + struct hfi1_ctxtdata *rcd) +{ + if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) + return mdata->ps_head == mdata->ps_tail; + return mdata->ps_seq != rhf_rcv_seq(rhf); +} + +static inline int ps_skip(struct ps_mdata *mdata, u64 rhf, + struct hfi1_ctxtdata *rcd) +{ + /* + * Control context can potentially receive an invalid rhf. + * Drop such packets. + */ + if ((rcd->ctxt == HFI1_CTRL_CTXT) && (mdata->ps_head != mdata->ps_tail)) + return mdata->ps_seq != rhf_rcv_seq(rhf); + + return 0; +} + +static inline void update_ps_mdata(struct ps_mdata *mdata, + struct hfi1_ctxtdata *rcd) +{ + mdata->ps_head += mdata->rsize; + if (mdata->ps_head >= mdata->maxcnt) + mdata->ps_head = 0; + + /* Control context must do seq counting */ + if (!HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL) || + (rcd->ctxt == HFI1_CTRL_CTXT)) { + if (++mdata->ps_seq > 13) + mdata->ps_seq = 1; + } +} + +/* + * prescan_rxq - search through the receive queue looking for packets + * containing Excplicit Congestion Notifications (FECNs, or BECNs). + * When an ECN is found, process the Congestion Notification, and toggle + * it off. + * This is declared as a macro to allow quick checking of the port to avoid + * the overhead of a function call if not enabled. + */ +#define prescan_rxq(rcd, packet) \ + do { \ + if (rcd->ppd->cc_prescan) \ + __prescan_rxq(packet); \ + } while (0) +static void __prescan_rxq(struct hfi1_packet *packet) +{ + struct hfi1_ctxtdata *rcd = packet->rcd; + struct ps_mdata mdata; + + init_ps_mdata(&mdata, packet); + + while (1) { + struct hfi1_devdata *dd = rcd->dd; + struct hfi1_ibport *ibp = &rcd->ppd->ibport_data; + __le32 *rhf_addr = (__le32 *)rcd->rcvhdrq + mdata.ps_head + + dd->rhf_offset; + struct rvt_qp *qp; + struct hfi1_ib_header *hdr; + struct hfi1_other_headers *ohdr; + struct ib_grh *grh = NULL; + struct rvt_dev_info *rdi = &dd->verbs_dev.rdi; + u64 rhf = rhf_to_cpu(rhf_addr); + u32 etype = rhf_rcv_type(rhf), qpn, bth1; + int is_ecn = 0; + u8 lnh; + + if (ps_done(&mdata, rhf, rcd)) + break; + + if (ps_skip(&mdata, rhf, rcd)) + goto next; + + if (etype != RHF_RCV_TYPE_IB) + goto next; + + hdr = (struct hfi1_ib_header *) + hfi1_get_msgheader(dd, rhf_addr); + lnh = be16_to_cpu(hdr->lrh[0]) & 3; + + if (lnh == HFI1_LRH_BTH) { + ohdr = &hdr->u.oth; + } else if (lnh == HFI1_LRH_GRH) { + ohdr = &hdr->u.l.oth; + grh = &hdr->u.l.grh; + } else { + goto next; /* just in case */ + } + bth1 = be32_to_cpu(ohdr->bth[1]); + is_ecn = !!(bth1 & (HFI1_FECN_SMASK | HFI1_BECN_SMASK)); + + if (!is_ecn) + goto next; + + qpn = bth1 & RVT_QPN_MASK; + rcu_read_lock(); + qp = rvt_lookup_qpn(rdi, &ibp->rvp, qpn); + + if (!qp) { + rcu_read_unlock(); + goto next; + } + + process_ecn(qp, hdr, ohdr, rhf, bth1, grh); + rcu_read_unlock(); + + /* turn off BECN, FECN */ + bth1 &= ~(HFI1_FECN_SMASK | HFI1_BECN_SMASK); + ohdr->bth[1] = cpu_to_be32(bth1); +next: + update_ps_mdata(&mdata, rcd); + } +} + +static inline int skip_rcv_packet(struct hfi1_packet *packet, int thread) +{ + int ret = RCV_PKT_OK; + + /* Set up for the next packet */ + packet->rhqoff += packet->rsize; + if (packet->rhqoff >= packet->maxcnt) + packet->rhqoff = 0; + + packet->numpkt++; + if (unlikely((packet->numpkt & (MAX_PKT_RECV - 1)) == 0)) { + if (thread) { + cond_resched(); + } else { + ret = RCV_PKT_LIMIT; + this_cpu_inc(*packet->rcd->dd->rcv_limit); + } + } + + packet->rhf_addr = (__le32 *)packet->rcd->rcvhdrq + packet->rhqoff + + packet->rcd->dd->rhf_offset; + packet->rhf = rhf_to_cpu(packet->rhf_addr); + + return ret; +} + +static inline int process_rcv_packet(struct hfi1_packet *packet, int thread) +{ + int ret = RCV_PKT_OK; + + packet->hdr = hfi1_get_msgheader(packet->rcd->dd, + packet->rhf_addr); + packet->hlen = (u8 *)packet->rhf_addr - (u8 *)packet->hdr; + packet->etype = rhf_rcv_type(packet->rhf); + /* total length */ + packet->tlen = rhf_pkt_len(packet->rhf); /* in bytes */ + /* retrieve eager buffer details */ + packet->ebuf = NULL; + if (rhf_use_egr_bfr(packet->rhf)) { + packet->etail = rhf_egr_index(packet->rhf); + packet->ebuf = get_egrbuf(packet->rcd, packet->rhf, + &packet->updegr); + /* + * Prefetch the contents of the eager buffer. It is + * OK to send a negative length to prefetch_range(). + * The +2 is the size of the RHF. + */ + prefetch_range(packet->ebuf, + packet->tlen - ((packet->rcd->rcvhdrqentsize - + (rhf_hdrq_offset(packet->rhf) + + 2)) * 4)); + } + + /* + * Call a type specific handler for the packet. We + * should be able to trust that etype won't be beyond + * the range of valid indexes. If so something is really + * wrong and we can probably just let things come + * crashing down. There is no need to eat another + * comparison in this performance critical code. + */ + packet->rcd->dd->rhf_rcv_function_map[packet->etype](packet); + packet->numpkt++; + + /* Set up for the next packet */ + packet->rhqoff += packet->rsize; + if (packet->rhqoff >= packet->maxcnt) + packet->rhqoff = 0; + + if (unlikely((packet->numpkt & (MAX_PKT_RECV - 1)) == 0)) { + if (thread) { + cond_resched(); + } else { + ret = RCV_PKT_LIMIT; + this_cpu_inc(*packet->rcd->dd->rcv_limit); + } + } + + packet->rhf_addr = (__le32 *)packet->rcd->rcvhdrq + packet->rhqoff + + packet->rcd->dd->rhf_offset; + packet->rhf = rhf_to_cpu(packet->rhf_addr); + + return ret; +} + +static inline void process_rcv_update(int last, struct hfi1_packet *packet) +{ + /* + * Update head regs etc., every 16 packets, if not last pkt, + * to help prevent rcvhdrq overflows, when many packets + * are processed and queue is nearly full. + * Don't request an interrupt for intermediate updates. + */ + if (!last && !(packet->numpkt & 0xf)) { + update_usrhead(packet->rcd, packet->rhqoff, packet->updegr, + packet->etail, 0, 0); + packet->updegr = 0; + } + packet->rcv_flags = 0; +} + +static inline void finish_packet(struct hfi1_packet *packet) +{ + /* + * Nothing we need to free for the packet. + * + * The only thing we need to do is a final update and call for an + * interrupt + */ + update_usrhead(packet->rcd, packet->rcd->head, packet->updegr, + packet->etail, rcv_intr_dynamic, packet->numpkt); +} + +static inline void process_rcv_qp_work(struct hfi1_packet *packet) +{ + struct hfi1_ctxtdata *rcd; + struct rvt_qp *qp, *nqp; + + rcd = packet->rcd; + rcd->head = packet->rhqoff; + + /* + * Iterate over all QPs waiting to respond. + * The list won't change since the IRQ is only run on one CPU. + */ + list_for_each_entry_safe(qp, nqp, &rcd->qp_wait_list, rspwait) { + list_del_init(&qp->rspwait); + if (qp->r_flags & RVT_R_RSP_NAK) { + qp->r_flags &= ~RVT_R_RSP_NAK; + hfi1_send_rc_ack(rcd, qp, 0); + } + if (qp->r_flags & RVT_R_RSP_SEND) { + unsigned long flags; + + qp->r_flags &= ~RVT_R_RSP_SEND; + spin_lock_irqsave(&qp->s_lock, flags); + if (ib_rvt_state_ops[qp->state] & + RVT_PROCESS_OR_FLUSH_SEND) + hfi1_schedule_send(qp); + spin_unlock_irqrestore(&qp->s_lock, flags); + } + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + } +} + +/* + * Handle receive interrupts when using the no dma rtail option. + */ +int handle_receive_interrupt_nodma_rtail(struct hfi1_ctxtdata *rcd, int thread) +{ + u32 seq; + int last = RCV_PKT_OK; + struct hfi1_packet packet; + + init_packet(rcd, &packet); + seq = rhf_rcv_seq(packet.rhf); + if (seq != rcd->seq_cnt) { + last = RCV_PKT_DONE; + goto bail; + } + + prescan_rxq(rcd, &packet); + + while (last == RCV_PKT_OK) { + last = process_rcv_packet(&packet, thread); + seq = rhf_rcv_seq(packet.rhf); + if (++rcd->seq_cnt > 13) + rcd->seq_cnt = 1; + if (seq != rcd->seq_cnt) + last = RCV_PKT_DONE; + process_rcv_update(last, &packet); + } + process_rcv_qp_work(&packet); +bail: + finish_packet(&packet); + return last; +} + +int handle_receive_interrupt_dma_rtail(struct hfi1_ctxtdata *rcd, int thread) +{ + u32 hdrqtail; + int last = RCV_PKT_OK; + struct hfi1_packet packet; + + init_packet(rcd, &packet); + hdrqtail = get_rcvhdrtail(rcd); + if (packet.rhqoff == hdrqtail) { + last = RCV_PKT_DONE; + goto bail; + } + smp_rmb(); /* prevent speculative reads of dma'ed hdrq */ + + prescan_rxq(rcd, &packet); + + while (last == RCV_PKT_OK) { + last = process_rcv_packet(&packet, thread); + if (packet.rhqoff == hdrqtail) + last = RCV_PKT_DONE; + process_rcv_update(last, &packet); + } + process_rcv_qp_work(&packet); +bail: + finish_packet(&packet); + return last; +} + +static inline void set_all_nodma_rtail(struct hfi1_devdata *dd) +{ + int i; + + for (i = HFI1_CTRL_CTXT + 1; i < dd->first_user_ctxt; i++) + dd->rcd[i]->do_interrupt = + &handle_receive_interrupt_nodma_rtail; +} + +static inline void set_all_dma_rtail(struct hfi1_devdata *dd) +{ + int i; + + for (i = HFI1_CTRL_CTXT + 1; i < dd->first_user_ctxt; i++) + dd->rcd[i]->do_interrupt = + &handle_receive_interrupt_dma_rtail; +} + +void set_all_slowpath(struct hfi1_devdata *dd) +{ + int i; + + /* HFI1_CTRL_CTXT must always use the slow path interrupt handler */ + for (i = HFI1_CTRL_CTXT + 1; i < dd->first_user_ctxt; i++) + dd->rcd[i]->do_interrupt = &handle_receive_interrupt; +} + +static inline int set_armed_to_active(struct hfi1_ctxtdata *rcd, + struct hfi1_packet packet, + struct hfi1_devdata *dd) +{ + struct work_struct *lsaw = &rcd->ppd->linkstate_active_work; + struct hfi1_message_header *hdr = hfi1_get_msgheader(packet.rcd->dd, + packet.rhf_addr); + + if (hdr2sc(hdr, packet.rhf) != 0xf) { + int hwstate = read_logical_state(dd); + + if (hwstate != LSTATE_ACTIVE) { + dd_dev_info(dd, "Unexpected link state %d\n", hwstate); + return 0; + } + + queue_work(rcd->ppd->hfi1_wq, lsaw); + return 1; + } + return 0; +} + +/* + * handle_receive_interrupt - receive a packet + * @rcd: the context + * + * Called from interrupt handler for errors or receive interrupt. + * This is the slow path interrupt handler. + */ +int handle_receive_interrupt(struct hfi1_ctxtdata *rcd, int thread) +{ + struct hfi1_devdata *dd = rcd->dd; + u32 hdrqtail; + int needset, last = RCV_PKT_OK; + struct hfi1_packet packet; + int skip_pkt = 0; + + /* Control context will always use the slow path interrupt handler */ + needset = (rcd->ctxt == HFI1_CTRL_CTXT) ? 0 : 1; + + init_packet(rcd, &packet); + + if (!HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) { + u32 seq = rhf_rcv_seq(packet.rhf); + + if (seq != rcd->seq_cnt) { + last = RCV_PKT_DONE; + goto bail; + } + hdrqtail = 0; + } else { + hdrqtail = get_rcvhdrtail(rcd); + if (packet.rhqoff == hdrqtail) { + last = RCV_PKT_DONE; + goto bail; + } + smp_rmb(); /* prevent speculative reads of dma'ed hdrq */ + + /* + * Control context can potentially receive an invalid + * rhf. Drop such packets. + */ + if (rcd->ctxt == HFI1_CTRL_CTXT) { + u32 seq = rhf_rcv_seq(packet.rhf); + + if (seq != rcd->seq_cnt) + skip_pkt = 1; + } + } + + prescan_rxq(rcd, &packet); + + while (last == RCV_PKT_OK) { + if (unlikely(dd->do_drop && + atomic_xchg(&dd->drop_packet, DROP_PACKET_OFF) == + DROP_PACKET_ON)) { + dd->do_drop = 0; + + /* On to the next packet */ + packet.rhqoff += packet.rsize; + packet.rhf_addr = (__le32 *)rcd->rcvhdrq + + packet.rhqoff + + dd->rhf_offset; + packet.rhf = rhf_to_cpu(packet.rhf_addr); + + } else if (skip_pkt) { + last = skip_rcv_packet(&packet, thread); + skip_pkt = 0; + } else { + /* Auto activate link on non-SC15 packet receive */ + if (unlikely(rcd->ppd->host_link_state == + HLS_UP_ARMED) && + set_armed_to_active(rcd, packet, dd)) + goto bail; + last = process_rcv_packet(&packet, thread); + } + + if (!HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) { + u32 seq = rhf_rcv_seq(packet.rhf); + + if (++rcd->seq_cnt > 13) + rcd->seq_cnt = 1; + if (seq != rcd->seq_cnt) + last = RCV_PKT_DONE; + if (needset) { + dd_dev_info(dd, "Switching to NO_DMA_RTAIL\n"); + set_all_nodma_rtail(dd); + needset = 0; + } + } else { + if (packet.rhqoff == hdrqtail) + last = RCV_PKT_DONE; + /* + * Control context can potentially receive an invalid + * rhf. Drop such packets. + */ + if (rcd->ctxt == HFI1_CTRL_CTXT) { + u32 seq = rhf_rcv_seq(packet.rhf); + + if (++rcd->seq_cnt > 13) + rcd->seq_cnt = 1; + if (!last && (seq != rcd->seq_cnt)) + skip_pkt = 1; + } + + if (needset) { + dd_dev_info(dd, + "Switching to DMA_RTAIL\n"); + set_all_dma_rtail(dd); + needset = 0; + } + } + + process_rcv_update(last, &packet); + } + + process_rcv_qp_work(&packet); + +bail: + /* + * Always write head at end, and setup rcv interrupt, even + * if no packets were processed. + */ + finish_packet(&packet); + return last; +} + +/* + * We may discover in the interrupt that the hardware link state has + * changed from ARMED to ACTIVE (due to the arrival of a non-SC15 packet), + * and we need to update the driver's notion of the link state. We cannot + * run set_link_state from interrupt context, so we queue this function on + * a workqueue. + * + * We delay the regular interrupt processing until after the state changes + * so that the link will be in the correct state by the time any application + * we wake up attempts to send a reply to any message it received. + * (Subsequent receive interrupts may possibly force the wakeup before we + * update the link state.) + * + * The rcd is freed in hfi1_free_ctxtdata after hfi1_postinit_cleanup invokes + * dd->f_cleanup(dd) to disable the interrupt handler and flush workqueues, + * so we're safe from use-after-free of the rcd. + */ +void receive_interrupt_work(struct work_struct *work) +{ + struct hfi1_pportdata *ppd = container_of(work, struct hfi1_pportdata, + linkstate_active_work); + struct hfi1_devdata *dd = ppd->dd; + int i; + + /* Received non-SC15 packet implies neighbor_normal */ + ppd->neighbor_normal = 1; + set_link_state(ppd, HLS_UP_ACTIVE); + + /* + * Interrupt all kernel contexts that could have had an + * interrupt during auto activation. + */ + for (i = HFI1_CTRL_CTXT; i < dd->first_user_ctxt; i++) + force_recv_intr(dd->rcd[i]); +} + +/* + * Convert a given MTU size to the on-wire MAD packet enumeration. + * Return -1 if the size is invalid. + */ +int mtu_to_enum(u32 mtu, int default_if_bad) +{ + switch (mtu) { + case 0: return OPA_MTU_0; + case 256: return OPA_MTU_256; + case 512: return OPA_MTU_512; + case 1024: return OPA_MTU_1024; + case 2048: return OPA_MTU_2048; + case 4096: return OPA_MTU_4096; + case 8192: return OPA_MTU_8192; + case 10240: return OPA_MTU_10240; + } + return default_if_bad; +} + +u16 enum_to_mtu(int mtu) +{ + switch (mtu) { + case OPA_MTU_0: return 0; + case OPA_MTU_256: return 256; + case OPA_MTU_512: return 512; + case OPA_MTU_1024: return 1024; + case OPA_MTU_2048: return 2048; + case OPA_MTU_4096: return 4096; + case OPA_MTU_8192: return 8192; + case OPA_MTU_10240: return 10240; + default: return 0xffff; + } +} + +/* + * set_mtu - set the MTU + * @ppd: the per port data + * + * We can handle "any" incoming size, the issue here is whether we + * need to restrict our outgoing size. We do not deal with what happens + * to programs that are already running when the size changes. + */ +int set_mtu(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + int i, drain, ret = 0, is_up = 0; + + ppd->ibmtu = 0; + for (i = 0; i < ppd->vls_supported; i++) + if (ppd->ibmtu < dd->vld[i].mtu) + ppd->ibmtu = dd->vld[i].mtu; + ppd->ibmaxlen = ppd->ibmtu + lrh_max_header_bytes(ppd->dd); + + mutex_lock(&ppd->hls_lock); + if (ppd->host_link_state == HLS_UP_INIT || + ppd->host_link_state == HLS_UP_ARMED || + ppd->host_link_state == HLS_UP_ACTIVE) + is_up = 1; + + drain = !is_ax(dd) && is_up; + + if (drain) + /* + * MTU is specified per-VL. To ensure that no packet gets + * stuck (due, e.g., to the MTU for the packet's VL being + * reduced), empty the per-VL FIFOs before adjusting MTU. + */ + ret = stop_drain_data_vls(dd); + + if (ret) { + dd_dev_err(dd, "%s: cannot stop/drain VLs - refusing to change per-VL MTUs\n", + __func__); + goto err; + } + + hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_MTU, 0); + + if (drain) + open_fill_data_vls(dd); /* reopen all VLs */ + +err: + mutex_unlock(&ppd->hls_lock); + + return ret; +} + +int hfi1_set_lid(struct hfi1_pportdata *ppd, u32 lid, u8 lmc) +{ + struct hfi1_devdata *dd = ppd->dd; + + ppd->lid = lid; + ppd->lmc = lmc; + hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_LIDLMC, 0); + + dd_dev_info(dd, "port %u: got a lid: 0x%x\n", ppd->port, lid); + + return 0; +} + +void shutdown_led_override(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + + /* + * This pairs with the memory barrier in hfi1_start_led_override to + * ensure that we read the correct state of LED beaconing represented + * by led_override_timer_active + */ + smp_rmb(); + if (atomic_read(&ppd->led_override_timer_active)) { + del_timer_sync(&ppd->led_override_timer); + atomic_set(&ppd->led_override_timer_active, 0); + /* Ensure the atomic_set is visible to all CPUs */ + smp_wmb(); + } + + /* Hand control of the LED to the DC for normal operation */ + write_csr(dd, DCC_CFG_LED_CNTRL, 0); +} + +static void run_led_override(unsigned long opaque) +{ + struct hfi1_pportdata *ppd = (struct hfi1_pportdata *)opaque; + struct hfi1_devdata *dd = ppd->dd; + unsigned long timeout; + int phase_idx; + + if (!(dd->flags & HFI1_INITTED)) + return; + + phase_idx = ppd->led_override_phase & 1; + + setextled(dd, phase_idx); + + timeout = ppd->led_override_vals[phase_idx]; + + /* Set up for next phase */ + ppd->led_override_phase = !ppd->led_override_phase; + + mod_timer(&ppd->led_override_timer, jiffies + timeout); +} + +/* + * To have the LED blink in a particular pattern, provide timeon and timeoff + * in milliseconds. + * To turn off custom blinking and return to normal operation, use + * shutdown_led_override() + */ +void hfi1_start_led_override(struct hfi1_pportdata *ppd, unsigned int timeon, + unsigned int timeoff) +{ + if (!(ppd->dd->flags & HFI1_INITTED)) + return; + + /* Convert to jiffies for direct use in timer */ + ppd->led_override_vals[0] = msecs_to_jiffies(timeoff); + ppd->led_override_vals[1] = msecs_to_jiffies(timeon); + + /* Arbitrarily start from LED on phase */ + ppd->led_override_phase = 1; + + /* + * If the timer has not already been started, do so. Use a "quick" + * timeout so the handler will be called soon to look at our request. + */ + if (!timer_pending(&ppd->led_override_timer)) { + setup_timer(&ppd->led_override_timer, run_led_override, + (unsigned long)ppd); + ppd->led_override_timer.expires = jiffies + 1; + add_timer(&ppd->led_override_timer); + atomic_set(&ppd->led_override_timer_active, 1); + /* Ensure the atomic_set is visible to all CPUs */ + smp_wmb(); + } +} + +/** + * hfi1_reset_device - reset the chip if possible + * @unit: the device to reset + * + * Whether or not reset is successful, we attempt to re-initialize the chip + * (that is, much like a driver unload/reload). We clear the INITTED flag + * so that the various entry points will fail until we reinitialize. For + * now, we only allow this if no user contexts are open that use chip resources + */ +int hfi1_reset_device(int unit) +{ + int ret, i; + struct hfi1_devdata *dd = hfi1_lookup(unit); + struct hfi1_pportdata *ppd; + unsigned long flags; + int pidx; + + if (!dd) { + ret = -ENODEV; + goto bail; + } + + dd_dev_info(dd, "Reset on unit %u requested\n", unit); + + if (!dd->kregbase || !(dd->flags & HFI1_PRESENT)) { + dd_dev_info(dd, + "Invalid unit number %u or not initialized or not present\n", + unit); + ret = -ENXIO; + goto bail; + } + + spin_lock_irqsave(&dd->uctxt_lock, flags); + if (dd->rcd) + for (i = dd->first_user_ctxt; i < dd->num_rcv_contexts; i++) { + if (!dd->rcd[i] || !dd->rcd[i]->cnt) + continue; + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + ret = -EBUSY; + goto bail; + } + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + + shutdown_led_override(ppd); + } + if (dd->flags & HFI1_HAS_SEND_DMA) + sdma_exit(dd); + + hfi1_reset_cpu_counters(dd); + + ret = hfi1_init(dd, 1); + + if (ret) + dd_dev_err(dd, + "Reinitialize unit %u after reset failed with %d\n", + unit, ret); + else + dd_dev_info(dd, "Reinitialized unit %u after resetting\n", + unit); + +bail: + return ret; +} + +void handle_eflags(struct hfi1_packet *packet) +{ + struct hfi1_ctxtdata *rcd = packet->rcd; + u32 rte = rhf_rcv_type_err(packet->rhf); + + rcv_hdrerr(rcd, rcd->ppd, packet); + if (rhf_err_flags(packet->rhf)) + dd_dev_err(rcd->dd, + "receive context %d: rhf 0x%016llx, errs [ %s%s%s%s%s%s%s%s] rte 0x%x\n", + rcd->ctxt, packet->rhf, + packet->rhf & RHF_K_HDR_LEN_ERR ? "k_hdr_len " : "", + packet->rhf & RHF_DC_UNC_ERR ? "dc_unc " : "", + packet->rhf & RHF_DC_ERR ? "dc " : "", + packet->rhf & RHF_TID_ERR ? "tid " : "", + packet->rhf & RHF_LEN_ERR ? "len " : "", + packet->rhf & RHF_ECC_ERR ? "ecc " : "", + packet->rhf & RHF_VCRC_ERR ? "vcrc " : "", + packet->rhf & RHF_ICRC_ERR ? "icrc " : "", + rte); +} + +/* + * The following functions are called by the interrupt handler. They are type + * specific handlers for each packet type. + */ +int process_receive_ib(struct hfi1_packet *packet) +{ + trace_hfi1_rcvhdr(packet->rcd->ppd->dd, + packet->rcd->ctxt, + rhf_err_flags(packet->rhf), + RHF_RCV_TYPE_IB, + packet->hlen, + packet->tlen, + packet->updegr, + rhf_egr_index(packet->rhf)); + + if (unlikely(rhf_err_flags(packet->rhf))) { + handle_eflags(packet); + return RHF_RCV_CONTINUE; + } + + hfi1_ib_rcv(packet); + return RHF_RCV_CONTINUE; +} + +int process_receive_bypass(struct hfi1_packet *packet) +{ + if (unlikely(rhf_err_flags(packet->rhf))) + handle_eflags(packet); + + dd_dev_err(packet->rcd->dd, + "Bypass packets are not supported in normal operation. Dropping\n"); + return RHF_RCV_CONTINUE; +} + +int process_receive_error(struct hfi1_packet *packet) +{ + handle_eflags(packet); + + if (unlikely(rhf_err_flags(packet->rhf))) + dd_dev_err(packet->rcd->dd, + "Unhandled error packet received. Dropping.\n"); + + return RHF_RCV_CONTINUE; +} + +int kdeth_process_expected(struct hfi1_packet *packet) +{ + if (unlikely(rhf_err_flags(packet->rhf))) + handle_eflags(packet); + + dd_dev_err(packet->rcd->dd, + "Unhandled expected packet received. Dropping.\n"); + return RHF_RCV_CONTINUE; +} + +int kdeth_process_eager(struct hfi1_packet *packet) +{ + if (unlikely(rhf_err_flags(packet->rhf))) + handle_eflags(packet); + + dd_dev_err(packet->rcd->dd, + "Unhandled eager packet received. Dropping.\n"); + return RHF_RCV_CONTINUE; +} + +int process_receive_invalid(struct hfi1_packet *packet) +{ + dd_dev_err(packet->rcd->dd, "Invalid packet type %d. Dropping\n", + rhf_rcv_type(packet->rhf)); + return RHF_RCV_CONTINUE; +} diff --git a/drivers/infiniband/hw/hfi1/efivar.c b/drivers/infiniband/hw/hfi1/efivar.c new file mode 100644 index 000000000..106349fc1 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/efivar.c @@ -0,0 +1,164 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include "efivar.h" + +/* GUID for HFI1 variables in EFI */ +#define HFI1_EFIVAR_GUID EFI_GUID(0xc50a953e, 0xa8b2, 0x42a6, \ + 0xbf, 0x89, 0xd3, 0x33, 0xa6, 0xe9, 0xe6, 0xd4) +/* largest EFI data size we expect */ +#define EFI_DATA_SIZE 4096 + +/* + * Read the named EFI variable. Return the size of the actual data in *size + * and a kmalloc'ed buffer in *return_data. The caller must free the + * data. It is guaranteed that *return_data will be NULL and *size = 0 + * if this routine fails. + * + * Return 0 on success, -errno on failure. + */ +static int read_efi_var(const char *name, unsigned long *size, + void **return_data) +{ + efi_status_t status; + efi_char16_t *uni_name; + efi_guid_t guid; + unsigned long temp_size; + void *temp_buffer; + void *data; + int i; + int ret; + + /* set failure return values */ + *size = 0; + *return_data = NULL; + + if (!efi_enabled(EFI_RUNTIME_SERVICES)) + return -EOPNOTSUPP; + + uni_name = kcalloc(strlen(name) + 1, sizeof(efi_char16_t), GFP_KERNEL); + temp_buffer = kzalloc(EFI_DATA_SIZE, GFP_KERNEL); + + if (!uni_name || !temp_buffer) { + ret = -ENOMEM; + goto fail; + } + + /* input: the size of the buffer */ + temp_size = EFI_DATA_SIZE; + + /* convert ASCII to unicode - it is a 1:1 mapping */ + for (i = 0; name[i]; i++) + uni_name[i] = name[i]; + + /* need a variable for our GUID */ + guid = HFI1_EFIVAR_GUID; + + /* call into EFI runtime services */ + status = efi.get_variable( + uni_name, + &guid, + NULL, + &temp_size, + temp_buffer); + + /* + * It would be nice to call efi_status_to_err() here, but that + * is in the EFIVAR_FS code and may not be compiled in. + * However, even that is insufficient since it does not cover + * EFI_BUFFER_TOO_SMALL which could be an important return. + * For now, just split out succces or not found. + */ + ret = status == EFI_SUCCESS ? 0 : + status == EFI_NOT_FOUND ? -ENOENT : + -EINVAL; + if (ret) + goto fail; + + /* + * We have successfully read the EFI variable into our + * temporary buffer. Now allocate a correctly sized + * buffer. + */ + data = kmemdup(temp_buffer, temp_size, GFP_KERNEL); + if (!data) { + ret = -ENOMEM; + goto fail; + } + + *size = temp_size; + *return_data = data; + +fail: + kfree(uni_name); + kfree(temp_buffer); + + return ret; +} + +/* + * Read an HFI1 EFI variable of the form: + * <PCIe address>-<kind> + * Return an kalloc'ed array and size of the data. + * + * Returns 0 on success, -errno on failure. + */ +int read_hfi1_efi_var(struct hfi1_devdata *dd, const char *kind, + unsigned long *size, void **return_data) +{ + char name[64]; + + /* create a common prefix */ + snprintf(name, sizeof(name), "%04x:%02x:%02x.%x-%s", + pci_domain_nr(dd->pcidev->bus), + dd->pcidev->bus->number, + PCI_SLOT(dd->pcidev->devfn), + PCI_FUNC(dd->pcidev->devfn), + kind); + + return read_efi_var(name, size, return_data); +} diff --git a/drivers/infiniband/hw/hfi1/efivar.h b/drivers/infiniband/hw/hfi1/efivar.h new file mode 100644 index 000000000..94e9e70de --- /dev/null +++ b/drivers/infiniband/hw/hfi1/efivar.h @@ -0,0 +1,57 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#ifndef _HFI1_EFIVAR_H +#define _HFI1_EFIVAR_H + +#include <linux/efi.h> + +#include "hfi.h" + +int read_hfi1_efi_var(struct hfi1_devdata *dd, const char *kind, + unsigned long *size, void **return_data); + +#endif /* _HFI1_EFIVAR_H */ diff --git a/drivers/infiniband/hw/hfi1/eprom.c b/drivers/infiniband/hw/hfi1/eprom.c new file mode 100644 index 000000000..36b77943c --- /dev/null +++ b/drivers/infiniband/hw/hfi1/eprom.c @@ -0,0 +1,102 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#include <linux/delay.h> +#include "hfi.h" +#include "common.h" +#include "eprom.h" + +#define CMD_SHIFT 24 +#define CMD_RELEASE_POWERDOWN_NOID ((0xab << CMD_SHIFT)) + +/* controller interface speeds */ +#define EP_SPEED_FULL 0x2 /* full speed */ + +/* + * How long to wait for the EPROM to become available, in ms. + * The spec 32 Mb EPROM takes around 40s to erase then write. + * Double it for safety. + */ +#define EPROM_TIMEOUT 80000 /* ms */ +/* + * Initialize the EPROM handler. + */ +int eprom_init(struct hfi1_devdata *dd) +{ + int ret = 0; + + /* only the discrete chip has an EPROM */ + if (dd->pcidev->device != PCI_DEVICE_ID_INTEL0) + return 0; + + /* + * It is OK if both HFIs reset the EPROM as long as they don't + * do it at the same time. + */ + ret = acquire_chip_resource(dd, CR_EPROM, EPROM_TIMEOUT); + if (ret) { + dd_dev_err(dd, + "%s: unable to acquire EPROM resource, no EPROM support\n", + __func__); + goto done_asic; + } + + /* reset EPROM to be sure it is in a good state */ + + /* set reset */ + write_csr(dd, ASIC_EEP_CTL_STAT, ASIC_EEP_CTL_STAT_EP_RESET_SMASK); + /* clear reset, set speed */ + write_csr(dd, ASIC_EEP_CTL_STAT, + EP_SPEED_FULL << ASIC_EEP_CTL_STAT_RATE_SPI_SHIFT); + + /* wake the device with command "release powerdown NoID" */ + write_csr(dd, ASIC_EEP_ADDR_CMD, CMD_RELEASE_POWERDOWN_NOID); + + dd->eprom_available = true; + release_chip_resource(dd, CR_EPROM); +done_asic: + return ret; +} diff --git a/drivers/infiniband/hw/hfi1/eprom.h b/drivers/infiniband/hw/hfi1/eprom.h new file mode 100644 index 000000000..d41f0b1af --- /dev/null +++ b/drivers/infiniband/hw/hfi1/eprom.h @@ -0,0 +1,52 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +struct hfi1_cmd; +struct hfi1_devdata; + +int eprom_init(struct hfi1_devdata *dd); +int handle_eprom_command(struct file *fp, const struct hfi1_cmd *cmd); diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c new file mode 100644 index 000000000..c702a0096 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -0,0 +1,1501 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#include <linux/poll.h> +#include <linux/cdev.h> +#include <linux/vmalloc.h> +#include <linux/io.h> + +#include <rdma/ib.h> + +#include "hfi.h" +#include "pio.h" +#include "device.h" +#include "common.h" +#include "trace.h" +#include "user_sdma.h" +#include "user_exp_rcv.h" +#include "eprom.h" +#include "aspm.h" +#include "mmu_rb.h" + +#undef pr_fmt +#define pr_fmt(fmt) DRIVER_NAME ": " fmt + +#define SEND_CTXT_HALT_TIMEOUT 1000 /* msecs */ + +/* + * File operation functions + */ +static int hfi1_file_open(struct inode *, struct file *); +static int hfi1_file_close(struct inode *, struct file *); +static ssize_t hfi1_write_iter(struct kiocb *, struct iov_iter *); +static unsigned int hfi1_poll(struct file *, struct poll_table_struct *); +static int hfi1_file_mmap(struct file *, struct vm_area_struct *); + +static u64 kvirt_to_phys(void *); +static int assign_ctxt(struct file *, struct hfi1_user_info *); +static int init_subctxts(struct hfi1_ctxtdata *, const struct hfi1_user_info *); +static int user_init(struct file *); +static int get_ctxt_info(struct file *, void __user *, __u32); +static int get_base_info(struct file *, void __user *, __u32); +static int setup_ctxt(struct file *); +static int setup_subctxt(struct hfi1_ctxtdata *); +static int get_user_context(struct file *, struct hfi1_user_info *, int); +static int find_shared_ctxt(struct file *, const struct hfi1_user_info *); +static int allocate_ctxt(struct file *, struct hfi1_devdata *, + struct hfi1_user_info *); +static unsigned int poll_urgent(struct file *, struct poll_table_struct *); +static unsigned int poll_next(struct file *, struct poll_table_struct *); +static int user_event_ack(struct hfi1_ctxtdata *, int, unsigned long); +static int set_ctxt_pkey(struct hfi1_ctxtdata *, unsigned, u16); +static int manage_rcvq(struct hfi1_ctxtdata *, unsigned, int); +static int vma_fault(struct vm_area_struct *, struct vm_fault *); +static long hfi1_file_ioctl(struct file *fp, unsigned int cmd, + unsigned long arg); + +static const struct file_operations hfi1_file_ops = { + .owner = THIS_MODULE, + .write_iter = hfi1_write_iter, + .open = hfi1_file_open, + .release = hfi1_file_close, + .unlocked_ioctl = hfi1_file_ioctl, + .poll = hfi1_poll, + .mmap = hfi1_file_mmap, + .llseek = noop_llseek, +}; + +static struct vm_operations_struct vm_ops = { + .fault = vma_fault, +}; + +/* + * Types of memories mapped into user processes' space + */ +enum mmap_types { + PIO_BUFS = 1, + PIO_BUFS_SOP, + PIO_CRED, + RCV_HDRQ, + RCV_EGRBUF, + UREGS, + EVENTS, + STATUS, + RTAIL, + SUBCTXT_UREGS, + SUBCTXT_RCV_HDRQ, + SUBCTXT_EGRBUF, + SDMA_COMP +}; + +/* + * Masks and offsets defining the mmap tokens + */ +#define HFI1_MMAP_OFFSET_MASK 0xfffULL +#define HFI1_MMAP_OFFSET_SHIFT 0 +#define HFI1_MMAP_SUBCTXT_MASK 0xfULL +#define HFI1_MMAP_SUBCTXT_SHIFT 12 +#define HFI1_MMAP_CTXT_MASK 0xffULL +#define HFI1_MMAP_CTXT_SHIFT 16 +#define HFI1_MMAP_TYPE_MASK 0xfULL +#define HFI1_MMAP_TYPE_SHIFT 24 +#define HFI1_MMAP_MAGIC_MASK 0xffffffffULL +#define HFI1_MMAP_MAGIC_SHIFT 32 + +#define HFI1_MMAP_MAGIC 0xdabbad00 + +#define HFI1_MMAP_TOKEN_SET(field, val) \ + (((val) & HFI1_MMAP_##field##_MASK) << HFI1_MMAP_##field##_SHIFT) +#define HFI1_MMAP_TOKEN_GET(field, token) \ + (((token) >> HFI1_MMAP_##field##_SHIFT) & HFI1_MMAP_##field##_MASK) +#define HFI1_MMAP_TOKEN(type, ctxt, subctxt, addr) \ + (HFI1_MMAP_TOKEN_SET(MAGIC, HFI1_MMAP_MAGIC) | \ + HFI1_MMAP_TOKEN_SET(TYPE, type) | \ + HFI1_MMAP_TOKEN_SET(CTXT, ctxt) | \ + HFI1_MMAP_TOKEN_SET(SUBCTXT, subctxt) | \ + HFI1_MMAP_TOKEN_SET(OFFSET, (offset_in_page(addr)))) + +#define dbg(fmt, ...) \ + pr_info(fmt, ##__VA_ARGS__) + +static inline int is_valid_mmap(u64 token) +{ + return (HFI1_MMAP_TOKEN_GET(MAGIC, token) == HFI1_MMAP_MAGIC); +} + +static int hfi1_file_open(struct inode *inode, struct file *fp) +{ + struct hfi1_devdata *dd = container_of(inode->i_cdev, + struct hfi1_devdata, + user_cdev); + + /* Just take a ref now. Not all opens result in a context assign */ + kobject_get(&dd->kobj); + + /* The real work is performed later in assign_ctxt() */ + fp->private_data = kzalloc(sizeof(struct hfi1_filedata), GFP_KERNEL); + if (fp->private_data) /* no cpu affinity by default */ + ((struct hfi1_filedata *)fp->private_data)->rec_cpu_num = -1; + return fp->private_data ? 0 : -ENOMEM; +} + +static long hfi1_file_ioctl(struct file *fp, unsigned int cmd, + unsigned long arg) +{ + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_user_info uinfo; + struct hfi1_tid_info tinfo; + int ret = 0; + unsigned long addr; + int uval = 0; + unsigned long ul_uval = 0; + u16 uval16 = 0; + + hfi1_cdbg(IOCTL, "IOCTL recv: 0x%x", cmd); + if (cmd != HFI1_IOCTL_ASSIGN_CTXT && + cmd != HFI1_IOCTL_GET_VERS && + !uctxt) + return -EINVAL; + + switch (cmd) { + case HFI1_IOCTL_ASSIGN_CTXT: + if (uctxt) + return -EINVAL; + + if (copy_from_user(&uinfo, + (struct hfi1_user_info __user *)arg, + sizeof(uinfo))) + return -EFAULT; + + ret = assign_ctxt(fp, &uinfo); + if (ret < 0) + return ret; + setup_ctxt(fp); + if (ret) + return ret; + ret = user_init(fp); + break; + case HFI1_IOCTL_CTXT_INFO: + ret = get_ctxt_info(fp, (void __user *)(unsigned long)arg, + sizeof(struct hfi1_ctxt_info)); + break; + case HFI1_IOCTL_USER_INFO: + ret = get_base_info(fp, (void __user *)(unsigned long)arg, + sizeof(struct hfi1_base_info)); + break; + case HFI1_IOCTL_CREDIT_UPD: + if (uctxt && uctxt->sc) + sc_return_credits(uctxt->sc); + break; + + case HFI1_IOCTL_TID_UPDATE: + if (copy_from_user(&tinfo, + (struct hfi11_tid_info __user *)arg, + sizeof(tinfo))) + return -EFAULT; + + ret = hfi1_user_exp_rcv_setup(fp, &tinfo); + if (!ret) { + /* + * Copy the number of tidlist entries we used + * and the length of the buffer we registered. + * These fields are adjacent in the structure so + * we can copy them at the same time. + */ + addr = arg + offsetof(struct hfi1_tid_info, tidcnt); + if (copy_to_user((void __user *)addr, &tinfo.tidcnt, + sizeof(tinfo.tidcnt) + + sizeof(tinfo.length))) + ret = -EFAULT; + } + break; + + case HFI1_IOCTL_TID_FREE: + if (copy_from_user(&tinfo, + (struct hfi11_tid_info __user *)arg, + sizeof(tinfo))) + return -EFAULT; + + ret = hfi1_user_exp_rcv_clear(fp, &tinfo); + if (ret) + break; + addr = arg + offsetof(struct hfi1_tid_info, tidcnt); + if (copy_to_user((void __user *)addr, &tinfo.tidcnt, + sizeof(tinfo.tidcnt))) + ret = -EFAULT; + break; + + case HFI1_IOCTL_TID_INVAL_READ: + if (copy_from_user(&tinfo, + (struct hfi11_tid_info __user *)arg, + sizeof(tinfo))) + return -EFAULT; + + ret = hfi1_user_exp_rcv_invalid(fp, &tinfo); + if (ret) + break; + addr = arg + offsetof(struct hfi1_tid_info, tidcnt); + if (copy_to_user((void __user *)addr, &tinfo.tidcnt, + sizeof(tinfo.tidcnt))) + ret = -EFAULT; + break; + + case HFI1_IOCTL_RECV_CTRL: + ret = get_user(uval, (int __user *)arg); + if (ret != 0) + return -EFAULT; + ret = manage_rcvq(uctxt, fd->subctxt, uval); + break; + + case HFI1_IOCTL_POLL_TYPE: + ret = get_user(uval, (int __user *)arg); + if (ret != 0) + return -EFAULT; + uctxt->poll_type = (typeof(uctxt->poll_type))uval; + break; + + case HFI1_IOCTL_ACK_EVENT: + ret = get_user(ul_uval, (unsigned long __user *)arg); + if (ret != 0) + return -EFAULT; + ret = user_event_ack(uctxt, fd->subctxt, ul_uval); + break; + + case HFI1_IOCTL_SET_PKEY: + ret = get_user(uval16, (u16 __user *)arg); + if (ret != 0) + return -EFAULT; + if (HFI1_CAP_IS_USET(PKEY_CHECK)) + ret = set_ctxt_pkey(uctxt, fd->subctxt, uval16); + else + return -EPERM; + break; + + case HFI1_IOCTL_CTXT_RESET: { + struct send_context *sc; + struct hfi1_devdata *dd; + + if (!uctxt || !uctxt->dd || !uctxt->sc) + return -EINVAL; + + /* + * There is no protection here. User level has to + * guarantee that no one will be writing to the send + * context while it is being re-initialized. + * If user level breaks that guarantee, it will break + * it's own context and no one else's. + */ + dd = uctxt->dd; + sc = uctxt->sc; + /* + * Wait until the interrupt handler has marked the + * context as halted or frozen. Report error if we time + * out. + */ + wait_event_interruptible_timeout( + sc->halt_wait, (sc->flags & SCF_HALTED), + msecs_to_jiffies(SEND_CTXT_HALT_TIMEOUT)); + if (!(sc->flags & SCF_HALTED)) + return -ENOLCK; + + /* + * If the send context was halted due to a Freeze, + * wait until the device has been "unfrozen" before + * resetting the context. + */ + if (sc->flags & SCF_FROZEN) { + wait_event_interruptible_timeout( + dd->event_queue, + !(ACCESS_ONCE(dd->flags) & HFI1_FROZEN), + msecs_to_jiffies(SEND_CTXT_HALT_TIMEOUT)); + if (dd->flags & HFI1_FROZEN) + return -ENOLCK; + + if (dd->flags & HFI1_FORCED_FREEZE) + /* + * Don't allow context reset if we are into + * forced freeze + */ + return -ENODEV; + + sc_disable(sc); + ret = sc_enable(sc); + hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_ENB, + uctxt->ctxt); + } else { + ret = sc_restart(sc); + } + if (!ret) + sc_return_credits(sc); + break; + } + + case HFI1_IOCTL_GET_VERS: + uval = HFI1_USER_SWVERSION; + if (put_user(uval, (int __user *)arg)) + return -EFAULT; + break; + + default: + return -EINVAL; + } + + return ret; +} + +static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from) +{ + struct hfi1_filedata *fd = kiocb->ki_filp->private_data; + struct hfi1_user_sdma_pkt_q *pq = fd->pq; + struct hfi1_user_sdma_comp_q *cq = fd->cq; + int ret = 0, done = 0, reqs = 0; + unsigned long dim = from->nr_segs; + + if (!cq || !pq) { + ret = -EIO; + goto done; + } + + if (!iter_is_iovec(from) || !dim) { + ret = -EINVAL; + goto done; + } + + hfi1_cdbg(SDMA, "SDMA request from %u:%u (%lu)", + fd->uctxt->ctxt, fd->subctxt, dim); + + if (atomic_read(&pq->n_reqs) == pq->n_max_reqs) { + ret = -ENOSPC; + goto done; + } + + while (dim) { + unsigned long count = 0; + + ret = hfi1_user_sdma_process_request( + kiocb->ki_filp, (struct iovec *)(from->iov + done), + dim, &count); + if (ret) + goto done; + dim -= count; + done += count; + reqs++; + } +done: + return ret ? ret : reqs; +} + +static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma) +{ + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_devdata *dd; + unsigned long flags, pfn; + u64 token = vma->vm_pgoff << PAGE_SHIFT, + memaddr = 0; + u8 subctxt, mapio = 0, vmf = 0, type; + ssize_t memlen = 0; + int ret = 0; + u16 ctxt; + + if (!is_valid_mmap(token) || !uctxt || + !(vma->vm_flags & VM_SHARED)) { + ret = -EINVAL; + goto done; + } + dd = uctxt->dd; + ctxt = HFI1_MMAP_TOKEN_GET(CTXT, token); + subctxt = HFI1_MMAP_TOKEN_GET(SUBCTXT, token); + type = HFI1_MMAP_TOKEN_GET(TYPE, token); + if (ctxt != uctxt->ctxt || subctxt != fd->subctxt) { + ret = -EINVAL; + goto done; + } + + flags = vma->vm_flags; + + switch (type) { + case PIO_BUFS: + case PIO_BUFS_SOP: + memaddr = ((dd->physaddr + TXE_PIO_SEND) + + /* chip pio base */ + (uctxt->sc->hw_context * BIT(16))) + + /* 64K PIO space / ctxt */ + (type == PIO_BUFS_SOP ? + (TXE_PIO_SIZE / 2) : 0); /* sop? */ + /* + * Map only the amount allocated to the context, not the + * entire available context's PIO space. + */ + memlen = PAGE_ALIGN(uctxt->sc->credits * PIO_BLOCK_SIZE); + flags &= ~VM_MAYREAD; + flags |= VM_DONTCOPY | VM_DONTEXPAND; + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); + mapio = 1; + break; + case PIO_CRED: + if (flags & VM_WRITE) { + ret = -EPERM; + goto done; + } + /* + * The credit return location for this context could be on the + * second or third page allocated for credit returns (if number + * of enabled contexts > 64 and 128 respectively). + */ + memaddr = dd->cr_base[uctxt->numa_id].pa + + (((u64)uctxt->sc->hw_free - + (u64)dd->cr_base[uctxt->numa_id].va) & PAGE_MASK); + memlen = PAGE_SIZE; + flags &= ~VM_MAYWRITE; + flags |= VM_DONTCOPY | VM_DONTEXPAND; + /* + * The driver has already allocated memory for credit + * returns and programmed it into the chip. Has that + * memory been flagged as non-cached? + */ + /* vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); */ + mapio = 1; + break; + case RCV_HDRQ: + memaddr = uctxt->rcvhdrq_phys; + memlen = uctxt->rcvhdrq_size; + break; + case RCV_EGRBUF: { + unsigned long addr; + int i; + /* + * The RcvEgr buffer need to be handled differently + * as multiple non-contiguous pages need to be mapped + * into the user process. + */ + memlen = uctxt->egrbufs.size; + if ((vma->vm_end - vma->vm_start) != memlen) { + dd_dev_err(dd, "Eager buffer map size invalid (%lu != %lu)\n", + (vma->vm_end - vma->vm_start), memlen); + ret = -EINVAL; + goto done; + } + if (vma->vm_flags & VM_WRITE) { + ret = -EPERM; + goto done; + } + vma->vm_flags &= ~VM_MAYWRITE; + addr = vma->vm_start; + for (i = 0 ; i < uctxt->egrbufs.numbufs; i++) { + ret = remap_pfn_range( + vma, addr, + uctxt->egrbufs.buffers[i].phys >> PAGE_SHIFT, + uctxt->egrbufs.buffers[i].len, + vma->vm_page_prot); + if (ret < 0) + goto done; + addr += uctxt->egrbufs.buffers[i].len; + } + ret = 0; + goto done; + } + case UREGS: + /* + * Map only the page that contains this context's user + * registers. + */ + memaddr = (unsigned long) + (dd->physaddr + RXE_PER_CONTEXT_USER) + + (uctxt->ctxt * RXE_PER_CONTEXT_SIZE); + /* + * TidFlow table is on the same page as the rest of the + * user registers. + */ + memlen = PAGE_SIZE; + flags |= VM_DONTCOPY | VM_DONTEXPAND; + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + mapio = 1; + break; + case EVENTS: + /* + * Use the page where this context's flags are. User level + * knows where it's own bitmap is within the page. + */ + memaddr = (unsigned long)(dd->events + + ((uctxt->ctxt - dd->first_user_ctxt) * + HFI1_MAX_SHARED_CTXTS)) & PAGE_MASK; + memlen = PAGE_SIZE; + /* + * v3.7 removes VM_RESERVED but the effect is kept by + * using VM_IO. + */ + flags |= VM_IO | VM_DONTEXPAND; + vmf = 1; + break; + case STATUS: + memaddr = kvirt_to_phys((void *)dd->status); + memlen = PAGE_SIZE; + flags |= VM_IO | VM_DONTEXPAND; + break; + case RTAIL: + if (!HFI1_CAP_IS_USET(DMA_RTAIL)) { + /* + * If the memory allocation failed, the context alloc + * also would have failed, so we would never get here + */ + ret = -EINVAL; + goto done; + } + if (flags & VM_WRITE) { + ret = -EPERM; + goto done; + } + memaddr = uctxt->rcvhdrqtailaddr_phys; + memlen = PAGE_SIZE; + flags &= ~VM_MAYWRITE; + break; + case SUBCTXT_UREGS: + memaddr = (u64)uctxt->subctxt_uregbase; + memlen = PAGE_SIZE; + flags |= VM_IO | VM_DONTEXPAND; + vmf = 1; + break; + case SUBCTXT_RCV_HDRQ: + memaddr = (u64)uctxt->subctxt_rcvhdr_base; + memlen = uctxt->rcvhdrq_size * uctxt->subctxt_cnt; + flags |= VM_IO | VM_DONTEXPAND; + vmf = 1; + break; + case SUBCTXT_EGRBUF: + memaddr = (u64)uctxt->subctxt_rcvegrbuf; + memlen = uctxt->egrbufs.size * uctxt->subctxt_cnt; + flags |= VM_IO | VM_DONTEXPAND; + flags &= ~VM_MAYWRITE; + vmf = 1; + break; + case SDMA_COMP: { + struct hfi1_user_sdma_comp_q *cq = fd->cq; + + if (!cq) { + ret = -EFAULT; + goto done; + } + memaddr = (u64)cq->comps; + memlen = PAGE_ALIGN(sizeof(*cq->comps) * cq->nentries); + flags |= VM_IO | VM_DONTEXPAND; + vmf = 1; + break; + } + default: + ret = -EINVAL; + break; + } + + if ((vma->vm_end - vma->vm_start) != memlen) { + hfi1_cdbg(PROC, "%u:%u Memory size mismatch %lu:%lu", + uctxt->ctxt, fd->subctxt, + (vma->vm_end - vma->vm_start), memlen); + ret = -EINVAL; + goto done; + } + + vma->vm_flags = flags; + hfi1_cdbg(PROC, + "%u:%u type:%u io/vf:%d/%d, addr:0x%llx, len:%lu(%lu), flags:0x%lx\n", + ctxt, subctxt, type, mapio, vmf, memaddr, memlen, + vma->vm_end - vma->vm_start, vma->vm_flags); + pfn = (unsigned long)(memaddr >> PAGE_SHIFT); + if (vmf) { + vma->vm_pgoff = pfn; + vma->vm_ops = &vm_ops; + ret = 0; + } else if (mapio) { + ret = io_remap_pfn_range(vma, vma->vm_start, pfn, memlen, + vma->vm_page_prot); + } else { + ret = remap_pfn_range(vma, vma->vm_start, pfn, memlen, + vma->vm_page_prot); + } +done: + return ret; +} + +/* + * Local (non-chip) user memory is not mapped right away but as it is + * accessed by the user-level code. + */ +static int vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page; + + page = vmalloc_to_page((void *)(vmf->pgoff << PAGE_SHIFT)); + if (!page) + return VM_FAULT_SIGBUS; + + get_page(page); + vmf->page = page; + + return 0; +} + +static unsigned int hfi1_poll(struct file *fp, struct poll_table_struct *pt) +{ + struct hfi1_ctxtdata *uctxt; + unsigned pollflag; + + uctxt = ((struct hfi1_filedata *)fp->private_data)->uctxt; + if (!uctxt) + pollflag = POLLERR; + else if (uctxt->poll_type == HFI1_POLL_TYPE_URGENT) + pollflag = poll_urgent(fp, pt); + else if (uctxt->poll_type == HFI1_POLL_TYPE_ANYRCV) + pollflag = poll_next(fp, pt); + else /* invalid */ + pollflag = POLLERR; + + return pollflag; +} + +static int hfi1_file_close(struct inode *inode, struct file *fp) +{ + struct hfi1_filedata *fdata = fp->private_data; + struct hfi1_ctxtdata *uctxt = fdata->uctxt; + struct hfi1_devdata *dd = container_of(inode->i_cdev, + struct hfi1_devdata, + user_cdev); + unsigned long flags, *ev; + + fp->private_data = NULL; + + if (!uctxt) + goto done; + + hfi1_cdbg(PROC, "freeing ctxt %u:%u", uctxt->ctxt, fdata->subctxt); + mutex_lock(&hfi1_mutex); + + flush_wc(); + /* drain user sdma queue */ + hfi1_user_sdma_free_queues(fdata); + + /* release the cpu */ + hfi1_put_proc_affinity(dd, fdata->rec_cpu_num); + + /* + * Clear any left over, unhandled events so the next process that + * gets this context doesn't get confused. + */ + ev = dd->events + ((uctxt->ctxt - dd->first_user_ctxt) * + HFI1_MAX_SHARED_CTXTS) + fdata->subctxt; + *ev = 0; + + if (--uctxt->cnt) { + uctxt->active_slaves &= ~(1 << fdata->subctxt); + uctxt->subpid[fdata->subctxt] = 0; + mutex_unlock(&hfi1_mutex); + goto done; + } + + spin_lock_irqsave(&dd->uctxt_lock, flags); + /* + * Disable receive context and interrupt available, reset all + * RcvCtxtCtrl bits to default values. + */ + hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS | + HFI1_RCVCTRL_TIDFLOW_DIS | + HFI1_RCVCTRL_INTRAVAIL_DIS | + HFI1_RCVCTRL_TAILUPD_DIS | + HFI1_RCVCTRL_ONE_PKT_EGR_DIS | + HFI1_RCVCTRL_NO_RHQ_DROP_DIS | + HFI1_RCVCTRL_NO_EGR_DROP_DIS, uctxt->ctxt); + /* Clear the context's J_KEY */ + hfi1_clear_ctxt_jkey(dd, uctxt->ctxt); + /* + * Reset context integrity checks to default. + * (writes to CSRs probably belong in chip.c) + */ + write_kctxt_csr(dd, uctxt->sc->hw_context, SEND_CTXT_CHECK_ENABLE, + hfi1_pkt_default_send_ctxt_mask(dd, uctxt->sc->type)); + sc_disable(uctxt->sc); + uctxt->pid = 0; + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + + dd->rcd[uctxt->ctxt] = NULL; + + hfi1_user_exp_rcv_free(fdata); + hfi1_clear_ctxt_pkey(dd, uctxt->ctxt); + + uctxt->rcvwait_to = 0; + uctxt->piowait_to = 0; + uctxt->rcvnowait = 0; + uctxt->pionowait = 0; + uctxt->event_flags = 0; + + hfi1_stats.sps_ctxts--; + if (++dd->freectxts == dd->num_user_contexts) + aspm_enable_all(dd); + mutex_unlock(&hfi1_mutex); + hfi1_free_ctxtdata(dd, uctxt); +done: + kobject_put(&dd->kobj); + kfree(fdata); + return 0; +} + +/* + * Convert kernel *virtual* addresses to physical addresses. + * This is used to vmalloc'ed addresses. + */ +static u64 kvirt_to_phys(void *addr) +{ + struct page *page; + u64 paddr = 0; + + page = vmalloc_to_page(addr); + if (page) + paddr = page_to_pfn(page) << PAGE_SHIFT; + + return paddr; +} + +static int assign_ctxt(struct file *fp, struct hfi1_user_info *uinfo) +{ + int i_minor, ret = 0; + unsigned int swmajor, swminor; + + swmajor = uinfo->userversion >> 16; + if (swmajor != HFI1_USER_SWMAJOR) { + ret = -ENODEV; + goto done; + } + + swminor = uinfo->userversion & 0xffff; + + mutex_lock(&hfi1_mutex); + /* First, lets check if we need to setup a shared context? */ + if (uinfo->subctxt_cnt) { + struct hfi1_filedata *fd = fp->private_data; + + ret = find_shared_ctxt(fp, uinfo); + if (ret < 0) + goto done_unlock; + if (ret) + fd->rec_cpu_num = hfi1_get_proc_affinity( + fd->uctxt->dd, fd->uctxt->numa_id); + } + + /* + * We execute the following block if we couldn't find a + * shared context or if context sharing is not required. + */ + if (!ret) { + i_minor = iminor(file_inode(fp)) - HFI1_USER_MINOR_BASE; + ret = get_user_context(fp, uinfo, i_minor); + } +done_unlock: + mutex_unlock(&hfi1_mutex); +done: + return ret; +} + +static int get_user_context(struct file *fp, struct hfi1_user_info *uinfo, + int devno) +{ + struct hfi1_devdata *dd = NULL; + int devmax, npresent, nup; + + devmax = hfi1_count_units(&npresent, &nup); + if (!npresent) + return -ENXIO; + + if (!nup) + return -ENETDOWN; + + dd = hfi1_lookup(devno); + if (!dd) + return -ENODEV; + else if (!dd->freectxts) + return -EBUSY; + + return allocate_ctxt(fp, dd, uinfo); +} + +static int find_shared_ctxt(struct file *fp, + const struct hfi1_user_info *uinfo) +{ + int devmax, ndev, i; + int ret = 0; + struct hfi1_filedata *fd = fp->private_data; + + devmax = hfi1_count_units(NULL, NULL); + + for (ndev = 0; ndev < devmax; ndev++) { + struct hfi1_devdata *dd = hfi1_lookup(ndev); + + if (!(dd && (dd->flags & HFI1_PRESENT) && dd->kregbase)) + continue; + for (i = dd->first_user_ctxt; i < dd->num_rcv_contexts; i++) { + struct hfi1_ctxtdata *uctxt = dd->rcd[i]; + + /* Skip ctxts which are not yet open */ + if (!uctxt || !uctxt->cnt) + continue; + /* Skip ctxt if it doesn't match the requested one */ + if (memcmp(uctxt->uuid, uinfo->uuid, + sizeof(uctxt->uuid)) || + uctxt->jkey != generate_jkey(current_uid()) || + uctxt->subctxt_id != uinfo->subctxt_id || + uctxt->subctxt_cnt != uinfo->subctxt_cnt) + continue; + + /* Verify the sharing process matches the master */ + if (uctxt->userversion != uinfo->userversion || + uctxt->cnt >= uctxt->subctxt_cnt) { + ret = -EINVAL; + goto done; + } + fd->uctxt = uctxt; + fd->subctxt = uctxt->cnt++; + uctxt->subpid[fd->subctxt] = current->pid; + uctxt->active_slaves |= 1 << fd->subctxt; + ret = 1; + goto done; + } + } + +done: + return ret; +} + +static int allocate_ctxt(struct file *fp, struct hfi1_devdata *dd, + struct hfi1_user_info *uinfo) +{ + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt; + unsigned ctxt; + int ret, numa; + + if (dd->flags & HFI1_FROZEN) { + /* + * Pick an error that is unique from all other errors + * that are returned so the user process knows that + * it tried to allocate while the SPC was frozen. It + * it should be able to retry with success in a short + * while. + */ + return -EIO; + } + + for (ctxt = dd->first_user_ctxt; ctxt < dd->num_rcv_contexts; ctxt++) + if (!dd->rcd[ctxt]) + break; + + if (ctxt == dd->num_rcv_contexts) + return -EBUSY; + + fd->rec_cpu_num = hfi1_get_proc_affinity(dd, -1); + if (fd->rec_cpu_num != -1) + numa = cpu_to_node(fd->rec_cpu_num); + else + numa = numa_node_id(); + uctxt = hfi1_create_ctxtdata(dd->pport, ctxt, numa); + if (!uctxt) { + dd_dev_err(dd, + "Unable to allocate ctxtdata memory, failing open\n"); + return -ENOMEM; + } + hfi1_cdbg(PROC, "[%u:%u] pid %u assigned to CPU %d (NUMA %u)", + uctxt->ctxt, fd->subctxt, current->pid, fd->rec_cpu_num, + uctxt->numa_id); + + /* + * Allocate and enable a PIO send context. + */ + uctxt->sc = sc_alloc(dd, SC_USER, uctxt->rcvhdrqentsize, + uctxt->dd->node); + if (!uctxt->sc) + return -ENOMEM; + + hfi1_cdbg(PROC, "allocated send context %u(%u)\n", uctxt->sc->sw_index, + uctxt->sc->hw_context); + ret = sc_enable(uctxt->sc); + if (ret) + return ret; + /* + * Setup shared context resources if the user-level has requested + * shared contexts and this is the 'master' process. + * This has to be done here so the rest of the sub-contexts find the + * proper master. + */ + if (uinfo->subctxt_cnt && !fd->subctxt) { + ret = init_subctxts(uctxt, uinfo); + /* + * On error, we don't need to disable and de-allocate the + * send context because it will be done during file close + */ + if (ret) + return ret; + } + uctxt->userversion = uinfo->userversion; + uctxt->pid = current->pid; + uctxt->flags = HFI1_CAP_UGET(MASK); + init_waitqueue_head(&uctxt->wait); + strlcpy(uctxt->comm, current->comm, sizeof(uctxt->comm)); + memcpy(uctxt->uuid, uinfo->uuid, sizeof(uctxt->uuid)); + uctxt->jkey = generate_jkey(current_uid()); + INIT_LIST_HEAD(&uctxt->sdma_queues); + spin_lock_init(&uctxt->sdma_qlock); + hfi1_stats.sps_ctxts++; + /* + * Disable ASPM when there are open user/PSM contexts to avoid + * issues with ASPM L1 exit latency + */ + if (dd->freectxts-- == dd->num_user_contexts) + aspm_disable_all(dd); + fd->uctxt = uctxt; + + return 0; +} + +static int init_subctxts(struct hfi1_ctxtdata *uctxt, + const struct hfi1_user_info *uinfo) +{ + unsigned num_subctxts; + + num_subctxts = uinfo->subctxt_cnt; + if (num_subctxts > HFI1_MAX_SHARED_CTXTS) + return -EINVAL; + + uctxt->subctxt_cnt = uinfo->subctxt_cnt; + uctxt->subctxt_id = uinfo->subctxt_id; + uctxt->active_slaves = 1; + uctxt->redirect_seq_cnt = 1; + set_bit(HFI1_CTXT_MASTER_UNINIT, &uctxt->event_flags); + + return 0; +} + +static int setup_subctxt(struct hfi1_ctxtdata *uctxt) +{ + int ret = 0; + unsigned num_subctxts = uctxt->subctxt_cnt; + + uctxt->subctxt_uregbase = vmalloc_user(PAGE_SIZE); + if (!uctxt->subctxt_uregbase) { + ret = -ENOMEM; + goto bail; + } + /* We can take the size of the RcvHdr Queue from the master */ + uctxt->subctxt_rcvhdr_base = vmalloc_user(uctxt->rcvhdrq_size * + num_subctxts); + if (!uctxt->subctxt_rcvhdr_base) { + ret = -ENOMEM; + goto bail_ureg; + } + + uctxt->subctxt_rcvegrbuf = vmalloc_user(uctxt->egrbufs.size * + num_subctxts); + if (!uctxt->subctxt_rcvegrbuf) { + ret = -ENOMEM; + goto bail_rhdr; + } + goto bail; +bail_rhdr: + vfree(uctxt->subctxt_rcvhdr_base); +bail_ureg: + vfree(uctxt->subctxt_uregbase); + uctxt->subctxt_uregbase = NULL; +bail: + return ret; +} + +static int user_init(struct file *fp) +{ + unsigned int rcvctrl_ops = 0; + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + + /* make sure that the context has already been setup */ + if (!test_bit(HFI1_CTXT_SETUP_DONE, &uctxt->event_flags)) + return -EFAULT; + + /* initialize poll variables... */ + uctxt->urgent = 0; + uctxt->urgent_poll = 0; + + /* + * Now enable the ctxt for receive. + * For chips that are set to DMA the tail register to memory + * when they change (and when the update bit transitions from + * 0 to 1. So for those chips, we turn it off and then back on. + * This will (very briefly) affect any other open ctxts, but the + * duration is very short, and therefore isn't an issue. We + * explicitly set the in-memory tail copy to 0 beforehand, so we + * don't have to wait to be sure the DMA update has happened + * (chip resets head/tail to 0 on transition to enable). + */ + if (uctxt->rcvhdrtail_kvaddr) + clear_rcvhdrtail(uctxt); + + /* Setup J_KEY before enabling the context */ + hfi1_set_ctxt_jkey(uctxt->dd, uctxt->ctxt, uctxt->jkey); + + rcvctrl_ops = HFI1_RCVCTRL_CTXT_ENB; + if (HFI1_CAP_KGET_MASK(uctxt->flags, HDRSUPP)) + rcvctrl_ops |= HFI1_RCVCTRL_TIDFLOW_ENB; + /* + * Ignore the bit in the flags for now until proper + * support for multiple packet per rcv array entry is + * added. + */ + if (!HFI1_CAP_KGET_MASK(uctxt->flags, MULTI_PKT_EGR)) + rcvctrl_ops |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB; + if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_EGR_FULL)) + rcvctrl_ops |= HFI1_RCVCTRL_NO_EGR_DROP_ENB; + if (HFI1_CAP_KGET_MASK(uctxt->flags, NODROP_RHQ_FULL)) + rcvctrl_ops |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB; + /* + * The RcvCtxtCtrl.TailUpd bit has to be explicitly written. + * We can't rely on the correct value to be set from prior + * uses of the chip or ctxt. Therefore, add the rcvctrl op + * for both cases. + */ + if (HFI1_CAP_KGET_MASK(uctxt->flags, DMA_RTAIL)) + rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_ENB; + else + rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_DIS; + hfi1_rcvctrl(uctxt->dd, rcvctrl_ops, uctxt->ctxt); + + /* Notify any waiting slaves */ + if (uctxt->subctxt_cnt) { + clear_bit(HFI1_CTXT_MASTER_UNINIT, &uctxt->event_flags); + wake_up(&uctxt->wait); + } + + return 0; +} + +static int get_ctxt_info(struct file *fp, void __user *ubase, __u32 len) +{ + struct hfi1_ctxt_info cinfo; + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + int ret = 0; + + memset(&cinfo, 0, sizeof(cinfo)); + ret = hfi1_get_base_kinfo(uctxt, &cinfo); + if (ret < 0) + goto done; + cinfo.num_active = hfi1_count_active_units(); + cinfo.unit = uctxt->dd->unit; + cinfo.ctxt = uctxt->ctxt; + cinfo.subctxt = fd->subctxt; + cinfo.rcvtids = roundup(uctxt->egrbufs.alloced, + uctxt->dd->rcv_entries.group_size) + + uctxt->expected_count; + cinfo.credits = uctxt->sc->credits; + cinfo.numa_node = uctxt->numa_id; + cinfo.rec_cpu = fd->rec_cpu_num; + cinfo.send_ctxt = uctxt->sc->hw_context; + + cinfo.egrtids = uctxt->egrbufs.alloced; + cinfo.rcvhdrq_cnt = uctxt->rcvhdrq_cnt; + cinfo.rcvhdrq_entsize = uctxt->rcvhdrqentsize << 2; + cinfo.sdma_ring_size = fd->cq->nentries; + cinfo.rcvegr_size = uctxt->egrbufs.rcvtid_size; + + trace_hfi1_ctxt_info(uctxt->dd, uctxt->ctxt, fd->subctxt, cinfo); + if (copy_to_user(ubase, &cinfo, sizeof(cinfo))) + ret = -EFAULT; +done: + return ret; +} + +static int setup_ctxt(struct file *fp) +{ + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_devdata *dd = uctxt->dd; + int ret = 0; + + /* + * Context should be set up only once, including allocation and + * programming of eager buffers. This is done if context sharing + * is not requested or by the master process. + */ + if (!uctxt->subctxt_cnt || !fd->subctxt) { + ret = hfi1_init_ctxt(uctxt->sc); + if (ret) + goto done; + + /* Now allocate the RcvHdr queue and eager buffers. */ + ret = hfi1_create_rcvhdrq(dd, uctxt); + if (ret) + goto done; + ret = hfi1_setup_eagerbufs(uctxt); + if (ret) + goto done; + if (uctxt->subctxt_cnt && !fd->subctxt) { + ret = setup_subctxt(uctxt); + if (ret) + goto done; + } + } else { + ret = wait_event_interruptible(uctxt->wait, !test_bit( + HFI1_CTXT_MASTER_UNINIT, + &uctxt->event_flags)); + if (ret) + goto done; + } + + ret = hfi1_user_sdma_alloc_queues(uctxt, fp); + if (ret) + goto done; + /* + * Expected receive has to be setup for all processes (including + * shared contexts). However, it has to be done after the master + * context has been fully configured as it depends on the + * eager/expected split of the RcvArray entries. + * Setting it up here ensures that the subcontexts will be waiting + * (due to the above wait_event_interruptible() until the master + * is setup. + */ + ret = hfi1_user_exp_rcv_init(fp); + if (ret) + goto done; + + set_bit(HFI1_CTXT_SETUP_DONE, &uctxt->event_flags); +done: + return ret; +} + +static int get_base_info(struct file *fp, void __user *ubase, __u32 len) +{ + struct hfi1_base_info binfo; + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_devdata *dd = uctxt->dd; + ssize_t sz; + unsigned offset; + int ret = 0; + + trace_hfi1_uctxtdata(uctxt->dd, uctxt); + + memset(&binfo, 0, sizeof(binfo)); + binfo.hw_version = dd->revision; + binfo.sw_version = HFI1_KERN_SWVERSION; + binfo.bthqp = kdeth_qp; + binfo.jkey = uctxt->jkey; + /* + * If more than 64 contexts are enabled the allocated credit + * return will span two or three contiguous pages. Since we only + * map the page containing the context's credit return address, + * we need to calculate the offset in the proper page. + */ + offset = ((u64)uctxt->sc->hw_free - + (u64)dd->cr_base[uctxt->numa_id].va) % PAGE_SIZE; + binfo.sc_credits_addr = HFI1_MMAP_TOKEN(PIO_CRED, uctxt->ctxt, + fd->subctxt, offset); + binfo.pio_bufbase = HFI1_MMAP_TOKEN(PIO_BUFS, uctxt->ctxt, + fd->subctxt, + uctxt->sc->base_addr); + binfo.pio_bufbase_sop = HFI1_MMAP_TOKEN(PIO_BUFS_SOP, + uctxt->ctxt, + fd->subctxt, + uctxt->sc->base_addr); + binfo.rcvhdr_bufbase = HFI1_MMAP_TOKEN(RCV_HDRQ, uctxt->ctxt, + fd->subctxt, + uctxt->rcvhdrq); + binfo.rcvegr_bufbase = HFI1_MMAP_TOKEN(RCV_EGRBUF, uctxt->ctxt, + fd->subctxt, + uctxt->egrbufs.rcvtids[0].phys); + binfo.sdma_comp_bufbase = HFI1_MMAP_TOKEN(SDMA_COMP, uctxt->ctxt, + fd->subctxt, 0); + /* + * user regs are at + * (RXE_PER_CONTEXT_USER + (ctxt * RXE_PER_CONTEXT_SIZE)) + */ + binfo.user_regbase = HFI1_MMAP_TOKEN(UREGS, uctxt->ctxt, + fd->subctxt, 0); + offset = offset_in_page((((uctxt->ctxt - dd->first_user_ctxt) * + HFI1_MAX_SHARED_CTXTS) + fd->subctxt) * + sizeof(*dd->events)); + binfo.events_bufbase = HFI1_MMAP_TOKEN(EVENTS, uctxt->ctxt, + fd->subctxt, + offset); + binfo.status_bufbase = HFI1_MMAP_TOKEN(STATUS, uctxt->ctxt, + fd->subctxt, + dd->status); + if (HFI1_CAP_IS_USET(DMA_RTAIL)) + binfo.rcvhdrtail_base = HFI1_MMAP_TOKEN(RTAIL, uctxt->ctxt, + fd->subctxt, 0); + if (uctxt->subctxt_cnt) { + binfo.subctxt_uregbase = HFI1_MMAP_TOKEN(SUBCTXT_UREGS, + uctxt->ctxt, + fd->subctxt, 0); + binfo.subctxt_rcvhdrbuf = HFI1_MMAP_TOKEN(SUBCTXT_RCV_HDRQ, + uctxt->ctxt, + fd->subctxt, 0); + binfo.subctxt_rcvegrbuf = HFI1_MMAP_TOKEN(SUBCTXT_EGRBUF, + uctxt->ctxt, + fd->subctxt, 0); + } + sz = (len < sizeof(binfo)) ? len : sizeof(binfo); + if (copy_to_user(ubase, &binfo, sz)) + ret = -EFAULT; + return ret; +} + +static unsigned int poll_urgent(struct file *fp, + struct poll_table_struct *pt) +{ + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_devdata *dd = uctxt->dd; + unsigned pollflag; + + poll_wait(fp, &uctxt->wait, pt); + + spin_lock_irq(&dd->uctxt_lock); + if (uctxt->urgent != uctxt->urgent_poll) { + pollflag = POLLIN | POLLRDNORM; + uctxt->urgent_poll = uctxt->urgent; + } else { + pollflag = 0; + set_bit(HFI1_CTXT_WAITING_URG, &uctxt->event_flags); + } + spin_unlock_irq(&dd->uctxt_lock); + + return pollflag; +} + +static unsigned int poll_next(struct file *fp, + struct poll_table_struct *pt) +{ + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_devdata *dd = uctxt->dd; + unsigned pollflag; + + poll_wait(fp, &uctxt->wait, pt); + + spin_lock_irq(&dd->uctxt_lock); + if (hdrqempty(uctxt)) { + set_bit(HFI1_CTXT_WAITING_RCV, &uctxt->event_flags); + hfi1_rcvctrl(dd, HFI1_RCVCTRL_INTRAVAIL_ENB, uctxt->ctxt); + pollflag = 0; + } else { + pollflag = POLLIN | POLLRDNORM; + } + spin_unlock_irq(&dd->uctxt_lock); + + return pollflag; +} + +/* + * Find all user contexts in use, and set the specified bit in their + * event mask. + * See also find_ctxt() for a similar use, that is specific to send buffers. + */ +int hfi1_set_uevent_bits(struct hfi1_pportdata *ppd, const int evtbit) +{ + struct hfi1_ctxtdata *uctxt; + struct hfi1_devdata *dd = ppd->dd; + unsigned ctxt; + int ret = 0; + unsigned long flags; + + if (!dd->events) { + ret = -EINVAL; + goto done; + } + + spin_lock_irqsave(&dd->uctxt_lock, flags); + for (ctxt = dd->first_user_ctxt; ctxt < dd->num_rcv_contexts; + ctxt++) { + uctxt = dd->rcd[ctxt]; + if (uctxt) { + unsigned long *evs = dd->events + + (uctxt->ctxt - dd->first_user_ctxt) * + HFI1_MAX_SHARED_CTXTS; + int i; + /* + * subctxt_cnt is 0 if not shared, so do base + * separately, first, then remaining subctxt, if any + */ + set_bit(evtbit, evs); + for (i = 1; i < uctxt->subctxt_cnt; i++) + set_bit(evtbit, evs + i); + } + } + spin_unlock_irqrestore(&dd->uctxt_lock, flags); +done: + return ret; +} + +/** + * manage_rcvq - manage a context's receive queue + * @uctxt: the context + * @subctxt: the sub-context + * @start_stop: action to carry out + * + * start_stop == 0 disables receive on the context, for use in queue + * overflow conditions. start_stop==1 re-enables, to be used to + * re-init the software copy of the head register + */ +static int manage_rcvq(struct hfi1_ctxtdata *uctxt, unsigned subctxt, + int start_stop) +{ + struct hfi1_devdata *dd = uctxt->dd; + unsigned int rcvctrl_op; + + if (subctxt) + goto bail; + /* atomically clear receive enable ctxt. */ + if (start_stop) { + /* + * On enable, force in-memory copy of the tail register to + * 0, so that protocol code doesn't have to worry about + * whether or not the chip has yet updated the in-memory + * copy or not on return from the system call. The chip + * always resets it's tail register back to 0 on a + * transition from disabled to enabled. + */ + if (uctxt->rcvhdrtail_kvaddr) + clear_rcvhdrtail(uctxt); + rcvctrl_op = HFI1_RCVCTRL_CTXT_ENB; + } else { + rcvctrl_op = HFI1_RCVCTRL_CTXT_DIS; + } + hfi1_rcvctrl(dd, rcvctrl_op, uctxt->ctxt); + /* always; new head should be equal to new tail; see above */ +bail: + return 0; +} + +/* + * clear the event notifier events for this context. + * User process then performs actions appropriate to bit having been + * set, if desired, and checks again in future. + */ +static int user_event_ack(struct hfi1_ctxtdata *uctxt, int subctxt, + unsigned long events) +{ + int i; + struct hfi1_devdata *dd = uctxt->dd; + unsigned long *evs; + + if (!dd->events) + return 0; + + evs = dd->events + ((uctxt->ctxt - dd->first_user_ctxt) * + HFI1_MAX_SHARED_CTXTS) + subctxt; + + for (i = 0; i <= _HFI1_MAX_EVENT_BIT; i++) { + if (!test_bit(i, &events)) + continue; + clear_bit(i, evs); + } + return 0; +} + +static int set_ctxt_pkey(struct hfi1_ctxtdata *uctxt, unsigned subctxt, + u16 pkey) +{ + int ret = -ENOENT, i, intable = 0; + struct hfi1_pportdata *ppd = uctxt->ppd; + struct hfi1_devdata *dd = uctxt->dd; + + if (pkey == LIM_MGMT_P_KEY || pkey == FULL_MGMT_P_KEY) { + ret = -EINVAL; + goto done; + } + + for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) + if (pkey == ppd->pkeys[i]) { + intable = 1; + break; + } + + if (intable) + ret = hfi1_set_ctxt_pkey(dd, uctxt->ctxt, pkey); +done: + return ret; +} + +static void user_remove(struct hfi1_devdata *dd) +{ + + hfi1_cdev_cleanup(&dd->user_cdev, &dd->user_device); +} + +static int user_add(struct hfi1_devdata *dd) +{ + char name[10]; + int ret; + + snprintf(name, sizeof(name), "%s_%d", class_name(), dd->unit); + ret = hfi1_cdev_init(dd->unit, name, &hfi1_file_ops, + &dd->user_cdev, &dd->user_device, + true, &dd->kobj); + if (ret) + user_remove(dd); + + return ret; +} + +/* + * Create per-unit files in /dev + */ +int hfi1_device_create(struct hfi1_devdata *dd) +{ + return user_add(dd); +} + +/* + * Remove per-unit files in /dev + * void, core kernel returns no errors for this stuff + */ +void hfi1_device_remove(struct hfi1_devdata *dd) +{ + user_remove(dd); +} diff --git a/drivers/infiniband/hw/hfi1/firmware.c b/drivers/infiniband/hw/hfi1/firmware.c new file mode 100644 index 000000000..cbd965cfa --- /dev/null +++ b/drivers/infiniband/hw/hfi1/firmware.c @@ -0,0 +1,2056 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <linux/firmware.h> +#include <linux/mutex.h> +#include <linux/module.h> +#include <linux/delay.h> +#include <linux/crc32.h> + +#include "hfi.h" +#include "trace.h" + +/* + * Make it easy to toggle firmware file name and if it gets loaded by + * editing the following. This may be something we do while in development + * but not necessarily something a user would ever need to use. + */ +#define DEFAULT_FW_8051_NAME_FPGA "/*(DEBLOBBED)*/" +#define DEFAULT_FW_8051_NAME_ASIC "/*(DEBLOBBED)*/" +#define DEFAULT_FW_FABRIC_NAME "/*(DEBLOBBED)*/" +#define DEFAULT_FW_SBUS_NAME "/*(DEBLOBBED)*/" +#define DEFAULT_FW_PCIE_NAME "/*(DEBLOBBED)*/" +#define DEFAULT_PLATFORM_CONFIG_NAME "/*(DEBLOBBED)*/" +#define ALT_FW_8051_NAME_ASIC "/*(DEBLOBBED)*/" +#define ALT_FW_FABRIC_NAME "/*(DEBLOBBED)*/" +#define ALT_FW_SBUS_NAME "/*(DEBLOBBED)*/" +#define ALT_FW_PCIE_NAME "/*(DEBLOBBED)*/" + +static uint fw_8051_load = 1; +static uint fw_fabric_serdes_load = 1; +static uint fw_pcie_serdes_load = 1; +static uint fw_sbus_load = 1; + +/* + * Access required in platform.c + * Maintains state of whether the platform config was fetched via the + * fallback option + */ +uint platform_config_load; + +/* Firmware file names get set in hfi1_firmware_init() based on the above */ +static char *fw_8051_name; +static char *fw_fabric_serdes_name; +static char *fw_sbus_name; +static char *fw_pcie_serdes_name; +static char *platform_config_name; + +#define SBUS_MAX_POLL_COUNT 100 +#define SBUS_COUNTER(reg, name) \ + (((reg) >> ASIC_STS_SBUS_COUNTERS_##name##_CNT_SHIFT) & \ + ASIC_STS_SBUS_COUNTERS_##name##_CNT_MASK) + +/* + * Firmware security header. + */ +struct css_header { + u32 module_type; + u32 header_len; + u32 header_version; + u32 module_id; + u32 module_vendor; + u32 date; /* BCD yyyymmdd */ + u32 size; /* in DWORDs */ + u32 key_size; /* in DWORDs */ + u32 modulus_size; /* in DWORDs */ + u32 exponent_size; /* in DWORDs */ + u32 reserved[22]; +}; + +/* expected field values */ +#define CSS_MODULE_TYPE 0x00000006 +#define CSS_HEADER_LEN 0x000000a1 +#define CSS_HEADER_VERSION 0x00010000 +#define CSS_MODULE_VENDOR 0x00008086 + +#define KEY_SIZE 256 +#define MU_SIZE 8 +#define EXPONENT_SIZE 4 + +/* the file itself */ +struct firmware_file { + struct css_header css_header; + u8 modulus[KEY_SIZE]; + u8 exponent[EXPONENT_SIZE]; + u8 signature[KEY_SIZE]; + u8 firmware[]; +}; + +struct augmented_firmware_file { + struct css_header css_header; + u8 modulus[KEY_SIZE]; + u8 exponent[EXPONENT_SIZE]; + u8 signature[KEY_SIZE]; + u8 r2[KEY_SIZE]; + u8 mu[MU_SIZE]; + u8 firmware[]; +}; + +/* augmented file size difference */ +#define AUGMENT_SIZE (sizeof(struct augmented_firmware_file) - \ + sizeof(struct firmware_file)) + +struct firmware_details { + /* Linux core piece */ + const struct firmware *fw; + + struct css_header *css_header; + u8 *firmware_ptr; /* pointer to binary data */ + u32 firmware_len; /* length in bytes */ + u8 *modulus; /* pointer to the modulus */ + u8 *exponent; /* pointer to the exponent */ + u8 *signature; /* pointer to the signature */ + u8 *r2; /* pointer to r2 */ + u8 *mu; /* pointer to mu */ + struct augmented_firmware_file dummy_header; +}; + +/* + * The mutex protects fw_state, fw_err, and all of the firmware_details + * variables. + */ +static DEFINE_MUTEX(fw_mutex); +enum fw_state { + FW_EMPTY, + FW_TRY, + FW_FINAL, + FW_ERR +}; + +static enum fw_state fw_state = FW_EMPTY; +static int fw_err; +static struct firmware_details fw_8051; +static struct firmware_details fw_fabric; +static struct firmware_details fw_pcie; +static struct firmware_details fw_sbus; +static const struct firmware *platform_config; + +/* flags for turn_off_spicos() */ +#define SPICO_SBUS 0x1 +#define SPICO_FABRIC 0x2 +#define ENABLE_SPICO_SMASK 0x1 + +/* security block commands */ +#define RSA_CMD_INIT 0x1 +#define RSA_CMD_START 0x2 + +/* security block status */ +#define RSA_STATUS_IDLE 0x0 +#define RSA_STATUS_ACTIVE 0x1 +#define RSA_STATUS_DONE 0x2 +#define RSA_STATUS_FAILED 0x3 + +/* RSA engine timeout, in ms */ +#define RSA_ENGINE_TIMEOUT 100 /* ms */ + +/* hardware mutex timeout, in ms */ +#define HM_TIMEOUT 10 /* ms */ + +/* 8051 memory access timeout, in us */ +#define DC8051_ACCESS_TIMEOUT 100 /* us */ + +/* the number of fabric SerDes on the SBus */ +#define NUM_FABRIC_SERDES 4 + +/* SBus fabric SerDes addresses, one set per HFI */ +static const u8 fabric_serdes_addrs[2][NUM_FABRIC_SERDES] = { + { 0x01, 0x02, 0x03, 0x04 }, + { 0x28, 0x29, 0x2a, 0x2b } +}; + +/* SBus PCIe SerDes addresses, one set per HFI */ +static const u8 pcie_serdes_addrs[2][NUM_PCIE_SERDES] = { + { 0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12, 0x14, 0x16, + 0x18, 0x1a, 0x1c, 0x1e, 0x20, 0x22, 0x24, 0x26 }, + { 0x2f, 0x31, 0x33, 0x35, 0x37, 0x39, 0x3b, 0x3d, + 0x3f, 0x41, 0x43, 0x45, 0x47, 0x49, 0x4b, 0x4d } +}; + +/* SBus PCIe PCS addresses, one set per HFI */ +const u8 pcie_pcs_addrs[2][NUM_PCIE_SERDES] = { + { 0x09, 0x0b, 0x0d, 0x0f, 0x11, 0x13, 0x15, 0x17, + 0x19, 0x1b, 0x1d, 0x1f, 0x21, 0x23, 0x25, 0x27 }, + { 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e, + 0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e } +}; + +/* SBus fabric SerDes broadcast addresses, one per HFI */ +static const u8 fabric_serdes_broadcast[2] = { 0xe4, 0xe5 }; +static const u8 all_fabric_serdes_broadcast = 0xe1; + +/* SBus PCIe SerDes broadcast addresses, one per HFI */ +const u8 pcie_serdes_broadcast[2] = { 0xe2, 0xe3 }; +static const u8 all_pcie_serdes_broadcast = 0xe0; + +/* forwards */ +static void dispose_one_firmware(struct firmware_details *fdet); +static int load_fabric_serdes_firmware(struct hfi1_devdata *dd, + struct firmware_details *fdet); + +/* + * Read a single 64-bit value from 8051 data memory. + * + * Expects: + * o caller to have already set up data read, no auto increment + * o caller to turn off read enable when finished + * + * The address argument is a byte offset. Bits 0:2 in the address are + * ignored - i.e. the hardware will always do aligned 8-byte reads as if + * the lower bits are zero. + * + * Return 0 on success, -ENXIO on a read error (timeout). + */ +static int __read_8051_data(struct hfi1_devdata *dd, u32 addr, u64 *result) +{ + u64 reg; + int count; + + /* start the read at the given address */ + reg = ((addr & DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_MASK) + << DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_SHIFT) + | DC_DC8051_CFG_RAM_ACCESS_CTRL_READ_ENA_SMASK; + write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, reg); + + /* wait until ACCESS_COMPLETED is set */ + count = 0; + while ((read_csr(dd, DC_DC8051_CFG_RAM_ACCESS_STATUS) + & DC_DC8051_CFG_RAM_ACCESS_STATUS_ACCESS_COMPLETED_SMASK) + == 0) { + count++; + if (count > DC8051_ACCESS_TIMEOUT) { + dd_dev_err(dd, "timeout reading 8051 data\n"); + return -ENXIO; + } + ndelay(10); + } + + /* gather the data */ + *result = read_csr(dd, DC_DC8051_CFG_RAM_ACCESS_RD_DATA); + + return 0; +} + +/* + * Read 8051 data starting at addr, for len bytes. Will read in 8-byte chunks. + * Return 0 on success, -errno on error. + */ +int read_8051_data(struct hfi1_devdata *dd, u32 addr, u32 len, u64 *result) +{ + unsigned long flags; + u32 done; + int ret = 0; + + spin_lock_irqsave(&dd->dc8051_memlock, flags); + + /* data read set-up, no auto-increment */ + write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_SETUP, 0); + + for (done = 0; done < len; addr += 8, done += 8, result++) { + ret = __read_8051_data(dd, addr, result); + if (ret) + break; + } + + /* turn off read enable */ + write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, 0); + + spin_unlock_irqrestore(&dd->dc8051_memlock, flags); + + return ret; +} + +/* + * Write data or code to the 8051 code or data RAM. + */ +static int write_8051(struct hfi1_devdata *dd, int code, u32 start, + const u8 *data, u32 len) +{ + u64 reg; + u32 offset; + int aligned, count; + + /* check alignment */ + aligned = ((unsigned long)data & 0x7) == 0; + + /* write set-up */ + reg = (code ? DC_DC8051_CFG_RAM_ACCESS_SETUP_RAM_SEL_SMASK : 0ull) + | DC_DC8051_CFG_RAM_ACCESS_SETUP_AUTO_INCR_ADDR_SMASK; + write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_SETUP, reg); + + reg = ((start & DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_MASK) + << DC_DC8051_CFG_RAM_ACCESS_CTRL_ADDRESS_SHIFT) + | DC_DC8051_CFG_RAM_ACCESS_CTRL_WRITE_ENA_SMASK; + write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, reg); + + /* write */ + for (offset = 0; offset < len; offset += 8) { + int bytes = len - offset; + + if (bytes < 8) { + reg = 0; + memcpy(®, &data[offset], bytes); + } else if (aligned) { + reg = *(u64 *)&data[offset]; + } else { + memcpy(®, &data[offset], 8); + } + write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_WR_DATA, reg); + + /* wait until ACCESS_COMPLETED is set */ + count = 0; + while ((read_csr(dd, DC_DC8051_CFG_RAM_ACCESS_STATUS) + & DC_DC8051_CFG_RAM_ACCESS_STATUS_ACCESS_COMPLETED_SMASK) + == 0) { + count++; + if (count > DC8051_ACCESS_TIMEOUT) { + dd_dev_err(dd, "timeout writing 8051 data\n"); + return -ENXIO; + } + udelay(1); + } + } + + /* turn off write access, auto increment (also sets to data access) */ + write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_CTRL, 0); + write_csr(dd, DC_DC8051_CFG_RAM_ACCESS_SETUP, 0); + + return 0; +} + +/* return 0 if values match, non-zero and complain otherwise */ +static int invalid_header(struct hfi1_devdata *dd, const char *what, + u32 actual, u32 expected) +{ + if (actual == expected) + return 0; + + dd_dev_err(dd, + "invalid firmware header field %s: expected 0x%x, actual 0x%x\n", + what, expected, actual); + return 1; +} + +/* + * Verify that the static fields in the CSS header match. + */ +static int verify_css_header(struct hfi1_devdata *dd, struct css_header *css) +{ + /* verify CSS header fields (most sizes are in DW, so add /4) */ + if (invalid_header(dd, "module_type", css->module_type, + CSS_MODULE_TYPE) || + invalid_header(dd, "header_len", css->header_len, + (sizeof(struct firmware_file) / 4)) || + invalid_header(dd, "header_version", css->header_version, + CSS_HEADER_VERSION) || + invalid_header(dd, "module_vendor", css->module_vendor, + CSS_MODULE_VENDOR) || + invalid_header(dd, "key_size", css->key_size, KEY_SIZE / 4) || + invalid_header(dd, "modulus_size", css->modulus_size, + KEY_SIZE / 4) || + invalid_header(dd, "exponent_size", css->exponent_size, + EXPONENT_SIZE / 4)) { + return -EINVAL; + } + return 0; +} + +/* + * Make sure there are at least some bytes after the prefix. + */ +static int payload_check(struct hfi1_devdata *dd, const char *name, + long file_size, long prefix_size) +{ + /* make sure we have some payload */ + if (prefix_size >= file_size) { + dd_dev_err(dd, + "firmware \"%s\", size %ld, must be larger than %ld bytes\n", + name, file_size, prefix_size); + return -EINVAL; + } + + return 0; +} + +/* + * Request the firmware from the system. Extract the pieces and fill in + * fdet. If successful, the caller will need to call dispose_one_firmware(). + * Returns 0 on success, -ERRNO on error. + */ +static int obtain_one_firmware(struct hfi1_devdata *dd, const char *name, + struct firmware_details *fdet) +{ + struct css_header *css; + int ret; + + memset(fdet, 0, sizeof(*fdet)); + + ret = reject_firmware(&fdet->fw, name, &dd->pcidev->dev); + if (ret) { + dd_dev_warn(dd, "cannot find firmware \"%s\", err %d\n", + name, ret); + return ret; + } + + /* verify the firmware */ + if (fdet->fw->size < sizeof(struct css_header)) { + dd_dev_err(dd, "firmware \"%s\" is too small\n", name); + ret = -EINVAL; + goto done; + } + css = (struct css_header *)fdet->fw->data; + + hfi1_cdbg(FIRMWARE, "Firmware %s details:", name); + hfi1_cdbg(FIRMWARE, "file size: 0x%lx bytes", fdet->fw->size); + hfi1_cdbg(FIRMWARE, "CSS structure:"); + hfi1_cdbg(FIRMWARE, " module_type 0x%x", css->module_type); + hfi1_cdbg(FIRMWARE, " header_len 0x%03x (0x%03x bytes)", + css->header_len, 4 * css->header_len); + hfi1_cdbg(FIRMWARE, " header_version 0x%x", css->header_version); + hfi1_cdbg(FIRMWARE, " module_id 0x%x", css->module_id); + hfi1_cdbg(FIRMWARE, " module_vendor 0x%x", css->module_vendor); + hfi1_cdbg(FIRMWARE, " date 0x%x", css->date); + hfi1_cdbg(FIRMWARE, " size 0x%03x (0x%03x bytes)", + css->size, 4 * css->size); + hfi1_cdbg(FIRMWARE, " key_size 0x%03x (0x%03x bytes)", + css->key_size, 4 * css->key_size); + hfi1_cdbg(FIRMWARE, " modulus_size 0x%03x (0x%03x bytes)", + css->modulus_size, 4 * css->modulus_size); + hfi1_cdbg(FIRMWARE, " exponent_size 0x%03x (0x%03x bytes)", + css->exponent_size, 4 * css->exponent_size); + hfi1_cdbg(FIRMWARE, "firmware size: 0x%lx bytes", + fdet->fw->size - sizeof(struct firmware_file)); + + /* + * If the file does not have a valid CSS header, fail. + * Otherwise, check the CSS size field for an expected size. + * The augmented file has r2 and mu inserted after the header + * was generated, so there will be a known difference between + * the CSS header size and the actual file size. Use this + * difference to identify an augmented file. + * + * Note: css->size is in DWORDs, multiply by 4 to get bytes. + */ + ret = verify_css_header(dd, css); + if (ret) { + dd_dev_info(dd, "Invalid CSS header for \"%s\"\n", name); + } else if ((css->size * 4) == fdet->fw->size) { + /* non-augmented firmware file */ + struct firmware_file *ff = (struct firmware_file *) + fdet->fw->data; + + /* make sure there are bytes in the payload */ + ret = payload_check(dd, name, fdet->fw->size, + sizeof(struct firmware_file)); + if (ret == 0) { + fdet->css_header = css; + fdet->modulus = ff->modulus; + fdet->exponent = ff->exponent; + fdet->signature = ff->signature; + fdet->r2 = fdet->dummy_header.r2; /* use dummy space */ + fdet->mu = fdet->dummy_header.mu; /* use dummy space */ + fdet->firmware_ptr = ff->firmware; + fdet->firmware_len = fdet->fw->size - + sizeof(struct firmware_file); + /* + * Header does not include r2 and mu - generate here. + * For now, fail. + */ + dd_dev_err(dd, "driver is unable to validate firmware without r2 and mu (not in firmware file)\n"); + ret = -EINVAL; + } + } else if ((css->size * 4) + AUGMENT_SIZE == fdet->fw->size) { + /* augmented firmware file */ + struct augmented_firmware_file *aff = + (struct augmented_firmware_file *)fdet->fw->data; + + /* make sure there are bytes in the payload */ + ret = payload_check(dd, name, fdet->fw->size, + sizeof(struct augmented_firmware_file)); + if (ret == 0) { + fdet->css_header = css; + fdet->modulus = aff->modulus; + fdet->exponent = aff->exponent; + fdet->signature = aff->signature; + fdet->r2 = aff->r2; + fdet->mu = aff->mu; + fdet->firmware_ptr = aff->firmware; + fdet->firmware_len = fdet->fw->size - + sizeof(struct augmented_firmware_file); + } + } else { + /* css->size check failed */ + dd_dev_err(dd, + "invalid firmware header field size: expected 0x%lx or 0x%lx, actual 0x%x\n", + fdet->fw->size / 4, + (fdet->fw->size - AUGMENT_SIZE) / 4, + css->size); + + ret = -EINVAL; + } + +done: + /* if returning an error, clean up after ourselves */ + if (ret) + dispose_one_firmware(fdet); + return ret; +} + +static void dispose_one_firmware(struct firmware_details *fdet) +{ + release_firmware(fdet->fw); + /* erase all previous information */ + memset(fdet, 0, sizeof(*fdet)); +} + +/* + * Obtain the 4 firmwares from the OS. All must be obtained at once or not + * at all. If called with the firmware state in FW_TRY, use alternate names. + * On exit, this routine will have set the firmware state to one of FW_TRY, + * FW_FINAL, or FW_ERR. + * + * Must be holding fw_mutex. + */ +static void __obtain_firmware(struct hfi1_devdata *dd) +{ + int err = 0; + + if (fw_state == FW_FINAL) /* nothing more to obtain */ + return; + if (fw_state == FW_ERR) /* already in error */ + return; + + /* fw_state is FW_EMPTY or FW_TRY */ +retry: + if (fw_state == FW_TRY) { + /* + * We tried the original and it failed. Move to the + * alternate. + */ + dd_dev_warn(dd, "using alternate firmware names\n"); + /* + * Let others run. Some systems, when missing firmware, does + * something that holds for 30 seconds. If we do that twice + * in a row it triggers task blocked warning. + */ + cond_resched(); + if (fw_8051_load) + dispose_one_firmware(&fw_8051); + if (fw_fabric_serdes_load) + dispose_one_firmware(&fw_fabric); + if (fw_sbus_load) + dispose_one_firmware(&fw_sbus); + if (fw_pcie_serdes_load) + dispose_one_firmware(&fw_pcie); + fw_8051_name = ALT_FW_8051_NAME_ASIC; + fw_fabric_serdes_name = ALT_FW_FABRIC_NAME; + fw_sbus_name = ALT_FW_SBUS_NAME; + fw_pcie_serdes_name = ALT_FW_PCIE_NAME; + } + + if (fw_sbus_load) { + err = obtain_one_firmware(dd, fw_sbus_name, &fw_sbus); + if (err) + goto done; + } + + if (fw_pcie_serdes_load) { + err = obtain_one_firmware(dd, fw_pcie_serdes_name, &fw_pcie); + if (err) + goto done; + } + + if (fw_fabric_serdes_load) { + err = obtain_one_firmware(dd, fw_fabric_serdes_name, + &fw_fabric); + if (err) + goto done; + } + + if (fw_8051_load) { + err = obtain_one_firmware(dd, fw_8051_name, &fw_8051); + if (err) + goto done; + } + +done: + if (err) { + /* oops, had problems obtaining a firmware */ + if (fw_state == FW_EMPTY && dd->icode == ICODE_RTL_SILICON) { + /* retry with alternate (RTL only) */ + fw_state = FW_TRY; + goto retry; + } + dd_dev_err(dd, "unable to obtain working firmware\n"); + fw_state = FW_ERR; + fw_err = -ENOENT; + } else { + /* success */ + if (fw_state == FW_EMPTY && + dd->icode != ICODE_FUNCTIONAL_SIMULATOR) + fw_state = FW_TRY; /* may retry later */ + else + fw_state = FW_FINAL; /* cannot try again */ + } +} + +/* + * Called by all HFIs when loading their firmware - i.e. device probe time. + * The first one will do the actual firmware load. Use a mutex to resolve + * any possible race condition. + * + * The call to this routine cannot be moved to driver load because the kernel + * call reject_firmware() requires a device which is only available after + * the first device probe. + */ +static int obtain_firmware(struct hfi1_devdata *dd) +{ + unsigned long timeout; + int err = 0; + + mutex_lock(&fw_mutex); + + /* 40s delay due to long delay on missing firmware on some systems */ + timeout = jiffies + msecs_to_jiffies(40000); + while (fw_state == FW_TRY) { + /* + * Another device is trying the firmware. Wait until it + * decides what works (or not). + */ + if (time_after(jiffies, timeout)) { + /* waited too long */ + dd_dev_err(dd, "Timeout waiting for firmware try"); + fw_state = FW_ERR; + fw_err = -ETIMEDOUT; + break; + } + mutex_unlock(&fw_mutex); + msleep(20); /* arbitrary delay */ + mutex_lock(&fw_mutex); + } + /* not in FW_TRY state */ + + if (fw_state == FW_FINAL) { + if (platform_config) { + dd->platform_config.data = platform_config->data; + dd->platform_config.size = platform_config->size; + } + goto done; /* already acquired */ + } else if (fw_state == FW_ERR) { + goto done; /* already tried and failed */ + } + /* fw_state is FW_EMPTY */ + + /* set fw_state to FW_TRY, FW_FINAL, or FW_ERR, and fw_err */ + __obtain_firmware(dd); + + if (platform_config_load) { + platform_config = NULL; + err = reject_firmware(&platform_config, platform_config_name, + &dd->pcidev->dev); + if (err) { + platform_config = NULL; + goto done; + } + dd->platform_config.data = platform_config->data; + dd->platform_config.size = platform_config->size; + } + +done: + mutex_unlock(&fw_mutex); + + return fw_err; +} + +/* + * Called when the driver unloads. The timing is asymmetric with its + * counterpart, obtain_firmware(). If called at device remove time, + * then it is conceivable that another device could probe while the + * firmware is being disposed. The mutexes can be moved to do that + * safely, but then the firmware would be requested from the OS multiple + * times. + * + * No mutex is needed as the driver is unloading and there cannot be any + * other callers. + */ +void dispose_firmware(void) +{ + dispose_one_firmware(&fw_8051); + dispose_one_firmware(&fw_fabric); + dispose_one_firmware(&fw_pcie); + dispose_one_firmware(&fw_sbus); + + release_firmware(platform_config); + platform_config = NULL; + + /* retain the error state, otherwise revert to empty */ + if (fw_state != FW_ERR) + fw_state = FW_EMPTY; +} + +/* + * Called with the result of a firmware download. + * + * Return 1 to retry loading the firmware, 0 to stop. + */ +static int retry_firmware(struct hfi1_devdata *dd, int load_result) +{ + int retry; + + mutex_lock(&fw_mutex); + + if (load_result == 0) { + /* + * The load succeeded, so expect all others to do the same. + * Do not retry again. + */ + if (fw_state == FW_TRY) + fw_state = FW_FINAL; + retry = 0; /* do NOT retry */ + } else if (fw_state == FW_TRY) { + /* load failed, obtain alternate firmware */ + __obtain_firmware(dd); + retry = (fw_state == FW_FINAL); + } else { + /* else in FW_FINAL or FW_ERR, no retry in either case */ + retry = 0; + } + + mutex_unlock(&fw_mutex); + return retry; +} + +/* + * Write a block of data to a given array CSR. All calls will be in + * multiples of 8 bytes. + */ +static void write_rsa_data(struct hfi1_devdata *dd, int what, + const u8 *data, int nbytes) +{ + int qw_size = nbytes / 8; + int i; + + if (((unsigned long)data & 0x7) == 0) { + /* aligned */ + u64 *ptr = (u64 *)data; + + for (i = 0; i < qw_size; i++, ptr++) + write_csr(dd, what + (8 * i), *ptr); + } else { + /* not aligned */ + for (i = 0; i < qw_size; i++, data += 8) { + u64 value; + + memcpy(&value, data, 8); + write_csr(dd, what + (8 * i), value); + } + } +} + +/* + * Write a block of data to a given CSR as a stream of writes. All calls will + * be in multiples of 8 bytes. + */ +static void write_streamed_rsa_data(struct hfi1_devdata *dd, int what, + const u8 *data, int nbytes) +{ + u64 *ptr = (u64 *)data; + int qw_size = nbytes / 8; + + for (; qw_size > 0; qw_size--, ptr++) + write_csr(dd, what, *ptr); +} + +/* + * Download the signature and start the RSA mechanism. Wait for + * RSA_ENGINE_TIMEOUT before giving up. + */ +static int run_rsa(struct hfi1_devdata *dd, const char *who, + const u8 *signature) +{ + unsigned long timeout; + u64 reg; + u32 status; + int ret = 0; + + /* write the signature */ + write_rsa_data(dd, MISC_CFG_RSA_SIGNATURE, signature, KEY_SIZE); + + /* initialize RSA */ + write_csr(dd, MISC_CFG_RSA_CMD, RSA_CMD_INIT); + + /* + * Make sure the engine is idle and insert a delay between the two + * writes to MISC_CFG_RSA_CMD. + */ + status = (read_csr(dd, MISC_CFG_FW_CTRL) + & MISC_CFG_FW_CTRL_RSA_STATUS_SMASK) + >> MISC_CFG_FW_CTRL_RSA_STATUS_SHIFT; + if (status != RSA_STATUS_IDLE) { + dd_dev_err(dd, "%s security engine not idle - giving up\n", + who); + return -EBUSY; + } + + /* start RSA */ + write_csr(dd, MISC_CFG_RSA_CMD, RSA_CMD_START); + + /* + * Look for the result. + * + * The RSA engine is hooked up to two MISC errors. The driver + * masks these errors as they do not respond to the standard + * error "clear down" mechanism. Look for these errors here and + * clear them when possible. This routine will exit with the + * errors of the current run still set. + * + * MISC_FW_AUTH_FAILED_ERR + * Firmware authorization failed. This can be cleared by + * re-initializing the RSA engine, then clearing the status bit. + * Do not re-init the RSA angine immediately after a successful + * run - this will reset the current authorization. + * + * MISC_KEY_MISMATCH_ERR + * Key does not match. The only way to clear this is to load + * a matching key then clear the status bit. If this error + * is raised, it will persist outside of this routine until a + * matching key is loaded. + */ + timeout = msecs_to_jiffies(RSA_ENGINE_TIMEOUT) + jiffies; + while (1) { + status = (read_csr(dd, MISC_CFG_FW_CTRL) + & MISC_CFG_FW_CTRL_RSA_STATUS_SMASK) + >> MISC_CFG_FW_CTRL_RSA_STATUS_SHIFT; + + if (status == RSA_STATUS_IDLE) { + /* should not happen */ + dd_dev_err(dd, "%s firmware security bad idle state\n", + who); + ret = -EINVAL; + break; + } else if (status == RSA_STATUS_DONE) { + /* finished successfully */ + break; + } else if (status == RSA_STATUS_FAILED) { + /* finished unsuccessfully */ + ret = -EINVAL; + break; + } + /* else still active */ + + if (time_after(jiffies, timeout)) { + /* + * Timed out while active. We can't reset the engine + * if it is stuck active, but run through the + * error code to see what error bits are set. + */ + dd_dev_err(dd, "%s firmware security time out\n", who); + ret = -ETIMEDOUT; + break; + } + + msleep(20); + } + + /* + * Arrive here on success or failure. Clear all RSA engine + * errors. All current errors will stick - the RSA logic is keeping + * error high. All previous errors will clear - the RSA logic + * is not keeping the error high. + */ + write_csr(dd, MISC_ERR_CLEAR, + MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK | + MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK); + /* + * All that is left are the current errors. Print warnings on + * authorization failure details, if any. Firmware authorization + * can be retried, so these are only warnings. + */ + reg = read_csr(dd, MISC_ERR_STATUS); + if (ret) { + if (reg & MISC_ERR_STATUS_MISC_FW_AUTH_FAILED_ERR_SMASK) + dd_dev_warn(dd, "%s firmware authorization failed\n", + who); + if (reg & MISC_ERR_STATUS_MISC_KEY_MISMATCH_ERR_SMASK) + dd_dev_warn(dd, "%s firmware key mismatch\n", who); + } + + return ret; +} + +static void load_security_variables(struct hfi1_devdata *dd, + struct firmware_details *fdet) +{ + /* Security variables a. Write the modulus */ + write_rsa_data(dd, MISC_CFG_RSA_MODULUS, fdet->modulus, KEY_SIZE); + /* Security variables b. Write the r2 */ + write_rsa_data(dd, MISC_CFG_RSA_R2, fdet->r2, KEY_SIZE); + /* Security variables c. Write the mu */ + write_rsa_data(dd, MISC_CFG_RSA_MU, fdet->mu, MU_SIZE); + /* Security variables d. Write the header */ + write_streamed_rsa_data(dd, MISC_CFG_SHA_PRELOAD, + (u8 *)fdet->css_header, + sizeof(struct css_header)); +} + +/* return the 8051 firmware state */ +static inline u32 get_firmware_state(struct hfi1_devdata *dd) +{ + u64 reg = read_csr(dd, DC_DC8051_STS_CUR_STATE); + + return (reg >> DC_DC8051_STS_CUR_STATE_FIRMWARE_SHIFT) + & DC_DC8051_STS_CUR_STATE_FIRMWARE_MASK; +} + +/* + * Wait until the firmware is up and ready to take host requests. + * Return 0 on success, -ETIMEDOUT on timeout. + */ +int wait_fm_ready(struct hfi1_devdata *dd, u32 mstimeout) +{ + unsigned long timeout; + + /* in the simulator, the fake 8051 is always ready */ + if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR) + return 0; + + timeout = msecs_to_jiffies(mstimeout) + jiffies; + while (1) { + if (get_firmware_state(dd) == 0xa0) /* ready */ + return 0; + if (time_after(jiffies, timeout)) /* timed out */ + return -ETIMEDOUT; + usleep_range(1950, 2050); /* sleep 2ms-ish */ + } +} + +/* + * Load the 8051 firmware. + */ +static int load_8051_firmware(struct hfi1_devdata *dd, + struct firmware_details *fdet) +{ + u64 reg; + int ret; + u8 ver_a, ver_b; + + /* + * DC Reset sequence + * Load DC 8051 firmware + */ + /* + * DC reset step 1: Reset DC8051 + */ + reg = DC_DC8051_CFG_RST_M8051W_SMASK + | DC_DC8051_CFG_RST_CRAM_SMASK + | DC_DC8051_CFG_RST_DRAM_SMASK + | DC_DC8051_CFG_RST_IRAM_SMASK + | DC_DC8051_CFG_RST_SFR_SMASK; + write_csr(dd, DC_DC8051_CFG_RST, reg); + + /* + * DC reset step 2 (optional): Load 8051 data memory with link + * configuration + */ + + /* + * DC reset step 3: Load DC8051 firmware + */ + /* release all but the core reset */ + reg = DC_DC8051_CFG_RST_M8051W_SMASK; + write_csr(dd, DC_DC8051_CFG_RST, reg); + + /* Firmware load step 1 */ + load_security_variables(dd, fdet); + + /* + * Firmware load step 2. Clear MISC_CFG_FW_CTRL.FW_8051_LOADED + */ + write_csr(dd, MISC_CFG_FW_CTRL, 0); + + /* Firmware load steps 3-5 */ + ret = write_8051(dd, 1/*code*/, 0, fdet->firmware_ptr, + fdet->firmware_len); + if (ret) + return ret; + + /* + * DC reset step 4. Host starts the DC8051 firmware + */ + /* + * Firmware load step 6. Set MISC_CFG_FW_CTRL.FW_8051_LOADED + */ + write_csr(dd, MISC_CFG_FW_CTRL, MISC_CFG_FW_CTRL_FW_8051_LOADED_SMASK); + + /* Firmware load steps 7-10 */ + ret = run_rsa(dd, "8051", fdet->signature); + if (ret) + return ret; + + /* clear all reset bits, releasing the 8051 */ + write_csr(dd, DC_DC8051_CFG_RST, 0ull); + + /* + * DC reset step 5. Wait for firmware to be ready to accept host + * requests. + */ + ret = wait_fm_ready(dd, TIMEOUT_8051_START); + if (ret) { /* timed out */ + dd_dev_err(dd, "8051 start timeout, current state 0x%x\n", + get_firmware_state(dd)); + return -ETIMEDOUT; + } + + read_misc_status(dd, &ver_a, &ver_b); + dd_dev_info(dd, "8051 firmware version %d.%d\n", + (int)ver_b, (int)ver_a); + dd->dc8051_ver = dc8051_ver(ver_b, ver_a); + + return 0; +} + +/* + * Write the SBus request register + * + * No need for masking - the arguments are sized exactly. + */ +void sbus_request(struct hfi1_devdata *dd, + u8 receiver_addr, u8 data_addr, u8 command, u32 data_in) +{ + write_csr(dd, ASIC_CFG_SBUS_REQUEST, + ((u64)data_in << ASIC_CFG_SBUS_REQUEST_DATA_IN_SHIFT) | + ((u64)command << ASIC_CFG_SBUS_REQUEST_COMMAND_SHIFT) | + ((u64)data_addr << ASIC_CFG_SBUS_REQUEST_DATA_ADDR_SHIFT) | + ((u64)receiver_addr << + ASIC_CFG_SBUS_REQUEST_RECEIVER_ADDR_SHIFT)); +} + +/* + * Turn off the SBus and fabric serdes spicos. + * + * + Must be called with Sbus fast mode turned on. + * + Must be called after fabric serdes broadcast is set up. + * + Must be called before the 8051 is loaded - assumes 8051 is not loaded + * when using MISC_CFG_FW_CTRL. + */ +static void turn_off_spicos(struct hfi1_devdata *dd, int flags) +{ + /* only needed on A0 */ + if (!is_ax(dd)) + return; + + dd_dev_info(dd, "Turning off spicos:%s%s\n", + flags & SPICO_SBUS ? " SBus" : "", + flags & SPICO_FABRIC ? " fabric" : ""); + + write_csr(dd, MISC_CFG_FW_CTRL, ENABLE_SPICO_SMASK); + /* disable SBus spico */ + if (flags & SPICO_SBUS) + sbus_request(dd, SBUS_MASTER_BROADCAST, 0x01, + WRITE_SBUS_RECEIVER, 0x00000040); + + /* disable the fabric serdes spicos */ + if (flags & SPICO_FABRIC) + sbus_request(dd, fabric_serdes_broadcast[dd->hfi1_id], + 0x07, WRITE_SBUS_RECEIVER, 0x00000000); + write_csr(dd, MISC_CFG_FW_CTRL, 0); +} + +/* + * Reset all of the fabric serdes for this HFI in preparation to take the + * link to Polling. + * + * To do a reset, we need to write to to the serdes registers. Unfortunately, + * the fabric serdes download to the other HFI on the ASIC will have turned + * off the firmware validation on this HFI. This means we can't write to the + * registers to reset the serdes. Work around this by performing a complete + * re-download and validation of the fabric serdes firmware. This, as a + * by-product, will reset the serdes. NOTE: the re-download requires that + * the 8051 be in the Offline state. I.e. not actively trying to use the + * serdes. This routine is called at the point where the link is Offline and + * is getting ready to go to Polling. + */ +void fabric_serdes_reset(struct hfi1_devdata *dd) +{ + int ret; + + if (!fw_fabric_serdes_load) + return; + + ret = acquire_chip_resource(dd, CR_SBUS, SBUS_TIMEOUT); + if (ret) { + dd_dev_err(dd, + "Cannot acquire SBus resource to reset fabric SerDes - perhaps you should reboot\n"); + return; + } + set_sbus_fast_mode(dd); + + if (is_ax(dd)) { + /* A0 serdes do not work with a re-download */ + u8 ra = fabric_serdes_broadcast[dd->hfi1_id]; + + /* place SerDes in reset and disable SPICO */ + sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000011); + /* wait 100 refclk cycles @ 156.25MHz => 640ns */ + udelay(1); + /* remove SerDes reset */ + sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000010); + /* turn SPICO enable on */ + sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000002); + } else { + turn_off_spicos(dd, SPICO_FABRIC); + /* + * No need for firmware retry - what to download has already + * been decided. + * No need to pay attention to the load return - the only + * failure is a validation failure, which has already been + * checked by the initial download. + */ + (void)load_fabric_serdes_firmware(dd, &fw_fabric); + } + + clear_sbus_fast_mode(dd); + release_chip_resource(dd, CR_SBUS); +} + +/* Access to the SBus in this routine should probably be serialized */ +int sbus_request_slow(struct hfi1_devdata *dd, + u8 receiver_addr, u8 data_addr, u8 command, u32 data_in) +{ + u64 reg, count = 0; + + /* make sure fast mode is clear */ + clear_sbus_fast_mode(dd); + + sbus_request(dd, receiver_addr, data_addr, command, data_in); + write_csr(dd, ASIC_CFG_SBUS_EXECUTE, + ASIC_CFG_SBUS_EXECUTE_EXECUTE_SMASK); + /* Wait for both DONE and RCV_DATA_VALID to go high */ + reg = read_csr(dd, ASIC_STS_SBUS_RESULT); + while (!((reg & ASIC_STS_SBUS_RESULT_DONE_SMASK) && + (reg & ASIC_STS_SBUS_RESULT_RCV_DATA_VALID_SMASK))) { + if (count++ >= SBUS_MAX_POLL_COUNT) { + u64 counts = read_csr(dd, ASIC_STS_SBUS_COUNTERS); + /* + * If the loop has timed out, we are OK if DONE bit + * is set and RCV_DATA_VALID and EXECUTE counters + * are the same. If not, we cannot proceed. + */ + if ((reg & ASIC_STS_SBUS_RESULT_DONE_SMASK) && + (SBUS_COUNTER(counts, RCV_DATA_VALID) == + SBUS_COUNTER(counts, EXECUTE))) + break; + return -ETIMEDOUT; + } + udelay(1); + reg = read_csr(dd, ASIC_STS_SBUS_RESULT); + } + count = 0; + write_csr(dd, ASIC_CFG_SBUS_EXECUTE, 0); + /* Wait for DONE to clear after EXECUTE is cleared */ + reg = read_csr(dd, ASIC_STS_SBUS_RESULT); + while (reg & ASIC_STS_SBUS_RESULT_DONE_SMASK) { + if (count++ >= SBUS_MAX_POLL_COUNT) + return -ETIME; + udelay(1); + reg = read_csr(dd, ASIC_STS_SBUS_RESULT); + } + return 0; +} + +static int load_fabric_serdes_firmware(struct hfi1_devdata *dd, + struct firmware_details *fdet) +{ + int i, err; + const u8 ra = fabric_serdes_broadcast[dd->hfi1_id]; /* receiver addr */ + + dd_dev_info(dd, "Downloading fabric firmware\n"); + + /* step 1: load security variables */ + load_security_variables(dd, fdet); + /* step 2: place SerDes in reset and disable SPICO */ + sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000011); + /* wait 100 refclk cycles @ 156.25MHz => 640ns */ + udelay(1); + /* step 3: remove SerDes reset */ + sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000010); + /* step 4: assert IMEM override */ + sbus_request(dd, ra, 0x00, WRITE_SBUS_RECEIVER, 0x40000000); + /* step 5: download SerDes machine code */ + for (i = 0; i < fdet->firmware_len; i += 4) { + sbus_request(dd, ra, 0x0a, WRITE_SBUS_RECEIVER, + *(u32 *)&fdet->firmware_ptr[i]); + } + /* step 6: IMEM override off */ + sbus_request(dd, ra, 0x00, WRITE_SBUS_RECEIVER, 0x00000000); + /* step 7: turn ECC on */ + sbus_request(dd, ra, 0x0b, WRITE_SBUS_RECEIVER, 0x000c0000); + + /* steps 8-11: run the RSA engine */ + err = run_rsa(dd, "fabric serdes", fdet->signature); + if (err) + return err; + + /* step 12: turn SPICO enable on */ + sbus_request(dd, ra, 0x07, WRITE_SBUS_RECEIVER, 0x00000002); + /* step 13: enable core hardware interrupts */ + sbus_request(dd, ra, 0x08, WRITE_SBUS_RECEIVER, 0x00000000); + + return 0; +} + +static int load_sbus_firmware(struct hfi1_devdata *dd, + struct firmware_details *fdet) +{ + int i, err; + const u8 ra = SBUS_MASTER_BROADCAST; /* receiver address */ + + dd_dev_info(dd, "Downloading SBus firmware\n"); + + /* step 1: load security variables */ + load_security_variables(dd, fdet); + /* step 2: place SPICO into reset and enable off */ + sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x000000c0); + /* step 3: remove reset, enable off, IMEM_CNTRL_EN on */ + sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000240); + /* step 4: set starting IMEM address for burst download */ + sbus_request(dd, ra, 0x03, WRITE_SBUS_RECEIVER, 0x80000000); + /* step 5: download the SBus Master machine code */ + for (i = 0; i < fdet->firmware_len; i += 4) { + sbus_request(dd, ra, 0x14, WRITE_SBUS_RECEIVER, + *(u32 *)&fdet->firmware_ptr[i]); + } + /* step 6: set IMEM_CNTL_EN off */ + sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000040); + /* step 7: turn ECC on */ + sbus_request(dd, ra, 0x16, WRITE_SBUS_RECEIVER, 0x000c0000); + + /* steps 8-11: run the RSA engine */ + err = run_rsa(dd, "SBus", fdet->signature); + if (err) + return err; + + /* step 12: set SPICO_ENABLE on */ + sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000140); + + return 0; +} + +static int load_pcie_serdes_firmware(struct hfi1_devdata *dd, + struct firmware_details *fdet) +{ + int i; + const u8 ra = SBUS_MASTER_BROADCAST; /* receiver address */ + + dd_dev_info(dd, "Downloading PCIe firmware\n"); + + /* step 1: load security variables */ + load_security_variables(dd, fdet); + /* step 2: assert single step (halts the SBus Master spico) */ + sbus_request(dd, ra, 0x05, WRITE_SBUS_RECEIVER, 0x00000001); + /* step 3: enable XDMEM access */ + sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000d40); + /* step 4: load firmware into SBus Master XDMEM */ + /* + * NOTE: the dmem address, write_en, and wdata are all pre-packed, + * we only need to pick up the bytes and write them + */ + for (i = 0; i < fdet->firmware_len; i += 4) { + sbus_request(dd, ra, 0x04, WRITE_SBUS_RECEIVER, + *(u32 *)&fdet->firmware_ptr[i]); + } + /* step 5: disable XDMEM access */ + sbus_request(dd, ra, 0x01, WRITE_SBUS_RECEIVER, 0x00000140); + /* step 6: allow SBus Spico to run */ + sbus_request(dd, ra, 0x05, WRITE_SBUS_RECEIVER, 0x00000000); + + /* + * steps 7-11: run RSA, if it succeeds, firmware is available to + * be swapped + */ + return run_rsa(dd, "PCIe serdes", fdet->signature); +} + +/* + * Set the given broadcast values on the given list of devices. + */ +static void set_serdes_broadcast(struct hfi1_devdata *dd, u8 bg1, u8 bg2, + const u8 *addrs, int count) +{ + while (--count >= 0) { + /* + * Set BROADCAST_GROUP_1 and BROADCAST_GROUP_2, leave + * defaults for everything else. Do not read-modify-write, + * per instruction from the manufacturer. + * + * Register 0xfd: + * bits what + * ----- --------------------------------- + * 0 IGNORE_BROADCAST (default 0) + * 11:4 BROADCAST_GROUP_1 (default 0xff) + * 23:16 BROADCAST_GROUP_2 (default 0xff) + */ + sbus_request(dd, addrs[count], 0xfd, WRITE_SBUS_RECEIVER, + (u32)bg1 << 4 | (u32)bg2 << 16); + } +} + +int acquire_hw_mutex(struct hfi1_devdata *dd) +{ + unsigned long timeout; + int try = 0; + u8 mask = 1 << dd->hfi1_id; + u8 user; + +retry: + timeout = msecs_to_jiffies(HM_TIMEOUT) + jiffies; + while (1) { + write_csr(dd, ASIC_CFG_MUTEX, mask); + user = (u8)read_csr(dd, ASIC_CFG_MUTEX); + if (user == mask) + return 0; /* success */ + if (time_after(jiffies, timeout)) + break; /* timed out */ + msleep(20); + } + + /* timed out */ + dd_dev_err(dd, + "Unable to acquire hardware mutex, mutex mask %u, my mask %u (%s)\n", + (u32)user, (u32)mask, (try == 0) ? "retrying" : "giving up"); + + if (try == 0) { + /* break mutex and retry */ + write_csr(dd, ASIC_CFG_MUTEX, 0); + try++; + goto retry; + } + + return -EBUSY; +} + +void release_hw_mutex(struct hfi1_devdata *dd) +{ + write_csr(dd, ASIC_CFG_MUTEX, 0); +} + +/* return the given resource bit(s) as a mask for the given HFI */ +static inline u64 resource_mask(u32 hfi1_id, u32 resource) +{ + return ((u64)resource) << (hfi1_id ? CR_DYN_SHIFT : 0); +} + +static void fail_mutex_acquire_message(struct hfi1_devdata *dd, + const char *func) +{ + dd_dev_err(dd, + "%s: hardware mutex stuck - suggest rebooting the machine\n", + func); +} + +/* + * Acquire access to a chip resource. + * + * Return 0 on success, -EBUSY if resource busy, -EIO if mutex acquire failed. + */ +static int __acquire_chip_resource(struct hfi1_devdata *dd, u32 resource) +{ + u64 scratch0, all_bits, my_bit; + int ret; + + if (resource & CR_DYN_MASK) { + /* a dynamic resource is in use if either HFI has set the bit */ + if (dd->pcidev->device == PCI_DEVICE_ID_INTEL0 && + (resource & (CR_I2C1 | CR_I2C2))) { + /* discrete devices must serialize across both chains */ + all_bits = resource_mask(0, CR_I2C1 | CR_I2C2) | + resource_mask(1, CR_I2C1 | CR_I2C2); + } else { + all_bits = resource_mask(0, resource) | + resource_mask(1, resource); + } + my_bit = resource_mask(dd->hfi1_id, resource); + } else { + /* non-dynamic resources are not split between HFIs */ + all_bits = resource; + my_bit = resource; + } + + /* lock against other callers within the driver wanting a resource */ + mutex_lock(&dd->asic_data->asic_resource_mutex); + + ret = acquire_hw_mutex(dd); + if (ret) { + fail_mutex_acquire_message(dd, __func__); + ret = -EIO; + goto done; + } + + scratch0 = read_csr(dd, ASIC_CFG_SCRATCH); + if (scratch0 & all_bits) { + ret = -EBUSY; + } else { + write_csr(dd, ASIC_CFG_SCRATCH, scratch0 | my_bit); + /* force write to be visible to other HFI on another OS */ + (void)read_csr(dd, ASIC_CFG_SCRATCH); + } + + release_hw_mutex(dd); + +done: + mutex_unlock(&dd->asic_data->asic_resource_mutex); + return ret; +} + +/* + * Acquire access to a chip resource, wait up to mswait milliseconds for + * the resource to become available. + * + * Return 0 on success, -EBUSY if busy (even after wait), -EIO if mutex + * acquire failed. + */ +int acquire_chip_resource(struct hfi1_devdata *dd, u32 resource, u32 mswait) +{ + unsigned long timeout; + int ret; + + timeout = jiffies + msecs_to_jiffies(mswait); + while (1) { + ret = __acquire_chip_resource(dd, resource); + if (ret != -EBUSY) + return ret; + /* resource is busy, check our timeout */ + if (time_after_eq(jiffies, timeout)) + return -EBUSY; + usleep_range(80, 120); /* arbitrary delay */ + } +} + +/* + * Release access to a chip resource + */ +void release_chip_resource(struct hfi1_devdata *dd, u32 resource) +{ + u64 scratch0, bit; + + /* only dynamic resources should ever be cleared */ + if (!(resource & CR_DYN_MASK)) { + dd_dev_err(dd, "%s: invalid resource 0x%x\n", __func__, + resource); + return; + } + bit = resource_mask(dd->hfi1_id, resource); + + /* lock against other callers within the driver wanting a resource */ + mutex_lock(&dd->asic_data->asic_resource_mutex); + + if (acquire_hw_mutex(dd)) { + fail_mutex_acquire_message(dd, __func__); + goto done; + } + + scratch0 = read_csr(dd, ASIC_CFG_SCRATCH); + if ((scratch0 & bit) != 0) { + scratch0 &= ~bit; + write_csr(dd, ASIC_CFG_SCRATCH, scratch0); + /* force write to be visible to other HFI on another OS */ + (void)read_csr(dd, ASIC_CFG_SCRATCH); + } else { + dd_dev_warn(dd, "%s: id %d, resource 0x%x: bit not set\n", + __func__, dd->hfi1_id, resource); + } + + release_hw_mutex(dd); + +done: + mutex_unlock(&dd->asic_data->asic_resource_mutex); +} + +/* + * Return true if resource is set, false otherwise. Print a warning + * if not set and a function is supplied. + */ +bool check_chip_resource(struct hfi1_devdata *dd, u32 resource, + const char *func) +{ + u64 scratch0, bit; + + if (resource & CR_DYN_MASK) + bit = resource_mask(dd->hfi1_id, resource); + else + bit = resource; + + scratch0 = read_csr(dd, ASIC_CFG_SCRATCH); + if ((scratch0 & bit) == 0) { + if (func) + dd_dev_warn(dd, + "%s: id %d, resource 0x%x, not acquired!\n", + func, dd->hfi1_id, resource); + return false; + } + return true; +} + +static void clear_chip_resources(struct hfi1_devdata *dd, const char *func) +{ + u64 scratch0; + + /* lock against other callers within the driver wanting a resource */ + mutex_lock(&dd->asic_data->asic_resource_mutex); + + if (acquire_hw_mutex(dd)) { + fail_mutex_acquire_message(dd, func); + goto done; + } + + /* clear all dynamic access bits for this HFI */ + scratch0 = read_csr(dd, ASIC_CFG_SCRATCH); + scratch0 &= ~resource_mask(dd->hfi1_id, CR_DYN_MASK); + write_csr(dd, ASIC_CFG_SCRATCH, scratch0); + /* force write to be visible to other HFI on another OS */ + (void)read_csr(dd, ASIC_CFG_SCRATCH); + + release_hw_mutex(dd); + +done: + mutex_unlock(&dd->asic_data->asic_resource_mutex); +} + +void init_chip_resources(struct hfi1_devdata *dd) +{ + /* clear any holds left by us */ + clear_chip_resources(dd, __func__); +} + +void finish_chip_resources(struct hfi1_devdata *dd) +{ + /* clear any holds left by us */ + clear_chip_resources(dd, __func__); +} + +void set_sbus_fast_mode(struct hfi1_devdata *dd) +{ + write_csr(dd, ASIC_CFG_SBUS_EXECUTE, + ASIC_CFG_SBUS_EXECUTE_FAST_MODE_SMASK); +} + +void clear_sbus_fast_mode(struct hfi1_devdata *dd) +{ + u64 reg, count = 0; + + reg = read_csr(dd, ASIC_STS_SBUS_COUNTERS); + while (SBUS_COUNTER(reg, EXECUTE) != + SBUS_COUNTER(reg, RCV_DATA_VALID)) { + if (count++ >= SBUS_MAX_POLL_COUNT) + break; + udelay(1); + reg = read_csr(dd, ASIC_STS_SBUS_COUNTERS); + } + write_csr(dd, ASIC_CFG_SBUS_EXECUTE, 0); +} + +int load_firmware(struct hfi1_devdata *dd) +{ + int ret; + + if (fw_fabric_serdes_load) { + ret = acquire_chip_resource(dd, CR_SBUS, SBUS_TIMEOUT); + if (ret) + return ret; + + set_sbus_fast_mode(dd); + + set_serdes_broadcast(dd, all_fabric_serdes_broadcast, + fabric_serdes_broadcast[dd->hfi1_id], + fabric_serdes_addrs[dd->hfi1_id], + NUM_FABRIC_SERDES); + turn_off_spicos(dd, SPICO_FABRIC); + do { + ret = load_fabric_serdes_firmware(dd, &fw_fabric); + } while (retry_firmware(dd, ret)); + + clear_sbus_fast_mode(dd); + release_chip_resource(dd, CR_SBUS); + if (ret) + return ret; + } + + if (fw_8051_load) { + do { + ret = load_8051_firmware(dd, &fw_8051); + } while (retry_firmware(dd, ret)); + if (ret) + return ret; + } + + return 0; +} + +int hfi1_firmware_init(struct hfi1_devdata *dd) +{ + /* only RTL can use these */ + if (dd->icode != ICODE_RTL_SILICON) { + fw_fabric_serdes_load = 0; + fw_pcie_serdes_load = 0; + fw_sbus_load = 0; + } + + /* no 8051 or QSFP on simulator */ + if (dd->icode == ICODE_FUNCTIONAL_SIMULATOR) { + fw_8051_load = 0; + platform_config_load = 0; + } + + if (!fw_8051_name) { + if (dd->icode == ICODE_RTL_SILICON) + fw_8051_name = DEFAULT_FW_8051_NAME_ASIC; + else + fw_8051_name = DEFAULT_FW_8051_NAME_FPGA; + } + if (!fw_fabric_serdes_name) + fw_fabric_serdes_name = DEFAULT_FW_FABRIC_NAME; + if (!fw_sbus_name) + fw_sbus_name = DEFAULT_FW_SBUS_NAME; + if (!fw_pcie_serdes_name) + fw_pcie_serdes_name = DEFAULT_FW_PCIE_NAME; + if (!platform_config_name) + platform_config_name = DEFAULT_PLATFORM_CONFIG_NAME; + + return obtain_firmware(dd); +} + +/* + * This function is a helper function for parse_platform_config(...) and + * does not check for validity of the platform configuration cache + * (because we know it is invalid as we are building up the cache). + * As such, this should not be called from anywhere other than + * parse_platform_config + */ +static int check_meta_version(struct hfi1_devdata *dd, u32 *system_table) +{ + u32 meta_ver, meta_ver_meta, ver_start, ver_len, mask; + struct platform_config_cache *pcfgcache = &dd->pcfg_cache; + + if (!system_table) + return -EINVAL; + + meta_ver_meta = + *(pcfgcache->config_tables[PLATFORM_CONFIG_SYSTEM_TABLE].table_metadata + + SYSTEM_TABLE_META_VERSION); + + mask = ((1 << METADATA_TABLE_FIELD_START_LEN_BITS) - 1); + ver_start = meta_ver_meta & mask; + + meta_ver_meta >>= METADATA_TABLE_FIELD_LEN_SHIFT; + + mask = ((1 << METADATA_TABLE_FIELD_LEN_LEN_BITS) - 1); + ver_len = meta_ver_meta & mask; + + ver_start /= 8; + meta_ver = *((u8 *)system_table + ver_start) & ((1 << ver_len) - 1); + + if (meta_ver < 5) { + dd_dev_info( + dd, "%s:Please update platform config\n", __func__); + return -EINVAL; + } + return 0; +} + +int parse_platform_config(struct hfi1_devdata *dd) +{ + struct platform_config_cache *pcfgcache = &dd->pcfg_cache; + u32 *ptr = NULL; + u32 header1 = 0, header2 = 0, magic_num = 0, crc = 0, file_length = 0; + u32 record_idx = 0, table_type = 0, table_length_dwords = 0; + int ret = -EINVAL; /* assume failure */ + + if (!dd->platform_config.data) { + dd_dev_info(dd, "%s: Missing config file\n", __func__); + goto bail; + } + ptr = (u32 *)dd->platform_config.data; + + magic_num = *ptr; + ptr++; + if (magic_num != PLATFORM_CONFIG_MAGIC_NUM) { + dd_dev_info(dd, "%s: Bad config file\n", __func__); + goto bail; + } + + /* Field is file size in DWORDs */ + file_length = (*ptr) * 4; + ptr++; + + if (file_length > dd->platform_config.size) { + dd_dev_info(dd, "%s:File claims to be larger than read size\n", + __func__); + goto bail; + } else if (file_length < dd->platform_config.size) { + dd_dev_info(dd, + "%s:File claims to be smaller than read size, continuing\n", + __func__); + } + /* exactly equal, perfection */ + + /* + * In both cases where we proceed, using the self-reported file length + * is the safer option + */ + while (ptr < (u32 *)(dd->platform_config.data + file_length)) { + header1 = *ptr; + header2 = *(ptr + 1); + if (header1 != ~header2) { + dd_dev_info(dd, "%s: Failed validation at offset %ld\n", + __func__, (ptr - (u32 *) + dd->platform_config.data)); + goto bail; + } + + record_idx = *ptr & + ((1 << PLATFORM_CONFIG_HEADER_RECORD_IDX_LEN_BITS) - 1); + + table_length_dwords = (*ptr >> + PLATFORM_CONFIG_HEADER_TABLE_LENGTH_SHIFT) & + ((1 << PLATFORM_CONFIG_HEADER_TABLE_LENGTH_LEN_BITS) - 1); + + table_type = (*ptr >> PLATFORM_CONFIG_HEADER_TABLE_TYPE_SHIFT) & + ((1 << PLATFORM_CONFIG_HEADER_TABLE_TYPE_LEN_BITS) - 1); + + /* Done with this set of headers */ + ptr += 2; + + if (record_idx) { + /* data table */ + switch (table_type) { + case PLATFORM_CONFIG_SYSTEM_TABLE: + pcfgcache->config_tables[table_type].num_table = + 1; + ret = check_meta_version(dd, ptr); + if (ret) + goto bail; + break; + case PLATFORM_CONFIG_PORT_TABLE: + pcfgcache->config_tables[table_type].num_table = + 2; + break; + case PLATFORM_CONFIG_RX_PRESET_TABLE: + /* fall through */ + case PLATFORM_CONFIG_TX_PRESET_TABLE: + /* fall through */ + case PLATFORM_CONFIG_QSFP_ATTEN_TABLE: + /* fall through */ + case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE: + pcfgcache->config_tables[table_type].num_table = + table_length_dwords; + break; + default: + dd_dev_info(dd, + "%s: Unknown data table %d, offset %ld\n", + __func__, table_type, + (ptr - (u32 *) + dd->platform_config.data)); + goto bail; /* We don't trust this file now */ + } + pcfgcache->config_tables[table_type].table = ptr; + } else { + /* metadata table */ + switch (table_type) { + case PLATFORM_CONFIG_SYSTEM_TABLE: + /* fall through */ + case PLATFORM_CONFIG_PORT_TABLE: + /* fall through */ + case PLATFORM_CONFIG_RX_PRESET_TABLE: + /* fall through */ + case PLATFORM_CONFIG_TX_PRESET_TABLE: + /* fall through */ + case PLATFORM_CONFIG_QSFP_ATTEN_TABLE: + /* fall through */ + case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE: + break; + default: + dd_dev_info(dd, + "%s: Unknown meta table %d, offset %ld\n", + __func__, table_type, + (ptr - + (u32 *)dd->platform_config.data)); + goto bail; /* We don't trust this file now */ + } + pcfgcache->config_tables[table_type].table_metadata = + ptr; + } + + /* Calculate and check table crc */ + crc = crc32_le(~(u32)0, (unsigned char const *)ptr, + (table_length_dwords * 4)); + crc ^= ~(u32)0; + + /* Jump the table */ + ptr += table_length_dwords; + if (crc != *ptr) { + dd_dev_info(dd, "%s: Failed CRC check at offset %ld\n", + __func__, (ptr - + (u32 *) + dd->platform_config.data)); + goto bail; + } + /* Jump the CRC DWORD */ + ptr++; + } + + pcfgcache->cache_valid = 1; + return 0; +bail: + memset(pcfgcache, 0, sizeof(struct platform_config_cache)); + return ret; +} + +static int get_platform_fw_field_metadata(struct hfi1_devdata *dd, int table, + int field, u32 *field_len_bits, + u32 *field_start_bits) +{ + struct platform_config_cache *pcfgcache = &dd->pcfg_cache; + u32 *src_ptr = NULL; + + if (!pcfgcache->cache_valid) + return -EINVAL; + + switch (table) { + case PLATFORM_CONFIG_SYSTEM_TABLE: + /* fall through */ + case PLATFORM_CONFIG_PORT_TABLE: + /* fall through */ + case PLATFORM_CONFIG_RX_PRESET_TABLE: + /* fall through */ + case PLATFORM_CONFIG_TX_PRESET_TABLE: + /* fall through */ + case PLATFORM_CONFIG_QSFP_ATTEN_TABLE: + /* fall through */ + case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE: + if (field && field < platform_config_table_limits[table]) + src_ptr = + pcfgcache->config_tables[table].table_metadata + field; + break; + default: + dd_dev_info(dd, "%s: Unknown table\n", __func__); + break; + } + + if (!src_ptr) + return -EINVAL; + + if (field_start_bits) + *field_start_bits = *src_ptr & + ((1 << METADATA_TABLE_FIELD_START_LEN_BITS) - 1); + + if (field_len_bits) + *field_len_bits = (*src_ptr >> METADATA_TABLE_FIELD_LEN_SHIFT) + & ((1 << METADATA_TABLE_FIELD_LEN_LEN_BITS) - 1); + + return 0; +} + +/* This is the central interface to getting data out of the platform config + * file. It depends on parse_platform_config() having populated the + * platform_config_cache in hfi1_devdata, and checks the cache_valid member to + * validate the sanity of the cache. + * + * The non-obvious parameters: + * @table_index: Acts as a look up key into which instance of the tables the + * relevant field is fetched from. + * + * This applies to the data tables that have multiple instances. The port table + * is an exception to this rule as each HFI only has one port and thus the + * relevant table can be distinguished by hfi_id. + * + * @data: pointer to memory that will be populated with the field requested. + * @len: length of memory pointed by @data in bytes. + */ +int get_platform_config_field(struct hfi1_devdata *dd, + enum platform_config_table_type_encoding + table_type, int table_index, int field_index, + u32 *data, u32 len) +{ + int ret = 0, wlen = 0, seek = 0; + u32 field_len_bits = 0, field_start_bits = 0, *src_ptr = NULL; + struct platform_config_cache *pcfgcache = &dd->pcfg_cache; + + if (data) + memset(data, 0, len); + else + return -EINVAL; + + ret = get_platform_fw_field_metadata(dd, table_type, field_index, + &field_len_bits, + &field_start_bits); + if (ret) + return -EINVAL; + + /* Convert length to bits */ + len *= 8; + + /* Our metadata function checked cache_valid and field_index for us */ + switch (table_type) { + case PLATFORM_CONFIG_SYSTEM_TABLE: + src_ptr = pcfgcache->config_tables[table_type].table; + + if (field_index != SYSTEM_TABLE_QSFP_POWER_CLASS_MAX) { + if (len < field_len_bits) + return -EINVAL; + + seek = field_start_bits / 8; + wlen = field_len_bits / 8; + + src_ptr = (u32 *)((u8 *)src_ptr + seek); + + /* + * We expect the field to be byte aligned and whole byte + * lengths if we are here + */ + memcpy(data, src_ptr, wlen); + return 0; + } + break; + case PLATFORM_CONFIG_PORT_TABLE: + /* Port table is 4 DWORDS */ + src_ptr = dd->hfi1_id ? + pcfgcache->config_tables[table_type].table + 4 : + pcfgcache->config_tables[table_type].table; + break; + case PLATFORM_CONFIG_RX_PRESET_TABLE: + /* fall through */ + case PLATFORM_CONFIG_TX_PRESET_TABLE: + /* fall through */ + case PLATFORM_CONFIG_QSFP_ATTEN_TABLE: + /* fall through */ + case PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE: + src_ptr = pcfgcache->config_tables[table_type].table; + + if (table_index < + pcfgcache->config_tables[table_type].num_table) + src_ptr += table_index; + else + src_ptr = NULL; + break; + default: + dd_dev_info(dd, "%s: Unknown table\n", __func__); + break; + } + + if (!src_ptr || len < field_len_bits) + return -EINVAL; + + src_ptr += (field_start_bits / 32); + *data = (*src_ptr >> (field_start_bits % 32)) & + ((1 << field_len_bits) - 1); + + return 0; +} + +/* + * Download the firmware needed for the Gen3 PCIe SerDes. An update + * to the SBus firmware is needed before updating the PCIe firmware. + * + * Note: caller must be holding the SBus resource. + */ +int load_pcie_firmware(struct hfi1_devdata *dd) +{ + int ret = 0; + + /* both firmware loads below use the SBus */ + set_sbus_fast_mode(dd); + + if (fw_sbus_load) { + turn_off_spicos(dd, SPICO_SBUS); + do { + ret = load_sbus_firmware(dd, &fw_sbus); + } while (retry_firmware(dd, ret)); + if (ret) + goto done; + } + + if (fw_pcie_serdes_load) { + dd_dev_info(dd, "Setting PCIe SerDes broadcast\n"); + set_serdes_broadcast(dd, all_pcie_serdes_broadcast, + pcie_serdes_broadcast[dd->hfi1_id], + pcie_serdes_addrs[dd->hfi1_id], + NUM_PCIE_SERDES); + do { + ret = load_pcie_serdes_firmware(dd, &fw_pcie); + } while (retry_firmware(dd, ret)); + if (ret) + goto done; + } + +done: + clear_sbus_fast_mode(dd); + + return ret; +} + +/* + * Read the GUID from the hardware, store it in dd. + */ +void read_guid(struct hfi1_devdata *dd) +{ + /* Take the DC out of reset to get a valid GUID value */ + write_csr(dd, CCE_DC_CTRL, 0); + (void)read_csr(dd, CCE_DC_CTRL); + + dd->base_guid = read_csr(dd, DC_DC8051_CFG_LOCAL_GUID); + dd_dev_info(dd, "GUID %llx", + (unsigned long long)dd->base_guid); +} diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h new file mode 100644 index 000000000..4417a0fd3 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -0,0 +1,1950 @@ +#ifndef _HFI1_KERNEL_H +#define _HFI1_KERNEL_H +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <linux/interrupt.h> +#include <linux/pci.h> +#include <linux/dma-mapping.h> +#include <linux/mutex.h> +#include <linux/list.h> +#include <linux/scatterlist.h> +#include <linux/slab.h> +#include <linux/io.h> +#include <linux/fs.h> +#include <linux/completion.h> +#include <linux/kref.h> +#include <linux/sched.h> +#include <linux/cdev.h> +#include <linux/delay.h> +#include <linux/kthread.h> +#include <rdma/rdma_vt.h> + +#include "chip_registers.h" +#include "common.h" +#include "verbs.h" +#include "pio.h" +#include "chip.h" +#include "mad.h" +#include "qsfp.h" +#include "platform.h" +#include "affinity.h" + +/* bumped 1 from s/w major version of TrueScale */ +#define HFI1_CHIP_VERS_MAJ 3U + +/* don't care about this except printing */ +#define HFI1_CHIP_VERS_MIN 0U + +/* The Organization Unique Identifier (Mfg code), and its position in GUID */ +#define HFI1_OUI 0x001175 +#define HFI1_OUI_LSB 40 + +#define DROP_PACKET_OFF 0 +#define DROP_PACKET_ON 1 + +extern unsigned long hfi1_cap_mask; +#define HFI1_CAP_KGET_MASK(mask, cap) ((mask) & HFI1_CAP_##cap) +#define HFI1_CAP_UGET_MASK(mask, cap) \ + (((mask) >> HFI1_CAP_USER_SHIFT) & HFI1_CAP_##cap) +#define HFI1_CAP_KGET(cap) (HFI1_CAP_KGET_MASK(hfi1_cap_mask, cap)) +#define HFI1_CAP_UGET(cap) (HFI1_CAP_UGET_MASK(hfi1_cap_mask, cap)) +#define HFI1_CAP_IS_KSET(cap) (!!HFI1_CAP_KGET(cap)) +#define HFI1_CAP_IS_USET(cap) (!!HFI1_CAP_UGET(cap)) +#define HFI1_MISC_GET() ((hfi1_cap_mask >> HFI1_CAP_MISC_SHIFT) & \ + HFI1_CAP_MISC_MASK) +/* Offline Disabled Reason is 4-bits */ +#define HFI1_ODR_MASK(rsn) ((rsn) & OPA_PI_MASK_OFFLINE_REASON) + +/* + * Control context is always 0 and handles the error packets. + * It also handles the VL15 and multicast packets. + */ +#define HFI1_CTRL_CTXT 0 + +/* + * Driver context will store software counters for each of the events + * associated with these status registers + */ +#define NUM_CCE_ERR_STATUS_COUNTERS 41 +#define NUM_RCV_ERR_STATUS_COUNTERS 64 +#define NUM_MISC_ERR_STATUS_COUNTERS 13 +#define NUM_SEND_PIO_ERR_STATUS_COUNTERS 36 +#define NUM_SEND_DMA_ERR_STATUS_COUNTERS 4 +#define NUM_SEND_EGRESS_ERR_STATUS_COUNTERS 64 +#define NUM_SEND_ERR_STATUS_COUNTERS 3 +#define NUM_SEND_CTXT_ERR_STATUS_COUNTERS 5 +#define NUM_SEND_DMA_ENG_ERR_STATUS_COUNTERS 24 + +/* + * per driver stats, either not device nor port-specific, or + * summed over all of the devices and ports. + * They are described by name via ipathfs filesystem, so layout + * and number of elements can change without breaking compatibility. + * If members are added or deleted hfi1_statnames[] in debugfs.c must + * change to match. + */ +struct hfi1_ib_stats { + __u64 sps_ints; /* number of interrupts handled */ + __u64 sps_errints; /* number of error interrupts */ + __u64 sps_txerrs; /* tx-related packet errors */ + __u64 sps_rcverrs; /* non-crc rcv packet errors */ + __u64 sps_hwerrs; /* hardware errors reported (parity, etc.) */ + __u64 sps_nopiobufs; /* no pio bufs avail from kernel */ + __u64 sps_ctxts; /* number of contexts currently open */ + __u64 sps_lenerrs; /* number of kernel packets where RHF != LRH len */ + __u64 sps_buffull; + __u64 sps_hdrfull; +}; + +extern struct hfi1_ib_stats hfi1_stats; +extern const struct pci_error_handlers hfi1_pci_err_handler; + +/* + * First-cut criterion for "device is active" is + * two thousand dwords combined Tx, Rx traffic per + * 5-second interval. SMA packets are 64 dwords, + * and occur "a few per second", presumably each way. + */ +#define HFI1_TRAFFIC_ACTIVE_THRESHOLD (2000) + +/* + * Below contains all data related to a single context (formerly called port). + */ + +#ifdef CONFIG_DEBUG_FS +struct hfi1_opcode_stats_perctx; +#endif + +struct ctxt_eager_bufs { + ssize_t size; /* total size of eager buffers */ + u32 count; /* size of buffers array */ + u32 numbufs; /* number of buffers allocated */ + u32 alloced; /* number of rcvarray entries used */ + u32 rcvtid_size; /* size of each eager rcv tid */ + u32 threshold; /* head update threshold */ + struct eager_buffer { + void *addr; + dma_addr_t phys; + ssize_t len; + } *buffers; + struct { + void *addr; + dma_addr_t phys; + } *rcvtids; +}; + +struct exp_tid_set { + struct list_head list; + u32 count; +}; + +struct hfi1_ctxtdata { + /* shadow the ctxt's RcvCtrl register */ + u64 rcvctrl; + /* rcvhdrq base, needs mmap before useful */ + void *rcvhdrq; + /* kernel virtual address where hdrqtail is updated */ + volatile __le64 *rcvhdrtail_kvaddr; + /* + * Shared page for kernel to signal user processes that send buffers + * need disarming. The process should call HFI1_CMD_DISARM_BUFS + * or HFI1_CMD_ACK_EVENT with IPATH_EVENT_DISARM_BUFS set. + */ + unsigned long *user_event_mask; + /* when waiting for rcv or pioavail */ + wait_queue_head_t wait; + /* rcvhdrq size (for freeing) */ + size_t rcvhdrq_size; + /* number of rcvhdrq entries */ + u16 rcvhdrq_cnt; + /* size of each of the rcvhdrq entries */ + u16 rcvhdrqentsize; + /* mmap of hdrq, must fit in 44 bits */ + dma_addr_t rcvhdrq_phys; + dma_addr_t rcvhdrqtailaddr_phys; + struct ctxt_eager_bufs egrbufs; + /* this receive context's assigned PIO ACK send context */ + struct send_context *sc; + + /* dynamic receive available interrupt timeout */ + u32 rcvavail_timeout; + /* + * number of opens (including slave sub-contexts) on this instance + * (ignoring forks, dup, etc. for now) + */ + int cnt; + /* + * how much space to leave at start of eager TID entries for + * protocol use, on each TID + */ + /* instead of calculating it */ + unsigned ctxt; + /* non-zero if ctxt is being shared. */ + u16 subctxt_cnt; + /* non-zero if ctxt is being shared. */ + u16 subctxt_id; + u8 uuid[16]; + /* job key */ + u16 jkey; + /* number of RcvArray groups for this context. */ + u32 rcv_array_groups; + /* index of first eager TID entry. */ + u32 eager_base; + /* number of expected TID entries */ + u32 expected_count; + /* index of first expected TID entry. */ + u32 expected_base; + + struct exp_tid_set tid_group_list; + struct exp_tid_set tid_used_list; + struct exp_tid_set tid_full_list; + + /* lock protecting all Expected TID data */ + struct mutex exp_lock; + /* number of pio bufs for this ctxt (all procs, if shared) */ + u32 piocnt; + /* first pio buffer for this ctxt */ + u32 pio_base; + /* chip offset of PIO buffers for this ctxt */ + u32 piobufs; + /* per-context configuration flags */ + u32 flags; + /* per-context event flags for fileops/intr communication */ + unsigned long event_flags; + /* WAIT_RCV that timed out, no interrupt */ + u32 rcvwait_to; + /* WAIT_PIO that timed out, no interrupt */ + u32 piowait_to; + /* WAIT_RCV already happened, no wait */ + u32 rcvnowait; + /* WAIT_PIO already happened, no wait */ + u32 pionowait; + /* total number of polled urgent packets */ + u32 urgent; + /* saved total number of polled urgent packets for poll edge trigger */ + u32 urgent_poll; + /* pid of process using this ctxt */ + pid_t pid; + pid_t subpid[HFI1_MAX_SHARED_CTXTS]; + /* same size as task_struct .comm[], command that opened context */ + char comm[TASK_COMM_LEN]; + /* so file ops can get at unit */ + struct hfi1_devdata *dd; + /* so functions that need physical port can get it easily */ + struct hfi1_pportdata *ppd; + /* A page of memory for rcvhdrhead, rcvegrhead, rcvegrtail * N */ + void *subctxt_uregbase; + /* An array of pages for the eager receive buffers * N */ + void *subctxt_rcvegrbuf; + /* An array of pages for the eager header queue entries * N */ + void *subctxt_rcvhdr_base; + /* The version of the library which opened this ctxt */ + u32 userversion; + /* Bitmask of active slaves */ + u32 active_slaves; + /* Type of packets or conditions we want to poll for */ + u16 poll_type; + /* receive packet sequence counter */ + u8 seq_cnt; + u8 redirect_seq_cnt; + /* ctxt rcvhdrq head offset */ + u32 head; + u32 pkt_count; + /* QPs waiting for context processing */ + struct list_head qp_wait_list; + /* interrupt handling */ + u64 imask; /* clear interrupt mask */ + int ireg; /* clear interrupt register */ + unsigned numa_id; /* numa node of this context */ + /* verbs stats per CTX */ + struct hfi1_opcode_stats_perctx *opstats; + /* + * This is the kernel thread that will keep making + * progress on the user sdma requests behind the scenes. + * There is one per context (shared contexts use the master's). + */ + struct task_struct *progress; + struct list_head sdma_queues; + /* protect sdma queues */ + spinlock_t sdma_qlock; + + /* Is ASPM interrupt supported for this context */ + bool aspm_intr_supported; + /* ASPM state (enabled/disabled) for this context */ + bool aspm_enabled; + /* Timer for re-enabling ASPM if interrupt activity quietens down */ + struct timer_list aspm_timer; + /* Lock to serialize between intr, timer intr and user threads */ + spinlock_t aspm_lock; + /* Is ASPM processing enabled for this context (in intr context) */ + bool aspm_intr_enable; + /* Last interrupt timestamp */ + ktime_t aspm_ts_last_intr; + /* Last timestamp at which we scheduled a timer for this context */ + ktime_t aspm_ts_timer_sched; + + /* + * The interrupt handler for a particular receive context can vary + * throughout it's lifetime. This is not a lock protected data member so + * it must be updated atomically and the prev and new value must always + * be valid. Worst case is we process an extra interrupt and up to 64 + * packets with the wrong interrupt handler. + */ + int (*do_interrupt)(struct hfi1_ctxtdata *rcd, int threaded); +}; + +/* + * Represents a single packet at a high level. Put commonly computed things in + * here so we do not have to keep doing them over and over. The rule of thumb is + * if something is used one time to derive some value, store that something in + * here. If it is used multiple times, then store the result of that derivation + * in here. + */ +struct hfi1_packet { + void *ebuf; + void *hdr; + struct hfi1_ctxtdata *rcd; + __le32 *rhf_addr; + struct rvt_qp *qp; + struct hfi1_other_headers *ohdr; + u64 rhf; + u32 maxcnt; + u32 rhqoff; + u32 hdrqtail; + int numpkt; + u16 tlen; + u16 hlen; + s16 etail; + u16 rsize; + u8 updegr; + u8 rcv_flags; + u8 etype; +}; + +static inline bool has_sc4_bit(struct hfi1_packet *p) +{ + return !!rhf_dc_info(p->rhf); +} + +/* + * Private data for snoop/capture support. + */ +struct hfi1_snoop_data { + int mode_flag; + struct cdev cdev; + struct device *class_dev; + /* protect snoop data */ + spinlock_t snoop_lock; + struct list_head queue; + wait_queue_head_t waitq; + void *filter_value; + int (*filter_callback)(void *hdr, void *data, void *value); + u64 dcc_cfg; /* saved value of DCC Cfg register */ +}; + +/* snoop mode_flag values */ +#define HFI1_PORT_SNOOP_MODE 1U +#define HFI1_PORT_CAPTURE_MODE 2U + +struct rvt_sge_state; + +/* + * Get/Set IB link-level config parameters for f_get/set_ib_cfg() + * Mostly for MADs that set or query link parameters, also ipath + * config interfaces + */ +#define HFI1_IB_CFG_LIDLMC 0 /* LID (LS16b) and Mask (MS16b) */ +#define HFI1_IB_CFG_LWID_DG_ENB 1 /* allowed Link-width downgrade */ +#define HFI1_IB_CFG_LWID_ENB 2 /* allowed Link-width */ +#define HFI1_IB_CFG_LWID 3 /* currently active Link-width */ +#define HFI1_IB_CFG_SPD_ENB 4 /* allowed Link speeds */ +#define HFI1_IB_CFG_SPD 5 /* current Link spd */ +#define HFI1_IB_CFG_RXPOL_ENB 6 /* Auto-RX-polarity enable */ +#define HFI1_IB_CFG_LREV_ENB 7 /* Auto-Lane-reversal enable */ +#define HFI1_IB_CFG_LINKLATENCY 8 /* Link Latency (IB1.2 only) */ +#define HFI1_IB_CFG_HRTBT 9 /* IB heartbeat off/enable/auto; DDR/QDR only */ +#define HFI1_IB_CFG_OP_VLS 10 /* operational VLs */ +#define HFI1_IB_CFG_VL_HIGH_CAP 11 /* num of VL high priority weights */ +#define HFI1_IB_CFG_VL_LOW_CAP 12 /* num of VL low priority weights */ +#define HFI1_IB_CFG_OVERRUN_THRESH 13 /* IB overrun threshold */ +#define HFI1_IB_CFG_PHYERR_THRESH 14 /* IB PHY error threshold */ +#define HFI1_IB_CFG_LINKDEFAULT 15 /* IB link default (sleep/poll) */ +#define HFI1_IB_CFG_PKEYS 16 /* update partition keys */ +#define HFI1_IB_CFG_MTU 17 /* update MTU in IBC */ +#define HFI1_IB_CFG_VL_HIGH_LIMIT 19 +#define HFI1_IB_CFG_PMA_TICKS 20 /* PMA sample tick resolution */ +#define HFI1_IB_CFG_PORT 21 /* switch port we are connected to */ + +/* + * HFI or Host Link States + * + * These describe the states the driver thinks the logical and physical + * states are in. Used as an argument to set_link_state(). Implemented + * as bits for easy multi-state checking. The actual state can only be + * one. + */ +#define __HLS_UP_INIT_BP 0 +#define __HLS_UP_ARMED_BP 1 +#define __HLS_UP_ACTIVE_BP 2 +#define __HLS_DN_DOWNDEF_BP 3 /* link down default */ +#define __HLS_DN_POLL_BP 4 +#define __HLS_DN_DISABLE_BP 5 +#define __HLS_DN_OFFLINE_BP 6 +#define __HLS_VERIFY_CAP_BP 7 +#define __HLS_GOING_UP_BP 8 +#define __HLS_GOING_OFFLINE_BP 9 +#define __HLS_LINK_COOLDOWN_BP 10 + +#define HLS_UP_INIT BIT(__HLS_UP_INIT_BP) +#define HLS_UP_ARMED BIT(__HLS_UP_ARMED_BP) +#define HLS_UP_ACTIVE BIT(__HLS_UP_ACTIVE_BP) +#define HLS_DN_DOWNDEF BIT(__HLS_DN_DOWNDEF_BP) /* link down default */ +#define HLS_DN_POLL BIT(__HLS_DN_POLL_BP) +#define HLS_DN_DISABLE BIT(__HLS_DN_DISABLE_BP) +#define HLS_DN_OFFLINE BIT(__HLS_DN_OFFLINE_BP) +#define HLS_VERIFY_CAP BIT(__HLS_VERIFY_CAP_BP) +#define HLS_GOING_UP BIT(__HLS_GOING_UP_BP) +#define HLS_GOING_OFFLINE BIT(__HLS_GOING_OFFLINE_BP) +#define HLS_LINK_COOLDOWN BIT(__HLS_LINK_COOLDOWN_BP) + +#define HLS_UP (HLS_UP_INIT | HLS_UP_ARMED | HLS_UP_ACTIVE) +#define HLS_DOWN ~(HLS_UP) + +/* use this MTU size if none other is given */ +#define HFI1_DEFAULT_ACTIVE_MTU 10240 +/* use this MTU size as the default maximum */ +#define HFI1_DEFAULT_MAX_MTU 10240 +/* default partition key */ +#define DEFAULT_PKEY 0xffff + +/* + * Possible fabric manager config parameters for fm_{get,set}_table() + */ +#define FM_TBL_VL_HIGH_ARB 1 /* Get/set VL high prio weights */ +#define FM_TBL_VL_LOW_ARB 2 /* Get/set VL low prio weights */ +#define FM_TBL_BUFFER_CONTROL 3 /* Get/set Buffer Control */ +#define FM_TBL_SC2VLNT 4 /* Get/set SC->VLnt */ +#define FM_TBL_VL_PREEMPT_ELEMS 5 /* Get (no set) VL preempt elems */ +#define FM_TBL_VL_PREEMPT_MATRIX 6 /* Get (no set) VL preempt matrix */ + +/* + * Possible "operations" for f_rcvctrl(ppd, op, ctxt) + * these are bits so they can be combined, e.g. + * HFI1_RCVCTRL_INTRAVAIL_ENB | HFI1_RCVCTRL_CTXT_ENB + */ +#define HFI1_RCVCTRL_TAILUPD_ENB 0x01 +#define HFI1_RCVCTRL_TAILUPD_DIS 0x02 +#define HFI1_RCVCTRL_CTXT_ENB 0x04 +#define HFI1_RCVCTRL_CTXT_DIS 0x08 +#define HFI1_RCVCTRL_INTRAVAIL_ENB 0x10 +#define HFI1_RCVCTRL_INTRAVAIL_DIS 0x20 +#define HFI1_RCVCTRL_PKEY_ENB 0x40 /* Note, default is enabled */ +#define HFI1_RCVCTRL_PKEY_DIS 0x80 +#define HFI1_RCVCTRL_TIDFLOW_ENB 0x0400 +#define HFI1_RCVCTRL_TIDFLOW_DIS 0x0800 +#define HFI1_RCVCTRL_ONE_PKT_EGR_ENB 0x1000 +#define HFI1_RCVCTRL_ONE_PKT_EGR_DIS 0x2000 +#define HFI1_RCVCTRL_NO_RHQ_DROP_ENB 0x4000 +#define HFI1_RCVCTRL_NO_RHQ_DROP_DIS 0x8000 +#define HFI1_RCVCTRL_NO_EGR_DROP_ENB 0x10000 +#define HFI1_RCVCTRL_NO_EGR_DROP_DIS 0x20000 + +/* partition enforcement flags */ +#define HFI1_PART_ENFORCE_IN 0x1 +#define HFI1_PART_ENFORCE_OUT 0x2 + +/* how often we check for synthetic counter wrap around */ +#define SYNTH_CNT_TIME 2 + +/* Counter flags */ +#define CNTR_NORMAL 0x0 /* Normal counters, just read register */ +#define CNTR_SYNTH 0x1 /* Synthetic counters, saturate at all 1s */ +#define CNTR_DISABLED 0x2 /* Disable this counter */ +#define CNTR_32BIT 0x4 /* Simulate 64 bits for this counter */ +#define CNTR_VL 0x8 /* Per VL counter */ +#define CNTR_SDMA 0x10 +#define CNTR_INVALID_VL -1 /* Specifies invalid VL */ +#define CNTR_MODE_W 0x0 +#define CNTR_MODE_R 0x1 + +/* VLs Supported/Operational */ +#define HFI1_MIN_VLS_SUPPORTED 1 +#define HFI1_MAX_VLS_SUPPORTED 8 + +static inline void incr_cntr64(u64 *cntr) +{ + if (*cntr < (u64)-1LL) + (*cntr)++; +} + +static inline void incr_cntr32(u32 *cntr) +{ + if (*cntr < (u32)-1LL) + (*cntr)++; +} + +#define MAX_NAME_SIZE 64 +struct hfi1_msix_entry { + enum irq_type type; + struct msix_entry msix; + void *arg; + char name[MAX_NAME_SIZE]; + cpumask_t mask; +}; + +/* per-SL CCA information */ +struct cca_timer { + struct hrtimer hrtimer; + struct hfi1_pportdata *ppd; /* read-only */ + int sl; /* read-only */ + u16 ccti; /* read/write - current value of CCTI */ +}; + +struct link_down_reason { + /* + * SMA-facing value. Should be set from .latest when + * HLS_UP_* -> HLS_DN_* transition actually occurs. + */ + u8 sma; + u8 latest; +}; + +enum { + LO_PRIO_TABLE, + HI_PRIO_TABLE, + MAX_PRIO_TABLE +}; + +struct vl_arb_cache { + /* protect vl arb cache */ + spinlock_t lock; + struct ib_vl_weight_elem table[VL_ARB_TABLE_SIZE]; +}; + +/* + * The structure below encapsulates data relevant to a physical IB Port. + * Current chips support only one such port, but the separation + * clarifies things a bit. Note that to conform to IB conventions, + * port-numbers are one-based. The first or only port is port1. + */ +struct hfi1_pportdata { + struct hfi1_ibport ibport_data; + + struct hfi1_devdata *dd; + struct kobject pport_cc_kobj; + struct kobject sc2vl_kobj; + struct kobject sl2sc_kobj; + struct kobject vl2mtu_kobj; + + /* PHY support */ + u32 port_type; + struct qsfp_data qsfp_info; + + /* GUID for this interface, in host order */ + u64 guid; + /* GUID for peer interface, in host order */ + u64 neighbor_guid; + + /* up or down physical link state */ + u32 linkup; + + /* + * this address is mapped read-only into user processes so they can + * get status cheaply, whenever they want. One qword of status per port + */ + u64 *statusp; + + /* SendDMA related entries */ + + struct workqueue_struct *hfi1_wq; + + /* move out of interrupt context */ + struct work_struct link_vc_work; + struct work_struct link_up_work; + struct work_struct link_down_work; + struct work_struct sma_message_work; + struct work_struct freeze_work; + struct work_struct link_downgrade_work; + struct work_struct link_bounce_work; + /* host link state variables */ + struct mutex hls_lock; + u32 host_link_state; + + spinlock_t sdma_alllock ____cacheline_aligned_in_smp; + + u32 lstate; /* logical link state */ + + /* these are the "32 bit" regs */ + + u32 ibmtu; /* The MTU programmed for this unit */ + /* + * Current max size IB packet (in bytes) including IB headers, that + * we can send. Changes when ibmtu changes. + */ + u32 ibmaxlen; + u32 current_egress_rate; /* units [10^6 bits/sec] */ + /* LID programmed for this instance */ + u16 lid; + /* list of pkeys programmed; 0 if not set */ + u16 pkeys[MAX_PKEY_VALUES]; + u16 link_width_supported; + u16 link_width_downgrade_supported; + u16 link_speed_supported; + u16 link_width_enabled; + u16 link_width_downgrade_enabled; + u16 link_speed_enabled; + u16 link_width_active; + u16 link_width_downgrade_tx_active; + u16 link_width_downgrade_rx_active; + u16 link_speed_active; + u8 vls_supported; + u8 vls_operational; + u8 actual_vls_operational; + /* LID mask control */ + u8 lmc; + /* Rx Polarity inversion (compensate for ~tx on partner) */ + u8 rx_pol_inv; + + u8 hw_pidx; /* physical port index */ + u8 port; /* IB port number and index into dd->pports - 1 */ + /* type of neighbor node */ + u8 neighbor_type; + u8 neighbor_normal; + u8 neighbor_fm_security; /* 1 if firmware checking is disabled */ + u8 neighbor_port_number; + u8 is_sm_config_started; + u8 offline_disabled_reason; + u8 is_active_optimize_enabled; + u8 driver_link_ready; /* driver ready for active link */ + u8 link_enabled; /* link enabled? */ + u8 linkinit_reason; + u8 local_tx_rate; /* rate given to 8051 firmware */ + u8 last_pstate; /* info only */ + + /* placeholders for IB MAD packet settings */ + u8 overrun_threshold; + u8 phy_error_threshold; + + /* Used to override LED behavior for things like maintenance beaconing*/ + /* + * Alternates per phase of blink + * [0] holds LED off duration, [1] holds LED on duration + */ + unsigned long led_override_vals[2]; + u8 led_override_phase; /* LSB picks from vals[] */ + atomic_t led_override_timer_active; + /* Used to flash LEDs in override mode */ + struct timer_list led_override_timer; + + u32 sm_trap_qp; + u32 sa_qp; + + /* + * cca_timer_lock protects access to the per-SL cca_timer + * structures (specifically the ccti member). + */ + spinlock_t cca_timer_lock ____cacheline_aligned_in_smp; + struct cca_timer cca_timer[OPA_MAX_SLS]; + + /* List of congestion control table entries */ + struct ib_cc_table_entry_shadow ccti_entries[CC_TABLE_SHADOW_MAX]; + + /* congestion entries, each entry corresponding to a SL */ + struct opa_congestion_setting_entry_shadow + congestion_entries[OPA_MAX_SLS]; + + /* + * cc_state_lock protects (write) access to the per-port + * struct cc_state. + */ + spinlock_t cc_state_lock ____cacheline_aligned_in_smp; + + struct cc_state __rcu *cc_state; + + /* Total number of congestion control table entries */ + u16 total_cct_entry; + + /* Bit map identifying service level */ + u32 cc_sl_control_map; + + /* CA's max number of 64 entry units in the congestion control table */ + u8 cc_max_table_entries; + + /* + * begin congestion log related entries + * cc_log_lock protects all congestion log related data + */ + spinlock_t cc_log_lock ____cacheline_aligned_in_smp; + u8 threshold_cong_event_map[OPA_MAX_SLS / 8]; + u16 threshold_event_counter; + struct opa_hfi1_cong_log_event_internal cc_events[OPA_CONG_LOG_ELEMS]; + int cc_log_idx; /* index for logging events */ + int cc_mad_idx; /* index for reporting events */ + /* end congestion log related entries */ + + struct vl_arb_cache vl_arb_cache[MAX_PRIO_TABLE]; + + /* port relative counter buffer */ + u64 *cntrs; + /* port relative synthetic counter buffer */ + u64 *scntrs; + /* port_xmit_discards are synthesized from different egress errors */ + u64 port_xmit_discards; + u64 port_xmit_discards_vl[C_VL_COUNT]; + u64 port_xmit_constraint_errors; + u64 port_rcv_constraint_errors; + /* count of 'link_err' interrupts from DC */ + u64 link_downed; + /* number of times link retrained successfully */ + u64 link_up; + /* number of times a link unknown frame was reported */ + u64 unknown_frame_count; + /* port_ltp_crc_mode is returned in 'portinfo' MADs */ + u16 port_ltp_crc_mode; + /* port_crc_mode_enabled is the crc we support */ + u8 port_crc_mode_enabled; + /* mgmt_allowed is also returned in 'portinfo' MADs */ + u8 mgmt_allowed; + u8 part_enforce; /* partition enforcement flags */ + struct link_down_reason local_link_down_reason; + struct link_down_reason neigh_link_down_reason; + /* Value to be sent to link peer on LinkDown .*/ + u8 remote_link_down_reason; + /* Error events that will cause a port bounce. */ + u32 port_error_action; + struct work_struct linkstate_active_work; + /* Does this port need to prescan for FECNs */ + bool cc_prescan; +}; + +typedef int (*rhf_rcv_function_ptr)(struct hfi1_packet *packet); + +typedef void (*opcode_handler)(struct hfi1_packet *packet); + +/* return values for the RHF receive functions */ +#define RHF_RCV_CONTINUE 0 /* keep going */ +#define RHF_RCV_DONE 1 /* stop, this packet processed */ +#define RHF_RCV_REPROCESS 2 /* stop. retain this packet */ + +struct rcv_array_data { + u8 group_size; + u16 ngroups; + u16 nctxt_extra; +}; + +struct per_vl_data { + u16 mtu; + struct send_context *sc; +}; + +/* 16 to directly index */ +#define PER_VL_SEND_CONTEXTS 16 + +struct err_info_rcvport { + u8 status_and_code; + u64 packet_flit1; + u64 packet_flit2; +}; + +struct err_info_constraint { + u8 status; + u16 pkey; + u32 slid; +}; + +struct hfi1_temp { + unsigned int curr; /* current temperature */ + unsigned int lo_lim; /* low temperature limit */ + unsigned int hi_lim; /* high temperature limit */ + unsigned int crit_lim; /* critical temperature limit */ + u8 triggers; /* temperature triggers */ +}; + +/* common data between shared ASIC HFIs */ +struct hfi1_asic_data { + struct hfi1_devdata *dds[2]; /* back pointers */ + struct mutex asic_resource_mutex; +}; + +/* device data struct now contains only "general per-device" info. + * fields related to a physical IB port are in a hfi1_pportdata struct. + */ +struct sdma_engine; +struct sdma_vl_map; + +#define BOARD_VERS_MAX 96 /* how long the version string can be */ +#define SERIAL_MAX 16 /* length of the serial number */ + +typedef int (*send_routine)(struct rvt_qp *, struct hfi1_pkt_state *, u64); +struct hfi1_devdata { + struct hfi1_ibdev verbs_dev; /* must be first */ + struct list_head list; + /* pointers to related structs for this device */ + /* pci access data structure */ + struct pci_dev *pcidev; + struct cdev user_cdev; + struct cdev diag_cdev; + struct cdev ui_cdev; + struct device *user_device; + struct device *diag_device; + struct device *ui_device; + + /* mem-mapped pointer to base of chip regs */ + u8 __iomem *kregbase; + /* end of mem-mapped chip space excluding sendbuf and user regs */ + u8 __iomem *kregend; + /* physical address of chip for io_remap, etc. */ + resource_size_t physaddr; + /* receive context data */ + struct hfi1_ctxtdata **rcd; + /* send context data */ + struct send_context_info *send_contexts; + /* map hardware send contexts to software index */ + u8 *hw_to_sw; + /* spinlock for allocating and releasing send context resources */ + spinlock_t sc_lock; + /* Per VL data. Enough for all VLs but not all elements are set/used. */ + struct per_vl_data vld[PER_VL_SEND_CONTEXTS]; + /* lock for pio_map */ + spinlock_t pio_map_lock; + /* array of kernel send contexts */ + struct send_context **kernel_send_context; + /* array of vl maps */ + struct pio_vl_map __rcu *pio_map; + /* seqlock for sc2vl */ + seqlock_t sc2vl_lock; + u64 sc2vl[4]; + /* Send Context initialization lock. */ + spinlock_t sc_init_lock; + + /* fields common to all SDMA engines */ + + /* default flags to last descriptor */ + u64 default_desc1; + volatile __le64 *sdma_heads_dma; /* DMA'ed by chip */ + dma_addr_t sdma_heads_phys; + void *sdma_pad_dma; /* DMA'ed by chip */ + dma_addr_t sdma_pad_phys; + /* for deallocation */ + size_t sdma_heads_size; + /* number from the chip */ + u32 chip_sdma_engines; + /* num used */ + u32 num_sdma; + /* lock for sdma_map */ + spinlock_t sde_map_lock; + /* array of engines sized by num_sdma */ + struct sdma_engine *per_sdma; + /* array of vl maps */ + struct sdma_vl_map __rcu *sdma_map; + /* SPC freeze waitqueue and variable */ + wait_queue_head_t sdma_unfreeze_wq; + atomic_t sdma_unfreeze_count; + + /* common data between shared ASIC HFIs in this OS */ + struct hfi1_asic_data *asic_data; + + /* hfi1_pportdata, points to array of (physical) port-specific + * data structs, indexed by pidx (0..n-1) + */ + struct hfi1_pportdata *pport; + + /* mem-mapped pointer to base of PIO buffers */ + void __iomem *piobase; + /* + * write-combining mem-mapped pointer to base of RcvArray + * memory. + */ + void __iomem *rcvarray_wc; + /* + * credit return base - a per-NUMA range of DMA address that + * the chip will use to update the per-context free counter + */ + struct credit_return_base *cr_base; + + /* send context numbers and sizes for each type */ + struct sc_config_sizes sc_sizes[SC_MAX]; + + u32 lcb_access_count; /* count of LCB users */ + + char *boardname; /* human readable board info */ + + /* device (not port) flags, basically device capabilities */ + u32 flags; + + /* reset value */ + u64 z_int_counter; + u64 z_rcv_limit; + u64 z_send_schedule; + /* percpu int_counter */ + u64 __percpu *int_counter; + u64 __percpu *rcv_limit; + u64 __percpu *send_schedule; + /* number of receive contexts in use by the driver */ + u32 num_rcv_contexts; + /* number of pio send contexts in use by the driver */ + u32 num_send_contexts; + /* + * number of ctxts available for PSM open + */ + u32 freectxts; + /* total number of available user/PSM contexts */ + u32 num_user_contexts; + /* base receive interrupt timeout, in CSR units */ + u32 rcv_intr_timeout_csr; + + u64 __iomem *egrtidbase; + spinlock_t sendctrl_lock; /* protect changes to SendCtrl */ + spinlock_t rcvctrl_lock; /* protect changes to RcvCtrl */ + /* around rcd and (user ctxts) ctxt_cnt use (intr vs free) */ + spinlock_t uctxt_lock; /* rcd and user context changes */ + /* exclusive access to 8051 */ + spinlock_t dc8051_lock; + /* exclusive access to 8051 memory */ + spinlock_t dc8051_memlock; + int dc8051_timed_out; /* remember if the 8051 timed out */ + /* + * A page that will hold event notification bitmaps for all + * contexts. This page will be mapped into all processes. + */ + unsigned long *events; + /* + * per unit status, see also portdata statusp + * mapped read-only into user processes so they can get unit and + * IB link status cheaply + */ + struct hfi1_status *status; + u32 freezelen; /* max length of freezemsg */ + + /* revision register shadow */ + u64 revision; + /* Base GUID for device (network order) */ + u64 base_guid; + + /* these are the "32 bit" regs */ + + /* value we put in kr_rcvhdrsize */ + u32 rcvhdrsize; + /* number of receive contexts the chip supports */ + u32 chip_rcv_contexts; + /* number of receive array entries */ + u32 chip_rcv_array_count; + /* number of PIO send contexts the chip supports */ + u32 chip_send_contexts; + /* number of bytes in the PIO memory buffer */ + u32 chip_pio_mem_size; + /* number of bytes in the SDMA memory buffer */ + u32 chip_sdma_mem_size; + + /* size of each rcvegrbuffer */ + u32 rcvegrbufsize; + /* log2 of above */ + u16 rcvegrbufsize_shift; + /* both sides of the PCIe link are gen3 capable */ + u8 link_gen3_capable; + /* localbus width (1, 2,4,8,16,32) from config space */ + u32 lbus_width; + /* localbus speed in MHz */ + u32 lbus_speed; + int unit; /* unit # of this chip */ + int node; /* home node of this chip */ + + /* save these PCI fields to restore after a reset */ + u32 pcibar0; + u32 pcibar1; + u32 pci_rom; + u16 pci_command; + u16 pcie_devctl; + u16 pcie_lnkctl; + u16 pcie_devctl2; + u32 pci_msix0; + u32 pci_lnkctl3; + u32 pci_tph2; + + /* + * ASCII serial number, from flash, large enough for original + * all digit strings, and longer serial number format + */ + u8 serial[SERIAL_MAX]; + /* human readable board version */ + u8 boardversion[BOARD_VERS_MAX]; + u8 lbus_info[32]; /* human readable localbus info */ + /* chip major rev, from CceRevision */ + u8 majrev; + /* chip minor rev, from CceRevision */ + u8 minrev; + /* hardware ID */ + u8 hfi1_id; + /* implementation code */ + u8 icode; + /* default link down value (poll/sleep) */ + u8 link_default; + /* vAU of this device */ + u8 vau; + /* vCU of this device */ + u8 vcu; + /* link credits of this device */ + u16 link_credits; + /* initial vl15 credits to use */ + u16 vl15_init; + + /* Misc small ints */ + /* Number of physical ports available */ + u8 num_pports; + /* Lowest context number which can be used by user processes */ + u8 first_user_ctxt; + u8 n_krcv_queues; + u8 qos_shift; + u8 qpn_mask; + + u16 rhf_offset; /* offset of RHF within receive header entry */ + u16 irev; /* implementation revision */ + u16 dc8051_ver; /* 8051 firmware version */ + + struct platform_config platform_config; + struct platform_config_cache pcfg_cache; + + struct diag_client *diag_client; + spinlock_t hfi1_diag_trans_lock; /* protect diag observer ops */ + + u8 psxmitwait_supported; + /* cycle length of PS* counters in HW (in picoseconds) */ + u16 psxmitwait_check_rate; + /* high volume overflow errors deferred to tasklet */ + struct tasklet_struct error_tasklet; + + /* MSI-X information */ + struct hfi1_msix_entry *msix_entries; + u32 num_msix_entries; + + /* INTx information */ + u32 requested_intx_irq; /* did we request one? */ + char intx_name[MAX_NAME_SIZE]; /* INTx name */ + + /* general interrupt: mask of handled interrupts */ + u64 gi_mask[CCE_NUM_INT_CSRS]; + + struct rcv_array_data rcv_entries; + + /* + * 64 bit synthetic counters + */ + struct timer_list synth_stats_timer; + + /* + * device counters + */ + char *cntrnames; + size_t cntrnameslen; + size_t ndevcntrs; + u64 *cntrs; + u64 *scntrs; + + /* + * remembered values for synthetic counters + */ + u64 last_tx; + u64 last_rx; + + /* + * per-port counters + */ + size_t nportcntrs; + char *portcntrnames; + size_t portcntrnameslen; + + struct hfi1_snoop_data hfi1_snoop; + + struct err_info_rcvport err_info_rcvport; + struct err_info_constraint err_info_rcv_constraint; + struct err_info_constraint err_info_xmit_constraint; + u8 err_info_uncorrectable; + u8 err_info_fmconfig; + + atomic_t drop_packet; + u8 do_drop; + + /* + * Software counters for the status bits defined by the + * associated error status registers + */ + u64 cce_err_status_cnt[NUM_CCE_ERR_STATUS_COUNTERS]; + u64 rcv_err_status_cnt[NUM_RCV_ERR_STATUS_COUNTERS]; + u64 misc_err_status_cnt[NUM_MISC_ERR_STATUS_COUNTERS]; + u64 send_pio_err_status_cnt[NUM_SEND_PIO_ERR_STATUS_COUNTERS]; + u64 send_dma_err_status_cnt[NUM_SEND_DMA_ERR_STATUS_COUNTERS]; + u64 send_egress_err_status_cnt[NUM_SEND_EGRESS_ERR_STATUS_COUNTERS]; + u64 send_err_status_cnt[NUM_SEND_ERR_STATUS_COUNTERS]; + + /* Software counter that spans all contexts */ + u64 sw_ctxt_err_status_cnt[NUM_SEND_CTXT_ERR_STATUS_COUNTERS]; + /* Software counter that spans all DMA engines */ + u64 sw_send_dma_eng_err_status_cnt[ + NUM_SEND_DMA_ENG_ERR_STATUS_COUNTERS]; + /* Software counter that aggregates all cce_err_status errors */ + u64 sw_cce_err_status_aggregate; + + /* receive interrupt functions */ + rhf_rcv_function_ptr *rhf_rcv_function_map; + rhf_rcv_function_ptr normal_rhf_rcv_functions[8]; + + /* + * Handlers for outgoing data so that snoop/capture does not + * have to have its hooks in the send path + */ + send_routine process_pio_send; + send_routine process_dma_send; + void (*pio_inline_send)(struct hfi1_devdata *dd, struct pio_buf *pbuf, + u64 pbc, const void *from, size_t count); + + /* OUI comes from the HW. Used everywhere as 3 separate bytes. */ + u8 oui1; + u8 oui2; + u8 oui3; + /* Timer and counter used to detect RcvBufOvflCnt changes */ + struct timer_list rcverr_timer; + u32 rcv_ovfl_cnt; + + wait_queue_head_t event_queue; + + /* Save the enabled LCB error bits */ + u64 lcb_err_en; + u8 dc_shutdown; + + /* receive context tail dummy address */ + __le64 *rcvhdrtail_dummy_kvaddr; + dma_addr_t rcvhdrtail_dummy_physaddr; + + bool eprom_available; /* true if EPROM is available for this device */ + bool aspm_supported; /* Does HW support ASPM */ + bool aspm_enabled; /* ASPM state: enabled/disabled */ + /* Serialize ASPM enable/disable between multiple verbs contexts */ + spinlock_t aspm_lock; + /* Number of verbs contexts which have disabled ASPM */ + atomic_t aspm_disabled_cnt; + + struct hfi1_affinity *affinity; + struct kobject kobj; +}; + +/* 8051 firmware version helper */ +#define dc8051_ver(a, b) ((a) << 8 | (b)) + +/* f_put_tid types */ +#define PT_EXPECTED 0 +#define PT_EAGER 1 +#define PT_INVALID 2 + +struct tid_rb_node; +struct mmu_rb_node; + +/* Private data for file operations */ +struct hfi1_filedata { + struct hfi1_ctxtdata *uctxt; + unsigned subctxt; + struct hfi1_user_sdma_comp_q *cq; + struct hfi1_user_sdma_pkt_q *pq; + /* for cpu affinity; -1 if none */ + int rec_cpu_num; + u32 tid_n_pinned; + struct rb_root tid_rb_root; + struct tid_rb_node **entry_to_rb; + spinlock_t tid_lock; /* protect tid_[limit,used] counters */ + u32 tid_limit; + u32 tid_used; + u32 *invalid_tids; + u32 invalid_tid_idx; + /* protect invalid_tids array and invalid_tid_idx */ + spinlock_t invalid_lock; +}; + +extern struct list_head hfi1_dev_list; +extern spinlock_t hfi1_devs_lock; +struct hfi1_devdata *hfi1_lookup(int unit); +extern u32 hfi1_cpulist_count; +extern unsigned long *hfi1_cpulist; + +extern unsigned int snoop_drop_send; +extern unsigned int snoop_force_capture; +int hfi1_init(struct hfi1_devdata *, int); +int hfi1_count_units(int *npresentp, int *nupp); +int hfi1_count_active_units(void); + +int hfi1_diag_add(struct hfi1_devdata *); +void hfi1_diag_remove(struct hfi1_devdata *); +void handle_linkup_change(struct hfi1_devdata *dd, u32 linkup); + +void handle_user_interrupt(struct hfi1_ctxtdata *rcd); + +int hfi1_create_rcvhdrq(struct hfi1_devdata *, struct hfi1_ctxtdata *); +int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *); +int hfi1_create_ctxts(struct hfi1_devdata *dd); +struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *, u32, int); +void hfi1_init_pportdata(struct pci_dev *, struct hfi1_pportdata *, + struct hfi1_devdata *, u8, u8); +void hfi1_free_ctxtdata(struct hfi1_devdata *, struct hfi1_ctxtdata *); + +int handle_receive_interrupt(struct hfi1_ctxtdata *, int); +int handle_receive_interrupt_nodma_rtail(struct hfi1_ctxtdata *, int); +int handle_receive_interrupt_dma_rtail(struct hfi1_ctxtdata *, int); +void set_all_slowpath(struct hfi1_devdata *dd); + +/* receive packet handler dispositions */ +#define RCV_PKT_OK 0x0 /* keep going */ +#define RCV_PKT_LIMIT 0x1 /* stop, hit limit, start thread */ +#define RCV_PKT_DONE 0x2 /* stop, no more packets detected */ + +/* calculate the current RHF address */ +static inline __le32 *get_rhf_addr(struct hfi1_ctxtdata *rcd) +{ + return (__le32 *)rcd->rcvhdrq + rcd->head + rcd->dd->rhf_offset; +} + +int hfi1_reset_device(int); + +/* return the driver's idea of the logical OPA port state */ +static inline u32 driver_lstate(struct hfi1_pportdata *ppd) +{ + return ppd->lstate; /* use the cached value */ +} + +void receive_interrupt_work(struct work_struct *work); + +/* extract service channel from header and rhf */ +static inline int hdr2sc(struct hfi1_message_header *hdr, u64 rhf) +{ + return ((be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf) | + ((!!(rhf & RHF_DC_INFO_SMASK)) << 4); +} + +static inline u16 generate_jkey(kuid_t uid) +{ + return from_kuid(current_user_ns(), uid) & 0xffff; +} + +/* + * active_egress_rate + * + * returns the active egress rate in units of [10^6 bits/sec] + */ +static inline u32 active_egress_rate(struct hfi1_pportdata *ppd) +{ + u16 link_speed = ppd->link_speed_active; + u16 link_width = ppd->link_width_active; + u32 egress_rate; + + if (link_speed == OPA_LINK_SPEED_25G) + egress_rate = 25000; + else /* assume OPA_LINK_SPEED_12_5G */ + egress_rate = 12500; + + switch (link_width) { + case OPA_LINK_WIDTH_4X: + egress_rate *= 4; + break; + case OPA_LINK_WIDTH_3X: + egress_rate *= 3; + break; + case OPA_LINK_WIDTH_2X: + egress_rate *= 2; + break; + default: + /* assume IB_WIDTH_1X */ + break; + } + + return egress_rate; +} + +/* + * egress_cycles + * + * Returns the number of 'fabric clock cycles' to egress a packet + * of length 'len' bytes, at 'rate' Mbit/s. Since the fabric clock + * rate is (approximately) 805 MHz, the units of the returned value + * are (1/805 MHz). + */ +static inline u32 egress_cycles(u32 len, u32 rate) +{ + u32 cycles; + + /* + * cycles is: + * + * (length) [bits] / (rate) [bits/sec] + * --------------------------------------------------- + * fabric_clock_period == 1 /(805 * 10^6) [cycles/sec] + */ + + cycles = len * 8; /* bits */ + cycles *= 805; + cycles /= rate; + + return cycles; +} + +void set_link_ipg(struct hfi1_pportdata *ppd); +void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn, + u32 rqpn, u8 svc_type); +void return_cnp(struct hfi1_ibport *ibp, struct rvt_qp *qp, u32 remote_qpn, + u32 pkey, u32 slid, u32 dlid, u8 sc5, + const struct ib_grh *old_grh); +#define PKEY_CHECK_INVALID -1 +int egress_pkey_check(struct hfi1_pportdata *ppd, __be16 *lrh, __be32 *bth, + u8 sc5, int8_t s_pkey_index); + +#define PACKET_EGRESS_TIMEOUT 350 +static inline void pause_for_credit_return(struct hfi1_devdata *dd) +{ + /* Pause at least 1us, to ensure chip returns all credits */ + u32 usec = cclock_to_ns(dd, PACKET_EGRESS_TIMEOUT) / 1000; + + udelay(usec ? usec : 1); +} + +/** + * sc_to_vlt() reverse lookup sc to vl + * @dd - devdata + * @sc5 - 5 bit sc + */ +static inline u8 sc_to_vlt(struct hfi1_devdata *dd, u8 sc5) +{ + unsigned seq; + u8 rval; + + if (sc5 >= OPA_MAX_SCS) + return (u8)(0xff); + + do { + seq = read_seqbegin(&dd->sc2vl_lock); + rval = *(((u8 *)dd->sc2vl) + sc5); + } while (read_seqretry(&dd->sc2vl_lock, seq)); + + return rval; +} + +#define PKEY_MEMBER_MASK 0x8000 +#define PKEY_LOW_15_MASK 0x7fff + +/* + * ingress_pkey_matches_entry - return 1 if the pkey matches ent (ent + * being an entry from the ingress partition key table), return 0 + * otherwise. Use the matching criteria for ingress partition keys + * specified in the OPAv1 spec., section 9.10.14. + */ +static inline int ingress_pkey_matches_entry(u16 pkey, u16 ent) +{ + u16 mkey = pkey & PKEY_LOW_15_MASK; + u16 ment = ent & PKEY_LOW_15_MASK; + + if (mkey == ment) { + /* + * If pkey[15] is clear (limited partition member), + * is bit 15 in the corresponding table element + * clear (limited member)? + */ + if (!(pkey & PKEY_MEMBER_MASK)) + return !!(ent & PKEY_MEMBER_MASK); + return 1; + } + return 0; +} + +/* + * ingress_pkey_table_search - search the entire pkey table for + * an entry which matches 'pkey'. return 0 if a match is found, + * and 1 otherwise. + */ +static int ingress_pkey_table_search(struct hfi1_pportdata *ppd, u16 pkey) +{ + int i; + + for (i = 0; i < MAX_PKEY_VALUES; i++) { + if (ingress_pkey_matches_entry(pkey, ppd->pkeys[i])) + return 0; + } + return 1; +} + +/* + * ingress_pkey_table_fail - record a failure of ingress pkey validation, + * i.e., increment port_rcv_constraint_errors for the port, and record + * the 'error info' for this failure. + */ +static void ingress_pkey_table_fail(struct hfi1_pportdata *ppd, u16 pkey, + u16 slid) +{ + struct hfi1_devdata *dd = ppd->dd; + + incr_cntr64(&ppd->port_rcv_constraint_errors); + if (!(dd->err_info_rcv_constraint.status & OPA_EI_STATUS_SMASK)) { + dd->err_info_rcv_constraint.status |= OPA_EI_STATUS_SMASK; + dd->err_info_rcv_constraint.slid = slid; + dd->err_info_rcv_constraint.pkey = pkey; + } +} + +/* + * ingress_pkey_check - Return 0 if the ingress pkey is valid, return 1 + * otherwise. Use the criteria in the OPAv1 spec, section 9.10.14. idx + * is a hint as to the best place in the partition key table to begin + * searching. This function should not be called on the data path because + * of performance reasons. On datapath pkey check is expected to be done + * by HW and rcv_pkey_check function should be called instead. + */ +static inline int ingress_pkey_check(struct hfi1_pportdata *ppd, u16 pkey, + u8 sc5, u8 idx, u16 slid) +{ + if (!(ppd->part_enforce & HFI1_PART_ENFORCE_IN)) + return 0; + + /* If SC15, pkey[0:14] must be 0x7fff */ + if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK)) + goto bad; + + /* Is the pkey = 0x0, or 0x8000? */ + if ((pkey & PKEY_LOW_15_MASK) == 0) + goto bad; + + /* The most likely matching pkey has index 'idx' */ + if (ingress_pkey_matches_entry(pkey, ppd->pkeys[idx])) + return 0; + + /* no match - try the whole table */ + if (!ingress_pkey_table_search(ppd, pkey)) + return 0; + +bad: + ingress_pkey_table_fail(ppd, pkey, slid); + return 1; +} + +/* + * rcv_pkey_check - Return 0 if the ingress pkey is valid, return 1 + * otherwise. It only ensures pkey is vlid for QP0. This function + * should be called on the data path instead of ingress_pkey_check + * as on data path, pkey check is done by HW (except for QP0). + */ +static inline int rcv_pkey_check(struct hfi1_pportdata *ppd, u16 pkey, + u8 sc5, u16 slid) +{ + if (!(ppd->part_enforce & HFI1_PART_ENFORCE_IN)) + return 0; + + /* If SC15, pkey[0:14] must be 0x7fff */ + if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK)) + goto bad; + + return 0; +bad: + ingress_pkey_table_fail(ppd, pkey, slid); + return 1; +} + +/* MTU handling */ + +/* MTU enumeration, 256-4k match IB */ +#define OPA_MTU_0 0 +#define OPA_MTU_256 1 +#define OPA_MTU_512 2 +#define OPA_MTU_1024 3 +#define OPA_MTU_2048 4 +#define OPA_MTU_4096 5 + +u32 lrh_max_header_bytes(struct hfi1_devdata *dd); +int mtu_to_enum(u32 mtu, int default_if_bad); +u16 enum_to_mtu(int); +static inline int valid_ib_mtu(unsigned int mtu) +{ + return mtu == 256 || mtu == 512 || + mtu == 1024 || mtu == 2048 || + mtu == 4096; +} + +static inline int valid_opa_max_mtu(unsigned int mtu) +{ + return mtu >= 2048 && + (valid_ib_mtu(mtu) || mtu == 8192 || mtu == 10240); +} + +int set_mtu(struct hfi1_pportdata *); + +int hfi1_set_lid(struct hfi1_pportdata *, u32, u8); +void hfi1_disable_after_error(struct hfi1_devdata *); +int hfi1_set_uevent_bits(struct hfi1_pportdata *, const int); +int hfi1_rcvbuf_validate(u32, u8, u16 *); + +int fm_get_table(struct hfi1_pportdata *, int, void *); +int fm_set_table(struct hfi1_pportdata *, int, void *); + +void set_up_vl15(struct hfi1_devdata *dd, u8 vau, u16 vl15buf); +void reset_link_credits(struct hfi1_devdata *dd); +void assign_remote_cm_au_table(struct hfi1_devdata *dd, u8 vcu); + +int snoop_recv_handler(struct hfi1_packet *packet); +int snoop_send_dma_handler(struct rvt_qp *qp, struct hfi1_pkt_state *ps, + u64 pbc); +int snoop_send_pio_handler(struct rvt_qp *qp, struct hfi1_pkt_state *ps, + u64 pbc); +void snoop_inline_pio_send(struct hfi1_devdata *dd, struct pio_buf *pbuf, + u64 pbc, const void *from, size_t count); +int set_buffer_control(struct hfi1_pportdata *ppd, struct buffer_control *bc); + +static inline struct hfi1_devdata *dd_from_ppd(struct hfi1_pportdata *ppd) +{ + return ppd->dd; +} + +static inline struct hfi1_devdata *dd_from_dev(struct hfi1_ibdev *dev) +{ + return container_of(dev, struct hfi1_devdata, verbs_dev); +} + +static inline struct hfi1_devdata *dd_from_ibdev(struct ib_device *ibdev) +{ + return dd_from_dev(to_idev(ibdev)); +} + +static inline struct hfi1_pportdata *ppd_from_ibp(struct hfi1_ibport *ibp) +{ + return container_of(ibp, struct hfi1_pportdata, ibport_data); +} + +static inline struct hfi1_ibdev *dev_from_rdi(struct rvt_dev_info *rdi) +{ + return container_of(rdi, struct hfi1_ibdev, rdi); +} + +static inline struct hfi1_ibport *to_iport(struct ib_device *ibdev, u8 port) +{ + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + unsigned pidx = port - 1; /* IB number port from 1, hdw from 0 */ + + WARN_ON(pidx >= dd->num_pports); + return &dd->pport[pidx].ibport_data; +} + +/* + * Return the indexed PKEY from the port PKEY table. + */ +static inline u16 hfi1_get_pkey(struct hfi1_ibport *ibp, unsigned index) +{ + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u16 ret; + + if (index >= ARRAY_SIZE(ppd->pkeys)) + ret = 0; + else + ret = ppd->pkeys[index]; + + return ret; +} + +/* + * Readers of cc_state must call get_cc_state() under rcu_read_lock(). + * Writers of cc_state must call get_cc_state() under cc_state_lock. + */ +static inline struct cc_state *get_cc_state(struct hfi1_pportdata *ppd) +{ + return rcu_dereference(ppd->cc_state); +} + +/* + * values for dd->flags (_device_ related flags) + */ +#define HFI1_INITTED 0x1 /* chip and driver up and initted */ +#define HFI1_PRESENT 0x2 /* chip accesses can be done */ +#define HFI1_FROZEN 0x4 /* chip in SPC freeze */ +#define HFI1_HAS_SDMA_TIMEOUT 0x8 +#define HFI1_HAS_SEND_DMA 0x10 /* Supports Send DMA */ +#define HFI1_FORCED_FREEZE 0x80 /* driver forced freeze mode */ + +/* IB dword length mask in PBC (lower 11 bits); same for all chips */ +#define HFI1_PBC_LENGTH_MASK ((1 << 11) - 1) + +/* ctxt_flag bit offsets */ + /* context has been setup */ +#define HFI1_CTXT_SETUP_DONE 1 + /* waiting for a packet to arrive */ +#define HFI1_CTXT_WAITING_RCV 2 + /* master has not finished initializing */ +#define HFI1_CTXT_MASTER_UNINIT 4 + /* waiting for an urgent packet to arrive */ +#define HFI1_CTXT_WAITING_URG 5 + +/* free up any allocated data at closes */ +struct hfi1_devdata *hfi1_init_dd(struct pci_dev *, + const struct pci_device_id *); +void hfi1_free_devdata(struct hfi1_devdata *); +void cc_state_reclaim(struct rcu_head *rcu); +struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra); + +/* LED beaconing functions */ +void hfi1_start_led_override(struct hfi1_pportdata *ppd, unsigned int timeon, + unsigned int timeoff); +void shutdown_led_override(struct hfi1_pportdata *ppd); + +#define HFI1_CREDIT_RETURN_RATE (100) + +/* + * The number of words for the KDETH protocol field. If this is + * larger then the actual field used, then part of the payload + * will be in the header. + * + * Optimally, we want this sized so that a typical case will + * use full cache lines. The typical local KDETH header would + * be: + * + * Bytes Field + * 8 LRH + * 12 BHT + * ?? KDETH + * 8 RHF + * --- + * 28 + KDETH + * + * For a 64-byte cache line, KDETH would need to be 36 bytes or 9 DWORDS + */ +#define DEFAULT_RCVHDRSIZE 9 + +/* + * Maximal header byte count: + * + * Bytes Field + * 8 LRH + * 40 GRH (optional) + * 12 BTH + * ?? KDETH + * 8 RHF + * --- + * 68 + KDETH + * + * We also want to maintain a cache line alignment to assist DMA'ing + * of the header bytes. Round up to a good size. + */ +#define DEFAULT_RCVHDR_ENTSIZE 32 + +bool hfi1_can_pin_pages(struct hfi1_devdata *, u32, u32); +int hfi1_acquire_user_pages(unsigned long, size_t, bool, struct page **); +void hfi1_release_user_pages(struct mm_struct *, struct page **, size_t, bool); + +static inline void clear_rcvhdrtail(const struct hfi1_ctxtdata *rcd) +{ + *((u64 *)rcd->rcvhdrtail_kvaddr) = 0ULL; +} + +static inline u32 get_rcvhdrtail(const struct hfi1_ctxtdata *rcd) +{ + /* + * volatile because it's a DMA target from the chip, routine is + * inlined, and don't want register caching or reordering. + */ + return (u32)le64_to_cpu(*rcd->rcvhdrtail_kvaddr); +} + +/* + * sysfs interface. + */ + +extern const char ib_hfi1_version[]; + +int hfi1_device_create(struct hfi1_devdata *); +void hfi1_device_remove(struct hfi1_devdata *); + +int hfi1_create_port_files(struct ib_device *ibdev, u8 port_num, + struct kobject *kobj); +int hfi1_verbs_register_sysfs(struct hfi1_devdata *); +void hfi1_verbs_unregister_sysfs(struct hfi1_devdata *); +/* Hook for sysfs read of QSFP */ +int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len); + +int hfi1_pcie_init(struct pci_dev *, const struct pci_device_id *); +void hfi1_pcie_cleanup(struct pci_dev *); +int hfi1_pcie_ddinit(struct hfi1_devdata *, struct pci_dev *, + const struct pci_device_id *); +void hfi1_pcie_ddcleanup(struct hfi1_devdata *); +void hfi1_pcie_flr(struct hfi1_devdata *); +int pcie_speeds(struct hfi1_devdata *); +void request_msix(struct hfi1_devdata *, u32 *, struct hfi1_msix_entry *); +void hfi1_enable_intx(struct pci_dev *); +void restore_pci_variables(struct hfi1_devdata *dd); +int do_pcie_gen3_transition(struct hfi1_devdata *dd); +int parse_platform_config(struct hfi1_devdata *dd); +int get_platform_config_field(struct hfi1_devdata *dd, + enum platform_config_table_type_encoding + table_type, int table_index, int field_index, + u32 *data, u32 len); + +const char *get_unit_name(int unit); +const char *get_card_name(struct rvt_dev_info *rdi); +struct pci_dev *get_pci_dev(struct rvt_dev_info *rdi); + +/* + * Flush write combining store buffers (if present) and perform a write + * barrier. + */ +static inline void flush_wc(void) +{ + asm volatile("sfence" : : : "memory"); +} + +void handle_eflags(struct hfi1_packet *packet); +int process_receive_ib(struct hfi1_packet *packet); +int process_receive_bypass(struct hfi1_packet *packet); +int process_receive_error(struct hfi1_packet *packet); +int kdeth_process_expected(struct hfi1_packet *packet); +int kdeth_process_eager(struct hfi1_packet *packet); +int process_receive_invalid(struct hfi1_packet *packet); + +extern rhf_rcv_function_ptr snoop_rhf_rcv_functions[8]; + +void update_sge(struct rvt_sge_state *ss, u32 length); + +/* global module parameter variables */ +extern unsigned int hfi1_max_mtu; +extern unsigned int hfi1_cu; +extern unsigned int user_credit_return_threshold; +extern int num_user_contexts; +extern unsigned n_krcvqs; +extern uint krcvqs[]; +extern int krcvqsset; +extern uint kdeth_qp; +extern uint loopback; +extern uint quick_linkup; +extern uint rcv_intr_timeout; +extern uint rcv_intr_count; +extern uint rcv_intr_dynamic; +extern ushort link_crc_mask; + +extern struct mutex hfi1_mutex; + +/* Number of seconds before our card status check... */ +#define STATUS_TIMEOUT 60 + +#define DRIVER_NAME "hfi1" +#define HFI1_USER_MINOR_BASE 0 +#define HFI1_TRACE_MINOR 127 +#define HFI1_DIAGPKT_MINOR 128 +#define HFI1_DIAG_MINOR_BASE 129 +#define HFI1_SNOOP_CAPTURE_BASE 200 +#define HFI1_NMINORS 255 + +#define PCI_VENDOR_ID_INTEL 0x8086 +#define PCI_DEVICE_ID_INTEL0 0x24f0 +#define PCI_DEVICE_ID_INTEL1 0x24f1 + +#define HFI1_PKT_USER_SC_INTEGRITY \ + (SEND_CTXT_CHECK_ENABLE_DISALLOW_NON_KDETH_PACKETS_SMASK \ + | SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK \ + | SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_SMASK \ + | SEND_CTXT_CHECK_ENABLE_DISALLOW_GRH_SMASK) + +#define HFI1_PKT_KERNEL_SC_INTEGRITY \ + (SEND_CTXT_CHECK_ENABLE_DISALLOW_KDETH_PACKETS_SMASK) + +static inline u64 hfi1_pkt_default_send_ctxt_mask(struct hfi1_devdata *dd, + u16 ctxt_type) +{ + u64 base_sc_integrity = + SEND_CTXT_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK + | SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK + | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK + | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK + | SEND_CTXT_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK + | SEND_CTXT_CHECK_ENABLE_DISALLOW_PBC_TEST_SMASK + | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK + | SEND_CTXT_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK + | SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK + | SEND_CTXT_CHECK_ENABLE_DISALLOW_RAW_SMASK + | SEND_CTXT_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK + | SEND_CTXT_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK + | SEND_CTXT_CHECK_ENABLE_CHECK_OPCODE_SMASK + | SEND_CTXT_CHECK_ENABLE_CHECK_SLID_SMASK + | SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK + | SEND_CTXT_CHECK_ENABLE_CHECK_VL_SMASK + | SEND_CTXT_CHECK_ENABLE_CHECK_ENABLE_SMASK; + + if (ctxt_type == SC_USER) + base_sc_integrity |= HFI1_PKT_USER_SC_INTEGRITY; + else + base_sc_integrity |= HFI1_PKT_KERNEL_SC_INTEGRITY; + + if (is_ax(dd)) + /* turn off send-side job key checks - A0 */ + return base_sc_integrity & + ~SEND_CTXT_CHECK_ENABLE_CHECK_JOB_KEY_SMASK; + return base_sc_integrity; +} + +static inline u64 hfi1_pkt_base_sdma_integrity(struct hfi1_devdata *dd) +{ + u64 base_sdma_integrity = + SEND_DMA_CHECK_ENABLE_DISALLOW_BYPASS_BAD_PKT_LEN_SMASK + | SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK + | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_BYPASS_PACKETS_SMASK + | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_LONG_IB_PACKETS_SMASK + | SEND_DMA_CHECK_ENABLE_DISALLOW_BAD_PKT_LEN_SMASK + | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_BYPASS_PACKETS_SMASK + | SEND_DMA_CHECK_ENABLE_DISALLOW_TOO_SMALL_IB_PACKETS_SMASK + | SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_IPV6_SMASK + | SEND_DMA_CHECK_ENABLE_DISALLOW_RAW_SMASK + | SEND_DMA_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK + | SEND_DMA_CHECK_ENABLE_CHECK_VL_MAPPING_SMASK + | SEND_DMA_CHECK_ENABLE_CHECK_OPCODE_SMASK + | SEND_DMA_CHECK_ENABLE_CHECK_SLID_SMASK + | SEND_DMA_CHECK_ENABLE_CHECK_JOB_KEY_SMASK + | SEND_DMA_CHECK_ENABLE_CHECK_VL_SMASK + | SEND_DMA_CHECK_ENABLE_CHECK_ENABLE_SMASK; + + if (is_ax(dd)) + /* turn off send-side job key checks - A0 */ + return base_sdma_integrity & + ~SEND_DMA_CHECK_ENABLE_CHECK_JOB_KEY_SMASK; + return base_sdma_integrity; +} + +/* + * hfi1_early_err is used (only!) to print early errors before devdata is + * allocated, or when dd->pcidev may not be valid, and at the tail end of + * cleanup when devdata may have been freed, etc. hfi1_dev_porterr is + * the same as dd_dev_err, but is used when the message really needs + * the IB port# to be definitive as to what's happening.. + */ +#define hfi1_early_err(dev, fmt, ...) \ + dev_err(dev, fmt, ##__VA_ARGS__) + +#define hfi1_early_info(dev, fmt, ...) \ + dev_info(dev, fmt, ##__VA_ARGS__) + +#define dd_dev_emerg(dd, fmt, ...) \ + dev_emerg(&(dd)->pcidev->dev, "%s: " fmt, \ + get_unit_name((dd)->unit), ##__VA_ARGS__) +#define dd_dev_err(dd, fmt, ...) \ + dev_err(&(dd)->pcidev->dev, "%s: " fmt, \ + get_unit_name((dd)->unit), ##__VA_ARGS__) +#define dd_dev_warn(dd, fmt, ...) \ + dev_warn(&(dd)->pcidev->dev, "%s: " fmt, \ + get_unit_name((dd)->unit), ##__VA_ARGS__) + +#define dd_dev_warn_ratelimited(dd, fmt, ...) \ + dev_warn_ratelimited(&(dd)->pcidev->dev, "%s: " fmt, \ + get_unit_name((dd)->unit), ##__VA_ARGS__) + +#define dd_dev_info(dd, fmt, ...) \ + dev_info(&(dd)->pcidev->dev, "%s: " fmt, \ + get_unit_name((dd)->unit), ##__VA_ARGS__) + +#define dd_dev_dbg(dd, fmt, ...) \ + dev_dbg(&(dd)->pcidev->dev, "%s: " fmt, \ + get_unit_name((dd)->unit), ##__VA_ARGS__) + +#define hfi1_dev_porterr(dd, port, fmt, ...) \ + dev_err(&(dd)->pcidev->dev, "%s: port %u: " fmt, \ + get_unit_name((dd)->unit), (port), ##__VA_ARGS__) + +/* + * this is used for formatting hw error messages... + */ +struct hfi1_hwerror_msgs { + u64 mask; + const char *msg; + size_t sz; +}; + +/* in intr.c... */ +void hfi1_format_hwerrors(u64 hwerrs, + const struct hfi1_hwerror_msgs *hwerrmsgs, + size_t nhwerrmsgs, char *msg, size_t lmsg); + +#define USER_OPCODE_CHECK_VAL 0xC0 +#define USER_OPCODE_CHECK_MASK 0xC0 +#define OPCODE_CHECK_VAL_DISABLED 0x0 +#define OPCODE_CHECK_MASK_DISABLED 0x0 + +static inline void hfi1_reset_cpu_counters(struct hfi1_devdata *dd) +{ + struct hfi1_pportdata *ppd; + int i; + + dd->z_int_counter = get_all_cpu_total(dd->int_counter); + dd->z_rcv_limit = get_all_cpu_total(dd->rcv_limit); + dd->z_send_schedule = get_all_cpu_total(dd->send_schedule); + + ppd = (struct hfi1_pportdata *)(dd + 1); + for (i = 0; i < dd->num_pports; i++, ppd++) { + ppd->ibport_data.rvp.z_rc_acks = + get_all_cpu_total(ppd->ibport_data.rvp.rc_acks); + ppd->ibport_data.rvp.z_rc_qacks = + get_all_cpu_total(ppd->ibport_data.rvp.rc_qacks); + } +} + +/* Control LED state */ +static inline void setextled(struct hfi1_devdata *dd, u32 on) +{ + if (on) + write_csr(dd, DCC_CFG_LED_CNTRL, 0x1F); + else + write_csr(dd, DCC_CFG_LED_CNTRL, 0x10); +} + +/* return the i2c resource given the target */ +static inline u32 i2c_target(u32 target) +{ + return target ? CR_I2C2 : CR_I2C1; +} + +/* return the i2c chain chip resource that this HFI uses for QSFP */ +static inline u32 qsfp_resource(struct hfi1_devdata *dd) +{ + return i2c_target(dd->hfi1_id); +} + +int hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp); + +#endif /* _HFI1_KERNEL_H */ diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c new file mode 100644 index 000000000..eed971ccd --- /dev/null +++ b/drivers/infiniband/hw/hfi1/init.c @@ -0,0 +1,1818 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <linux/pci.h> +#include <linux/netdevice.h> +#include <linux/vmalloc.h> +#include <linux/delay.h> +#include <linux/idr.h> +#include <linux/module.h> +#include <linux/printk.h> +#include <linux/hrtimer.h> +#include <rdma/rdma_vt.h> + +#include "hfi.h" +#include "device.h" +#include "common.h" +#include "trace.h" +#include "mad.h" +#include "sdma.h" +#include "debugfs.h" +#include "verbs.h" +#include "aspm.h" + +#undef pr_fmt +#define pr_fmt(fmt) DRIVER_NAME ": " fmt + +/* + * min buffers we want to have per context, after driver + */ +#define HFI1_MIN_USER_CTXT_BUFCNT 7 + +#define HFI1_MIN_HDRQ_EGRBUF_CNT 2 +#define HFI1_MAX_HDRQ_EGRBUF_CNT 16352 +#define HFI1_MIN_EAGER_BUFFER_SIZE (4 * 1024) /* 4KB */ +#define HFI1_MAX_EAGER_BUFFER_SIZE (256 * 1024) /* 256KB */ + +/* + * Number of user receive contexts we are configured to use (to allow for more + * pio buffers per ctxt, etc.) Zero means use one user context per CPU. + */ +int num_user_contexts = -1; +module_param_named(num_user_contexts, num_user_contexts, uint, S_IRUGO); +MODULE_PARM_DESC( + num_user_contexts, "Set max number of user contexts to use"); + +uint krcvqs[RXE_NUM_DATA_VL]; +int krcvqsset; +module_param_array(krcvqs, uint, &krcvqsset, S_IRUGO); +MODULE_PARM_DESC(krcvqs, "Array of the number of non-control kernel receive queues by VL"); + +/* computed based on above array */ +unsigned n_krcvqs; + +static unsigned hfi1_rcvarr_split = 25; +module_param_named(rcvarr_split, hfi1_rcvarr_split, uint, S_IRUGO); +MODULE_PARM_DESC(rcvarr_split, "Percent of context's RcvArray entries used for Eager buffers"); + +static uint eager_buffer_size = (2 << 20); /* 2MB */ +module_param(eager_buffer_size, uint, S_IRUGO); +MODULE_PARM_DESC(eager_buffer_size, "Size of the eager buffers, default: 2MB"); + +static uint rcvhdrcnt = 2048; /* 2x the max eager buffer count */ +module_param_named(rcvhdrcnt, rcvhdrcnt, uint, S_IRUGO); +MODULE_PARM_DESC(rcvhdrcnt, "Receive header queue count (default 2048)"); + +static uint hfi1_hdrq_entsize = 32; +module_param_named(hdrq_entsize, hfi1_hdrq_entsize, uint, S_IRUGO); +MODULE_PARM_DESC(hdrq_entsize, "Size of header queue entries: 2 - 8B, 16 - 64B (default), 32 - 128B"); + +unsigned int user_credit_return_threshold = 33; /* default is 33% */ +module_param(user_credit_return_threshold, uint, S_IRUGO); +MODULE_PARM_DESC(user_credit_return_threshold, "Credit return threshold for user send contexts, return when unreturned credits passes this many blocks (in percent of allocated blocks, 0 is off)"); + +static inline u64 encode_rcv_header_entry_size(u16); + +static struct idr hfi1_unit_table; +u32 hfi1_cpulist_count; +unsigned long *hfi1_cpulist; + +/* + * Common code for creating the receive context array. + */ +int hfi1_create_ctxts(struct hfi1_devdata *dd) +{ + unsigned i; + int ret; + + /* Control context has to be always 0 */ + BUILD_BUG_ON(HFI1_CTRL_CTXT != 0); + + dd->rcd = kzalloc_node(dd->num_rcv_contexts * sizeof(*dd->rcd), + GFP_KERNEL, dd->node); + if (!dd->rcd) + goto nomem; + + /* create one or more kernel contexts */ + for (i = 0; i < dd->first_user_ctxt; ++i) { + struct hfi1_pportdata *ppd; + struct hfi1_ctxtdata *rcd; + + ppd = dd->pport + (i % dd->num_pports); + rcd = hfi1_create_ctxtdata(ppd, i, dd->node); + if (!rcd) { + dd_dev_err(dd, + "Unable to allocate kernel receive context, failing\n"); + goto nomem; + } + /* + * Set up the kernel context flags here and now because they + * use default values for all receive side memories. User + * contexts will be handled as they are created. + */ + rcd->flags = HFI1_CAP_KGET(MULTI_PKT_EGR) | + HFI1_CAP_KGET(NODROP_RHQ_FULL) | + HFI1_CAP_KGET(NODROP_EGR_FULL) | + HFI1_CAP_KGET(DMA_RTAIL); + + /* Control context must use DMA_RTAIL */ + if (rcd->ctxt == HFI1_CTRL_CTXT) + rcd->flags |= HFI1_CAP_DMA_RTAIL; + rcd->seq_cnt = 1; + + rcd->sc = sc_alloc(dd, SC_ACK, rcd->rcvhdrqentsize, dd->node); + if (!rcd->sc) { + dd_dev_err(dd, + "Unable to allocate kernel send context, failing\n"); + dd->rcd[rcd->ctxt] = NULL; + hfi1_free_ctxtdata(dd, rcd); + goto nomem; + } + + ret = hfi1_init_ctxt(rcd->sc); + if (ret < 0) { + dd_dev_err(dd, + "Failed to setup kernel receive context, failing\n"); + sc_free(rcd->sc); + dd->rcd[rcd->ctxt] = NULL; + hfi1_free_ctxtdata(dd, rcd); + ret = -EFAULT; + goto bail; + } + } + + /* + * Initialize aspm, to be done after gen3 transition and setting up + * contexts and before enabling interrupts + */ + aspm_init(dd); + + return 0; +nomem: + ret = -ENOMEM; +bail: + kfree(dd->rcd); + dd->rcd = NULL; + return ret; +} + +/* + * Common code for user and kernel context setup. + */ +struct hfi1_ctxtdata *hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, u32 ctxt, + int numa) +{ + struct hfi1_devdata *dd = ppd->dd; + struct hfi1_ctxtdata *rcd; + unsigned kctxt_ngroups = 0; + u32 base; + + if (dd->rcv_entries.nctxt_extra > + dd->num_rcv_contexts - dd->first_user_ctxt) + kctxt_ngroups = (dd->rcv_entries.nctxt_extra - + (dd->num_rcv_contexts - dd->first_user_ctxt)); + rcd = kzalloc(sizeof(*rcd), GFP_KERNEL); + if (rcd) { + u32 rcvtids, max_entries; + + hfi1_cdbg(PROC, "setting up context %u\n", ctxt); + + INIT_LIST_HEAD(&rcd->qp_wait_list); + rcd->ppd = ppd; + rcd->dd = dd; + rcd->cnt = 1; + rcd->ctxt = ctxt; + dd->rcd[ctxt] = rcd; + rcd->numa_id = numa; + rcd->rcv_array_groups = dd->rcv_entries.ngroups; + + mutex_init(&rcd->exp_lock); + + /* + * Calculate the context's RcvArray entry starting point. + * We do this here because we have to take into account all + * the RcvArray entries that previous context would have + * taken and we have to account for any extra groups + * assigned to the kernel or user contexts. + */ + if (ctxt < dd->first_user_ctxt) { + if (ctxt < kctxt_ngroups) { + base = ctxt * (dd->rcv_entries.ngroups + 1); + rcd->rcv_array_groups++; + } else + base = kctxt_ngroups + + (ctxt * dd->rcv_entries.ngroups); + } else { + u16 ct = ctxt - dd->first_user_ctxt; + + base = ((dd->n_krcv_queues * dd->rcv_entries.ngroups) + + kctxt_ngroups); + if (ct < dd->rcv_entries.nctxt_extra) { + base += ct * (dd->rcv_entries.ngroups + 1); + rcd->rcv_array_groups++; + } else + base += dd->rcv_entries.nctxt_extra + + (ct * dd->rcv_entries.ngroups); + } + rcd->eager_base = base * dd->rcv_entries.group_size; + + /* Validate and initialize Rcv Hdr Q variables */ + if (rcvhdrcnt % HDRQ_INCREMENT) { + dd_dev_err(dd, + "ctxt%u: header queue count %d must be divisible by %lu\n", + rcd->ctxt, rcvhdrcnt, HDRQ_INCREMENT); + goto bail; + } + rcd->rcvhdrq_cnt = rcvhdrcnt; + rcd->rcvhdrqentsize = hfi1_hdrq_entsize; + /* + * Simple Eager buffer allocation: we have already pre-allocated + * the number of RcvArray entry groups. Each ctxtdata structure + * holds the number of groups for that context. + * + * To follow CSR requirements and maintain cacheline alignment, + * make sure all sizes and bases are multiples of group_size. + * + * The expected entry count is what is left after assigning + * eager. + */ + max_entries = rcd->rcv_array_groups * + dd->rcv_entries.group_size; + rcvtids = ((max_entries * hfi1_rcvarr_split) / 100); + rcd->egrbufs.count = round_down(rcvtids, + dd->rcv_entries.group_size); + if (rcd->egrbufs.count > MAX_EAGER_ENTRIES) { + dd_dev_err(dd, "ctxt%u: requested too many RcvArray entries.\n", + rcd->ctxt); + rcd->egrbufs.count = MAX_EAGER_ENTRIES; + } + hfi1_cdbg(PROC, + "ctxt%u: max Eager buffer RcvArray entries: %u\n", + rcd->ctxt, rcd->egrbufs.count); + + /* + * Allocate array that will hold the eager buffer accounting + * data. + * This will allocate the maximum possible buffer count based + * on the value of the RcvArray split parameter. + * The resulting value will be rounded down to the closest + * multiple of dd->rcv_entries.group_size. + */ + rcd->egrbufs.buffers = kcalloc(rcd->egrbufs.count, + sizeof(*rcd->egrbufs.buffers), + GFP_KERNEL); + if (!rcd->egrbufs.buffers) + goto bail; + rcd->egrbufs.rcvtids = kcalloc(rcd->egrbufs.count, + sizeof(*rcd->egrbufs.rcvtids), + GFP_KERNEL); + if (!rcd->egrbufs.rcvtids) + goto bail; + rcd->egrbufs.size = eager_buffer_size; + /* + * The size of the buffers programmed into the RcvArray + * entries needs to be big enough to handle the highest + * MTU supported. + */ + if (rcd->egrbufs.size < hfi1_max_mtu) { + rcd->egrbufs.size = __roundup_pow_of_two(hfi1_max_mtu); + hfi1_cdbg(PROC, + "ctxt%u: eager bufs size too small. Adjusting to %zu\n", + rcd->ctxt, rcd->egrbufs.size); + } + rcd->egrbufs.rcvtid_size = HFI1_MAX_EAGER_BUFFER_SIZE; + + if (ctxt < dd->first_user_ctxt) { /* N/A for PSM contexts */ + rcd->opstats = kzalloc(sizeof(*rcd->opstats), + GFP_KERNEL); + if (!rcd->opstats) + goto bail; + } + } + return rcd; +bail: + kfree(rcd->egrbufs.rcvtids); + kfree(rcd->egrbufs.buffers); + kfree(rcd); + return NULL; +} + +/* + * Convert a receive header entry size that to the encoding used in the CSR. + * + * Return a zero if the given size is invalid. + */ +static inline u64 encode_rcv_header_entry_size(u16 size) +{ + /* there are only 3 valid receive header entry sizes */ + if (size == 2) + return 1; + if (size == 16) + return 2; + else if (size == 32) + return 4; + return 0; /* invalid */ +} + +/* + * Select the largest ccti value over all SLs to determine the intra- + * packet gap for the link. + * + * called with cca_timer_lock held (to protect access to cca_timer + * array), and rcu_read_lock() (to protect access to cc_state). + */ +void set_link_ipg(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + struct cc_state *cc_state; + int i; + u16 cce, ccti_limit, max_ccti = 0; + u16 shift, mult; + u64 src; + u32 current_egress_rate; /* Mbits /sec */ + u32 max_pkt_time; + /* + * max_pkt_time is the maximum packet egress time in units + * of the fabric clock period 1/(805 MHz). + */ + + cc_state = get_cc_state(ppd); + + if (!cc_state) + /* + * This should _never_ happen - rcu_read_lock() is held, + * and set_link_ipg() should not be called if cc_state + * is NULL. + */ + return; + + for (i = 0; i < OPA_MAX_SLS; i++) { + u16 ccti = ppd->cca_timer[i].ccti; + + if (ccti > max_ccti) + max_ccti = ccti; + } + + ccti_limit = cc_state->cct.ccti_limit; + if (max_ccti > ccti_limit) + max_ccti = ccti_limit; + + cce = cc_state->cct.entries[max_ccti].entry; + shift = (cce & 0xc000) >> 14; + mult = (cce & 0x3fff); + + current_egress_rate = active_egress_rate(ppd); + + max_pkt_time = egress_cycles(ppd->ibmaxlen, current_egress_rate); + + src = (max_pkt_time >> shift) * mult; + + src &= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SMASK; + src <<= SEND_STATIC_RATE_CONTROL_CSR_SRC_RELOAD_SHIFT; + + write_csr(dd, SEND_STATIC_RATE_CONTROL, src); +} + +static enum hrtimer_restart cca_timer_fn(struct hrtimer *t) +{ + struct cca_timer *cca_timer; + struct hfi1_pportdata *ppd; + int sl; + u16 ccti_timer, ccti_min; + struct cc_state *cc_state; + unsigned long flags; + enum hrtimer_restart ret = HRTIMER_NORESTART; + + cca_timer = container_of(t, struct cca_timer, hrtimer); + ppd = cca_timer->ppd; + sl = cca_timer->sl; + + rcu_read_lock(); + + cc_state = get_cc_state(ppd); + + if (!cc_state) { + rcu_read_unlock(); + return HRTIMER_NORESTART; + } + + /* + * 1) decrement ccti for SL + * 2) calculate IPG for link (set_link_ipg()) + * 3) restart timer, unless ccti is at min value + */ + + ccti_min = cc_state->cong_setting.entries[sl].ccti_min; + ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer; + + spin_lock_irqsave(&ppd->cca_timer_lock, flags); + + if (cca_timer->ccti > ccti_min) { + cca_timer->ccti--; + set_link_ipg(ppd); + } + + if (cca_timer->ccti > ccti_min) { + unsigned long nsec = 1024 * ccti_timer; + /* ccti_timer is in units of 1.024 usec */ + hrtimer_forward_now(t, ns_to_ktime(nsec)); + ret = HRTIMER_RESTART; + } + + spin_unlock_irqrestore(&ppd->cca_timer_lock, flags); + rcu_read_unlock(); + return ret; +} + +/* + * Common code for initializing the physical port structure. + */ +void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd, + struct hfi1_devdata *dd, u8 hw_pidx, u8 port) +{ + int i, size; + uint default_pkey_idx; + + ppd->dd = dd; + ppd->hw_pidx = hw_pidx; + ppd->port = port; /* IB port number, not index */ + + default_pkey_idx = 1; + + ppd->pkeys[default_pkey_idx] = DEFAULT_P_KEY; + if (loopback) { + hfi1_early_err(&pdev->dev, + "Faking data partition 0x8001 in idx %u\n", + !default_pkey_idx); + ppd->pkeys[!default_pkey_idx] = 0x8001; + } + + INIT_WORK(&ppd->link_vc_work, handle_verify_cap); + INIT_WORK(&ppd->link_up_work, handle_link_up); + INIT_WORK(&ppd->link_down_work, handle_link_down); + INIT_WORK(&ppd->freeze_work, handle_freeze); + INIT_WORK(&ppd->link_downgrade_work, handle_link_downgrade); + INIT_WORK(&ppd->sma_message_work, handle_sma_message); + INIT_WORK(&ppd->link_bounce_work, handle_link_bounce); + INIT_WORK(&ppd->linkstate_active_work, receive_interrupt_work); + INIT_WORK(&ppd->qsfp_info.qsfp_work, qsfp_event); + + mutex_init(&ppd->hls_lock); + spin_lock_init(&ppd->sdma_alllock); + spin_lock_init(&ppd->qsfp_info.qsfp_lock); + + ppd->qsfp_info.ppd = ppd; + ppd->sm_trap_qp = 0x0; + ppd->sa_qp = 0x1; + + ppd->hfi1_wq = NULL; + + spin_lock_init(&ppd->cca_timer_lock); + + for (i = 0; i < OPA_MAX_SLS; i++) { + hrtimer_init(&ppd->cca_timer[i].hrtimer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + ppd->cca_timer[i].ppd = ppd; + ppd->cca_timer[i].sl = i; + ppd->cca_timer[i].ccti = 0; + ppd->cca_timer[i].hrtimer.function = cca_timer_fn; + } + + ppd->cc_max_table_entries = IB_CC_TABLE_CAP_DEFAULT; + + spin_lock_init(&ppd->cc_state_lock); + spin_lock_init(&ppd->cc_log_lock); + size = sizeof(struct cc_state); + RCU_INIT_POINTER(ppd->cc_state, kzalloc(size, GFP_KERNEL)); + if (!rcu_dereference(ppd->cc_state)) + goto bail; + return; + +bail: + + hfi1_early_err(&pdev->dev, + "Congestion Control Agent disabled for port %d\n", port); +} + +/* + * Do initialization for device that is only needed on + * first detect, not on resets. + */ +static int loadtime_init(struct hfi1_devdata *dd) +{ + return 0; +} + +/** + * init_after_reset - re-initialize after a reset + * @dd: the hfi1_ib device + * + * sanity check at least some of the values after reset, and + * ensure no receive or transmit (explicitly, in case reset + * failed + */ +static int init_after_reset(struct hfi1_devdata *dd) +{ + int i; + + /* + * Ensure chip does no sends or receives, tail updates, or + * pioavail updates while we re-initialize. This is mostly + * for the driver data structures, not chip registers. + */ + for (i = 0; i < dd->num_rcv_contexts; i++) + hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS | + HFI1_RCVCTRL_INTRAVAIL_DIS | + HFI1_RCVCTRL_TAILUPD_DIS, i); + pio_send_control(dd, PSC_GLOBAL_DISABLE); + for (i = 0; i < dd->num_send_contexts; i++) + sc_disable(dd->send_contexts[i].sc); + + return 0; +} + +static void enable_chip(struct hfi1_devdata *dd) +{ + u32 rcvmask; + u32 i; + + /* enable PIO send */ + pio_send_control(dd, PSC_GLOBAL_ENABLE); + + /* + * Enable kernel ctxts' receive and receive interrupt. + * Other ctxts done as user opens and initializes them. + */ + for (i = 0; i < dd->first_user_ctxt; ++i) { + rcvmask = HFI1_RCVCTRL_CTXT_ENB | HFI1_RCVCTRL_INTRAVAIL_ENB; + rcvmask |= HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, DMA_RTAIL) ? + HFI1_RCVCTRL_TAILUPD_ENB : HFI1_RCVCTRL_TAILUPD_DIS; + if (!HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, MULTI_PKT_EGR)) + rcvmask |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB; + if (HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, NODROP_RHQ_FULL)) + rcvmask |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB; + if (HFI1_CAP_KGET_MASK(dd->rcd[i]->flags, NODROP_EGR_FULL)) + rcvmask |= HFI1_RCVCTRL_NO_EGR_DROP_ENB; + hfi1_rcvctrl(dd, rcvmask, i); + sc_enable(dd->rcd[i]->sc); + } +} + +/** + * create_workqueues - create per port workqueues + * @dd: the hfi1_ib device + */ +static int create_workqueues(struct hfi1_devdata *dd) +{ + int pidx; + struct hfi1_pportdata *ppd; + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + if (!ppd->hfi1_wq) { + ppd->hfi1_wq = + alloc_workqueue( + "hfi%d_%d", + WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE, + dd->num_sdma, + dd->unit, pidx); + if (!ppd->hfi1_wq) + goto wq_error; + } + } + return 0; +wq_error: + pr_err("alloc_workqueue failed for port %d\n", pidx + 1); + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + if (ppd->hfi1_wq) { + destroy_workqueue(ppd->hfi1_wq); + ppd->hfi1_wq = NULL; + } + } + return -ENOMEM; +} + +/** + * hfi1_init - do the actual initialization sequence on the chip + * @dd: the hfi1_ib device + * @reinit: re-initializing, so don't allocate new memory + * + * Do the actual initialization sequence on the chip. This is done + * both from the init routine called from the PCI infrastructure, and + * when we reset the chip, or detect that it was reset internally, + * or it's administratively re-enabled. + * + * Memory allocation here and in called routines is only done in + * the first case (reinit == 0). We have to be careful, because even + * without memory allocation, we need to re-write all the chip registers + * TIDs, etc. after the reset or enable has completed. + */ +int hfi1_init(struct hfi1_devdata *dd, int reinit) +{ + int ret = 0, pidx, lastfail = 0; + unsigned i, len; + struct hfi1_ctxtdata *rcd; + struct hfi1_pportdata *ppd; + + /* Set up recv low level handlers */ + dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_EXPECTED] = + kdeth_process_expected; + dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_EAGER] = + kdeth_process_eager; + dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_IB] = process_receive_ib; + dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_ERROR] = + process_receive_error; + dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_BYPASS] = + process_receive_bypass; + dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID5] = + process_receive_invalid; + dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID6] = + process_receive_invalid; + dd->normal_rhf_rcv_functions[RHF_RCV_TYPE_INVALID7] = + process_receive_invalid; + dd->rhf_rcv_function_map = dd->normal_rhf_rcv_functions; + + /* Set up send low level handlers */ + dd->process_pio_send = hfi1_verbs_send_pio; + dd->process_dma_send = hfi1_verbs_send_dma; + dd->pio_inline_send = pio_copy; + + if (is_ax(dd)) { + atomic_set(&dd->drop_packet, DROP_PACKET_ON); + dd->do_drop = 1; + } else { + atomic_set(&dd->drop_packet, DROP_PACKET_OFF); + dd->do_drop = 0; + } + + /* make sure the link is not "up" */ + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + ppd->linkup = 0; + } + + if (reinit) + ret = init_after_reset(dd); + else + ret = loadtime_init(dd); + if (ret) + goto done; + + /* allocate dummy tail memory for all receive contexts */ + dd->rcvhdrtail_dummy_kvaddr = dma_zalloc_coherent( + &dd->pcidev->dev, sizeof(u64), + &dd->rcvhdrtail_dummy_physaddr, + GFP_KERNEL); + + if (!dd->rcvhdrtail_dummy_kvaddr) { + dd_dev_err(dd, "cannot allocate dummy tail memory\n"); + ret = -ENOMEM; + goto done; + } + + /* dd->rcd can be NULL if early initialization failed */ + for (i = 0; dd->rcd && i < dd->first_user_ctxt; ++i) { + /* + * Set up the (kernel) rcvhdr queue and egr TIDs. If doing + * re-init, the simplest way to handle this is to free + * existing, and re-allocate. + * Need to re-create rest of ctxt 0 ctxtdata as well. + */ + rcd = dd->rcd[i]; + if (!rcd) + continue; + + rcd->do_interrupt = &handle_receive_interrupt; + + lastfail = hfi1_create_rcvhdrq(dd, rcd); + if (!lastfail) + lastfail = hfi1_setup_eagerbufs(rcd); + if (lastfail) { + dd_dev_err(dd, + "failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n"); + ret = lastfail; + } + } + + /* Allocate enough memory for user event notification. */ + len = PAGE_ALIGN(dd->chip_rcv_contexts * HFI1_MAX_SHARED_CTXTS * + sizeof(*dd->events)); + dd->events = vmalloc_user(len); + if (!dd->events) + dd_dev_err(dd, "Failed to allocate user events page\n"); + /* + * Allocate a page for device and port status. + * Page will be shared amongst all user processes. + */ + dd->status = vmalloc_user(PAGE_SIZE); + if (!dd->status) + dd_dev_err(dd, "Failed to allocate dev status page\n"); + else + dd->freezelen = PAGE_SIZE - (sizeof(*dd->status) - + sizeof(dd->status->freezemsg)); + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + if (dd->status) + /* Currently, we only have one port */ + ppd->statusp = &dd->status->port; + + set_mtu(ppd); + } + + /* enable chip even if we have an error, so we can debug cause */ + enable_chip(dd); + +done: + /* + * Set status even if port serdes is not initialized + * so that diags will work. + */ + if (dd->status) + dd->status->dev |= HFI1_STATUS_CHIP_PRESENT | + HFI1_STATUS_INITTED; + if (!ret) { + /* enable all interrupts from the chip */ + set_intr_state(dd, 1); + + /* chip is OK for user apps; mark it as initialized */ + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + + /* + * start the serdes - must be after interrupts are + * enabled so we are notified when the link goes up + */ + lastfail = bringup_serdes(ppd); + if (lastfail) + dd_dev_info(dd, + "Failed to bring up port %u\n", + ppd->port); + + /* + * Set status even if port serdes is not initialized + * so that diags will work. + */ + if (ppd->statusp) + *ppd->statusp |= HFI1_STATUS_CHIP_PRESENT | + HFI1_STATUS_INITTED; + if (!ppd->link_speed_enabled) + continue; + } + } + + /* if ret is non-zero, we probably should do some cleanup here... */ + return ret; +} + +static inline struct hfi1_devdata *__hfi1_lookup(int unit) +{ + return idr_find(&hfi1_unit_table, unit); +} + +struct hfi1_devdata *hfi1_lookup(int unit) +{ + struct hfi1_devdata *dd; + unsigned long flags; + + spin_lock_irqsave(&hfi1_devs_lock, flags); + dd = __hfi1_lookup(unit); + spin_unlock_irqrestore(&hfi1_devs_lock, flags); + + return dd; +} + +/* + * Stop the timers during unit shutdown, or after an error late + * in initialization. + */ +static void stop_timers(struct hfi1_devdata *dd) +{ + struct hfi1_pportdata *ppd; + int pidx; + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + if (ppd->led_override_timer.data) { + del_timer_sync(&ppd->led_override_timer); + atomic_set(&ppd->led_override_timer_active, 0); + } + } +} + +/** + * shutdown_device - shut down a device + * @dd: the hfi1_ib device + * + * This is called to make the device quiet when we are about to + * unload the driver, and also when the device is administratively + * disabled. It does not free any data structures. + * Everything it does has to be setup again by hfi1_init(dd, 1) + */ +static void shutdown_device(struct hfi1_devdata *dd) +{ + struct hfi1_pportdata *ppd; + unsigned pidx; + int i; + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + + ppd->linkup = 0; + if (ppd->statusp) + *ppd->statusp &= ~(HFI1_STATUS_IB_CONF | + HFI1_STATUS_IB_READY); + } + dd->flags &= ~HFI1_INITTED; + + /* mask interrupts, but not errors */ + set_intr_state(dd, 0); + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + for (i = 0; i < dd->num_rcv_contexts; i++) + hfi1_rcvctrl(dd, HFI1_RCVCTRL_TAILUPD_DIS | + HFI1_RCVCTRL_CTXT_DIS | + HFI1_RCVCTRL_INTRAVAIL_DIS | + HFI1_RCVCTRL_PKEY_DIS | + HFI1_RCVCTRL_ONE_PKT_EGR_DIS, i); + /* + * Gracefully stop all sends allowing any in progress to + * trickle out first. + */ + for (i = 0; i < dd->num_send_contexts; i++) + sc_flush(dd->send_contexts[i].sc); + } + + /* + * Enough for anything that's going to trickle out to have actually + * done so. + */ + udelay(20); + + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + ppd = dd->pport + pidx; + + /* disable all contexts */ + for (i = 0; i < dd->num_send_contexts; i++) + sc_disable(dd->send_contexts[i].sc); + /* disable the send device */ + pio_send_control(dd, PSC_GLOBAL_DISABLE); + + shutdown_led_override(ppd); + + /* + * Clear SerdesEnable. + * We can't count on interrupts since we are stopping. + */ + hfi1_quiet_serdes(ppd); + + if (ppd->hfi1_wq) { + destroy_workqueue(ppd->hfi1_wq); + ppd->hfi1_wq = NULL; + } + } + sdma_exit(dd); +} + +/** + * hfi1_free_ctxtdata - free a context's allocated data + * @dd: the hfi1_ib device + * @rcd: the ctxtdata structure + * + * free up any allocated data for a context + * This should not touch anything that would affect a simultaneous + * re-allocation of context data, because it is called after hfi1_mutex + * is released (and can be called from reinit as well). + * It should never change any chip state, or global driver state. + */ +void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd) +{ + unsigned e; + + if (!rcd) + return; + + if (rcd->rcvhdrq) { + dma_free_coherent(&dd->pcidev->dev, rcd->rcvhdrq_size, + rcd->rcvhdrq, rcd->rcvhdrq_phys); + rcd->rcvhdrq = NULL; + if (rcd->rcvhdrtail_kvaddr) { + dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE, + (void *)rcd->rcvhdrtail_kvaddr, + rcd->rcvhdrqtailaddr_phys); + rcd->rcvhdrtail_kvaddr = NULL; + } + } + + /* all the RcvArray entries should have been cleared by now */ + kfree(rcd->egrbufs.rcvtids); + + for (e = 0; e < rcd->egrbufs.alloced; e++) { + if (rcd->egrbufs.buffers[e].phys) + dma_free_coherent(&dd->pcidev->dev, + rcd->egrbufs.buffers[e].len, + rcd->egrbufs.buffers[e].addr, + rcd->egrbufs.buffers[e].phys); + } + kfree(rcd->egrbufs.buffers); + + sc_free(rcd->sc); + vfree(rcd->user_event_mask); + vfree(rcd->subctxt_uregbase); + vfree(rcd->subctxt_rcvegrbuf); + vfree(rcd->subctxt_rcvhdr_base); + kfree(rcd->opstats); + kfree(rcd); +} + +/* + * Release our hold on the shared asic data. If we are the last one, + * free the structure. Must be holding hfi1_devs_lock. + */ +static void release_asic_data(struct hfi1_devdata *dd) +{ + int other; + + if (!dd->asic_data) + return; + dd->asic_data->dds[dd->hfi1_id] = NULL; + other = dd->hfi1_id ? 0 : 1; + if (!dd->asic_data->dds[other]) { + /* we are the last holder, free it */ + kfree(dd->asic_data); + } + dd->asic_data = NULL; +} + +static void __hfi1_free_devdata(struct kobject *kobj) +{ + struct hfi1_devdata *dd = + container_of(kobj, struct hfi1_devdata, kobj); + unsigned long flags; + + spin_lock_irqsave(&hfi1_devs_lock, flags); + idr_remove(&hfi1_unit_table, dd->unit); + list_del(&dd->list); + release_asic_data(dd); + spin_unlock_irqrestore(&hfi1_devs_lock, flags); + free_platform_config(dd); + rcu_barrier(); /* wait for rcu callbacks to complete */ + free_percpu(dd->int_counter); + free_percpu(dd->rcv_limit); + hfi1_dev_affinity_free(dd); + free_percpu(dd->send_schedule); + rvt_dealloc_device(&dd->verbs_dev.rdi); +} + +static struct kobj_type hfi1_devdata_type = { + .release = __hfi1_free_devdata, +}; + +void hfi1_free_devdata(struct hfi1_devdata *dd) +{ + kobject_put(&dd->kobj); +} + +/* + * Allocate our primary per-unit data structure. Must be done via verbs + * allocator, because the verbs cleanup process both does cleanup and + * free of the data structure. + * "extra" is for chip-specific data. + * + * Use the idr mechanism to get a unit number for this unit. + */ +struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra) +{ + unsigned long flags; + struct hfi1_devdata *dd; + int ret, nports; + + /* extra is * number of ports */ + nports = extra / sizeof(struct hfi1_pportdata); + + dd = (struct hfi1_devdata *)rvt_alloc_device(sizeof(*dd) + extra, + nports); + if (!dd) + return ERR_PTR(-ENOMEM); + dd->num_pports = nports; + dd->pport = (struct hfi1_pportdata *)(dd + 1); + + INIT_LIST_HEAD(&dd->list); + idr_preload(GFP_KERNEL); + spin_lock_irqsave(&hfi1_devs_lock, flags); + + ret = idr_alloc(&hfi1_unit_table, dd, 0, 0, GFP_NOWAIT); + if (ret >= 0) { + dd->unit = ret; + list_add(&dd->list, &hfi1_dev_list); + } + + spin_unlock_irqrestore(&hfi1_devs_lock, flags); + idr_preload_end(); + + if (ret < 0) { + hfi1_early_err(&pdev->dev, + "Could not allocate unit ID: error %d\n", -ret); + goto bail; + } + /* + * Initialize all locks for the device. This needs to be as early as + * possible so locks are usable. + */ + spin_lock_init(&dd->sc_lock); + spin_lock_init(&dd->sendctrl_lock); + spin_lock_init(&dd->rcvctrl_lock); + spin_lock_init(&dd->uctxt_lock); + spin_lock_init(&dd->hfi1_diag_trans_lock); + spin_lock_init(&dd->sc_init_lock); + spin_lock_init(&dd->dc8051_lock); + spin_lock_init(&dd->dc8051_memlock); + seqlock_init(&dd->sc2vl_lock); + spin_lock_init(&dd->sde_map_lock); + spin_lock_init(&dd->pio_map_lock); + init_waitqueue_head(&dd->event_queue); + + dd->int_counter = alloc_percpu(u64); + if (!dd->int_counter) { + ret = -ENOMEM; + hfi1_early_err(&pdev->dev, + "Could not allocate per-cpu int_counter\n"); + goto bail; + } + + dd->rcv_limit = alloc_percpu(u64); + if (!dd->rcv_limit) { + ret = -ENOMEM; + hfi1_early_err(&pdev->dev, + "Could not allocate per-cpu rcv_limit\n"); + goto bail; + } + + dd->send_schedule = alloc_percpu(u64); + if (!dd->send_schedule) { + ret = -ENOMEM; + hfi1_early_err(&pdev->dev, + "Could not allocate per-cpu int_counter\n"); + goto bail; + } + + if (!hfi1_cpulist_count) { + u32 count = num_online_cpus(); + + hfi1_cpulist = kcalloc(BITS_TO_LONGS(count), sizeof(long), + GFP_KERNEL); + if (hfi1_cpulist) + hfi1_cpulist_count = count; + else + hfi1_early_err( + &pdev->dev, + "Could not alloc cpulist info, cpu affinity might be wrong\n"); + } + kobject_init(&dd->kobj, &hfi1_devdata_type); + return dd; + +bail: + if (!list_empty(&dd->list)) + list_del_init(&dd->list); + rvt_dealloc_device(&dd->verbs_dev.rdi); + return ERR_PTR(ret); +} + +/* + * Called from freeze mode handlers, and from PCI error + * reporting code. Should be paranoid about state of + * system and data structures. + */ +void hfi1_disable_after_error(struct hfi1_devdata *dd) +{ + if (dd->flags & HFI1_INITTED) { + u32 pidx; + + dd->flags &= ~HFI1_INITTED; + if (dd->pport) + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + struct hfi1_pportdata *ppd; + + ppd = dd->pport + pidx; + if (dd->flags & HFI1_PRESENT) + set_link_state(ppd, HLS_DN_DISABLE); + + if (ppd->statusp) + *ppd->statusp &= ~HFI1_STATUS_IB_READY; + } + } + + /* + * Mark as having had an error for driver, and also + * for /sys and status word mapped to user programs. + * This marks unit as not usable, until reset. + */ + if (dd->status) + dd->status->dev |= HFI1_STATUS_HWERROR; +} + +static void remove_one(struct pci_dev *); +static int init_one(struct pci_dev *, const struct pci_device_id *); + +#define DRIVER_LOAD_MSG "Intel " DRIVER_NAME " loaded: " +#define PFX DRIVER_NAME ": " + +static const struct pci_device_id hfi1_pci_tbl[] = { + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL0) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL1) }, + { 0, } +}; + +MODULE_DEVICE_TABLE(pci, hfi1_pci_tbl); + +static struct pci_driver hfi1_pci_driver = { + .name = DRIVER_NAME, + .probe = init_one, + .remove = remove_one, + .id_table = hfi1_pci_tbl, + .err_handler = &hfi1_pci_err_handler, +}; + +static void __init compute_krcvqs(void) +{ + int i; + + for (i = 0; i < krcvqsset; i++) + n_krcvqs += krcvqs[i]; +} + +/* + * Do all the generic driver unit- and chip-independent memory + * allocation and initialization. + */ +static int __init hfi1_mod_init(void) +{ + int ret; + + ret = dev_init(); + if (ret) + goto bail; + + /* validate max MTU before any devices start */ + if (!valid_opa_max_mtu(hfi1_max_mtu)) { + pr_err("Invalid max_mtu 0x%x, using 0x%x instead\n", + hfi1_max_mtu, HFI1_DEFAULT_MAX_MTU); + hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU; + } + /* valid CUs run from 1-128 in powers of 2 */ + if (hfi1_cu > 128 || !is_power_of_2(hfi1_cu)) + hfi1_cu = 1; + /* valid credit return threshold is 0-100, variable is unsigned */ + if (user_credit_return_threshold > 100) + user_credit_return_threshold = 100; + + compute_krcvqs(); + /* + * sanitize receive interrupt count, time must wait until after + * the hardware type is known + */ + if (rcv_intr_count > RCV_HDR_HEAD_COUNTER_MASK) + rcv_intr_count = RCV_HDR_HEAD_COUNTER_MASK; + /* reject invalid combinations */ + if (rcv_intr_count == 0 && rcv_intr_timeout == 0) { + pr_err("Invalid mode: both receive interrupt count and available timeout are zero - setting interrupt count to 1\n"); + rcv_intr_count = 1; + } + if (rcv_intr_count > 1 && rcv_intr_timeout == 0) { + /* + * Avoid indefinite packet delivery by requiring a timeout + * if count is > 1. + */ + pr_err("Invalid mode: receive interrupt count greater than 1 and available timeout is zero - setting available timeout to 1\n"); + rcv_intr_timeout = 1; + } + if (rcv_intr_dynamic && !(rcv_intr_count > 1 && rcv_intr_timeout > 0)) { + /* + * The dynamic algorithm expects a non-zero timeout + * and a count > 1. + */ + pr_err("Invalid mode: dynamic receive interrupt mitigation with invalid count and timeout - turning dynamic off\n"); + rcv_intr_dynamic = 0; + } + + /* sanitize link CRC options */ + link_crc_mask &= SUPPORTED_CRCS; + + /* + * These must be called before the driver is registered with + * the PCI subsystem. + */ + idr_init(&hfi1_unit_table); + + hfi1_dbg_init(); + ret = hfi1_wss_init(); + if (ret < 0) + goto bail_wss; + ret = pci_register_driver(&hfi1_pci_driver); + if (ret < 0) { + pr_err("Unable to register driver: error %d\n", -ret); + goto bail_dev; + } + goto bail; /* all OK */ + +bail_dev: + hfi1_wss_exit(); +bail_wss: + hfi1_dbg_exit(); + idr_destroy(&hfi1_unit_table); + dev_cleanup(); +bail: + return ret; +} + +module_init(hfi1_mod_init); + +/* + * Do the non-unit driver cleanup, memory free, etc. at unload. + */ +static void __exit hfi1_mod_cleanup(void) +{ + pci_unregister_driver(&hfi1_pci_driver); + hfi1_wss_exit(); + hfi1_dbg_exit(); + hfi1_cpulist_count = 0; + kfree(hfi1_cpulist); + + idr_destroy(&hfi1_unit_table); + dispose_firmware(); /* asymmetric with obtain_firmware() */ + dev_cleanup(); +} + +module_exit(hfi1_mod_cleanup); + +/* this can only be called after a successful initialization */ +static void cleanup_device_data(struct hfi1_devdata *dd) +{ + int ctxt; + int pidx; + struct hfi1_ctxtdata **tmp; + unsigned long flags; + + /* users can't do anything more with chip */ + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + struct hfi1_pportdata *ppd = &dd->pport[pidx]; + struct cc_state *cc_state; + int i; + + if (ppd->statusp) + *ppd->statusp &= ~HFI1_STATUS_CHIP_PRESENT; + + for (i = 0; i < OPA_MAX_SLS; i++) + hrtimer_cancel(&ppd->cca_timer[i].hrtimer); + + spin_lock(&ppd->cc_state_lock); + cc_state = get_cc_state(ppd); + RCU_INIT_POINTER(ppd->cc_state, NULL); + spin_unlock(&ppd->cc_state_lock); + + if (cc_state) + call_rcu(&cc_state->rcu, cc_state_reclaim); + } + + free_credit_return(dd); + + /* + * Free any resources still in use (usually just kernel contexts) + * at unload; we do for ctxtcnt, because that's what we allocate. + * We acquire lock to be really paranoid that rcd isn't being + * accessed from some interrupt-related code (that should not happen, + * but best to be sure). + */ + spin_lock_irqsave(&dd->uctxt_lock, flags); + tmp = dd->rcd; + dd->rcd = NULL; + spin_unlock_irqrestore(&dd->uctxt_lock, flags); + + if (dd->rcvhdrtail_dummy_kvaddr) { + dma_free_coherent(&dd->pcidev->dev, sizeof(u64), + (void *)dd->rcvhdrtail_dummy_kvaddr, + dd->rcvhdrtail_dummy_physaddr); + dd->rcvhdrtail_dummy_kvaddr = NULL; + } + + for (ctxt = 0; tmp && ctxt < dd->num_rcv_contexts; ctxt++) { + struct hfi1_ctxtdata *rcd = tmp[ctxt]; + + tmp[ctxt] = NULL; /* debugging paranoia */ + if (rcd) { + hfi1_clear_tids(rcd); + hfi1_free_ctxtdata(dd, rcd); + } + } + kfree(tmp); + free_pio_map(dd); + /* must follow rcv context free - need to remove rcv's hooks */ + for (ctxt = 0; ctxt < dd->num_send_contexts; ctxt++) + sc_free(dd->send_contexts[ctxt].sc); + dd->num_send_contexts = 0; + kfree(dd->send_contexts); + dd->send_contexts = NULL; + kfree(dd->hw_to_sw); + dd->hw_to_sw = NULL; + kfree(dd->boardname); + vfree(dd->events); + vfree(dd->status); +} + +/* + * Clean up on unit shutdown, or error during unit load after + * successful initialization. + */ +static void postinit_cleanup(struct hfi1_devdata *dd) +{ + hfi1_start_cleanup(dd); + + hfi1_pcie_ddcleanup(dd); + hfi1_pcie_cleanup(dd->pcidev); + + cleanup_device_data(dd); + + hfi1_free_devdata(dd); +} + +static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + int ret = 0, j, pidx, initfail; + struct hfi1_devdata *dd = ERR_PTR(-EINVAL); + struct hfi1_pportdata *ppd; + + /* First, lock the non-writable module parameters */ + HFI1_CAP_LOCK(); + + /* Validate some global module parameters */ + if (rcvhdrcnt <= HFI1_MIN_HDRQ_EGRBUF_CNT) { + hfi1_early_err(&pdev->dev, "Header queue count too small\n"); + ret = -EINVAL; + goto bail; + } + if (rcvhdrcnt > HFI1_MAX_HDRQ_EGRBUF_CNT) { + hfi1_early_err(&pdev->dev, + "Receive header queue count cannot be greater than %u\n", + HFI1_MAX_HDRQ_EGRBUF_CNT); + ret = -EINVAL; + goto bail; + } + /* use the encoding function as a sanitization check */ + if (!encode_rcv_header_entry_size(hfi1_hdrq_entsize)) { + hfi1_early_err(&pdev->dev, "Invalid HdrQ Entry size %u\n", + hfi1_hdrq_entsize); + ret = -EINVAL; + goto bail; + } + + /* The receive eager buffer size must be set before the receive + * contexts are created. + * + * Set the eager buffer size. Validate that it falls in a range + * allowed by the hardware - all powers of 2 between the min and + * max. The maximum valid MTU is within the eager buffer range + * so we do not need to cap the max_mtu by an eager buffer size + * setting. + */ + if (eager_buffer_size) { + if (!is_power_of_2(eager_buffer_size)) + eager_buffer_size = + roundup_pow_of_two(eager_buffer_size); + eager_buffer_size = + clamp_val(eager_buffer_size, + MIN_EAGER_BUFFER * 8, + MAX_EAGER_BUFFER_TOTAL); + hfi1_early_info(&pdev->dev, "Eager buffer size %u\n", + eager_buffer_size); + } else { + hfi1_early_err(&pdev->dev, "Invalid Eager buffer size of 0\n"); + ret = -EINVAL; + goto bail; + } + + /* restrict value of hfi1_rcvarr_split */ + hfi1_rcvarr_split = clamp_val(hfi1_rcvarr_split, 0, 100); + + ret = hfi1_pcie_init(pdev, ent); + if (ret) + goto bail; + + /* + * Do device-specific initialization, function table setup, dd + * allocation, etc. + */ + switch (ent->device) { + case PCI_DEVICE_ID_INTEL0: + case PCI_DEVICE_ID_INTEL1: + dd = hfi1_init_dd(pdev, ent); + break; + default: + hfi1_early_err(&pdev->dev, + "Failing on unknown Intel deviceid 0x%x\n", + ent->device); + ret = -ENODEV; + } + + if (IS_ERR(dd)) + ret = PTR_ERR(dd); + if (ret) + goto clean_bail; /* error already printed */ + + ret = create_workqueues(dd); + if (ret) + goto clean_bail; + + /* do the generic initialization */ + initfail = hfi1_init(dd, 0); + + ret = hfi1_register_ib_device(dd); + + /* + * Now ready for use. this should be cleared whenever we + * detect a reset, or initiate one. If earlier failure, + * we still create devices, so diags, etc. can be used + * to determine cause of problem. + */ + if (!initfail && !ret) { + dd->flags |= HFI1_INITTED; + /* create debufs files after init and ib register */ + hfi1_dbg_ibdev_init(&dd->verbs_dev); + } + + j = hfi1_device_create(dd); + if (j) + dd_dev_err(dd, "Failed to create /dev devices: %d\n", -j); + + if (initfail || ret) { + stop_timers(dd); + flush_workqueue(ib_wq); + for (pidx = 0; pidx < dd->num_pports; ++pidx) { + hfi1_quiet_serdes(dd->pport + pidx); + ppd = dd->pport + pidx; + if (ppd->hfi1_wq) { + destroy_workqueue(ppd->hfi1_wq); + ppd->hfi1_wq = NULL; + } + } + if (!j) + hfi1_device_remove(dd); + if (!ret) + hfi1_unregister_ib_device(dd); + postinit_cleanup(dd); + if (initfail) + ret = initfail; + goto bail; /* everything already cleaned */ + } + + sdma_start(dd); + + return 0; + +clean_bail: + hfi1_pcie_cleanup(pdev); +bail: + return ret; +} + +static void remove_one(struct pci_dev *pdev) +{ + struct hfi1_devdata *dd = pci_get_drvdata(pdev); + + /* close debugfs files before ib unregister */ + hfi1_dbg_ibdev_exit(&dd->verbs_dev); + /* unregister from IB core */ + hfi1_unregister_ib_device(dd); + + /* + * Disable the IB link, disable interrupts on the device, + * clear dma engines, etc. + */ + shutdown_device(dd); + + stop_timers(dd); + + /* wait until all of our (qsfp) queue_work() calls complete */ + flush_workqueue(ib_wq); + + hfi1_device_remove(dd); + + postinit_cleanup(dd); +} + +/** + * hfi1_create_rcvhdrq - create a receive header queue + * @dd: the hfi1_ib device + * @rcd: the context data + * + * This must be contiguous memory (from an i/o perspective), and must be + * DMA'able (which means for some systems, it will go through an IOMMU, + * or be forced into a low address range). + */ +int hfi1_create_rcvhdrq(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd) +{ + unsigned amt; + u64 reg; + + if (!rcd->rcvhdrq) { + dma_addr_t phys_hdrqtail; + gfp_t gfp_flags; + + /* + * rcvhdrqentsize is in DWs, so we have to convert to bytes + * (* sizeof(u32)). + */ + amt = PAGE_ALIGN(rcd->rcvhdrq_cnt * rcd->rcvhdrqentsize * + sizeof(u32)); + + gfp_flags = (rcd->ctxt >= dd->first_user_ctxt) ? + GFP_USER : GFP_KERNEL; + rcd->rcvhdrq = dma_zalloc_coherent( + &dd->pcidev->dev, amt, &rcd->rcvhdrq_phys, + gfp_flags | __GFP_COMP); + + if (!rcd->rcvhdrq) { + dd_dev_err(dd, + "attempt to allocate %d bytes for ctxt %u rcvhdrq failed\n", + amt, rcd->ctxt); + goto bail; + } + + if (HFI1_CAP_KGET_MASK(rcd->flags, DMA_RTAIL)) { + rcd->rcvhdrtail_kvaddr = dma_zalloc_coherent( + &dd->pcidev->dev, PAGE_SIZE, &phys_hdrqtail, + gfp_flags); + if (!rcd->rcvhdrtail_kvaddr) + goto bail_free; + rcd->rcvhdrqtailaddr_phys = phys_hdrqtail; + } + + rcd->rcvhdrq_size = amt; + } + /* + * These values are per-context: + * RcvHdrCnt + * RcvHdrEntSize + * RcvHdrSize + */ + reg = ((u64)(rcd->rcvhdrq_cnt >> HDRQ_SIZE_SHIFT) + & RCV_HDR_CNT_CNT_MASK) + << RCV_HDR_CNT_CNT_SHIFT; + write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_CNT, reg); + reg = (encode_rcv_header_entry_size(rcd->rcvhdrqentsize) + & RCV_HDR_ENT_SIZE_ENT_SIZE_MASK) + << RCV_HDR_ENT_SIZE_ENT_SIZE_SHIFT; + write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_ENT_SIZE, reg); + reg = (dd->rcvhdrsize & RCV_HDR_SIZE_HDR_SIZE_MASK) + << RCV_HDR_SIZE_HDR_SIZE_SHIFT; + write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_SIZE, reg); + + /* + * Program dummy tail address for every receive context + * before enabling any receive context + */ + write_kctxt_csr(dd, rcd->ctxt, RCV_HDR_TAIL_ADDR, + dd->rcvhdrtail_dummy_physaddr); + + return 0; + +bail_free: + dd_dev_err(dd, + "attempt to allocate 1 page for ctxt %u rcvhdrqtailaddr failed\n", + rcd->ctxt); + vfree(rcd->user_event_mask); + rcd->user_event_mask = NULL; + dma_free_coherent(&dd->pcidev->dev, amt, rcd->rcvhdrq, + rcd->rcvhdrq_phys); + rcd->rcvhdrq = NULL; +bail: + return -ENOMEM; +} + +/** + * allocate eager buffers, both kernel and user contexts. + * @rcd: the context we are setting up. + * + * Allocate the eager TID buffers and program them into hip. + * They are no longer completely contiguous, we do multiple allocation + * calls. Otherwise we get the OOM code involved, by asking for too + * much per call, with disastrous results on some kernels. + */ +int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd) +{ + struct hfi1_devdata *dd = rcd->dd; + u32 max_entries, egrtop, alloced_bytes = 0, idx = 0; + gfp_t gfp_flags; + u16 order; + int ret = 0; + u16 round_mtu = roundup_pow_of_two(hfi1_max_mtu); + + /* + * GFP_USER, but without GFP_FS, so buffer cache can be + * coalesced (we hope); otherwise, even at order 4, + * heavy filesystem activity makes these fail, and we can + * use compound pages. + */ + gfp_flags = __GFP_RECLAIM | __GFP_IO | __GFP_COMP; + + /* + * The minimum size of the eager buffers is a groups of MTU-sized + * buffers. + * The global eager_buffer_size parameter is checked against the + * theoretical lower limit of the value. Here, we check against the + * MTU. + */ + if (rcd->egrbufs.size < (round_mtu * dd->rcv_entries.group_size)) + rcd->egrbufs.size = round_mtu * dd->rcv_entries.group_size; + /* + * If using one-pkt-per-egr-buffer, lower the eager buffer + * size to the max MTU (page-aligned). + */ + if (!HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR)) + rcd->egrbufs.rcvtid_size = round_mtu; + + /* + * Eager buffers sizes of 1MB or less require smaller TID sizes + * to satisfy the "multiple of 8 RcvArray entries" requirement. + */ + if (rcd->egrbufs.size <= (1 << 20)) + rcd->egrbufs.rcvtid_size = max((unsigned long)round_mtu, + rounddown_pow_of_two(rcd->egrbufs.size / 8)); + + while (alloced_bytes < rcd->egrbufs.size && + rcd->egrbufs.alloced < rcd->egrbufs.count) { + rcd->egrbufs.buffers[idx].addr = + dma_zalloc_coherent(&dd->pcidev->dev, + rcd->egrbufs.rcvtid_size, + &rcd->egrbufs.buffers[idx].phys, + gfp_flags); + if (rcd->egrbufs.buffers[idx].addr) { + rcd->egrbufs.buffers[idx].len = + rcd->egrbufs.rcvtid_size; + rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].addr = + rcd->egrbufs.buffers[idx].addr; + rcd->egrbufs.rcvtids[rcd->egrbufs.alloced].phys = + rcd->egrbufs.buffers[idx].phys; + rcd->egrbufs.alloced++; + alloced_bytes += rcd->egrbufs.rcvtid_size; + idx++; + } else { + u32 new_size, i, j; + u64 offset = 0; + + /* + * Fail the eager buffer allocation if: + * - we are already using the lowest acceptable size + * - we are using one-pkt-per-egr-buffer (this implies + * that we are accepting only one size) + */ + if (rcd->egrbufs.rcvtid_size == round_mtu || + !HFI1_CAP_KGET_MASK(rcd->flags, MULTI_PKT_EGR)) { + dd_dev_err(dd, "ctxt%u: Failed to allocate eager buffers\n", + rcd->ctxt); + goto bail_rcvegrbuf_phys; + } + + new_size = rcd->egrbufs.rcvtid_size / 2; + + /* + * If the first attempt to allocate memory failed, don't + * fail everything but continue with the next lower + * size. + */ + if (idx == 0) { + rcd->egrbufs.rcvtid_size = new_size; + continue; + } + + /* + * Re-partition already allocated buffers to a smaller + * size. + */ + rcd->egrbufs.alloced = 0; + for (i = 0, j = 0, offset = 0; j < idx; i++) { + if (i >= rcd->egrbufs.count) + break; + rcd->egrbufs.rcvtids[i].phys = + rcd->egrbufs.buffers[j].phys + offset; + rcd->egrbufs.rcvtids[i].addr = + rcd->egrbufs.buffers[j].addr + offset; + rcd->egrbufs.alloced++; + if ((rcd->egrbufs.buffers[j].phys + offset + + new_size) == + (rcd->egrbufs.buffers[j].phys + + rcd->egrbufs.buffers[j].len)) { + j++; + offset = 0; + } else { + offset += new_size; + } + } + rcd->egrbufs.rcvtid_size = new_size; + } + } + rcd->egrbufs.numbufs = idx; + rcd->egrbufs.size = alloced_bytes; + + hfi1_cdbg(PROC, + "ctxt%u: Alloced %u rcv tid entries @ %uKB, total %zuKB\n", + rcd->ctxt, rcd->egrbufs.alloced, rcd->egrbufs.rcvtid_size, + rcd->egrbufs.size); + + /* + * Set the contexts rcv array head update threshold to the closest + * power of 2 (so we can use a mask instead of modulo) below half + * the allocated entries. + */ + rcd->egrbufs.threshold = + rounddown_pow_of_two(rcd->egrbufs.alloced / 2); + /* + * Compute the expected RcvArray entry base. This is done after + * allocating the eager buffers in order to maximize the + * expected RcvArray entries for the context. + */ + max_entries = rcd->rcv_array_groups * dd->rcv_entries.group_size; + egrtop = roundup(rcd->egrbufs.alloced, dd->rcv_entries.group_size); + rcd->expected_count = max_entries - egrtop; + if (rcd->expected_count > MAX_TID_PAIR_ENTRIES * 2) + rcd->expected_count = MAX_TID_PAIR_ENTRIES * 2; + + rcd->expected_base = rcd->eager_base + egrtop; + hfi1_cdbg(PROC, "ctxt%u: eager:%u, exp:%u, egrbase:%u, expbase:%u\n", + rcd->ctxt, rcd->egrbufs.alloced, rcd->expected_count, + rcd->eager_base, rcd->expected_base); + + if (!hfi1_rcvbuf_validate(rcd->egrbufs.rcvtid_size, PT_EAGER, &order)) { + hfi1_cdbg(PROC, + "ctxt%u: current Eager buffer size is invalid %u\n", + rcd->ctxt, rcd->egrbufs.rcvtid_size); + ret = -EINVAL; + goto bail; + } + + for (idx = 0; idx < rcd->egrbufs.alloced; idx++) { + hfi1_put_tid(dd, rcd->eager_base + idx, PT_EAGER, + rcd->egrbufs.rcvtids[idx].phys, order); + cond_resched(); + } + goto bail; + +bail_rcvegrbuf_phys: + for (idx = 0; idx < rcd->egrbufs.alloced && + rcd->egrbufs.buffers[idx].addr; + idx++) { + dma_free_coherent(&dd->pcidev->dev, + rcd->egrbufs.buffers[idx].len, + rcd->egrbufs.buffers[idx].addr, + rcd->egrbufs.buffers[idx].phys); + rcd->egrbufs.buffers[idx].addr = NULL; + rcd->egrbufs.buffers[idx].phys = 0; + rcd->egrbufs.buffers[idx].len = 0; + } +bail: + return ret; +} diff --git a/drivers/infiniband/hw/hfi1/intr.c b/drivers/infiniband/hw/hfi1/intr.c new file mode 100644 index 000000000..65348d16a --- /dev/null +++ b/drivers/infiniband/hw/hfi1/intr.c @@ -0,0 +1,200 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <linux/pci.h> +#include <linux/delay.h> + +#include "hfi.h" +#include "common.h" +#include "sdma.h" + +/** + * format_hwmsg - format a single hwerror message + * @msg message buffer + * @msgl length of message buffer + * @hwmsg message to add to message buffer + */ +static void format_hwmsg(char *msg, size_t msgl, const char *hwmsg) +{ + strlcat(msg, "[", msgl); + strlcat(msg, hwmsg, msgl); + strlcat(msg, "]", msgl); +} + +/** + * hfi1_format_hwerrors - format hardware error messages for display + * @hwerrs hardware errors bit vector + * @hwerrmsgs hardware error descriptions + * @nhwerrmsgs number of hwerrmsgs + * @msg message buffer + * @msgl message buffer length + */ +void hfi1_format_hwerrors(u64 hwerrs, const struct hfi1_hwerror_msgs *hwerrmsgs, + size_t nhwerrmsgs, char *msg, size_t msgl) +{ + int i; + + for (i = 0; i < nhwerrmsgs; i++) + if (hwerrs & hwerrmsgs[i].mask) + format_hwmsg(msg, msgl, hwerrmsgs[i].msg); +} + +static void signal_ib_event(struct hfi1_pportdata *ppd, enum ib_event_type ev) +{ + struct ib_event event; + struct hfi1_devdata *dd = ppd->dd; + + /* + * Only call ib_dispatch_event() if the IB device has been + * registered. HFI1_INITED is set iff the driver has successfully + * registered with the IB core. + */ + if (!(dd->flags & HFI1_INITTED)) + return; + event.device = &dd->verbs_dev.rdi.ibdev; + event.element.port_num = ppd->port; + event.event = ev; + ib_dispatch_event(&event); +} + +/* + * Handle a linkup or link down notification. + * This is called outside an interrupt. + */ +void handle_linkup_change(struct hfi1_devdata *dd, u32 linkup) +{ + struct hfi1_pportdata *ppd = &dd->pport[0]; + enum ib_event_type ev; + + if (!(ppd->linkup ^ !!linkup)) + return; /* no change, nothing to do */ + + if (linkup) { + /* + * Quick linkup and all link up on the simulator does not + * trigger or implement: + * - VerifyCap interrupt + * - VerifyCap frames + * But rather moves directly to LinkUp. + * + * Do the work of the VerifyCap interrupt handler, + * handle_verify_cap(), but do not try moving the state to + * LinkUp as we are already there. + * + * NOTE: This uses this device's vAU, vCU, and vl15_init for + * the remote values. Both sides must be using the values. + */ + if (quick_linkup || dd->icode == ICODE_FUNCTIONAL_SIMULATOR) { + set_up_vl15(dd, dd->vau, dd->vl15_init); + assign_remote_cm_au_table(dd, dd->vcu); + ppd->neighbor_guid = + read_csr(dd, DC_DC8051_STS_REMOTE_GUID); + ppd->neighbor_type = + read_csr(dd, DC_DC8051_STS_REMOTE_NODE_TYPE) & + DC_DC8051_STS_REMOTE_NODE_TYPE_VAL_MASK; + ppd->neighbor_port_number = + read_csr(dd, DC_DC8051_STS_REMOTE_PORT_NO) & + DC_DC8051_STS_REMOTE_PORT_NO_VAL_SMASK; + dd_dev_info(dd, "Neighbor GUID: %llx Neighbor type %d\n", + ppd->neighbor_guid, + ppd->neighbor_type); + } + + /* physical link went up */ + ppd->linkup = 1; + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE); + + /* link widths are not available until the link is fully up */ + get_linkup_link_widths(ppd); + + } else { + /* physical link went down */ + ppd->linkup = 0; + + /* clear HW details of the previous connection */ + reset_link_credits(dd); + + /* freeze after a link down to guarantee a clean egress */ + start_freeze_handling(ppd, FREEZE_SELF | FREEZE_LINK_DOWN); + + ev = IB_EVENT_PORT_ERR; + + hfi1_set_uevent_bits(ppd, _HFI1_EVENT_LINKDOWN_BIT); + + /* if we are down, the neighbor is down */ + ppd->neighbor_normal = 0; + + /* notify IB of the link change */ + signal_ib_event(ppd, ev); + } +} + +/* + * Handle receive or urgent interrupts for user contexts. This means a user + * process was waiting for a packet to arrive, and didn't want to poll. + */ +void handle_user_interrupt(struct hfi1_ctxtdata *rcd) +{ + struct hfi1_devdata *dd = rcd->dd; + unsigned long flags; + + spin_lock_irqsave(&dd->uctxt_lock, flags); + if (!rcd->cnt) + goto done; + + if (test_and_clear_bit(HFI1_CTXT_WAITING_RCV, &rcd->event_flags)) { + wake_up_interruptible(&rcd->wait); + hfi1_rcvctrl(dd, HFI1_RCVCTRL_INTRAVAIL_DIS, rcd->ctxt); + } else if (test_and_clear_bit(HFI1_CTXT_WAITING_URG, + &rcd->event_flags)) { + rcd->urgent++; + wake_up_interruptible(&rcd->wait); + } +done: + spin_unlock_irqrestore(&dd->uctxt_lock, flags); +} diff --git a/drivers/infiniband/hw/hfi1/iowait.h b/drivers/infiniband/hw/hfi1/iowait.h new file mode 100644 index 000000000..2ec6ef38d --- /dev/null +++ b/drivers/infiniband/hw/hfi1/iowait.h @@ -0,0 +1,300 @@ +#ifndef _HFI1_IOWAIT_H +#define _HFI1_IOWAIT_H +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <linux/list.h> +#include <linux/workqueue.h> +#include <linux/sched.h> + +#include "sdma_txreq.h" + +/* + * typedef (*restart_t)() - restart callback + * @work: pointer to work structure + */ +typedef void (*restart_t)(struct work_struct *work); + +struct sdma_txreq; +struct sdma_engine; +/** + * struct iowait - linkage for delayed progress/waiting + * @list: used to add/insert into QP/PQ wait lists + * @tx_head: overflow list of sdma_txreq's + * @sleep: no space callback + * @wakeup: space callback wakeup + * @sdma_drained: sdma count drained + * @iowork: workqueue overhead + * @wait_dma: wait for sdma_busy == 0 + * @wait_pio: wait for pio_busy == 0 + * @sdma_busy: # of packets in flight + * @count: total number of descriptors in tx_head'ed list + * @tx_limit: limit for overflow queuing + * @tx_count: number of tx entry's in tx_head'ed list + * + * This is to be embedded in user's state structure + * (QP or PQ). + * + * The sleep and wakeup members are a + * bit misnamed. They do not strictly + * speaking sleep or wake up, but they + * are callbacks for the ULP to implement + * what ever queuing/dequeuing of + * the embedded iowait and its containing struct + * when a resource shortage like SDMA ring space is seen. + * + * Both potentially have locks help + * so sleeping is not allowed. + * + * The wait_dma member along with the iow + */ + +struct iowait { + struct list_head list; + struct list_head tx_head; + int (*sleep)( + struct sdma_engine *sde, + struct iowait *wait, + struct sdma_txreq *tx, + unsigned seq); + void (*wakeup)(struct iowait *wait, int reason); + void (*sdma_drained)(struct iowait *wait); + struct work_struct iowork; + wait_queue_head_t wait_dma; + wait_queue_head_t wait_pio; + atomic_t sdma_busy; + atomic_t pio_busy; + u32 count; + u32 tx_limit; + u32 tx_count; +}; + +#define SDMA_AVAIL_REASON 0 + +/** + * iowait_init() - initialize wait structure + * @wait: wait struct to initialize + * @tx_limit: limit for overflow queuing + * @func: restart function for workqueue + * @sleep: sleep function for no space + * @resume: wakeup function for no space + * + * This function initializes the iowait + * structure embedded in the QP or PQ. + * + */ + +static inline void iowait_init( + struct iowait *wait, + u32 tx_limit, + void (*func)(struct work_struct *work), + int (*sleep)( + struct sdma_engine *sde, + struct iowait *wait, + struct sdma_txreq *tx, + unsigned seq), + void (*wakeup)(struct iowait *wait, int reason), + void (*sdma_drained)(struct iowait *wait)) +{ + wait->count = 0; + INIT_LIST_HEAD(&wait->list); + INIT_LIST_HEAD(&wait->tx_head); + INIT_WORK(&wait->iowork, func); + init_waitqueue_head(&wait->wait_dma); + init_waitqueue_head(&wait->wait_pio); + atomic_set(&wait->sdma_busy, 0); + atomic_set(&wait->pio_busy, 0); + wait->tx_limit = tx_limit; + wait->sleep = sleep; + wait->wakeup = wakeup; + wait->sdma_drained = sdma_drained; +} + +/** + * iowait_schedule() - initialize wait structure + * @wait: wait struct to schedule + * @wq: workqueue for schedule + * @cpu: cpu + */ +static inline void iowait_schedule( + struct iowait *wait, + struct workqueue_struct *wq, + int cpu) +{ + queue_work_on(cpu, wq, &wait->iowork); +} + +/** + * iowait_sdma_drain() - wait for DMAs to drain + * + * @wait: iowait structure + * + * This will delay until the iowait sdmas have + * completed. + */ +static inline void iowait_sdma_drain(struct iowait *wait) +{ + wait_event(wait->wait_dma, !atomic_read(&wait->sdma_busy)); +} + +/** + * iowait_sdma_pending() - return sdma pending count + * + * @wait: iowait structure + * + */ +static inline int iowait_sdma_pending(struct iowait *wait) +{ + return atomic_read(&wait->sdma_busy); +} + +/** + * iowait_sdma_inc - note sdma io pending + * @wait: iowait structure + */ +static inline void iowait_sdma_inc(struct iowait *wait) +{ + atomic_inc(&wait->sdma_busy); +} + +/** + * iowait_sdma_add - add count to pending + * @wait: iowait structure + */ +static inline void iowait_sdma_add(struct iowait *wait, int count) +{ + atomic_add(count, &wait->sdma_busy); +} + +/** + * iowait_sdma_dec - note sdma complete + * @wait: iowait structure + */ +static inline int iowait_sdma_dec(struct iowait *wait) +{ + return atomic_dec_and_test(&wait->sdma_busy); +} + +/** + * iowait_pio_drain() - wait for pios to drain + * + * @wait: iowait structure + * + * This will delay until the iowait pios have + * completed. + */ +static inline void iowait_pio_drain(struct iowait *wait) +{ + wait_event_timeout(wait->wait_pio, + !atomic_read(&wait->pio_busy), + HZ); +} + +/** + * iowait_pio_pending() - return pio pending count + * + * @wait: iowait structure + * + */ +static inline int iowait_pio_pending(struct iowait *wait) +{ + return atomic_read(&wait->pio_busy); +} + +/** + * iowait_pio_inc - note pio pending + * @wait: iowait structure + */ +static inline void iowait_pio_inc(struct iowait *wait) +{ + atomic_inc(&wait->pio_busy); +} + +/** + * iowait_sdma_dec - note pio complete + * @wait: iowait structure + */ +static inline int iowait_pio_dec(struct iowait *wait) +{ + return atomic_dec_and_test(&wait->pio_busy); +} + +/** + * iowait_drain_wakeup() - trigger iowait_drain() waiter + * + * @wait: iowait structure + * + * This will trigger any waiters. + */ +static inline void iowait_drain_wakeup(struct iowait *wait) +{ + wake_up(&wait->wait_dma); + wake_up(&wait->wait_pio); + if (wait->sdma_drained) + wait->sdma_drained(wait); +} + +/** + * iowait_get_txhead() - get packet off of iowait list + * + * @wait wait struture + */ +static inline struct sdma_txreq *iowait_get_txhead(struct iowait *wait) +{ + struct sdma_txreq *tx = NULL; + + if (!list_empty(&wait->tx_head)) { + tx = list_first_entry( + &wait->tx_head, + struct sdma_txreq, + list); + list_del_init(&tx->list); + } + return tx; +} + +#endif diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c new file mode 100644 index 000000000..fca07a1d6 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/mad.c @@ -0,0 +1,4454 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <linux/net.h> +#define OPA_NUM_PKEY_BLOCKS_PER_SMP (OPA_SMP_DR_DATA_SIZE \ + / (OPA_PARTITION_TABLE_BLK_SIZE * sizeof(u16))) + +#include "hfi.h" +#include "mad.h" +#include "trace.h" +#include "qp.h" + +/* the reset value from the FM is supposed to be 0xffff, handle both */ +#define OPA_LINK_WIDTH_RESET_OLD 0x0fff +#define OPA_LINK_WIDTH_RESET 0xffff + +static int reply(struct ib_mad_hdr *smp) +{ + /* + * The verbs framework will handle the directed/LID route + * packet changes. + */ + smp->method = IB_MGMT_METHOD_GET_RESP; + if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + smp->status |= IB_SMP_DIRECTION; + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY; +} + +static inline void clear_opa_smp_data(struct opa_smp *smp) +{ + void *data = opa_get_smp_data(smp); + size_t size = opa_get_smp_data_size(smp); + + memset(data, 0, size); +} + +void hfi1_event_pkey_change(struct hfi1_devdata *dd, u8 port) +{ + struct ib_event event; + + event.event = IB_EVENT_PKEY_CHANGE; + event.device = &dd->verbs_dev.rdi.ibdev; + event.element.port_num = port; + ib_dispatch_event(&event); +} + +static void send_trap(struct hfi1_ibport *ibp, void *data, unsigned len) +{ + struct ib_mad_send_buf *send_buf; + struct ib_mad_agent *agent; + struct opa_smp *smp; + int ret; + unsigned long flags; + unsigned long timeout; + int pkey_idx; + u32 qpn = ppd_from_ibp(ibp)->sm_trap_qp; + + agent = ibp->rvp.send_agent; + if (!agent) + return; + + /* o14-3.2.1 */ + if (ppd_from_ibp(ibp)->lstate != IB_PORT_ACTIVE) + return; + + /* o14-2 */ + if (ibp->rvp.trap_timeout && time_before(jiffies, + ibp->rvp.trap_timeout)) + return; + + pkey_idx = hfi1_lookup_pkey_idx(ibp, LIM_MGMT_P_KEY); + if (pkey_idx < 0) { + pr_warn("%s: failed to find limited mgmt pkey, defaulting 0x%x\n", + __func__, hfi1_get_pkey(ibp, 1)); + pkey_idx = 1; + } + + send_buf = ib_create_send_mad(agent, qpn, pkey_idx, 0, + IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, + GFP_ATOMIC, IB_MGMT_BASE_VERSION); + if (IS_ERR(send_buf)) + return; + + smp = send_buf->mad; + smp->base_version = OPA_MGMT_BASE_VERSION; + smp->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED; + smp->class_version = OPA_SMI_CLASS_VERSION; + smp->method = IB_MGMT_METHOD_TRAP; + ibp->rvp.tid++; + smp->tid = cpu_to_be64(ibp->rvp.tid); + smp->attr_id = IB_SMP_ATTR_NOTICE; + /* o14-1: smp->mkey = 0; */ + memcpy(smp->route.lid.data, data, len); + + spin_lock_irqsave(&ibp->rvp.lock, flags); + if (!ibp->rvp.sm_ah) { + if (ibp->rvp.sm_lid != be16_to_cpu(IB_LID_PERMISSIVE)) { + struct ib_ah *ah; + + ah = hfi1_create_qp0_ah(ibp, ibp->rvp.sm_lid); + if (IS_ERR(ah)) { + ret = PTR_ERR(ah); + } else { + send_buf->ah = ah; + ibp->rvp.sm_ah = ibah_to_rvtah(ah); + ret = 0; + } + } else { + ret = -EINVAL; + } + } else { + send_buf->ah = &ibp->rvp.sm_ah->ibah; + ret = 0; + } + spin_unlock_irqrestore(&ibp->rvp.lock, flags); + + if (!ret) + ret = ib_post_send_mad(send_buf, NULL); + if (!ret) { + /* 4.096 usec. */ + timeout = (4096 * (1UL << ibp->rvp.subnet_timeout)) / 1000; + ibp->rvp.trap_timeout = jiffies + usecs_to_jiffies(timeout); + } else { + ib_free_send_mad(send_buf); + ibp->rvp.trap_timeout = 0; + } +} + +/* + * Send a bad [PQ]_Key trap (ch. 14.3.8). + */ +void hfi1_bad_pqkey(struct hfi1_ibport *ibp, __be16 trap_num, u32 key, u32 sl, + u32 qp1, u32 qp2, u16 lid1, u16 lid2) +{ + struct opa_mad_notice_attr data; + u32 lid = ppd_from_ibp(ibp)->lid; + u32 _lid1 = lid1; + u32 _lid2 = lid2; + + memset(&data, 0, sizeof(data)); + + if (trap_num == OPA_TRAP_BAD_P_KEY) + ibp->rvp.pkey_violations++; + else + ibp->rvp.qkey_violations++; + ibp->rvp.n_pkt_drops++; + + /* Send violation trap */ + data.generic_type = IB_NOTICE_TYPE_SECURITY; + data.prod_type_lsb = IB_NOTICE_PROD_CA; + data.trap_num = trap_num; + data.issuer_lid = cpu_to_be32(lid); + data.ntc_257_258.lid1 = cpu_to_be32(_lid1); + data.ntc_257_258.lid2 = cpu_to_be32(_lid2); + data.ntc_257_258.key = cpu_to_be32(key); + data.ntc_257_258.sl = sl << 3; + data.ntc_257_258.qp1 = cpu_to_be32(qp1); + data.ntc_257_258.qp2 = cpu_to_be32(qp2); + + send_trap(ibp, &data, sizeof(data)); +} + +/* + * Send a bad M_Key trap (ch. 14.3.9). + */ +static void bad_mkey(struct hfi1_ibport *ibp, struct ib_mad_hdr *mad, + __be64 mkey, __be32 dr_slid, u8 return_path[], u8 hop_cnt) +{ + struct opa_mad_notice_attr data; + u32 lid = ppd_from_ibp(ibp)->lid; + + memset(&data, 0, sizeof(data)); + /* Send violation trap */ + data.generic_type = IB_NOTICE_TYPE_SECURITY; + data.prod_type_lsb = IB_NOTICE_PROD_CA; + data.trap_num = OPA_TRAP_BAD_M_KEY; + data.issuer_lid = cpu_to_be32(lid); + data.ntc_256.lid = data.issuer_lid; + data.ntc_256.method = mad->method; + data.ntc_256.attr_id = mad->attr_id; + data.ntc_256.attr_mod = mad->attr_mod; + data.ntc_256.mkey = mkey; + if (mad->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { + data.ntc_256.dr_slid = dr_slid; + data.ntc_256.dr_trunc_hop = IB_NOTICE_TRAP_DR_NOTICE; + if (hop_cnt > ARRAY_SIZE(data.ntc_256.dr_rtn_path)) { + data.ntc_256.dr_trunc_hop |= + IB_NOTICE_TRAP_DR_TRUNC; + hop_cnt = ARRAY_SIZE(data.ntc_256.dr_rtn_path); + } + data.ntc_256.dr_trunc_hop |= hop_cnt; + memcpy(data.ntc_256.dr_rtn_path, return_path, + hop_cnt); + } + + send_trap(ibp, &data, sizeof(data)); +} + +/* + * Send a Port Capability Mask Changed trap (ch. 14.3.11). + */ +void hfi1_cap_mask_chg(struct rvt_dev_info *rdi, u8 port_num) +{ + struct opa_mad_notice_attr data; + struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi); + struct hfi1_devdata *dd = dd_from_dev(verbs_dev); + struct hfi1_ibport *ibp = &dd->pport[port_num - 1].ibport_data; + u32 lid = ppd_from_ibp(ibp)->lid; + + memset(&data, 0, sizeof(data)); + + data.generic_type = IB_NOTICE_TYPE_INFO; + data.prod_type_lsb = IB_NOTICE_PROD_CA; + data.trap_num = OPA_TRAP_CHANGE_CAPABILITY; + data.issuer_lid = cpu_to_be32(lid); + data.ntc_144.lid = data.issuer_lid; + data.ntc_144.new_cap_mask = cpu_to_be32(ibp->rvp.port_cap_flags); + + send_trap(ibp, &data, sizeof(data)); +} + +/* + * Send a System Image GUID Changed trap (ch. 14.3.12). + */ +void hfi1_sys_guid_chg(struct hfi1_ibport *ibp) +{ + struct opa_mad_notice_attr data; + u32 lid = ppd_from_ibp(ibp)->lid; + + memset(&data, 0, sizeof(data)); + + data.generic_type = IB_NOTICE_TYPE_INFO; + data.prod_type_lsb = IB_NOTICE_PROD_CA; + data.trap_num = OPA_TRAP_CHANGE_SYSGUID; + data.issuer_lid = cpu_to_be32(lid); + data.ntc_145.new_sys_guid = ib_hfi1_sys_image_guid; + data.ntc_145.lid = data.issuer_lid; + + send_trap(ibp, &data, sizeof(data)); +} + +/* + * Send a Node Description Changed trap (ch. 14.3.13). + */ +void hfi1_node_desc_chg(struct hfi1_ibport *ibp) +{ + struct opa_mad_notice_attr data; + u32 lid = ppd_from_ibp(ibp)->lid; + + memset(&data, 0, sizeof(data)); + + data.generic_type = IB_NOTICE_TYPE_INFO; + data.prod_type_lsb = IB_NOTICE_PROD_CA; + data.trap_num = OPA_TRAP_CHANGE_CAPABILITY; + data.issuer_lid = cpu_to_be32(lid); + data.ntc_144.lid = data.issuer_lid; + data.ntc_144.change_flags = + cpu_to_be16(OPA_NOTICE_TRAP_NODE_DESC_CHG); + + send_trap(ibp, &data, sizeof(data)); +} + +static int __subn_get_opa_nodedesc(struct opa_smp *smp, u32 am, + u8 *data, struct ib_device *ibdev, + u8 port, u32 *resp_len) +{ + struct opa_node_description *nd; + + if (am) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + nd = (struct opa_node_description *)data; + + memcpy(nd->data, ibdev->node_desc, sizeof(nd->data)); + + if (resp_len) + *resp_len += sizeof(*nd); + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_get_opa_nodeinfo(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + struct opa_node_info *ni; + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + unsigned pidx = port - 1; /* IB number port from 1, hw from 0 */ + + ni = (struct opa_node_info *)data; + + /* GUID 0 is illegal */ + if (am || pidx >= dd->num_pports || dd->pport[pidx].guid == 0) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + ni->port_guid = cpu_to_be64(dd->pport[pidx].guid); + ni->base_version = OPA_MGMT_BASE_VERSION; + ni->class_version = OPA_SMI_CLASS_VERSION; + ni->node_type = 1; /* channel adapter */ + ni->num_ports = ibdev->phys_port_cnt; + /* This is already in network order */ + ni->system_image_guid = ib_hfi1_sys_image_guid; + /* Use first-port GUID as node */ + ni->node_guid = cpu_to_be64(dd->pport->guid); + ni->partition_cap = cpu_to_be16(hfi1_get_npkeys(dd)); + ni->device_id = cpu_to_be16(dd->pcidev->device); + ni->revision = cpu_to_be32(dd->minrev); + ni->local_port_num = port; + ni->vendor_id[0] = dd->oui1; + ni->vendor_id[1] = dd->oui2; + ni->vendor_id[2] = dd->oui3; + + if (resp_len) + *resp_len += sizeof(*ni); + + return reply((struct ib_mad_hdr *)smp); +} + +static int subn_get_nodeinfo(struct ib_smp *smp, struct ib_device *ibdev, + u8 port) +{ + struct ib_node_info *nip = (struct ib_node_info *)&smp->data; + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + unsigned pidx = port - 1; /* IB number port from 1, hw from 0 */ + + /* GUID 0 is illegal */ + if (smp->attr_mod || pidx >= dd->num_pports || + dd->pport[pidx].guid == 0) + smp->status |= IB_SMP_INVALID_FIELD; + else + nip->port_guid = cpu_to_be64(dd->pport[pidx].guid); + + nip->base_version = OPA_MGMT_BASE_VERSION; + nip->class_version = OPA_SMI_CLASS_VERSION; + nip->node_type = 1; /* channel adapter */ + nip->num_ports = ibdev->phys_port_cnt; + /* This is already in network order */ + nip->sys_guid = ib_hfi1_sys_image_guid; + /* Use first-port GUID as node */ + nip->node_guid = cpu_to_be64(dd->pport->guid); + nip->partition_cap = cpu_to_be16(hfi1_get_npkeys(dd)); + nip->device_id = cpu_to_be16(dd->pcidev->device); + nip->revision = cpu_to_be32(dd->minrev); + nip->local_port_num = port; + nip->vendor_id[0] = dd->oui1; + nip->vendor_id[1] = dd->oui2; + nip->vendor_id[2] = dd->oui3; + + return reply((struct ib_mad_hdr *)smp); +} + +static void set_link_width_enabled(struct hfi1_pportdata *ppd, u32 w) +{ + (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_LWID_ENB, w); +} + +static void set_link_width_downgrade_enabled(struct hfi1_pportdata *ppd, u32 w) +{ + (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_LWID_DG_ENB, w); +} + +static void set_link_speed_enabled(struct hfi1_pportdata *ppd, u32 s) +{ + (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_SPD_ENB, s); +} + +static int check_mkey(struct hfi1_ibport *ibp, struct ib_mad_hdr *mad, + int mad_flags, __be64 mkey, __be32 dr_slid, + u8 return_path[], u8 hop_cnt) +{ + int valid_mkey = 0; + int ret = 0; + + /* Is the mkey in the process of expiring? */ + if (ibp->rvp.mkey_lease_timeout && + time_after_eq(jiffies, ibp->rvp.mkey_lease_timeout)) { + /* Clear timeout and mkey protection field. */ + ibp->rvp.mkey_lease_timeout = 0; + ibp->rvp.mkeyprot = 0; + } + + if ((mad_flags & IB_MAD_IGNORE_MKEY) || ibp->rvp.mkey == 0 || + ibp->rvp.mkey == mkey) + valid_mkey = 1; + + /* Unset lease timeout on any valid Get/Set/TrapRepress */ + if (valid_mkey && ibp->rvp.mkey_lease_timeout && + (mad->method == IB_MGMT_METHOD_GET || + mad->method == IB_MGMT_METHOD_SET || + mad->method == IB_MGMT_METHOD_TRAP_REPRESS)) + ibp->rvp.mkey_lease_timeout = 0; + + if (!valid_mkey) { + switch (mad->method) { + case IB_MGMT_METHOD_GET: + /* Bad mkey not a violation below level 2 */ + if (ibp->rvp.mkeyprot < 2) + break; + case IB_MGMT_METHOD_SET: + case IB_MGMT_METHOD_TRAP_REPRESS: + if (ibp->rvp.mkey_violations != 0xFFFF) + ++ibp->rvp.mkey_violations; + if (!ibp->rvp.mkey_lease_timeout && + ibp->rvp.mkey_lease_period) + ibp->rvp.mkey_lease_timeout = jiffies + + ibp->rvp.mkey_lease_period * HZ; + /* Generate a trap notice. */ + bad_mkey(ibp, mad, mkey, dr_slid, return_path, + hop_cnt); + ret = 1; + } + } + + return ret; +} + +/* + * The SMA caches reads from LCB registers in case the LCB is unavailable. + * (The LCB is unavailable in certain link states, for example.) + */ +struct lcb_datum { + u32 off; + u64 val; +}; + +static struct lcb_datum lcb_cache[] = { + { DC_LCB_STS_ROUND_TRIP_LTP_CNT, 0 }, +}; + +static int write_lcb_cache(u32 off, u64 val) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(lcb_cache); i++) { + if (lcb_cache[i].off == off) { + lcb_cache[i].val = val; + return 0; + } + } + + pr_warn("%s bad offset 0x%x\n", __func__, off); + return -1; +} + +static int read_lcb_cache(u32 off, u64 *val) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(lcb_cache); i++) { + if (lcb_cache[i].off == off) { + *val = lcb_cache[i].val; + return 0; + } + } + + pr_warn("%s bad offset 0x%x\n", __func__, off); + return -1; +} + +void read_ltp_rtt(struct hfi1_devdata *dd) +{ + u64 reg; + + if (read_lcb_csr(dd, DC_LCB_STS_ROUND_TRIP_LTP_CNT, ®)) + dd_dev_err(dd, "%s: unable to read LTP RTT\n", __func__); + else + write_lcb_cache(DC_LCB_STS_ROUND_TRIP_LTP_CNT, reg); +} + +static int __subn_get_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + int i; + struct hfi1_devdata *dd; + struct hfi1_pportdata *ppd; + struct hfi1_ibport *ibp; + struct opa_port_info *pi = (struct opa_port_info *)data; + u8 mtu; + u8 credit_rate; + u8 is_beaconing_active; + u32 state; + u32 num_ports = OPA_AM_NPORT(am); + u32 start_of_sm_config = OPA_AM_START_SM_CFG(am); + u32 buffer_units; + u64 tmp = 0; + + if (num_ports != 1) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + dd = dd_from_ibdev(ibdev); + /* IB numbers ports from 1, hw from 0 */ + ppd = dd->pport + (port - 1); + ibp = &ppd->ibport_data; + + if (ppd->vls_supported / 2 > ARRAY_SIZE(pi->neigh_mtu.pvlx_to_mtu) || + ppd->vls_supported > ARRAY_SIZE(dd->vld)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + pi->lid = cpu_to_be32(ppd->lid); + + /* Only return the mkey if the protection field allows it. */ + if (!(smp->method == IB_MGMT_METHOD_GET && + ibp->rvp.mkey != smp->mkey && + ibp->rvp.mkeyprot == 1)) + pi->mkey = ibp->rvp.mkey; + + pi->subnet_prefix = ibp->rvp.gid_prefix; + pi->sm_lid = cpu_to_be32(ibp->rvp.sm_lid); + pi->ib_cap_mask = cpu_to_be32(ibp->rvp.port_cap_flags); + pi->mkey_lease_period = cpu_to_be16(ibp->rvp.mkey_lease_period); + pi->sm_trap_qp = cpu_to_be32(ppd->sm_trap_qp); + pi->sa_qp = cpu_to_be32(ppd->sa_qp); + + pi->link_width.enabled = cpu_to_be16(ppd->link_width_enabled); + pi->link_width.supported = cpu_to_be16(ppd->link_width_supported); + pi->link_width.active = cpu_to_be16(ppd->link_width_active); + + pi->link_width_downgrade.supported = + cpu_to_be16(ppd->link_width_downgrade_supported); + pi->link_width_downgrade.enabled = + cpu_to_be16(ppd->link_width_downgrade_enabled); + pi->link_width_downgrade.tx_active = + cpu_to_be16(ppd->link_width_downgrade_tx_active); + pi->link_width_downgrade.rx_active = + cpu_to_be16(ppd->link_width_downgrade_rx_active); + + pi->link_speed.supported = cpu_to_be16(ppd->link_speed_supported); + pi->link_speed.active = cpu_to_be16(ppd->link_speed_active); + pi->link_speed.enabled = cpu_to_be16(ppd->link_speed_enabled); + + state = driver_lstate(ppd); + + if (start_of_sm_config && (state == IB_PORT_INIT)) + ppd->is_sm_config_started = 1; + + pi->port_phys_conf = (ppd->port_type & 0xf); + +#if PI_LED_ENABLE_SUP + pi->port_states.ledenable_offlinereason = ppd->neighbor_normal << 4; + pi->port_states.ledenable_offlinereason |= + ppd->is_sm_config_started << 5; + /* + * This pairs with the memory barrier in hfi1_start_led_override to + * ensure that we read the correct state of LED beaconing represented + * by led_override_timer_active + */ + smp_rmb(); + is_beaconing_active = !!atomic_read(&ppd->led_override_timer_active); + pi->port_states.ledenable_offlinereason |= is_beaconing_active << 6; + pi->port_states.ledenable_offlinereason |= + ppd->offline_disabled_reason; +#else + pi->port_states.offline_reason = ppd->neighbor_normal << 4; + pi->port_states.offline_reason |= ppd->is_sm_config_started << 5; + pi->port_states.offline_reason |= ppd->offline_disabled_reason; +#endif /* PI_LED_ENABLE_SUP */ + + pi->port_states.portphysstate_portstate = + (hfi1_ibphys_portstate(ppd) << 4) | state; + + pi->mkeyprotect_lmc = (ibp->rvp.mkeyprot << 6) | ppd->lmc; + + memset(pi->neigh_mtu.pvlx_to_mtu, 0, sizeof(pi->neigh_mtu.pvlx_to_mtu)); + for (i = 0; i < ppd->vls_supported; i++) { + mtu = mtu_to_enum(dd->vld[i].mtu, HFI1_DEFAULT_ACTIVE_MTU); + if ((i % 2) == 0) + pi->neigh_mtu.pvlx_to_mtu[i / 2] |= (mtu << 4); + else + pi->neigh_mtu.pvlx_to_mtu[i / 2] |= mtu; + } + /* don't forget VL 15 */ + mtu = mtu_to_enum(dd->vld[15].mtu, 2048); + pi->neigh_mtu.pvlx_to_mtu[15 / 2] |= mtu; + pi->smsl = ibp->rvp.sm_sl & OPA_PI_MASK_SMSL; + pi->operational_vls = hfi1_get_ib_cfg(ppd, HFI1_IB_CFG_OP_VLS); + pi->partenforce_filterraw |= + (ppd->linkinit_reason & OPA_PI_MASK_LINKINIT_REASON); + if (ppd->part_enforce & HFI1_PART_ENFORCE_IN) + pi->partenforce_filterraw |= OPA_PI_MASK_PARTITION_ENFORCE_IN; + if (ppd->part_enforce & HFI1_PART_ENFORCE_OUT) + pi->partenforce_filterraw |= OPA_PI_MASK_PARTITION_ENFORCE_OUT; + pi->mkey_violations = cpu_to_be16(ibp->rvp.mkey_violations); + /* P_KeyViolations are counted by hardware. */ + pi->pkey_violations = cpu_to_be16(ibp->rvp.pkey_violations); + pi->qkey_violations = cpu_to_be16(ibp->rvp.qkey_violations); + + pi->vl.cap = ppd->vls_supported; + pi->vl.high_limit = cpu_to_be16(ibp->rvp.vl_high_limit); + pi->vl.arb_high_cap = (u8)hfi1_get_ib_cfg(ppd, HFI1_IB_CFG_VL_HIGH_CAP); + pi->vl.arb_low_cap = (u8)hfi1_get_ib_cfg(ppd, HFI1_IB_CFG_VL_LOW_CAP); + + pi->clientrereg_subnettimeout = ibp->rvp.subnet_timeout; + + pi->port_link_mode = cpu_to_be16(OPA_PORT_LINK_MODE_OPA << 10 | + OPA_PORT_LINK_MODE_OPA << 5 | + OPA_PORT_LINK_MODE_OPA); + + pi->port_ltp_crc_mode = cpu_to_be16(ppd->port_ltp_crc_mode); + + pi->port_mode = cpu_to_be16( + ppd->is_active_optimize_enabled ? + OPA_PI_MASK_PORT_ACTIVE_OPTOMIZE : 0); + + pi->port_packet_format.supported = + cpu_to_be16(OPA_PORT_PACKET_FORMAT_9B); + pi->port_packet_format.enabled = + cpu_to_be16(OPA_PORT_PACKET_FORMAT_9B); + + /* flit_control.interleave is (OPA V1, version .76): + * bits use + * ---- --- + * 2 res + * 2 DistanceSupported + * 2 DistanceEnabled + * 5 MaxNextLevelTxEnabled + * 5 MaxNestLevelRxSupported + * + * HFI supports only "distance mode 1" (see OPA V1, version .76, + * section 9.6.2), so set DistanceSupported, DistanceEnabled + * to 0x1. + */ + pi->flit_control.interleave = cpu_to_be16(0x1400); + + pi->link_down_reason = ppd->local_link_down_reason.sma; + pi->neigh_link_down_reason = ppd->neigh_link_down_reason.sma; + pi->port_error_action = cpu_to_be32(ppd->port_error_action); + pi->mtucap = mtu_to_enum(hfi1_max_mtu, IB_MTU_4096); + + /* 32.768 usec. response time (guessing) */ + pi->resptimevalue = 3; + + pi->local_port_num = port; + + /* buffer info for FM */ + pi->overall_buffer_space = cpu_to_be16(dd->link_credits); + + pi->neigh_node_guid = cpu_to_be64(ppd->neighbor_guid); + pi->neigh_port_num = ppd->neighbor_port_number; + pi->port_neigh_mode = + (ppd->neighbor_type & OPA_PI_MASK_NEIGH_NODE_TYPE) | + (ppd->mgmt_allowed ? OPA_PI_MASK_NEIGH_MGMT_ALLOWED : 0) | + (ppd->neighbor_fm_security ? + OPA_PI_MASK_NEIGH_FW_AUTH_BYPASS : 0); + + /* HFIs shall always return VL15 credits to their + * neighbor in a timely manner, without any credit return pacing. + */ + credit_rate = 0; + buffer_units = (dd->vau) & OPA_PI_MASK_BUF_UNIT_BUF_ALLOC; + buffer_units |= (dd->vcu << 3) & OPA_PI_MASK_BUF_UNIT_CREDIT_ACK; + buffer_units |= (credit_rate << 6) & + OPA_PI_MASK_BUF_UNIT_VL15_CREDIT_RATE; + buffer_units |= (dd->vl15_init << 11) & OPA_PI_MASK_BUF_UNIT_VL15_INIT; + pi->buffer_units = cpu_to_be32(buffer_units); + + pi->opa_cap_mask = cpu_to_be16(OPA_CAP_MASK3_IsSharedSpaceSupported); + + /* HFI supports a replay buffer 128 LTPs in size */ + pi->replay_depth.buffer = 0x80; + /* read the cached value of DC_LCB_STS_ROUND_TRIP_LTP_CNT */ + read_lcb_cache(DC_LCB_STS_ROUND_TRIP_LTP_CNT, &tmp); + + /* + * this counter is 16 bits wide, but the replay_depth.wire + * variable is only 8 bits + */ + if (tmp > 0xff) + tmp = 0xff; + pi->replay_depth.wire = tmp; + + if (resp_len) + *resp_len += sizeof(struct opa_port_info); + + return reply((struct ib_mad_hdr *)smp); +} + +/** + * get_pkeys - return the PKEY table + * @dd: the hfi1_ib device + * @port: the IB port number + * @pkeys: the pkey table is placed here + */ +static int get_pkeys(struct hfi1_devdata *dd, u8 port, u16 *pkeys) +{ + struct hfi1_pportdata *ppd = dd->pport + port - 1; + + memcpy(pkeys, ppd->pkeys, sizeof(ppd->pkeys)); + + return 0; +} + +static int __subn_get_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + u32 n_blocks_req = OPA_AM_NBLK(am); + u32 start_block = am & 0x7ff; + __be16 *p; + u16 *q; + int i; + u16 n_blocks_avail; + unsigned npkeys = hfi1_get_npkeys(dd); + size_t size; + + if (n_blocks_req == 0) { + pr_warn("OPA Get PKey AM Invalid : P = %d; B = 0x%x; N = 0x%x\n", + port, start_block, n_blocks_req); + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + n_blocks_avail = (u16)(npkeys / OPA_PARTITION_TABLE_BLK_SIZE) + 1; + + size = (n_blocks_req * OPA_PARTITION_TABLE_BLK_SIZE) * sizeof(u16); + + if (start_block + n_blocks_req > n_blocks_avail || + n_blocks_req > OPA_NUM_PKEY_BLOCKS_PER_SMP) { + pr_warn("OPA Get PKey AM Invalid : s 0x%x; req 0x%x; " + "avail 0x%x; blk/smp 0x%lx\n", + start_block, n_blocks_req, n_blocks_avail, + OPA_NUM_PKEY_BLOCKS_PER_SMP); + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + p = (__be16 *)data; + q = (u16 *)data; + /* get the real pkeys if we are requesting the first block */ + if (start_block == 0) { + get_pkeys(dd, port, q); + for (i = 0; i < npkeys; i++) + p[i] = cpu_to_be16(q[i]); + if (resp_len) + *resp_len += size; + } else { + smp->status |= IB_SMP_INVALID_FIELD; + } + return reply((struct ib_mad_hdr *)smp); +} + +enum { + HFI_TRANSITION_DISALLOWED, + HFI_TRANSITION_IGNORED, + HFI_TRANSITION_ALLOWED, + HFI_TRANSITION_UNDEFINED, +}; + +/* + * Use shortened names to improve readability of + * {logical,physical}_state_transitions + */ +enum { + __D = HFI_TRANSITION_DISALLOWED, + __I = HFI_TRANSITION_IGNORED, + __A = HFI_TRANSITION_ALLOWED, + __U = HFI_TRANSITION_UNDEFINED, +}; + +/* + * IB_PORTPHYSSTATE_POLLING (2) through OPA_PORTPHYSSTATE_MAX (11) are + * represented in physical_state_transitions. + */ +#define __N_PHYSTATES (OPA_PORTPHYSSTATE_MAX - IB_PORTPHYSSTATE_POLLING + 1) + +/* + * Within physical_state_transitions, rows represent "old" states, + * columns "new" states, and physical_state_transitions.allowed[old][new] + * indicates if the transition from old state to new state is legal (see + * OPAg1v1, Table 6-4). + */ +static const struct { + u8 allowed[__N_PHYSTATES][__N_PHYSTATES]; +} physical_state_transitions = { + { + /* 2 3 4 5 6 7 8 9 10 11 */ + /* 2 */ { __A, __A, __D, __D, __D, __D, __D, __D, __D, __D }, + /* 3 */ { __A, __I, __D, __D, __D, __D, __D, __D, __D, __A }, + /* 4 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U }, + /* 5 */ { __A, __A, __D, __I, __D, __D, __D, __D, __D, __D }, + /* 6 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U }, + /* 7 */ { __D, __A, __D, __D, __D, __I, __D, __D, __D, __D }, + /* 8 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U }, + /* 9 */ { __I, __A, __D, __D, __D, __D, __D, __I, __D, __D }, + /*10 */ { __U, __U, __U, __U, __U, __U, __U, __U, __U, __U }, + /*11 */ { __D, __A, __D, __D, __D, __D, __D, __D, __D, __I }, + } +}; + +/* + * IB_PORT_DOWN (1) through IB_PORT_ACTIVE_DEFER (5) are represented + * logical_state_transitions + */ + +#define __N_LOGICAL_STATES (IB_PORT_ACTIVE_DEFER - IB_PORT_DOWN + 1) + +/* + * Within logical_state_transitions rows represent "old" states, + * columns "new" states, and logical_state_transitions.allowed[old][new] + * indicates if the transition from old state to new state is legal (see + * OPAg1v1, Table 9-12). + */ +static const struct { + u8 allowed[__N_LOGICAL_STATES][__N_LOGICAL_STATES]; +} logical_state_transitions = { + { + /* 1 2 3 4 5 */ + /* 1 */ { __I, __D, __D, __D, __U}, + /* 2 */ { __D, __I, __A, __D, __U}, + /* 3 */ { __D, __D, __I, __A, __U}, + /* 4 */ { __D, __D, __I, __I, __U}, + /* 5 */ { __U, __U, __U, __U, __U}, + } +}; + +static int logical_transition_allowed(int old, int new) +{ + if (old < IB_PORT_NOP || old > IB_PORT_ACTIVE_DEFER || + new < IB_PORT_NOP || new > IB_PORT_ACTIVE_DEFER) { + pr_warn("invalid logical state(s) (old %d new %d)\n", + old, new); + return HFI_TRANSITION_UNDEFINED; + } + + if (new == IB_PORT_NOP) + return HFI_TRANSITION_ALLOWED; /* always allowed */ + + /* adjust states for indexing into logical_state_transitions */ + old -= IB_PORT_DOWN; + new -= IB_PORT_DOWN; + + if (old < 0 || new < 0) + return HFI_TRANSITION_UNDEFINED; + return logical_state_transitions.allowed[old][new]; +} + +static int physical_transition_allowed(int old, int new) +{ + if (old < IB_PORTPHYSSTATE_NOP || old > OPA_PORTPHYSSTATE_MAX || + new < IB_PORTPHYSSTATE_NOP || new > OPA_PORTPHYSSTATE_MAX) { + pr_warn("invalid physical state(s) (old %d new %d)\n", + old, new); + return HFI_TRANSITION_UNDEFINED; + } + + if (new == IB_PORTPHYSSTATE_NOP) + return HFI_TRANSITION_ALLOWED; /* always allowed */ + + /* adjust states for indexing into physical_state_transitions */ + old -= IB_PORTPHYSSTATE_POLLING; + new -= IB_PORTPHYSSTATE_POLLING; + + if (old < 0 || new < 0) + return HFI_TRANSITION_UNDEFINED; + return physical_state_transitions.allowed[old][new]; +} + +static int port_states_transition_allowed(struct hfi1_pportdata *ppd, + u32 logical_new, u32 physical_new) +{ + u32 physical_old = driver_physical_state(ppd); + u32 logical_old = driver_logical_state(ppd); + int ret, logical_allowed, physical_allowed; + + ret = logical_transition_allowed(logical_old, logical_new); + logical_allowed = ret; + + if (ret == HFI_TRANSITION_DISALLOWED || + ret == HFI_TRANSITION_UNDEFINED) { + pr_warn("invalid logical state transition %s -> %s\n", + opa_lstate_name(logical_old), + opa_lstate_name(logical_new)); + return ret; + } + + ret = physical_transition_allowed(physical_old, physical_new); + physical_allowed = ret; + + if (ret == HFI_TRANSITION_DISALLOWED || + ret == HFI_TRANSITION_UNDEFINED) { + pr_warn("invalid physical state transition %s -> %s\n", + opa_pstate_name(physical_old), + opa_pstate_name(physical_new)); + return ret; + } + + if (logical_allowed == HFI_TRANSITION_IGNORED && + physical_allowed == HFI_TRANSITION_IGNORED) + return HFI_TRANSITION_IGNORED; + + /* + * A change request of Physical Port State from + * 'Offline' to 'Polling' should be ignored. + */ + if ((physical_old == OPA_PORTPHYSSTATE_OFFLINE) && + (physical_new == IB_PORTPHYSSTATE_POLLING)) + return HFI_TRANSITION_IGNORED; + + /* + * Either physical_allowed or logical_allowed is + * HFI_TRANSITION_ALLOWED. + */ + return HFI_TRANSITION_ALLOWED; +} + +static int set_port_states(struct hfi1_pportdata *ppd, struct opa_smp *smp, + u32 logical_state, u32 phys_state, + int suppress_idle_sma) +{ + struct hfi1_devdata *dd = ppd->dd; + u32 link_state; + int ret; + + ret = port_states_transition_allowed(ppd, logical_state, phys_state); + if (ret == HFI_TRANSITION_DISALLOWED || + ret == HFI_TRANSITION_UNDEFINED) { + /* error message emitted above */ + smp->status |= IB_SMP_INVALID_FIELD; + return 0; + } + + if (ret == HFI_TRANSITION_IGNORED) + return 0; + + if ((phys_state != IB_PORTPHYSSTATE_NOP) && + !(logical_state == IB_PORT_DOWN || + logical_state == IB_PORT_NOP)){ + pr_warn("SubnSet(OPA_PortInfo) port state invalid: logical_state 0x%x physical_state 0x%x\n", + logical_state, phys_state); + smp->status |= IB_SMP_INVALID_FIELD; + } + + /* + * Logical state changes are summarized in OPAv1g1 spec., + * Table 9-12; physical state changes are summarized in + * OPAv1g1 spec., Table 6.4. + */ + switch (logical_state) { + case IB_PORT_NOP: + if (phys_state == IB_PORTPHYSSTATE_NOP) + break; + /* FALLTHROUGH */ + case IB_PORT_DOWN: + if (phys_state == IB_PORTPHYSSTATE_NOP) { + link_state = HLS_DN_DOWNDEF; + } else if (phys_state == IB_PORTPHYSSTATE_POLLING) { + link_state = HLS_DN_POLL; + set_link_down_reason(ppd, OPA_LINKDOWN_REASON_FM_BOUNCE, + 0, OPA_LINKDOWN_REASON_FM_BOUNCE); + } else if (phys_state == IB_PORTPHYSSTATE_DISABLED) { + link_state = HLS_DN_DISABLE; + } else { + pr_warn("SubnSet(OPA_PortInfo) invalid physical state 0x%x\n", + phys_state); + smp->status |= IB_SMP_INVALID_FIELD; + break; + } + + if ((link_state == HLS_DN_POLL || + link_state == HLS_DN_DOWNDEF)) { + /* + * Going to poll. No matter what the current state, + * always move offline first, then tune and start the + * link. This correctly handles a FM link bounce and + * a link enable. Going offline is a no-op if already + * offline. + */ + set_link_state(ppd, HLS_DN_OFFLINE); + tune_serdes(ppd); + start_link(ppd); + } else { + set_link_state(ppd, link_state); + } + if (link_state == HLS_DN_DISABLE && + (ppd->offline_disabled_reason > + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_SMA_DISABLED) || + ppd->offline_disabled_reason == + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE))) + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_SMA_DISABLED); + /* + * Don't send a reply if the response would be sent + * through the disabled port. + */ + if (link_state == HLS_DN_DISABLE && smp->hop_cnt) + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + break; + case IB_PORT_ARMED: + ret = set_link_state(ppd, HLS_UP_ARMED); + if ((ret == 0) && (suppress_idle_sma == 0)) + send_idle_sma(dd, SMA_IDLE_ARM); + break; + case IB_PORT_ACTIVE: + if (ppd->neighbor_normal) { + ret = set_link_state(ppd, HLS_UP_ACTIVE); + if (ret == 0) + send_idle_sma(dd, SMA_IDLE_ACTIVE); + } else { + pr_warn("SubnSet(OPA_PortInfo) Cannot move to Active with NeighborNormal 0\n"); + smp->status |= IB_SMP_INVALID_FIELD; + } + break; + default: + pr_warn("SubnSet(OPA_PortInfo) invalid logical state 0x%x\n", + logical_state); + smp->status |= IB_SMP_INVALID_FIELD; + } + + return 0; +} + +/** + * subn_set_opa_portinfo - set port information + * @smp: the incoming SM packet + * @ibdev: the infiniband device + * @port: the port on the device + * + */ +static int __subn_set_opa_portinfo(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + struct opa_port_info *pi = (struct opa_port_info *)data; + struct ib_event event; + struct hfi1_devdata *dd; + struct hfi1_pportdata *ppd; + struct hfi1_ibport *ibp; + u8 clientrereg; + unsigned long flags; + u32 smlid, opa_lid; /* tmp vars to hold LID values */ + u16 lid; + u8 ls_old, ls_new, ps_new; + u8 vls; + u8 msl; + u8 crc_enabled; + u16 lse, lwe, mtu; + u32 num_ports = OPA_AM_NPORT(am); + u32 start_of_sm_config = OPA_AM_START_SM_CFG(am); + int ret, i, invalid = 0, call_set_mtu = 0; + int call_link_downgrade_policy = 0; + + if (num_ports != 1) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + opa_lid = be32_to_cpu(pi->lid); + if (opa_lid & 0xFFFF0000) { + pr_warn("OPA_PortInfo lid out of range: %X\n", opa_lid); + smp->status |= IB_SMP_INVALID_FIELD; + goto get_only; + } + + lid = (u16)(opa_lid & 0x0000FFFF); + + smlid = be32_to_cpu(pi->sm_lid); + if (smlid & 0xFFFF0000) { + pr_warn("OPA_PortInfo SM lid out of range: %X\n", smlid); + smp->status |= IB_SMP_INVALID_FIELD; + goto get_only; + } + smlid &= 0x0000FFFF; + + clientrereg = (pi->clientrereg_subnettimeout & + OPA_PI_MASK_CLIENT_REREGISTER); + + dd = dd_from_ibdev(ibdev); + /* IB numbers ports from 1, hw from 0 */ + ppd = dd->pport + (port - 1); + ibp = &ppd->ibport_data; + event.device = ibdev; + event.element.port_num = port; + + ls_old = driver_lstate(ppd); + + ibp->rvp.mkey = pi->mkey; + ibp->rvp.gid_prefix = pi->subnet_prefix; + ibp->rvp.mkey_lease_period = be16_to_cpu(pi->mkey_lease_period); + + /* Must be a valid unicast LID address. */ + if ((lid == 0 && ls_old > IB_PORT_INIT) || + lid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) { + smp->status |= IB_SMP_INVALID_FIELD; + pr_warn("SubnSet(OPA_PortInfo) lid invalid 0x%x\n", + lid); + } else if (ppd->lid != lid || + ppd->lmc != (pi->mkeyprotect_lmc & OPA_PI_MASK_LMC)) { + if (ppd->lid != lid) + hfi1_set_uevent_bits(ppd, _HFI1_EVENT_LID_CHANGE_BIT); + if (ppd->lmc != (pi->mkeyprotect_lmc & OPA_PI_MASK_LMC)) + hfi1_set_uevent_bits(ppd, _HFI1_EVENT_LMC_CHANGE_BIT); + hfi1_set_lid(ppd, lid, pi->mkeyprotect_lmc & OPA_PI_MASK_LMC); + event.event = IB_EVENT_LID_CHANGE; + ib_dispatch_event(&event); + } + + msl = pi->smsl & OPA_PI_MASK_SMSL; + if (pi->partenforce_filterraw & OPA_PI_MASK_LINKINIT_REASON) + ppd->linkinit_reason = + (pi->partenforce_filterraw & + OPA_PI_MASK_LINKINIT_REASON); + /* enable/disable SW pkey checking as per FM control */ + if (pi->partenforce_filterraw & OPA_PI_MASK_PARTITION_ENFORCE_IN) + ppd->part_enforce |= HFI1_PART_ENFORCE_IN; + else + ppd->part_enforce &= ~HFI1_PART_ENFORCE_IN; + + if (pi->partenforce_filterraw & OPA_PI_MASK_PARTITION_ENFORCE_OUT) + ppd->part_enforce |= HFI1_PART_ENFORCE_OUT; + else + ppd->part_enforce &= ~HFI1_PART_ENFORCE_OUT; + + /* Must be a valid unicast LID address. */ + if ((smlid == 0 && ls_old > IB_PORT_INIT) || + smlid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) { + smp->status |= IB_SMP_INVALID_FIELD; + pr_warn("SubnSet(OPA_PortInfo) smlid invalid 0x%x\n", smlid); + } else if (smlid != ibp->rvp.sm_lid || msl != ibp->rvp.sm_sl) { + pr_warn("SubnSet(OPA_PortInfo) smlid 0x%x\n", smlid); + spin_lock_irqsave(&ibp->rvp.lock, flags); + if (ibp->rvp.sm_ah) { + if (smlid != ibp->rvp.sm_lid) + ibp->rvp.sm_ah->attr.dlid = smlid; + if (msl != ibp->rvp.sm_sl) + ibp->rvp.sm_ah->attr.sl = msl; + } + spin_unlock_irqrestore(&ibp->rvp.lock, flags); + if (smlid != ibp->rvp.sm_lid) + ibp->rvp.sm_lid = smlid; + if (msl != ibp->rvp.sm_sl) + ibp->rvp.sm_sl = msl; + event.event = IB_EVENT_SM_CHANGE; + ib_dispatch_event(&event); + } + + if (pi->link_down_reason == 0) { + ppd->local_link_down_reason.sma = 0; + ppd->local_link_down_reason.latest = 0; + } + + if (pi->neigh_link_down_reason == 0) { + ppd->neigh_link_down_reason.sma = 0; + ppd->neigh_link_down_reason.latest = 0; + } + + ppd->sm_trap_qp = be32_to_cpu(pi->sm_trap_qp); + ppd->sa_qp = be32_to_cpu(pi->sa_qp); + + ppd->port_error_action = be32_to_cpu(pi->port_error_action); + lwe = be16_to_cpu(pi->link_width.enabled); + if (lwe) { + if (lwe == OPA_LINK_WIDTH_RESET || + lwe == OPA_LINK_WIDTH_RESET_OLD) + set_link_width_enabled(ppd, ppd->link_width_supported); + else if ((lwe & ~ppd->link_width_supported) == 0) + set_link_width_enabled(ppd, lwe); + else + smp->status |= IB_SMP_INVALID_FIELD; + } + lwe = be16_to_cpu(pi->link_width_downgrade.enabled); + /* LWD.E is always applied - 0 means "disabled" */ + if (lwe == OPA_LINK_WIDTH_RESET || + lwe == OPA_LINK_WIDTH_RESET_OLD) { + set_link_width_downgrade_enabled(ppd, + ppd-> + link_width_downgrade_supported + ); + } else if ((lwe & ~ppd->link_width_downgrade_supported) == 0) { + /* only set and apply if something changed */ + if (lwe != ppd->link_width_downgrade_enabled) { + set_link_width_downgrade_enabled(ppd, lwe); + call_link_downgrade_policy = 1; + } + } else { + smp->status |= IB_SMP_INVALID_FIELD; + } + lse = be16_to_cpu(pi->link_speed.enabled); + if (lse) { + if (lse & be16_to_cpu(pi->link_speed.supported)) + set_link_speed_enabled(ppd, lse); + else + smp->status |= IB_SMP_INVALID_FIELD; + } + + ibp->rvp.mkeyprot = + (pi->mkeyprotect_lmc & OPA_PI_MASK_MKEY_PROT_BIT) >> 6; + ibp->rvp.vl_high_limit = be16_to_cpu(pi->vl.high_limit) & 0xFF; + (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_VL_HIGH_LIMIT, + ibp->rvp.vl_high_limit); + + if (ppd->vls_supported / 2 > ARRAY_SIZE(pi->neigh_mtu.pvlx_to_mtu) || + ppd->vls_supported > ARRAY_SIZE(dd->vld)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + for (i = 0; i < ppd->vls_supported; i++) { + if ((i % 2) == 0) + mtu = enum_to_mtu((pi->neigh_mtu.pvlx_to_mtu[i / 2] >> + 4) & 0xF); + else + mtu = enum_to_mtu(pi->neigh_mtu.pvlx_to_mtu[i / 2] & + 0xF); + if (mtu == 0xffff) { + pr_warn("SubnSet(OPA_PortInfo) mtu invalid %d (0x%x)\n", + mtu, + (pi->neigh_mtu.pvlx_to_mtu[0] >> 4) & 0xF); + smp->status |= IB_SMP_INVALID_FIELD; + mtu = hfi1_max_mtu; /* use a valid MTU */ + } + if (dd->vld[i].mtu != mtu) { + dd_dev_info(dd, + "MTU change on vl %d from %d to %d\n", + i, dd->vld[i].mtu, mtu); + dd->vld[i].mtu = mtu; + call_set_mtu++; + } + } + /* As per OPAV1 spec: VL15 must support and be configured + * for operation with a 2048 or larger MTU. + */ + mtu = enum_to_mtu(pi->neigh_mtu.pvlx_to_mtu[15 / 2] & 0xF); + if (mtu < 2048 || mtu == 0xffff) + mtu = 2048; + if (dd->vld[15].mtu != mtu) { + dd_dev_info(dd, + "MTU change on vl 15 from %d to %d\n", + dd->vld[15].mtu, mtu); + dd->vld[15].mtu = mtu; + call_set_mtu++; + } + if (call_set_mtu) + set_mtu(ppd); + + /* Set operational VLs */ + vls = pi->operational_vls & OPA_PI_MASK_OPERATIONAL_VL; + if (vls) { + if (vls > ppd->vls_supported) { + pr_warn("SubnSet(OPA_PortInfo) VL's supported invalid %d\n", + pi->operational_vls); + smp->status |= IB_SMP_INVALID_FIELD; + } else { + if (hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_OP_VLS, + vls) == -EINVAL) + smp->status |= IB_SMP_INVALID_FIELD; + } + } + + if (pi->mkey_violations == 0) + ibp->rvp.mkey_violations = 0; + + if (pi->pkey_violations == 0) + ibp->rvp.pkey_violations = 0; + + if (pi->qkey_violations == 0) + ibp->rvp.qkey_violations = 0; + + ibp->rvp.subnet_timeout = + pi->clientrereg_subnettimeout & OPA_PI_MASK_SUBNET_TIMEOUT; + + crc_enabled = be16_to_cpu(pi->port_ltp_crc_mode); + crc_enabled >>= 4; + crc_enabled &= 0xf; + + if (crc_enabled != 0) + ppd->port_crc_mode_enabled = port_ltp_to_cap(crc_enabled); + + ppd->is_active_optimize_enabled = + !!(be16_to_cpu(pi->port_mode) + & OPA_PI_MASK_PORT_ACTIVE_OPTOMIZE); + + ls_new = pi->port_states.portphysstate_portstate & + OPA_PI_MASK_PORT_STATE; + ps_new = (pi->port_states.portphysstate_portstate & + OPA_PI_MASK_PORT_PHYSICAL_STATE) >> 4; + + if (ls_old == IB_PORT_INIT) { + if (start_of_sm_config) { + if (ls_new == ls_old || (ls_new == IB_PORT_ARMED)) + ppd->is_sm_config_started = 1; + } else if (ls_new == IB_PORT_ARMED) { + if (ppd->is_sm_config_started == 0) + invalid = 1; + } + } + + /* Handle CLIENT_REREGISTER event b/c SM asked us for it */ + if (clientrereg) { + event.event = IB_EVENT_CLIENT_REREGISTER; + ib_dispatch_event(&event); + } + + /* + * Do the port state change now that the other link parameters + * have been set. + * Changing the port physical state only makes sense if the link + * is down or is being set to down. + */ + + ret = set_port_states(ppd, smp, ls_new, ps_new, invalid); + if (ret) + return ret; + + ret = __subn_get_opa_portinfo(smp, am, data, ibdev, port, resp_len); + + /* restore re-reg bit per o14-12.2.1 */ + pi->clientrereg_subnettimeout |= clientrereg; + + /* + * Apply the new link downgrade policy. This may result in a link + * bounce. Do this after everything else so things are settled. + * Possible problem: if setting the port state above fails, then + * the policy change is not applied. + */ + if (call_link_downgrade_policy) + apply_link_downgrade_policy(ppd, 0); + + return ret; + +get_only: + return __subn_get_opa_portinfo(smp, am, data, ibdev, port, resp_len); +} + +/** + * set_pkeys - set the PKEY table for ctxt 0 + * @dd: the hfi1_ib device + * @port: the IB port number + * @pkeys: the PKEY table + */ +static int set_pkeys(struct hfi1_devdata *dd, u8 port, u16 *pkeys) +{ + struct hfi1_pportdata *ppd; + int i; + int changed = 0; + int update_includes_mgmt_partition = 0; + + /* + * IB port one/two always maps to context zero/one, + * always a kernel context, no locking needed + * If we get here with ppd setup, no need to check + * that rcd is valid. + */ + ppd = dd->pport + (port - 1); + /* + * If the update does not include the management pkey, don't do it. + */ + for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) { + if (pkeys[i] == LIM_MGMT_P_KEY) { + update_includes_mgmt_partition = 1; + break; + } + } + + if (!update_includes_mgmt_partition) + return 1; + + for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) { + u16 key = pkeys[i]; + u16 okey = ppd->pkeys[i]; + + if (key == okey) + continue; + /* + * Don't update pkeys[2], if an HFI port without MgmtAllowed + * by neighbor is a switch. + */ + if (i == 2 && !ppd->mgmt_allowed && ppd->neighbor_type == 1) + continue; + /* + * The SM gives us the complete PKey table. We have + * to ensure that we put the PKeys in the matching + * slots. + */ + ppd->pkeys[i] = key; + changed = 1; + } + + if (changed) { + (void)hfi1_set_ib_cfg(ppd, HFI1_IB_CFG_PKEYS, 0); + hfi1_event_pkey_change(dd, port); + } + + return 0; +} + +static int __subn_set_opa_pkeytable(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + u32 n_blocks_sent = OPA_AM_NBLK(am); + u32 start_block = am & 0x7ff; + u16 *p = (u16 *)data; + __be16 *q = (__be16 *)data; + int i; + u16 n_blocks_avail; + unsigned npkeys = hfi1_get_npkeys(dd); + + if (n_blocks_sent == 0) { + pr_warn("OPA Get PKey AM Invalid : P = %d; B = 0x%x; N = 0x%x\n", + port, start_block, n_blocks_sent); + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + n_blocks_avail = (u16)(npkeys / OPA_PARTITION_TABLE_BLK_SIZE) + 1; + + if (start_block + n_blocks_sent > n_blocks_avail || + n_blocks_sent > OPA_NUM_PKEY_BLOCKS_PER_SMP) { + pr_warn("OPA Set PKey AM Invalid : s 0x%x; req 0x%x; avail 0x%x; blk/smp 0x%lx\n", + start_block, n_blocks_sent, n_blocks_avail, + OPA_NUM_PKEY_BLOCKS_PER_SMP); + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + for (i = 0; i < n_blocks_sent * OPA_PARTITION_TABLE_BLK_SIZE; i++) + p[i] = be16_to_cpu(q[i]); + + if (start_block == 0 && set_pkeys(dd, port, p) != 0) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + return __subn_get_opa_pkeytable(smp, am, data, ibdev, port, resp_len); +} + +static int get_sc2vlt_tables(struct hfi1_devdata *dd, void *data) +{ + u64 *val = data; + + *val++ = read_csr(dd, SEND_SC2VLT0); + *val++ = read_csr(dd, SEND_SC2VLT1); + *val++ = read_csr(dd, SEND_SC2VLT2); + *val++ = read_csr(dd, SEND_SC2VLT3); + return 0; +} + +#define ILLEGAL_VL 12 +/* + * filter_sc2vlt changes mappings to VL15 to ILLEGAL_VL (except + * for SC15, which must map to VL15). If we don't remap things this + * way it is possible for VL15 counters to increment when we try to + * send on a SC which is mapped to an invalid VL. + */ +static void filter_sc2vlt(void *data) +{ + int i; + u8 *pd = data; + + for (i = 0; i < OPA_MAX_SCS; i++) { + if (i == 15) + continue; + if ((pd[i] & 0x1f) == 0xf) + pd[i] = ILLEGAL_VL; + } +} + +static int set_sc2vlt_tables(struct hfi1_devdata *dd, void *data) +{ + u64 *val = data; + + filter_sc2vlt(data); + + write_csr(dd, SEND_SC2VLT0, *val++); + write_csr(dd, SEND_SC2VLT1, *val++); + write_csr(dd, SEND_SC2VLT2, *val++); + write_csr(dd, SEND_SC2VLT3, *val++); + write_seqlock_irq(&dd->sc2vl_lock); + memcpy(dd->sc2vl, data, sizeof(dd->sc2vl)); + write_sequnlock_irq(&dd->sc2vl_lock); + return 0; +} + +static int __subn_get_opa_sl_to_sc(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + struct hfi1_ibport *ibp = to_iport(ibdev, port); + u8 *p = data; + size_t size = ARRAY_SIZE(ibp->sl_to_sc); /* == 32 */ + unsigned i; + + if (am) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + for (i = 0; i < ARRAY_SIZE(ibp->sl_to_sc); i++) + *p++ = ibp->sl_to_sc[i]; + + if (resp_len) + *resp_len += size; + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_set_opa_sl_to_sc(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + struct hfi1_ibport *ibp = to_iport(ibdev, port); + u8 *p = data; + int i; + u8 sc; + + if (am) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + for (i = 0; i < ARRAY_SIZE(ibp->sl_to_sc); i++) { + sc = *p++; + if (ibp->sl_to_sc[i] != sc) { + ibp->sl_to_sc[i] = sc; + + /* Put all stale qps into error state */ + hfi1_error_port_qps(ibp, i); + } + } + + return __subn_get_opa_sl_to_sc(smp, am, data, ibdev, port, resp_len); +} + +static int __subn_get_opa_sc_to_sl(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + struct hfi1_ibport *ibp = to_iport(ibdev, port); + u8 *p = data; + size_t size = ARRAY_SIZE(ibp->sc_to_sl); /* == 32 */ + unsigned i; + + if (am) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + for (i = 0; i < ARRAY_SIZE(ibp->sc_to_sl); i++) + *p++ = ibp->sc_to_sl[i]; + + if (resp_len) + *resp_len += size; + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_set_opa_sc_to_sl(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + struct hfi1_ibport *ibp = to_iport(ibdev, port); + u8 *p = data; + int i; + + if (am) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + for (i = 0; i < ARRAY_SIZE(ibp->sc_to_sl); i++) + ibp->sc_to_sl[i] = *p++; + + return __subn_get_opa_sc_to_sl(smp, am, data, ibdev, port, resp_len); +} + +static int __subn_get_opa_sc_to_vlt(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + u32 n_blocks = OPA_AM_NBLK(am); + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + void *vp = (void *)data; + size_t size = 4 * sizeof(u64); + + if (n_blocks != 1) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + get_sc2vlt_tables(dd, vp); + + if (resp_len) + *resp_len += size; + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_set_opa_sc_to_vlt(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + u32 n_blocks = OPA_AM_NBLK(am); + int async_update = OPA_AM_ASYNC(am); + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + void *vp = (void *)data; + struct hfi1_pportdata *ppd; + int lstate; + + if (n_blocks != 1 || async_update) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + /* IB numbers ports from 1, hw from 0 */ + ppd = dd->pport + (port - 1); + lstate = driver_lstate(ppd); + /* + * it's known that async_update is 0 by this point, but include + * the explicit check for clarity + */ + if (!async_update && + (lstate == IB_PORT_ARMED || lstate == IB_PORT_ACTIVE)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + set_sc2vlt_tables(dd, vp); + + return __subn_get_opa_sc_to_vlt(smp, am, data, ibdev, port, resp_len); +} + +static int __subn_get_opa_sc_to_vlnt(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + u32 n_blocks = OPA_AM_NPORT(am); + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct hfi1_pportdata *ppd; + void *vp = (void *)data; + int size; + + if (n_blocks != 1) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + ppd = dd->pport + (port - 1); + + size = fm_get_table(ppd, FM_TBL_SC2VLNT, vp); + + if (resp_len) + *resp_len += size; + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_set_opa_sc_to_vlnt(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + u32 n_blocks = OPA_AM_NPORT(am); + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct hfi1_pportdata *ppd; + void *vp = (void *)data; + int lstate; + + if (n_blocks != 1) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + /* IB numbers ports from 1, hw from 0 */ + ppd = dd->pport + (port - 1); + lstate = driver_lstate(ppd); + if (lstate == IB_PORT_ARMED || lstate == IB_PORT_ACTIVE) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + ppd = dd->pport + (port - 1); + + fm_set_table(ppd, FM_TBL_SC2VLNT, vp); + + return __subn_get_opa_sc_to_vlnt(smp, am, data, ibdev, port, + resp_len); +} + +static int __subn_get_opa_psi(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + u32 nports = OPA_AM_NPORT(am); + u32 start_of_sm_config = OPA_AM_START_SM_CFG(am); + u32 lstate; + struct hfi1_ibport *ibp; + struct hfi1_pportdata *ppd; + struct opa_port_state_info *psi = (struct opa_port_state_info *)data; + + if (nports != 1) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + ibp = to_iport(ibdev, port); + ppd = ppd_from_ibp(ibp); + + lstate = driver_lstate(ppd); + + if (start_of_sm_config && (lstate == IB_PORT_INIT)) + ppd->is_sm_config_started = 1; + +#if PI_LED_ENABLE_SUP + psi->port_states.ledenable_offlinereason = ppd->neighbor_normal << 4; + psi->port_states.ledenable_offlinereason |= + ppd->is_sm_config_started << 5; + psi->port_states.ledenable_offlinereason |= + ppd->offline_disabled_reason; +#else + psi->port_states.offline_reason = ppd->neighbor_normal << 4; + psi->port_states.offline_reason |= ppd->is_sm_config_started << 5; + psi->port_states.offline_reason |= ppd->offline_disabled_reason; +#endif /* PI_LED_ENABLE_SUP */ + + psi->port_states.portphysstate_portstate = + (hfi1_ibphys_portstate(ppd) << 4) | (lstate & 0xf); + psi->link_width_downgrade_tx_active = + cpu_to_be16(ppd->link_width_downgrade_tx_active); + psi->link_width_downgrade_rx_active = + cpu_to_be16(ppd->link_width_downgrade_rx_active); + if (resp_len) + *resp_len += sizeof(struct opa_port_state_info); + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_set_opa_psi(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + u32 nports = OPA_AM_NPORT(am); + u32 start_of_sm_config = OPA_AM_START_SM_CFG(am); + u32 ls_old; + u8 ls_new, ps_new; + struct hfi1_ibport *ibp; + struct hfi1_pportdata *ppd; + struct opa_port_state_info *psi = (struct opa_port_state_info *)data; + int ret, invalid = 0; + + if (nports != 1) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + ibp = to_iport(ibdev, port); + ppd = ppd_from_ibp(ibp); + + ls_old = driver_lstate(ppd); + + ls_new = port_states_to_logical_state(&psi->port_states); + ps_new = port_states_to_phys_state(&psi->port_states); + + if (ls_old == IB_PORT_INIT) { + if (start_of_sm_config) { + if (ls_new == ls_old || (ls_new == IB_PORT_ARMED)) + ppd->is_sm_config_started = 1; + } else if (ls_new == IB_PORT_ARMED) { + if (ppd->is_sm_config_started == 0) + invalid = 1; + } + } + + ret = set_port_states(ppd, smp, ls_new, ps_new, invalid); + if (ret) + return ret; + + if (invalid) + smp->status |= IB_SMP_INVALID_FIELD; + + return __subn_get_opa_psi(smp, am, data, ibdev, port, resp_len); +} + +static int __subn_get_opa_cable_info(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + u32 addr = OPA_AM_CI_ADDR(am); + u32 len = OPA_AM_CI_LEN(am) + 1; + int ret; + +#define __CI_PAGE_SIZE BIT(7) /* 128 bytes */ +#define __CI_PAGE_MASK ~(__CI_PAGE_SIZE - 1) +#define __CI_PAGE_NUM(a) ((a) & __CI_PAGE_MASK) + + /* + * check that addr is within spec, and + * addr and (addr + len - 1) are on the same "page" + */ + if (addr >= 4096 || + (__CI_PAGE_NUM(addr) != __CI_PAGE_NUM(addr + len - 1))) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + ret = get_cable_info(dd, port, addr, len, data); + + if (ret == -ENODEV) { + smp->status |= IB_SMP_UNSUP_METH_ATTR; + return reply((struct ib_mad_hdr *)smp); + } + + /* The address range for the CableInfo SMA query is wider than the + * memory available on the QSFP cable. We want to return a valid + * response, albeit zeroed out, for address ranges beyond available + * memory but that are within the CableInfo query spec + */ + if (ret < 0 && ret != -ERANGE) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + if (resp_len) + *resp_len += len; + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_get_opa_bct(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, u32 *resp_len) +{ + u32 num_ports = OPA_AM_NPORT(am); + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct hfi1_pportdata *ppd; + struct buffer_control *p = (struct buffer_control *)data; + int size; + + if (num_ports != 1) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + ppd = dd->pport + (port - 1); + size = fm_get_table(ppd, FM_TBL_BUFFER_CONTROL, p); + trace_bct_get(dd, p); + if (resp_len) + *resp_len += size; + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_set_opa_bct(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, u32 *resp_len) +{ + u32 num_ports = OPA_AM_NPORT(am); + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct hfi1_pportdata *ppd; + struct buffer_control *p = (struct buffer_control *)data; + + if (num_ports != 1) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + ppd = dd->pport + (port - 1); + trace_bct_set(dd, p); + if (fm_set_table(ppd, FM_TBL_BUFFER_CONTROL, p) < 0) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + return __subn_get_opa_bct(smp, am, data, ibdev, port, resp_len); +} + +static int __subn_get_opa_vl_arb(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + struct hfi1_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port)); + u32 num_ports = OPA_AM_NPORT(am); + u8 section = (am & 0x00ff0000) >> 16; + u8 *p = data; + int size = 0; + + if (num_ports != 1) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + switch (section) { + case OPA_VLARB_LOW_ELEMENTS: + size = fm_get_table(ppd, FM_TBL_VL_LOW_ARB, p); + break; + case OPA_VLARB_HIGH_ELEMENTS: + size = fm_get_table(ppd, FM_TBL_VL_HIGH_ARB, p); + break; + case OPA_VLARB_PREEMPT_ELEMENTS: + size = fm_get_table(ppd, FM_TBL_VL_PREEMPT_ELEMS, p); + break; + case OPA_VLARB_PREEMPT_MATRIX: + size = fm_get_table(ppd, FM_TBL_VL_PREEMPT_MATRIX, p); + break; + default: + pr_warn("OPA SubnGet(VL Arb) AM Invalid : 0x%x\n", + be32_to_cpu(smp->attr_mod)); + smp->status |= IB_SMP_INVALID_FIELD; + break; + } + + if (size > 0 && resp_len) + *resp_len += size; + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_set_opa_vl_arb(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + struct hfi1_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port)); + u32 num_ports = OPA_AM_NPORT(am); + u8 section = (am & 0x00ff0000) >> 16; + u8 *p = data; + + if (num_ports != 1) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + switch (section) { + case OPA_VLARB_LOW_ELEMENTS: + (void)fm_set_table(ppd, FM_TBL_VL_LOW_ARB, p); + break; + case OPA_VLARB_HIGH_ELEMENTS: + (void)fm_set_table(ppd, FM_TBL_VL_HIGH_ARB, p); + break; + /* + * neither OPA_VLARB_PREEMPT_ELEMENTS, or OPA_VLARB_PREEMPT_MATRIX + * can be changed from the default values + */ + case OPA_VLARB_PREEMPT_ELEMENTS: + /* FALLTHROUGH */ + case OPA_VLARB_PREEMPT_MATRIX: + smp->status |= IB_SMP_UNSUP_METH_ATTR; + break; + default: + pr_warn("OPA SubnSet(VL Arb) AM Invalid : 0x%x\n", + be32_to_cpu(smp->attr_mod)); + smp->status |= IB_SMP_INVALID_FIELD; + break; + } + + return __subn_get_opa_vl_arb(smp, am, data, ibdev, port, resp_len); +} + +struct opa_pma_mad { + struct ib_mad_hdr mad_hdr; + u8 data[2024]; +} __packed; + +struct opa_class_port_info { + u8 base_version; + u8 class_version; + __be16 cap_mask; + __be32 cap_mask2_resp_time; + + u8 redirect_gid[16]; + __be32 redirect_tc_fl; + __be32 redirect_lid; + __be32 redirect_sl_qp; + __be32 redirect_qkey; + + u8 trap_gid[16]; + __be32 trap_tc_fl; + __be32 trap_lid; + __be32 trap_hl_qp; + __be32 trap_qkey; + + __be16 trap_pkey; + __be16 redirect_pkey; + + u8 trap_sl_rsvd; + u8 reserved[3]; +} __packed; + +struct opa_port_status_req { + __u8 port_num; + __u8 reserved[3]; + __be32 vl_select_mask; +}; + +#define VL_MASK_ALL 0x000080ff + +struct opa_port_status_rsp { + __u8 port_num; + __u8 reserved[3]; + __be32 vl_select_mask; + + /* Data counters */ + __be64 port_xmit_data; + __be64 port_rcv_data; + __be64 port_xmit_pkts; + __be64 port_rcv_pkts; + __be64 port_multicast_xmit_pkts; + __be64 port_multicast_rcv_pkts; + __be64 port_xmit_wait; + __be64 sw_port_congestion; + __be64 port_rcv_fecn; + __be64 port_rcv_becn; + __be64 port_xmit_time_cong; + __be64 port_xmit_wasted_bw; + __be64 port_xmit_wait_data; + __be64 port_rcv_bubble; + __be64 port_mark_fecn; + /* Error counters */ + __be64 port_rcv_constraint_errors; + __be64 port_rcv_switch_relay_errors; + __be64 port_xmit_discards; + __be64 port_xmit_constraint_errors; + __be64 port_rcv_remote_physical_errors; + __be64 local_link_integrity_errors; + __be64 port_rcv_errors; + __be64 excessive_buffer_overruns; + __be64 fm_config_errors; + __be32 link_error_recovery; + __be32 link_downed; + u8 uncorrectable_errors; + + u8 link_quality_indicator; /* 5res, 3bit */ + u8 res2[6]; + struct _vls_pctrs { + /* per-VL Data counters */ + __be64 port_vl_xmit_data; + __be64 port_vl_rcv_data; + __be64 port_vl_xmit_pkts; + __be64 port_vl_rcv_pkts; + __be64 port_vl_xmit_wait; + __be64 sw_port_vl_congestion; + __be64 port_vl_rcv_fecn; + __be64 port_vl_rcv_becn; + __be64 port_xmit_time_cong; + __be64 port_vl_xmit_wasted_bw; + __be64 port_vl_xmit_wait_data; + __be64 port_vl_rcv_bubble; + __be64 port_vl_mark_fecn; + __be64 port_vl_xmit_discards; + } vls[0]; /* real array size defined by # bits set in vl_select_mask */ +}; + +enum counter_selects { + CS_PORT_XMIT_DATA = (1 << 31), + CS_PORT_RCV_DATA = (1 << 30), + CS_PORT_XMIT_PKTS = (1 << 29), + CS_PORT_RCV_PKTS = (1 << 28), + CS_PORT_MCAST_XMIT_PKTS = (1 << 27), + CS_PORT_MCAST_RCV_PKTS = (1 << 26), + CS_PORT_XMIT_WAIT = (1 << 25), + CS_SW_PORT_CONGESTION = (1 << 24), + CS_PORT_RCV_FECN = (1 << 23), + CS_PORT_RCV_BECN = (1 << 22), + CS_PORT_XMIT_TIME_CONG = (1 << 21), + CS_PORT_XMIT_WASTED_BW = (1 << 20), + CS_PORT_XMIT_WAIT_DATA = (1 << 19), + CS_PORT_RCV_BUBBLE = (1 << 18), + CS_PORT_MARK_FECN = (1 << 17), + CS_PORT_RCV_CONSTRAINT_ERRORS = (1 << 16), + CS_PORT_RCV_SWITCH_RELAY_ERRORS = (1 << 15), + CS_PORT_XMIT_DISCARDS = (1 << 14), + CS_PORT_XMIT_CONSTRAINT_ERRORS = (1 << 13), + CS_PORT_RCV_REMOTE_PHYSICAL_ERRORS = (1 << 12), + CS_LOCAL_LINK_INTEGRITY_ERRORS = (1 << 11), + CS_PORT_RCV_ERRORS = (1 << 10), + CS_EXCESSIVE_BUFFER_OVERRUNS = (1 << 9), + CS_FM_CONFIG_ERRORS = (1 << 8), + CS_LINK_ERROR_RECOVERY = (1 << 7), + CS_LINK_DOWNED = (1 << 6), + CS_UNCORRECTABLE_ERRORS = (1 << 5), +}; + +struct opa_clear_port_status { + __be64 port_select_mask[4]; + __be32 counter_select_mask; +}; + +struct opa_aggregate { + __be16 attr_id; + __be16 err_reqlength; /* 1 bit, 8 res, 7 bit */ + __be32 attr_mod; + u8 data[0]; +}; + +#define MSK_LLI 0x000000f0 +#define MSK_LLI_SFT 4 +#define MSK_LER 0x0000000f +#define MSK_LER_SFT 0 +#define ADD_LLI 8 +#define ADD_LER 2 + +/* Request contains first three fields, response contains those plus the rest */ +struct opa_port_data_counters_msg { + __be64 port_select_mask[4]; + __be32 vl_select_mask; + __be32 resolution; + + /* Response fields follow */ + struct _port_dctrs { + u8 port_number; + u8 reserved2[3]; + __be32 link_quality_indicator; /* 29res, 3bit */ + + /* Data counters */ + __be64 port_xmit_data; + __be64 port_rcv_data; + __be64 port_xmit_pkts; + __be64 port_rcv_pkts; + __be64 port_multicast_xmit_pkts; + __be64 port_multicast_rcv_pkts; + __be64 port_xmit_wait; + __be64 sw_port_congestion; + __be64 port_rcv_fecn; + __be64 port_rcv_becn; + __be64 port_xmit_time_cong; + __be64 port_xmit_wasted_bw; + __be64 port_xmit_wait_data; + __be64 port_rcv_bubble; + __be64 port_mark_fecn; + + __be64 port_error_counter_summary; + /* Sum of error counts/port */ + + struct _vls_dctrs { + /* per-VL Data counters */ + __be64 port_vl_xmit_data; + __be64 port_vl_rcv_data; + __be64 port_vl_xmit_pkts; + __be64 port_vl_rcv_pkts; + __be64 port_vl_xmit_wait; + __be64 sw_port_vl_congestion; + __be64 port_vl_rcv_fecn; + __be64 port_vl_rcv_becn; + __be64 port_xmit_time_cong; + __be64 port_vl_xmit_wasted_bw; + __be64 port_vl_xmit_wait_data; + __be64 port_vl_rcv_bubble; + __be64 port_vl_mark_fecn; + } vls[0]; + /* array size defined by #bits set in vl_select_mask*/ + } port[1]; /* array size defined by #ports in attribute modifier */ +}; + +struct opa_port_error_counters64_msg { + /* + * Request contains first two fields, response contains the + * whole magilla + */ + __be64 port_select_mask[4]; + __be32 vl_select_mask; + + /* Response-only fields follow */ + __be32 reserved1; + struct _port_ectrs { + u8 port_number; + u8 reserved2[7]; + __be64 port_rcv_constraint_errors; + __be64 port_rcv_switch_relay_errors; + __be64 port_xmit_discards; + __be64 port_xmit_constraint_errors; + __be64 port_rcv_remote_physical_errors; + __be64 local_link_integrity_errors; + __be64 port_rcv_errors; + __be64 excessive_buffer_overruns; + __be64 fm_config_errors; + __be32 link_error_recovery; + __be32 link_downed; + u8 uncorrectable_errors; + u8 reserved3[7]; + struct _vls_ectrs { + __be64 port_vl_xmit_discards; + } vls[0]; + /* array size defined by #bits set in vl_select_mask */ + } port[1]; /* array size defined by #ports in attribute modifier */ +}; + +struct opa_port_error_info_msg { + __be64 port_select_mask[4]; + __be32 error_info_select_mask; + __be32 reserved1; + struct _port_ei { + u8 port_number; + u8 reserved2[7]; + + /* PortRcvErrorInfo */ + struct { + u8 status_and_code; + union { + u8 raw[17]; + struct { + /* EI1to12 format */ + u8 packet_flit1[8]; + u8 packet_flit2[8]; + u8 remaining_flit_bits12; + } ei1to12; + struct { + u8 packet_bytes[8]; + u8 remaining_flit_bits; + } ei13; + } ei; + u8 reserved3[6]; + } __packed port_rcv_ei; + + /* ExcessiveBufferOverrunInfo */ + struct { + u8 status_and_sc; + u8 reserved4[7]; + } __packed excessive_buffer_overrun_ei; + + /* PortXmitConstraintErrorInfo */ + struct { + u8 status; + u8 reserved5; + __be16 pkey; + __be32 slid; + } __packed port_xmit_constraint_ei; + + /* PortRcvConstraintErrorInfo */ + struct { + u8 status; + u8 reserved6; + __be16 pkey; + __be32 slid; + } __packed port_rcv_constraint_ei; + + /* PortRcvSwitchRelayErrorInfo */ + struct { + u8 status_and_code; + u8 reserved7[3]; + __u32 error_info; + } __packed port_rcv_switch_relay_ei; + + /* UncorrectableErrorInfo */ + struct { + u8 status_and_code; + u8 reserved8; + } __packed uncorrectable_ei; + + /* FMConfigErrorInfo */ + struct { + u8 status_and_code; + u8 error_info; + } __packed fm_config_ei; + __u32 reserved9; + } port[1]; /* actual array size defined by #ports in attr modifier */ +}; + +/* opa_port_error_info_msg error_info_select_mask bit definitions */ +enum error_info_selects { + ES_PORT_RCV_ERROR_INFO = (1 << 31), + ES_EXCESSIVE_BUFFER_OVERRUN_INFO = (1 << 30), + ES_PORT_XMIT_CONSTRAINT_ERROR_INFO = (1 << 29), + ES_PORT_RCV_CONSTRAINT_ERROR_INFO = (1 << 28), + ES_PORT_RCV_SWITCH_RELAY_ERROR_INFO = (1 << 27), + ES_UNCORRECTABLE_ERROR_INFO = (1 << 26), + ES_FM_CONFIG_ERROR_INFO = (1 << 25) +}; + +static int pma_get_opa_classportinfo(struct opa_pma_mad *pmp, + struct ib_device *ibdev, u32 *resp_len) +{ + struct opa_class_port_info *p = + (struct opa_class_port_info *)pmp->data; + + memset(pmp->data, 0, sizeof(pmp->data)); + + if (pmp->mad_hdr.attr_mod != 0) + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + + p->base_version = OPA_MGMT_BASE_VERSION; + p->class_version = OPA_SMI_CLASS_VERSION; + /* + * Expected response time is 4.096 usec. * 2^18 == 1.073741824 sec. + */ + p->cap_mask2_resp_time = cpu_to_be32(18); + + if (resp_len) + *resp_len += sizeof(*p); + + return reply((struct ib_mad_hdr *)pmp); +} + +static void a0_portstatus(struct hfi1_pportdata *ppd, + struct opa_port_status_rsp *rsp, u32 vl_select_mask) +{ + if (!is_bx(ppd->dd)) { + unsigned long vl; + u64 sum_vl_xmit_wait = 0; + u32 vl_all_mask = VL_MASK_ALL; + + for_each_set_bit(vl, (unsigned long *)&(vl_all_mask), + 8 * sizeof(vl_all_mask)) { + u64 tmp = sum_vl_xmit_wait + + read_port_cntr(ppd, C_TX_WAIT_VL, + idx_from_vl(vl)); + if (tmp < sum_vl_xmit_wait) { + /* we wrapped */ + sum_vl_xmit_wait = (u64)~0; + break; + } + sum_vl_xmit_wait = tmp; + } + if (be64_to_cpu(rsp->port_xmit_wait) > sum_vl_xmit_wait) + rsp->port_xmit_wait = cpu_to_be64(sum_vl_xmit_wait); + } +} + +static int pma_get_opa_portstatus(struct opa_pma_mad *pmp, + struct ib_device *ibdev, + u8 port, u32 *resp_len) +{ + struct opa_port_status_req *req = + (struct opa_port_status_req *)pmp->data; + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct opa_port_status_rsp *rsp; + u32 vl_select_mask = be32_to_cpu(req->vl_select_mask); + unsigned long vl; + size_t response_data_size; + u32 nports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24; + u8 port_num = req->port_num; + u8 num_vls = hweight32(vl_select_mask); + struct _vls_pctrs *vlinfo; + struct hfi1_ibport *ibp = to_iport(ibdev, port); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + int vfi; + u64 tmp, tmp2; + + response_data_size = sizeof(struct opa_port_status_rsp) + + num_vls * sizeof(struct _vls_pctrs); + if (response_data_size > sizeof(pmp->data)) { + pmp->mad_hdr.status |= OPA_PM_STATUS_REQUEST_TOO_LARGE; + return reply((struct ib_mad_hdr *)pmp); + } + + if (nports != 1 || (port_num && port_num != port) || + num_vls > OPA_MAX_VLS || (vl_select_mask & ~VL_MASK_ALL)) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + + memset(pmp->data, 0, sizeof(pmp->data)); + + rsp = (struct opa_port_status_rsp *)pmp->data; + if (port_num) + rsp->port_num = port_num; + else + rsp->port_num = port; + + rsp->port_rcv_constraint_errors = + cpu_to_be64(read_port_cntr(ppd, C_SW_RCV_CSTR_ERR, + CNTR_INVALID_VL)); + + hfi1_read_link_quality(dd, &rsp->link_quality_indicator); + + rsp->vl_select_mask = cpu_to_be32(vl_select_mask); + rsp->port_xmit_data = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_FLITS, + CNTR_INVALID_VL)); + rsp->port_rcv_data = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FLITS, + CNTR_INVALID_VL)); + rsp->port_xmit_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_PKTS, + CNTR_INVALID_VL)); + rsp->port_rcv_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_PKTS, + CNTR_INVALID_VL)); + rsp->port_multicast_xmit_pkts = + cpu_to_be64(read_dev_cntr(dd, C_DC_MC_XMIT_PKTS, + CNTR_INVALID_VL)); + rsp->port_multicast_rcv_pkts = + cpu_to_be64(read_dev_cntr(dd, C_DC_MC_RCV_PKTS, + CNTR_INVALID_VL)); + rsp->port_xmit_wait = + cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT, CNTR_INVALID_VL)); + rsp->port_rcv_fecn = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN, CNTR_INVALID_VL)); + rsp->port_rcv_becn = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN, CNTR_INVALID_VL)); + rsp->port_xmit_discards = + cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_DSCD, + CNTR_INVALID_VL)); + rsp->port_xmit_constraint_errors = + cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_CSTR_ERR, + CNTR_INVALID_VL)); + rsp->port_rcv_remote_physical_errors = + cpu_to_be64(read_dev_cntr(dd, C_DC_RMT_PHY_ERR, + CNTR_INVALID_VL)); + tmp = read_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL); + tmp2 = tmp + read_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL); + if (tmp2 < tmp) { + /* overflow/wrapped */ + rsp->local_link_integrity_errors = cpu_to_be64(~0); + } else { + rsp->local_link_integrity_errors = cpu_to_be64(tmp2); + } + tmp = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL); + tmp2 = tmp + read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, + CNTR_INVALID_VL); + if (tmp2 > (u32)UINT_MAX || tmp2 < tmp) { + /* overflow/wrapped */ + rsp->link_error_recovery = cpu_to_be32(~0); + } else { + rsp->link_error_recovery = cpu_to_be32(tmp2); + } + rsp->port_rcv_errors = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL)); + rsp->excessive_buffer_overruns = + cpu_to_be64(read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL)); + rsp->fm_config_errors = + cpu_to_be64(read_dev_cntr(dd, C_DC_FM_CFG_ERR, + CNTR_INVALID_VL)); + rsp->link_downed = cpu_to_be32(read_port_cntr(ppd, C_SW_LINK_DOWN, + CNTR_INVALID_VL)); + + /* rsp->uncorrectable_errors is 8 bits wide, and it pegs at 0xff */ + tmp = read_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL); + rsp->uncorrectable_errors = tmp < 0x100 ? (tmp & 0xff) : 0xff; + + vlinfo = &rsp->vls[0]; + vfi = 0; + /* The vl_select_mask has been checked above, and we know + * that it contains only entries which represent valid VLs. + * So in the for_each_set_bit() loop below, we don't need + * any additional checks for vl. + */ + for_each_set_bit(vl, (unsigned long *)&(vl_select_mask), + 8 * sizeof(vl_select_mask)) { + memset(vlinfo, 0, sizeof(*vlinfo)); + + tmp = read_dev_cntr(dd, C_DC_RX_FLIT_VL, idx_from_vl(vl)); + rsp->vls[vfi].port_vl_rcv_data = cpu_to_be64(tmp); + + rsp->vls[vfi].port_vl_rcv_pkts = + cpu_to_be64(read_dev_cntr(dd, C_DC_RX_PKT_VL, + idx_from_vl(vl))); + + rsp->vls[vfi].port_vl_xmit_data = + cpu_to_be64(read_port_cntr(ppd, C_TX_FLIT_VL, + idx_from_vl(vl))); + + rsp->vls[vfi].port_vl_xmit_pkts = + cpu_to_be64(read_port_cntr(ppd, C_TX_PKT_VL, + idx_from_vl(vl))); + + rsp->vls[vfi].port_vl_xmit_wait = + cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT_VL, + idx_from_vl(vl))); + + rsp->vls[vfi].port_vl_rcv_fecn = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN_VL, + idx_from_vl(vl))); + + rsp->vls[vfi].port_vl_rcv_becn = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN_VL, + idx_from_vl(vl))); + + vlinfo++; + vfi++; + } + + a0_portstatus(ppd, rsp, vl_select_mask); + + if (resp_len) + *resp_len += response_data_size; + + return reply((struct ib_mad_hdr *)pmp); +} + +static u64 get_error_counter_summary(struct ib_device *ibdev, u8 port, + u8 res_lli, u8 res_ler) +{ + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct hfi1_ibport *ibp = to_iport(ibdev, port); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u64 error_counter_summary = 0, tmp; + + error_counter_summary += read_port_cntr(ppd, C_SW_RCV_CSTR_ERR, + CNTR_INVALID_VL); + /* port_rcv_switch_relay_errors is 0 for HFIs */ + error_counter_summary += read_port_cntr(ppd, C_SW_XMIT_DSCD, + CNTR_INVALID_VL); + error_counter_summary += read_port_cntr(ppd, C_SW_XMIT_CSTR_ERR, + CNTR_INVALID_VL); + error_counter_summary += read_dev_cntr(dd, C_DC_RMT_PHY_ERR, + CNTR_INVALID_VL); + /* local link integrity must be right-shifted by the lli resolution */ + tmp = read_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL); + tmp += read_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL); + error_counter_summary += (tmp >> res_lli); + /* link error recovery must b right-shifted by the ler resolution */ + tmp = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL); + tmp += read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, CNTR_INVALID_VL); + error_counter_summary += (tmp >> res_ler); + error_counter_summary += read_dev_cntr(dd, C_DC_RCV_ERR, + CNTR_INVALID_VL); + error_counter_summary += read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL); + error_counter_summary += read_dev_cntr(dd, C_DC_FM_CFG_ERR, + CNTR_INVALID_VL); + /* ppd->link_downed is a 32-bit value */ + error_counter_summary += read_port_cntr(ppd, C_SW_LINK_DOWN, + CNTR_INVALID_VL); + tmp = read_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL); + /* this is an 8-bit quantity */ + error_counter_summary += tmp < 0x100 ? (tmp & 0xff) : 0xff; + + return error_counter_summary; +} + +static void a0_datacounters(struct hfi1_pportdata *ppd, struct _port_dctrs *rsp, + u32 vl_select_mask) +{ + if (!is_bx(ppd->dd)) { + unsigned long vl; + u64 sum_vl_xmit_wait = 0; + u32 vl_all_mask = VL_MASK_ALL; + + for_each_set_bit(vl, (unsigned long *)&(vl_all_mask), + 8 * sizeof(vl_all_mask)) { + u64 tmp = sum_vl_xmit_wait + + read_port_cntr(ppd, C_TX_WAIT_VL, + idx_from_vl(vl)); + if (tmp < sum_vl_xmit_wait) { + /* we wrapped */ + sum_vl_xmit_wait = (u64)~0; + break; + } + sum_vl_xmit_wait = tmp; + } + if (be64_to_cpu(rsp->port_xmit_wait) > sum_vl_xmit_wait) + rsp->port_xmit_wait = cpu_to_be64(sum_vl_xmit_wait); + } +} + +static void pma_get_opa_port_dctrs(struct ib_device *ibdev, + struct _port_dctrs *rsp) +{ + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + + rsp->port_xmit_data = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_FLITS, + CNTR_INVALID_VL)); + rsp->port_rcv_data = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FLITS, + CNTR_INVALID_VL)); + rsp->port_xmit_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_XMIT_PKTS, + CNTR_INVALID_VL)); + rsp->port_rcv_pkts = cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_PKTS, + CNTR_INVALID_VL)); + rsp->port_multicast_xmit_pkts = + cpu_to_be64(read_dev_cntr(dd, C_DC_MC_XMIT_PKTS, + CNTR_INVALID_VL)); + rsp->port_multicast_rcv_pkts = + cpu_to_be64(read_dev_cntr(dd, C_DC_MC_RCV_PKTS, + CNTR_INVALID_VL)); +} + +static int pma_get_opa_datacounters(struct opa_pma_mad *pmp, + struct ib_device *ibdev, + u8 port, u32 *resp_len) +{ + struct opa_port_data_counters_msg *req = + (struct opa_port_data_counters_msg *)pmp->data; + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct hfi1_ibport *ibp = to_iport(ibdev, port); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct _port_dctrs *rsp; + struct _vls_dctrs *vlinfo; + size_t response_data_size; + u32 num_ports; + u8 num_pslm; + u8 lq, num_vls; + u8 res_lli, res_ler; + u64 port_mask; + unsigned long port_num; + unsigned long vl; + u32 vl_select_mask; + int vfi; + + num_ports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24; + num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3])); + num_vls = hweight32(be32_to_cpu(req->vl_select_mask)); + vl_select_mask = be32_to_cpu(req->vl_select_mask); + res_lli = (u8)(be32_to_cpu(req->resolution) & MSK_LLI) >> MSK_LLI_SFT; + res_lli = res_lli ? res_lli + ADD_LLI : 0; + res_ler = (u8)(be32_to_cpu(req->resolution) & MSK_LER) >> MSK_LER_SFT; + res_ler = res_ler ? res_ler + ADD_LER : 0; + + if (num_ports != 1 || (vl_select_mask & ~VL_MASK_ALL)) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + + /* Sanity check */ + response_data_size = sizeof(struct opa_port_data_counters_msg) + + num_vls * sizeof(struct _vls_dctrs); + + if (response_data_size > sizeof(pmp->data)) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + + /* + * The bit set in the mask needs to be consistent with the + * port the request came in on. + */ + port_mask = be64_to_cpu(req->port_select_mask[3]); + port_num = find_first_bit((unsigned long *)&port_mask, + sizeof(port_mask)); + + if ((u8)port_num != port) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + + rsp = &req->port[0]; + memset(rsp, 0, sizeof(*rsp)); + + rsp->port_number = port; + /* + * Note that link_quality_indicator is a 32 bit quantity in + * 'datacounters' queries (as opposed to 'portinfo' queries, + * where it's a byte). + */ + hfi1_read_link_quality(dd, &lq); + rsp->link_quality_indicator = cpu_to_be32((u32)lq); + pma_get_opa_port_dctrs(ibdev, rsp); + + rsp->port_xmit_wait = + cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT, CNTR_INVALID_VL)); + rsp->port_rcv_fecn = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN, CNTR_INVALID_VL)); + rsp->port_rcv_becn = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN, CNTR_INVALID_VL)); + rsp->port_error_counter_summary = + cpu_to_be64(get_error_counter_summary(ibdev, port, + res_lli, res_ler)); + + vlinfo = &rsp->vls[0]; + vfi = 0; + /* The vl_select_mask has been checked above, and we know + * that it contains only entries which represent valid VLs. + * So in the for_each_set_bit() loop below, we don't need + * any additional checks for vl. + */ + for_each_set_bit(vl, (unsigned long *)&(vl_select_mask), + 8 * sizeof(req->vl_select_mask)) { + memset(vlinfo, 0, sizeof(*vlinfo)); + + rsp->vls[vfi].port_vl_xmit_data = + cpu_to_be64(read_port_cntr(ppd, C_TX_FLIT_VL, + idx_from_vl(vl))); + + rsp->vls[vfi].port_vl_rcv_data = + cpu_to_be64(read_dev_cntr(dd, C_DC_RX_FLIT_VL, + idx_from_vl(vl))); + + rsp->vls[vfi].port_vl_xmit_pkts = + cpu_to_be64(read_port_cntr(ppd, C_TX_PKT_VL, + idx_from_vl(vl))); + + rsp->vls[vfi].port_vl_rcv_pkts = + cpu_to_be64(read_dev_cntr(dd, C_DC_RX_PKT_VL, + idx_from_vl(vl))); + + rsp->vls[vfi].port_vl_xmit_wait = + cpu_to_be64(read_port_cntr(ppd, C_TX_WAIT_VL, + idx_from_vl(vl))); + + rsp->vls[vfi].port_vl_rcv_fecn = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_FCN_VL, + idx_from_vl(vl))); + rsp->vls[vfi].port_vl_rcv_becn = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_BCN_VL, + idx_from_vl(vl))); + + /* rsp->port_vl_xmit_time_cong is 0 for HFIs */ + /* rsp->port_vl_xmit_wasted_bw ??? */ + /* port_vl_xmit_wait_data - TXE (table 13-9 HFI spec) ??? + * does this differ from rsp->vls[vfi].port_vl_xmit_wait + */ + /*rsp->vls[vfi].port_vl_mark_fecn = + * cpu_to_be64(read_csr(dd, DCC_PRF_PORT_VL_MARK_FECN_CNT + * + offset)); + */ + vlinfo++; + vfi++; + } + + a0_datacounters(ppd, rsp, vl_select_mask); + + if (resp_len) + *resp_len += response_data_size; + + return reply((struct ib_mad_hdr *)pmp); +} + +static int pma_get_ib_portcounters_ext(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portcounters_ext *p = (struct ib_pma_portcounters_ext *) + pmp->data; + struct _port_dctrs rsp; + + if (pmp->mad_hdr.attr_mod != 0 || p->port_select != port) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + goto bail; + } + + memset(&rsp, 0, sizeof(rsp)); + pma_get_opa_port_dctrs(ibdev, &rsp); + + p->port_xmit_data = rsp.port_xmit_data; + p->port_rcv_data = rsp.port_rcv_data; + p->port_xmit_packets = rsp.port_xmit_pkts; + p->port_rcv_packets = rsp.port_rcv_pkts; + p->port_unicast_xmit_packets = 0; + p->port_unicast_rcv_packets = 0; + p->port_multicast_xmit_packets = rsp.port_multicast_xmit_pkts; + p->port_multicast_rcv_packets = rsp.port_multicast_rcv_pkts; + +bail: + return reply((struct ib_mad_hdr *)pmp); +} + +static void pma_get_opa_port_ectrs(struct ib_device *ibdev, + struct _port_ectrs *rsp, u8 port) +{ + u64 tmp, tmp2; + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct hfi1_ibport *ibp = to_iport(ibdev, port); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + + tmp = read_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL); + tmp2 = tmp + read_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, + CNTR_INVALID_VL); + if (tmp2 > (u32)UINT_MAX || tmp2 < tmp) { + /* overflow/wrapped */ + rsp->link_error_recovery = cpu_to_be32(~0); + } else { + rsp->link_error_recovery = cpu_to_be32(tmp2); + } + + rsp->link_downed = cpu_to_be32(read_port_cntr(ppd, C_SW_LINK_DOWN, + CNTR_INVALID_VL)); + rsp->port_rcv_errors = + cpu_to_be64(read_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL)); + rsp->port_rcv_remote_physical_errors = + cpu_to_be64(read_dev_cntr(dd, C_DC_RMT_PHY_ERR, + CNTR_INVALID_VL)); + rsp->port_rcv_switch_relay_errors = 0; + rsp->port_xmit_discards = + cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_DSCD, + CNTR_INVALID_VL)); + rsp->port_xmit_constraint_errors = + cpu_to_be64(read_port_cntr(ppd, C_SW_XMIT_CSTR_ERR, + CNTR_INVALID_VL)); + rsp->port_rcv_constraint_errors = + cpu_to_be64(read_port_cntr(ppd, C_SW_RCV_CSTR_ERR, + CNTR_INVALID_VL)); + tmp = read_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL); + tmp2 = tmp + read_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL); + if (tmp2 < tmp) { + /* overflow/wrapped */ + rsp->local_link_integrity_errors = cpu_to_be64(~0); + } else { + rsp->local_link_integrity_errors = cpu_to_be64(tmp2); + } + rsp->excessive_buffer_overruns = + cpu_to_be64(read_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL)); +} + +static int pma_get_opa_porterrors(struct opa_pma_mad *pmp, + struct ib_device *ibdev, + u8 port, u32 *resp_len) +{ + size_t response_data_size; + struct _port_ectrs *rsp; + u8 port_num; + struct opa_port_error_counters64_msg *req; + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + u32 num_ports; + u8 num_pslm; + u8 num_vls; + struct hfi1_ibport *ibp; + struct hfi1_pportdata *ppd; + struct _vls_ectrs *vlinfo; + unsigned long vl; + u64 port_mask, tmp; + u32 vl_select_mask; + int vfi; + + req = (struct opa_port_error_counters64_msg *)pmp->data; + + num_ports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24; + + num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3])); + num_vls = hweight32(be32_to_cpu(req->vl_select_mask)); + + if (num_ports != 1 || num_ports != num_pslm) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + + response_data_size = sizeof(struct opa_port_error_counters64_msg) + + num_vls * sizeof(struct _vls_ectrs); + + if (response_data_size > sizeof(pmp->data)) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + /* + * The bit set in the mask needs to be consistent with the + * port the request came in on. + */ + port_mask = be64_to_cpu(req->port_select_mask[3]); + port_num = find_first_bit((unsigned long *)&port_mask, + sizeof(port_mask)); + + if (port_num != port) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + + rsp = &req->port[0]; + + ibp = to_iport(ibdev, port_num); + ppd = ppd_from_ibp(ibp); + + memset(rsp, 0, sizeof(*rsp)); + rsp->port_number = port_num; + + pma_get_opa_port_ectrs(ibdev, rsp, port_num); + + rsp->port_rcv_remote_physical_errors = + cpu_to_be64(read_dev_cntr(dd, C_DC_RMT_PHY_ERR, + CNTR_INVALID_VL)); + rsp->fm_config_errors = + cpu_to_be64(read_dev_cntr(dd, C_DC_FM_CFG_ERR, + CNTR_INVALID_VL)); + tmp = read_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL); + + rsp->uncorrectable_errors = tmp < 0x100 ? (tmp & 0xff) : 0xff; + + vlinfo = &rsp->vls[0]; + vfi = 0; + vl_select_mask = be32_to_cpu(req->vl_select_mask); + for_each_set_bit(vl, (unsigned long *)&(vl_select_mask), + 8 * sizeof(req->vl_select_mask)) { + memset(vlinfo, 0, sizeof(*vlinfo)); + /* vlinfo->vls[vfi].port_vl_xmit_discards ??? */ + vlinfo += 1; + vfi++; + } + + if (resp_len) + *resp_len += response_data_size; + + return reply((struct ib_mad_hdr *)pmp); +} + +static int pma_get_ib_portcounters(struct ib_pma_mad *pmp, + struct ib_device *ibdev, u8 port) +{ + struct ib_pma_portcounters *p = (struct ib_pma_portcounters *) + pmp->data; + struct _port_ectrs rsp; + u64 temp_link_overrun_errors; + u64 temp_64; + u32 temp_32; + + memset(&rsp, 0, sizeof(rsp)); + pma_get_opa_port_ectrs(ibdev, &rsp, port); + + if (pmp->mad_hdr.attr_mod != 0 || p->port_select != port) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + goto bail; + } + + p->symbol_error_counter = 0; /* N/A for OPA */ + + temp_32 = be32_to_cpu(rsp.link_error_recovery); + if (temp_32 > 0xFFUL) + p->link_error_recovery_counter = 0xFF; + else + p->link_error_recovery_counter = (u8)temp_32; + + temp_32 = be32_to_cpu(rsp.link_downed); + if (temp_32 > 0xFFUL) + p->link_downed_counter = 0xFF; + else + p->link_downed_counter = (u8)temp_32; + + temp_64 = be64_to_cpu(rsp.port_rcv_errors); + if (temp_64 > 0xFFFFUL) + p->port_rcv_errors = cpu_to_be16(0xFFFF); + else + p->port_rcv_errors = cpu_to_be16((u16)temp_64); + + temp_64 = be64_to_cpu(rsp.port_rcv_remote_physical_errors); + if (temp_64 > 0xFFFFUL) + p->port_rcv_remphys_errors = cpu_to_be16(0xFFFF); + else + p->port_rcv_remphys_errors = cpu_to_be16((u16)temp_64); + + temp_64 = be64_to_cpu(rsp.port_rcv_switch_relay_errors); + p->port_rcv_switch_relay_errors = cpu_to_be16((u16)temp_64); + + temp_64 = be64_to_cpu(rsp.port_xmit_discards); + if (temp_64 > 0xFFFFUL) + p->port_xmit_discards = cpu_to_be16(0xFFFF); + else + p->port_xmit_discards = cpu_to_be16((u16)temp_64); + + temp_64 = be64_to_cpu(rsp.port_xmit_constraint_errors); + if (temp_64 > 0xFFUL) + p->port_xmit_constraint_errors = 0xFF; + else + p->port_xmit_constraint_errors = (u8)temp_64; + + temp_64 = be64_to_cpu(rsp.port_rcv_constraint_errors); + if (temp_64 > 0xFFUL) + p->port_rcv_constraint_errors = 0xFFUL; + else + p->port_rcv_constraint_errors = (u8)temp_64; + + /* LocalLink: 7:4, BufferOverrun: 3:0 */ + temp_64 = be64_to_cpu(rsp.local_link_integrity_errors); + if (temp_64 > 0xFUL) + temp_64 = 0xFUL; + + temp_link_overrun_errors = temp_64 << 4; + + temp_64 = be64_to_cpu(rsp.excessive_buffer_overruns); + if (temp_64 > 0xFUL) + temp_64 = 0xFUL; + temp_link_overrun_errors |= temp_64; + + p->link_overrun_errors = (u8)temp_link_overrun_errors; + + p->vl15_dropped = 0; /* N/A for OPA */ + +bail: + return reply((struct ib_mad_hdr *)pmp); +} + +static int pma_get_opa_errorinfo(struct opa_pma_mad *pmp, + struct ib_device *ibdev, + u8 port, u32 *resp_len) +{ + size_t response_data_size; + struct _port_ei *rsp; + struct opa_port_error_info_msg *req; + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + u64 port_mask; + u32 num_ports; + u8 port_num; + u8 num_pslm; + u64 reg; + + req = (struct opa_port_error_info_msg *)pmp->data; + rsp = &req->port[0]; + + num_ports = OPA_AM_NPORT(be32_to_cpu(pmp->mad_hdr.attr_mod)); + num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3])); + + memset(rsp, 0, sizeof(*rsp)); + + if (num_ports != 1 || num_ports != num_pslm) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + + /* Sanity check */ + response_data_size = sizeof(struct opa_port_error_info_msg); + + if (response_data_size > sizeof(pmp->data)) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + + /* + * The bit set in the mask needs to be consistent with the port + * the request came in on. + */ + port_mask = be64_to_cpu(req->port_select_mask[3]); + port_num = find_first_bit((unsigned long *)&port_mask, + sizeof(port_mask)); + + if (port_num != port) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + + /* PortRcvErrorInfo */ + rsp->port_rcv_ei.status_and_code = + dd->err_info_rcvport.status_and_code; + memcpy(&rsp->port_rcv_ei.ei.ei1to12.packet_flit1, + &dd->err_info_rcvport.packet_flit1, sizeof(u64)); + memcpy(&rsp->port_rcv_ei.ei.ei1to12.packet_flit2, + &dd->err_info_rcvport.packet_flit2, sizeof(u64)); + + /* ExcessiverBufferOverrunInfo */ + reg = read_csr(dd, RCV_ERR_INFO); + if (reg & RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK) { + /* + * if the RcvExcessBufferOverrun bit is set, save SC of + * first pkt that encountered an excess buffer overrun + */ + u8 tmp = (u8)reg; + + tmp &= RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SC_SMASK; + tmp <<= 2; + rsp->excessive_buffer_overrun_ei.status_and_sc = tmp; + /* set the status bit */ + rsp->excessive_buffer_overrun_ei.status_and_sc |= 0x80; + } + + rsp->port_xmit_constraint_ei.status = + dd->err_info_xmit_constraint.status; + rsp->port_xmit_constraint_ei.pkey = + cpu_to_be16(dd->err_info_xmit_constraint.pkey); + rsp->port_xmit_constraint_ei.slid = + cpu_to_be32(dd->err_info_xmit_constraint.slid); + + rsp->port_rcv_constraint_ei.status = + dd->err_info_rcv_constraint.status; + rsp->port_rcv_constraint_ei.pkey = + cpu_to_be16(dd->err_info_rcv_constraint.pkey); + rsp->port_rcv_constraint_ei.slid = + cpu_to_be32(dd->err_info_rcv_constraint.slid); + + /* UncorrectableErrorInfo */ + rsp->uncorrectable_ei.status_and_code = dd->err_info_uncorrectable; + + /* FMConfigErrorInfo */ + rsp->fm_config_ei.status_and_code = dd->err_info_fmconfig; + + if (resp_len) + *resp_len += response_data_size; + + return reply((struct ib_mad_hdr *)pmp); +} + +static int pma_set_opa_portstatus(struct opa_pma_mad *pmp, + struct ib_device *ibdev, + u8 port, u32 *resp_len) +{ + struct opa_clear_port_status *req = + (struct opa_clear_port_status *)pmp->data; + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct hfi1_ibport *ibp = to_iport(ibdev, port); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u32 nports = be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24; + u64 portn = be64_to_cpu(req->port_select_mask[3]); + u32 counter_select = be32_to_cpu(req->counter_select_mask); + u32 vl_select_mask = VL_MASK_ALL; /* clear all per-vl cnts */ + unsigned long vl; + + if ((nports != 1) || (portn != 1 << port)) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + /* + * only counters returned by pma_get_opa_portstatus() are + * handled, so when pma_get_opa_portstatus() gets a fix, + * the corresponding change should be made here as well. + */ + + if (counter_select & CS_PORT_XMIT_DATA) + write_dev_cntr(dd, C_DC_XMIT_FLITS, CNTR_INVALID_VL, 0); + + if (counter_select & CS_PORT_RCV_DATA) + write_dev_cntr(dd, C_DC_RCV_FLITS, CNTR_INVALID_VL, 0); + + if (counter_select & CS_PORT_XMIT_PKTS) + write_dev_cntr(dd, C_DC_XMIT_PKTS, CNTR_INVALID_VL, 0); + + if (counter_select & CS_PORT_RCV_PKTS) + write_dev_cntr(dd, C_DC_RCV_PKTS, CNTR_INVALID_VL, 0); + + if (counter_select & CS_PORT_MCAST_XMIT_PKTS) + write_dev_cntr(dd, C_DC_MC_XMIT_PKTS, CNTR_INVALID_VL, 0); + + if (counter_select & CS_PORT_MCAST_RCV_PKTS) + write_dev_cntr(dd, C_DC_MC_RCV_PKTS, CNTR_INVALID_VL, 0); + + if (counter_select & CS_PORT_XMIT_WAIT) + write_port_cntr(ppd, C_TX_WAIT, CNTR_INVALID_VL, 0); + + /* ignore cs_sw_portCongestion for HFIs */ + + if (counter_select & CS_PORT_RCV_FECN) + write_dev_cntr(dd, C_DC_RCV_FCN, CNTR_INVALID_VL, 0); + + if (counter_select & CS_PORT_RCV_BECN) + write_dev_cntr(dd, C_DC_RCV_BCN, CNTR_INVALID_VL, 0); + + /* ignore cs_port_xmit_time_cong for HFIs */ + /* ignore cs_port_xmit_wasted_bw for now */ + /* ignore cs_port_xmit_wait_data for now */ + if (counter_select & CS_PORT_RCV_BUBBLE) + write_dev_cntr(dd, C_DC_RCV_BBL, CNTR_INVALID_VL, 0); + + /* Only applicable for switch */ + /* if (counter_select & CS_PORT_MARK_FECN) + * write_csr(dd, DCC_PRF_PORT_MARK_FECN_CNT, 0); + */ + + if (counter_select & CS_PORT_RCV_CONSTRAINT_ERRORS) + write_port_cntr(ppd, C_SW_RCV_CSTR_ERR, CNTR_INVALID_VL, 0); + + /* ignore cs_port_rcv_switch_relay_errors for HFIs */ + if (counter_select & CS_PORT_XMIT_DISCARDS) + write_port_cntr(ppd, C_SW_XMIT_DSCD, CNTR_INVALID_VL, 0); + + if (counter_select & CS_PORT_XMIT_CONSTRAINT_ERRORS) + write_port_cntr(ppd, C_SW_XMIT_CSTR_ERR, CNTR_INVALID_VL, 0); + + if (counter_select & CS_PORT_RCV_REMOTE_PHYSICAL_ERRORS) + write_dev_cntr(dd, C_DC_RMT_PHY_ERR, CNTR_INVALID_VL, 0); + + if (counter_select & CS_LOCAL_LINK_INTEGRITY_ERRORS) { + write_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL, 0); + write_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL, 0); + } + + if (counter_select & CS_LINK_ERROR_RECOVERY) { + write_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL, 0); + write_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, + CNTR_INVALID_VL, 0); + } + + if (counter_select & CS_PORT_RCV_ERRORS) + write_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL, 0); + + if (counter_select & CS_EXCESSIVE_BUFFER_OVERRUNS) { + write_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL, 0); + dd->rcv_ovfl_cnt = 0; + } + + if (counter_select & CS_FM_CONFIG_ERRORS) + write_dev_cntr(dd, C_DC_FM_CFG_ERR, CNTR_INVALID_VL, 0); + + if (counter_select & CS_LINK_DOWNED) + write_port_cntr(ppd, C_SW_LINK_DOWN, CNTR_INVALID_VL, 0); + + if (counter_select & CS_UNCORRECTABLE_ERRORS) + write_dev_cntr(dd, C_DC_UNC_ERR, CNTR_INVALID_VL, 0); + + for_each_set_bit(vl, (unsigned long *)&(vl_select_mask), + 8 * sizeof(vl_select_mask)) { + if (counter_select & CS_PORT_XMIT_DATA) + write_port_cntr(ppd, C_TX_FLIT_VL, idx_from_vl(vl), 0); + + if (counter_select & CS_PORT_RCV_DATA) + write_dev_cntr(dd, C_DC_RX_FLIT_VL, idx_from_vl(vl), 0); + + if (counter_select & CS_PORT_XMIT_PKTS) + write_port_cntr(ppd, C_TX_PKT_VL, idx_from_vl(vl), 0); + + if (counter_select & CS_PORT_RCV_PKTS) + write_dev_cntr(dd, C_DC_RX_PKT_VL, idx_from_vl(vl), 0); + + if (counter_select & CS_PORT_XMIT_WAIT) + write_port_cntr(ppd, C_TX_WAIT_VL, idx_from_vl(vl), 0); + + /* sw_port_vl_congestion is 0 for HFIs */ + if (counter_select & CS_PORT_RCV_FECN) + write_dev_cntr(dd, C_DC_RCV_FCN_VL, idx_from_vl(vl), 0); + + if (counter_select & CS_PORT_RCV_BECN) + write_dev_cntr(dd, C_DC_RCV_BCN_VL, idx_from_vl(vl), 0); + + /* port_vl_xmit_time_cong is 0 for HFIs */ + /* port_vl_xmit_wasted_bw ??? */ + /* port_vl_xmit_wait_data - TXE (table 13-9 HFI spec) ??? */ + if (counter_select & CS_PORT_RCV_BUBBLE) + write_dev_cntr(dd, C_DC_RCV_BBL_VL, idx_from_vl(vl), 0); + + /* if (counter_select & CS_PORT_MARK_FECN) + * write_csr(dd, DCC_PRF_PORT_VL_MARK_FECN_CNT + offset, 0); + */ + /* port_vl_xmit_discards ??? */ + } + + if (resp_len) + *resp_len += sizeof(*req); + + return reply((struct ib_mad_hdr *)pmp); +} + +static int pma_set_opa_errorinfo(struct opa_pma_mad *pmp, + struct ib_device *ibdev, + u8 port, u32 *resp_len) +{ + struct _port_ei *rsp; + struct opa_port_error_info_msg *req; + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + u64 port_mask; + u32 num_ports; + u8 port_num; + u8 num_pslm; + u32 error_info_select; + + req = (struct opa_port_error_info_msg *)pmp->data; + rsp = &req->port[0]; + + num_ports = OPA_AM_NPORT(be32_to_cpu(pmp->mad_hdr.attr_mod)); + num_pslm = hweight64(be64_to_cpu(req->port_select_mask[3])); + + memset(rsp, 0, sizeof(*rsp)); + + if (num_ports != 1 || num_ports != num_pslm) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + + /* + * The bit set in the mask needs to be consistent with the port + * the request came in on. + */ + port_mask = be64_to_cpu(req->port_select_mask[3]); + port_num = find_first_bit((unsigned long *)&port_mask, + sizeof(port_mask)); + + if (port_num != port) { + pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)pmp); + } + + error_info_select = be32_to_cpu(req->error_info_select_mask); + + /* PortRcvErrorInfo */ + if (error_info_select & ES_PORT_RCV_ERROR_INFO) + /* turn off status bit */ + dd->err_info_rcvport.status_and_code &= ~OPA_EI_STATUS_SMASK; + + /* ExcessiverBufferOverrunInfo */ + if (error_info_select & ES_EXCESSIVE_BUFFER_OVERRUN_INFO) + /* + * status bit is essentially kept in the h/w - bit 5 of + * RCV_ERR_INFO + */ + write_csr(dd, RCV_ERR_INFO, + RCV_ERR_INFO_RCV_EXCESS_BUFFER_OVERRUN_SMASK); + + if (error_info_select & ES_PORT_XMIT_CONSTRAINT_ERROR_INFO) + dd->err_info_xmit_constraint.status &= ~OPA_EI_STATUS_SMASK; + + if (error_info_select & ES_PORT_RCV_CONSTRAINT_ERROR_INFO) + dd->err_info_rcv_constraint.status &= ~OPA_EI_STATUS_SMASK; + + /* UncorrectableErrorInfo */ + if (error_info_select & ES_UNCORRECTABLE_ERROR_INFO) + /* turn off status bit */ + dd->err_info_uncorrectable &= ~OPA_EI_STATUS_SMASK; + + /* FMConfigErrorInfo */ + if (error_info_select & ES_FM_CONFIG_ERROR_INFO) + /* turn off status bit */ + dd->err_info_fmconfig &= ~OPA_EI_STATUS_SMASK; + + if (resp_len) + *resp_len += sizeof(*req); + + return reply((struct ib_mad_hdr *)pmp); +} + +struct opa_congestion_info_attr { + __be16 congestion_info; + u8 control_table_cap; /* Multiple of 64 entry unit CCTs */ + u8 congestion_log_length; +} __packed; + +static int __subn_get_opa_cong_info(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + struct opa_congestion_info_attr *p = + (struct opa_congestion_info_attr *)data; + struct hfi1_ibport *ibp = to_iport(ibdev, port); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + + p->congestion_info = 0; + p->control_table_cap = ppd->cc_max_table_entries; + p->congestion_log_length = OPA_CONG_LOG_ELEMS; + + if (resp_len) + *resp_len += sizeof(*p); + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_get_opa_cong_setting(struct opa_smp *smp, u32 am, + u8 *data, struct ib_device *ibdev, + u8 port, u32 *resp_len) +{ + int i; + struct opa_congestion_setting_attr *p = + (struct opa_congestion_setting_attr *)data; + struct hfi1_ibport *ibp = to_iport(ibdev, port); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct opa_congestion_setting_entry_shadow *entries; + struct cc_state *cc_state; + + rcu_read_lock(); + + cc_state = get_cc_state(ppd); + + if (!cc_state) { + rcu_read_unlock(); + return reply((struct ib_mad_hdr *)smp); + } + + entries = cc_state->cong_setting.entries; + p->port_control = cpu_to_be16(cc_state->cong_setting.port_control); + p->control_map = cpu_to_be32(cc_state->cong_setting.control_map); + for (i = 0; i < OPA_MAX_SLS; i++) { + p->entries[i].ccti_increase = entries[i].ccti_increase; + p->entries[i].ccti_timer = cpu_to_be16(entries[i].ccti_timer); + p->entries[i].trigger_threshold = + entries[i].trigger_threshold; + p->entries[i].ccti_min = entries[i].ccti_min; + } + + rcu_read_unlock(); + + if (resp_len) + *resp_len += sizeof(*p); + + return reply((struct ib_mad_hdr *)smp); +} + +/* + * Apply congestion control information stored in the ppd to the + * active structure. + */ +static void apply_cc_state(struct hfi1_pportdata *ppd) +{ + struct cc_state *old_cc_state, *new_cc_state; + + new_cc_state = kzalloc(sizeof(*new_cc_state), GFP_KERNEL); + if (!new_cc_state) + return; + + /* + * Hold the lock for updating *and* to prevent ppd information + * from changing during the update. + */ + spin_lock(&ppd->cc_state_lock); + + old_cc_state = get_cc_state(ppd); + if (!old_cc_state) { + /* never active, or shutting down */ + spin_unlock(&ppd->cc_state_lock); + kfree(new_cc_state); + return; + } + + *new_cc_state = *old_cc_state; + + new_cc_state->cct.ccti_limit = ppd->total_cct_entry - 1; + memcpy(new_cc_state->cct.entries, ppd->ccti_entries, + ppd->total_cct_entry * sizeof(struct ib_cc_table_entry)); + + new_cc_state->cong_setting.port_control = IB_CC_CCS_PC_SL_BASED; + new_cc_state->cong_setting.control_map = ppd->cc_sl_control_map; + memcpy(new_cc_state->cong_setting.entries, ppd->congestion_entries, + OPA_MAX_SLS * sizeof(struct opa_congestion_setting_entry)); + + rcu_assign_pointer(ppd->cc_state, new_cc_state); + + spin_unlock(&ppd->cc_state_lock); + + call_rcu(&old_cc_state->rcu, cc_state_reclaim); +} + +static int __subn_set_opa_cong_setting(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + struct opa_congestion_setting_attr *p = + (struct opa_congestion_setting_attr *)data; + struct hfi1_ibport *ibp = to_iport(ibdev, port); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct opa_congestion_setting_entry_shadow *entries; + int i; + + /* + * Save details from packet into the ppd. Hold the cc_state_lock so + * our information is consistent with anyone trying to apply the state. + */ + spin_lock(&ppd->cc_state_lock); + ppd->cc_sl_control_map = be32_to_cpu(p->control_map); + + entries = ppd->congestion_entries; + for (i = 0; i < OPA_MAX_SLS; i++) { + entries[i].ccti_increase = p->entries[i].ccti_increase; + entries[i].ccti_timer = be16_to_cpu(p->entries[i].ccti_timer); + entries[i].trigger_threshold = + p->entries[i].trigger_threshold; + entries[i].ccti_min = p->entries[i].ccti_min; + } + spin_unlock(&ppd->cc_state_lock); + + /* now apply the information */ + apply_cc_state(ppd); + + return __subn_get_opa_cong_setting(smp, am, data, ibdev, port, + resp_len); +} + +static int __subn_get_opa_hfi1_cong_log(struct opa_smp *smp, u32 am, + u8 *data, struct ib_device *ibdev, + u8 port, u32 *resp_len) +{ + struct hfi1_ibport *ibp = to_iport(ibdev, port); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct opa_hfi1_cong_log *cong_log = (struct opa_hfi1_cong_log *)data; + s64 ts; + int i; + + if (am != 0) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + spin_lock_irq(&ppd->cc_log_lock); + + cong_log->log_type = OPA_CC_LOG_TYPE_HFI; + cong_log->congestion_flags = 0; + cong_log->threshold_event_counter = + cpu_to_be16(ppd->threshold_event_counter); + memcpy(cong_log->threshold_cong_event_map, + ppd->threshold_cong_event_map, + sizeof(cong_log->threshold_cong_event_map)); + /* keep timestamp in units of 1.024 usec */ + ts = ktime_to_ns(ktime_get()) / 1024; + cong_log->current_time_stamp = cpu_to_be32(ts); + for (i = 0; i < OPA_CONG_LOG_ELEMS; i++) { + struct opa_hfi1_cong_log_event_internal *cce = + &ppd->cc_events[ppd->cc_mad_idx++]; + if (ppd->cc_mad_idx == OPA_CONG_LOG_ELEMS) + ppd->cc_mad_idx = 0; + /* + * Entries which are older than twice the time + * required to wrap the counter are supposed to + * be zeroed (CA10-49 IBTA, release 1.2.1, V1). + */ + if ((u64)(ts - cce->timestamp) > (2 * UINT_MAX)) + continue; + memcpy(cong_log->events[i].local_qp_cn_entry, &cce->lqpn, 3); + memcpy(cong_log->events[i].remote_qp_number_cn_entry, + &cce->rqpn, 3); + cong_log->events[i].sl_svc_type_cn_entry = + ((cce->sl & 0x1f) << 3) | (cce->svc_type & 0x7); + cong_log->events[i].remote_lid_cn_entry = + cpu_to_be32(cce->rlid); + cong_log->events[i].timestamp_cn_entry = + cpu_to_be32(cce->timestamp); + } + + /* + * Reset threshold_cong_event_map, and threshold_event_counter + * to 0 when log is read. + */ + memset(ppd->threshold_cong_event_map, 0x0, + sizeof(ppd->threshold_cong_event_map)); + ppd->threshold_event_counter = 0; + + spin_unlock_irq(&ppd->cc_log_lock); + + if (resp_len) + *resp_len += sizeof(struct opa_hfi1_cong_log); + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_get_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + struct ib_cc_table_attr *cc_table_attr = + (struct ib_cc_table_attr *)data; + struct hfi1_ibport *ibp = to_iport(ibdev, port); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u32 start_block = OPA_AM_START_BLK(am); + u32 n_blocks = OPA_AM_NBLK(am); + struct ib_cc_table_entry_shadow *entries; + int i, j; + u32 sentry, eentry; + struct cc_state *cc_state; + + /* sanity check n_blocks, start_block */ + if (n_blocks == 0 || + start_block + n_blocks > ppd->cc_max_table_entries) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + rcu_read_lock(); + + cc_state = get_cc_state(ppd); + + if (!cc_state) { + rcu_read_unlock(); + return reply((struct ib_mad_hdr *)smp); + } + + sentry = start_block * IB_CCT_ENTRIES; + eentry = sentry + (IB_CCT_ENTRIES * n_blocks); + + cc_table_attr->ccti_limit = cpu_to_be16(cc_state->cct.ccti_limit); + + entries = cc_state->cct.entries; + + /* return n_blocks, though the last block may not be full */ + for (j = 0, i = sentry; i < eentry; j++, i++) + cc_table_attr->ccti_entries[j].entry = + cpu_to_be16(entries[i].entry); + + rcu_read_unlock(); + + if (resp_len) + *resp_len += sizeof(u16) * (IB_CCT_ENTRIES * n_blocks + 1); + + return reply((struct ib_mad_hdr *)smp); +} + +void cc_state_reclaim(struct rcu_head *rcu) +{ + struct cc_state *cc_state = container_of(rcu, struct cc_state, rcu); + + kfree(cc_state); +} + +static int __subn_set_opa_cc_table(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + struct ib_cc_table_attr *p = (struct ib_cc_table_attr *)data; + struct hfi1_ibport *ibp = to_iport(ibdev, port); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u32 start_block = OPA_AM_START_BLK(am); + u32 n_blocks = OPA_AM_NBLK(am); + struct ib_cc_table_entry_shadow *entries; + int i, j; + u32 sentry, eentry; + u16 ccti_limit; + + /* sanity check n_blocks, start_block */ + if (n_blocks == 0 || + start_block + n_blocks > ppd->cc_max_table_entries) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + sentry = start_block * IB_CCT_ENTRIES; + eentry = sentry + ((n_blocks - 1) * IB_CCT_ENTRIES) + + (be16_to_cpu(p->ccti_limit)) % IB_CCT_ENTRIES + 1; + + /* sanity check ccti_limit */ + ccti_limit = be16_to_cpu(p->ccti_limit); + if (ccti_limit + 1 > eentry) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + /* + * Save details from packet into the ppd. Hold the cc_state_lock so + * our information is consistent with anyone trying to apply the state. + */ + spin_lock(&ppd->cc_state_lock); + ppd->total_cct_entry = ccti_limit + 1; + entries = ppd->ccti_entries; + for (j = 0, i = sentry; i < eentry; j++, i++) + entries[i].entry = be16_to_cpu(p->ccti_entries[j].entry); + spin_unlock(&ppd->cc_state_lock); + + /* now apply the information */ + apply_cc_state(ppd); + + return __subn_get_opa_cc_table(smp, am, data, ibdev, port, resp_len); +} + +struct opa_led_info { + __be32 rsvd_led_mask; + __be32 rsvd; +}; + +#define OPA_LED_SHIFT 31 +#define OPA_LED_MASK BIT(OPA_LED_SHIFT) + +static int __subn_get_opa_led_info(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct hfi1_pportdata *ppd = dd->pport; + struct opa_led_info *p = (struct opa_led_info *)data; + u32 nport = OPA_AM_NPORT(am); + u32 is_beaconing_active; + + if (nport != 1) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + /* + * This pairs with the memory barrier in hfi1_start_led_override to + * ensure that we read the correct state of LED beaconing represented + * by led_override_timer_active + */ + smp_rmb(); + is_beaconing_active = !!atomic_read(&ppd->led_override_timer_active); + p->rsvd_led_mask = cpu_to_be32(is_beaconing_active << OPA_LED_SHIFT); + + if (resp_len) + *resp_len += sizeof(struct opa_led_info); + + return reply((struct ib_mad_hdr *)smp); +} + +static int __subn_set_opa_led_info(struct opa_smp *smp, u32 am, u8 *data, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + struct opa_led_info *p = (struct opa_led_info *)data; + u32 nport = OPA_AM_NPORT(am); + int on = !!(be32_to_cpu(p->rsvd_led_mask) & OPA_LED_MASK); + + if (nport != 1) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + if (on) + hfi1_start_led_override(dd->pport, 2000, 1500); + else + shutdown_led_override(dd->pport); + + return __subn_get_opa_led_info(smp, am, data, ibdev, port, resp_len); +} + +static int subn_get_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am, + u8 *data, struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + int ret; + struct hfi1_ibport *ibp = to_iport(ibdev, port); + + switch (attr_id) { + case IB_SMP_ATTR_NODE_DESC: + ret = __subn_get_opa_nodedesc(smp, am, data, ibdev, port, + resp_len); + break; + case IB_SMP_ATTR_NODE_INFO: + ret = __subn_get_opa_nodeinfo(smp, am, data, ibdev, port, + resp_len); + break; + case IB_SMP_ATTR_PORT_INFO: + ret = __subn_get_opa_portinfo(smp, am, data, ibdev, port, + resp_len); + break; + case IB_SMP_ATTR_PKEY_TABLE: + ret = __subn_get_opa_pkeytable(smp, am, data, ibdev, port, + resp_len); + break; + case OPA_ATTRIB_ID_SL_TO_SC_MAP: + ret = __subn_get_opa_sl_to_sc(smp, am, data, ibdev, port, + resp_len); + break; + case OPA_ATTRIB_ID_SC_TO_SL_MAP: + ret = __subn_get_opa_sc_to_sl(smp, am, data, ibdev, port, + resp_len); + break; + case OPA_ATTRIB_ID_SC_TO_VLT_MAP: + ret = __subn_get_opa_sc_to_vlt(smp, am, data, ibdev, port, + resp_len); + break; + case OPA_ATTRIB_ID_SC_TO_VLNT_MAP: + ret = __subn_get_opa_sc_to_vlnt(smp, am, data, ibdev, port, + resp_len); + break; + case OPA_ATTRIB_ID_PORT_STATE_INFO: + ret = __subn_get_opa_psi(smp, am, data, ibdev, port, + resp_len); + break; + case OPA_ATTRIB_ID_BUFFER_CONTROL_TABLE: + ret = __subn_get_opa_bct(smp, am, data, ibdev, port, + resp_len); + break; + case OPA_ATTRIB_ID_CABLE_INFO: + ret = __subn_get_opa_cable_info(smp, am, data, ibdev, port, + resp_len); + break; + case IB_SMP_ATTR_VL_ARB_TABLE: + ret = __subn_get_opa_vl_arb(smp, am, data, ibdev, port, + resp_len); + break; + case OPA_ATTRIB_ID_CONGESTION_INFO: + ret = __subn_get_opa_cong_info(smp, am, data, ibdev, port, + resp_len); + break; + case OPA_ATTRIB_ID_HFI_CONGESTION_SETTING: + ret = __subn_get_opa_cong_setting(smp, am, data, ibdev, + port, resp_len); + break; + case OPA_ATTRIB_ID_HFI_CONGESTION_LOG: + ret = __subn_get_opa_hfi1_cong_log(smp, am, data, ibdev, + port, resp_len); + break; + case OPA_ATTRIB_ID_CONGESTION_CONTROL_TABLE: + ret = __subn_get_opa_cc_table(smp, am, data, ibdev, port, + resp_len); + break; + case IB_SMP_ATTR_LED_INFO: + ret = __subn_get_opa_led_info(smp, am, data, ibdev, port, + resp_len); + break; + case IB_SMP_ATTR_SM_INFO: + if (ibp->rvp.port_cap_flags & IB_PORT_SM_DISABLED) + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + if (ibp->rvp.port_cap_flags & IB_PORT_SM) + return IB_MAD_RESULT_SUCCESS; + /* FALLTHROUGH */ + default: + smp->status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_mad_hdr *)smp); + break; + } + return ret; +} + +static int subn_set_opa_sma(__be16 attr_id, struct opa_smp *smp, u32 am, + u8 *data, struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + int ret; + struct hfi1_ibport *ibp = to_iport(ibdev, port); + + switch (attr_id) { + case IB_SMP_ATTR_PORT_INFO: + ret = __subn_set_opa_portinfo(smp, am, data, ibdev, port, + resp_len); + break; + case IB_SMP_ATTR_PKEY_TABLE: + ret = __subn_set_opa_pkeytable(smp, am, data, ibdev, port, + resp_len); + break; + case OPA_ATTRIB_ID_SL_TO_SC_MAP: + ret = __subn_set_opa_sl_to_sc(smp, am, data, ibdev, port, + resp_len); + break; + case OPA_ATTRIB_ID_SC_TO_SL_MAP: + ret = __subn_set_opa_sc_to_sl(smp, am, data, ibdev, port, + resp_len); + break; + case OPA_ATTRIB_ID_SC_TO_VLT_MAP: + ret = __subn_set_opa_sc_to_vlt(smp, am, data, ibdev, port, + resp_len); + break; + case OPA_ATTRIB_ID_SC_TO_VLNT_MAP: + ret = __subn_set_opa_sc_to_vlnt(smp, am, data, ibdev, port, + resp_len); + break; + case OPA_ATTRIB_ID_PORT_STATE_INFO: + ret = __subn_set_opa_psi(smp, am, data, ibdev, port, + resp_len); + break; + case OPA_ATTRIB_ID_BUFFER_CONTROL_TABLE: + ret = __subn_set_opa_bct(smp, am, data, ibdev, port, + resp_len); + break; + case IB_SMP_ATTR_VL_ARB_TABLE: + ret = __subn_set_opa_vl_arb(smp, am, data, ibdev, port, + resp_len); + break; + case OPA_ATTRIB_ID_HFI_CONGESTION_SETTING: + ret = __subn_set_opa_cong_setting(smp, am, data, ibdev, + port, resp_len); + break; + case OPA_ATTRIB_ID_CONGESTION_CONTROL_TABLE: + ret = __subn_set_opa_cc_table(smp, am, data, ibdev, port, + resp_len); + break; + case IB_SMP_ATTR_LED_INFO: + ret = __subn_set_opa_led_info(smp, am, data, ibdev, port, + resp_len); + break; + case IB_SMP_ATTR_SM_INFO: + if (ibp->rvp.port_cap_flags & IB_PORT_SM_DISABLED) + return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED; + if (ibp->rvp.port_cap_flags & IB_PORT_SM) + return IB_MAD_RESULT_SUCCESS; + /* FALLTHROUGH */ + default: + smp->status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_mad_hdr *)smp); + break; + } + return ret; +} + +static inline void set_aggr_error(struct opa_aggregate *ag) +{ + ag->err_reqlength |= cpu_to_be16(0x8000); +} + +static int subn_get_opa_aggregate(struct opa_smp *smp, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + int i; + u32 num_attr = be32_to_cpu(smp->attr_mod) & 0x000000ff; + u8 *next_smp = opa_get_smp_data(smp); + + if (num_attr < 1 || num_attr > 117) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + for (i = 0; i < num_attr; i++) { + struct opa_aggregate *agg; + size_t agg_data_len; + size_t agg_size; + u32 am; + + agg = (struct opa_aggregate *)next_smp; + agg_data_len = (be16_to_cpu(agg->err_reqlength) & 0x007f) * 8; + agg_size = sizeof(*agg) + agg_data_len; + am = be32_to_cpu(agg->attr_mod); + + *resp_len += agg_size; + + if (next_smp + agg_size > ((u8 *)smp) + sizeof(*smp)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + /* zero the payload for this segment */ + memset(next_smp + sizeof(*agg), 0, agg_data_len); + + (void)subn_get_opa_sma(agg->attr_id, smp, am, agg->data, + ibdev, port, NULL); + if (smp->status & ~IB_SMP_DIRECTION) { + set_aggr_error(agg); + return reply((struct ib_mad_hdr *)smp); + } + next_smp += agg_size; + } + + return reply((struct ib_mad_hdr *)smp); +} + +static int subn_set_opa_aggregate(struct opa_smp *smp, + struct ib_device *ibdev, u8 port, + u32 *resp_len) +{ + int i; + u32 num_attr = be32_to_cpu(smp->attr_mod) & 0x000000ff; + u8 *next_smp = opa_get_smp_data(smp); + + if (num_attr < 1 || num_attr > 117) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + for (i = 0; i < num_attr; i++) { + struct opa_aggregate *agg; + size_t agg_data_len; + size_t agg_size; + u32 am; + + agg = (struct opa_aggregate *)next_smp; + agg_data_len = (be16_to_cpu(agg->err_reqlength) & 0x007f) * 8; + agg_size = sizeof(*agg) + agg_data_len; + am = be32_to_cpu(agg->attr_mod); + + *resp_len += agg_size; + + if (next_smp + agg_size > ((u8 *)smp) + sizeof(*smp)) { + smp->status |= IB_SMP_INVALID_FIELD; + return reply((struct ib_mad_hdr *)smp); + } + + (void)subn_set_opa_sma(agg->attr_id, smp, am, agg->data, + ibdev, port, NULL); + if (smp->status & ~IB_SMP_DIRECTION) { + set_aggr_error(agg); + return reply((struct ib_mad_hdr *)smp); + } + next_smp += agg_size; + } + + return reply((struct ib_mad_hdr *)smp); +} + +/* + * OPAv1 specifies that, on the transition to link up, these counters + * are cleared: + * PortRcvErrors [*] + * LinkErrorRecovery + * LocalLinkIntegrityErrors + * ExcessiveBufferOverruns [*] + * + * [*] Error info associated with these counters is retained, but the + * error info status is reset to 0. + */ +void clear_linkup_counters(struct hfi1_devdata *dd) +{ + /* PortRcvErrors */ + write_dev_cntr(dd, C_DC_RCV_ERR, CNTR_INVALID_VL, 0); + dd->err_info_rcvport.status_and_code &= ~OPA_EI_STATUS_SMASK; + /* LinkErrorRecovery */ + write_dev_cntr(dd, C_DC_SEQ_CRC_CNT, CNTR_INVALID_VL, 0); + write_dev_cntr(dd, C_DC_REINIT_FROM_PEER_CNT, CNTR_INVALID_VL, 0); + /* LocalLinkIntegrityErrors */ + write_dev_cntr(dd, C_DC_TX_REPLAY, CNTR_INVALID_VL, 0); + write_dev_cntr(dd, C_DC_RX_REPLAY, CNTR_INVALID_VL, 0); + /* ExcessiveBufferOverruns */ + write_dev_cntr(dd, C_RCV_OVF, CNTR_INVALID_VL, 0); + dd->rcv_ovfl_cnt = 0; + dd->err_info_xmit_constraint.status &= ~OPA_EI_STATUS_SMASK; +} + +/* + * is_local_mad() returns 1 if 'mad' is sent from, and destined to the + * local node, 0 otherwise. + */ +static int is_local_mad(struct hfi1_ibport *ibp, const struct opa_mad *mad, + const struct ib_wc *in_wc) +{ + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + const struct opa_smp *smp = (const struct opa_smp *)mad; + + if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { + return (smp->hop_cnt == 0 && + smp->route.dr.dr_slid == OPA_LID_PERMISSIVE && + smp->route.dr.dr_dlid == OPA_LID_PERMISSIVE); + } + + return (in_wc->slid == ppd->lid); +} + +/* + * opa_local_smp_check() should only be called on MADs for which + * is_local_mad() returns true. It applies the SMP checks that are + * specific to SMPs which are sent from, and destined to this node. + * opa_local_smp_check() returns 0 if the SMP passes its checks, 1 + * otherwise. + * + * SMPs which arrive from other nodes are instead checked by + * opa_smp_check(). + */ +static int opa_local_smp_check(struct hfi1_ibport *ibp, + const struct ib_wc *in_wc) +{ + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u16 slid = in_wc->slid; + u16 pkey; + + if (in_wc->pkey_index >= ARRAY_SIZE(ppd->pkeys)) + return 1; + + pkey = ppd->pkeys[in_wc->pkey_index]; + /* + * We need to do the "node-local" checks specified in OPAv1, + * rev 0.90, section 9.10.26, which are: + * - pkey is 0x7fff, or 0xffff + * - Source QPN == 0 || Destination QPN == 0 + * - the MAD header's management class is either + * IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE or + * IB_MGMT_CLASS_SUBN_LID_ROUTED + * - SLID != 0 + * + * However, we know (and so don't need to check again) that, + * for local SMPs, the MAD stack passes MADs with: + * - Source QPN of 0 + * - MAD mgmt_class is IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE + * - SLID is either: OPA_LID_PERMISSIVE (0xFFFFFFFF), or + * our own port's lid + * + */ + if (pkey == LIM_MGMT_P_KEY || pkey == FULL_MGMT_P_KEY) + return 0; + ingress_pkey_table_fail(ppd, pkey, slid); + return 1; +} + +static int process_subn_opa(struct ib_device *ibdev, int mad_flags, + u8 port, const struct opa_mad *in_mad, + struct opa_mad *out_mad, + u32 *resp_len) +{ + struct opa_smp *smp = (struct opa_smp *)out_mad; + struct hfi1_ibport *ibp = to_iport(ibdev, port); + u8 *data; + u32 am; + __be16 attr_id; + int ret; + + *out_mad = *in_mad; + data = opa_get_smp_data(smp); + + am = be32_to_cpu(smp->attr_mod); + attr_id = smp->attr_id; + if (smp->class_version != OPA_SMI_CLASS_VERSION) { + smp->status |= IB_SMP_UNSUP_VERSION; + ret = reply((struct ib_mad_hdr *)smp); + return ret; + } + ret = check_mkey(ibp, (struct ib_mad_hdr *)smp, mad_flags, smp->mkey, + smp->route.dr.dr_slid, smp->route.dr.return_path, + smp->hop_cnt); + if (ret) { + u32 port_num = be32_to_cpu(smp->attr_mod); + + /* + * If this is a get/set portinfo, we already check the + * M_Key if the MAD is for another port and the M_Key + * is OK on the receiving port. This check is needed + * to increment the error counters when the M_Key + * fails to match on *both* ports. + */ + if (attr_id == IB_SMP_ATTR_PORT_INFO && + (smp->method == IB_MGMT_METHOD_GET || + smp->method == IB_MGMT_METHOD_SET) && + port_num && port_num <= ibdev->phys_port_cnt && + port != port_num) + (void)check_mkey(to_iport(ibdev, port_num), + (struct ib_mad_hdr *)smp, 0, + smp->mkey, smp->route.dr.dr_slid, + smp->route.dr.return_path, + smp->hop_cnt); + ret = IB_MAD_RESULT_FAILURE; + return ret; + } + + *resp_len = opa_get_smp_header_size(smp); + + switch (smp->method) { + case IB_MGMT_METHOD_GET: + switch (attr_id) { + default: + clear_opa_smp_data(smp); + ret = subn_get_opa_sma(attr_id, smp, am, data, + ibdev, port, resp_len); + break; + case OPA_ATTRIB_ID_AGGREGATE: + ret = subn_get_opa_aggregate(smp, ibdev, port, + resp_len); + break; + } + break; + case IB_MGMT_METHOD_SET: + switch (attr_id) { + default: + ret = subn_set_opa_sma(attr_id, smp, am, data, + ibdev, port, resp_len); + break; + case OPA_ATTRIB_ID_AGGREGATE: + ret = subn_set_opa_aggregate(smp, ibdev, port, + resp_len); + break; + } + break; + case IB_MGMT_METHOD_TRAP: + case IB_MGMT_METHOD_REPORT: + case IB_MGMT_METHOD_REPORT_RESP: + case IB_MGMT_METHOD_GET_RESP: + /* + * The ib_mad module will call us to process responses + * before checking for other consumers. + * Just tell the caller to process it normally. + */ + ret = IB_MAD_RESULT_SUCCESS; + break; + default: + smp->status |= IB_SMP_UNSUP_METHOD; + ret = reply((struct ib_mad_hdr *)smp); + break; + } + + return ret; +} + +static int process_subn(struct ib_device *ibdev, int mad_flags, + u8 port, const struct ib_mad *in_mad, + struct ib_mad *out_mad) +{ + struct ib_smp *smp = (struct ib_smp *)out_mad; + struct hfi1_ibport *ibp = to_iport(ibdev, port); + int ret; + + *out_mad = *in_mad; + if (smp->class_version != 1) { + smp->status |= IB_SMP_UNSUP_VERSION; + ret = reply((struct ib_mad_hdr *)smp); + return ret; + } + + ret = check_mkey(ibp, (struct ib_mad_hdr *)smp, mad_flags, + smp->mkey, (__force __be32)smp->dr_slid, + smp->return_path, smp->hop_cnt); + if (ret) { + u32 port_num = be32_to_cpu(smp->attr_mod); + + /* + * If this is a get/set portinfo, we already check the + * M_Key if the MAD is for another port and the M_Key + * is OK on the receiving port. This check is needed + * to increment the error counters when the M_Key + * fails to match on *both* ports. + */ + if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO && + (smp->method == IB_MGMT_METHOD_GET || + smp->method == IB_MGMT_METHOD_SET) && + port_num && port_num <= ibdev->phys_port_cnt && + port != port_num) + (void)check_mkey(to_iport(ibdev, port_num), + (struct ib_mad_hdr *)smp, 0, + smp->mkey, + (__force __be32)smp->dr_slid, + smp->return_path, smp->hop_cnt); + ret = IB_MAD_RESULT_FAILURE; + return ret; + } + + switch (smp->method) { + case IB_MGMT_METHOD_GET: + switch (smp->attr_id) { + case IB_SMP_ATTR_NODE_INFO: + ret = subn_get_nodeinfo(smp, ibdev, port); + break; + default: + smp->status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_mad_hdr *)smp); + break; + } + break; + } + + return ret; +} + +static int process_perf(struct ib_device *ibdev, u8 port, + const struct ib_mad *in_mad, + struct ib_mad *out_mad) +{ + struct ib_pma_mad *pmp = (struct ib_pma_mad *)out_mad; + struct ib_class_port_info *cpi = (struct ib_class_port_info *) + &pmp->data; + int ret = IB_MAD_RESULT_FAILURE; + + *out_mad = *in_mad; + if (pmp->mad_hdr.class_version != 1) { + pmp->mad_hdr.status |= IB_SMP_UNSUP_VERSION; + ret = reply((struct ib_mad_hdr *)pmp); + return ret; + } + + switch (pmp->mad_hdr.method) { + case IB_MGMT_METHOD_GET: + switch (pmp->mad_hdr.attr_id) { + case IB_PMA_PORT_COUNTERS: + ret = pma_get_ib_portcounters(pmp, ibdev, port); + break; + case IB_PMA_PORT_COUNTERS_EXT: + ret = pma_get_ib_portcounters_ext(pmp, ibdev, port); + break; + case IB_PMA_CLASS_PORT_INFO: + cpi->capability_mask = IB_PMA_CLASS_CAP_EXT_WIDTH; + ret = reply((struct ib_mad_hdr *)pmp); + break; + default: + pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_mad_hdr *)pmp); + break; + } + break; + + case IB_MGMT_METHOD_SET: + if (pmp->mad_hdr.attr_id) { + pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_mad_hdr *)pmp); + } + break; + + case IB_MGMT_METHOD_TRAP: + case IB_MGMT_METHOD_GET_RESP: + /* + * The ib_mad module will call us to process responses + * before checking for other consumers. + * Just tell the caller to process it normally. + */ + ret = IB_MAD_RESULT_SUCCESS; + break; + + default: + pmp->mad_hdr.status |= IB_SMP_UNSUP_METHOD; + ret = reply((struct ib_mad_hdr *)pmp); + break; + } + + return ret; +} + +static int process_perf_opa(struct ib_device *ibdev, u8 port, + const struct opa_mad *in_mad, + struct opa_mad *out_mad, u32 *resp_len) +{ + struct opa_pma_mad *pmp = (struct opa_pma_mad *)out_mad; + int ret; + + *out_mad = *in_mad; + + if (pmp->mad_hdr.class_version != OPA_SMI_CLASS_VERSION) { + pmp->mad_hdr.status |= IB_SMP_UNSUP_VERSION; + return reply((struct ib_mad_hdr *)pmp); + } + + *resp_len = sizeof(pmp->mad_hdr); + + switch (pmp->mad_hdr.method) { + case IB_MGMT_METHOD_GET: + switch (pmp->mad_hdr.attr_id) { + case IB_PMA_CLASS_PORT_INFO: + ret = pma_get_opa_classportinfo(pmp, ibdev, resp_len); + break; + case OPA_PM_ATTRIB_ID_PORT_STATUS: + ret = pma_get_opa_portstatus(pmp, ibdev, port, + resp_len); + break; + case OPA_PM_ATTRIB_ID_DATA_PORT_COUNTERS: + ret = pma_get_opa_datacounters(pmp, ibdev, port, + resp_len); + break; + case OPA_PM_ATTRIB_ID_ERROR_PORT_COUNTERS: + ret = pma_get_opa_porterrors(pmp, ibdev, port, + resp_len); + break; + case OPA_PM_ATTRIB_ID_ERROR_INFO: + ret = pma_get_opa_errorinfo(pmp, ibdev, port, + resp_len); + break; + default: + pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_mad_hdr *)pmp); + break; + } + break; + + case IB_MGMT_METHOD_SET: + switch (pmp->mad_hdr.attr_id) { + case OPA_PM_ATTRIB_ID_CLEAR_PORT_STATUS: + ret = pma_set_opa_portstatus(pmp, ibdev, port, + resp_len); + break; + case OPA_PM_ATTRIB_ID_ERROR_INFO: + ret = pma_set_opa_errorinfo(pmp, ibdev, port, + resp_len); + break; + default: + pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR; + ret = reply((struct ib_mad_hdr *)pmp); + break; + } + break; + + case IB_MGMT_METHOD_TRAP: + case IB_MGMT_METHOD_GET_RESP: + /* + * The ib_mad module will call us to process responses + * before checking for other consumers. + * Just tell the caller to process it normally. + */ + ret = IB_MAD_RESULT_SUCCESS; + break; + + default: + pmp->mad_hdr.status |= IB_SMP_UNSUP_METHOD; + ret = reply((struct ib_mad_hdr *)pmp); + break; + } + + return ret; +} + +static int hfi1_process_opa_mad(struct ib_device *ibdev, int mad_flags, + u8 port, const struct ib_wc *in_wc, + const struct ib_grh *in_grh, + const struct opa_mad *in_mad, + struct opa_mad *out_mad, size_t *out_mad_size, + u16 *out_mad_pkey_index) +{ + int ret; + int pkey_idx; + u32 resp_len = 0; + struct hfi1_ibport *ibp = to_iport(ibdev, port); + + pkey_idx = hfi1_lookup_pkey_idx(ibp, LIM_MGMT_P_KEY); + if (pkey_idx < 0) { + pr_warn("failed to find limited mgmt pkey, defaulting 0x%x\n", + hfi1_get_pkey(ibp, 1)); + pkey_idx = 1; + } + *out_mad_pkey_index = (u16)pkey_idx; + + switch (in_mad->mad_hdr.mgmt_class) { + case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: + case IB_MGMT_CLASS_SUBN_LID_ROUTED: + if (is_local_mad(ibp, in_mad, in_wc)) { + ret = opa_local_smp_check(ibp, in_wc); + if (ret) + return IB_MAD_RESULT_FAILURE; + } + ret = process_subn_opa(ibdev, mad_flags, port, in_mad, + out_mad, &resp_len); + goto bail; + case IB_MGMT_CLASS_PERF_MGMT: + ret = process_perf_opa(ibdev, port, in_mad, out_mad, + &resp_len); + goto bail; + + default: + ret = IB_MAD_RESULT_SUCCESS; + } + +bail: + if (ret & IB_MAD_RESULT_REPLY) + *out_mad_size = round_up(resp_len, 8); + else if (ret & IB_MAD_RESULT_SUCCESS) + *out_mad_size = in_wc->byte_len - sizeof(struct ib_grh); + + return ret; +} + +static int hfi1_process_ib_mad(struct ib_device *ibdev, int mad_flags, u8 port, + const struct ib_wc *in_wc, + const struct ib_grh *in_grh, + const struct ib_mad *in_mad, + struct ib_mad *out_mad) +{ + int ret; + + switch (in_mad->mad_hdr.mgmt_class) { + case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE: + case IB_MGMT_CLASS_SUBN_LID_ROUTED: + ret = process_subn(ibdev, mad_flags, port, in_mad, out_mad); + break; + case IB_MGMT_CLASS_PERF_MGMT: + ret = process_perf(ibdev, port, in_mad, out_mad); + break; + default: + ret = IB_MAD_RESULT_SUCCESS; + break; + } + + return ret; +} + +/** + * hfi1_process_mad - process an incoming MAD packet + * @ibdev: the infiniband device this packet came in on + * @mad_flags: MAD flags + * @port: the port number this packet came in on + * @in_wc: the work completion entry for this packet + * @in_grh: the global route header for this packet + * @in_mad: the incoming MAD + * @out_mad: any outgoing MAD reply + * + * Returns IB_MAD_RESULT_SUCCESS if this is a MAD that we are not + * interested in processing. + * + * Note that the verbs framework has already done the MAD sanity checks, + * and hop count/pointer updating for IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE + * MADs. + * + * This is called by the ib_mad module. + */ +int hfi1_process_mad(struct ib_device *ibdev, int mad_flags, u8 port, + const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const struct ib_mad_hdr *in_mad, size_t in_mad_size, + struct ib_mad_hdr *out_mad, size_t *out_mad_size, + u16 *out_mad_pkey_index) +{ + switch (in_mad->base_version) { + case OPA_MGMT_BASE_VERSION: + if (unlikely(in_mad_size != sizeof(struct opa_mad))) { + dev_err(ibdev->dma_device, "invalid in_mad_size\n"); + return IB_MAD_RESULT_FAILURE; + } + return hfi1_process_opa_mad(ibdev, mad_flags, port, + in_wc, in_grh, + (struct opa_mad *)in_mad, + (struct opa_mad *)out_mad, + out_mad_size, + out_mad_pkey_index); + case IB_MGMT_BASE_VERSION: + return hfi1_process_ib_mad(ibdev, mad_flags, port, + in_wc, in_grh, + (const struct ib_mad *)in_mad, + (struct ib_mad *)out_mad); + default: + break; + } + + return IB_MAD_RESULT_FAILURE; +} diff --git a/drivers/infiniband/hw/hfi1/mad.h b/drivers/infiniband/hw/hfi1/mad.h new file mode 100644 index 000000000..8b734aaae --- /dev/null +++ b/drivers/infiniband/hw/hfi1/mad.h @@ -0,0 +1,439 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#ifndef _HFI1_MAD_H +#define _HFI1_MAD_H + +#include <rdma/ib_pma.h> +#define USE_PI_LED_ENABLE 1 /* + * use led enabled bit in struct + * opa_port_states, if available + */ +#include <rdma/opa_smi.h> +#include <rdma/opa_port_info.h> +#ifndef PI_LED_ENABLE_SUP +#define PI_LED_ENABLE_SUP 0 +#endif +#include "opa_compat.h" + +/* + * OPA Traps + */ +#define OPA_TRAP_GID_NOW_IN_SERVICE cpu_to_be16(64) +#define OPA_TRAP_GID_OUT_OF_SERVICE cpu_to_be16(65) +#define OPA_TRAP_ADD_MULTICAST_GROUP cpu_to_be16(66) +#define OPA_TRAL_DEL_MULTICAST_GROUP cpu_to_be16(67) +#define OPA_TRAP_UNPATH cpu_to_be16(68) +#define OPA_TRAP_REPATH cpu_to_be16(69) +#define OPA_TRAP_PORT_CHANGE_STATE cpu_to_be16(128) +#define OPA_TRAP_LINK_INTEGRITY cpu_to_be16(129) +#define OPA_TRAP_EXCESSIVE_BUFFER_OVERRUN cpu_to_be16(130) +#define OPA_TRAP_FLOW_WATCHDOG cpu_to_be16(131) +#define OPA_TRAP_CHANGE_CAPABILITY cpu_to_be16(144) +#define OPA_TRAP_CHANGE_SYSGUID cpu_to_be16(145) +#define OPA_TRAP_BAD_M_KEY cpu_to_be16(256) +#define OPA_TRAP_BAD_P_KEY cpu_to_be16(257) +#define OPA_TRAP_BAD_Q_KEY cpu_to_be16(258) +#define OPA_TRAP_SWITCH_BAD_PKEY cpu_to_be16(259) +#define OPA_SMA_TRAP_DATA_LINK_WIDTH cpu_to_be16(2048) + +/* + * Generic trap/notice other local changes flags (trap 144). + */ +#define OPA_NOTICE_TRAP_LWDE_CHG 0x08 /* Link Width Downgrade Enable + * changed + */ +#define OPA_NOTICE_TRAP_LSE_CHG 0x04 /* Link Speed Enable changed */ +#define OPA_NOTICE_TRAP_LWE_CHG 0x02 /* Link Width Enable changed */ +#define OPA_NOTICE_TRAP_NODE_DESC_CHG 0x01 + +struct opa_mad_notice_attr { + u8 generic_type; + u8 prod_type_msb; + __be16 prod_type_lsb; + __be16 trap_num; + __be16 toggle_count; + __be32 issuer_lid; + __be32 reserved1; + union ib_gid issuer_gid; + + union { + struct { + u8 details[64]; + } raw_data; + + struct { + union ib_gid gid; + } __packed ntc_64_65_66_67; + + struct { + __be32 lid; + } __packed ntc_128; + + struct { + __be32 lid; /* where violation happened */ + u8 port_num; /* where violation happened */ + } __packed ntc_129_130_131; + + struct { + __be32 lid; /* LID where change occurred */ + __be32 new_cap_mask; /* new capability mask */ + __be16 reserved2; + __be16 cap_mask; + __be16 change_flags; /* low 4 bits only */ + } __packed ntc_144; + + struct { + __be64 new_sys_guid; + __be32 lid; /* lid where sys guid changed */ + } __packed ntc_145; + + struct { + __be32 lid; + __be32 dr_slid; + u8 method; + u8 dr_trunc_hop; + __be16 attr_id; + __be32 attr_mod; + __be64 mkey; + u8 dr_rtn_path[30]; + } __packed ntc_256; + + struct { + __be32 lid1; + __be32 lid2; + __be32 key; + u8 sl; /* SL: high 5 bits */ + u8 reserved3[3]; + union ib_gid gid1; + union ib_gid gid2; + __be32 qp1; /* high 8 bits reserved */ + __be32 qp2; /* high 8 bits reserved */ + } __packed ntc_257_258; + + struct { + __be16 flags; /* low 8 bits reserved */ + __be16 pkey; + __be32 lid1; + __be32 lid2; + u8 sl; /* SL: high 5 bits */ + u8 reserved4[3]; + union ib_gid gid1; + union ib_gid gid2; + __be32 qp1; /* high 8 bits reserved */ + __be32 qp2; /* high 8 bits reserved */ + } __packed ntc_259; + + struct { + __be32 lid; + } __packed ntc_2048; + + }; + u8 class_data[0]; +}; + +#define IB_VLARB_LOWPRI_0_31 1 +#define IB_VLARB_LOWPRI_32_63 2 +#define IB_VLARB_HIGHPRI_0_31 3 +#define IB_VLARB_HIGHPRI_32_63 4 + +#define OPA_MAX_PREEMPT_CAP 32 +#define OPA_VLARB_LOW_ELEMENTS 0 +#define OPA_VLARB_HIGH_ELEMENTS 1 +#define OPA_VLARB_PREEMPT_ELEMENTS 2 +#define OPA_VLARB_PREEMPT_MATRIX 3 + +#define IB_PMA_PORT_COUNTERS_CONG cpu_to_be16(0xFF00) + +struct ib_pma_portcounters_cong { + u8 reserved; + u8 reserved1; + __be16 port_check_rate; + __be16 symbol_error_counter; + u8 link_error_recovery_counter; + u8 link_downed_counter; + __be16 port_rcv_errors; + __be16 port_rcv_remphys_errors; + __be16 port_rcv_switch_relay_errors; + __be16 port_xmit_discards; + u8 port_xmit_constraint_errors; + u8 port_rcv_constraint_errors; + u8 reserved2; + u8 link_overrun_errors; /* LocalLink: 7:4, BufferOverrun: 3:0 */ + __be16 reserved3; + __be16 vl15_dropped; + __be64 port_xmit_data; + __be64 port_rcv_data; + __be64 port_xmit_packets; + __be64 port_rcv_packets; + __be64 port_xmit_wait; + __be64 port_adr_events; +} __packed; + +#define IB_SMP_UNSUP_VERSION cpu_to_be16(0x0004) +#define IB_SMP_UNSUP_METHOD cpu_to_be16(0x0008) +#define IB_SMP_UNSUP_METH_ATTR cpu_to_be16(0x000C) +#define IB_SMP_INVALID_FIELD cpu_to_be16(0x001C) + +#define OPA_MAX_PREEMPT_CAP 32 +#define OPA_VLARB_LOW_ELEMENTS 0 +#define OPA_VLARB_HIGH_ELEMENTS 1 +#define OPA_VLARB_PREEMPT_ELEMENTS 2 +#define OPA_VLARB_PREEMPT_MATRIX 3 + +#define HFI1_XMIT_RATE_UNSUPPORTED 0x0 +#define HFI1_XMIT_RATE_PICO 0x7 +/* number of 4nsec cycles equaling 2secs */ +#define HFI1_CONG_TIMER_PSINTERVAL 0x1DCD64EC + +#define IB_CC_SVCTYPE_RC 0x0 +#define IB_CC_SVCTYPE_UC 0x1 +#define IB_CC_SVCTYPE_RD 0x2 +#define IB_CC_SVCTYPE_UD 0x3 + +/* + * There should be an equivalent IB #define for the following, but + * I cannot find it. + */ +#define OPA_CC_LOG_TYPE_HFI 2 + +struct opa_hfi1_cong_log_event_internal { + u32 lqpn; + u32 rqpn; + u8 sl; + u8 svc_type; + u32 rlid; + s64 timestamp; /* wider than 32 bits to detect 32 bit rollover */ +}; + +struct opa_hfi1_cong_log_event { + u8 local_qp_cn_entry[3]; + u8 remote_qp_number_cn_entry[3]; + u8 sl_svc_type_cn_entry; /* 5 bits SL, 3 bits svc type */ + u8 reserved; + __be32 remote_lid_cn_entry; + __be32 timestamp_cn_entry; +} __packed; + +#define OPA_CONG_LOG_ELEMS 96 + +struct opa_hfi1_cong_log { + u8 log_type; + u8 congestion_flags; + __be16 threshold_event_counter; + __be32 current_time_stamp; + u8 threshold_cong_event_map[OPA_MAX_SLS / 8]; + struct opa_hfi1_cong_log_event events[OPA_CONG_LOG_ELEMS]; +} __packed; + +#define IB_CC_TABLE_CAP_DEFAULT 31 + +/* Port control flags */ +#define IB_CC_CCS_PC_SL_BASED 0x01 + +struct opa_congestion_setting_entry { + u8 ccti_increase; + u8 reserved; + __be16 ccti_timer; + u8 trigger_threshold; + u8 ccti_min; /* min CCTI for cc table */ +} __packed; + +struct opa_congestion_setting_entry_shadow { + u8 ccti_increase; + u8 reserved; + u16 ccti_timer; + u8 trigger_threshold; + u8 ccti_min; /* min CCTI for cc table */ +} __packed; + +struct opa_congestion_setting_attr { + __be32 control_map; + __be16 port_control; + struct opa_congestion_setting_entry entries[OPA_MAX_SLS]; +} __packed; + +struct opa_congestion_setting_attr_shadow { + u32 control_map; + u16 port_control; + struct opa_congestion_setting_entry_shadow entries[OPA_MAX_SLS]; +} __packed; + +#define IB_CC_TABLE_ENTRY_INCREASE_DEFAULT 1 +#define IB_CC_TABLE_ENTRY_TIMER_DEFAULT 1 + +/* 64 Congestion Control table entries in a single MAD */ +#define IB_CCT_ENTRIES 64 +#define IB_CCT_MIN_ENTRIES (IB_CCT_ENTRIES * 2) + +struct ib_cc_table_entry { + __be16 entry; /* shift:2, multiplier:14 */ +}; + +struct ib_cc_table_entry_shadow { + u16 entry; /* shift:2, multiplier:14 */ +}; + +struct ib_cc_table_attr { + __be16 ccti_limit; /* max CCTI for cc table */ + struct ib_cc_table_entry ccti_entries[IB_CCT_ENTRIES]; +} __packed; + +struct ib_cc_table_attr_shadow { + u16 ccti_limit; /* max CCTI for cc table */ + struct ib_cc_table_entry_shadow ccti_entries[IB_CCT_ENTRIES]; +} __packed; + +#define CC_TABLE_SHADOW_MAX \ + (IB_CC_TABLE_CAP_DEFAULT * IB_CCT_ENTRIES) + +struct cc_table_shadow { + u16 ccti_limit; /* max CCTI for cc table */ + struct ib_cc_table_entry_shadow entries[CC_TABLE_SHADOW_MAX]; +} __packed; + +/* + * struct cc_state combines the (active) per-port congestion control + * table, and the (active) per-SL congestion settings. cc_state data + * may need to be read in code paths that we want to be fast, so it + * is an RCU protected structure. + */ +struct cc_state { + struct rcu_head rcu; + struct cc_table_shadow cct; + struct opa_congestion_setting_attr_shadow cong_setting; +}; + +/* + * OPA BufferControl MAD + */ + +/* attribute modifier macros */ +#define OPA_AM_NPORT_SHIFT 24 +#define OPA_AM_NPORT_MASK 0xff +#define OPA_AM_NPORT_SMASK (OPA_AM_NPORT_MASK << OPA_AM_NPORT_SHIFT) +#define OPA_AM_NPORT(am) (((am) >> OPA_AM_NPORT_SHIFT) & \ + OPA_AM_NPORT_MASK) + +#define OPA_AM_NBLK_SHIFT 24 +#define OPA_AM_NBLK_MASK 0xff +#define OPA_AM_NBLK_SMASK (OPA_AM_NBLK_MASK << OPA_AM_NBLK_SHIFT) +#define OPA_AM_NBLK(am) (((am) >> OPA_AM_NBLK_SHIFT) & \ + OPA_AM_NBLK_MASK) + +#define OPA_AM_START_BLK_SHIFT 0 +#define OPA_AM_START_BLK_MASK 0xff +#define OPA_AM_START_BLK_SMASK (OPA_AM_START_BLK_MASK << \ + OPA_AM_START_BLK_SHIFT) +#define OPA_AM_START_BLK(am) (((am) >> OPA_AM_START_BLK_SHIFT) & \ + OPA_AM_START_BLK_MASK) + +#define OPA_AM_PORTNUM_SHIFT 0 +#define OPA_AM_PORTNUM_MASK 0xff +#define OPA_AM_PORTNUM_SMASK (OPA_AM_PORTNUM_MASK << OPA_AM_PORTNUM_SHIFT) +#define OPA_AM_PORTNUM(am) (((am) >> OPA_AM_PORTNUM_SHIFT) & \ + OPA_AM_PORTNUM_MASK) + +#define OPA_AM_ASYNC_SHIFT 12 +#define OPA_AM_ASYNC_MASK 0x1 +#define OPA_AM_ASYNC_SMASK (OPA_AM_ASYNC_MASK << OPA_AM_ASYNC_SHIFT) +#define OPA_AM_ASYNC(am) (((am) >> OPA_AM_ASYNC_SHIFT) & \ + OPA_AM_ASYNC_MASK) + +#define OPA_AM_START_SM_CFG_SHIFT 9 +#define OPA_AM_START_SM_CFG_MASK 0x1 +#define OPA_AM_START_SM_CFG_SMASK (OPA_AM_START_SM_CFG_MASK << \ + OPA_AM_START_SM_CFG_SHIFT) +#define OPA_AM_START_SM_CFG(am) (((am) >> OPA_AM_START_SM_CFG_SHIFT) \ + & OPA_AM_START_SM_CFG_MASK) + +#define OPA_AM_CI_ADDR_SHIFT 19 +#define OPA_AM_CI_ADDR_MASK 0xfff +#define OPA_AM_CI_ADDR_SMASK (OPA_AM_CI_ADDR_MASK << OPA_CI_ADDR_SHIFT) +#define OPA_AM_CI_ADDR(am) (((am) >> OPA_AM_CI_ADDR_SHIFT) & \ + OPA_AM_CI_ADDR_MASK) + +#define OPA_AM_CI_LEN_SHIFT 13 +#define OPA_AM_CI_LEN_MASK 0x3f +#define OPA_AM_CI_LEN_SMASK (OPA_AM_CI_LEN_MASK << OPA_CI_LEN_SHIFT) +#define OPA_AM_CI_LEN(am) (((am) >> OPA_AM_CI_LEN_SHIFT) & \ + OPA_AM_CI_LEN_MASK) + +/* error info macros */ +#define OPA_EI_STATUS_SMASK 0x80 +#define OPA_EI_CODE_SMASK 0x0f + +struct vl_limit { + __be16 dedicated; + __be16 shared; +}; + +struct buffer_control { + __be16 reserved; + __be16 overall_shared_limit; + struct vl_limit vl[OPA_MAX_VLS]; +}; + +struct sc2vlnt { + u8 vlnt[32]; /* 5 bit VL, 3 bits reserved */ +}; + +/* + * The PortSamplesControl.CounterMasks field is an array of 3 bit fields + * which specify the N'th counter's capabilities. See ch. 16.1.3.2. + * We support 5 counters which only count the mandatory quantities. + */ +#define COUNTER_MASK(q, n) (q << ((9 - n) * 3)) +#define COUNTER_MASK0_9 \ + cpu_to_be32(COUNTER_MASK(1, 0) | \ + COUNTER_MASK(1, 1) | \ + COUNTER_MASK(1, 2) | \ + COUNTER_MASK(1, 3) | \ + COUNTER_MASK(1, 4)) + +void hfi1_event_pkey_change(struct hfi1_devdata *dd, u8 port); + +#endif /* _HFI1_MAD_H */ diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c new file mode 100644 index 000000000..b7a80aa1a --- /dev/null +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c @@ -0,0 +1,325 @@ +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#include <linux/list.h> +#include <linux/rculist.h> +#include <linux/mmu_notifier.h> +#include <linux/interval_tree_generic.h> + +#include "mmu_rb.h" +#include "trace.h" + +struct mmu_rb_handler { + struct list_head list; + struct mmu_notifier mn; + struct rb_root *root; + spinlock_t lock; /* protect the RB tree */ + struct mmu_rb_ops *ops; +}; + +static LIST_HEAD(mmu_rb_handlers); +static DEFINE_SPINLOCK(mmu_rb_lock); /* protect mmu_rb_handlers list */ + +static unsigned long mmu_node_start(struct mmu_rb_node *); +static unsigned long mmu_node_last(struct mmu_rb_node *); +static struct mmu_rb_handler *find_mmu_handler(struct rb_root *); +static inline void mmu_notifier_page(struct mmu_notifier *, struct mm_struct *, + unsigned long); +static inline void mmu_notifier_range_start(struct mmu_notifier *, + struct mm_struct *, + unsigned long, unsigned long); +static void mmu_notifier_mem_invalidate(struct mmu_notifier *, + struct mm_struct *, + unsigned long, unsigned long); +static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *, + unsigned long, unsigned long); + +static struct mmu_notifier_ops mn_opts = { + .invalidate_page = mmu_notifier_page, + .invalidate_range_start = mmu_notifier_range_start, +}; + +INTERVAL_TREE_DEFINE(struct mmu_rb_node, node, unsigned long, __last, + mmu_node_start, mmu_node_last, static, __mmu_int_rb); + +static unsigned long mmu_node_start(struct mmu_rb_node *node) +{ + return node->addr & PAGE_MASK; +} + +static unsigned long mmu_node_last(struct mmu_rb_node *node) +{ + return PAGE_ALIGN(node->addr + node->len) - 1; +} + +int hfi1_mmu_rb_register(struct rb_root *root, struct mmu_rb_ops *ops) +{ + struct mmu_rb_handler *handlr; + + if (!ops->invalidate) + return -EINVAL; + + handlr = kmalloc(sizeof(*handlr), GFP_KERNEL); + if (!handlr) + return -ENOMEM; + + handlr->root = root; + handlr->ops = ops; + INIT_HLIST_NODE(&handlr->mn.hlist); + spin_lock_init(&handlr->lock); + handlr->mn.ops = &mn_opts; + spin_lock(&mmu_rb_lock); + list_add_tail_rcu(&handlr->list, &mmu_rb_handlers); + spin_unlock(&mmu_rb_lock); + + return mmu_notifier_register(&handlr->mn, current->mm); +} + +void hfi1_mmu_rb_unregister(struct rb_root *root) +{ + struct mmu_rb_handler *handler = find_mmu_handler(root); + unsigned long flags; + + if (!handler) + return; + + /* Unregister first so we don't get any more notifications. */ + if (current->mm) + mmu_notifier_unregister(&handler->mn, current->mm); + + spin_lock(&mmu_rb_lock); + list_del_rcu(&handler->list); + spin_unlock(&mmu_rb_lock); + synchronize_rcu(); + + spin_lock_irqsave(&handler->lock, flags); + if (!RB_EMPTY_ROOT(root)) { + struct rb_node *node; + struct mmu_rb_node *rbnode; + + while ((node = rb_first(root))) { + rbnode = rb_entry(node, struct mmu_rb_node, node); + rb_erase(node, root); + if (handler->ops->remove) + handler->ops->remove(root, rbnode, NULL); + } + } + spin_unlock_irqrestore(&handler->lock, flags); + + kfree(handler); +} + +int hfi1_mmu_rb_insert(struct rb_root *root, struct mmu_rb_node *mnode) +{ + struct mmu_rb_handler *handler = find_mmu_handler(root); + struct mmu_rb_node *node; + unsigned long flags; + int ret = 0; + + if (!handler) + return -EINVAL; + + spin_lock_irqsave(&handler->lock, flags); + hfi1_cdbg(MMU, "Inserting node addr 0x%llx, len %u", mnode->addr, + mnode->len); + node = __mmu_rb_search(handler, mnode->addr, mnode->len); + if (node) { + ret = -EINVAL; + goto unlock; + } + __mmu_int_rb_insert(mnode, root); + + if (handler->ops->insert) { + ret = handler->ops->insert(root, mnode); + if (ret) + __mmu_int_rb_remove(mnode, root); + } +unlock: + spin_unlock_irqrestore(&handler->lock, flags); + return ret; +} + +/* Caller must hold handler lock */ +static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *handler, + unsigned long addr, + unsigned long len) +{ + struct mmu_rb_node *node = NULL; + + hfi1_cdbg(MMU, "Searching for addr 0x%llx, len %u", addr, len); + if (!handler->ops->filter) { + node = __mmu_int_rb_iter_first(handler->root, addr, + (addr + len) - 1); + } else { + for (node = __mmu_int_rb_iter_first(handler->root, addr, + (addr + len) - 1); + node; + node = __mmu_int_rb_iter_next(node, addr, + (addr + len) - 1)) { + if (handler->ops->filter(node, addr, len)) + return node; + } + } + return node; +} + +/* Caller must *not* hold handler lock. */ +static void __mmu_rb_remove(struct mmu_rb_handler *handler, + struct mmu_rb_node *node, struct mm_struct *mm) +{ + unsigned long flags; + + /* Validity of handler and node pointers has been checked by caller. */ + hfi1_cdbg(MMU, "Removing node addr 0x%llx, len %u", node->addr, + node->len); + spin_lock_irqsave(&handler->lock, flags); + __mmu_int_rb_remove(node, handler->root); + spin_unlock_irqrestore(&handler->lock, flags); + + if (handler->ops->remove) + handler->ops->remove(handler->root, node, mm); +} + +struct mmu_rb_node *hfi1_mmu_rb_search(struct rb_root *root, unsigned long addr, + unsigned long len) +{ + struct mmu_rb_handler *handler = find_mmu_handler(root); + struct mmu_rb_node *node; + unsigned long flags; + + if (!handler) + return ERR_PTR(-EINVAL); + + spin_lock_irqsave(&handler->lock, flags); + node = __mmu_rb_search(handler, addr, len); + spin_unlock_irqrestore(&handler->lock, flags); + + return node; +} + +struct mmu_rb_node *hfi1_mmu_rb_extract(struct rb_root *root, + unsigned long addr, unsigned long len) +{ + struct mmu_rb_handler *handler = find_mmu_handler(root); + struct mmu_rb_node *node; + unsigned long flags; + + if (!handler) + return ERR_PTR(-EINVAL); + + spin_lock_irqsave(&handler->lock, flags); + node = __mmu_rb_search(handler, addr, len); + if (node) + __mmu_int_rb_remove(node, handler->root); + spin_unlock_irqrestore(&handler->lock, flags); + + return node; +} + +void hfi1_mmu_rb_remove(struct rb_root *root, struct mmu_rb_node *node) +{ + struct mmu_rb_handler *handler = find_mmu_handler(root); + + if (!handler || !node) + return; + + __mmu_rb_remove(handler, node, NULL); +} + +static struct mmu_rb_handler *find_mmu_handler(struct rb_root *root) +{ + struct mmu_rb_handler *handler; + + rcu_read_lock(); + list_for_each_entry_rcu(handler, &mmu_rb_handlers, list) { + if (handler->root == root) + goto unlock; + } + handler = NULL; +unlock: + rcu_read_unlock(); + return handler; +} + +static inline void mmu_notifier_page(struct mmu_notifier *mn, + struct mm_struct *mm, unsigned long addr) +{ + mmu_notifier_mem_invalidate(mn, mm, addr, addr + PAGE_SIZE); +} + +static inline void mmu_notifier_range_start(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + mmu_notifier_mem_invalidate(mn, mm, start, end); +} + +static void mmu_notifier_mem_invalidate(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + struct mmu_rb_handler *handler = + container_of(mn, struct mmu_rb_handler, mn); + struct rb_root *root = handler->root; + struct mmu_rb_node *node, *ptr = NULL; + unsigned long flags; + + spin_lock_irqsave(&handler->lock, flags); + for (node = __mmu_int_rb_iter_first(root, start, end - 1); + node; node = ptr) { + /* Guard against node removal. */ + ptr = __mmu_int_rb_iter_next(node, start, end - 1); + hfi1_cdbg(MMU, "Invalidating node addr 0x%llx, len %u", + node->addr, node->len); + if (handler->ops->invalidate(root, node)) { + __mmu_int_rb_remove(node, root); + if (handler->ops->remove) + handler->ops->remove(root, node, mm); + } + } + spin_unlock_irqrestore(&handler->lock, flags); +} diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.h b/drivers/infiniband/hw/hfi1/mmu_rb.h new file mode 100644 index 000000000..7a57b9c49 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/mmu_rb.h @@ -0,0 +1,76 @@ +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#ifndef _HFI1_MMU_RB_H +#define _HFI1_MMU_RB_H + +#include "hfi.h" + +struct mmu_rb_node { + unsigned long addr; + unsigned long len; + unsigned long __last; + struct rb_node node; +}; + +struct mmu_rb_ops { + bool (*filter)(struct mmu_rb_node *, unsigned long, unsigned long); + int (*insert)(struct rb_root *, struct mmu_rb_node *); + void (*remove)(struct rb_root *, struct mmu_rb_node *, + struct mm_struct *); + int (*invalidate)(struct rb_root *, struct mmu_rb_node *); +}; + +int hfi1_mmu_rb_register(struct rb_root *root, struct mmu_rb_ops *ops); +void hfi1_mmu_rb_unregister(struct rb_root *); +int hfi1_mmu_rb_insert(struct rb_root *, struct mmu_rb_node *); +void hfi1_mmu_rb_remove(struct rb_root *, struct mmu_rb_node *); +struct mmu_rb_node *hfi1_mmu_rb_search(struct rb_root *, unsigned long, + unsigned long); +struct mmu_rb_node *hfi1_mmu_rb_extract(struct rb_root *, unsigned long, + unsigned long); + +#endif /* _HFI1_MMU_RB_H */ diff --git a/drivers/infiniband/hw/hfi1/opa_compat.h b/drivers/infiniband/hw/hfi1/opa_compat.h new file mode 100644 index 000000000..6ef3c1cbd --- /dev/null +++ b/drivers/infiniband/hw/hfi1/opa_compat.h @@ -0,0 +1,111 @@ +#ifndef _LINUX_H +#define _LINUX_H +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * This header file is for OPA-specific definitions which are + * required by the HFI driver, and which aren't yet in the Linux + * IB core. We'll collect these all here, then merge them into + * the kernel when that's convenient. + */ + +/* OPA SMA attribute IDs */ +#define OPA_ATTRIB_ID_CONGESTION_INFO cpu_to_be16(0x008b) +#define OPA_ATTRIB_ID_HFI_CONGESTION_LOG cpu_to_be16(0x008f) +#define OPA_ATTRIB_ID_HFI_CONGESTION_SETTING cpu_to_be16(0x0090) +#define OPA_ATTRIB_ID_CONGESTION_CONTROL_TABLE cpu_to_be16(0x0091) + +/* OPA PMA attribute IDs */ +#define OPA_PM_ATTRIB_ID_PORT_STATUS cpu_to_be16(0x0040) +#define OPA_PM_ATTRIB_ID_CLEAR_PORT_STATUS cpu_to_be16(0x0041) +#define OPA_PM_ATTRIB_ID_DATA_PORT_COUNTERS cpu_to_be16(0x0042) +#define OPA_PM_ATTRIB_ID_ERROR_PORT_COUNTERS cpu_to_be16(0x0043) +#define OPA_PM_ATTRIB_ID_ERROR_INFO cpu_to_be16(0x0044) + +/* OPA status codes */ +#define OPA_PM_STATUS_REQUEST_TOO_LARGE cpu_to_be16(0x100) + +static inline u8 port_states_to_logical_state(struct opa_port_states *ps) +{ + return ps->portphysstate_portstate & OPA_PI_MASK_PORT_STATE; +} + +static inline u8 port_states_to_phys_state(struct opa_port_states *ps) +{ + return ((ps->portphysstate_portstate & + OPA_PI_MASK_PORT_PHYSICAL_STATE) >> 4) & 0xf; +} + +/* + * OPA port physical states + * IB Volume 1, Table 146 PortInfo/IB Volume 2 Section 5.4.2(1) PortPhysState + * values. + * + * When writing, only values 0-3 are valid, other values are ignored. + * When reading, 0 is reserved. + * + * Returned by the ibphys_portstate() routine. + */ +enum opa_port_phys_state { + IB_PORTPHYSSTATE_NOP = 0, + /* 1 is reserved */ + IB_PORTPHYSSTATE_POLLING = 2, + IB_PORTPHYSSTATE_DISABLED = 3, + IB_PORTPHYSSTATE_TRAINING = 4, + IB_PORTPHYSSTATE_LINKUP = 5, + IB_PORTPHYSSTATE_LINK_ERROR_RECOVERY = 6, + IB_PORTPHYSSTATE_PHY_TEST = 7, + /* 8 is reserved */ + OPA_PORTPHYSSTATE_OFFLINE = 9, + OPA_PORTPHYSSTATE_GANGED = 10, + OPA_PORTPHYSSTATE_TEST = 11, + OPA_PORTPHYSSTATE_MAX = 11, + /* values 12-15 are reserved/ignored */ +}; + +#endif /* _LINUX_H */ diff --git a/drivers/infiniband/hw/hfi1/pcie.c b/drivers/infiniband/hw/hfi1/pcie.c new file mode 100644 index 000000000..0bac21e6a --- /dev/null +++ b/drivers/infiniband/hw/hfi1/pcie.c @@ -0,0 +1,1338 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <linux/pci.h> +#include <linux/io.h> +#include <linux/delay.h> +#include <linux/vmalloc.h> +#include <linux/aer.h> +#include <linux/module.h> + +#include "hfi.h" +#include "chip_registers.h" +#include "aspm.h" + +/* link speed vector for Gen3 speed - not in Linux headers */ +#define GEN1_SPEED_VECTOR 0x1 +#define GEN2_SPEED_VECTOR 0x2 +#define GEN3_SPEED_VECTOR 0x3 + +/* + * This file contains PCIe utility routines. + */ + +/* + * Code to adjust PCIe capabilities. + */ +static void tune_pcie_caps(struct hfi1_devdata *); + +/* + * Do all the common PCIe setup and initialization. + * devdata is not yet allocated, and is not allocated until after this + * routine returns success. Therefore dd_dev_err() can't be used for error + * printing. + */ +int hfi1_pcie_init(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + int ret; + + ret = pci_enable_device(pdev); + if (ret) { + /* + * This can happen (in theory) iff: + * We did a chip reset, and then failed to reprogram the + * BAR, or the chip reset due to an internal error. We then + * unloaded the driver and reloaded it. + * + * Both reset cases set the BAR back to initial state. For + * the latter case, the AER sticky error bit at offset 0x718 + * should be set, but the Linux kernel doesn't yet know + * about that, it appears. If the original BAR was retained + * in the kernel data structures, this may be OK. + */ + hfi1_early_err(&pdev->dev, "pci enable failed: error %d\n", + -ret); + goto done; + } + + ret = pci_request_regions(pdev, DRIVER_NAME); + if (ret) { + hfi1_early_err(&pdev->dev, + "pci_request_regions fails: err %d\n", -ret); + goto bail; + } + + ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); + if (ret) { + /* + * If the 64 bit setup fails, try 32 bit. Some systems + * do not setup 64 bit maps on systems with 2GB or less + * memory installed. + */ + ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + if (ret) { + hfi1_early_err(&pdev->dev, + "Unable to set DMA mask: %d\n", ret); + goto bail; + } + ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); + } else { + ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); + } + if (ret) { + hfi1_early_err(&pdev->dev, + "Unable to set DMA consistent mask: %d\n", ret); + goto bail; + } + + pci_set_master(pdev); + (void)pci_enable_pcie_error_reporting(pdev); + goto done; + +bail: + hfi1_pcie_cleanup(pdev); +done: + return ret; +} + +/* + * Clean what was done in hfi1_pcie_init() + */ +void hfi1_pcie_cleanup(struct pci_dev *pdev) +{ + pci_disable_device(pdev); + /* + * Release regions should be called after the disable. OK to + * call if request regions has not been called or failed. + */ + pci_release_regions(pdev); +} + +/* + * Do remaining PCIe setup, once dd is allocated, and save away + * fields required to re-initialize after a chip reset, or for + * various other purposes + */ +int hfi1_pcie_ddinit(struct hfi1_devdata *dd, struct pci_dev *pdev, + const struct pci_device_id *ent) +{ + unsigned long len; + resource_size_t addr; + + dd->pcidev = pdev; + pci_set_drvdata(pdev, dd); + + addr = pci_resource_start(pdev, 0); + len = pci_resource_len(pdev, 0); + + /* + * The TXE PIO buffers are at the tail end of the chip space. + * Cut them off and map them separately. + */ + + /* sanity check vs expectations */ + if (len != TXE_PIO_SEND + TXE_PIO_SIZE) { + dd_dev_err(dd, "chip PIO range does not match\n"); + return -EINVAL; + } + + dd->kregbase = ioremap_nocache(addr, TXE_PIO_SEND); + if (!dd->kregbase) + return -ENOMEM; + + dd->piobase = ioremap_wc(addr + TXE_PIO_SEND, TXE_PIO_SIZE); + if (!dd->piobase) { + iounmap(dd->kregbase); + return -ENOMEM; + } + + dd->flags |= HFI1_PRESENT; /* now register routines work */ + + dd->kregend = dd->kregbase + TXE_PIO_SEND; + dd->physaddr = addr; /* used for io_remap, etc. */ + + /* + * Re-map the chip's RcvArray as write-combining to allow us + * to write an entire cacheline worth of entries in one shot. + * If this re-map fails, just continue - the RcvArray programming + * function will handle both cases. + */ + dd->chip_rcv_array_count = read_csr(dd, RCV_ARRAY_CNT); + dd->rcvarray_wc = ioremap_wc(addr + RCV_ARRAY, + dd->chip_rcv_array_count * 8); + dd_dev_info(dd, "WC Remapped RcvArray: %p\n", dd->rcvarray_wc); + /* + * Save BARs and command to rewrite after device reset. + */ + dd->pcibar0 = addr; + dd->pcibar1 = addr >> 32; + pci_read_config_dword(dd->pcidev, PCI_ROM_ADDRESS, &dd->pci_rom); + pci_read_config_word(dd->pcidev, PCI_COMMAND, &dd->pci_command); + pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL, &dd->pcie_devctl); + pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL, &dd->pcie_lnkctl); + pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL2, + &dd->pcie_devctl2); + pci_read_config_dword(dd->pcidev, PCI_CFG_MSIX0, &dd->pci_msix0); + pci_read_config_dword(dd->pcidev, PCIE_CFG_SPCIE1, &dd->pci_lnkctl3); + pci_read_config_dword(dd->pcidev, PCIE_CFG_TPH2, &dd->pci_tph2); + + return 0; +} + +/* + * Do PCIe cleanup related to dd, after chip-specific cleanup, etc. Just prior + * to releasing the dd memory. + * Void because all of the core pcie cleanup functions are void. + */ +void hfi1_pcie_ddcleanup(struct hfi1_devdata *dd) +{ + u64 __iomem *base = (void __iomem *)dd->kregbase; + + dd->flags &= ~HFI1_PRESENT; + dd->kregbase = NULL; + iounmap(base); + if (dd->rcvarray_wc) + iounmap(dd->rcvarray_wc); + if (dd->piobase) + iounmap(dd->piobase); +} + +/* + * Do a Function Level Reset (FLR) on the device. + * Based on static function drivers/pci/pci.c:pcie_flr(). + */ +void hfi1_pcie_flr(struct hfi1_devdata *dd) +{ + int i; + u16 status; + + /* no need to check for the capability - we know the device has it */ + + /* wait for Transaction Pending bit to clear, at most a few ms */ + for (i = 0; i < 4; i++) { + if (i) + msleep((1 << (i - 1)) * 100); + + pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVSTA, &status); + if (!(status & PCI_EXP_DEVSTA_TRPND)) + goto clear; + } + + dd_dev_err(dd, "Transaction Pending bit is not clearing, proceeding with reset anyway\n"); + +clear: + pcie_capability_set_word(dd->pcidev, PCI_EXP_DEVCTL, + PCI_EXP_DEVCTL_BCR_FLR); + /* PCIe spec requires the function to be back within 100ms */ + msleep(100); +} + +static void msix_setup(struct hfi1_devdata *dd, int pos, u32 *msixcnt, + struct hfi1_msix_entry *hfi1_msix_entry) +{ + int ret; + int nvec = *msixcnt; + struct msix_entry *msix_entry; + int i; + + /* + * We can't pass hfi1_msix_entry array to msix_setup + * so use a dummy msix_entry array and copy the allocated + * irq back to the hfi1_msix_entry array. + */ + msix_entry = kmalloc_array(nvec, sizeof(*msix_entry), GFP_KERNEL); + if (!msix_entry) { + ret = -ENOMEM; + goto do_intx; + } + + for (i = 0; i < nvec; i++) + msix_entry[i] = hfi1_msix_entry[i].msix; + + ret = pci_enable_msix_range(dd->pcidev, msix_entry, 1, nvec); + if (ret < 0) + goto free_msix_entry; + nvec = ret; + + for (i = 0; i < nvec; i++) + hfi1_msix_entry[i].msix = msix_entry[i]; + + kfree(msix_entry); + *msixcnt = nvec; + return; + +free_msix_entry: + kfree(msix_entry); + +do_intx: + dd_dev_err(dd, "pci_enable_msix_range %d vectors failed: %d, falling back to INTx\n", + nvec, ret); + *msixcnt = 0; + hfi1_enable_intx(dd->pcidev); +} + +/* return the PCIe link speed from the given link status */ +static u32 extract_speed(u16 linkstat) +{ + u32 speed; + + switch (linkstat & PCI_EXP_LNKSTA_CLS) { + default: /* not defined, assume Gen1 */ + case PCI_EXP_LNKSTA_CLS_2_5GB: + speed = 2500; /* Gen 1, 2.5GHz */ + break; + case PCI_EXP_LNKSTA_CLS_5_0GB: + speed = 5000; /* Gen 2, 5GHz */ + break; + case GEN3_SPEED_VECTOR: + speed = 8000; /* Gen 3, 8GHz */ + break; + } + return speed; +} + +/* return the PCIe link speed from the given link status */ +static u32 extract_width(u16 linkstat) +{ + return (linkstat & PCI_EXP_LNKSTA_NLW) >> PCI_EXP_LNKSTA_NLW_SHIFT; +} + +/* read the link status and set dd->{lbus_width,lbus_speed,lbus_info} */ +static void update_lbus_info(struct hfi1_devdata *dd) +{ + u16 linkstat; + + pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKSTA, &linkstat); + dd->lbus_width = extract_width(linkstat); + dd->lbus_speed = extract_speed(linkstat); + snprintf(dd->lbus_info, sizeof(dd->lbus_info), + "PCIe,%uMHz,x%u", dd->lbus_speed, dd->lbus_width); +} + +/* + * Read in the current PCIe link width and speed. Find if the link is + * Gen3 capable. + */ +int pcie_speeds(struct hfi1_devdata *dd) +{ + u32 linkcap; + struct pci_dev *parent = dd->pcidev->bus->self; + + if (!pci_is_pcie(dd->pcidev)) { + dd_dev_err(dd, "Can't find PCI Express capability!\n"); + return -EINVAL; + } + + /* find if our max speed is Gen3 and parent supports Gen3 speeds */ + dd->link_gen3_capable = 1; + + pcie_capability_read_dword(dd->pcidev, PCI_EXP_LNKCAP, &linkcap); + if ((linkcap & PCI_EXP_LNKCAP_SLS) != GEN3_SPEED_VECTOR) { + dd_dev_info(dd, + "This HFI is not Gen3 capable, max speed 0x%x, need 0x3\n", + linkcap & PCI_EXP_LNKCAP_SLS); + dd->link_gen3_capable = 0; + } + + /* + * bus->max_bus_speed is set from the bridge's linkcap Max Link Speed + */ + if (parent && dd->pcidev->bus->max_bus_speed != PCIE_SPEED_8_0GT) { + dd_dev_info(dd, "Parent PCIe bridge does not support Gen3\n"); + dd->link_gen3_capable = 0; + } + + /* obtain the link width and current speed */ + update_lbus_info(dd); + + dd_dev_info(dd, "%s\n", dd->lbus_info); + + return 0; +} + +/* + * Returns in *nent: + * - actual number of interrupts allocated + * - 0 if fell back to INTx. + */ +void request_msix(struct hfi1_devdata *dd, u32 *nent, + struct hfi1_msix_entry *entry) +{ + int pos; + + pos = dd->pcidev->msix_cap; + if (*nent && pos) { + msix_setup(dd, pos, nent, entry); + /* did it, either MSI-X or INTx */ + } else { + *nent = 0; + hfi1_enable_intx(dd->pcidev); + } + + tune_pcie_caps(dd); +} + +void hfi1_enable_intx(struct pci_dev *pdev) +{ + /* first, turn on INTx */ + pci_intx(pdev, 1); + /* then turn off MSI-X */ + pci_disable_msix(pdev); +} + +/* restore command and BARs after a reset has wiped them out */ +void restore_pci_variables(struct hfi1_devdata *dd) +{ + pci_write_config_word(dd->pcidev, PCI_COMMAND, dd->pci_command); + pci_write_config_dword(dd->pcidev, PCI_BASE_ADDRESS_0, dd->pcibar0); + pci_write_config_dword(dd->pcidev, PCI_BASE_ADDRESS_1, dd->pcibar1); + pci_write_config_dword(dd->pcidev, PCI_ROM_ADDRESS, dd->pci_rom); + pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL, dd->pcie_devctl); + pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL, dd->pcie_lnkctl); + pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL2, + dd->pcie_devctl2); + pci_write_config_dword(dd->pcidev, PCI_CFG_MSIX0, dd->pci_msix0); + pci_write_config_dword(dd->pcidev, PCIE_CFG_SPCIE1, dd->pci_lnkctl3); + pci_write_config_dword(dd->pcidev, PCIE_CFG_TPH2, dd->pci_tph2); +} + +/* + * BIOS may not set PCIe bus-utilization parameters for best performance. + * Check and optionally adjust them to maximize our throughput. + */ +static int hfi1_pcie_caps; +module_param_named(pcie_caps, hfi1_pcie_caps, int, S_IRUGO); +MODULE_PARM_DESC(pcie_caps, "Max PCIe tuning: Payload (0..3), ReadReq (4..7)"); + +uint aspm_mode = ASPM_MODE_DISABLED; +module_param_named(aspm, aspm_mode, uint, S_IRUGO); +MODULE_PARM_DESC(aspm, "PCIe ASPM: 0: disable, 1: enable, 2: dynamic"); + +static void tune_pcie_caps(struct hfi1_devdata *dd) +{ + struct pci_dev *parent; + u16 rc_mpss, rc_mps, ep_mpss, ep_mps; + u16 rc_mrrs, ep_mrrs, max_mrrs, ectl; + + /* + * Turn on extended tags in DevCtl in case the BIOS has turned it off + * to improve WFR SDMA bandwidth + */ + pcie_capability_read_word(dd->pcidev, PCI_EXP_DEVCTL, &ectl); + if (!(ectl & PCI_EXP_DEVCTL_EXT_TAG)) { + dd_dev_info(dd, "Enabling PCIe extended tags\n"); + ectl |= PCI_EXP_DEVCTL_EXT_TAG; + pcie_capability_write_word(dd->pcidev, PCI_EXP_DEVCTL, ectl); + } + /* Find out supported and configured values for parent (root) */ + parent = dd->pcidev->bus->self; + /* + * The driver cannot perform the tuning if it does not have + * access to the upstream component. + */ + if (!parent) + return; + if (!pci_is_root_bus(parent->bus)) { + dd_dev_info(dd, "Parent not root\n"); + return; + } + + if (!pci_is_pcie(parent) || !pci_is_pcie(dd->pcidev)) + return; + rc_mpss = parent->pcie_mpss; + rc_mps = ffs(pcie_get_mps(parent)) - 8; + /* Find out supported and configured values for endpoint (us) */ + ep_mpss = dd->pcidev->pcie_mpss; + ep_mps = ffs(pcie_get_mps(dd->pcidev)) - 8; + + /* Find max payload supported by root, endpoint */ + if (rc_mpss > ep_mpss) + rc_mpss = ep_mpss; + + /* If Supported greater than limit in module param, limit it */ + if (rc_mpss > (hfi1_pcie_caps & 7)) + rc_mpss = hfi1_pcie_caps & 7; + /* If less than (allowed, supported), bump root payload */ + if (rc_mpss > rc_mps) { + rc_mps = rc_mpss; + pcie_set_mps(parent, 128 << rc_mps); + } + /* If less than (allowed, supported), bump endpoint payload */ + if (rc_mpss > ep_mps) { + ep_mps = rc_mpss; + pcie_set_mps(dd->pcidev, 128 << ep_mps); + } + + /* + * Now the Read Request size. + * No field for max supported, but PCIe spec limits it to 4096, + * which is code '5' (log2(4096) - 7) + */ + max_mrrs = 5; + if (max_mrrs > ((hfi1_pcie_caps >> 4) & 7)) + max_mrrs = (hfi1_pcie_caps >> 4) & 7; + + max_mrrs = 128 << max_mrrs; + rc_mrrs = pcie_get_readrq(parent); + ep_mrrs = pcie_get_readrq(dd->pcidev); + + if (max_mrrs > rc_mrrs) { + rc_mrrs = max_mrrs; + pcie_set_readrq(parent, rc_mrrs); + } + if (max_mrrs > ep_mrrs) { + ep_mrrs = max_mrrs; + pcie_set_readrq(dd->pcidev, ep_mrrs); + } +} + +/* End of PCIe capability tuning */ + +/* + * From here through hfi1_pci_err_handler definition is invoked via + * PCI error infrastructure, registered via pci + */ +static pci_ers_result_t +pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state) +{ + struct hfi1_devdata *dd = pci_get_drvdata(pdev); + pci_ers_result_t ret = PCI_ERS_RESULT_RECOVERED; + + switch (state) { + case pci_channel_io_normal: + dd_dev_info(dd, "State Normal, ignoring\n"); + break; + + case pci_channel_io_frozen: + dd_dev_info(dd, "State Frozen, requesting reset\n"); + pci_disable_device(pdev); + ret = PCI_ERS_RESULT_NEED_RESET; + break; + + case pci_channel_io_perm_failure: + if (dd) { + dd_dev_info(dd, "State Permanent Failure, disabling\n"); + /* no more register accesses! */ + dd->flags &= ~HFI1_PRESENT; + hfi1_disable_after_error(dd); + } + /* else early, or other problem */ + ret = PCI_ERS_RESULT_DISCONNECT; + break; + + default: /* shouldn't happen */ + dd_dev_info(dd, "HFI1 PCI errors detected (state %d)\n", + state); + break; + } + return ret; +} + +static pci_ers_result_t +pci_mmio_enabled(struct pci_dev *pdev) +{ + u64 words = 0U; + struct hfi1_devdata *dd = pci_get_drvdata(pdev); + pci_ers_result_t ret = PCI_ERS_RESULT_RECOVERED; + + if (dd && dd->pport) { + words = read_port_cntr(dd->pport, C_RX_WORDS, CNTR_INVALID_VL); + if (words == ~0ULL) + ret = PCI_ERS_RESULT_NEED_RESET; + dd_dev_info(dd, + "HFI1 mmio_enabled function called, read wordscntr %Lx, returning %d\n", + words, ret); + } + return ret; +} + +static pci_ers_result_t +pci_slot_reset(struct pci_dev *pdev) +{ + struct hfi1_devdata *dd = pci_get_drvdata(pdev); + + dd_dev_info(dd, "HFI1 slot_reset function called, ignored\n"); + return PCI_ERS_RESULT_CAN_RECOVER; +} + +static pci_ers_result_t +pci_link_reset(struct pci_dev *pdev) +{ + struct hfi1_devdata *dd = pci_get_drvdata(pdev); + + dd_dev_info(dd, "HFI1 link_reset function called, ignored\n"); + return PCI_ERS_RESULT_CAN_RECOVER; +} + +static void +pci_resume(struct pci_dev *pdev) +{ + struct hfi1_devdata *dd = pci_get_drvdata(pdev); + + dd_dev_info(dd, "HFI1 resume function called\n"); + pci_cleanup_aer_uncorrect_error_status(pdev); + /* + * Running jobs will fail, since it's asynchronous + * unlike sysfs-requested reset. Better than + * doing nothing. + */ + hfi1_init(dd, 1); /* same as re-init after reset */ +} + +const struct pci_error_handlers hfi1_pci_err_handler = { + .error_detected = pci_error_detected, + .mmio_enabled = pci_mmio_enabled, + .link_reset = pci_link_reset, + .slot_reset = pci_slot_reset, + .resume = pci_resume, +}; + +/*============================================================================*/ +/* PCIe Gen3 support */ + +/* + * This code is separated out because it is expected to be removed in the + * final shipping product. If not, then it will be revisited and items + * will be moved to more standard locations. + */ + +/* ASIC_PCI_SD_HOST_STATUS.FW_DNLD_STS field values */ +#define DL_STATUS_HFI0 0x1 /* hfi0 firmware download complete */ +#define DL_STATUS_HFI1 0x2 /* hfi1 firmware download complete */ +#define DL_STATUS_BOTH 0x3 /* hfi0 and hfi1 firmware download complete */ + +/* ASIC_PCI_SD_HOST_STATUS.FW_DNLD_ERR field values */ +#define DL_ERR_NONE 0x0 /* no error */ +#define DL_ERR_SWAP_PARITY 0x1 /* parity error in SerDes interrupt */ + /* or response data */ +#define DL_ERR_DISABLED 0x2 /* hfi disabled */ +#define DL_ERR_SECURITY 0x3 /* security check failed */ +#define DL_ERR_SBUS 0x4 /* SBus status error */ +#define DL_ERR_XFR_PARITY 0x5 /* parity error during ROM transfer*/ + +/* gasket block secondary bus reset delay */ +#define SBR_DELAY_US 200000 /* 200ms */ + +/* mask for PCIe capability register lnkctl2 target link speed */ +#define LNKCTL2_TARGET_LINK_SPEED_MASK 0xf + +static uint pcie_target = 3; +module_param(pcie_target, uint, S_IRUGO); +MODULE_PARM_DESC(pcie_target, "PCIe target speed (0 skip, 1-3 Gen1-3)"); + +static uint pcie_force; +module_param(pcie_force, uint, S_IRUGO); +MODULE_PARM_DESC(pcie_force, "Force driver to do a PCIe firmware download even if already at target speed"); + +static uint pcie_retry = 5; +module_param(pcie_retry, uint, S_IRUGO); +MODULE_PARM_DESC(pcie_retry, "Driver will try this many times to reach requested speed"); + +#define UNSET_PSET 255 +#define DEFAULT_DISCRETE_PSET 2 /* discrete HFI */ +#define DEFAULT_MCP_PSET 4 /* MCP HFI */ +static uint pcie_pset = UNSET_PSET; +module_param(pcie_pset, uint, S_IRUGO); +MODULE_PARM_DESC(pcie_pset, "PCIe Eq Pset value to use, range is 0-10"); + +/* equalization columns */ +#define PREC 0 +#define ATTN 1 +#define POST 2 + +/* discrete silicon preliminary equalization values */ +static const u8 discrete_preliminary_eq[11][3] = { + /* prec attn post */ + { 0x00, 0x00, 0x12 }, /* p0 */ + { 0x00, 0x00, 0x0c }, /* p1 */ + { 0x00, 0x00, 0x0f }, /* p2 */ + { 0x00, 0x00, 0x09 }, /* p3 */ + { 0x00, 0x00, 0x00 }, /* p4 */ + { 0x06, 0x00, 0x00 }, /* p5 */ + { 0x09, 0x00, 0x00 }, /* p6 */ + { 0x06, 0x00, 0x0f }, /* p7 */ + { 0x09, 0x00, 0x09 }, /* p8 */ + { 0x0c, 0x00, 0x00 }, /* p9 */ + { 0x00, 0x00, 0x18 }, /* p10 */ +}; + +/* integrated silicon preliminary equalization values */ +static const u8 integrated_preliminary_eq[11][3] = { + /* prec attn post */ + { 0x00, 0x1e, 0x07 }, /* p0 */ + { 0x00, 0x1e, 0x05 }, /* p1 */ + { 0x00, 0x1e, 0x06 }, /* p2 */ + { 0x00, 0x1e, 0x04 }, /* p3 */ + { 0x00, 0x1e, 0x00 }, /* p4 */ + { 0x03, 0x1e, 0x00 }, /* p5 */ + { 0x04, 0x1e, 0x00 }, /* p6 */ + { 0x03, 0x1e, 0x06 }, /* p7 */ + { 0x03, 0x1e, 0x04 }, /* p8 */ + { 0x05, 0x1e, 0x00 }, /* p9 */ + { 0x00, 0x1e, 0x0a }, /* p10 */ +}; + +/* helper to format the value to write to hardware */ +#define eq_value(pre, curr, post) \ + ((((u32)(pre)) << \ + PCIE_CFG_REG_PL102_GEN3_EQ_PRE_CURSOR_PSET_SHIFT) \ + | (((u32)(curr)) << PCIE_CFG_REG_PL102_GEN3_EQ_CURSOR_PSET_SHIFT) \ + | (((u32)(post)) << \ + PCIE_CFG_REG_PL102_GEN3_EQ_POST_CURSOR_PSET_SHIFT)) + +/* + * Load the given EQ preset table into the PCIe hardware. + */ +static int load_eq_table(struct hfi1_devdata *dd, const u8 eq[11][3], u8 fs, + u8 div) +{ + struct pci_dev *pdev = dd->pcidev; + u32 hit_error = 0; + u32 violation; + u32 i; + u8 c_minus1, c0, c_plus1; + + for (i = 0; i < 11; i++) { + /* set index */ + pci_write_config_dword(pdev, PCIE_CFG_REG_PL103, i); + /* write the value */ + c_minus1 = eq[i][PREC] / div; + c0 = fs - (eq[i][PREC] / div) - (eq[i][POST] / div); + c_plus1 = eq[i][POST] / div; + pci_write_config_dword(pdev, PCIE_CFG_REG_PL102, + eq_value(c_minus1, c0, c_plus1)); + /* check if these coefficients violate EQ rules */ + pci_read_config_dword(dd->pcidev, PCIE_CFG_REG_PL105, + &violation); + if (violation + & PCIE_CFG_REG_PL105_GEN3_EQ_VIOLATE_COEF_RULES_SMASK){ + if (hit_error == 0) { + dd_dev_err(dd, + "Gen3 EQ Table Coefficient rule violations\n"); + dd_dev_err(dd, " prec attn post\n"); + } + dd_dev_err(dd, " p%02d: %02x %02x %02x\n", + i, (u32)eq[i][0], (u32)eq[i][1], + (u32)eq[i][2]); + dd_dev_err(dd, " %02x %02x %02x\n", + (u32)c_minus1, (u32)c0, (u32)c_plus1); + hit_error = 1; + } + } + if (hit_error) + return -EINVAL; + return 0; +} + +/* + * Steps to be done after the PCIe firmware is downloaded and + * before the SBR for the Pcie Gen3. + * The SBus resource is already being held. + */ +static void pcie_post_steps(struct hfi1_devdata *dd) +{ + int i; + + set_sbus_fast_mode(dd); + /* + * Write to the PCIe PCSes to set the G3_LOCKED_NEXT bits to 1. + * This avoids a spurious framing error that can otherwise be + * generated by the MAC layer. + * + * Use individual addresses since no broadcast is set up. + */ + for (i = 0; i < NUM_PCIE_SERDES; i++) { + sbus_request(dd, pcie_pcs_addrs[dd->hfi1_id][i], + 0x03, WRITE_SBUS_RECEIVER, 0x00022132); + } + + clear_sbus_fast_mode(dd); +} + +/* + * Trigger a secondary bus reset (SBR) on ourselves using our parent. + * + * Based on pci_parent_bus_reset() which is not exported by the + * kernel core. + */ +static int trigger_sbr(struct hfi1_devdata *dd) +{ + struct pci_dev *dev = dd->pcidev; + struct pci_dev *pdev; + + /* need a parent */ + if (!dev->bus->self) { + dd_dev_err(dd, "%s: no parent device\n", __func__); + return -ENOTTY; + } + + /* should not be anyone else on the bus */ + list_for_each_entry(pdev, &dev->bus->devices, bus_list) + if (pdev != dev) { + dd_dev_err(dd, + "%s: another device is on the same bus\n", + __func__); + return -ENOTTY; + } + + /* + * A secondary bus reset (SBR) issues a hot reset to our device. + * The following routine does a 1s wait after the reset is dropped + * per PCI Trhfa (recovery time). PCIe 3.0 section 6.6.1 - + * Conventional Reset, paragraph 3, line 35 also says that a 1s + * delay after a reset is required. Per spec requirements, + * the link is either working or not after that point. + */ + pci_reset_bridge_secondary_bus(dev->bus->self); + + return 0; +} + +/* + * Write the given gasket interrupt register. + */ +static void write_gasket_interrupt(struct hfi1_devdata *dd, int index, + u16 code, u16 data) +{ + write_csr(dd, ASIC_PCIE_SD_INTRPT_LIST + (index * 8), + (((u64)code << ASIC_PCIE_SD_INTRPT_LIST_INTRPT_CODE_SHIFT) | + ((u64)data << ASIC_PCIE_SD_INTRPT_LIST_INTRPT_DATA_SHIFT))); +} + +/* + * Tell the gasket logic how to react to the reset. + */ +static void arm_gasket_logic(struct hfi1_devdata *dd) +{ + u64 reg; + + reg = (((u64)1 << dd->hfi1_id) << + ASIC_PCIE_SD_HOST_CMD_INTRPT_CMD_SHIFT) | + ((u64)pcie_serdes_broadcast[dd->hfi1_id] << + ASIC_PCIE_SD_HOST_CMD_SBUS_RCVR_ADDR_SHIFT | + ASIC_PCIE_SD_HOST_CMD_SBR_MODE_SMASK | + ((u64)SBR_DELAY_US & ASIC_PCIE_SD_HOST_CMD_TIMER_MASK) << + ASIC_PCIE_SD_HOST_CMD_TIMER_SHIFT); + write_csr(dd, ASIC_PCIE_SD_HOST_CMD, reg); + /* read back to push the write */ + read_csr(dd, ASIC_PCIE_SD_HOST_CMD); +} + +/* + * CCE_PCIE_CTRL long name helpers + * We redefine these shorter macros to use in the code while leaving + * chip_registers.h to be autogenerated from the hardware spec. + */ +#define LANE_BUNDLE_MASK CCE_PCIE_CTRL_PCIE_LANE_BUNDLE_MASK +#define LANE_BUNDLE_SHIFT CCE_PCIE_CTRL_PCIE_LANE_BUNDLE_SHIFT +#define LANE_DELAY_MASK CCE_PCIE_CTRL_PCIE_LANE_DELAY_MASK +#define LANE_DELAY_SHIFT CCE_PCIE_CTRL_PCIE_LANE_DELAY_SHIFT +#define MARGIN_OVERWRITE_ENABLE_SHIFT CCE_PCIE_CTRL_XMT_MARGIN_OVERWRITE_ENABLE_SHIFT +#define MARGIN_SHIFT CCE_PCIE_CTRL_XMT_MARGIN_SHIFT +#define MARGIN_G1_G2_OVERWRITE_MASK CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_OVERWRITE_ENABLE_MASK +#define MARGIN_G1_G2_OVERWRITE_SHIFT CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_OVERWRITE_ENABLE_SHIFT +#define MARGIN_GEN1_GEN2_MASK CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_MASK +#define MARGIN_GEN1_GEN2_SHIFT CCE_PCIE_CTRL_XMT_MARGIN_GEN1_GEN2_SHIFT + + /* + * Write xmt_margin for full-swing (WFR-B) or half-swing (WFR-C). + */ +static void write_xmt_margin(struct hfi1_devdata *dd, const char *fname) +{ + u64 pcie_ctrl; + u64 xmt_margin; + u64 xmt_margin_oe; + u64 lane_delay; + u64 lane_bundle; + + pcie_ctrl = read_csr(dd, CCE_PCIE_CTRL); + + /* + * For Discrete, use full-swing. + * - PCIe TX defaults to full-swing. + * Leave this register as default. + * For Integrated, use half-swing + * - Copy xmt_margin and xmt_margin_oe + * from Gen1/Gen2 to Gen3. + */ + if (dd->pcidev->device == PCI_DEVICE_ID_INTEL1) { /* integrated */ + /* extract initial fields */ + xmt_margin = (pcie_ctrl >> MARGIN_GEN1_GEN2_SHIFT) + & MARGIN_GEN1_GEN2_MASK; + xmt_margin_oe = (pcie_ctrl >> MARGIN_G1_G2_OVERWRITE_SHIFT) + & MARGIN_G1_G2_OVERWRITE_MASK; + lane_delay = (pcie_ctrl >> LANE_DELAY_SHIFT) & LANE_DELAY_MASK; + lane_bundle = (pcie_ctrl >> LANE_BUNDLE_SHIFT) + & LANE_BUNDLE_MASK; + + /* + * For A0, EFUSE values are not set. Override with the + * correct values. + */ + if (is_ax(dd)) { + /* + * xmt_margin and OverwiteEnabel should be the + * same for Gen1/Gen2 and Gen3 + */ + xmt_margin = 0x5; + xmt_margin_oe = 0x1; + lane_delay = 0xF; /* Delay 240ns. */ + lane_bundle = 0x0; /* Set to 1 lane. */ + } + + /* overwrite existing values */ + pcie_ctrl = (xmt_margin << MARGIN_GEN1_GEN2_SHIFT) + | (xmt_margin_oe << MARGIN_G1_G2_OVERWRITE_SHIFT) + | (xmt_margin << MARGIN_SHIFT) + | (xmt_margin_oe << MARGIN_OVERWRITE_ENABLE_SHIFT) + | (lane_delay << LANE_DELAY_SHIFT) + | (lane_bundle << LANE_BUNDLE_SHIFT); + + write_csr(dd, CCE_PCIE_CTRL, pcie_ctrl); + } + + dd_dev_dbg(dd, "%s: program XMT margin, CcePcieCtrl 0x%llx\n", + fname, pcie_ctrl); +} + +/* + * Do all the steps needed to transition the PCIe link to Gen3 speed. + */ +int do_pcie_gen3_transition(struct hfi1_devdata *dd) +{ + struct pci_dev *parent = dd->pcidev->bus->self; + u64 fw_ctrl; + u64 reg, therm; + u32 reg32, fs, lf; + u32 status, err; + int ret; + int do_retry, retry_count = 0; + uint default_pset; + u16 target_vector, target_speed; + u16 lnkctl2, vendor; + u8 div; + const u8 (*eq)[3]; + int return_error = 0; + + /* PCIe Gen3 is for the ASIC only */ + if (dd->icode != ICODE_RTL_SILICON) + return 0; + + if (pcie_target == 1) { /* target Gen1 */ + target_vector = GEN1_SPEED_VECTOR; + target_speed = 2500; + } else if (pcie_target == 2) { /* target Gen2 */ + target_vector = GEN2_SPEED_VECTOR; + target_speed = 5000; + } else if (pcie_target == 3) { /* target Gen3 */ + target_vector = GEN3_SPEED_VECTOR; + target_speed = 8000; + } else { + /* off or invalid target - skip */ + dd_dev_info(dd, "%s: Skipping PCIe transition\n", __func__); + return 0; + } + + /* if already at target speed, done (unless forced) */ + if (dd->lbus_speed == target_speed) { + dd_dev_info(dd, "%s: PCIe already at gen%d, %s\n", __func__, + pcie_target, + pcie_force ? "re-doing anyway" : "skipping"); + if (!pcie_force) + return 0; + } + + /* + * The driver cannot do the transition if it has no access to the + * upstream component + */ + if (!parent) { + dd_dev_info(dd, "%s: No upstream, Can't do gen3 transition\n", + __func__); + return 0; + } + + /* + * Do the Gen3 transition. Steps are those of the PCIe Gen3 + * recipe. + */ + + /* step 1: pcie link working in gen1/gen2 */ + + /* step 2: if either side is not capable of Gen3, done */ + if (pcie_target == 3 && !dd->link_gen3_capable) { + dd_dev_err(dd, "The PCIe link is not Gen3 capable\n"); + ret = -ENOSYS; + goto done_no_mutex; + } + + /* hold the SBus resource across the firmware download and SBR */ + ret = acquire_chip_resource(dd, CR_SBUS, SBUS_TIMEOUT); + if (ret) { + dd_dev_err(dd, "%s: unable to acquire SBus resource\n", + __func__); + return ret; + } + + /* make sure thermal polling is not causing interrupts */ + therm = read_csr(dd, ASIC_CFG_THERM_POLL_EN); + if (therm) { + write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x0); + msleep(100); + dd_dev_info(dd, "%s: Disabled therm polling\n", + __func__); + } + +retry: + /* the SBus download will reset the spico for thermal */ + + /* step 3: download SBus Master firmware */ + /* step 4: download PCIe Gen3 SerDes firmware */ + dd_dev_info(dd, "%s: downloading firmware\n", __func__); + ret = load_pcie_firmware(dd); + if (ret) { + /* do not proceed if the firmware cannot be downloaded */ + return_error = 1; + goto done; + } + + /* step 5: set up device parameter settings */ + dd_dev_info(dd, "%s: setting PCIe registers\n", __func__); + + /* + * PcieCfgSpcie1 - Link Control 3 + * Leave at reset value. No need to set PerfEq - link equalization + * will be performed automatically after the SBR when the target + * speed is 8GT/s. + */ + + /* clear all 16 per-lane error bits (PCIe: Lane Error Status) */ + pci_write_config_dword(dd->pcidev, PCIE_CFG_SPCIE2, 0xffff); + + /* step 5a: Set Synopsys Port Logic registers */ + + /* + * PcieCfgRegPl2 - Port Force Link + * + * Set the low power field to 0x10 to avoid unnecessary power + * management messages. All other fields are zero. + */ + reg32 = 0x10ul << PCIE_CFG_REG_PL2_LOW_PWR_ENT_CNT_SHIFT; + pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL2, reg32); + + /* + * PcieCfgRegPl100 - Gen3 Control + * + * turn off PcieCfgRegPl100.Gen3ZRxDcNonCompl + * turn on PcieCfgRegPl100.EqEieosCnt + * Everything else zero. + */ + reg32 = PCIE_CFG_REG_PL100_EQ_EIEOS_CNT_SMASK; + pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL100, reg32); + + /* + * PcieCfgRegPl101 - Gen3 EQ FS and LF + * PcieCfgRegPl102 - Gen3 EQ Presets to Coefficients Mapping + * PcieCfgRegPl103 - Gen3 EQ Preset Index + * PcieCfgRegPl105 - Gen3 EQ Status + * + * Give initial EQ settings. + */ + if (dd->pcidev->device == PCI_DEVICE_ID_INTEL0) { /* discrete */ + /* 1000mV, FS=24, LF = 8 */ + fs = 24; + lf = 8; + div = 3; + eq = discrete_preliminary_eq; + default_pset = DEFAULT_DISCRETE_PSET; + } else { + /* 400mV, FS=29, LF = 9 */ + fs = 29; + lf = 9; + div = 1; + eq = integrated_preliminary_eq; + default_pset = DEFAULT_MCP_PSET; + } + pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL101, + (fs << + PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_FS_SHIFT) | + (lf << + PCIE_CFG_REG_PL101_GEN3_EQ_LOCAL_LF_SHIFT)); + ret = load_eq_table(dd, eq, fs, div); + if (ret) + goto done; + + /* + * PcieCfgRegPl106 - Gen3 EQ Control + * + * Set Gen3EqPsetReqVec, leave other fields 0. + */ + if (pcie_pset == UNSET_PSET) + pcie_pset = default_pset; + if (pcie_pset > 10) { /* valid range is 0-10, inclusive */ + dd_dev_err(dd, "%s: Invalid Eq Pset %u, setting to %d\n", + __func__, pcie_pset, default_pset); + pcie_pset = default_pset; + } + dd_dev_info(dd, "%s: using EQ Pset %u\n", __func__, pcie_pset); + pci_write_config_dword(dd->pcidev, PCIE_CFG_REG_PL106, + ((1 << pcie_pset) << + PCIE_CFG_REG_PL106_GEN3_EQ_PSET_REQ_VEC_SHIFT) | + PCIE_CFG_REG_PL106_GEN3_EQ_EVAL2MS_DISABLE_SMASK | + PCIE_CFG_REG_PL106_GEN3_EQ_PHASE23_EXIT_MODE_SMASK); + + /* + * step 5b: Do post firmware download steps via SBus + */ + dd_dev_info(dd, "%s: doing pcie post steps\n", __func__); + pcie_post_steps(dd); + + /* + * step 5c: Program gasket interrupts + */ + /* set the Rx Bit Rate to REFCLK ratio */ + write_gasket_interrupt(dd, 0, 0x0006, 0x0050); + /* disable pCal for PCIe Gen3 RX equalization */ + write_gasket_interrupt(dd, 1, 0x0026, 0x5b01); + /* + * Enable iCal for PCIe Gen3 RX equalization, and set which + * evaluation of RX_EQ_EVAL will launch the iCal procedure. + */ + write_gasket_interrupt(dd, 2, 0x0026, 0x5202); + /* terminate list */ + write_gasket_interrupt(dd, 3, 0x0000, 0x0000); + + /* + * step 5d: program XMT margin + */ + write_xmt_margin(dd, __func__); + + /* + * step 5e: disable active state power management (ASPM). It + * will be enabled if required later + */ + dd_dev_info(dd, "%s: clearing ASPM\n", __func__); + aspm_hw_disable_l1(dd); + + /* + * step 5f: clear DirectSpeedChange + * PcieCfgRegPl67.DirectSpeedChange must be zero to prevent the + * change in the speed target from starting before we are ready. + * This field defaults to 0 and we are not changing it, so nothing + * needs to be done. + */ + + /* step 5g: Set target link speed */ + /* + * Set target link speed to be target on both device and parent. + * On setting the parent: Some system BIOSs "helpfully" set the + * parent target speed to Gen2 to match the ASIC's initial speed. + * We can set the target Gen3 because we have already checked + * that it is Gen3 capable earlier. + */ + dd_dev_info(dd, "%s: setting parent target link speed\n", __func__); + pcie_capability_read_word(parent, PCI_EXP_LNKCTL2, &lnkctl2); + dd_dev_info(dd, "%s: ..old link control2: 0x%x\n", __func__, + (u32)lnkctl2); + /* only write to parent if target is not as high as ours */ + if ((lnkctl2 & LNKCTL2_TARGET_LINK_SPEED_MASK) < target_vector) { + lnkctl2 &= ~LNKCTL2_TARGET_LINK_SPEED_MASK; + lnkctl2 |= target_vector; + dd_dev_info(dd, "%s: ..new link control2: 0x%x\n", __func__, + (u32)lnkctl2); + pcie_capability_write_word(parent, PCI_EXP_LNKCTL2, lnkctl2); + } else { + dd_dev_info(dd, "%s: ..target speed is OK\n", __func__); + } + + dd_dev_info(dd, "%s: setting target link speed\n", __func__); + pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKCTL2, &lnkctl2); + dd_dev_info(dd, "%s: ..old link control2: 0x%x\n", __func__, + (u32)lnkctl2); + lnkctl2 &= ~LNKCTL2_TARGET_LINK_SPEED_MASK; + lnkctl2 |= target_vector; + dd_dev_info(dd, "%s: ..new link control2: 0x%x\n", __func__, + (u32)lnkctl2); + pcie_capability_write_word(dd->pcidev, PCI_EXP_LNKCTL2, lnkctl2); + + /* step 5h: arm gasket logic */ + /* hold DC in reset across the SBR */ + write_csr(dd, CCE_DC_CTRL, CCE_DC_CTRL_DC_RESET_SMASK); + (void)read_csr(dd, CCE_DC_CTRL); /* DC reset hold */ + /* save firmware control across the SBR */ + fw_ctrl = read_csr(dd, MISC_CFG_FW_CTRL); + + dd_dev_info(dd, "%s: arming gasket logic\n", __func__); + arm_gasket_logic(dd); + + /* + * step 6: quiesce PCIe link + * The chip has already been reset, so there will be no traffic + * from the chip. Linux has no easy way to enforce that it will + * not try to access the device, so we just need to hope it doesn't + * do it while we are doing the reset. + */ + + /* + * step 7: initiate the secondary bus reset (SBR) + * step 8: hardware brings the links back up + * step 9: wait for link speed transition to be complete + */ + dd_dev_info(dd, "%s: calling trigger_sbr\n", __func__); + ret = trigger_sbr(dd); + if (ret) + goto done; + + /* step 10: decide what to do next */ + + /* check if we can read PCI space */ + ret = pci_read_config_word(dd->pcidev, PCI_VENDOR_ID, &vendor); + if (ret) { + dd_dev_info(dd, + "%s: read of VendorID failed after SBR, err %d\n", + __func__, ret); + return_error = 1; + goto done; + } + if (vendor == 0xffff) { + dd_dev_info(dd, "%s: VendorID is all 1s after SBR\n", __func__); + return_error = 1; + ret = -EIO; + goto done; + } + + /* restore PCI space registers we know were reset */ + dd_dev_info(dd, "%s: calling restore_pci_variables\n", __func__); + restore_pci_variables(dd); + /* restore firmware control */ + write_csr(dd, MISC_CFG_FW_CTRL, fw_ctrl); + + /* + * Check the gasket block status. + * + * This is the first CSR read after the SBR. If the read returns + * all 1s (fails), the link did not make it back. + * + * Once we're sure we can read and write, clear the DC reset after + * the SBR. Then check for any per-lane errors. Then look over + * the status. + */ + reg = read_csr(dd, ASIC_PCIE_SD_HOST_STATUS); + dd_dev_info(dd, "%s: gasket block status: 0x%llx\n", __func__, reg); + if (reg == ~0ull) { /* PCIe read failed/timeout */ + dd_dev_err(dd, "SBR failed - unable to read from device\n"); + return_error = 1; + ret = -ENOSYS; + goto done; + } + + /* clear the DC reset */ + write_csr(dd, CCE_DC_CTRL, 0); + + /* Set the LED off */ + setextled(dd, 0); + + /* check for any per-lane errors */ + pci_read_config_dword(dd->pcidev, PCIE_CFG_SPCIE2, ®32); + dd_dev_info(dd, "%s: per-lane errors: 0x%x\n", __func__, reg32); + + /* extract status, look for our HFI */ + status = (reg >> ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_SHIFT) + & ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_STS_MASK; + if ((status & (1 << dd->hfi1_id)) == 0) { + dd_dev_err(dd, + "%s: gasket status 0x%x, expecting 0x%x\n", + __func__, status, 1 << dd->hfi1_id); + ret = -EIO; + goto done; + } + + /* extract error */ + err = (reg >> ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_SHIFT) + & ASIC_PCIE_SD_HOST_STATUS_FW_DNLD_ERR_MASK; + if (err) { + dd_dev_err(dd, "%s: gasket error %d\n", __func__, err); + ret = -EIO; + goto done; + } + + /* update our link information cache */ + update_lbus_info(dd); + dd_dev_info(dd, "%s: new speed and width: %s\n", __func__, + dd->lbus_info); + + if (dd->lbus_speed != target_speed) { /* not target */ + /* maybe retry */ + do_retry = retry_count < pcie_retry; + dd_dev_err(dd, "PCIe link speed did not switch to Gen%d%s\n", + pcie_target, do_retry ? ", retrying" : ""); + retry_count++; + if (do_retry) { + msleep(100); /* allow time to settle */ + goto retry; + } + ret = -EIO; + } + +done: + if (therm) { + write_csr(dd, ASIC_CFG_THERM_POLL_EN, 0x1); + msleep(100); + dd_dev_info(dd, "%s: Re-enable therm polling\n", + __func__); + } + release_chip_resource(dd, CR_SBUS); +done_no_mutex: + /* return no error if it is OK to be at current speed */ + if (ret && !return_error) { + dd_dev_err(dd, "Proceeding at current speed PCIe speed\n"); + ret = 0; + } + + dd_dev_info(dd, "%s: done\n", __func__); + return ret; +} diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c new file mode 100644 index 000000000..d4022450b --- /dev/null +++ b/drivers/infiniband/hw/hfi1/pio.c @@ -0,0 +1,2092 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <linux/delay.h> +#include "hfi.h" +#include "qp.h" +#include "trace.h" + +#define SC_CTXT_PACKET_EGRESS_TIMEOUT 350 /* in chip cycles */ + +#define SC(name) SEND_CTXT_##name +/* + * Send Context functions + */ +static void sc_wait_for_packet_egress(struct send_context *sc, int pause); + +/* + * Set the CM reset bit and wait for it to clear. Use the provided + * sendctrl register. This routine has no locking. + */ +void __cm_reset(struct hfi1_devdata *dd, u64 sendctrl) +{ + write_csr(dd, SEND_CTRL, sendctrl | SEND_CTRL_CM_RESET_SMASK); + while (1) { + udelay(1); + sendctrl = read_csr(dd, SEND_CTRL); + if ((sendctrl & SEND_CTRL_CM_RESET_SMASK) == 0) + break; + } +} + +/* defined in header release 48 and higher */ +#ifndef SEND_CTRL_UNSUPPORTED_VL_SHIFT +#define SEND_CTRL_UNSUPPORTED_VL_SHIFT 3 +#define SEND_CTRL_UNSUPPORTED_VL_MASK 0xffull +#define SEND_CTRL_UNSUPPORTED_VL_SMASK (SEND_CTRL_UNSUPPORTED_VL_MASK \ + << SEND_CTRL_UNSUPPORTED_VL_SHIFT) +#endif + +/* global control of PIO send */ +void pio_send_control(struct hfi1_devdata *dd, int op) +{ + u64 reg, mask; + unsigned long flags; + int write = 1; /* write sendctrl back */ + int flush = 0; /* re-read sendctrl to make sure it is flushed */ + + spin_lock_irqsave(&dd->sendctrl_lock, flags); + + reg = read_csr(dd, SEND_CTRL); + switch (op) { + case PSC_GLOBAL_ENABLE: + reg |= SEND_CTRL_SEND_ENABLE_SMASK; + /* Fall through */ + case PSC_DATA_VL_ENABLE: + /* Disallow sending on VLs not enabled */ + mask = (((~0ull) << num_vls) & SEND_CTRL_UNSUPPORTED_VL_MASK) << + SEND_CTRL_UNSUPPORTED_VL_SHIFT; + reg = (reg & ~SEND_CTRL_UNSUPPORTED_VL_SMASK) | mask; + break; + case PSC_GLOBAL_DISABLE: + reg &= ~SEND_CTRL_SEND_ENABLE_SMASK; + break; + case PSC_GLOBAL_VLARB_ENABLE: + reg |= SEND_CTRL_VL_ARBITER_ENABLE_SMASK; + break; + case PSC_GLOBAL_VLARB_DISABLE: + reg &= ~SEND_CTRL_VL_ARBITER_ENABLE_SMASK; + break; + case PSC_CM_RESET: + __cm_reset(dd, reg); + write = 0; /* CSR already written (and flushed) */ + break; + case PSC_DATA_VL_DISABLE: + reg |= SEND_CTRL_UNSUPPORTED_VL_SMASK; + flush = 1; + break; + default: + dd_dev_err(dd, "%s: invalid control %d\n", __func__, op); + break; + } + + if (write) { + write_csr(dd, SEND_CTRL, reg); + if (flush) + (void)read_csr(dd, SEND_CTRL); /* flush write */ + } + + spin_unlock_irqrestore(&dd->sendctrl_lock, flags); +} + +/* number of send context memory pools */ +#define NUM_SC_POOLS 2 + +/* Send Context Size (SCS) wildcards */ +#define SCS_POOL_0 -1 +#define SCS_POOL_1 -2 + +/* Send Context Count (SCC) wildcards */ +#define SCC_PER_VL -1 +#define SCC_PER_CPU -2 +#define SCC_PER_KRCVQ -3 + +/* Send Context Size (SCS) constants */ +#define SCS_ACK_CREDITS 32 +#define SCS_VL15_CREDITS 102 /* 3 pkts of 2048B data + 128B header */ + +#define PIO_THRESHOLD_CEILING 4096 + +#define PIO_WAIT_BATCH_SIZE 5 + +/* default send context sizes */ +static struct sc_config_sizes sc_config_sizes[SC_MAX] = { + [SC_KERNEL] = { .size = SCS_POOL_0, /* even divide, pool 0 */ + .count = SCC_PER_VL }, /* one per NUMA */ + [SC_ACK] = { .size = SCS_ACK_CREDITS, + .count = SCC_PER_KRCVQ }, + [SC_USER] = { .size = SCS_POOL_0, /* even divide, pool 0 */ + .count = SCC_PER_CPU }, /* one per CPU */ + [SC_VL15] = { .size = SCS_VL15_CREDITS, + .count = 1 }, + +}; + +/* send context memory pool configuration */ +struct mem_pool_config { + int centipercent; /* % of memory, in 100ths of 1% */ + int absolute_blocks; /* absolute block count */ +}; + +/* default memory pool configuration: 100% in pool 0 */ +static struct mem_pool_config sc_mem_pool_config[NUM_SC_POOLS] = { + /* centi%, abs blocks */ + { 10000, -1 }, /* pool 0 */ + { 0, -1 }, /* pool 1 */ +}; + +/* memory pool information, used when calculating final sizes */ +struct mem_pool_info { + int centipercent; /* + * 100th of 1% of memory to use, -1 if blocks + * already set + */ + int count; /* count of contexts in the pool */ + int blocks; /* block size of the pool */ + int size; /* context size, in blocks */ +}; + +/* + * Convert a pool wildcard to a valid pool index. The wildcards + * start at -1 and increase negatively. Map them as: + * -1 => 0 + * -2 => 1 + * etc. + * + * Return -1 on non-wildcard input, otherwise convert to a pool number. + */ +static int wildcard_to_pool(int wc) +{ + if (wc >= 0) + return -1; /* non-wildcard */ + return -wc - 1; +} + +static const char *sc_type_names[SC_MAX] = { + "kernel", + "ack", + "user", + "vl15" +}; + +static const char *sc_type_name(int index) +{ + if (index < 0 || index >= SC_MAX) + return "unknown"; + return sc_type_names[index]; +} + +/* + * Read the send context memory pool configuration and send context + * size configuration. Replace any wildcards and come up with final + * counts and sizes for the send context types. + */ +int init_sc_pools_and_sizes(struct hfi1_devdata *dd) +{ + struct mem_pool_info mem_pool_info[NUM_SC_POOLS] = { { 0 } }; + int total_blocks = (dd->chip_pio_mem_size / PIO_BLOCK_SIZE) - 1; + int total_contexts = 0; + int fixed_blocks; + int pool_blocks; + int used_blocks; + int cp_total; /* centipercent total */ + int ab_total; /* absolute block total */ + int extra; + int i; + + /* + * When SDMA is enabled, kernel context pio packet size is capped by + * "piothreshold". Reduce pio buffer allocation for kernel context by + * setting it to a fixed size. The allocation allows 3-deep buffering + * of the largest pio packets plus up to 128 bytes header, sufficient + * to maintain verbs performance. + * + * When SDMA is disabled, keep the default pooling allocation. + */ + if (HFI1_CAP_IS_KSET(SDMA)) { + u16 max_pkt_size = (piothreshold < PIO_THRESHOLD_CEILING) ? + piothreshold : PIO_THRESHOLD_CEILING; + sc_config_sizes[SC_KERNEL].size = + 3 * (max_pkt_size + 128) / PIO_BLOCK_SIZE; + } + + /* + * Step 0: + * - copy the centipercents/absolute sizes from the pool config + * - sanity check these values + * - add up centipercents, then later check for full value + * - add up absolute blocks, then later check for over-commit + */ + cp_total = 0; + ab_total = 0; + for (i = 0; i < NUM_SC_POOLS; i++) { + int cp = sc_mem_pool_config[i].centipercent; + int ab = sc_mem_pool_config[i].absolute_blocks; + + /* + * A negative value is "unused" or "invalid". Both *can* + * be valid, but centipercent wins, so check that first + */ + if (cp >= 0) { /* centipercent valid */ + cp_total += cp; + } else if (ab >= 0) { /* absolute blocks valid */ + ab_total += ab; + } else { /* neither valid */ + dd_dev_err( + dd, + "Send context memory pool %d: both the block count and centipercent are invalid\n", + i); + return -EINVAL; + } + + mem_pool_info[i].centipercent = cp; + mem_pool_info[i].blocks = ab; + } + + /* do not use both % and absolute blocks for different pools */ + if (cp_total != 0 && ab_total != 0) { + dd_dev_err( + dd, + "All send context memory pools must be described as either centipercent or blocks, no mixing between pools\n"); + return -EINVAL; + } + + /* if any percentages are present, they must add up to 100% x 100 */ + if (cp_total != 0 && cp_total != 10000) { + dd_dev_err( + dd, + "Send context memory pool centipercent is %d, expecting 10000\n", + cp_total); + return -EINVAL; + } + + /* the absolute pool total cannot be more than the mem total */ + if (ab_total > total_blocks) { + dd_dev_err( + dd, + "Send context memory pool absolute block count %d is larger than the memory size %d\n", + ab_total, total_blocks); + return -EINVAL; + } + + /* + * Step 2: + * - copy from the context size config + * - replace context type wildcard counts with real values + * - add up non-memory pool block sizes + * - add up memory pool user counts + */ + fixed_blocks = 0; + for (i = 0; i < SC_MAX; i++) { + int count = sc_config_sizes[i].count; + int size = sc_config_sizes[i].size; + int pool; + + /* + * Sanity check count: Either a positive value or + * one of the expected wildcards is valid. The positive + * value is checked later when we compare against total + * memory available. + */ + if (i == SC_ACK) { + count = dd->n_krcv_queues; + } else if (i == SC_KERNEL) { + count = INIT_SC_PER_VL * num_vls; + } else if (count == SCC_PER_CPU) { + count = dd->num_rcv_contexts - dd->n_krcv_queues; + } else if (count < 0) { + dd_dev_err( + dd, + "%s send context invalid count wildcard %d\n", + sc_type_name(i), count); + return -EINVAL; + } + if (total_contexts + count > dd->chip_send_contexts) + count = dd->chip_send_contexts - total_contexts; + + total_contexts += count; + + /* + * Sanity check pool: The conversion will return a pool + * number or -1 if a fixed (non-negative) value. The fixed + * value is checked later when we compare against + * total memory available. + */ + pool = wildcard_to_pool(size); + if (pool == -1) { /* non-wildcard */ + fixed_blocks += size * count; + } else if (pool < NUM_SC_POOLS) { /* valid wildcard */ + mem_pool_info[pool].count += count; + } else { /* invalid wildcard */ + dd_dev_err( + dd, + "%s send context invalid pool wildcard %d\n", + sc_type_name(i), size); + return -EINVAL; + } + + dd->sc_sizes[i].count = count; + dd->sc_sizes[i].size = size; + } + if (fixed_blocks > total_blocks) { + dd_dev_err( + dd, + "Send context fixed block count, %u, larger than total block count %u\n", + fixed_blocks, total_blocks); + return -EINVAL; + } + + /* step 3: calculate the blocks in the pools, and pool context sizes */ + pool_blocks = total_blocks - fixed_blocks; + if (ab_total > pool_blocks) { + dd_dev_err( + dd, + "Send context fixed pool sizes, %u, larger than pool block count %u\n", + ab_total, pool_blocks); + return -EINVAL; + } + /* subtract off the fixed pool blocks */ + pool_blocks -= ab_total; + + for (i = 0; i < NUM_SC_POOLS; i++) { + struct mem_pool_info *pi = &mem_pool_info[i]; + + /* % beats absolute blocks */ + if (pi->centipercent >= 0) + pi->blocks = (pool_blocks * pi->centipercent) / 10000; + + if (pi->blocks == 0 && pi->count != 0) { + dd_dev_err( + dd, + "Send context memory pool %d has %u contexts, but no blocks\n", + i, pi->count); + return -EINVAL; + } + if (pi->count == 0) { + /* warn about wasted blocks */ + if (pi->blocks != 0) + dd_dev_err( + dd, + "Send context memory pool %d has %u blocks, but zero contexts\n", + i, pi->blocks); + pi->size = 0; + } else { + pi->size = pi->blocks / pi->count; + } + } + + /* step 4: fill in the context type sizes from the pool sizes */ + used_blocks = 0; + for (i = 0; i < SC_MAX; i++) { + if (dd->sc_sizes[i].size < 0) { + unsigned pool = wildcard_to_pool(dd->sc_sizes[i].size); + + WARN_ON_ONCE(pool >= NUM_SC_POOLS); + dd->sc_sizes[i].size = mem_pool_info[pool].size; + } + /* make sure we are not larger than what is allowed by the HW */ +#define PIO_MAX_BLOCKS 1024 + if (dd->sc_sizes[i].size > PIO_MAX_BLOCKS) + dd->sc_sizes[i].size = PIO_MAX_BLOCKS; + + /* calculate our total usage */ + used_blocks += dd->sc_sizes[i].size * dd->sc_sizes[i].count; + } + extra = total_blocks - used_blocks; + if (extra != 0) + dd_dev_info(dd, "unused send context blocks: %d\n", extra); + + return total_contexts; +} + +int init_send_contexts(struct hfi1_devdata *dd) +{ + u16 base; + int ret, i, j, context; + + ret = init_credit_return(dd); + if (ret) + return ret; + + dd->hw_to_sw = kmalloc_array(TXE_NUM_CONTEXTS, sizeof(u8), + GFP_KERNEL); + dd->send_contexts = kcalloc(dd->num_send_contexts, + sizeof(struct send_context_info), + GFP_KERNEL); + if (!dd->send_contexts || !dd->hw_to_sw) { + kfree(dd->hw_to_sw); + kfree(dd->send_contexts); + free_credit_return(dd); + return -ENOMEM; + } + + /* hardware context map starts with invalid send context indices */ + for (i = 0; i < TXE_NUM_CONTEXTS; i++) + dd->hw_to_sw[i] = INVALID_SCI; + + /* + * All send contexts have their credit sizes. Allocate credits + * for each context one after another from the global space. + */ + context = 0; + base = 1; + for (i = 0; i < SC_MAX; i++) { + struct sc_config_sizes *scs = &dd->sc_sizes[i]; + + for (j = 0; j < scs->count; j++) { + struct send_context_info *sci = + &dd->send_contexts[context]; + sci->type = i; + sci->base = base; + sci->credits = scs->size; + + context++; + base += scs->size; + } + } + + return 0; +} + +/* + * Allocate a software index and hardware context of the given type. + * + * Must be called with dd->sc_lock held. + */ +static int sc_hw_alloc(struct hfi1_devdata *dd, int type, u32 *sw_index, + u32 *hw_context) +{ + struct send_context_info *sci; + u32 index; + u32 context; + + for (index = 0, sci = &dd->send_contexts[0]; + index < dd->num_send_contexts; index++, sci++) { + if (sci->type == type && sci->allocated == 0) { + sci->allocated = 1; + /* use a 1:1 mapping, but make them non-equal */ + context = dd->chip_send_contexts - index - 1; + dd->hw_to_sw[context] = index; + *sw_index = index; + *hw_context = context; + return 0; /* success */ + } + } + dd_dev_err(dd, "Unable to locate a free type %d send context\n", type); + return -ENOSPC; +} + +/* + * Free the send context given by its software index. + * + * Must be called with dd->sc_lock held. + */ +static void sc_hw_free(struct hfi1_devdata *dd, u32 sw_index, u32 hw_context) +{ + struct send_context_info *sci; + + sci = &dd->send_contexts[sw_index]; + if (!sci->allocated) { + dd_dev_err(dd, "%s: sw_index %u not allocated? hw_context %u\n", + __func__, sw_index, hw_context); + } + sci->allocated = 0; + dd->hw_to_sw[hw_context] = INVALID_SCI; +} + +/* return the base context of a context in a group */ +static inline u32 group_context(u32 context, u32 group) +{ + return (context >> group) << group; +} + +/* return the size of a group */ +static inline u32 group_size(u32 group) +{ + return 1 << group; +} + +/* + * Obtain the credit return addresses, kernel virtual and physical, for the + * given sc. + * + * To understand this routine: + * o va and pa are arrays of struct credit_return. One for each physical + * send context, per NUMA. + * o Each send context always looks in its relative location in a struct + * credit_return for its credit return. + * o Each send context in a group must have its return address CSR programmed + * with the same value. Use the address of the first send context in the + * group. + */ +static void cr_group_addresses(struct send_context *sc, dma_addr_t *pa) +{ + u32 gc = group_context(sc->hw_context, sc->group); + u32 index = sc->hw_context & 0x7; + + sc->hw_free = &sc->dd->cr_base[sc->node].va[gc].cr[index]; + *pa = (unsigned long) + &((struct credit_return *)sc->dd->cr_base[sc->node].pa)[gc]; +} + +/* + * Work queue function triggered in error interrupt routine for + * kernel contexts. + */ +static void sc_halted(struct work_struct *work) +{ + struct send_context *sc; + + sc = container_of(work, struct send_context, halt_work); + sc_restart(sc); +} + +/* + * Calculate PIO block threshold for this send context using the given MTU. + * Trigger a return when one MTU plus optional header of credits remain. + * + * Parameter mtu is in bytes. + * Parameter hdrqentsize is in DWORDs. + * + * Return value is what to write into the CSR: trigger return when + * unreturned credits pass this count. + */ +u32 sc_mtu_to_threshold(struct send_context *sc, u32 mtu, u32 hdrqentsize) +{ + u32 release_credits; + u32 threshold; + + /* add in the header size, then divide by the PIO block size */ + mtu += hdrqentsize << 2; + release_credits = DIV_ROUND_UP(mtu, PIO_BLOCK_SIZE); + + /* check against this context's credits */ + if (sc->credits <= release_credits) + threshold = 1; + else + threshold = sc->credits - release_credits; + + return threshold; +} + +/* + * Calculate credit threshold in terms of percent of the allocated credits. + * Trigger when unreturned credits equal or exceed the percentage of the whole. + * + * Return value is what to write into the CSR: trigger return when + * unreturned credits pass this count. + */ +u32 sc_percent_to_threshold(struct send_context *sc, u32 percent) +{ + return (sc->credits * percent) / 100; +} + +/* + * Set the credit return threshold. + */ +void sc_set_cr_threshold(struct send_context *sc, u32 new_threshold) +{ + unsigned long flags; + u32 old_threshold; + int force_return = 0; + + spin_lock_irqsave(&sc->credit_ctrl_lock, flags); + + old_threshold = (sc->credit_ctrl >> + SC(CREDIT_CTRL_THRESHOLD_SHIFT)) + & SC(CREDIT_CTRL_THRESHOLD_MASK); + + if (new_threshold != old_threshold) { + sc->credit_ctrl = + (sc->credit_ctrl + & ~SC(CREDIT_CTRL_THRESHOLD_SMASK)) + | ((new_threshold + & SC(CREDIT_CTRL_THRESHOLD_MASK)) + << SC(CREDIT_CTRL_THRESHOLD_SHIFT)); + write_kctxt_csr(sc->dd, sc->hw_context, + SC(CREDIT_CTRL), sc->credit_ctrl); + + /* force a credit return on change to avoid a possible stall */ + force_return = 1; + } + + spin_unlock_irqrestore(&sc->credit_ctrl_lock, flags); + + if (force_return) + sc_return_credits(sc); +} + +/* + * set_pio_integrity + * + * Set the CHECK_ENABLE register for the send context 'sc'. + */ +void set_pio_integrity(struct send_context *sc) +{ + struct hfi1_devdata *dd = sc->dd; + u64 reg = 0; + u32 hw_context = sc->hw_context; + int type = sc->type; + + /* + * No integrity checks if HFI1_CAP_NO_INTEGRITY is set, or if + * we're snooping. + */ + if (likely(!HFI1_CAP_IS_KSET(NO_INTEGRITY)) && + dd->hfi1_snoop.mode_flag != HFI1_PORT_SNOOP_MODE) + reg = hfi1_pkt_default_send_ctxt_mask(dd, type); + + write_kctxt_csr(dd, hw_context, SC(CHECK_ENABLE), reg); +} + +static u32 get_buffers_allocated(struct send_context *sc) +{ + int cpu; + u32 ret = 0; + + for_each_possible_cpu(cpu) + ret += *per_cpu_ptr(sc->buffers_allocated, cpu); + return ret; +} + +static void reset_buffers_allocated(struct send_context *sc) +{ + int cpu; + + for_each_possible_cpu(cpu) + (*per_cpu_ptr(sc->buffers_allocated, cpu)) = 0; +} + +/* + * Allocate a NUMA relative send context structure of the given type along + * with a HW context. + */ +struct send_context *sc_alloc(struct hfi1_devdata *dd, int type, + uint hdrqentsize, int numa) +{ + struct send_context_info *sci; + struct send_context *sc = NULL; + dma_addr_t pa; + unsigned long flags; + u64 reg; + u32 thresh; + u32 sw_index; + u32 hw_context; + int ret; + u8 opval, opmask; + + /* do not allocate while frozen */ + if (dd->flags & HFI1_FROZEN) + return NULL; + + sc = kzalloc_node(sizeof(*sc), GFP_KERNEL, numa); + if (!sc) + return NULL; + + sc->buffers_allocated = alloc_percpu(u32); + if (!sc->buffers_allocated) { + kfree(sc); + dd_dev_err(dd, + "Cannot allocate buffers_allocated per cpu counters\n" + ); + return NULL; + } + + spin_lock_irqsave(&dd->sc_lock, flags); + ret = sc_hw_alloc(dd, type, &sw_index, &hw_context); + if (ret) { + spin_unlock_irqrestore(&dd->sc_lock, flags); + free_percpu(sc->buffers_allocated); + kfree(sc); + return NULL; + } + + sci = &dd->send_contexts[sw_index]; + sci->sc = sc; + + sc->dd = dd; + sc->node = numa; + sc->type = type; + spin_lock_init(&sc->alloc_lock); + spin_lock_init(&sc->release_lock); + spin_lock_init(&sc->credit_ctrl_lock); + INIT_LIST_HEAD(&sc->piowait); + INIT_WORK(&sc->halt_work, sc_halted); + init_waitqueue_head(&sc->halt_wait); + + /* grouping is always single context for now */ + sc->group = 0; + + sc->sw_index = sw_index; + sc->hw_context = hw_context; + cr_group_addresses(sc, &pa); + sc->credits = sci->credits; + +/* PIO Send Memory Address details */ +#define PIO_ADDR_CONTEXT_MASK 0xfful +#define PIO_ADDR_CONTEXT_SHIFT 16 + sc->base_addr = dd->piobase + ((hw_context & PIO_ADDR_CONTEXT_MASK) + << PIO_ADDR_CONTEXT_SHIFT); + + /* set base and credits */ + reg = ((sci->credits & SC(CTRL_CTXT_DEPTH_MASK)) + << SC(CTRL_CTXT_DEPTH_SHIFT)) + | ((sci->base & SC(CTRL_CTXT_BASE_MASK)) + << SC(CTRL_CTXT_BASE_SHIFT)); + write_kctxt_csr(dd, hw_context, SC(CTRL), reg); + + set_pio_integrity(sc); + + /* unmask all errors */ + write_kctxt_csr(dd, hw_context, SC(ERR_MASK), (u64)-1); + + /* set the default partition key */ + write_kctxt_csr(dd, hw_context, SC(CHECK_PARTITION_KEY), + (SC(CHECK_PARTITION_KEY_VALUE_MASK) & + DEFAULT_PKEY) << + SC(CHECK_PARTITION_KEY_VALUE_SHIFT)); + + /* per context type checks */ + if (type == SC_USER) { + opval = USER_OPCODE_CHECK_VAL; + opmask = USER_OPCODE_CHECK_MASK; + } else { + opval = OPCODE_CHECK_VAL_DISABLED; + opmask = OPCODE_CHECK_MASK_DISABLED; + } + + /* set the send context check opcode mask and value */ + write_kctxt_csr(dd, hw_context, SC(CHECK_OPCODE), + ((u64)opmask << SC(CHECK_OPCODE_MASK_SHIFT)) | + ((u64)opval << SC(CHECK_OPCODE_VALUE_SHIFT))); + + /* set up credit return */ + reg = pa & SC(CREDIT_RETURN_ADDR_ADDRESS_SMASK); + write_kctxt_csr(dd, hw_context, SC(CREDIT_RETURN_ADDR), reg); + + /* + * Calculate the initial credit return threshold. + * + * For Ack contexts, set a threshold for half the credits. + * For User contexts use the given percentage. This has been + * sanitized on driver start-up. + * For Kernel contexts, use the default MTU plus a header + * or half the credits, whichever is smaller. This should + * work for both the 3-deep buffering allocation and the + * pooling allocation. + */ + if (type == SC_ACK) { + thresh = sc_percent_to_threshold(sc, 50); + } else if (type == SC_USER) { + thresh = sc_percent_to_threshold(sc, + user_credit_return_threshold); + } else { /* kernel */ + thresh = min(sc_percent_to_threshold(sc, 50), + sc_mtu_to_threshold(sc, hfi1_max_mtu, + hdrqentsize)); + } + reg = thresh << SC(CREDIT_CTRL_THRESHOLD_SHIFT); + /* add in early return */ + if (type == SC_USER && HFI1_CAP_IS_USET(EARLY_CREDIT_RETURN)) + reg |= SC(CREDIT_CTRL_EARLY_RETURN_SMASK); + else if (HFI1_CAP_IS_KSET(EARLY_CREDIT_RETURN)) /* kernel, ack */ + reg |= SC(CREDIT_CTRL_EARLY_RETURN_SMASK); + + /* set up write-through credit_ctrl */ + sc->credit_ctrl = reg; + write_kctxt_csr(dd, hw_context, SC(CREDIT_CTRL), reg); + + /* User send contexts should not allow sending on VL15 */ + if (type == SC_USER) { + reg = 1ULL << 15; + write_kctxt_csr(dd, hw_context, SC(CHECK_VL), reg); + } + + spin_unlock_irqrestore(&dd->sc_lock, flags); + + /* + * Allocate shadow ring to track outstanding PIO buffers _after_ + * unlocking. We don't know the size until the lock is held and + * we can't allocate while the lock is held. No one is using + * the context yet, so allocate it now. + * + * User contexts do not get a shadow ring. + */ + if (type != SC_USER) { + /* + * Size the shadow ring 1 larger than the number of credits + * so head == tail can mean empty. + */ + sc->sr_size = sci->credits + 1; + sc->sr = kzalloc_node(sizeof(union pio_shadow_ring) * + sc->sr_size, GFP_KERNEL, numa); + if (!sc->sr) { + sc_free(sc); + return NULL; + } + } + + hfi1_cdbg(PIO, + "Send context %u(%u) %s group %u credits %u credit_ctrl 0x%llx threshold %u\n", + sw_index, + hw_context, + sc_type_name(type), + sc->group, + sc->credits, + sc->credit_ctrl, + thresh); + + return sc; +} + +/* free a per-NUMA send context structure */ +void sc_free(struct send_context *sc) +{ + struct hfi1_devdata *dd; + unsigned long flags; + u32 sw_index; + u32 hw_context; + + if (!sc) + return; + + sc->flags |= SCF_IN_FREE; /* ensure no restarts */ + dd = sc->dd; + if (!list_empty(&sc->piowait)) + dd_dev_err(dd, "piowait list not empty!\n"); + sw_index = sc->sw_index; + hw_context = sc->hw_context; + sc_disable(sc); /* make sure the HW is disabled */ + flush_work(&sc->halt_work); + + spin_lock_irqsave(&dd->sc_lock, flags); + dd->send_contexts[sw_index].sc = NULL; + + /* clear/disable all registers set in sc_alloc */ + write_kctxt_csr(dd, hw_context, SC(CTRL), 0); + write_kctxt_csr(dd, hw_context, SC(CHECK_ENABLE), 0); + write_kctxt_csr(dd, hw_context, SC(ERR_MASK), 0); + write_kctxt_csr(dd, hw_context, SC(CHECK_PARTITION_KEY), 0); + write_kctxt_csr(dd, hw_context, SC(CHECK_OPCODE), 0); + write_kctxt_csr(dd, hw_context, SC(CREDIT_RETURN_ADDR), 0); + write_kctxt_csr(dd, hw_context, SC(CREDIT_CTRL), 0); + + /* release the index and context for re-use */ + sc_hw_free(dd, sw_index, hw_context); + spin_unlock_irqrestore(&dd->sc_lock, flags); + + kfree(sc->sr); + free_percpu(sc->buffers_allocated); + kfree(sc); +} + +/* disable the context */ +void sc_disable(struct send_context *sc) +{ + u64 reg; + unsigned long flags; + struct pio_buf *pbuf; + + if (!sc) + return; + + /* do all steps, even if already disabled */ + spin_lock_irqsave(&sc->alloc_lock, flags); + reg = read_kctxt_csr(sc->dd, sc->hw_context, SC(CTRL)); + reg &= ~SC(CTRL_CTXT_ENABLE_SMASK); + sc->flags &= ~SCF_ENABLED; + sc_wait_for_packet_egress(sc, 1); + write_kctxt_csr(sc->dd, sc->hw_context, SC(CTRL), reg); + spin_unlock_irqrestore(&sc->alloc_lock, flags); + + /* + * Flush any waiters. Once the context is disabled, + * credit return interrupts are stopped (although there + * could be one in-process when the context is disabled). + * Wait one microsecond for any lingering interrupts, then + * proceed with the flush. + */ + udelay(1); + spin_lock_irqsave(&sc->release_lock, flags); + if (sc->sr) { /* this context has a shadow ring */ + while (sc->sr_tail != sc->sr_head) { + pbuf = &sc->sr[sc->sr_tail].pbuf; + if (pbuf->cb) + (*pbuf->cb)(pbuf->arg, PRC_SC_DISABLE); + sc->sr_tail++; + if (sc->sr_tail >= sc->sr_size) + sc->sr_tail = 0; + } + } + spin_unlock_irqrestore(&sc->release_lock, flags); +} + +/* return SendEgressCtxtStatus.PacketOccupancy */ +#define packet_occupancy(r) \ + (((r) & SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SMASK)\ + >> SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_PACKET_OCCUPANCY_SHIFT) + +/* is egress halted on the context? */ +#define egress_halted(r) \ + ((r) & SEND_EGRESS_CTXT_STATUS_CTXT_EGRESS_HALT_STATUS_SMASK) + +/* wait for packet egress, optionally pause for credit return */ +static void sc_wait_for_packet_egress(struct send_context *sc, int pause) +{ + struct hfi1_devdata *dd = sc->dd; + u64 reg = 0; + u64 reg_prev; + u32 loop = 0; + + while (1) { + reg_prev = reg; + reg = read_csr(dd, sc->hw_context * 8 + + SEND_EGRESS_CTXT_STATUS); + /* done if egress is stopped */ + if (egress_halted(reg)) + break; + reg = packet_occupancy(reg); + if (reg == 0) + break; + /* counter is reset if occupancy count changes */ + if (reg != reg_prev) + loop = 0; + if (loop > 50000) { + /* timed out - bounce the link */ + dd_dev_err(dd, + "%s: context %u(%u) timeout waiting for packets to egress, remaining count %u, bouncing link\n", + __func__, sc->sw_index, + sc->hw_context, (u32)reg); + queue_work(dd->pport->hfi1_wq, + &dd->pport->link_bounce_work); + break; + } + loop++; + udelay(1); + } + + if (pause) + /* Add additional delay to ensure chip returns all credits */ + pause_for_credit_return(dd); +} + +void sc_wait(struct hfi1_devdata *dd) +{ + int i; + + for (i = 0; i < dd->num_send_contexts; i++) { + struct send_context *sc = dd->send_contexts[i].sc; + + if (!sc) + continue; + sc_wait_for_packet_egress(sc, 0); + } +} + +/* + * Restart a context after it has been halted due to error. + * + * If the first step fails - wait for the halt to be asserted, return early. + * Otherwise complain about timeouts but keep going. + * + * It is expected that allocations (enabled flag bit) have been shut off + * already (only applies to kernel contexts). + */ +int sc_restart(struct send_context *sc) +{ + struct hfi1_devdata *dd = sc->dd; + u64 reg; + u32 loop; + int count; + + /* bounce off if not halted, or being free'd */ + if (!(sc->flags & SCF_HALTED) || (sc->flags & SCF_IN_FREE)) + return -EINVAL; + + dd_dev_info(dd, "restarting send context %u(%u)\n", sc->sw_index, + sc->hw_context); + + /* + * Step 1: Wait for the context to actually halt. + * + * The error interrupt is asynchronous to actually setting halt + * on the context. + */ + loop = 0; + while (1) { + reg = read_kctxt_csr(dd, sc->hw_context, SC(STATUS)); + if (reg & SC(STATUS_CTXT_HALTED_SMASK)) + break; + if (loop > 100) { + dd_dev_err(dd, "%s: context %u(%u) not halting, skipping\n", + __func__, sc->sw_index, sc->hw_context); + return -ETIME; + } + loop++; + udelay(1); + } + + /* + * Step 2: Ensure no users are still trying to write to PIO. + * + * For kernel contexts, we have already turned off buffer allocation. + * Now wait for the buffer count to go to zero. + * + * For user contexts, the user handling code has cut off write access + * to the context's PIO pages before calling this routine and will + * restore write access after this routine returns. + */ + if (sc->type != SC_USER) { + /* kernel context */ + loop = 0; + while (1) { + count = get_buffers_allocated(sc); + if (count == 0) + break; + if (loop > 100) { + dd_dev_err(dd, + "%s: context %u(%u) timeout waiting for PIO buffers to zero, remaining %d\n", + __func__, sc->sw_index, + sc->hw_context, count); + } + loop++; + udelay(1); + } + } + + /* + * Step 3: Wait for all packets to egress. + * This is done while disabling the send context + * + * Step 4: Disable the context + * + * This is a superset of the halt. After the disable, the + * errors can be cleared. + */ + sc_disable(sc); + + /* + * Step 5: Enable the context + * + * This enable will clear the halted flag and per-send context + * error flags. + */ + return sc_enable(sc); +} + +/* + * PIO freeze processing. To be called after the TXE block is fully frozen. + * Go through all frozen send contexts and disable them. The contexts are + * already stopped by the freeze. + */ +void pio_freeze(struct hfi1_devdata *dd) +{ + struct send_context *sc; + int i; + + for (i = 0; i < dd->num_send_contexts; i++) { + sc = dd->send_contexts[i].sc; + /* + * Don't disable unallocated, unfrozen, or user send contexts. + * User send contexts will be disabled when the process + * calls into the driver to reset its context. + */ + if (!sc || !(sc->flags & SCF_FROZEN) || sc->type == SC_USER) + continue; + + /* only need to disable, the context is already stopped */ + sc_disable(sc); + } +} + +/* + * Unfreeze PIO for kernel send contexts. The precondition for calling this + * is that all PIO send contexts have been disabled and the SPC freeze has + * been cleared. Now perform the last step and re-enable each kernel context. + * User (PSM) processing will occur when PSM calls into the kernel to + * acknowledge the freeze. + */ +void pio_kernel_unfreeze(struct hfi1_devdata *dd) +{ + struct send_context *sc; + int i; + + for (i = 0; i < dd->num_send_contexts; i++) { + sc = dd->send_contexts[i].sc; + if (!sc || !(sc->flags & SCF_FROZEN) || sc->type == SC_USER) + continue; + + sc_enable(sc); /* will clear the sc frozen flag */ + } +} + +/* + * Wait for the SendPioInitCtxt.PioInitInProgress bit to clear. + * Returns: + * -ETIMEDOUT - if we wait too long + * -EIO - if there was an error + */ +static int pio_init_wait_progress(struct hfi1_devdata *dd) +{ + u64 reg; + int max, count = 0; + + /* max is the longest possible HW init time / delay */ + max = (dd->icode == ICODE_FPGA_EMULATION) ? 120 : 5; + while (1) { + reg = read_csr(dd, SEND_PIO_INIT_CTXT); + if (!(reg & SEND_PIO_INIT_CTXT_PIO_INIT_IN_PROGRESS_SMASK)) + break; + if (count >= max) + return -ETIMEDOUT; + udelay(5); + count++; + } + + return reg & SEND_PIO_INIT_CTXT_PIO_INIT_ERR_SMASK ? -EIO : 0; +} + +/* + * Reset all of the send contexts to their power-on state. Used + * only during manual init - no lock against sc_enable needed. + */ +void pio_reset_all(struct hfi1_devdata *dd) +{ + int ret; + + /* make sure the init engine is not busy */ + ret = pio_init_wait_progress(dd); + /* ignore any timeout */ + if (ret == -EIO) { + /* clear the error */ + write_csr(dd, SEND_PIO_ERR_CLEAR, + SEND_PIO_ERR_CLEAR_PIO_INIT_SM_IN_ERR_SMASK); + } + + /* reset init all */ + write_csr(dd, SEND_PIO_INIT_CTXT, + SEND_PIO_INIT_CTXT_PIO_ALL_CTXT_INIT_SMASK); + udelay(2); + ret = pio_init_wait_progress(dd); + if (ret < 0) { + dd_dev_err(dd, + "PIO send context init %s while initializing all PIO blocks\n", + ret == -ETIMEDOUT ? "is stuck" : "had an error"); + } +} + +/* enable the context */ +int sc_enable(struct send_context *sc) +{ + u64 sc_ctrl, reg, pio; + struct hfi1_devdata *dd; + unsigned long flags; + int ret = 0; + + if (!sc) + return -EINVAL; + dd = sc->dd; + + /* + * Obtain the allocator lock to guard against any allocation + * attempts (which should not happen prior to context being + * enabled). On the release/disable side we don't need to + * worry about locking since the releaser will not do anything + * if the context accounting values have not changed. + */ + spin_lock_irqsave(&sc->alloc_lock, flags); + sc_ctrl = read_kctxt_csr(dd, sc->hw_context, SC(CTRL)); + if ((sc_ctrl & SC(CTRL_CTXT_ENABLE_SMASK))) + goto unlock; /* already enabled */ + + /* IMPORTANT: only clear free and fill if transitioning 0 -> 1 */ + + *sc->hw_free = 0; + sc->free = 0; + sc->alloc_free = 0; + sc->fill = 0; + sc->sr_head = 0; + sc->sr_tail = 0; + sc->flags = 0; + /* the alloc lock insures no fast path allocation */ + reset_buffers_allocated(sc); + + /* + * Clear all per-context errors. Some of these will be set when + * we are re-enabling after a context halt. Now that the context + * is disabled, the halt will not clear until after the PIO init + * engine runs below. + */ + reg = read_kctxt_csr(dd, sc->hw_context, SC(ERR_STATUS)); + if (reg) + write_kctxt_csr(dd, sc->hw_context, SC(ERR_CLEAR), reg); + + /* + * The HW PIO initialization engine can handle only one init + * request at a time. Serialize access to each device's engine. + */ + spin_lock(&dd->sc_init_lock); + /* + * Since access to this code block is serialized and + * each access waits for the initialization to complete + * before releasing the lock, the PIO initialization engine + * should not be in use, so we don't have to wait for the + * InProgress bit to go down. + */ + pio = ((sc->hw_context & SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_MASK) << + SEND_PIO_INIT_CTXT_PIO_CTXT_NUM_SHIFT) | + SEND_PIO_INIT_CTXT_PIO_SINGLE_CTXT_INIT_SMASK; + write_csr(dd, SEND_PIO_INIT_CTXT, pio); + /* + * Wait until the engine is done. Give the chip the required time + * so, hopefully, we read the register just once. + */ + udelay(2); + ret = pio_init_wait_progress(dd); + spin_unlock(&dd->sc_init_lock); + if (ret) { + dd_dev_err(dd, + "sctxt%u(%u): Context not enabled due to init failure %d\n", + sc->sw_index, sc->hw_context, ret); + goto unlock; + } + + /* + * All is well. Enable the context. + */ + sc_ctrl |= SC(CTRL_CTXT_ENABLE_SMASK); + write_kctxt_csr(dd, sc->hw_context, SC(CTRL), sc_ctrl); + /* + * Read SendCtxtCtrl to force the write out and prevent a timing + * hazard where a PIO write may reach the context before the enable. + */ + read_kctxt_csr(dd, sc->hw_context, SC(CTRL)); + sc->flags |= SCF_ENABLED; + +unlock: + spin_unlock_irqrestore(&sc->alloc_lock, flags); + + return ret; +} + +/* force a credit return on the context */ +void sc_return_credits(struct send_context *sc) +{ + if (!sc) + return; + + /* a 0->1 transition schedules a credit return */ + write_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_FORCE), + SC(CREDIT_FORCE_FORCE_RETURN_SMASK)); + /* + * Ensure that the write is flushed and the credit return is + * scheduled. We care more about the 0 -> 1 transition. + */ + read_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_FORCE)); + /* set back to 0 for next time */ + write_kctxt_csr(sc->dd, sc->hw_context, SC(CREDIT_FORCE), 0); +} + +/* allow all in-flight packets to drain on the context */ +void sc_flush(struct send_context *sc) +{ + if (!sc) + return; + + sc_wait_for_packet_egress(sc, 1); +} + +/* drop all packets on the context, no waiting until they are sent */ +void sc_drop(struct send_context *sc) +{ + if (!sc) + return; + + dd_dev_info(sc->dd, "%s: context %u(%u) - not implemented\n", + __func__, sc->sw_index, sc->hw_context); +} + +/* + * Start the software reaction to a context halt or SPC freeze: + * - mark the context as halted or frozen + * - stop buffer allocations + * + * Called from the error interrupt. Other work is deferred until + * out of the interrupt. + */ +void sc_stop(struct send_context *sc, int flag) +{ + unsigned long flags; + + /* mark the context */ + sc->flags |= flag; + + /* stop buffer allocations */ + spin_lock_irqsave(&sc->alloc_lock, flags); + sc->flags &= ~SCF_ENABLED; + spin_unlock_irqrestore(&sc->alloc_lock, flags); + wake_up(&sc->halt_wait); +} + +#define BLOCK_DWORDS (PIO_BLOCK_SIZE / sizeof(u32)) +#define dwords_to_blocks(x) DIV_ROUND_UP(x, BLOCK_DWORDS) + +/* + * The send context buffer "allocator". + * + * @sc: the PIO send context we are allocating from + * @len: length of whole packet - including PBC - in dwords + * @cb: optional callback to call when the buffer is finished sending + * @arg: argument for cb + * + * Return a pointer to a PIO buffer if successful, NULL if not enough room. + */ +struct pio_buf *sc_buffer_alloc(struct send_context *sc, u32 dw_len, + pio_release_cb cb, void *arg) +{ + struct pio_buf *pbuf = NULL; + unsigned long flags; + unsigned long avail; + unsigned long blocks = dwords_to_blocks(dw_len); + unsigned long start_fill; + int trycount = 0; + u32 head, next; + + spin_lock_irqsave(&sc->alloc_lock, flags); + if (!(sc->flags & SCF_ENABLED)) { + spin_unlock_irqrestore(&sc->alloc_lock, flags); + goto done; + } + +retry: + avail = (unsigned long)sc->credits - (sc->fill - sc->alloc_free); + if (blocks > avail) { + /* not enough room */ + if (unlikely(trycount)) { /* already tried to get more room */ + spin_unlock_irqrestore(&sc->alloc_lock, flags); + goto done; + } + /* copy from receiver cache line and recalculate */ + sc->alloc_free = ACCESS_ONCE(sc->free); + avail = + (unsigned long)sc->credits - + (sc->fill - sc->alloc_free); + if (blocks > avail) { + /* still no room, actively update */ + spin_unlock_irqrestore(&sc->alloc_lock, flags); + sc_release_update(sc); + spin_lock_irqsave(&sc->alloc_lock, flags); + sc->alloc_free = ACCESS_ONCE(sc->free); + trycount++; + goto retry; + } + } + + /* there is enough room */ + + preempt_disable(); + this_cpu_inc(*sc->buffers_allocated); + + /* read this once */ + head = sc->sr_head; + + /* "allocate" the buffer */ + start_fill = sc->fill; + sc->fill += blocks; + + /* + * Fill the parts that the releaser looks at before moving the head. + * The only necessary piece is the sent_at field. The credits + * we have just allocated cannot have been returned yet, so the + * cb and arg will not be looked at for a "while". Put them + * on this side of the memory barrier anyway. + */ + pbuf = &sc->sr[head].pbuf; + pbuf->sent_at = sc->fill; + pbuf->cb = cb; + pbuf->arg = arg; + pbuf->sc = sc; /* could be filled in at sc->sr init time */ + /* make sure this is in memory before updating the head */ + + /* calculate next head index, do not store */ + next = head + 1; + if (next >= sc->sr_size) + next = 0; + /* + * update the head - must be last! - the releaser can look at fields + * in pbuf once we move the head + */ + smp_wmb(); + sc->sr_head = next; + spin_unlock_irqrestore(&sc->alloc_lock, flags); + + /* finish filling in the buffer outside the lock */ + pbuf->start = sc->base_addr + ((start_fill % sc->credits) + * PIO_BLOCK_SIZE); + pbuf->size = sc->credits * PIO_BLOCK_SIZE; + pbuf->end = sc->base_addr + pbuf->size; + pbuf->block_count = blocks; + pbuf->qw_written = 0; + pbuf->carry_bytes = 0; + pbuf->carry.val64 = 0; +done: + return pbuf; +} + +/* + * There are at least two entities that can turn on credit return + * interrupts and they can overlap. Avoid problems by implementing + * a count scheme that is enforced by a lock. The lock is needed because + * the count and CSR write must be paired. + */ + +/* + * Start credit return interrupts. This is managed by a count. If already + * on, just increment the count. + */ +void sc_add_credit_return_intr(struct send_context *sc) +{ + unsigned long flags; + + /* lock must surround both the count change and the CSR update */ + spin_lock_irqsave(&sc->credit_ctrl_lock, flags); + if (sc->credit_intr_count == 0) { + sc->credit_ctrl |= SC(CREDIT_CTRL_CREDIT_INTR_SMASK); + write_kctxt_csr(sc->dd, sc->hw_context, + SC(CREDIT_CTRL), sc->credit_ctrl); + } + sc->credit_intr_count++; + spin_unlock_irqrestore(&sc->credit_ctrl_lock, flags); +} + +/* + * Stop credit return interrupts. This is managed by a count. Decrement the + * count, if the last user, then turn the credit interrupts off. + */ +void sc_del_credit_return_intr(struct send_context *sc) +{ + unsigned long flags; + + WARN_ON(sc->credit_intr_count == 0); + + /* lock must surround both the count change and the CSR update */ + spin_lock_irqsave(&sc->credit_ctrl_lock, flags); + sc->credit_intr_count--; + if (sc->credit_intr_count == 0) { + sc->credit_ctrl &= ~SC(CREDIT_CTRL_CREDIT_INTR_SMASK); + write_kctxt_csr(sc->dd, sc->hw_context, + SC(CREDIT_CTRL), sc->credit_ctrl); + } + spin_unlock_irqrestore(&sc->credit_ctrl_lock, flags); +} + +/* + * The caller must be careful when calling this. All needint calls + * must be paired with !needint. + */ +void hfi1_sc_wantpiobuf_intr(struct send_context *sc, u32 needint) +{ + if (needint) + sc_add_credit_return_intr(sc); + else + sc_del_credit_return_intr(sc); + trace_hfi1_wantpiointr(sc, needint, sc->credit_ctrl); + if (needint) { + mmiowb(); + sc_return_credits(sc); + } +} + +/** + * sc_piobufavail - callback when a PIO buffer is available + * @sc: the send context + * + * This is called from the interrupt handler when a PIO buffer is + * available after hfi1_verbs_send() returned an error that no buffers were + * available. Disable the interrupt if there are no more QPs waiting. + */ +static void sc_piobufavail(struct send_context *sc) +{ + struct hfi1_devdata *dd = sc->dd; + struct hfi1_ibdev *dev = &dd->verbs_dev; + struct list_head *list; + struct rvt_qp *qps[PIO_WAIT_BATCH_SIZE]; + struct rvt_qp *qp; + struct hfi1_qp_priv *priv; + unsigned long flags; + unsigned i, n = 0; + + if (dd->send_contexts[sc->sw_index].type != SC_KERNEL && + dd->send_contexts[sc->sw_index].type != SC_VL15) + return; + list = &sc->piowait; + /* + * Note: checking that the piowait list is empty and clearing + * the buffer available interrupt needs to be atomic or we + * could end up with QPs on the wait list with the interrupt + * disabled. + */ + write_seqlock_irqsave(&dev->iowait_lock, flags); + while (!list_empty(list)) { + struct iowait *wait; + + if (n == ARRAY_SIZE(qps)) + break; + wait = list_first_entry(list, struct iowait, list); + qp = iowait_to_qp(wait); + priv = qp->priv; + list_del_init(&priv->s_iowait.list); + /* refcount held until actual wake up */ + qps[n++] = qp; + } + /* + * If there had been waiters and there are more + * insure that we redo the force to avoid a potential hang. + */ + if (n) { + hfi1_sc_wantpiobuf_intr(sc, 0); + if (!list_empty(list)) + hfi1_sc_wantpiobuf_intr(sc, 1); + } + write_sequnlock_irqrestore(&dev->iowait_lock, flags); + + for (i = 0; i < n; i++) + hfi1_qp_wakeup(qps[i], + RVT_S_WAIT_PIO | RVT_S_WAIT_PIO_DRAIN); +} + +/* translate a send credit update to a bit code of reasons */ +static inline int fill_code(u64 hw_free) +{ + int code = 0; + + if (hw_free & CR_STATUS_SMASK) + code |= PRC_STATUS_ERR; + if (hw_free & CR_CREDIT_RETURN_DUE_TO_PBC_SMASK) + code |= PRC_PBC; + if (hw_free & CR_CREDIT_RETURN_DUE_TO_THRESHOLD_SMASK) + code |= PRC_THRESHOLD; + if (hw_free & CR_CREDIT_RETURN_DUE_TO_ERR_SMASK) + code |= PRC_FILL_ERR; + if (hw_free & CR_CREDIT_RETURN_DUE_TO_FORCE_SMASK) + code |= PRC_SC_DISABLE; + return code; +} + +/* use the jiffies compare to get the wrap right */ +#define sent_before(a, b) time_before(a, b) /* a < b */ + +/* + * The send context buffer "releaser". + */ +void sc_release_update(struct send_context *sc) +{ + struct pio_buf *pbuf; + u64 hw_free; + u32 head, tail; + unsigned long old_free; + unsigned long free; + unsigned long extra; + unsigned long flags; + int code; + + if (!sc) + return; + + spin_lock_irqsave(&sc->release_lock, flags); + /* update free */ + hw_free = le64_to_cpu(*sc->hw_free); /* volatile read */ + old_free = sc->free; + extra = (((hw_free & CR_COUNTER_SMASK) >> CR_COUNTER_SHIFT) + - (old_free & CR_COUNTER_MASK)) + & CR_COUNTER_MASK; + free = old_free + extra; + trace_hfi1_piofree(sc, extra); + + /* call sent buffer callbacks */ + code = -1; /* code not yet set */ + head = ACCESS_ONCE(sc->sr_head); /* snapshot the head */ + tail = sc->sr_tail; + while (head != tail) { + pbuf = &sc->sr[tail].pbuf; + + if (sent_before(free, pbuf->sent_at)) { + /* not sent yet */ + break; + } + if (pbuf->cb) { + if (code < 0) /* fill in code on first user */ + code = fill_code(hw_free); + (*pbuf->cb)(pbuf->arg, code); + } + + tail++; + if (tail >= sc->sr_size) + tail = 0; + } + sc->sr_tail = tail; + /* make sure tail is updated before free */ + smp_wmb(); + sc->free = free; + spin_unlock_irqrestore(&sc->release_lock, flags); + sc_piobufavail(sc); +} + +/* + * Send context group releaser. Argument is the send context that caused + * the interrupt. Called from the send context interrupt handler. + * + * Call release on all contexts in the group. + * + * This routine takes the sc_lock without an irqsave because it is only + * called from an interrupt handler. Adjust if that changes. + */ +void sc_group_release_update(struct hfi1_devdata *dd, u32 hw_context) +{ + struct send_context *sc; + u32 sw_index; + u32 gc, gc_end; + + spin_lock(&dd->sc_lock); + sw_index = dd->hw_to_sw[hw_context]; + if (unlikely(sw_index >= dd->num_send_contexts)) { + dd_dev_err(dd, "%s: invalid hw (%u) to sw (%u) mapping\n", + __func__, hw_context, sw_index); + goto done; + } + sc = dd->send_contexts[sw_index].sc; + if (unlikely(!sc)) + goto done; + + gc = group_context(hw_context, sc->group); + gc_end = gc + group_size(sc->group); + for (; gc < gc_end; gc++) { + sw_index = dd->hw_to_sw[gc]; + if (unlikely(sw_index >= dd->num_send_contexts)) { + dd_dev_err(dd, + "%s: invalid hw (%u) to sw (%u) mapping\n", + __func__, hw_context, sw_index); + continue; + } + sc_release_update(dd->send_contexts[sw_index].sc); + } +done: + spin_unlock(&dd->sc_lock); +} + +/* + * pio_select_send_context_vl() - select send context + * @dd: devdata + * @selector: a spreading factor + * @vl: this vl + * + * This function returns a send context based on the selector and a vl. + * The mapping fields are protected by RCU + */ +struct send_context *pio_select_send_context_vl(struct hfi1_devdata *dd, + u32 selector, u8 vl) +{ + struct pio_vl_map *m; + struct pio_map_elem *e; + struct send_context *rval; + + /* + * NOTE This should only happen if SC->VL changed after the initial + * checks on the QP/AH + * Default will return VL0's send context below + */ + if (unlikely(vl >= num_vls)) { + rval = NULL; + goto done; + } + + rcu_read_lock(); + m = rcu_dereference(dd->pio_map); + if (unlikely(!m)) { + rcu_read_unlock(); + return dd->vld[0].sc; + } + e = m->map[vl & m->mask]; + rval = e->ksc[selector & e->mask]; + rcu_read_unlock(); + +done: + rval = !rval ? dd->vld[0].sc : rval; + return rval; +} + +/* + * pio_select_send_context_sc() - select send context + * @dd: devdata + * @selector: a spreading factor + * @sc5: the 5 bit sc + * + * This function returns an send context based on the selector and an sc + */ +struct send_context *pio_select_send_context_sc(struct hfi1_devdata *dd, + u32 selector, u8 sc5) +{ + u8 vl = sc_to_vlt(dd, sc5); + + return pio_select_send_context_vl(dd, selector, vl); +} + +/* + * Free the indicated map struct + */ +static void pio_map_free(struct pio_vl_map *m) +{ + int i; + + for (i = 0; m && i < m->actual_vls; i++) + kfree(m->map[i]); + kfree(m); +} + +/* + * Handle RCU callback + */ +static void pio_map_rcu_callback(struct rcu_head *list) +{ + struct pio_vl_map *m = container_of(list, struct pio_vl_map, list); + + pio_map_free(m); +} + +/* + * Set credit return threshold for the kernel send context + */ +static void set_threshold(struct hfi1_devdata *dd, int scontext, int i) +{ + u32 thres; + + thres = min(sc_percent_to_threshold(dd->kernel_send_context[scontext], + 50), + sc_mtu_to_threshold(dd->kernel_send_context[scontext], + dd->vld[i].mtu, + dd->rcd[0]->rcvhdrqentsize)); + sc_set_cr_threshold(dd->kernel_send_context[scontext], thres); +} + +/* + * pio_map_init - called when #vls change + * @dd: hfi1_devdata + * @port: port number + * @num_vls: number of vls + * @vl_scontexts: per vl send context mapping (optional) + * + * This routine changes the mapping based on the number of vls. + * + * vl_scontexts is used to specify a non-uniform vl/send context + * loading. NULL implies auto computing the loading and giving each + * VL an uniform distribution of send contexts per VL. + * + * The auto algorithm computers the sc_per_vl and the number of extra + * send contexts. Any extra send contexts are added from the last VL + * on down + * + * rcu locking is used here to control access to the mapping fields. + * + * If either the num_vls or num_send_contexts are non-power of 2, the + * array sizes in the struct pio_vl_map and the struct pio_map_elem are + * rounded up to the next highest power of 2 and the first entry is + * reused in a round robin fashion. + * + * If an error occurs the map change is not done and the mapping is not + * chaged. + * + */ +int pio_map_init(struct hfi1_devdata *dd, u8 port, u8 num_vls, u8 *vl_scontexts) +{ + int i, j; + int extra, sc_per_vl; + int scontext = 1; + int num_kernel_send_contexts = 0; + u8 lvl_scontexts[OPA_MAX_VLS]; + struct pio_vl_map *oldmap, *newmap; + + if (!vl_scontexts) { + for (i = 0; i < dd->num_send_contexts; i++) + if (dd->send_contexts[i].type == SC_KERNEL) + num_kernel_send_contexts++; + /* truncate divide */ + sc_per_vl = num_kernel_send_contexts / num_vls; + /* extras */ + extra = num_kernel_send_contexts % num_vls; + vl_scontexts = lvl_scontexts; + /* add extras from last vl down */ + for (i = num_vls - 1; i >= 0; i--, extra--) + vl_scontexts[i] = sc_per_vl + (extra > 0 ? 1 : 0); + } + /* build new map */ + newmap = kzalloc(sizeof(*newmap) + + roundup_pow_of_two(num_vls) * + sizeof(struct pio_map_elem *), + GFP_KERNEL); + if (!newmap) + goto bail; + newmap->actual_vls = num_vls; + newmap->vls = roundup_pow_of_two(num_vls); + newmap->mask = (1 << ilog2(newmap->vls)) - 1; + for (i = 0; i < newmap->vls; i++) { + /* save for wrap around */ + int first_scontext = scontext; + + if (i < newmap->actual_vls) { + int sz = roundup_pow_of_two(vl_scontexts[i]); + + /* only allocate once */ + newmap->map[i] = kzalloc(sizeof(*newmap->map[i]) + + sz * sizeof(struct + send_context *), + GFP_KERNEL); + if (!newmap->map[i]) + goto bail; + newmap->map[i]->mask = (1 << ilog2(sz)) - 1; + /* + * assign send contexts and + * adjust credit return threshold + */ + for (j = 0; j < sz; j++) { + if (dd->kernel_send_context[scontext]) { + newmap->map[i]->ksc[j] = + dd->kernel_send_context[scontext]; + set_threshold(dd, scontext, i); + } + if (++scontext >= first_scontext + + vl_scontexts[i]) + /* wrap back to first send context */ + scontext = first_scontext; + } + } else { + /* just re-use entry without allocating */ + newmap->map[i] = newmap->map[i % num_vls]; + } + scontext = first_scontext + vl_scontexts[i]; + } + /* newmap in hand, save old map */ + spin_lock_irq(&dd->pio_map_lock); + oldmap = rcu_dereference_protected(dd->pio_map, + lockdep_is_held(&dd->pio_map_lock)); + + /* publish newmap */ + rcu_assign_pointer(dd->pio_map, newmap); + + spin_unlock_irq(&dd->pio_map_lock); + /* success, free any old map after grace period */ + if (oldmap) + call_rcu(&oldmap->list, pio_map_rcu_callback); + return 0; +bail: + /* free any partial allocation */ + pio_map_free(newmap); + return -ENOMEM; +} + +void free_pio_map(struct hfi1_devdata *dd) +{ + /* Free PIO map if allocated */ + if (rcu_access_pointer(dd->pio_map)) { + spin_lock_irq(&dd->pio_map_lock); + pio_map_free(rcu_access_pointer(dd->pio_map)); + RCU_INIT_POINTER(dd->pio_map, NULL); + spin_unlock_irq(&dd->pio_map_lock); + synchronize_rcu(); + } + kfree(dd->kernel_send_context); + dd->kernel_send_context = NULL; +} + +int init_pervl_scs(struct hfi1_devdata *dd) +{ + int i; + u64 mask, all_vl_mask = (u64)0x80ff; /* VLs 0-7, 15 */ + u64 data_vls_mask = (u64)0x00ff; /* VLs 0-7 */ + u32 ctxt; + struct hfi1_pportdata *ppd = dd->pport; + + dd->vld[15].sc = sc_alloc(dd, SC_VL15, + dd->rcd[0]->rcvhdrqentsize, dd->node); + if (!dd->vld[15].sc) + goto nomem; + hfi1_init_ctxt(dd->vld[15].sc); + dd->vld[15].mtu = enum_to_mtu(OPA_MTU_2048); + + dd->kernel_send_context = kmalloc_node(dd->num_send_contexts * + sizeof(struct send_context *), + GFP_KERNEL, dd->node); + dd->kernel_send_context[0] = dd->vld[15].sc; + + for (i = 0; i < num_vls; i++) { + /* + * Since this function does not deal with a specific + * receive context but we need the RcvHdrQ entry size, + * use the size from rcd[0]. It is guaranteed to be + * valid at this point and will remain the same for all + * receive contexts. + */ + dd->vld[i].sc = sc_alloc(dd, SC_KERNEL, + dd->rcd[0]->rcvhdrqentsize, dd->node); + if (!dd->vld[i].sc) + goto nomem; + dd->kernel_send_context[i + 1] = dd->vld[i].sc; + hfi1_init_ctxt(dd->vld[i].sc); + /* non VL15 start with the max MTU */ + dd->vld[i].mtu = hfi1_max_mtu; + } + for (i = num_vls; i < INIT_SC_PER_VL * num_vls; i++) { + dd->kernel_send_context[i + 1] = + sc_alloc(dd, SC_KERNEL, dd->rcd[0]->rcvhdrqentsize, dd->node); + if (!dd->kernel_send_context[i + 1]) + goto nomem; + hfi1_init_ctxt(dd->kernel_send_context[i + 1]); + } + + sc_enable(dd->vld[15].sc); + ctxt = dd->vld[15].sc->hw_context; + mask = all_vl_mask & ~(1LL << 15); + write_kctxt_csr(dd, ctxt, SC(CHECK_VL), mask); + dd_dev_info(dd, + "Using send context %u(%u) for VL15\n", + dd->vld[15].sc->sw_index, ctxt); + + for (i = 0; i < num_vls; i++) { + sc_enable(dd->vld[i].sc); + ctxt = dd->vld[i].sc->hw_context; + mask = all_vl_mask & ~(data_vls_mask); + write_kctxt_csr(dd, ctxt, SC(CHECK_VL), mask); + } + for (i = num_vls; i < INIT_SC_PER_VL * num_vls; i++) { + sc_enable(dd->kernel_send_context[i + 1]); + ctxt = dd->kernel_send_context[i + 1]->hw_context; + mask = all_vl_mask & ~(data_vls_mask); + write_kctxt_csr(dd, ctxt, SC(CHECK_VL), mask); + } + + if (pio_map_init(dd, ppd->port - 1, num_vls, NULL)) + goto nomem; + return 0; +nomem: + sc_free(dd->vld[15].sc); + for (i = 0; i < num_vls; i++) + sc_free(dd->vld[i].sc); + for (i = num_vls; i < INIT_SC_PER_VL * num_vls; i++) + sc_free(dd->kernel_send_context[i + 1]); + return -ENOMEM; +} + +int init_credit_return(struct hfi1_devdata *dd) +{ + int ret; + int num_numa; + int i; + + num_numa = num_online_nodes(); + /* enforce the expectation that the numas are compact */ + for (i = 0; i < num_numa; i++) { + if (!node_online(i)) { + dd_dev_err(dd, "NUMA nodes are not compact\n"); + ret = -EINVAL; + goto done; + } + } + + dd->cr_base = kcalloc( + num_numa, + sizeof(struct credit_return_base), + GFP_KERNEL); + if (!dd->cr_base) { + dd_dev_err(dd, "Unable to allocate credit return base\n"); + ret = -ENOMEM; + goto done; + } + for (i = 0; i < num_numa; i++) { + int bytes = TXE_NUM_CONTEXTS * sizeof(struct credit_return); + + set_dev_node(&dd->pcidev->dev, i); + dd->cr_base[i].va = dma_zalloc_coherent( + &dd->pcidev->dev, + bytes, + &dd->cr_base[i].pa, + GFP_KERNEL); + if (!dd->cr_base[i].va) { + set_dev_node(&dd->pcidev->dev, dd->node); + dd_dev_err(dd, + "Unable to allocate credit return DMA range for NUMA %d\n", + i); + ret = -ENOMEM; + goto done; + } + } + set_dev_node(&dd->pcidev->dev, dd->node); + + ret = 0; +done: + return ret; +} + +void free_credit_return(struct hfi1_devdata *dd) +{ + int num_numa; + int i; + + if (!dd->cr_base) + return; + + num_numa = num_online_nodes(); + for (i = 0; i < num_numa; i++) { + if (dd->cr_base[i].va) { + dma_free_coherent(&dd->pcidev->dev, + TXE_NUM_CONTEXTS * + sizeof(struct credit_return), + dd->cr_base[i].va, + dd->cr_base[i].pa); + } + } + kfree(dd->cr_base); + dd->cr_base = NULL; +} diff --git a/drivers/infiniband/hw/hfi1/pio.h b/drivers/infiniband/hw/hfi1/pio.h new file mode 100644 index 000000000..464cbd27b --- /dev/null +++ b/drivers/infiniband/hw/hfi1/pio.h @@ -0,0 +1,328 @@ +#ifndef _PIO_H +#define _PIO_H +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* send context types */ +#define SC_KERNEL 0 +#define SC_VL15 1 +#define SC_ACK 2 +#define SC_USER 3 /* must be the last one: it may take all left */ +#define SC_MAX 4 /* count of send context types */ + +/* invalid send context index */ +#define INVALID_SCI 0xff + +/* PIO buffer release callback function */ +typedef void (*pio_release_cb)(void *arg, int code); + +/* PIO release codes - in bits, as there could more than one that apply */ +#define PRC_OK 0 /* no known error */ +#define PRC_STATUS_ERR 0x01 /* credit return due to status error */ +#define PRC_PBC 0x02 /* credit return due to PBC */ +#define PRC_THRESHOLD 0x04 /* credit return due to threshold */ +#define PRC_FILL_ERR 0x08 /* credit return due fill error */ +#define PRC_FORCE 0x10 /* credit return due credit force */ +#define PRC_SC_DISABLE 0x20 /* clean-up after a context disable */ + +/* byte helper */ +union mix { + u64 val64; + u32 val32[2]; + u8 val8[8]; +}; + +/* an allocated PIO buffer */ +struct pio_buf { + struct send_context *sc;/* back pointer to owning send context */ + pio_release_cb cb; /* called when the buffer is released */ + void *arg; /* argument for cb */ + void __iomem *start; /* buffer start address */ + void __iomem *end; /* context end address */ + unsigned long size; /* context size, in bytes */ + unsigned long sent_at; /* buffer is sent when <= free */ + u32 block_count; /* size of buffer, in blocks */ + u32 qw_written; /* QW written so far */ + u32 carry_bytes; /* number of valid bytes in carry */ + union mix carry; /* pending unwritten bytes */ +}; + +/* cache line aligned pio buffer array */ +union pio_shadow_ring { + struct pio_buf pbuf; + u64 unused[16]; /* cache line spacer */ +} ____cacheline_aligned; + +/* per-NUMA send context */ +struct send_context { + /* read-only after init */ + struct hfi1_devdata *dd; /* device */ + void __iomem *base_addr; /* start of PIO memory */ + union pio_shadow_ring *sr; /* shadow ring */ + + volatile __le64 *hw_free; /* HW free counter */ + struct work_struct halt_work; /* halted context work queue entry */ + unsigned long flags; /* flags */ + int node; /* context home node */ + int type; /* context type */ + u32 sw_index; /* software index number */ + u32 hw_context; /* hardware context number */ + u32 credits; /* number of blocks in context */ + u32 sr_size; /* size of the shadow ring */ + u32 group; /* credit return group */ + /* allocator fields */ + spinlock_t alloc_lock ____cacheline_aligned_in_smp; + unsigned long fill; /* official alloc count */ + unsigned long alloc_free; /* copy of free (less cache thrash) */ + u32 sr_head; /* shadow ring head */ + /* releaser fields */ + spinlock_t release_lock ____cacheline_aligned_in_smp; + unsigned long free; /* official free count */ + u32 sr_tail; /* shadow ring tail */ + /* list for PIO waiters */ + struct list_head piowait ____cacheline_aligned_in_smp; + spinlock_t credit_ctrl_lock ____cacheline_aligned_in_smp; + u64 credit_ctrl; /* cache for credit control */ + u32 credit_intr_count; /* count of credit intr users */ + u32 __percpu *buffers_allocated;/* count of buffers allocated */ + wait_queue_head_t halt_wait; /* wait until kernel sees interrupt */ +}; + +/* send context flags */ +#define SCF_ENABLED 0x01 +#define SCF_IN_FREE 0x02 +#define SCF_HALTED 0x04 +#define SCF_FROZEN 0x08 + +struct send_context_info { + struct send_context *sc; /* allocated working context */ + u16 allocated; /* has this been allocated? */ + u16 type; /* context type */ + u16 base; /* base in PIO array */ + u16 credits; /* size in PIO array */ +}; + +/* DMA credit return, index is always (context & 0x7) */ +struct credit_return { + volatile __le64 cr[8]; +}; + +/* NUMA indexed credit return array */ +struct credit_return_base { + struct credit_return *va; + dma_addr_t pa; +}; + +/* send context configuration sizes (one per type) */ +struct sc_config_sizes { + short int size; + short int count; +}; + +/* + * The diagram below details the relationship of the mapping structures + * + * Since the mapping now allows for non-uniform send contexts per vl, the + * number of send contexts for a vl is either the vl_scontexts[vl] or + * a computation based on num_kernel_send_contexts/num_vls: + * + * For example: + * nactual = vl_scontexts ? vl_scontexts[vl] : num_kernel_send_contexts/num_vls + * + * n = roundup to next highest power of 2 using nactual + * + * In the case where there are num_kernel_send_contexts/num_vls doesn't divide + * evenly, the extras are added from the last vl downward. + * + * For the case where n > nactual, the send contexts are assigned + * in a round robin fashion wrapping back to the first send context + * for a particular vl. + * + * dd->pio_map + * | pio_map_elem[0] + * | +--------------------+ + * v | mask | + * pio_vl_map |--------------------| + * +--------------------------+ | ksc[0] -> sc 1 | + * | list (RCU) | |--------------------| + * |--------------------------| ->| ksc[1] -> sc 2 | + * | mask | --/ |--------------------| + * |--------------------------| -/ | * | + * | actual_vls (max 8) | -/ |--------------------| + * |--------------------------| --/ | ksc[n] -> sc n | + * | vls (max 8) | -/ +--------------------+ + * |--------------------------| --/ + * | map[0] |-/ + * |--------------------------| +--------------------+ + * | map[1] |--- | mask | + * |--------------------------| \---- |--------------------| + * | * | \-- | ksc[0] -> sc 1+n | + * | * | \---- |--------------------| + * | * | \->| ksc[1] -> sc 2+n | + * |--------------------------| |--------------------| + * | map[vls - 1] |- | * | + * +--------------------------+ \- |--------------------| + * \- | ksc[m] -> sc m+n | + * \ +--------------------+ + * \- + * \ + * \- +--------------------+ + * \- | mask | + * \ |--------------------| + * \- | ksc[0] -> sc 1+m+n | + * \- |--------------------| + * >| ksc[1] -> sc 2+m+n | + * |--------------------| + * | * | + * |--------------------| + * | ksc[o] -> sc o+m+n | + * +--------------------+ + * + */ + +/* Initial number of send contexts per VL */ +#define INIT_SC_PER_VL 2 + +/* + * struct pio_map_elem - mapping for a vl + * @mask - selector mask + * @ksc - array of kernel send contexts for this vl + * + * The mask is used to "mod" the selector to + * produce index into the trailing array of + * kscs + */ +struct pio_map_elem { + u32 mask; + struct send_context *ksc[0]; +}; + +/* + * struct pio_vl_map - mapping for a vl + * @list - rcu head for free callback + * @mask - vl mask to "mod" the vl to produce an index to map array + * @actual_vls - number of vls + * @vls - numbers of vls rounded to next power of 2 + * @map - array of pio_map_elem entries + * + * This is the parent mapping structure. The trailing members of the + * struct point to pio_map_elem entries, which in turn point to an + * array of kscs for that vl. + */ +struct pio_vl_map { + struct rcu_head list; + u32 mask; + u8 actual_vls; + u8 vls; + struct pio_map_elem *map[0]; +}; + +int pio_map_init(struct hfi1_devdata *dd, u8 port, u8 num_vls, + u8 *vl_scontexts); +void free_pio_map(struct hfi1_devdata *dd); +struct send_context *pio_select_send_context_vl(struct hfi1_devdata *dd, + u32 selector, u8 vl); +struct send_context *pio_select_send_context_sc(struct hfi1_devdata *dd, + u32 selector, u8 sc5); + +/* send context functions */ +int init_credit_return(struct hfi1_devdata *dd); +void free_credit_return(struct hfi1_devdata *dd); +int init_sc_pools_and_sizes(struct hfi1_devdata *dd); +int init_send_contexts(struct hfi1_devdata *dd); +int init_credit_return(struct hfi1_devdata *dd); +int init_pervl_scs(struct hfi1_devdata *dd); +struct send_context *sc_alloc(struct hfi1_devdata *dd, int type, + uint hdrqentsize, int numa); +void sc_free(struct send_context *sc); +int sc_enable(struct send_context *sc); +void sc_disable(struct send_context *sc); +int sc_restart(struct send_context *sc); +void sc_return_credits(struct send_context *sc); +void sc_flush(struct send_context *sc); +void sc_drop(struct send_context *sc); +void sc_stop(struct send_context *sc, int bit); +struct pio_buf *sc_buffer_alloc(struct send_context *sc, u32 dw_len, + pio_release_cb cb, void *arg); +void sc_release_update(struct send_context *sc); +void sc_return_credits(struct send_context *sc); +void sc_group_release_update(struct hfi1_devdata *dd, u32 hw_context); +void sc_add_credit_return_intr(struct send_context *sc); +void sc_del_credit_return_intr(struct send_context *sc); +void sc_set_cr_threshold(struct send_context *sc, u32 new_threshold); +u32 sc_percent_to_threshold(struct send_context *sc, u32 percent); +u32 sc_mtu_to_threshold(struct send_context *sc, u32 mtu, u32 hdrqentsize); +void hfi1_sc_wantpiobuf_intr(struct send_context *sc, u32 needint); +void sc_wait(struct hfi1_devdata *dd); +void set_pio_integrity(struct send_context *sc); + +/* support functions */ +void pio_reset_all(struct hfi1_devdata *dd); +void pio_freeze(struct hfi1_devdata *dd); +void pio_kernel_unfreeze(struct hfi1_devdata *dd); + +/* global PIO send control operations */ +#define PSC_GLOBAL_ENABLE 0 +#define PSC_GLOBAL_DISABLE 1 +#define PSC_GLOBAL_VLARB_ENABLE 2 +#define PSC_GLOBAL_VLARB_DISABLE 3 +#define PSC_CM_RESET 4 +#define PSC_DATA_VL_ENABLE 5 +#define PSC_DATA_VL_DISABLE 6 + +void __cm_reset(struct hfi1_devdata *dd, u64 sendctrl); +void pio_send_control(struct hfi1_devdata *dd, int op); + +/* PIO copy routines */ +void pio_copy(struct hfi1_devdata *dd, struct pio_buf *pbuf, u64 pbc, + const void *from, size_t count); +void seg_pio_copy_start(struct pio_buf *pbuf, u64 pbc, + const void *from, size_t nbytes); +void seg_pio_copy_mid(struct pio_buf *pbuf, const void *from, size_t nbytes); +void seg_pio_copy_end(struct pio_buf *pbuf); + +#endif /* _PIO_H */ diff --git a/drivers/infiniband/hw/hfi1/pio_copy.c b/drivers/infiniband/hw/hfi1/pio_copy.c new file mode 100644 index 000000000..8c25e1b58 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/pio_copy.c @@ -0,0 +1,867 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include "hfi.h" + +/* additive distance between non-SOP and SOP space */ +#define SOP_DISTANCE (TXE_PIO_SIZE / 2) +#define PIO_BLOCK_MASK (PIO_BLOCK_SIZE - 1) +/* number of QUADWORDs in a block */ +#define PIO_BLOCK_QWS (PIO_BLOCK_SIZE / sizeof(u64)) + +/** + * pio_copy - copy data block to MMIO space + * @pbuf: a number of blocks allocated within a PIO send context + * @pbc: PBC to send + * @from: source, must be 8 byte aligned + * @count: number of DWORD (32-bit) quantities to copy from source + * + * Copy data from source to PIO Send Buffer memory, 8 bytes at a time. + * Must always write full BLOCK_SIZE bytes blocks. The first block must + * be written to the corresponding SOP=1 address. + * + * Known: + * o pbuf->start always starts on a block boundary + * o pbuf can wrap only at a block boundary + */ +void pio_copy(struct hfi1_devdata *dd, struct pio_buf *pbuf, u64 pbc, + const void *from, size_t count) +{ + void __iomem *dest = pbuf->start + SOP_DISTANCE; + void __iomem *send = dest + PIO_BLOCK_SIZE; + void __iomem *dend; /* 8-byte data end */ + + /* write the PBC */ + writeq(pbc, dest); + dest += sizeof(u64); + + /* calculate where the QWORD data ends - in SOP=1 space */ + dend = dest + ((count >> 1) * sizeof(u64)); + + if (dend < send) { + /* + * all QWORD data is within the SOP block, does *not* + * reach the end of the SOP block + */ + + while (dest < dend) { + writeq(*(u64 *)from, dest); + from += sizeof(u64); + dest += sizeof(u64); + } + /* + * No boundary checks are needed here: + * 0. We're not on the SOP block boundary + * 1. The possible DWORD dangle will still be within + * the SOP block + * 2. We cannot wrap except on a block boundary. + */ + } else { + /* QWORD data extends _to_ or beyond the SOP block */ + + /* write 8-byte SOP chunk data */ + while (dest < send) { + writeq(*(u64 *)from, dest); + from += sizeof(u64); + dest += sizeof(u64); + } + /* drop out of the SOP range */ + dest -= SOP_DISTANCE; + dend -= SOP_DISTANCE; + + /* + * If the wrap comes before or matches the data end, + * copy until until the wrap, then wrap. + * + * If the data ends at the end of the SOP above and + * the buffer wraps, then pbuf->end == dend == dest + * and nothing will get written, but we will wrap in + * case there is a dangling DWORD. + */ + if (pbuf->end <= dend) { + while (dest < pbuf->end) { + writeq(*(u64 *)from, dest); + from += sizeof(u64); + dest += sizeof(u64); + } + + dest -= pbuf->size; + dend -= pbuf->size; + } + + /* write 8-byte non-SOP, non-wrap chunk data */ + while (dest < dend) { + writeq(*(u64 *)from, dest); + from += sizeof(u64); + dest += sizeof(u64); + } + } + /* at this point we have wrapped if we are going to wrap */ + + /* write dangling u32, if any */ + if (count & 1) { + union mix val; + + val.val64 = 0; + val.val32[0] = *(u32 *)from; + writeq(val.val64, dest); + dest += sizeof(u64); + } + /* + * fill in rest of block, no need to check pbuf->end + * as we only wrap on a block boundary + */ + while (((unsigned long)dest & PIO_BLOCK_MASK) != 0) { + writeq(0, dest); + dest += sizeof(u64); + } + + /* finished with this buffer */ + this_cpu_dec(*pbuf->sc->buffers_allocated); + preempt_enable(); +} + +/* USE_SHIFTS is faster in user-space tests on a Xeon X5570 @ 2.93GHz */ +#define USE_SHIFTS 1 +#ifdef USE_SHIFTS +/* + * Handle carry bytes using shifts and masks. + * + * NOTE: the value the unused portion of carry is expected to always be zero. + */ + +/* + * "zero" shift - bit shift used to zero out upper bytes. Input is + * the count of LSB bytes to preserve. + */ +#define zshift(x) (8 * (8 - (x))) + +/* + * "merge" shift - bit shift used to merge with carry bytes. Input is + * the LSB byte count to move beyond. + */ +#define mshift(x) (8 * (x)) + +/* + * Read nbytes bytes from "from" and return them in the LSB bytes + * of pbuf->carry. Other bytes are zeroed. Any previous value + * pbuf->carry is lost. + * + * NOTES: + * o do not read from from if nbytes is zero + * o from may _not_ be u64 aligned + * o nbytes must not span a QW boundary + */ +static inline void read_low_bytes(struct pio_buf *pbuf, const void *from, + unsigned int nbytes) +{ + unsigned long off; + + if (nbytes == 0) { + pbuf->carry.val64 = 0; + } else { + /* align our pointer */ + off = (unsigned long)from & 0x7; + from = (void *)((unsigned long)from & ~0x7l); + pbuf->carry.val64 = ((*(u64 *)from) + << zshift(nbytes + off))/* zero upper bytes */ + >> zshift(nbytes); /* place at bottom */ + } + pbuf->carry_bytes = nbytes; +} + +/* + * Read nbytes bytes from "from" and put them at the next significant bytes + * of pbuf->carry. Unused bytes are zeroed. It is expected that the extra + * read does not overfill carry. + * + * NOTES: + * o from may _not_ be u64 aligned + * o nbytes may span a QW boundary + */ +static inline void read_extra_bytes(struct pio_buf *pbuf, + const void *from, unsigned int nbytes) +{ + unsigned long off = (unsigned long)from & 0x7; + unsigned int room, xbytes; + + /* align our pointer */ + from = (void *)((unsigned long)from & ~0x7l); + + /* check count first - don't read anything if count is zero */ + while (nbytes) { + /* find the number of bytes in this u64 */ + room = 8 - off; /* this u64 has room for this many bytes */ + xbytes = min(room, nbytes); + + /* + * shift down to zero lower bytes, shift up to zero upper + * bytes, shift back down to move into place + */ + pbuf->carry.val64 |= (((*(u64 *)from) + >> mshift(off)) + << zshift(xbytes)) + >> zshift(xbytes + pbuf->carry_bytes); + off = 0; + pbuf->carry_bytes += xbytes; + nbytes -= xbytes; + from += sizeof(u64); + } +} + +/* + * Zero extra bytes from the end of pbuf->carry. + * + * NOTES: + * o zbytes <= old_bytes + */ +static inline void zero_extra_bytes(struct pio_buf *pbuf, unsigned int zbytes) +{ + unsigned int remaining; + + if (zbytes == 0) /* nothing to do */ + return; + + remaining = pbuf->carry_bytes - zbytes; /* remaining bytes */ + + /* NOTE: zshift only guaranteed to work if remaining != 0 */ + if (remaining) + pbuf->carry.val64 = (pbuf->carry.val64 << zshift(remaining)) + >> zshift(remaining); + else + pbuf->carry.val64 = 0; + pbuf->carry_bytes = remaining; +} + +/* + * Write a quad word using parts of pbuf->carry and the next 8 bytes of src. + * Put the unused part of the next 8 bytes of src into the LSB bytes of + * pbuf->carry with the upper bytes zeroed.. + * + * NOTES: + * o result must keep unused bytes zeroed + * o src must be u64 aligned + */ +static inline void merge_write8( + struct pio_buf *pbuf, + void __iomem *dest, + const void *src) +{ + u64 new, temp; + + new = *(u64 *)src; + temp = pbuf->carry.val64 | (new << mshift(pbuf->carry_bytes)); + writeq(temp, dest); + pbuf->carry.val64 = new >> zshift(pbuf->carry_bytes); +} + +/* + * Write a quad word using all bytes of carry. + */ +static inline void carry8_write8(union mix carry, void __iomem *dest) +{ + writeq(carry.val64, dest); +} + +/* + * Write a quad word using all the valid bytes of carry. If carry + * has zero valid bytes, nothing is written. + * Returns 0 on nothing written, non-zero on quad word written. + */ +static inline int carry_write8(struct pio_buf *pbuf, void __iomem *dest) +{ + if (pbuf->carry_bytes) { + /* unused bytes are always kept zeroed, so just write */ + writeq(pbuf->carry.val64, dest); + return 1; + } + + return 0; +} + +#else /* USE_SHIFTS */ +/* + * Handle carry bytes using byte copies. + * + * NOTE: the value the unused portion of carry is left uninitialized. + */ + +/* + * Jump copy - no-loop copy for < 8 bytes. + */ +static inline void jcopy(u8 *dest, const u8 *src, u32 n) +{ + switch (n) { + case 7: + *dest++ = *src++; + case 6: + *dest++ = *src++; + case 5: + *dest++ = *src++; + case 4: + *dest++ = *src++; + case 3: + *dest++ = *src++; + case 2: + *dest++ = *src++; + case 1: + *dest++ = *src++; + } +} + +/* + * Read nbytes from "from" and and place them in the low bytes + * of pbuf->carry. Other bytes are left as-is. Any previous + * value in pbuf->carry is lost. + * + * NOTES: + * o do not read from from if nbytes is zero + * o from may _not_ be u64 aligned. + */ +static inline void read_low_bytes(struct pio_buf *pbuf, const void *from, + unsigned int nbytes) +{ + jcopy(&pbuf->carry.val8[0], from, nbytes); + pbuf->carry_bytes = nbytes; +} + +/* + * Read nbytes bytes from "from" and put them at the end of pbuf->carry. + * It is expected that the extra read does not overfill carry. + * + * NOTES: + * o from may _not_ be u64 aligned + * o nbytes may span a QW boundary + */ +static inline void read_extra_bytes(struct pio_buf *pbuf, + const void *from, unsigned int nbytes) +{ + jcopy(&pbuf->carry.val8[pbuf->carry_bytes], from, nbytes); + pbuf->carry_bytes += nbytes; +} + +/* + * Zero extra bytes from the end of pbuf->carry. + * + * We do not care about the value of unused bytes in carry, so just + * reduce the byte count. + * + * NOTES: + * o zbytes <= old_bytes + */ +static inline void zero_extra_bytes(struct pio_buf *pbuf, unsigned int zbytes) +{ + pbuf->carry_bytes -= zbytes; +} + +/* + * Write a quad word using parts of pbuf->carry and the next 8 bytes of src. + * Put the unused part of the next 8 bytes of src into the low bytes of + * pbuf->carry. + */ +static inline void merge_write8( + struct pio_buf *pbuf, + void *dest, + const void *src) +{ + u32 remainder = 8 - pbuf->carry_bytes; + + jcopy(&pbuf->carry.val8[pbuf->carry_bytes], src, remainder); + writeq(pbuf->carry.val64, dest); + jcopy(&pbuf->carry.val8[0], src + remainder, pbuf->carry_bytes); +} + +/* + * Write a quad word using all bytes of carry. + */ +static inline void carry8_write8(union mix carry, void *dest) +{ + writeq(carry.val64, dest); +} + +/* + * Write a quad word using all the valid bytes of carry. If carry + * has zero valid bytes, nothing is written. + * Returns 0 on nothing written, non-zero on quad word written. + */ +static inline int carry_write8(struct pio_buf *pbuf, void *dest) +{ + if (pbuf->carry_bytes) { + u64 zero = 0; + + jcopy(&pbuf->carry.val8[pbuf->carry_bytes], (u8 *)&zero, + 8 - pbuf->carry_bytes); + writeq(pbuf->carry.val64, dest); + return 1; + } + + return 0; +} +#endif /* USE_SHIFTS */ + +/* + * Segmented PIO Copy - start + * + * Start a PIO copy. + * + * @pbuf: destination buffer + * @pbc: the PBC for the PIO buffer + * @from: data source, QWORD aligned + * @nbytes: bytes to copy + */ +void seg_pio_copy_start(struct pio_buf *pbuf, u64 pbc, + const void *from, size_t nbytes) +{ + void __iomem *dest = pbuf->start + SOP_DISTANCE; + void __iomem *send = dest + PIO_BLOCK_SIZE; + void __iomem *dend; /* 8-byte data end */ + + writeq(pbc, dest); + dest += sizeof(u64); + + /* calculate where the QWORD data ends - in SOP=1 space */ + dend = dest + ((nbytes >> 3) * sizeof(u64)); + + if (dend < send) { + /* + * all QWORD data is within the SOP block, does *not* + * reach the end of the SOP block + */ + + while (dest < dend) { + writeq(*(u64 *)from, dest); + from += sizeof(u64); + dest += sizeof(u64); + } + /* + * No boundary checks are needed here: + * 0. We're not on the SOP block boundary + * 1. The possible DWORD dangle will still be within + * the SOP block + * 2. We cannot wrap except on a block boundary. + */ + } else { + /* QWORD data extends _to_ or beyond the SOP block */ + + /* write 8-byte SOP chunk data */ + while (dest < send) { + writeq(*(u64 *)from, dest); + from += sizeof(u64); + dest += sizeof(u64); + } + /* drop out of the SOP range */ + dest -= SOP_DISTANCE; + dend -= SOP_DISTANCE; + + /* + * If the wrap comes before or matches the data end, + * copy until until the wrap, then wrap. + * + * If the data ends at the end of the SOP above and + * the buffer wraps, then pbuf->end == dend == dest + * and nothing will get written, but we will wrap in + * case there is a dangling DWORD. + */ + if (pbuf->end <= dend) { + while (dest < pbuf->end) { + writeq(*(u64 *)from, dest); + from += sizeof(u64); + dest += sizeof(u64); + } + + dest -= pbuf->size; + dend -= pbuf->size; + } + + /* write 8-byte non-SOP, non-wrap chunk data */ + while (dest < dend) { + writeq(*(u64 *)from, dest); + from += sizeof(u64); + dest += sizeof(u64); + } + } + /* at this point we have wrapped if we are going to wrap */ + + /* ...but it doesn't matter as we're done writing */ + + /* save dangling bytes, if any */ + read_low_bytes(pbuf, from, nbytes & 0x7); + + pbuf->qw_written = 1 /*PBC*/ + (nbytes >> 3); +} + +/* + * Mid copy helper, "mixed case" - source is 64-bit aligned but carry + * bytes are non-zero. + * + * Whole u64s must be written to the chip, so bytes must be manually merged. + * + * @pbuf: destination buffer + * @from: data source, is QWORD aligned. + * @nbytes: bytes to copy + * + * Must handle nbytes < 8. + */ +static void mid_copy_mix(struct pio_buf *pbuf, const void *from, size_t nbytes) +{ + void __iomem *dest = pbuf->start + (pbuf->qw_written * sizeof(u64)); + void __iomem *dend; /* 8-byte data end */ + unsigned long qw_to_write = (pbuf->carry_bytes + nbytes) >> 3; + unsigned long bytes_left = (pbuf->carry_bytes + nbytes) & 0x7; + + /* calculate 8-byte data end */ + dend = dest + (qw_to_write * sizeof(u64)); + + if (pbuf->qw_written < PIO_BLOCK_QWS) { + /* + * Still within SOP block. We don't need to check for + * wrap because we are still in the first block and + * can only wrap on block boundaries. + */ + void __iomem *send; /* SOP end */ + void __iomem *xend; + + /* + * calculate the end of data or end of block, whichever + * comes first + */ + send = pbuf->start + PIO_BLOCK_SIZE; + xend = min(send, dend); + + /* shift up to SOP=1 space */ + dest += SOP_DISTANCE; + xend += SOP_DISTANCE; + + /* write 8-byte chunk data */ + while (dest < xend) { + merge_write8(pbuf, dest, from); + from += sizeof(u64); + dest += sizeof(u64); + } + + /* shift down to SOP=0 space */ + dest -= SOP_DISTANCE; + } + /* + * At this point dest could be (either, both, or neither): + * - at dend + * - at the wrap + */ + + /* + * If the wrap comes before or matches the data end, + * copy until until the wrap, then wrap. + * + * If dest is at the wrap, we will fall into the if, + * not do the loop, when wrap. + * + * If the data ends at the end of the SOP above and + * the buffer wraps, then pbuf->end == dend == dest + * and nothing will get written. + */ + if (pbuf->end <= dend) { + while (dest < pbuf->end) { + merge_write8(pbuf, dest, from); + from += sizeof(u64); + dest += sizeof(u64); + } + + dest -= pbuf->size; + dend -= pbuf->size; + } + + /* write 8-byte non-SOP, non-wrap chunk data */ + while (dest < dend) { + merge_write8(pbuf, dest, from); + from += sizeof(u64); + dest += sizeof(u64); + } + + /* adjust carry */ + if (pbuf->carry_bytes < bytes_left) { + /* need to read more */ + read_extra_bytes(pbuf, from, bytes_left - pbuf->carry_bytes); + } else { + /* remove invalid bytes */ + zero_extra_bytes(pbuf, pbuf->carry_bytes - bytes_left); + } + + pbuf->qw_written += qw_to_write; +} + +/* + * Mid copy helper, "straight case" - source pointer is 64-bit aligned + * with no carry bytes. + * + * @pbuf: destination buffer + * @from: data source, is QWORD aligned + * @nbytes: bytes to copy + * + * Must handle nbytes < 8. + */ +static void mid_copy_straight(struct pio_buf *pbuf, + const void *from, size_t nbytes) +{ + void __iomem *dest = pbuf->start + (pbuf->qw_written * sizeof(u64)); + void __iomem *dend; /* 8-byte data end */ + + /* calculate 8-byte data end */ + dend = dest + ((nbytes >> 3) * sizeof(u64)); + + if (pbuf->qw_written < PIO_BLOCK_QWS) { + /* + * Still within SOP block. We don't need to check for + * wrap because we are still in the first block and + * can only wrap on block boundaries. + */ + void __iomem *send; /* SOP end */ + void __iomem *xend; + + /* + * calculate the end of data or end of block, whichever + * comes first + */ + send = pbuf->start + PIO_BLOCK_SIZE; + xend = min(send, dend); + + /* shift up to SOP=1 space */ + dest += SOP_DISTANCE; + xend += SOP_DISTANCE; + + /* write 8-byte chunk data */ + while (dest < xend) { + writeq(*(u64 *)from, dest); + from += sizeof(u64); + dest += sizeof(u64); + } + + /* shift down to SOP=0 space */ + dest -= SOP_DISTANCE; + } + /* + * At this point dest could be (either, both, or neither): + * - at dend + * - at the wrap + */ + + /* + * If the wrap comes before or matches the data end, + * copy until until the wrap, then wrap. + * + * If dest is at the wrap, we will fall into the if, + * not do the loop, when wrap. + * + * If the data ends at the end of the SOP above and + * the buffer wraps, then pbuf->end == dend == dest + * and nothing will get written. + */ + if (pbuf->end <= dend) { + while (dest < pbuf->end) { + writeq(*(u64 *)from, dest); + from += sizeof(u64); + dest += sizeof(u64); + } + + dest -= pbuf->size; + dend -= pbuf->size; + } + + /* write 8-byte non-SOP, non-wrap chunk data */ + while (dest < dend) { + writeq(*(u64 *)from, dest); + from += sizeof(u64); + dest += sizeof(u64); + } + + /* we know carry_bytes was zero on entry to this routine */ + read_low_bytes(pbuf, from, nbytes & 0x7); + + pbuf->qw_written += nbytes >> 3; +} + +/* + * Segmented PIO Copy - middle + * + * Must handle any aligned tail and any aligned source with any byte count. + * + * @pbuf: a number of blocks allocated within a PIO send context + * @from: data source + * @nbytes: number of bytes to copy + */ +void seg_pio_copy_mid(struct pio_buf *pbuf, const void *from, size_t nbytes) +{ + unsigned long from_align = (unsigned long)from & 0x7; + + if (pbuf->carry_bytes + nbytes < 8) { + /* not enough bytes to fill a QW */ + read_extra_bytes(pbuf, from, nbytes); + return; + } + + if (from_align) { + /* misaligned source pointer - align it */ + unsigned long to_align; + + /* bytes to read to align "from" */ + to_align = 8 - from_align; + + /* + * In the advance-to-alignment logic below, we do not need + * to check if we are using more than nbytes. This is because + * if we are here, we already know that carry+nbytes will + * fill at least one QW. + */ + if (pbuf->carry_bytes + to_align < 8) { + /* not enough align bytes to fill a QW */ + read_extra_bytes(pbuf, from, to_align); + from += to_align; + nbytes -= to_align; + } else { + /* bytes to fill carry */ + unsigned long to_fill = 8 - pbuf->carry_bytes; + /* bytes left over to be read */ + unsigned long extra = to_align - to_fill; + void __iomem *dest; + + /* fill carry... */ + read_extra_bytes(pbuf, from, to_fill); + from += to_fill; + nbytes -= to_fill; + + /* ...now write carry */ + dest = pbuf->start + (pbuf->qw_written * sizeof(u64)); + + /* + * The two checks immediately below cannot both be + * true, hence the else. If we have wrapped, we + * cannot still be within the first block. + * Conversely, if we are still in the first block, we + * cannot have wrapped. We do the wrap check first + * as that is more likely. + */ + /* adjust if we've wrapped */ + if (dest >= pbuf->end) + dest -= pbuf->size; + /* jump to SOP range if within the first block */ + else if (pbuf->qw_written < PIO_BLOCK_QWS) + dest += SOP_DISTANCE; + + carry8_write8(pbuf->carry, dest); + pbuf->qw_written++; + + /* read any extra bytes to do final alignment */ + /* this will overwrite anything in pbuf->carry */ + read_low_bytes(pbuf, from, extra); + from += extra; + nbytes -= extra; + } + + /* at this point, from is QW aligned */ + } + + if (pbuf->carry_bytes) + mid_copy_mix(pbuf, from, nbytes); + else + mid_copy_straight(pbuf, from, nbytes); +} + +/* + * Segmented PIO Copy - end + * + * Write any remainder (in pbuf->carry) and finish writing the whole block. + * + * @pbuf: a number of blocks allocated within a PIO send context + */ +void seg_pio_copy_end(struct pio_buf *pbuf) +{ + void __iomem *dest = pbuf->start + (pbuf->qw_written * sizeof(u64)); + + /* + * The two checks immediately below cannot both be true, hence the + * else. If we have wrapped, we cannot still be within the first + * block. Conversely, if we are still in the first block, we + * cannot have wrapped. We do the wrap check first as that is + * more likely. + */ + /* adjust if we have wrapped */ + if (dest >= pbuf->end) + dest -= pbuf->size; + /* jump to the SOP range if within the first block */ + else if (pbuf->qw_written < PIO_BLOCK_QWS) + dest += SOP_DISTANCE; + + /* write final bytes, if any */ + if (carry_write8(pbuf, dest)) { + dest += sizeof(u64); + /* + * NOTE: We do not need to recalculate whether dest needs + * SOP_DISTANCE or not. + * + * If we are in the first block and the dangle write + * keeps us in the same block, dest will need + * to retain SOP_DISTANCE in the loop below. + * + * If we are in the first block and the dangle write pushes + * us to the next block, then loop below will not run + * and dest is not used. Hence we do not need to update + * it. + * + * If we are past the first block, then SOP_DISTANCE + * was never added, so there is nothing to do. + */ + } + + /* fill in rest of block */ + while (((unsigned long)dest & PIO_BLOCK_MASK) != 0) { + writeq(0, dest); + dest += sizeof(u64); + } + + /* finished with this buffer */ + this_cpu_dec(*pbuf->sc->buffers_allocated); + preempt_enable(); +} diff --git a/drivers/infiniband/hw/hfi1/platform.c b/drivers/infiniband/hw/hfi1/platform.c new file mode 100644 index 000000000..03df9322f --- /dev/null +++ b/drivers/infiniband/hw/hfi1/platform.c @@ -0,0 +1,909 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include "hfi.h" +#include "efivar.h" + +void get_platform_config(struct hfi1_devdata *dd) +{ + int ret = 0; + unsigned long size = 0; + u8 *temp_platform_config = NULL; + + ret = read_hfi1_efi_var(dd, "configuration", &size, + (void **)&temp_platform_config); + if (ret) { + dd_dev_info(dd, + "%s: Failed to get platform config from UEFI, falling back to request firmware\n", + __func__); + /* fall back to request firmware */ + platform_config_load = 1; + goto bail; + } + + dd->platform_config.data = temp_platform_config; + dd->platform_config.size = size; + +bail: + /* exit */; +} + +void free_platform_config(struct hfi1_devdata *dd) +{ + if (!platform_config_load) { + /* + * was loaded from EFI, release memory + * allocated by read_efi_var + */ + kfree(dd->platform_config.data); + } + /* + * else do nothing, dispose_firmware will release + * struct firmware platform_config on driver exit + */ +} + +void get_port_type(struct hfi1_pportdata *ppd) +{ + int ret; + + ret = get_platform_config_field(ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_PORT_TYPE, &ppd->port_type, + 4); + if (ret) + ppd->port_type = PORT_TYPE_UNKNOWN; +} + +int set_qsfp_tx(struct hfi1_pportdata *ppd, int on) +{ + u8 tx_ctrl_byte = on ? 0x0 : 0xF; + int ret = 0; + + ret = qsfp_write(ppd, ppd->dd->hfi1_id, QSFP_TX_CTRL_BYTE_OFFS, + &tx_ctrl_byte, 1); + /* we expected 1, so consider 0 an error */ + if (ret == 0) + ret = -EIO; + else if (ret == 1) + ret = 0; + return ret; +} + +static int qual_power(struct hfi1_pportdata *ppd) +{ + u32 cable_power_class = 0, power_class_max = 0; + u8 *cache = ppd->qsfp_info.cache; + int ret = 0; + + ret = get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_SYSTEM_TABLE, 0, + SYSTEM_TABLE_QSFP_POWER_CLASS_MAX, &power_class_max, 4); + if (ret) + return ret; + + cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]); + + if (cable_power_class > power_class_max) + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_POWER_POLICY); + + if (ppd->offline_disabled_reason == + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_POWER_POLICY)) { + dd_dev_info( + ppd->dd, + "%s: Port disabled due to system power restrictions\n", + __func__); + ret = -EPERM; + } + return ret; +} + +static int qual_bitrate(struct hfi1_pportdata *ppd) +{ + u16 lss = ppd->link_speed_supported, lse = ppd->link_speed_enabled; + u8 *cache = ppd->qsfp_info.cache; + + if ((lss & OPA_LINK_SPEED_25G) && (lse & OPA_LINK_SPEED_25G) && + cache[QSFP_NOM_BIT_RATE_250_OFFS] < 0x64) + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_LINKSPEED_POLICY); + + if ((lss & OPA_LINK_SPEED_12_5G) && (lse & OPA_LINK_SPEED_12_5G) && + cache[QSFP_NOM_BIT_RATE_100_OFFS] < 0x7D) + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_LINKSPEED_POLICY); + + if (ppd->offline_disabled_reason == + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_LINKSPEED_POLICY)) { + dd_dev_info( + ppd->dd, + "%s: Cable failed bitrate check, disabling port\n", + __func__); + return -EPERM; + } + return 0; +} + +static int set_qsfp_high_power(struct hfi1_pportdata *ppd) +{ + u8 cable_power_class = 0, power_ctrl_byte = 0; + u8 *cache = ppd->qsfp_info.cache; + int ret; + + cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]); + + if (cable_power_class > QSFP_POWER_CLASS_1) { + power_ctrl_byte = cache[QSFP_PWR_CTRL_BYTE_OFFS]; + + power_ctrl_byte |= 1; + power_ctrl_byte &= ~(0x2); + + ret = qsfp_write(ppd, ppd->dd->hfi1_id, + QSFP_PWR_CTRL_BYTE_OFFS, + &power_ctrl_byte, 1); + if (ret != 1) + return -EIO; + + if (cable_power_class > QSFP_POWER_CLASS_4) { + power_ctrl_byte |= (1 << 2); + ret = qsfp_write(ppd, ppd->dd->hfi1_id, + QSFP_PWR_CTRL_BYTE_OFFS, + &power_ctrl_byte, 1); + if (ret != 1) + return -EIO; + } + + /* SFF 8679 rev 1.7 LPMode Deassert time */ + msleep(300); + } + return 0; +} + +static void apply_rx_cdr(struct hfi1_pportdata *ppd, + u32 rx_preset_index, + u8 *cdr_ctrl_byte) +{ + u32 rx_preset; + u8 *cache = ppd->qsfp_info.cache; + int cable_power_class; + + if (!((cache[QSFP_MOD_PWR_OFFS] & 0x4) && + (cache[QSFP_CDR_INFO_OFFS] & 0x40))) + return; + + /* RX CDR present, bypass supported */ + cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]); + + if (cable_power_class <= QSFP_POWER_CLASS_3) { + /* Power class <= 3, ignore config & turn RX CDR on */ + *cdr_ctrl_byte |= 0xF; + return; + } + + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_RX_PRESET_TABLE, + rx_preset_index, RX_PRESET_TABLE_QSFP_RX_CDR_APPLY, + &rx_preset, 4); + + if (!rx_preset) { + dd_dev_info( + ppd->dd, + "%s: RX_CDR_APPLY is set to disabled\n", + __func__); + return; + } + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_RX_PRESET_TABLE, + rx_preset_index, RX_PRESET_TABLE_QSFP_RX_CDR, + &rx_preset, 4); + + /* Expand cdr setting to all 4 lanes */ + rx_preset = (rx_preset | (rx_preset << 1) | + (rx_preset << 2) | (rx_preset << 3)); + + if (rx_preset) { + *cdr_ctrl_byte |= rx_preset; + } else { + *cdr_ctrl_byte &= rx_preset; + /* Preserve current TX CDR status */ + *cdr_ctrl_byte |= (cache[QSFP_CDR_CTRL_BYTE_OFFS] & 0xF0); + } +} + +static void apply_tx_cdr(struct hfi1_pportdata *ppd, + u32 tx_preset_index, + u8 *cdr_ctrl_byte) +{ + u32 tx_preset; + u8 *cache = ppd->qsfp_info.cache; + int cable_power_class; + + if (!((cache[QSFP_MOD_PWR_OFFS] & 0x8) && + (cache[QSFP_CDR_INFO_OFFS] & 0x80))) + return; + + /* TX CDR present, bypass supported */ + cable_power_class = get_qsfp_power_class(cache[QSFP_MOD_PWR_OFFS]); + + if (cable_power_class <= QSFP_POWER_CLASS_3) { + /* Power class <= 3, ignore config & turn TX CDR on */ + *cdr_ctrl_byte |= 0xF0; + return; + } + + get_platform_config_field( + ppd->dd, + PLATFORM_CONFIG_TX_PRESET_TABLE, tx_preset_index, + TX_PRESET_TABLE_QSFP_TX_CDR_APPLY, &tx_preset, 4); + + if (!tx_preset) { + dd_dev_info( + ppd->dd, + "%s: TX_CDR_APPLY is set to disabled\n", + __func__); + return; + } + get_platform_config_field( + ppd->dd, + PLATFORM_CONFIG_TX_PRESET_TABLE, + tx_preset_index, + TX_PRESET_TABLE_QSFP_TX_CDR, &tx_preset, 4); + + /* Expand cdr setting to all 4 lanes */ + tx_preset = (tx_preset | (tx_preset << 1) | + (tx_preset << 2) | (tx_preset << 3)); + + if (tx_preset) + *cdr_ctrl_byte |= (tx_preset << 4); + else + /* Preserve current/determined RX CDR status */ + *cdr_ctrl_byte &= ((tx_preset << 4) | 0xF); +} + +static void apply_cdr_settings( + struct hfi1_pportdata *ppd, u32 rx_preset_index, + u32 tx_preset_index) +{ + u8 *cache = ppd->qsfp_info.cache; + u8 cdr_ctrl_byte = cache[QSFP_CDR_CTRL_BYTE_OFFS]; + + apply_rx_cdr(ppd, rx_preset_index, &cdr_ctrl_byte); + + apply_tx_cdr(ppd, tx_preset_index, &cdr_ctrl_byte); + + qsfp_write(ppd, ppd->dd->hfi1_id, QSFP_CDR_CTRL_BYTE_OFFS, + &cdr_ctrl_byte, 1); +} + +static void apply_tx_eq_auto(struct hfi1_pportdata *ppd) +{ + u8 *cache = ppd->qsfp_info.cache; + u8 tx_eq; + + if (!(cache[QSFP_EQ_INFO_OFFS] & 0x8)) + return; + /* Disable adaptive TX EQ if present */ + tx_eq = cache[(128 * 3) + 241]; + tx_eq &= 0xF0; + qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 241, &tx_eq, 1); +} + +static void apply_tx_eq_prog(struct hfi1_pportdata *ppd, u32 tx_preset_index) +{ + u8 *cache = ppd->qsfp_info.cache; + u32 tx_preset; + u8 tx_eq; + + if (!(cache[QSFP_EQ_INFO_OFFS] & 0x4)) + return; + + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_TX_PRESET_TABLE, + tx_preset_index, TX_PRESET_TABLE_QSFP_TX_EQ_APPLY, + &tx_preset, 4); + if (!tx_preset) { + dd_dev_info( + ppd->dd, + "%s: TX_EQ_APPLY is set to disabled\n", + __func__); + return; + } + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_TX_PRESET_TABLE, + tx_preset_index, TX_PRESET_TABLE_QSFP_TX_EQ, + &tx_preset, 4); + + if (((cache[(128 * 3) + 224] & 0xF0) >> 4) < tx_preset) { + dd_dev_info( + ppd->dd, + "%s: TX EQ %x unsupported\n", + __func__, tx_preset); + + dd_dev_info( + ppd->dd, + "%s: Applying EQ %x\n", + __func__, cache[608] & 0xF0); + + tx_preset = (cache[608] & 0xF0) >> 4; + } + + tx_eq = tx_preset | (tx_preset << 4); + qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 234, &tx_eq, 1); + qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 235, &tx_eq, 1); +} + +static void apply_rx_eq_emp(struct hfi1_pportdata *ppd, u32 rx_preset_index) +{ + u32 rx_preset; + u8 rx_eq, *cache = ppd->qsfp_info.cache; + + if (!(cache[QSFP_EQ_INFO_OFFS] & 0x2)) + return; + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_RX_PRESET_TABLE, + rx_preset_index, RX_PRESET_TABLE_QSFP_RX_EMP_APPLY, + &rx_preset, 4); + + if (!rx_preset) { + dd_dev_info( + ppd->dd, + "%s: RX_EMP_APPLY is set to disabled\n", + __func__); + return; + } + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_RX_PRESET_TABLE, + rx_preset_index, RX_PRESET_TABLE_QSFP_RX_EMP, + &rx_preset, 4); + + if ((cache[(128 * 3) + 224] & 0xF) < rx_preset) { + dd_dev_info( + ppd->dd, + "%s: Requested RX EMP %x\n", + __func__, rx_preset); + + dd_dev_info( + ppd->dd, + "%s: Applying supported EMP %x\n", + __func__, cache[608] & 0xF); + + rx_preset = cache[608] & 0xF; + } + + rx_eq = rx_preset | (rx_preset << 4); + + qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 236, &rx_eq, 1); + qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 237, &rx_eq, 1); +} + +static void apply_eq_settings(struct hfi1_pportdata *ppd, + u32 rx_preset_index, u32 tx_preset_index) +{ + u8 *cache = ppd->qsfp_info.cache; + + /* no point going on w/o a page 3 */ + if (cache[2] & 4) { + dd_dev_info(ppd->dd, + "%s: Upper page 03 not present\n", + __func__); + return; + } + + apply_tx_eq_auto(ppd); + + apply_tx_eq_prog(ppd, tx_preset_index); + + apply_rx_eq_emp(ppd, rx_preset_index); +} + +static void apply_rx_amplitude_settings( + struct hfi1_pportdata *ppd, u32 rx_preset_index, + u32 tx_preset_index) +{ + u32 rx_preset; + u8 rx_amp = 0, i = 0, preferred = 0, *cache = ppd->qsfp_info.cache; + + /* no point going on w/o a page 3 */ + if (cache[2] & 4) { + dd_dev_info(ppd->dd, + "%s: Upper page 03 not present\n", + __func__); + return; + } + if (!(cache[QSFP_EQ_INFO_OFFS] & 0x1)) { + dd_dev_info(ppd->dd, + "%s: RX_AMP_APPLY is set to disabled\n", + __func__); + return; + } + + get_platform_config_field(ppd->dd, + PLATFORM_CONFIG_RX_PRESET_TABLE, + rx_preset_index, + RX_PRESET_TABLE_QSFP_RX_AMP_APPLY, + &rx_preset, 4); + + if (!rx_preset) { + dd_dev_info(ppd->dd, + "%s: RX_AMP_APPLY is set to disabled\n", + __func__); + return; + } + get_platform_config_field(ppd->dd, + PLATFORM_CONFIG_RX_PRESET_TABLE, + rx_preset_index, + RX_PRESET_TABLE_QSFP_RX_AMP, + &rx_preset, 4); + + dd_dev_info(ppd->dd, + "%s: Requested RX AMP %x\n", + __func__, + rx_preset); + + for (i = 0; i < 4; i++) { + if (cache[(128 * 3) + 225] & (1 << i)) { + preferred = i; + if (preferred == rx_preset) + break; + } + } + + /* + * Verify that preferred RX amplitude is not just a + * fall through of the default + */ + if (!preferred && !(cache[(128 * 3) + 225] & 0x1)) { + dd_dev_info(ppd->dd, "No supported RX AMP, not applying\n"); + return; + } + + dd_dev_info(ppd->dd, + "%s: Applying RX AMP %x\n", __func__, preferred); + + rx_amp = preferred | (preferred << 4); + qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 238, &rx_amp, 1); + qsfp_write(ppd, ppd->dd->hfi1_id, (256 * 3) + 239, &rx_amp, 1); +} + +#define OPA_INVALID_INDEX 0xFFF + +static void apply_tx_lanes(struct hfi1_pportdata *ppd, u8 field_id, + u32 config_data, const char *message) +{ + u8 i; + int ret = HCMD_SUCCESS; + + for (i = 0; i < 4; i++) { + ret = load_8051_config(ppd->dd, field_id, i, config_data); + if (ret != HCMD_SUCCESS) { + dd_dev_err( + ppd->dd, + "%s: %s for lane %u failed\n", + message, __func__, i); + } + } +} + +static void apply_tunings( + struct hfi1_pportdata *ppd, u32 tx_preset_index, + u8 tuning_method, u32 total_atten, u8 limiting_active) +{ + int ret = 0; + u32 config_data = 0, tx_preset = 0; + u8 precur = 0, attn = 0, postcur = 0, external_device_config = 0; + u8 *cache = ppd->qsfp_info.cache; + + /* Enable external device config if channel is limiting active */ + read_8051_config(ppd->dd, LINK_OPTIMIZATION_SETTINGS, + GENERAL_CONFIG, &config_data); + config_data &= ~(0xff << ENABLE_EXT_DEV_CONFIG_SHIFT); + config_data |= ((u32)limiting_active << ENABLE_EXT_DEV_CONFIG_SHIFT); + ret = load_8051_config(ppd->dd, LINK_OPTIMIZATION_SETTINGS, + GENERAL_CONFIG, config_data); + if (ret != HCMD_SUCCESS) + dd_dev_err( + ppd->dd, + "%s: Failed to set enable external device config\n", + __func__); + + config_data = 0; /* re-init */ + /* Pass tuning method to 8051 */ + read_8051_config(ppd->dd, LINK_TUNING_PARAMETERS, GENERAL_CONFIG, + &config_data); + config_data &= ~(0xff << TUNING_METHOD_SHIFT); + config_data |= ((u32)tuning_method << TUNING_METHOD_SHIFT); + ret = load_8051_config(ppd->dd, LINK_TUNING_PARAMETERS, GENERAL_CONFIG, + config_data); + if (ret != HCMD_SUCCESS) + dd_dev_err(ppd->dd, "%s: Failed to set tuning method\n", + __func__); + + /* Set same channel loss for both TX and RX */ + config_data = 0 | (total_atten << 16) | (total_atten << 24); + apply_tx_lanes(ppd, CHANNEL_LOSS_SETTINGS, config_data, + "Setting channel loss"); + + /* Inform 8051 of cable capabilities */ + if (ppd->qsfp_info.cache_valid) { + external_device_config = + ((cache[QSFP_MOD_PWR_OFFS] & 0x4) << 3) | + ((cache[QSFP_MOD_PWR_OFFS] & 0x8) << 2) | + ((cache[QSFP_EQ_INFO_OFFS] & 0x2) << 1) | + (cache[QSFP_EQ_INFO_OFFS] & 0x4); + ret = read_8051_config(ppd->dd, DC_HOST_COMM_SETTINGS, + GENERAL_CONFIG, &config_data); + /* Clear, then set the external device config field */ + config_data &= ~(u32)0xFF; + config_data |= external_device_config; + ret = load_8051_config(ppd->dd, DC_HOST_COMM_SETTINGS, + GENERAL_CONFIG, config_data); + if (ret != HCMD_SUCCESS) + dd_dev_info(ppd->dd, + "%s: Failed set ext device config params\n", + __func__); + } + + if (tx_preset_index == OPA_INVALID_INDEX) { + if (ppd->port_type == PORT_TYPE_QSFP && limiting_active) + dd_dev_info(ppd->dd, "%s: Invalid Tx preset index\n", + __func__); + return; + } + + /* Following for limiting active channels only */ + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_TX_PRESET_TABLE, tx_preset_index, + TX_PRESET_TABLE_PRECUR, &tx_preset, 4); + precur = tx_preset; + + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_TX_PRESET_TABLE, + tx_preset_index, TX_PRESET_TABLE_ATTN, &tx_preset, 4); + attn = tx_preset; + + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_TX_PRESET_TABLE, + tx_preset_index, TX_PRESET_TABLE_POSTCUR, &tx_preset, 4); + postcur = tx_preset; + + config_data = precur | (attn << 8) | (postcur << 16); + + apply_tx_lanes(ppd, TX_EQ_SETTINGS, config_data, + "Applying TX settings"); +} + +/* Must be holding the QSFP i2c resource */ +static int tune_active_qsfp(struct hfi1_pportdata *ppd, u32 *ptr_tx_preset, + u32 *ptr_rx_preset, u32 *ptr_total_atten) +{ + int ret; + u16 lss = ppd->link_speed_supported, lse = ppd->link_speed_enabled; + u8 *cache = ppd->qsfp_info.cache; + + ppd->qsfp_info.limiting_active = 1; + + ret = set_qsfp_tx(ppd, 0); + if (ret) + return ret; + + ret = qual_power(ppd); + if (ret) + return ret; + + ret = qual_bitrate(ppd); + if (ret) + return ret; + + if (ppd->qsfp_info.reset_needed) { + reset_qsfp(ppd); + ppd->qsfp_info.reset_needed = 0; + refresh_qsfp_cache(ppd, &ppd->qsfp_info); + } else { + ppd->qsfp_info.reset_needed = 1; + } + + ret = set_qsfp_high_power(ppd); + if (ret) + return ret; + + if (cache[QSFP_EQ_INFO_OFFS] & 0x4) { + ret = get_platform_config_field( + ppd->dd, + PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_TX_PRESET_IDX_ACTIVE_EQ, + ptr_tx_preset, 4); + if (ret) { + *ptr_tx_preset = OPA_INVALID_INDEX; + return ret; + } + } else { + ret = get_platform_config_field( + ppd->dd, + PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_TX_PRESET_IDX_ACTIVE_NO_EQ, + ptr_tx_preset, 4); + if (ret) { + *ptr_tx_preset = OPA_INVALID_INDEX; + return ret; + } + } + + ret = get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_RX_PRESET_IDX, ptr_rx_preset, 4); + if (ret) { + *ptr_rx_preset = OPA_INVALID_INDEX; + return ret; + } + + if ((lss & OPA_LINK_SPEED_25G) && (lse & OPA_LINK_SPEED_25G)) + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_LOCAL_ATTEN_25G, ptr_total_atten, 4); + else if ((lss & OPA_LINK_SPEED_12_5G) && (lse & OPA_LINK_SPEED_12_5G)) + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_LOCAL_ATTEN_12G, ptr_total_atten, 4); + + apply_cdr_settings(ppd, *ptr_rx_preset, *ptr_tx_preset); + + apply_eq_settings(ppd, *ptr_rx_preset, *ptr_tx_preset); + + apply_rx_amplitude_settings(ppd, *ptr_rx_preset, *ptr_tx_preset); + + ret = set_qsfp_tx(ppd, 1); + + return ret; +} + +static int tune_qsfp(struct hfi1_pportdata *ppd, + u32 *ptr_tx_preset, u32 *ptr_rx_preset, + u8 *ptr_tuning_method, u32 *ptr_total_atten) +{ + u32 cable_atten = 0, remote_atten = 0, platform_atten = 0; + u16 lss = ppd->link_speed_supported, lse = ppd->link_speed_enabled; + int ret = 0; + u8 *cache = ppd->qsfp_info.cache; + + switch ((cache[QSFP_MOD_TECH_OFFS] & 0xF0) >> 4) { + case 0xA ... 0xB: + ret = get_platform_config_field( + ppd->dd, + PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_LOCAL_ATTEN_25G, + &platform_atten, 4); + if (ret) + return ret; + + if ((lss & OPA_LINK_SPEED_25G) && (lse & OPA_LINK_SPEED_25G)) + cable_atten = cache[QSFP_CU_ATTEN_12G_OFFS]; + else if ((lss & OPA_LINK_SPEED_12_5G) && + (lse & OPA_LINK_SPEED_12_5G)) + cable_atten = cache[QSFP_CU_ATTEN_7G_OFFS]; + + /* Fallback to configured attenuation if cable memory is bad */ + if (cable_atten == 0 || cable_atten > 36) { + ret = get_platform_config_field( + ppd->dd, + PLATFORM_CONFIG_SYSTEM_TABLE, 0, + SYSTEM_TABLE_QSFP_ATTENUATION_DEFAULT_25G, + &cable_atten, 4); + if (ret) + return ret; + } + + ret = get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_REMOTE_ATTEN_25G, &remote_atten, 4); + if (ret) + return ret; + + *ptr_total_atten = platform_atten + cable_atten + remote_atten; + + *ptr_tuning_method = OPA_PASSIVE_TUNING; + break; + case 0x0 ... 0x9: /* fallthrough */ + case 0xC: /* fallthrough */ + case 0xE: + ret = tune_active_qsfp(ppd, ptr_tx_preset, ptr_rx_preset, + ptr_total_atten); + if (ret) + return ret; + + *ptr_tuning_method = OPA_ACTIVE_TUNING; + break; + case 0xD: /* fallthrough */ + case 0xF: + default: + dd_dev_info(ppd->dd, "%s: Unknown/unsupported cable\n", + __func__); + break; + } + return ret; +} + +/* + * This function communicates its success or failure via ppd->driver_link_ready + * Thus, it depends on its association with start_link(...) which checks + * driver_link_ready before proceeding with the link negotiation and + * initialization process. + */ +void tune_serdes(struct hfi1_pportdata *ppd) +{ + int ret = 0; + u32 total_atten = 0; + u32 remote_atten = 0, platform_atten = 0; + u32 rx_preset_index, tx_preset_index; + u8 tuning_method = 0, limiting_active = 0; + struct hfi1_devdata *dd = ppd->dd; + + rx_preset_index = OPA_INVALID_INDEX; + tx_preset_index = OPA_INVALID_INDEX; + + /* the link defaults to enabled */ + ppd->link_enabled = 1; + /* the driver link ready state defaults to not ready */ + ppd->driver_link_ready = 0; + ppd->offline_disabled_reason = HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE); + + /* Skip the tuning for testing (loopback != none) and simulations */ + if (loopback != LOOPBACK_NONE || + ppd->dd->icode == ICODE_FUNCTIONAL_SIMULATOR) { + ppd->driver_link_ready = 1; + return; + } + + switch (ppd->port_type) { + case PORT_TYPE_DISCONNECTED: + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_DISCONNECTED); + dd_dev_info(dd, "%s: Port disconnected, disabling port\n", + __func__); + goto bail; + case PORT_TYPE_FIXED: + /* platform_atten, remote_atten pre-zeroed to catch error */ + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_LOCAL_ATTEN_25G, &platform_atten, 4); + + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_REMOTE_ATTEN_25G, &remote_atten, 4); + + total_atten = platform_atten + remote_atten; + + tuning_method = OPA_PASSIVE_TUNING; + break; + case PORT_TYPE_VARIABLE: + if (qsfp_mod_present(ppd)) { + /* + * platform_atten, remote_atten pre-zeroed to + * catch error + */ + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_LOCAL_ATTEN_25G, + &platform_atten, 4); + + get_platform_config_field( + ppd->dd, PLATFORM_CONFIG_PORT_TABLE, 0, + PORT_TABLE_REMOTE_ATTEN_25G, + &remote_atten, 4); + + total_atten = platform_atten + remote_atten; + + tuning_method = OPA_PASSIVE_TUNING; + } else { + ppd->offline_disabled_reason = + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_CHASSIS_CONFIG); + goto bail; + } + break; + case PORT_TYPE_QSFP: + if (qsfp_mod_present(ppd)) { + ret = acquire_chip_resource(ppd->dd, + qsfp_resource(ppd->dd), + QSFP_WAIT); + if (ret) { + dd_dev_err(ppd->dd, "%s: hfi%d: cannot lock i2c chain\n", + __func__, (int)ppd->dd->hfi1_id); + goto bail; + } + refresh_qsfp_cache(ppd, &ppd->qsfp_info); + + if (ppd->qsfp_info.cache_valid) { + ret = tune_qsfp(ppd, + &tx_preset_index, + &rx_preset_index, + &tuning_method, + &total_atten); + + /* + * We may have modified the QSFP memory, so + * update the cache to reflect the changes + */ + refresh_qsfp_cache(ppd, &ppd->qsfp_info); + limiting_active = + ppd->qsfp_info.limiting_active; + } else { + dd_dev_err(dd, + "%s: Reading QSFP memory failed\n", + __func__); + ret = -EINVAL; /* a fail indication */ + } + release_chip_resource(ppd->dd, qsfp_resource(ppd->dd)); + if (ret) + goto bail; + } else { + ppd->offline_disabled_reason = + HFI1_ODR_MASK( + OPA_LINKDOWN_REASON_LOCAL_MEDIA_NOT_INSTALLED); + goto bail; + } + break; + default: + dd_dev_info(ppd->dd, "%s: Unknown port type\n", __func__); + ppd->port_type = PORT_TYPE_UNKNOWN; + tuning_method = OPA_UNKNOWN_TUNING; + total_atten = 0; + limiting_active = 0; + tx_preset_index = OPA_INVALID_INDEX; + break; + } + + if (ppd->offline_disabled_reason == + HFI1_ODR_MASK(OPA_LINKDOWN_REASON_NONE)) + apply_tunings(ppd, tx_preset_index, tuning_method, + total_atten, limiting_active); + + if (!ret) + ppd->driver_link_ready = 1; + + return; +bail: + ppd->driver_link_ready = 0; +} diff --git a/drivers/infiniband/hw/hfi1/platform.h b/drivers/infiniband/hw/hfi1/platform.h new file mode 100644 index 000000000..e2c21613c --- /dev/null +++ b/drivers/infiniband/hw/hfi1/platform.h @@ -0,0 +1,305 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#ifndef __PLATFORM_H +#define __PLATFORM_H + +#define METADATA_TABLE_FIELD_START_SHIFT 0 +#define METADATA_TABLE_FIELD_START_LEN_BITS 15 +#define METADATA_TABLE_FIELD_LEN_SHIFT 16 +#define METADATA_TABLE_FIELD_LEN_LEN_BITS 16 + +/* Header structure */ +#define PLATFORM_CONFIG_HEADER_RECORD_IDX_SHIFT 0 +#define PLATFORM_CONFIG_HEADER_RECORD_IDX_LEN_BITS 6 +#define PLATFORM_CONFIG_HEADER_TABLE_LENGTH_SHIFT 16 +#define PLATFORM_CONFIG_HEADER_TABLE_LENGTH_LEN_BITS 12 +#define PLATFORM_CONFIG_HEADER_TABLE_TYPE_SHIFT 28 +#define PLATFORM_CONFIG_HEADER_TABLE_TYPE_LEN_BITS 4 + +enum platform_config_table_type_encoding { + PLATFORM_CONFIG_TABLE_RESERVED, + PLATFORM_CONFIG_SYSTEM_TABLE, + PLATFORM_CONFIG_PORT_TABLE, + PLATFORM_CONFIG_RX_PRESET_TABLE, + PLATFORM_CONFIG_TX_PRESET_TABLE, + PLATFORM_CONFIG_QSFP_ATTEN_TABLE, + PLATFORM_CONFIG_VARIABLE_SETTINGS_TABLE, + PLATFORM_CONFIG_TABLE_MAX +}; + +enum platform_config_system_table_fields { + SYSTEM_TABLE_RESERVED, + SYSTEM_TABLE_NODE_STRING, + SYSTEM_TABLE_SYSTEM_IMAGE_GUID, + SYSTEM_TABLE_NODE_GUID, + SYSTEM_TABLE_REVISION, + SYSTEM_TABLE_VENDOR_OUI, + SYSTEM_TABLE_META_VERSION, + SYSTEM_TABLE_DEVICE_ID, + SYSTEM_TABLE_PARTITION_ENFORCEMENT_CAP, + SYSTEM_TABLE_QSFP_POWER_CLASS_MAX, + SYSTEM_TABLE_QSFP_ATTENUATION_DEFAULT_12G, + SYSTEM_TABLE_QSFP_ATTENUATION_DEFAULT_25G, + SYSTEM_TABLE_VARIABLE_TABLE_ENTRIES_PER_PORT, + SYSTEM_TABLE_MAX +}; + +enum platform_config_port_table_fields { + PORT_TABLE_RESERVED, + PORT_TABLE_PORT_TYPE, + PORT_TABLE_LOCAL_ATTEN_12G, + PORT_TABLE_LOCAL_ATTEN_25G, + PORT_TABLE_LINK_SPEED_SUPPORTED, + PORT_TABLE_LINK_WIDTH_SUPPORTED, + PORT_TABLE_AUTO_LANE_SHEDDING_ENABLED, + PORT_TABLE_EXTERNAL_LOOPBACK_ALLOWED, + PORT_TABLE_VL_CAP, + PORT_TABLE_MTU_CAP, + PORT_TABLE_TX_LANE_ENABLE_MASK, + PORT_TABLE_LOCAL_MAX_TIMEOUT, + PORT_TABLE_REMOTE_ATTEN_12G, + PORT_TABLE_REMOTE_ATTEN_25G, + PORT_TABLE_TX_PRESET_IDX_ACTIVE_NO_EQ, + PORT_TABLE_TX_PRESET_IDX_ACTIVE_EQ, + PORT_TABLE_RX_PRESET_IDX, + PORT_TABLE_CABLE_REACH_CLASS, + PORT_TABLE_MAX +}; + +enum platform_config_rx_preset_table_fields { + RX_PRESET_TABLE_RESERVED, + RX_PRESET_TABLE_QSFP_RX_CDR_APPLY, + RX_PRESET_TABLE_QSFP_RX_EMP_APPLY, + RX_PRESET_TABLE_QSFP_RX_AMP_APPLY, + RX_PRESET_TABLE_QSFP_RX_CDR, + RX_PRESET_TABLE_QSFP_RX_EMP, + RX_PRESET_TABLE_QSFP_RX_AMP, + RX_PRESET_TABLE_MAX +}; + +enum platform_config_tx_preset_table_fields { + TX_PRESET_TABLE_RESERVED, + TX_PRESET_TABLE_PRECUR, + TX_PRESET_TABLE_ATTN, + TX_PRESET_TABLE_POSTCUR, + TX_PRESET_TABLE_QSFP_TX_CDR_APPLY, + TX_PRESET_TABLE_QSFP_TX_EQ_APPLY, + TX_PRESET_TABLE_QSFP_TX_CDR, + TX_PRESET_TABLE_QSFP_TX_EQ, + TX_PRESET_TABLE_MAX +}; + +enum platform_config_qsfp_attn_table_fields { + QSFP_ATTEN_TABLE_RESERVED, + QSFP_ATTEN_TABLE_TX_PRESET_IDX, + QSFP_ATTEN_TABLE_RX_PRESET_IDX, + QSFP_ATTEN_TABLE_MAX +}; + +enum platform_config_variable_settings_table_fields { + VARIABLE_SETTINGS_TABLE_RESERVED, + VARIABLE_SETTINGS_TABLE_TX_PRESET_IDX, + VARIABLE_SETTINGS_TABLE_RX_PRESET_IDX, + VARIABLE_SETTINGS_TABLE_MAX +}; + +struct platform_config { + size_t size; + const u8 *data; +}; + +struct platform_config_data { + u32 *table; + u32 *table_metadata; + u32 num_table; +}; + +/* + * This struct acts as a quick reference into the platform_data binary image + * and is populated by parse_platform_config(...) depending on the specific + * META_VERSION + */ +struct platform_config_cache { + u8 cache_valid; + struct platform_config_data config_tables[PLATFORM_CONFIG_TABLE_MAX]; +}; + +static const u32 platform_config_table_limits[PLATFORM_CONFIG_TABLE_MAX] = { + 0, + SYSTEM_TABLE_MAX, + PORT_TABLE_MAX, + RX_PRESET_TABLE_MAX, + TX_PRESET_TABLE_MAX, + QSFP_ATTEN_TABLE_MAX, + VARIABLE_SETTINGS_TABLE_MAX +}; + +/* This section defines default values and encodings for the + * fields defined for each table above + */ + +/* + * ===================================================== + * System table encodings + * ===================================================== + */ +#define PLATFORM_CONFIG_MAGIC_NUM 0x3d4f5041 +#define PLATFORM_CONFIG_MAGIC_NUMBER_LEN 4 + +/* + * These power classes are the same as defined in SFF 8636 spec rev 2.4 + * describing byte 129 in table 6-16, except enumerated in a different order + */ +enum platform_config_qsfp_power_class_encoding { + QSFP_POWER_CLASS_1 = 1, + QSFP_POWER_CLASS_2, + QSFP_POWER_CLASS_3, + QSFP_POWER_CLASS_4, + QSFP_POWER_CLASS_5, + QSFP_POWER_CLASS_6, + QSFP_POWER_CLASS_7 +}; + +/* + * ==================================================== + * Port table encodings + * ==================================================== + */ +enum platform_config_port_type_encoding { + PORT_TYPE_UNKNOWN, + PORT_TYPE_DISCONNECTED, + PORT_TYPE_FIXED, + PORT_TYPE_VARIABLE, + PORT_TYPE_QSFP, + PORT_TYPE_MAX +}; + +enum platform_config_link_speed_supported_encoding { + LINK_SPEED_SUPP_12G = 1, + LINK_SPEED_SUPP_25G, + LINK_SPEED_SUPP_12G_25G, + LINK_SPEED_SUPP_MAX +}; + +/* + * This is a subset (not strict) of the link downgrades + * supported. The link downgrades supported are expected + * to be supplied to the driver by another entity such as + * the fabric manager + */ +enum platform_config_link_width_supported_encoding { + LINK_WIDTH_SUPP_1X = 1, + LINK_WIDTH_SUPP_2X, + LINK_WIDTH_SUPP_2X_1X, + LINK_WIDTH_SUPP_3X, + LINK_WIDTH_SUPP_3X_1X, + LINK_WIDTH_SUPP_3X_2X, + LINK_WIDTH_SUPP_3X_2X_1X, + LINK_WIDTH_SUPP_4X, + LINK_WIDTH_SUPP_4X_1X, + LINK_WIDTH_SUPP_4X_2X, + LINK_WIDTH_SUPP_4X_2X_1X, + LINK_WIDTH_SUPP_4X_3X, + LINK_WIDTH_SUPP_4X_3X_1X, + LINK_WIDTH_SUPP_4X_3X_2X, + LINK_WIDTH_SUPP_4X_3X_2X_1X, + LINK_WIDTH_SUPP_MAX +}; + +enum platform_config_virtual_lane_capability_encoding { + VL_CAP_VL0 = 1, + VL_CAP_VL0_1, + VL_CAP_VL0_2, + VL_CAP_VL0_3, + VL_CAP_VL0_4, + VL_CAP_VL0_5, + VL_CAP_VL0_6, + VL_CAP_VL0_7, + VL_CAP_VL0_8, + VL_CAP_VL0_9, + VL_CAP_VL0_10, + VL_CAP_VL0_11, + VL_CAP_VL0_12, + VL_CAP_VL0_13, + VL_CAP_VL0_14, + VL_CAP_MAX +}; + +/* Max MTU */ +enum platform_config_mtu_capability_encoding { + MTU_CAP_256 = 1, + MTU_CAP_512 = 2, + MTU_CAP_1024 = 3, + MTU_CAP_2048 = 4, + MTU_CAP_4096 = 5, + MTU_CAP_8192 = 6, + MTU_CAP_10240 = 7 +}; + +enum platform_config_local_max_timeout_encoding { + LOCAL_MAX_TIMEOUT_10_MS = 1, + LOCAL_MAX_TIMEOUT_100_MS, + LOCAL_MAX_TIMEOUT_1_S, + LOCAL_MAX_TIMEOUT_10_S, + LOCAL_MAX_TIMEOUT_100_S, + LOCAL_MAX_TIMEOUT_1000_S +}; + +enum link_tuning_encoding { + OPA_PASSIVE_TUNING, + OPA_ACTIVE_TUNING, + OPA_UNKNOWN_TUNING +}; + +/* platform.c */ +void get_platform_config(struct hfi1_devdata *dd); +void free_platform_config(struct hfi1_devdata *dd); +void get_port_type(struct hfi1_pportdata *ppd); +int set_qsfp_tx(struct hfi1_pportdata *ppd, int on); +void tune_serdes(struct hfi1_pportdata *ppd); + +#endif /*__PLATFORM_H*/ diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c new file mode 100644 index 000000000..1a942ffba --- /dev/null +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -0,0 +1,974 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <linux/err.h> +#include <linux/vmalloc.h> +#include <linux/hash.h> +#include <linux/module.h> +#include <linux/seq_file.h> +#include <rdma/rdma_vt.h> +#include <rdma/rdmavt_qp.h> + +#include "hfi.h" +#include "qp.h" +#include "trace.h" +#include "verbs_txreq.h" + +unsigned int hfi1_qp_table_size = 256; +module_param_named(qp_table_size, hfi1_qp_table_size, uint, S_IRUGO); +MODULE_PARM_DESC(qp_table_size, "QP table size"); + +static void flush_tx_list(struct rvt_qp *qp); +static int iowait_sleep( + struct sdma_engine *sde, + struct iowait *wait, + struct sdma_txreq *stx, + unsigned seq); +static void iowait_wakeup(struct iowait *wait, int reason); +static void iowait_sdma_drained(struct iowait *wait); +static void qp_pio_drain(struct rvt_qp *qp); + +static inline unsigned mk_qpn(struct rvt_qpn_table *qpt, + struct rvt_qpn_map *map, unsigned off) +{ + return (map - qpt->map) * RVT_BITS_PER_PAGE + off; +} + +/* + * Convert the AETH credit code into the number of credits. + */ +static const u16 credit_table[31] = { + 0, /* 0 */ + 1, /* 1 */ + 2, /* 2 */ + 3, /* 3 */ + 4, /* 4 */ + 6, /* 5 */ + 8, /* 6 */ + 12, /* 7 */ + 16, /* 8 */ + 24, /* 9 */ + 32, /* A */ + 48, /* B */ + 64, /* C */ + 96, /* D */ + 128, /* E */ + 192, /* F */ + 256, /* 10 */ + 384, /* 11 */ + 512, /* 12 */ + 768, /* 13 */ + 1024, /* 14 */ + 1536, /* 15 */ + 2048, /* 16 */ + 3072, /* 17 */ + 4096, /* 18 */ + 6144, /* 19 */ + 8192, /* 1A */ + 12288, /* 1B */ + 16384, /* 1C */ + 24576, /* 1D */ + 32768 /* 1E */ +}; + +static void flush_tx_list(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + while (!list_empty(&priv->s_iowait.tx_head)) { + struct sdma_txreq *tx; + + tx = list_first_entry( + &priv->s_iowait.tx_head, + struct sdma_txreq, + list); + list_del_init(&tx->list); + hfi1_put_txreq( + container_of(tx, struct verbs_txreq, txreq)); + } +} + +static void flush_iowait(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); + unsigned long flags; + + write_seqlock_irqsave(&dev->iowait_lock, flags); + if (!list_empty(&priv->s_iowait.list)) { + list_del_init(&priv->s_iowait.list); + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + } + write_sequnlock_irqrestore(&dev->iowait_lock, flags); +} + +static inline int opa_mtu_enum_to_int(int mtu) +{ + switch (mtu) { + case OPA_MTU_8192: return 8192; + case OPA_MTU_10240: return 10240; + default: return -1; + } +} + +/** + * This function is what we would push to the core layer if we wanted to be a + * "first class citizen". Instead we hide this here and rely on Verbs ULPs + * to blindly pass the MTU enum value from the PathRecord to us. + */ +static inline int verbs_mtu_enum_to_int(struct ib_device *dev, enum ib_mtu mtu) +{ + int val; + + /* Constraining 10KB packets to 8KB packets */ + if (mtu == (enum ib_mtu)OPA_MTU_10240) + mtu = OPA_MTU_8192; + val = opa_mtu_enum_to_int((int)mtu); + if (val > 0) + return val; + return ib_mtu_enum_to_int(mtu); +} + +int hfi1_check_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct ib_qp *ibqp = &qp->ibqp; + struct hfi1_ibdev *dev = to_idev(ibqp->device); + struct hfi1_devdata *dd = dd_from_dev(dev); + u8 sc; + + if (attr_mask & IB_QP_AV) { + sc = ah_to_sc(ibqp->device, &attr->ah_attr); + if (sc == 0xf) + return -EINVAL; + + if (!qp_to_sdma_engine(qp, sc) && + dd->flags & HFI1_HAS_SEND_DMA) + return -EINVAL; + + if (!qp_to_send_context(qp, sc)) + return -EINVAL; + } + + if (attr_mask & IB_QP_ALT_PATH) { + sc = ah_to_sc(ibqp->device, &attr->alt_ah_attr); + if (sc == 0xf) + return -EINVAL; + + if (!qp_to_sdma_engine(qp, sc) && + dd->flags & HFI1_HAS_SEND_DMA) + return -EINVAL; + + if (!qp_to_send_context(qp, sc)) + return -EINVAL; + } + + return 0; +} + +void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct ib_qp *ibqp = &qp->ibqp; + struct hfi1_qp_priv *priv = qp->priv; + + if (attr_mask & IB_QP_AV) { + priv->s_sc = ah_to_sc(ibqp->device, &qp->remote_ah_attr); + priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc); + priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc); + } + + if (attr_mask & IB_QP_PATH_MIG_STATE && + attr->path_mig_state == IB_MIG_MIGRATED && + qp->s_mig_state == IB_MIG_ARMED) { + qp->s_flags |= RVT_S_AHG_CLEAR; + priv->s_sc = ah_to_sc(ibqp->device, &qp->remote_ah_attr); + priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc); + priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc); + } +} + +/** + * hfi1_check_send_wqe - validate wqe + * @qp - The qp + * @wqe - The built wqe + * + * validate wqe. This is called + * prior to inserting the wqe into + * the ring but after the wqe has been + * setup. + * + * Returns 0 on success, -EINVAL on failure + * + */ +int hfi1_check_send_wqe(struct rvt_qp *qp, + struct rvt_swqe *wqe) +{ + struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + struct rvt_ah *ah; + + switch (qp->ibqp.qp_type) { + case IB_QPT_RC: + case IB_QPT_UC: + if (wqe->length > 0x80000000U) + return -EINVAL; + break; + case IB_QPT_SMI: + ah = ibah_to_rvtah(wqe->ud_wr.ah); + if (wqe->length > (1 << ah->log_pmtu)) + return -EINVAL; + break; + case IB_QPT_GSI: + case IB_QPT_UD: + ah = ibah_to_rvtah(wqe->ud_wr.ah); + if (wqe->length > (1 << ah->log_pmtu)) + return -EINVAL; + if (ibp->sl_to_sc[ah->attr.sl] == 0xf) + return -EINVAL; + default: + break; + } + return wqe->length <= piothreshold; +} + +/** + * hfi1_compute_aeth - compute the AETH (syndrome + MSN) + * @qp: the queue pair to compute the AETH for + * + * Returns the AETH. + */ +__be32 hfi1_compute_aeth(struct rvt_qp *qp) +{ + u32 aeth = qp->r_msn & HFI1_MSN_MASK; + + if (qp->ibqp.srq) { + /* + * Shared receive queues don't generate credits. + * Set the credit field to the invalid value. + */ + aeth |= HFI1_AETH_CREDIT_INVAL << HFI1_AETH_CREDIT_SHIFT; + } else { + u32 min, max, x; + u32 credits; + struct rvt_rwq *wq = qp->r_rq.wq; + u32 head; + u32 tail; + + /* sanity check pointers before trusting them */ + head = wq->head; + if (head >= qp->r_rq.size) + head = 0; + tail = wq->tail; + if (tail >= qp->r_rq.size) + tail = 0; + /* + * Compute the number of credits available (RWQEs). + * There is a small chance that the pair of reads are + * not atomic, which is OK, since the fuzziness is + * resolved as further ACKs go out. + */ + credits = head - tail; + if ((int)credits < 0) + credits += qp->r_rq.size; + /* + * Binary search the credit table to find the code to + * use. + */ + min = 0; + max = 31; + for (;;) { + x = (min + max) / 2; + if (credit_table[x] == credits) + break; + if (credit_table[x] > credits) { + max = x; + } else { + if (min == x) + break; + min = x; + } + } + aeth |= x << HFI1_AETH_CREDIT_SHIFT; + } + return cpu_to_be32(aeth); +} + +/** + * _hfi1_schedule_send - schedule progress + * @qp: the QP + * + * This schedules qp progress w/o regard to the s_flags. + * + * It is only used in the post send, which doesn't hold + * the s_lock. + */ +void _hfi1_schedule_send(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_ibport *ibp = + to_iport(qp->ibqp.device, qp->port_num); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); + + iowait_schedule(&priv->s_iowait, ppd->hfi1_wq, + priv->s_sde ? + priv->s_sde->cpu : + cpumask_first(cpumask_of_node(dd->node))); +} + +static void qp_pio_drain(struct rvt_qp *qp) +{ + struct hfi1_ibdev *dev; + struct hfi1_qp_priv *priv = qp->priv; + + if (!priv->s_sendcontext) + return; + dev = to_idev(qp->ibqp.device); + while (iowait_pio_pending(&priv->s_iowait)) { + write_seqlock_irq(&dev->iowait_lock); + hfi1_sc_wantpiobuf_intr(priv->s_sendcontext, 1); + write_sequnlock_irq(&dev->iowait_lock); + iowait_pio_drain(&priv->s_iowait); + write_seqlock_irq(&dev->iowait_lock); + hfi1_sc_wantpiobuf_intr(priv->s_sendcontext, 0); + write_sequnlock_irq(&dev->iowait_lock); + } +} + +/** + * hfi1_schedule_send - schedule progress + * @qp: the QP + * + * This schedules qp progress and caller should hold + * the s_lock. + */ +void hfi1_schedule_send(struct rvt_qp *qp) +{ + if (hfi1_send_ok(qp)) + _hfi1_schedule_send(qp); +} + +/** + * hfi1_get_credit - flush the send work queue of a QP + * @qp: the qp who's send work queue to flush + * @aeth: the Acknowledge Extended Transport Header + * + * The QP s_lock should be held. + */ +void hfi1_get_credit(struct rvt_qp *qp, u32 aeth) +{ + u32 credit = (aeth >> HFI1_AETH_CREDIT_SHIFT) & HFI1_AETH_CREDIT_MASK; + + /* + * If the credit is invalid, we can send + * as many packets as we like. Otherwise, we have to + * honor the credit field. + */ + if (credit == HFI1_AETH_CREDIT_INVAL) { + if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) { + qp->s_flags |= RVT_S_UNLIMITED_CREDIT; + if (qp->s_flags & RVT_S_WAIT_SSN_CREDIT) { + qp->s_flags &= ~RVT_S_WAIT_SSN_CREDIT; + hfi1_schedule_send(qp); + } + } + } else if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) { + /* Compute new LSN (i.e., MSN + credit) */ + credit = (aeth + credit_table[credit]) & HFI1_MSN_MASK; + if (cmp_msn(credit, qp->s_lsn) > 0) { + qp->s_lsn = credit; + if (qp->s_flags & RVT_S_WAIT_SSN_CREDIT) { + qp->s_flags &= ~RVT_S_WAIT_SSN_CREDIT; + hfi1_schedule_send(qp); + } + } + } +} + +void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag) +{ + unsigned long flags; + + spin_lock_irqsave(&qp->s_lock, flags); + if (qp->s_flags & flag) { + qp->s_flags &= ~flag; + trace_hfi1_qpwakeup(qp, flag); + hfi1_schedule_send(qp); + } + spin_unlock_irqrestore(&qp->s_lock, flags); + /* Notify hfi1_destroy_qp() if it is waiting. */ + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); +} + +static int iowait_sleep( + struct sdma_engine *sde, + struct iowait *wait, + struct sdma_txreq *stx, + unsigned seq) +{ + struct verbs_txreq *tx = container_of(stx, struct verbs_txreq, txreq); + struct rvt_qp *qp; + struct hfi1_qp_priv *priv; + unsigned long flags; + int ret = 0; + struct hfi1_ibdev *dev; + + qp = tx->qp; + priv = qp->priv; + + spin_lock_irqsave(&qp->s_lock, flags); + if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) { + /* + * If we couldn't queue the DMA request, save the info + * and try again later rather than destroying the + * buffer and undoing the side effects of the copy. + */ + /* Make a common routine? */ + dev = &sde->dd->verbs_dev; + list_add_tail(&stx->list, &wait->tx_head); + write_seqlock(&dev->iowait_lock); + if (sdma_progress(sde, seq, stx)) + goto eagain; + if (list_empty(&priv->s_iowait.list)) { + struct hfi1_ibport *ibp = + to_iport(qp->ibqp.device, qp->port_num); + + ibp->rvp.n_dmawait++; + qp->s_flags |= RVT_S_WAIT_DMA_DESC; + list_add_tail(&priv->s_iowait.list, &sde->dmawait); + trace_hfi1_qpsleep(qp, RVT_S_WAIT_DMA_DESC); + atomic_inc(&qp->refcount); + } + write_sequnlock(&dev->iowait_lock); + qp->s_flags &= ~RVT_S_BUSY; + spin_unlock_irqrestore(&qp->s_lock, flags); + ret = -EBUSY; + } else { + spin_unlock_irqrestore(&qp->s_lock, flags); + hfi1_put_txreq(tx); + } + return ret; +eagain: + write_sequnlock(&dev->iowait_lock); + spin_unlock_irqrestore(&qp->s_lock, flags); + list_del_init(&stx->list); + return -EAGAIN; +} + +static void iowait_wakeup(struct iowait *wait, int reason) +{ + struct rvt_qp *qp = iowait_to_qp(wait); + + WARN_ON(reason != SDMA_AVAIL_REASON); + hfi1_qp_wakeup(qp, RVT_S_WAIT_DMA_DESC); +} + +static void iowait_sdma_drained(struct iowait *wait) +{ + struct rvt_qp *qp = iowait_to_qp(wait); + unsigned long flags; + + /* + * This happens when the send engine notes + * a QP in the error state and cannot + * do the flush work until that QP's + * sdma work has finished. + */ + spin_lock_irqsave(&qp->s_lock, flags); + if (qp->s_flags & RVT_S_WAIT_DMA) { + qp->s_flags &= ~RVT_S_WAIT_DMA; + hfi1_schedule_send(qp); + } + spin_unlock_irqrestore(&qp->s_lock, flags); +} + +/** + * + * qp_to_sdma_engine - map a qp to a send engine + * @qp: the QP + * @sc5: the 5 bit sc + * + * Return: + * A send engine for the qp or NULL for SMI type qp. + */ +struct sdma_engine *qp_to_sdma_engine(struct rvt_qp *qp, u8 sc5) +{ + struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); + struct sdma_engine *sde; + + if (!(dd->flags & HFI1_HAS_SEND_DMA)) + return NULL; + switch (qp->ibqp.qp_type) { + case IB_QPT_SMI: + return NULL; + default: + break; + } + sde = sdma_select_engine_sc(dd, qp->ibqp.qp_num >> dd->qos_shift, sc5); + return sde; +} + +/* + * qp_to_send_context - map a qp to a send context + * @qp: the QP + * @sc5: the 5 bit sc + * + * Return: + * A send context for the qp + */ +struct send_context *qp_to_send_context(struct rvt_qp *qp, u8 sc5) +{ + struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); + + switch (qp->ibqp.qp_type) { + case IB_QPT_SMI: + /* SMA packets to VL15 */ + return dd->vld[15].sc; + default: + break; + } + + return pio_select_send_context_sc(dd, qp->ibqp.qp_num >> dd->qos_shift, + sc5); +} + +struct qp_iter { + struct hfi1_ibdev *dev; + struct rvt_qp *qp; + int specials; + int n; +}; + +struct qp_iter *qp_iter_init(struct hfi1_ibdev *dev) +{ + struct qp_iter *iter; + + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return NULL; + + iter->dev = dev; + iter->specials = dev->rdi.ibdev.phys_port_cnt * 2; + if (qp_iter_next(iter)) { + kfree(iter); + return NULL; + } + + return iter; +} + +int qp_iter_next(struct qp_iter *iter) +{ + struct hfi1_ibdev *dev = iter->dev; + int n = iter->n; + int ret = 1; + struct rvt_qp *pqp = iter->qp; + struct rvt_qp *qp; + + /* + * The approach is to consider the special qps + * as an additional table entries before the + * real hash table. Since the qp code sets + * the qp->next hash link to NULL, this works just fine. + * + * iter->specials is 2 * # ports + * + * n = 0..iter->specials is the special qp indices + * + * n = iter->specials..dev->rdi.qp_dev->qp_table_size+iter->specials are + * the potential hash bucket entries + * + */ + for (; n < dev->rdi.qp_dev->qp_table_size + iter->specials; n++) { + if (pqp) { + qp = rcu_dereference(pqp->next); + } else { + if (n < iter->specials) { + struct hfi1_pportdata *ppd; + struct hfi1_ibport *ibp; + int pidx; + + pidx = n % dev->rdi.ibdev.phys_port_cnt; + ppd = &dd_from_dev(dev)->pport[pidx]; + ibp = &ppd->ibport_data; + + if (!(n & 1)) + qp = rcu_dereference(ibp->rvp.qp[0]); + else + qp = rcu_dereference(ibp->rvp.qp[1]); + } else { + qp = rcu_dereference( + dev->rdi.qp_dev->qp_table[ + (n - iter->specials)]); + } + } + pqp = qp; + if (qp) { + iter->qp = qp; + iter->n = n; + return 0; + } + } + return ret; +} + +static const char * const qp_type_str[] = { + "SMI", "GSI", "RC", "UC", "UD", +}; + +static int qp_idle(struct rvt_qp *qp) +{ + return + qp->s_last == qp->s_acked && + qp->s_acked == qp->s_cur && + qp->s_cur == qp->s_tail && + qp->s_tail == qp->s_head; +} + +void qp_iter_print(struct seq_file *s, struct qp_iter *iter) +{ + struct rvt_swqe *wqe; + struct rvt_qp *qp = iter->qp; + struct hfi1_qp_priv *priv = qp->priv; + struct sdma_engine *sde; + struct send_context *send_context; + + sde = qp_to_sdma_engine(qp, priv->s_sc); + wqe = rvt_get_swqe_ptr(qp, qp->s_last); + send_context = qp_to_send_context(qp, priv->s_sc); + seq_printf(s, + "N %d %s QP %x R %u %s %u %u %u f=%x %u %u %u %u %u %u PSN %x %x %x %x %x (%u %u %u %u %u %u %u) RQP %x LID %x SL %u MTU %u %u %u %u SDE %p,%u SC %p,%u SCQ %u %u PID %d\n", + iter->n, + qp_idle(qp) ? "I" : "B", + qp->ibqp.qp_num, + atomic_read(&qp->refcount), + qp_type_str[qp->ibqp.qp_type], + qp->state, + wqe ? wqe->wr.opcode : 0, + qp->s_hdrwords, + qp->s_flags, + iowait_sdma_pending(&priv->s_iowait), + iowait_pio_pending(&priv->s_iowait), + !list_empty(&priv->s_iowait.list), + qp->timeout, + wqe ? wqe->ssn : 0, + qp->s_lsn, + qp->s_last_psn, + qp->s_psn, qp->s_next_psn, + qp->s_sending_psn, qp->s_sending_hpsn, + qp->s_last, qp->s_acked, qp->s_cur, + qp->s_tail, qp->s_head, qp->s_size, + qp->s_avail, + qp->remote_qpn, + qp->remote_ah_attr.dlid, + qp->remote_ah_attr.sl, + qp->pmtu, + qp->s_retry, + qp->s_retry_cnt, + qp->s_rnr_retry_cnt, + sde, + sde ? sde->this_idx : 0, + send_context, + send_context ? send_context->sw_index : 0, + ibcq_to_rvtcq(qp->ibqp.send_cq)->queue->head, + ibcq_to_rvtcq(qp->ibqp.send_cq)->queue->tail, + qp->pid); +} + +void qp_comm_est(struct rvt_qp *qp) +{ + qp->r_flags |= RVT_R_COMM_EST; + if (qp->ibqp.event_handler) { + struct ib_event ev; + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_COMM_EST; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } +} + +void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp, + gfp_t gfp) +{ + struct hfi1_qp_priv *priv; + + priv = kzalloc_node(sizeof(*priv), gfp, rdi->dparms.node); + if (!priv) + return ERR_PTR(-ENOMEM); + + priv->owner = qp; + + priv->s_hdr = kzalloc_node(sizeof(*priv->s_hdr), gfp, rdi->dparms.node); + if (!priv->s_hdr) { + kfree(priv); + return ERR_PTR(-ENOMEM); + } + setup_timer(&priv->s_rnr_timer, hfi1_rc_rnr_retry, (unsigned long)qp); + qp->s_timer.function = hfi1_rc_timeout; + return priv; +} + +void qp_priv_free(struct rvt_dev_info *rdi, struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + kfree(priv->s_hdr); + kfree(priv); +} + +unsigned free_all_qps(struct rvt_dev_info *rdi) +{ + struct hfi1_ibdev *verbs_dev = container_of(rdi, + struct hfi1_ibdev, + rdi); + struct hfi1_devdata *dd = container_of(verbs_dev, + struct hfi1_devdata, + verbs_dev); + int n; + unsigned qp_inuse = 0; + + for (n = 0; n < dd->num_pports; n++) { + struct hfi1_ibport *ibp = &dd->pport[n].ibport_data; + + rcu_read_lock(); + if (rcu_dereference(ibp->rvp.qp[0])) + qp_inuse++; + if (rcu_dereference(ibp->rvp.qp[1])) + qp_inuse++; + rcu_read_unlock(); + } + + return qp_inuse; +} + +void flush_qp_waiters(struct rvt_qp *qp) +{ + flush_iowait(qp); + hfi1_stop_rc_timers(qp); +} + +void stop_send_queue(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + cancel_work_sync(&priv->s_iowait.iowork); + hfi1_del_timers_sync(qp); +} + +void quiesce_qp(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + iowait_sdma_drain(&priv->s_iowait); + qp_pio_drain(qp); + flush_tx_list(qp); +} + +void notify_qp_reset(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + iowait_init( + &priv->s_iowait, + 1, + _hfi1_do_send, + iowait_sleep, + iowait_wakeup, + iowait_sdma_drained); + priv->r_adefered = 0; + clear_ahg(qp); +} + +/* + * Switch to alternate path. + * The QP s_lock should be held and interrupts disabled. + */ +void hfi1_migrate_qp(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct ib_event ev; + + qp->s_mig_state = IB_MIG_MIGRATED; + qp->remote_ah_attr = qp->alt_ah_attr; + qp->port_num = qp->alt_ah_attr.port_num; + qp->s_pkey_index = qp->s_alt_pkey_index; + qp->s_flags |= RVT_S_AHG_CLEAR; + priv->s_sc = ah_to_sc(qp->ibqp.device, &qp->remote_ah_attr); + priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc); + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_PATH_MIG; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); +} + +int mtu_to_path_mtu(u32 mtu) +{ + return mtu_to_enum(mtu, OPA_MTU_8192); +} + +u32 mtu_from_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, u32 pmtu) +{ + u32 mtu; + struct hfi1_ibdev *verbs_dev = container_of(rdi, + struct hfi1_ibdev, + rdi); + struct hfi1_devdata *dd = container_of(verbs_dev, + struct hfi1_devdata, + verbs_dev); + struct hfi1_ibport *ibp; + u8 sc, vl; + + ibp = &dd->pport[qp->port_num - 1].ibport_data; + sc = ibp->sl_to_sc[qp->remote_ah_attr.sl]; + vl = sc_to_vlt(dd, sc); + + mtu = verbs_mtu_enum_to_int(qp->ibqp.device, pmtu); + if (vl < PER_VL_SEND_CONTEXTS) + mtu = min_t(u32, mtu, dd->vld[vl].mtu); + return mtu; +} + +int get_pmtu_from_attr(struct rvt_dev_info *rdi, struct rvt_qp *qp, + struct ib_qp_attr *attr) +{ + int mtu, pidx = qp->port_num - 1; + struct hfi1_ibdev *verbs_dev = container_of(rdi, + struct hfi1_ibdev, + rdi); + struct hfi1_devdata *dd = container_of(verbs_dev, + struct hfi1_devdata, + verbs_dev); + mtu = verbs_mtu_enum_to_int(qp->ibqp.device, attr->path_mtu); + if (mtu == -1) + return -1; /* values less than 0 are error */ + + if (mtu > dd->pport[pidx].ibmtu) + return mtu_to_enum(dd->pport[pidx].ibmtu, IB_MTU_2048); + else + return attr->path_mtu; +} + +void notify_error_qp(struct rvt_qp *qp) +{ + struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); + struct hfi1_qp_priv *priv = qp->priv; + + write_seqlock(&dev->iowait_lock); + if (!list_empty(&priv->s_iowait.list) && !(qp->s_flags & RVT_S_BUSY)) { + qp->s_flags &= ~RVT_S_ANY_WAIT_IO; + list_del_init(&priv->s_iowait.list); + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); + } + write_sequnlock(&dev->iowait_lock); + + if (!(qp->s_flags & RVT_S_BUSY)) { + qp->s_hdrwords = 0; + if (qp->s_rdma_mr) { + rvt_put_mr(qp->s_rdma_mr); + qp->s_rdma_mr = NULL; + } + flush_tx_list(qp); + } +} + +/** + * hfi1_error_port_qps - put a port's RC/UC qps into error state + * @ibp: the ibport. + * @sl: the service level. + * + * This function places all RC/UC qps with a given service level into error + * state. It is generally called to force upper lay apps to abandon stale qps + * after an sl->sc mapping change. + */ +void hfi1_error_port_qps(struct hfi1_ibport *ibp, u8 sl) +{ + struct rvt_qp *qp = NULL; + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct hfi1_ibdev *dev = &ppd->dd->verbs_dev; + int n; + int lastwqe; + struct ib_event ev; + + rcu_read_lock(); + + /* Deal only with RC/UC qps that use the given SL. */ + for (n = 0; n < dev->rdi.qp_dev->qp_table_size; n++) { + for (qp = rcu_dereference(dev->rdi.qp_dev->qp_table[n]); qp; + qp = rcu_dereference(qp->next)) { + if (qp->port_num == ppd->port && + (qp->ibqp.qp_type == IB_QPT_UC || + qp->ibqp.qp_type == IB_QPT_RC) && + qp->remote_ah_attr.sl == sl && + (ib_rvt_state_ops[qp->state] & + RVT_POST_SEND_OK)) { + spin_lock_irq(&qp->r_lock); + spin_lock(&qp->s_hlock); + spin_lock(&qp->s_lock); + lastwqe = rvt_error_qp(qp, + IB_WC_WR_FLUSH_ERR); + spin_unlock(&qp->s_lock); + spin_unlock(&qp->s_hlock); + spin_unlock_irq(&qp->r_lock); + if (lastwqe) { + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = + IB_EVENT_QP_LAST_WQE_REACHED; + qp->ibqp.event_handler(&ev, + qp->ibqp.qp_context); + } + } + } + } + + rcu_read_unlock(); +} diff --git a/drivers/infiniband/hw/hfi1/qp.h b/drivers/infiniband/hw/hfi1/qp.h new file mode 100644 index 000000000..e7bc8d6cf --- /dev/null +++ b/drivers/infiniband/hw/hfi1/qp.h @@ -0,0 +1,160 @@ +#ifndef _QP_H +#define _QP_H +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <linux/hash.h> +#include <rdma/rdmavt_qp.h> +#include "verbs.h" +#include "sdma.h" + +extern unsigned int hfi1_qp_table_size; + +/* + * free_ahg - clear ahg from QP + */ +static inline void clear_ahg(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + priv->s_hdr->ahgcount = 0; + qp->s_flags &= ~(RVT_S_AHG_VALID | RVT_S_AHG_CLEAR); + if (priv->s_sde && qp->s_ahgidx >= 0) + sdma_ahg_free(priv->s_sde, qp->s_ahgidx); + qp->s_ahgidx = -1; +} + +/** + * hfi1_compute_aeth - compute the AETH (syndrome + MSN) + * @qp: the queue pair to compute the AETH for + * + * Returns the AETH. + */ +__be32 hfi1_compute_aeth(struct rvt_qp *qp); + +/** + * hfi1_create_qp - create a queue pair for a device + * @ibpd: the protection domain who's device we create the queue pair for + * @init_attr: the attributes of the queue pair + * @udata: user data for libibverbs.so + * + * Returns the queue pair on success, otherwise returns an errno. + * + * Called by the ib_create_qp() core verbs function. + */ +struct ib_qp *hfi1_create_qp(struct ib_pd *ibpd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata); +/** + * hfi1_get_credit - flush the send work queue of a QP + * @qp: the qp who's send work queue to flush + * @aeth: the Acknowledge Extended Transport Header + * + * The QP s_lock should be held. + */ +void hfi1_get_credit(struct rvt_qp *qp, u32 aeth); + +/** + * hfi1_qp_wakeup - wake up on the indicated event + * @qp: the QP + * @flag: flag the qp on which the qp is stalled + */ +void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag); + +struct sdma_engine *qp_to_sdma_engine(struct rvt_qp *qp, u8 sc5); +struct send_context *qp_to_send_context(struct rvt_qp *qp, u8 sc5); + +struct qp_iter; + +/** + * qp_iter_init - initialize the iterator for the qp hash list + * @dev: the hfi1_ibdev + */ +struct qp_iter *qp_iter_init(struct hfi1_ibdev *dev); + +/** + * qp_iter_next - Find the next qp in the hash list + * @iter: the iterator for the qp hash list + */ +int qp_iter_next(struct qp_iter *iter); + +/** + * qp_iter_print - print the qp information to seq_file + * @s: the seq_file to emit the qp information on + * @iter: the iterator for the qp hash list + */ +void qp_iter_print(struct seq_file *s, struct qp_iter *iter); + +/** + * qp_comm_est - handle trap with QP established + * @qp: the QP + */ +void qp_comm_est(struct rvt_qp *qp); + +void _hfi1_schedule_send(struct rvt_qp *qp); +void hfi1_schedule_send(struct rvt_qp *qp); + +void hfi1_migrate_qp(struct rvt_qp *qp); + +/* + * Functions provided by hfi1 driver for rdmavt to use + */ +void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp, + gfp_t gfp); +void qp_priv_free(struct rvt_dev_info *rdi, struct rvt_qp *qp); +unsigned free_all_qps(struct rvt_dev_info *rdi); +void notify_qp_reset(struct rvt_qp *qp); +int get_pmtu_from_attr(struct rvt_dev_info *rdi, struct rvt_qp *qp, + struct ib_qp_attr *attr); +void flush_qp_waiters(struct rvt_qp *qp); +void notify_error_qp(struct rvt_qp *qp); +void stop_send_queue(struct rvt_qp *qp); +void quiesce_qp(struct rvt_qp *qp); +u32 mtu_from_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, u32 pmtu); +int mtu_to_path_mtu(u32 mtu); +void hfi1_error_port_qps(struct hfi1_ibport *ibp, u8 sl); +#endif /* _QP_H */ diff --git a/drivers/infiniband/hw/hfi1/qsfp.c b/drivers/infiniband/hw/hfi1/qsfp.c new file mode 100644 index 000000000..9fb561682 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/qsfp.c @@ -0,0 +1,633 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <linux/delay.h> +#include <linux/pci.h> +#include <linux/vmalloc.h> + +#include "hfi.h" +#include "twsi.h" + +/* + * QSFP support for hfi driver, using "Two Wire Serial Interface" driver + * in twsi.c + */ +#define I2C_MAX_RETRY 4 + +/* + * Raw i2c write. No set-up or lock checking. + */ +static int __i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, + int offset, void *bp, int len) +{ + struct hfi1_devdata *dd = ppd->dd; + int ret, cnt; + u8 *buff = bp; + + cnt = 0; + while (cnt < len) { + int wlen = len - cnt; + + ret = hfi1_twsi_blk_wr(dd, target, i2c_addr, offset, + buff + cnt, wlen); + if (ret) { + /* hfi1_twsi_blk_wr() 1 for error, else 0 */ + return -EIO; + } + offset += wlen; + cnt += wlen; + } + + /* Must wait min 20us between qsfp i2c transactions */ + udelay(20); + + return cnt; +} + +/* + * Caller must hold the i2c chain resource. + */ +int i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset, + void *bp, int len) +{ + int ret; + + if (!check_chip_resource(ppd->dd, i2c_target(target), __func__)) + return -EACCES; + + /* make sure the TWSI bus is in a sane state */ + ret = hfi1_twsi_reset(ppd->dd, target); + if (ret) { + hfi1_dev_porterr(ppd->dd, ppd->port, + "I2C chain %d write interface reset failed\n", + target); + return ret; + } + + return __i2c_write(ppd, target, i2c_addr, offset, bp, len); +} + +/* + * Raw i2c read. No set-up or lock checking. + */ +static int __i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, + int offset, void *bp, int len) +{ + struct hfi1_devdata *dd = ppd->dd; + int ret, cnt, pass = 0; + int orig_offset = offset; + + cnt = 0; + while (cnt < len) { + int rlen = len - cnt; + + ret = hfi1_twsi_blk_rd(dd, target, i2c_addr, offset, + bp + cnt, rlen); + /* Some QSFP's fail first try. Retry as experiment */ + if (ret && cnt == 0 && ++pass < I2C_MAX_RETRY) + continue; + if (ret) { + /* hfi1_twsi_blk_rd() 1 for error, else 0 */ + ret = -EIO; + goto exit; + } + offset += rlen; + cnt += rlen; + } + + ret = cnt; + +exit: + if (ret < 0) { + hfi1_dev_porterr(dd, ppd->port, + "I2C chain %d read failed, addr 0x%x, offset 0x%x, len %d\n", + target, i2c_addr, orig_offset, len); + } + + /* Must wait min 20us between qsfp i2c transactions */ + udelay(20); + + return ret; +} + +/* + * Caller must hold the i2c chain resource. + */ +int i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, int offset, + void *bp, int len) +{ + int ret; + + if (!check_chip_resource(ppd->dd, i2c_target(target), __func__)) + return -EACCES; + + /* make sure the TWSI bus is in a sane state */ + ret = hfi1_twsi_reset(ppd->dd, target); + if (ret) { + hfi1_dev_porterr(ppd->dd, ppd->port, + "I2C chain %d read interface reset failed\n", + target); + return ret; + } + + return __i2c_read(ppd, target, i2c_addr, offset, bp, len); +} + +/* + * Write page n, offset m of QSFP memory as defined by SFF 8636 + * by writing @addr = ((256 * n) + m) + * + * Caller must hold the i2c chain resource. + */ +int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, + int len) +{ + int count = 0; + int offset; + int nwrite; + int ret; + u8 page; + + if (!check_chip_resource(ppd->dd, i2c_target(target), __func__)) + return -EACCES; + + /* make sure the TWSI bus is in a sane state */ + ret = hfi1_twsi_reset(ppd->dd, target); + if (ret) { + hfi1_dev_porterr(ppd->dd, ppd->port, + "QSFP chain %d write interface reset failed\n", + target); + return ret; + } + + while (count < len) { + /* + * Set the qsfp page based on a zero-based address + * and a page size of QSFP_PAGESIZE bytes. + */ + page = (u8)(addr / QSFP_PAGESIZE); + + ret = __i2c_write(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE, + QSFP_PAGE_SELECT_BYTE_OFFS, &page, 1); + if (ret != 1) { + hfi1_dev_porterr(ppd->dd, ppd->port, + "QSFP chain %d can't write QSFP_PAGE_SELECT_BYTE: %d\n", + target, ret); + ret = -EIO; + break; + } + + offset = addr % QSFP_PAGESIZE; + nwrite = len - count; + /* truncate write to boundary if crossing boundary */ + if (((addr % QSFP_RW_BOUNDARY) + nwrite) > QSFP_RW_BOUNDARY) + nwrite = QSFP_RW_BOUNDARY - (addr % QSFP_RW_BOUNDARY); + + ret = __i2c_write(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE, + offset, bp + count, nwrite); + if (ret <= 0) /* stop on error or nothing written */ + break; + + count += ret; + addr += ret; + } + + if (ret < 0) + return ret; + return count; +} + +/* + * Perform a stand-alone single QSFP write. Acquire the resource, do the + * read, then release the resource. + */ +int one_qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, + int len) +{ + struct hfi1_devdata *dd = ppd->dd; + u32 resource = qsfp_resource(dd); + int ret; + + ret = acquire_chip_resource(dd, resource, QSFP_WAIT); + if (ret) + return ret; + ret = qsfp_write(ppd, target, addr, bp, len); + release_chip_resource(dd, resource); + + return ret; +} + +/* + * Access page n, offset m of QSFP memory as defined by SFF 8636 + * by reading @addr = ((256 * n) + m) + * + * Caller must hold the i2c chain resource. + */ +int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, + int len) +{ + int count = 0; + int offset; + int nread; + int ret; + u8 page; + + if (!check_chip_resource(ppd->dd, i2c_target(target), __func__)) + return -EACCES; + + /* make sure the TWSI bus is in a sane state */ + ret = hfi1_twsi_reset(ppd->dd, target); + if (ret) { + hfi1_dev_porterr(ppd->dd, ppd->port, + "QSFP chain %d read interface reset failed\n", + target); + return ret; + } + + while (count < len) { + /* + * Set the qsfp page based on a zero-based address + * and a page size of QSFP_PAGESIZE bytes. + */ + page = (u8)(addr / QSFP_PAGESIZE); + ret = __i2c_write(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE, + QSFP_PAGE_SELECT_BYTE_OFFS, &page, 1); + if (ret != 1) { + hfi1_dev_porterr(ppd->dd, ppd->port, + "QSFP chain %d can't write QSFP_PAGE_SELECT_BYTE: %d\n", + target, ret); + ret = -EIO; + break; + } + + offset = addr % QSFP_PAGESIZE; + nread = len - count; + /* truncate read to boundary if crossing boundary */ + if (((addr % QSFP_RW_BOUNDARY) + nread) > QSFP_RW_BOUNDARY) + nread = QSFP_RW_BOUNDARY - (addr % QSFP_RW_BOUNDARY); + + /* QSFPs require a 5-10msec delay after write operations */ + mdelay(5); + ret = __i2c_read(ppd, target, QSFP_DEV | QSFP_OFFSET_SIZE, + offset, bp + count, nread); + if (ret <= 0) /* stop on error or nothing read */ + break; + + count += ret; + addr += ret; + } + + if (ret < 0) + return ret; + return count; +} + +/* + * Perform a stand-alone single QSFP read. Acquire the resource, do the + * read, then release the resource. + */ +int one_qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, + int len) +{ + struct hfi1_devdata *dd = ppd->dd; + u32 resource = qsfp_resource(dd); + int ret; + + ret = acquire_chip_resource(dd, resource, QSFP_WAIT); + if (ret) + return ret; + ret = qsfp_read(ppd, target, addr, bp, len); + release_chip_resource(dd, resource); + + return ret; +} + +/* + * This function caches the QSFP memory range in 128 byte chunks. + * As an example, the next byte after address 255 is byte 128 from + * upper page 01H (if existing) rather than byte 0 from lower page 00H. + * Access page n, offset m of QSFP memory as defined by SFF 8636 + * in the cache by reading byte ((128 * n) + m) + * The calls to qsfp_{read,write} in this function correctly handle the + * address map difference between this mapping and the mapping implemented + * by those functions + * + * The caller must be holding the QSFP i2c chain resource. + */ +int refresh_qsfp_cache(struct hfi1_pportdata *ppd, struct qsfp_data *cp) +{ + u32 target = ppd->dd->hfi1_id; + int ret; + unsigned long flags; + u8 *cache = &cp->cache[0]; + + /* ensure sane contents on invalid reads, for cable swaps */ + memset(cache, 0, (QSFP_MAX_NUM_PAGES * 128)); + spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags); + ppd->qsfp_info.cache_valid = 0; + spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, flags); + + if (!qsfp_mod_present(ppd)) { + ret = -ENODEV; + goto bail; + } + + ret = qsfp_read(ppd, target, 0, cache, QSFP_PAGESIZE); + if (ret != QSFP_PAGESIZE) { + dd_dev_info(ppd->dd, + "%s: Page 0 read failed, expected %d, got %d\n", + __func__, QSFP_PAGESIZE, ret); + goto bail; + } + + /* Is paging enabled? */ + if (!(cache[2] & 4)) { + /* Paging enabled, page 03 required */ + if ((cache[195] & 0xC0) == 0xC0) { + /* all */ + ret = qsfp_read(ppd, target, 384, cache + 256, 128); + if (ret <= 0 || ret != 128) { + dd_dev_info(ppd->dd, "%s failed\n", __func__); + goto bail; + } + ret = qsfp_read(ppd, target, 640, cache + 384, 128); + if (ret <= 0 || ret != 128) { + dd_dev_info(ppd->dd, "%s failed\n", __func__); + goto bail; + } + ret = qsfp_read(ppd, target, 896, cache + 512, 128); + if (ret <= 0 || ret != 128) { + dd_dev_info(ppd->dd, "%s failed\n", __func__); + goto bail; + } + } else if ((cache[195] & 0x80) == 0x80) { + /* only page 2 and 3 */ + ret = qsfp_read(ppd, target, 640, cache + 384, 128); + if (ret <= 0 || ret != 128) { + dd_dev_info(ppd->dd, "%s failed\n", __func__); + goto bail; + } + ret = qsfp_read(ppd, target, 896, cache + 512, 128); + if (ret <= 0 || ret != 128) { + dd_dev_info(ppd->dd, "%s failed\n", __func__); + goto bail; + } + } else if ((cache[195] & 0x40) == 0x40) { + /* only page 1 and 3 */ + ret = qsfp_read(ppd, target, 384, cache + 256, 128); + if (ret <= 0 || ret != 128) { + dd_dev_info(ppd->dd, "%s failed\n", __func__); + goto bail; + } + ret = qsfp_read(ppd, target, 896, cache + 512, 128); + if (ret <= 0 || ret != 128) { + dd_dev_info(ppd->dd, "%s failed\n", __func__); + goto bail; + } + } else { + /* only page 3 */ + ret = qsfp_read(ppd, target, 896, cache + 512, 128); + if (ret <= 0 || ret != 128) { + dd_dev_info(ppd->dd, "%s failed\n", __func__); + goto bail; + } + } + } + + spin_lock_irqsave(&ppd->qsfp_info.qsfp_lock, flags); + ppd->qsfp_info.cache_valid = 1; + ppd->qsfp_info.cache_refresh_required = 0; + spin_unlock_irqrestore(&ppd->qsfp_info.qsfp_lock, flags); + + return 0; + +bail: + memset(cache, 0, (QSFP_MAX_NUM_PAGES * 128)); + return ret; +} + +const char * const hfi1_qsfp_devtech[16] = { + "850nm VCSEL", "1310nm VCSEL", "1550nm VCSEL", "1310nm FP", + "1310nm DFB", "1550nm DFB", "1310nm EML", "1550nm EML", + "Cu Misc", "1490nm DFB", "Cu NoEq", "Cu Eq", + "Undef", "Cu Active BothEq", "Cu FarEq", "Cu NearEq" +}; + +#define QSFP_DUMP_CHUNK 16 /* Holds longest string */ +#define QSFP_DEFAULT_HDR_CNT 224 + +#define QSFP_PWR(pbyte) (((pbyte) >> 6) & 3) +#define QSFP_HIGH_PWR(pbyte) ((pbyte) & 3) +/* For use with QSFP_HIGH_PWR macro */ +#define QSFP_HIGH_PWR_UNUSED 0 /* Bits [1:0] = 00 implies low power module */ + +/* + * Takes power class byte [Page 00 Byte 129] in SFF 8636 + * Returns power class as integer (1 through 7, per SFF 8636 rev 2.4) + */ +int get_qsfp_power_class(u8 power_byte) +{ + if (QSFP_HIGH_PWR(power_byte) == QSFP_HIGH_PWR_UNUSED) + /* power classes count from 1, their bit encodings from 0 */ + return (QSFP_PWR(power_byte) + 1); + /* + * 00 in the high power classes stands for unused, bringing + * balance to the off-by-1 offset above, we add 4 here to + * account for the difference between the low and high power + * groups + */ + return (QSFP_HIGH_PWR(power_byte) + 4); +} + +int qsfp_mod_present(struct hfi1_pportdata *ppd) +{ + struct hfi1_devdata *dd = ppd->dd; + u64 reg; + + reg = read_csr(dd, dd->hfi1_id ? ASIC_QSFP2_IN : ASIC_QSFP1_IN); + return !(reg & QSFP_HFI0_MODPRST_N); +} + +/* + * This function maps QSFP memory addresses in 128 byte chunks in the following + * fashion per the CableInfo SMA query definition in the IBA 1.3 spec/OPA Gen 1 + * spec + * For addr 000-127, lower page 00h + * For addr 128-255, upper page 00h + * For addr 256-383, upper page 01h + * For addr 384-511, upper page 02h + * For addr 512-639, upper page 03h + * + * For addresses beyond this range, it returns the invalid range of data buffer + * set to 0. + * For upper pages that are optional, if they are not valid, returns the + * particular range of bytes in the data buffer set to 0. + */ +int get_cable_info(struct hfi1_devdata *dd, u32 port_num, u32 addr, u32 len, + u8 *data) +{ + struct hfi1_pportdata *ppd; + u32 excess_len = 0; + int ret = 0; + + if (port_num > dd->num_pports || port_num < 1) { + dd_dev_info(dd, "%s: Invalid port number %d\n", + __func__, port_num); + ret = -EINVAL; + goto set_zeroes; + } + + ppd = dd->pport + (port_num - 1); + if (!qsfp_mod_present(ppd)) { + ret = -ENODEV; + goto set_zeroes; + } + + if (!ppd->qsfp_info.cache_valid) { + ret = -EINVAL; + goto set_zeroes; + } + + if (addr >= (QSFP_MAX_NUM_PAGES * 128)) { + ret = -ERANGE; + goto set_zeroes; + } + + if ((addr + len) > (QSFP_MAX_NUM_PAGES * 128)) { + excess_len = (addr + len) - (QSFP_MAX_NUM_PAGES * 128); + memcpy(data, &ppd->qsfp_info.cache[addr], (len - excess_len)); + data += (len - excess_len); + goto set_zeroes; + } + + memcpy(data, &ppd->qsfp_info.cache[addr], len); + return 0; + +set_zeroes: + memset(data, 0, excess_len); + return ret; +} + +static const char *pwr_codes[8] = {"N/AW", + "1.5W", + "2.0W", + "2.5W", + "3.5W", + "4.0W", + "4.5W", + "5.0W" + }; + +int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len) +{ + u8 *cache = &ppd->qsfp_info.cache[0]; + u8 bin_buff[QSFP_DUMP_CHUNK]; + char lenstr[6]; + int sofar; + int bidx = 0; + u8 *atten = &cache[QSFP_ATTEN_OFFS]; + u8 *vendor_oui = &cache[QSFP_VOUI_OFFS]; + u8 power_byte = 0; + + sofar = 0; + lenstr[0] = ' '; + lenstr[1] = '\0'; + + if (ppd->qsfp_info.cache_valid) { + if (QSFP_IS_CU(cache[QSFP_MOD_TECH_OFFS])) + snprintf(lenstr, sizeof(lenstr), "%dM ", + cache[QSFP_MOD_LEN_OFFS]); + + power_byte = cache[QSFP_MOD_PWR_OFFS]; + sofar += scnprintf(buf + sofar, len - sofar, "PWR:%.3sW\n", + pwr_codes[get_qsfp_power_class(power_byte)]); + + sofar += scnprintf(buf + sofar, len - sofar, "TECH:%s%s\n", + lenstr, + hfi1_qsfp_devtech[(cache[QSFP_MOD_TECH_OFFS]) >> 4]); + + sofar += scnprintf(buf + sofar, len - sofar, "Vendor:%.*s\n", + QSFP_VEND_LEN, &cache[QSFP_VEND_OFFS]); + + sofar += scnprintf(buf + sofar, len - sofar, "OUI:%06X\n", + QSFP_OUI(vendor_oui)); + + sofar += scnprintf(buf + sofar, len - sofar, "Part#:%.*s\n", + QSFP_PN_LEN, &cache[QSFP_PN_OFFS]); + + sofar += scnprintf(buf + sofar, len - sofar, "Rev:%.*s\n", + QSFP_REV_LEN, &cache[QSFP_REV_OFFS]); + + if (QSFP_IS_CU(cache[QSFP_MOD_TECH_OFFS])) + sofar += scnprintf(buf + sofar, len - sofar, + "Atten:%d, %d\n", + QSFP_ATTEN_SDR(atten), + QSFP_ATTEN_DDR(atten)); + + sofar += scnprintf(buf + sofar, len - sofar, "Serial:%.*s\n", + QSFP_SN_LEN, &cache[QSFP_SN_OFFS]); + + sofar += scnprintf(buf + sofar, len - sofar, "Date:%.*s\n", + QSFP_DATE_LEN, &cache[QSFP_DATE_OFFS]); + + sofar += scnprintf(buf + sofar, len - sofar, "Lot:%.*s\n", + QSFP_LOT_LEN, &cache[QSFP_LOT_OFFS]); + + while (bidx < QSFP_DEFAULT_HDR_CNT) { + int iidx; + + memcpy(bin_buff, &cache[bidx], QSFP_DUMP_CHUNK); + for (iidx = 0; iidx < QSFP_DUMP_CHUNK; ++iidx) { + sofar += scnprintf(buf + sofar, len - sofar, + " %02X", bin_buff[iidx]); + } + sofar += scnprintf(buf + sofar, len - sofar, "\n"); + bidx += QSFP_DUMP_CHUNK; + } + } + return sofar; +} diff --git a/drivers/infiniband/hw/hfi1/qsfp.h b/drivers/infiniband/hw/hfi1/qsfp.h new file mode 100644 index 000000000..dadc66c44 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/qsfp.h @@ -0,0 +1,240 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +/* QSFP support common definitions, for hfi driver */ + +#define QSFP_DEV 0xA0 +#define QSFP_PWR_LAG_MSEC 2000 +#define QSFP_MODPRS_LAG_MSEC 20 +/* 128 byte pages, per SFF 8636 rev 2.4 */ +#define QSFP_MAX_NUM_PAGES 5 + +/* + * Below are masks for QSFP pins. Pins are the same for HFI0 and HFI1. + * _N means asserted low + */ +#define QSFP_HFI0_I2CCLK BIT(0) +#define QSFP_HFI0_I2CDAT BIT(1) +#define QSFP_HFI0_RESET_N BIT(2) +#define QSFP_HFI0_INT_N BIT(3) +#define QSFP_HFI0_MODPRST_N BIT(4) + +/* QSFP is paged at 256 bytes */ +#define QSFP_PAGESIZE 256 +/* Reads/writes cannot cross 128 byte boundaries */ +#define QSFP_RW_BOUNDARY 128 + +/* number of bytes in i2c offset for QSFP devices */ +#define __QSFP_OFFSET_SIZE 1 /* num address bytes */ +#define QSFP_OFFSET_SIZE (__QSFP_OFFSET_SIZE << 8) /* shifted value */ + +/* Defined fields that Intel requires of qualified cables */ +/* Byte 0 is Identifier, not checked */ +/* Byte 1 is reserved "status MSB" */ +#define QSFP_TX_CTRL_BYTE_OFFS 86 +#define QSFP_PWR_CTRL_BYTE_OFFS 93 +#define QSFP_CDR_CTRL_BYTE_OFFS 98 + +#define QSFP_PAGE_SELECT_BYTE_OFFS 127 +/* Byte 128 is Identifier: must be 0x0c for QSFP, or 0x0d for QSFP+ */ +#define QSFP_MOD_ID_OFFS 128 +/* + * Byte 129 is "Extended Identifier". + * For bits [7:6]: 0:1.5W, 1:2.0W, 2:2.5W, 3:3.5W + * For bits [1:0]: 0:Unused, 1:4W, 2:4.5W, 3:5W + */ +#define QSFP_MOD_PWR_OFFS 129 +/* Byte 130 is Connector type. Not Intel req'd */ +/* Bytes 131..138 are Transceiver types, bit maps for various tech, none IB */ +/* Byte 139 is encoding. code 0x01 is 8b10b. Not Intel req'd */ +/* byte 140 is nominal bit-rate, in units of 100Mbits/sec */ +#define QSFP_NOM_BIT_RATE_100_OFFS 140 +/* Byte 141 is Extended Rate Select. Not Intel req'd */ +/* Bytes 142..145 are lengths for various fiber types. Not Intel req'd */ +/* Byte 146 is length for Copper. Units of 1 meter */ +#define QSFP_MOD_LEN_OFFS 146 +/* + * Byte 147 is Device technology. D0..3 not Intel req'd + * D4..7 select from 15 choices, translated by table: + */ +#define QSFP_MOD_TECH_OFFS 147 +extern const char *const hfi1_qsfp_devtech[16]; +/* Active Equalization includes fiber, copper full EQ, and copper near Eq */ +#define QSFP_IS_ACTIVE(tech) ((0xA2FF >> ((tech) >> 4)) & 1) +/* Active Equalization includes fiber, copper full EQ, and copper far Eq */ +#define QSFP_IS_ACTIVE_FAR(tech) ((0x32FF >> ((tech) >> 4)) & 1) +/* Attenuation should be valid for copper other than full/near Eq */ +#define QSFP_HAS_ATTEN(tech) ((0x4D00 >> ((tech) >> 4)) & 1) +/* Length is only valid if technology is "copper" */ +#define QSFP_IS_CU(tech) ((0xED00 >> ((tech) >> 4)) & 1) +#define QSFP_TECH_1490 9 + +#define QSFP_OUI(oui) (((unsigned)oui[0] << 16) | ((unsigned)oui[1] << 8) | \ + oui[2]) +#define QSFP_OUI_AMPHENOL 0x415048 +#define QSFP_OUI_FINISAR 0x009065 +#define QSFP_OUI_GORE 0x002177 + +/* Bytes 148..163 are Vendor Name, Left-justified Blank-filled */ +#define QSFP_VEND_OFFS 148 +#define QSFP_VEND_LEN 16 +/* Byte 164 is IB Extended transceiver codes Bits D0..3 are SDR,DDR,QDR,EDR */ +#define QSFP_IBXCV_OFFS 164 +/* Bytes 165..167 are Vendor OUI number */ +#define QSFP_VOUI_OFFS 165 +#define QSFP_VOUI_LEN 3 +/* Bytes 168..183 are Vendor Part Number, string */ +#define QSFP_PN_OFFS 168 +#define QSFP_PN_LEN 16 +/* Bytes 184,185 are Vendor Rev. Left Justified, Blank-filled */ +#define QSFP_REV_OFFS 184 +#define QSFP_REV_LEN 2 +/* + * Bytes 186,187 are Wavelength, if Optical. Not Intel req'd + * If copper, they are attenuation in dB: + * Byte 186 is at 2.5Gb/sec (SDR), Byte 187 at 5.0Gb/sec (DDR) + */ +#define QSFP_ATTEN_OFFS 186 +#define QSFP_ATTEN_LEN 2 +/* + * Bytes 188,189 are Wavelength tolerance, if optical + * If copper, they are attenuation in dB: + * Byte 188 is at 12.5 Gb/s, Byte 189 at 25 Gb/s + */ +#define QSFP_CU_ATTEN_7G_OFFS 188 +#define QSFP_CU_ATTEN_12G_OFFS 189 +/* Byte 190 is Max Case Temp. Not Intel req'd */ +/* Byte 191 is LSB of sum of bytes 128..190. Not Intel req'd */ +#define QSFP_CC_OFFS 191 +#define QSFP_EQ_INFO_OFFS 193 +#define QSFP_CDR_INFO_OFFS 194 +/* Bytes 196..211 are Serial Number, String */ +#define QSFP_SN_OFFS 196 +#define QSFP_SN_LEN 16 +/* Bytes 212..219 are date-code YYMMDD (MM==1 for Jan) */ +#define QSFP_DATE_OFFS 212 +#define QSFP_DATE_LEN 6 +/* Bytes 218,219 are optional lot-code, string */ +#define QSFP_LOT_OFFS 218 +#define QSFP_LOT_LEN 2 +/* Bytes 220, 221 indicate monitoring options, Not Intel req'd */ +/* Byte 222 indicates nominal bitrate in units of 250Mbits/sec */ +#define QSFP_NOM_BIT_RATE_250_OFFS 222 +/* Byte 223 is LSB of sum of bytes 192..222 */ +#define QSFP_CC_EXT_OFFS 223 + +/* + * Interrupt flag masks + */ +#define QSFP_DATA_NOT_READY 0x01 + +#define QSFP_HIGH_TEMP_ALARM 0x80 +#define QSFP_LOW_TEMP_ALARM 0x40 +#define QSFP_HIGH_TEMP_WARNING 0x20 +#define QSFP_LOW_TEMP_WARNING 0x10 + +#define QSFP_HIGH_VCC_ALARM 0x80 +#define QSFP_LOW_VCC_ALARM 0x40 +#define QSFP_HIGH_VCC_WARNING 0x20 +#define QSFP_LOW_VCC_WARNING 0x10 + +#define QSFP_HIGH_POWER_ALARM 0x88 +#define QSFP_LOW_POWER_ALARM 0x44 +#define QSFP_HIGH_POWER_WARNING 0x22 +#define QSFP_LOW_POWER_WARNING 0x11 + +#define QSFP_HIGH_BIAS_ALARM 0x88 +#define QSFP_LOW_BIAS_ALARM 0x44 +#define QSFP_HIGH_BIAS_WARNING 0x22 +#define QSFP_LOW_BIAS_WARNING 0x11 + +#define QSFP_ATTEN_SDR(attenarray) (attenarray[0]) +#define QSFP_ATTEN_DDR(attenarray) (attenarray[1]) + +/* + * struct qsfp_data encapsulates state of QSFP device for one port. + * it will be part of port-specific data if a board supports QSFP. + * + * Since multiple board-types use QSFP, and their pport_data structs + * differ (in the chip-specific section), we need a pointer to its head. + * + * Avoiding premature optimization, we will have one work_struct per port, + * and let the qsfp_lock arbitrate access to common resources. + * + */ +struct qsfp_data { + /* Helps to find our way */ + struct hfi1_pportdata *ppd; + struct work_struct qsfp_work; + u8 cache[QSFP_MAX_NUM_PAGES * 128]; + /* protect qsfp data */ + spinlock_t qsfp_lock; + u8 check_interrupt_flags; + u8 reset_needed; + u8 limiting_active; + u8 cache_valid; + u8 cache_refresh_required; +}; + +int refresh_qsfp_cache(struct hfi1_pportdata *ppd, + struct qsfp_data *cp); +int get_qsfp_power_class(u8 power_byte); +int qsfp_mod_present(struct hfi1_pportdata *ppd); +int get_cable_info(struct hfi1_devdata *dd, u32 port_num, u32 addr, + u32 len, u8 *data); + +int i2c_write(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, + int offset, void *bp, int len); +int i2c_read(struct hfi1_pportdata *ppd, u32 target, int i2c_addr, + int offset, void *bp, int len); +int qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, + int len); +int qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, + int len); +int one_qsfp_write(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, + int len); +int one_qsfp_read(struct hfi1_pportdata *ppd, u32 target, int addr, void *bp, + int len); diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c new file mode 100644 index 000000000..792f15eb8 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -0,0 +1,2580 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <linux/io.h> +#include <rdma/rdma_vt.h> +#include <rdma/rdmavt_qp.h> + +#include "hfi.h" +#include "qp.h" +#include "verbs_txreq.h" +#include "trace.h" + +/* cut down ridiculously long IB macro names */ +#define OP(x) IB_OPCODE_RC_##x + +/** + * hfi1_add_retry_timer - add/start a retry timer + * @qp - the QP + * + * add a retry timer on the QP + */ +static inline void hfi1_add_retry_timer(struct rvt_qp *qp) +{ + struct ib_qp *ibqp = &qp->ibqp; + struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device); + + qp->s_flags |= RVT_S_TIMER; + /* 4.096 usec. * (1 << qp->timeout) */ + qp->s_timer.expires = jiffies + qp->timeout_jiffies + + rdi->busy_jiffies; + add_timer(&qp->s_timer); +} + +/** + * hfi1_add_rnr_timer - add/start an rnr timer + * @qp - the QP + * @to - timeout in usecs + * + * add an rnr timer on the QP + */ +void hfi1_add_rnr_timer(struct rvt_qp *qp, u32 to) +{ + struct hfi1_qp_priv *priv = qp->priv; + + qp->s_flags |= RVT_S_WAIT_RNR; + qp->s_timer.expires = jiffies + usecs_to_jiffies(to); + add_timer(&priv->s_rnr_timer); +} + +/** + * hfi1_mod_retry_timer - mod a retry timer + * @qp - the QP + * + * Modify a potentially already running retry + * timer + */ +static inline void hfi1_mod_retry_timer(struct rvt_qp *qp) +{ + struct ib_qp *ibqp = &qp->ibqp; + struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device); + + qp->s_flags |= RVT_S_TIMER; + /* 4.096 usec. * (1 << qp->timeout) */ + mod_timer(&qp->s_timer, jiffies + qp->timeout_jiffies + + rdi->busy_jiffies); +} + +/** + * hfi1_stop_retry_timer - stop a retry timer + * @qp - the QP + * + * stop a retry timer and return if the timer + * had been pending. + */ +static inline int hfi1_stop_retry_timer(struct rvt_qp *qp) +{ + int rval = 0; + + /* Remove QP from retry */ + if (qp->s_flags & RVT_S_TIMER) { + qp->s_flags &= ~RVT_S_TIMER; + rval = del_timer(&qp->s_timer); + } + return rval; +} + +/** + * hfi1_stop_rc_timers - stop all timers + * @qp - the QP + * + * stop any pending timers + */ +void hfi1_stop_rc_timers(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + /* Remove QP from all timers */ + if (qp->s_flags & (RVT_S_TIMER | RVT_S_WAIT_RNR)) { + qp->s_flags &= ~(RVT_S_TIMER | RVT_S_WAIT_RNR); + del_timer(&qp->s_timer); + del_timer(&priv->s_rnr_timer); + } +} + +/** + * hfi1_stop_rnr_timer - stop an rnr timer + * @qp - the QP + * + * stop an rnr timer and return if the timer + * had been pending. + */ +static inline int hfi1_stop_rnr_timer(struct rvt_qp *qp) +{ + int rval = 0; + struct hfi1_qp_priv *priv = qp->priv; + + /* Remove QP from rnr timer */ + if (qp->s_flags & RVT_S_WAIT_RNR) { + qp->s_flags &= ~RVT_S_WAIT_RNR; + rval = del_timer(&priv->s_rnr_timer); + } + return rval; +} + +/** + * hfi1_del_timers_sync - wait for any timeout routines to exit + * @qp - the QP + */ +void hfi1_del_timers_sync(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + del_timer_sync(&qp->s_timer); + del_timer_sync(&priv->s_rnr_timer); +} + +/* only opcode mask for adaptive pio */ +const u32 rc_only_opcode = + BIT(OP(SEND_ONLY) & 0x1f) | + BIT(OP(SEND_ONLY_WITH_IMMEDIATE & 0x1f)) | + BIT(OP(RDMA_WRITE_ONLY & 0x1f)) | + BIT(OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE & 0x1f)) | + BIT(OP(RDMA_READ_REQUEST & 0x1f)) | + BIT(OP(ACKNOWLEDGE & 0x1f)) | + BIT(OP(ATOMIC_ACKNOWLEDGE & 0x1f)) | + BIT(OP(COMPARE_SWAP & 0x1f)) | + BIT(OP(FETCH_ADD & 0x1f)); + +static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe, + u32 psn, u32 pmtu) +{ + u32 len; + + len = delta_psn(psn, wqe->psn) * pmtu; + ss->sge = wqe->sg_list[0]; + ss->sg_list = wqe->sg_list + 1; + ss->num_sge = wqe->wr.num_sge; + ss->total_len = wqe->length; + hfi1_skip_sge(ss, len, 0); + return wqe->length - len; +} + +/** + * make_rc_ack - construct a response packet (ACK, NAK, or RDMA read) + * @dev: the device for this QP + * @qp: a pointer to the QP + * @ohdr: a pointer to the IB header being constructed + * @ps: the xmit packet state + * + * Return 1 if constructed; otherwise, return 0. + * Note that we are in the responder's side of the QP context. + * Note the QP s_lock must be held. + */ +static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp, + struct hfi1_other_headers *ohdr, + struct hfi1_pkt_state *ps) +{ + struct rvt_ack_entry *e; + u32 hwords; + u32 len; + u32 bth0; + u32 bth2; + int middle = 0; + u32 pmtu = qp->pmtu; + struct hfi1_qp_priv *priv = qp->priv; + + /* Don't send an ACK if we aren't supposed to. */ + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) + goto bail; + + /* header size in 32-bit words LRH+BTH = (8+12)/4. */ + hwords = 5; + + switch (qp->s_ack_state) { + case OP(RDMA_READ_RESPONSE_LAST): + case OP(RDMA_READ_RESPONSE_ONLY): + e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + if (e->rdma_sge.mr) { + rvt_put_mr(e->rdma_sge.mr); + e->rdma_sge.mr = NULL; + } + /* FALLTHROUGH */ + case OP(ATOMIC_ACKNOWLEDGE): + /* + * We can increment the tail pointer now that the last + * response has been sent instead of only being + * constructed. + */ + if (++qp->s_tail_ack_queue > HFI1_MAX_RDMA_ATOMIC) + qp->s_tail_ack_queue = 0; + /* FALLTHROUGH */ + case OP(SEND_ONLY): + case OP(ACKNOWLEDGE): + /* Check for no next entry in the queue. */ + if (qp->r_head_ack_queue == qp->s_tail_ack_queue) { + if (qp->s_flags & RVT_S_ACK_PENDING) + goto normal; + goto bail; + } + + e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + if (e->opcode == OP(RDMA_READ_REQUEST)) { + /* + * If a RDMA read response is being resent and + * we haven't seen the duplicate request yet, + * then stop sending the remaining responses the + * responder has seen until the requester re-sends it. + */ + len = e->rdma_sge.sge_length; + if (len && !e->rdma_sge.mr) { + qp->s_tail_ack_queue = qp->r_head_ack_queue; + goto bail; + } + /* Copy SGE state in case we need to resend */ + ps->s_txreq->mr = e->rdma_sge.mr; + if (ps->s_txreq->mr) + rvt_get_mr(ps->s_txreq->mr); + qp->s_ack_rdma_sge.sge = e->rdma_sge; + qp->s_ack_rdma_sge.num_sge = 1; + qp->s_cur_sge = &qp->s_ack_rdma_sge; + if (len > pmtu) { + len = pmtu; + qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST); + } else { + qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY); + e->sent = 1; + } + ohdr->u.aeth = hfi1_compute_aeth(qp); + hwords++; + qp->s_ack_rdma_psn = e->psn; + bth2 = mask_psn(qp->s_ack_rdma_psn++); + } else { + /* COMPARE_SWAP or FETCH_ADD */ + qp->s_cur_sge = NULL; + len = 0; + qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE); + ohdr->u.at.aeth = hfi1_compute_aeth(qp); + ohdr->u.at.atomic_ack_eth[0] = + cpu_to_be32(e->atomic_data >> 32); + ohdr->u.at.atomic_ack_eth[1] = + cpu_to_be32(e->atomic_data); + hwords += sizeof(ohdr->u.at) / sizeof(u32); + bth2 = mask_psn(e->psn); + e->sent = 1; + } + bth0 = qp->s_ack_state << 24; + break; + + case OP(RDMA_READ_RESPONSE_FIRST): + qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE); + /* FALLTHROUGH */ + case OP(RDMA_READ_RESPONSE_MIDDLE): + qp->s_cur_sge = &qp->s_ack_rdma_sge; + ps->s_txreq->mr = qp->s_ack_rdma_sge.sge.mr; + if (ps->s_txreq->mr) + rvt_get_mr(ps->s_txreq->mr); + len = qp->s_ack_rdma_sge.sge.sge_length; + if (len > pmtu) { + len = pmtu; + middle = HFI1_CAP_IS_KSET(SDMA_AHG); + } else { + ohdr->u.aeth = hfi1_compute_aeth(qp); + hwords++; + qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST); + e = &qp->s_ack_queue[qp->s_tail_ack_queue]; + e->sent = 1; + } + bth0 = qp->s_ack_state << 24; + bth2 = mask_psn(qp->s_ack_rdma_psn++); + break; + + default: +normal: + /* + * Send a regular ACK. + * Set the s_ack_state so we wait until after sending + * the ACK before setting s_ack_state to ACKNOWLEDGE + * (see above). + */ + qp->s_ack_state = OP(SEND_ONLY); + qp->s_flags &= ~RVT_S_ACK_PENDING; + qp->s_cur_sge = NULL; + if (qp->s_nak_state) + ohdr->u.aeth = + cpu_to_be32((qp->r_msn & HFI1_MSN_MASK) | + (qp->s_nak_state << + HFI1_AETH_CREDIT_SHIFT)); + else + ohdr->u.aeth = hfi1_compute_aeth(qp); + hwords++; + len = 0; + bth0 = OP(ACKNOWLEDGE) << 24; + bth2 = mask_psn(qp->s_ack_psn); + } + qp->s_rdma_ack_cnt++; + qp->s_hdrwords = hwords; + ps->s_txreq->sde = priv->s_sde; + qp->s_cur_size = len; + hfi1_make_ruc_header(qp, ohdr, bth0, bth2, middle, ps); + /* pbc */ + ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2; + return 1; + +bail: + qp->s_ack_state = OP(ACKNOWLEDGE); + /* + * Ensure s_rdma_ack_cnt changes are committed prior to resetting + * RVT_S_RESP_PENDING + */ + smp_wmb(); + qp->s_flags &= ~(RVT_S_RESP_PENDING + | RVT_S_ACK_PENDING + | RVT_S_AHG_VALID); + return 0; +} + +/** + * hfi1_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC) + * @qp: a pointer to the QP + * + * Assumes s_lock is held. + * + * Return 1 if constructed; otherwise, return 0. + */ +int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_ibdev *dev = to_idev(qp->ibqp.device); + struct hfi1_other_headers *ohdr; + struct rvt_sge_state *ss; + struct rvt_swqe *wqe; + /* header size in 32-bit words LRH+BTH = (8+12)/4. */ + u32 hwords = 5; + u32 len; + u32 bth0 = 0; + u32 bth2; + u32 pmtu = qp->pmtu; + char newreq; + int middle = 0; + int delta; + + ps->s_txreq = get_txreq(ps->dev, qp); + if (IS_ERR(ps->s_txreq)) + goto bail_no_tx; + + ohdr = &ps->s_txreq->phdr.hdr.u.oth; + if (qp->remote_ah_attr.ah_flags & IB_AH_GRH) + ohdr = &ps->s_txreq->phdr.hdr.u.l.oth; + + /* Sending responses has higher priority over sending requests. */ + if ((qp->s_flags & RVT_S_RESP_PENDING) && + make_rc_ack(dev, qp, ohdr, ps)) + return 1; + + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) { + if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND)) + goto bail; + /* We are in the error state, flush the work request. */ + smp_read_barrier_depends(); /* see post_one_send() */ + if (qp->s_last == ACCESS_ONCE(qp->s_head)) + goto bail; + /* If DMAs are in progress, we can't flush immediately. */ + if (iowait_sdma_pending(&priv->s_iowait)) { + qp->s_flags |= RVT_S_WAIT_DMA; + goto bail; + } + clear_ahg(qp); + wqe = rvt_get_swqe_ptr(qp, qp->s_last); + hfi1_send_complete(qp, wqe, qp->s_last != qp->s_acked ? + IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR); + /* will get called again */ + goto done_free_tx; + } + + if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK)) + goto bail; + + if (cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) { + if (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) { + qp->s_flags |= RVT_S_WAIT_PSN; + goto bail; + } + qp->s_sending_psn = qp->s_psn; + qp->s_sending_hpsn = qp->s_psn - 1; + } + + /* Send a request. */ + wqe = rvt_get_swqe_ptr(qp, qp->s_cur); + switch (qp->s_state) { + default: + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK)) + goto bail; + /* + * Resend an old request or start a new one. + * + * We keep track of the current SWQE so that + * we don't reset the "furthest progress" state + * if we need to back up. + */ + newreq = 0; + if (qp->s_cur == qp->s_tail) { + /* Check if send work queue is empty. */ + if (qp->s_tail == qp->s_head) { + clear_ahg(qp); + goto bail; + } + /* + * If a fence is requested, wait for previous + * RDMA read and atomic operations to finish. + */ + if ((wqe->wr.send_flags & IB_SEND_FENCE) && + qp->s_num_rd_atomic) { + qp->s_flags |= RVT_S_WAIT_FENCE; + goto bail; + } + newreq = 1; + qp->s_psn = wqe->psn; + } + /* + * Note that we have to be careful not to modify the + * original work request since we may need to resend + * it. + */ + len = wqe->length; + ss = &qp->s_sge; + bth2 = mask_psn(qp->s_psn); + switch (wqe->wr.opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + /* If no credit, return. */ + if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) && + cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) { + qp->s_flags |= RVT_S_WAIT_SSN_CREDIT; + goto bail; + } + if (len > pmtu) { + qp->s_state = OP(SEND_FIRST); + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_SEND) { + qp->s_state = OP(SEND_ONLY); + } else { + qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + bth2 |= IB_BTH_REQ_ACK; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_RDMA_WRITE: + if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) + qp->s_lsn++; + /* FALLTHROUGH */ + case IB_WR_RDMA_WRITE_WITH_IMM: + /* If no credit, return. */ + if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) && + cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) { + qp->s_flags |= RVT_S_WAIT_SSN_CREDIT; + goto bail; + } + ohdr->u.rc.reth.vaddr = + cpu_to_be64(wqe->rdma_wr.remote_addr); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->rdma_wr.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(len); + hwords += sizeof(struct ib_reth) / sizeof(u32); + if (len > pmtu) { + qp->s_state = OP(RDMA_WRITE_FIRST); + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) { + qp->s_state = OP(RDMA_WRITE_ONLY); + } else { + qp->s_state = + OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE); + /* Immediate data comes after RETH */ + ohdr->u.rc.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + } + bth2 |= IB_BTH_REQ_ACK; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_RDMA_READ: + /* + * Don't allow more operations to be started + * than the QP limits allow. + */ + if (newreq) { + if (qp->s_num_rd_atomic >= + qp->s_max_rd_atomic) { + qp->s_flags |= RVT_S_WAIT_RDMAR; + goto bail; + } + qp->s_num_rd_atomic++; + if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) + qp->s_lsn++; + } + ohdr->u.rc.reth.vaddr = + cpu_to_be64(wqe->rdma_wr.remote_addr); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->rdma_wr.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(len); + qp->s_state = OP(RDMA_READ_REQUEST); + hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32); + ss = NULL; + len = 0; + bth2 |= IB_BTH_REQ_ACK; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + /* + * Don't allow more operations to be started + * than the QP limits allow. + */ + if (newreq) { + if (qp->s_num_rd_atomic >= + qp->s_max_rd_atomic) { + qp->s_flags |= RVT_S_WAIT_RDMAR; + goto bail; + } + qp->s_num_rd_atomic++; + if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT)) + qp->s_lsn++; + } + if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) { + qp->s_state = OP(COMPARE_SWAP); + ohdr->u.atomic_eth.swap_data = cpu_to_be64( + wqe->atomic_wr.swap); + ohdr->u.atomic_eth.compare_data = cpu_to_be64( + wqe->atomic_wr.compare_add); + } else { + qp->s_state = OP(FETCH_ADD); + ohdr->u.atomic_eth.swap_data = cpu_to_be64( + wqe->atomic_wr.compare_add); + ohdr->u.atomic_eth.compare_data = 0; + } + ohdr->u.atomic_eth.vaddr[0] = cpu_to_be32( + wqe->atomic_wr.remote_addr >> 32); + ohdr->u.atomic_eth.vaddr[1] = cpu_to_be32( + wqe->atomic_wr.remote_addr); + ohdr->u.atomic_eth.rkey = cpu_to_be32( + wqe->atomic_wr.rkey); + hwords += sizeof(struct ib_atomic_eth) / sizeof(u32); + ss = NULL; + len = 0; + bth2 |= IB_BTH_REQ_ACK; + if (++qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + + default: + goto bail; + } + qp->s_sge.sge = wqe->sg_list[0]; + qp->s_sge.sg_list = wqe->sg_list + 1; + qp->s_sge.num_sge = wqe->wr.num_sge; + qp->s_sge.total_len = wqe->length; + qp->s_len = wqe->length; + if (newreq) { + qp->s_tail++; + if (qp->s_tail >= qp->s_size) + qp->s_tail = 0; + } + if (wqe->wr.opcode == IB_WR_RDMA_READ) + qp->s_psn = wqe->lpsn + 1; + else + qp->s_psn++; + break; + + case OP(RDMA_READ_RESPONSE_FIRST): + /* + * qp->s_state is normally set to the opcode of the + * last packet constructed for new requests and therefore + * is never set to RDMA read response. + * RDMA_READ_RESPONSE_FIRST is used by the ACK processing + * thread to indicate a SEND needs to be restarted from an + * earlier PSN without interfering with the sending thread. + * See restart_rc(). + */ + qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu); + /* FALLTHROUGH */ + case OP(SEND_FIRST): + qp->s_state = OP(SEND_MIDDLE); + /* FALLTHROUGH */ + case OP(SEND_MIDDLE): + bth2 = mask_psn(qp->s_psn++); + ss = &qp->s_sge; + len = qp->s_len; + if (len > pmtu) { + len = pmtu; + middle = HFI1_CAP_IS_KSET(SDMA_AHG); + break; + } + if (wqe->wr.opcode == IB_WR_SEND) { + qp->s_state = OP(SEND_LAST); + } else { + qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + bth2 |= IB_BTH_REQ_ACK; + qp->s_cur++; + if (qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + case OP(RDMA_READ_RESPONSE_LAST): + /* + * qp->s_state is normally set to the opcode of the + * last packet constructed for new requests and therefore + * is never set to RDMA read response. + * RDMA_READ_RESPONSE_LAST is used by the ACK processing + * thread to indicate a RDMA write needs to be restarted from + * an earlier PSN without interfering with the sending thread. + * See restart_rc(). + */ + qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu); + /* FALLTHROUGH */ + case OP(RDMA_WRITE_FIRST): + qp->s_state = OP(RDMA_WRITE_MIDDLE); + /* FALLTHROUGH */ + case OP(RDMA_WRITE_MIDDLE): + bth2 = mask_psn(qp->s_psn++); + ss = &qp->s_sge; + len = qp->s_len; + if (len > pmtu) { + len = pmtu; + middle = HFI1_CAP_IS_KSET(SDMA_AHG); + break; + } + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) { + qp->s_state = OP(RDMA_WRITE_LAST); + } else { + qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + } + bth2 |= IB_BTH_REQ_ACK; + qp->s_cur++; + if (qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + case OP(RDMA_READ_RESPONSE_MIDDLE): + /* + * qp->s_state is normally set to the opcode of the + * last packet constructed for new requests and therefore + * is never set to RDMA read response. + * RDMA_READ_RESPONSE_MIDDLE is used by the ACK processing + * thread to indicate a RDMA read needs to be restarted from + * an earlier PSN without interfering with the sending thread. + * See restart_rc(). + */ + len = (delta_psn(qp->s_psn, wqe->psn)) * pmtu; + ohdr->u.rc.reth.vaddr = + cpu_to_be64(wqe->rdma_wr.remote_addr + len); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->rdma_wr.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(wqe->length - len); + qp->s_state = OP(RDMA_READ_REQUEST); + hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32); + bth2 = mask_psn(qp->s_psn) | IB_BTH_REQ_ACK; + qp->s_psn = wqe->lpsn + 1; + ss = NULL; + len = 0; + qp->s_cur++; + if (qp->s_cur == qp->s_size) + qp->s_cur = 0; + break; + } + qp->s_sending_hpsn = bth2; + delta = delta_psn(bth2, wqe->psn); + if (delta && delta % HFI1_PSN_CREDIT == 0) + bth2 |= IB_BTH_REQ_ACK; + if (qp->s_flags & RVT_S_SEND_ONE) { + qp->s_flags &= ~RVT_S_SEND_ONE; + qp->s_flags |= RVT_S_WAIT_ACK; + bth2 |= IB_BTH_REQ_ACK; + } + qp->s_len -= len; + qp->s_hdrwords = hwords; + ps->s_txreq->sde = priv->s_sde; + qp->s_cur_sge = ss; + qp->s_cur_size = len; + hfi1_make_ruc_header( + qp, + ohdr, + bth0 | (qp->s_state << 24), + bth2, + middle, + ps); + /* pbc */ + ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2; + return 1; + +done_free_tx: + hfi1_put_txreq(ps->s_txreq); + ps->s_txreq = NULL; + return 1; + +bail: + hfi1_put_txreq(ps->s_txreq); + +bail_no_tx: + ps->s_txreq = NULL; + qp->s_flags &= ~RVT_S_BUSY; + qp->s_hdrwords = 0; + return 0; +} + +/** + * hfi1_send_rc_ack - Construct an ACK packet and send it + * @qp: a pointer to the QP + * + * This is called from hfi1_rc_rcv() and handle_receive_interrupt(). + * Note that RDMA reads and atomics are handled in the + * send side QP state and tasklet. + */ +void hfi1_send_rc_ack(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp, + int is_fecn) +{ + struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u64 pbc, pbc_flags = 0; + u16 lrh0; + u16 sc5; + u32 bth0; + u32 hwords; + u32 vl, plen; + struct send_context *sc; + struct pio_buf *pbuf; + struct hfi1_ib_header hdr; + struct hfi1_other_headers *ohdr; + unsigned long flags; + + /* Don't send ACK or NAK if a RDMA read or atomic is pending. */ + if (qp->s_flags & RVT_S_RESP_PENDING) + goto queue_ack; + + /* Ensure s_rdma_ack_cnt changes are committed */ + smp_read_barrier_depends(); + if (qp->s_rdma_ack_cnt) + goto queue_ack; + + /* Construct the header */ + /* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4 */ + hwords = 6; + if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) { + hwords += hfi1_make_grh(ibp, &hdr.u.l.grh, + &qp->remote_ah_attr.grh, hwords, 0); + ohdr = &hdr.u.l.oth; + lrh0 = HFI1_LRH_GRH; + } else { + ohdr = &hdr.u.oth; + lrh0 = HFI1_LRH_BTH; + } + /* read pkey_index w/o lock (its atomic) */ + bth0 = hfi1_get_pkey(ibp, qp->s_pkey_index) | (OP(ACKNOWLEDGE) << 24); + if (qp->s_mig_state == IB_MIG_MIGRATED) + bth0 |= IB_BTH_MIG_REQ; + if (qp->r_nak_state) + ohdr->u.aeth = cpu_to_be32((qp->r_msn & HFI1_MSN_MASK) | + (qp->r_nak_state << + HFI1_AETH_CREDIT_SHIFT)); + else + ohdr->u.aeth = hfi1_compute_aeth(qp); + sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl]; + /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */ + pbc_flags |= ((!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT); + lrh0 |= (sc5 & 0xf) << 12 | (qp->remote_ah_attr.sl & 0xf) << 4; + hdr.lrh[0] = cpu_to_be16(lrh0); + hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid); + hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC); + hdr.lrh[3] = cpu_to_be16(ppd->lid | qp->remote_ah_attr.src_path_bits); + ohdr->bth[0] = cpu_to_be32(bth0); + ohdr->bth[1] = cpu_to_be32(qp->remote_qpn); + ohdr->bth[1] |= cpu_to_be32((!!is_fecn) << HFI1_BECN_SHIFT); + ohdr->bth[2] = cpu_to_be32(mask_psn(qp->r_ack_psn)); + + /* Don't try to send ACKs if the link isn't ACTIVE */ + if (driver_lstate(ppd) != IB_PORT_ACTIVE) + return; + + sc = rcd->sc; + plen = 2 /* PBC */ + hwords; + vl = sc_to_vlt(ppd->dd, sc5); + pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen); + + pbuf = sc_buffer_alloc(sc, plen, NULL, NULL); + if (!pbuf) { + /* + * We have no room to send at the moment. Pass + * responsibility for sending the ACK to the send tasklet + * so that when enough buffer space becomes available, + * the ACK is sent ahead of other outgoing packets. + */ + goto queue_ack; + } + + trace_ack_output_ibhdr(dd_from_ibdev(qp->ibqp.device), &hdr); + + /* write the pbc and data */ + ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc, &hdr, hwords); + + return; + +queue_ack: + this_cpu_inc(*ibp->rvp.rc_qacks); + spin_lock_irqsave(&qp->s_lock, flags); + qp->s_flags |= RVT_S_ACK_PENDING | RVT_S_RESP_PENDING; + qp->s_nak_state = qp->r_nak_state; + qp->s_ack_psn = qp->r_ack_psn; + if (is_fecn) + qp->s_flags |= RVT_S_ECN; + + /* Schedule the send tasklet. */ + hfi1_schedule_send(qp); + spin_unlock_irqrestore(&qp->s_lock, flags); +} + +/** + * reset_psn - reset the QP state to send starting from PSN + * @qp: the QP + * @psn: the packet sequence number to restart at + * + * This is called from hfi1_rc_rcv() to process an incoming RC ACK + * for the given QP. + * Called at interrupt level with the QP s_lock held. + */ +static void reset_psn(struct rvt_qp *qp, u32 psn) +{ + u32 n = qp->s_acked; + struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, n); + u32 opcode; + + qp->s_cur = n; + + /* + * If we are starting the request from the beginning, + * let the normal send code handle initialization. + */ + if (cmp_psn(psn, wqe->psn) <= 0) { + qp->s_state = OP(SEND_LAST); + goto done; + } + + /* Find the work request opcode corresponding to the given PSN. */ + opcode = wqe->wr.opcode; + for (;;) { + int diff; + + if (++n == qp->s_size) + n = 0; + if (n == qp->s_tail) + break; + wqe = rvt_get_swqe_ptr(qp, n); + diff = cmp_psn(psn, wqe->psn); + if (diff < 0) + break; + qp->s_cur = n; + /* + * If we are starting the request from the beginning, + * let the normal send code handle initialization. + */ + if (diff == 0) { + qp->s_state = OP(SEND_LAST); + goto done; + } + opcode = wqe->wr.opcode; + } + + /* + * Set the state to restart in the middle of a request. + * Don't change the s_sge, s_cur_sge, or s_cur_size. + * See hfi1_make_rc_req(). + */ + switch (opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + qp->s_state = OP(RDMA_READ_RESPONSE_FIRST); + break; + + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + qp->s_state = OP(RDMA_READ_RESPONSE_LAST); + break; + + case IB_WR_RDMA_READ: + qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE); + break; + + default: + /* + * This case shouldn't happen since its only + * one PSN per req. + */ + qp->s_state = OP(SEND_LAST); + } +done: + qp->s_psn = psn; + /* + * Set RVT_S_WAIT_PSN as rc_complete() may start the timer + * asynchronously before the send tasklet can get scheduled. + * Doing it in hfi1_make_rc_req() is too late. + */ + if ((cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) && + (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)) + qp->s_flags |= RVT_S_WAIT_PSN; + qp->s_flags &= ~RVT_S_AHG_VALID; +} + +/* + * Back up requester to resend the last un-ACKed request. + * The QP r_lock and s_lock should be held and interrupts disabled. + */ +static void restart_rc(struct rvt_qp *qp, u32 psn, int wait) +{ + struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + struct hfi1_ibport *ibp; + + if (qp->s_retry == 0) { + if (qp->s_mig_state == IB_MIG_ARMED) { + hfi1_migrate_qp(qp); + qp->s_retry = qp->s_retry_cnt; + } else if (qp->s_last == qp->s_acked) { + hfi1_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR); + rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); + return; + } else { /* need to handle delayed completion */ + return; + } + } else { + qp->s_retry--; + } + + ibp = to_iport(qp->ibqp.device, qp->port_num); + if (wqe->wr.opcode == IB_WR_RDMA_READ) + ibp->rvp.n_rc_resends++; + else + ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn); + + qp->s_flags &= ~(RVT_S_WAIT_FENCE | RVT_S_WAIT_RDMAR | + RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_PSN | + RVT_S_WAIT_ACK); + if (wait) + qp->s_flags |= RVT_S_SEND_ONE; + reset_psn(qp, psn); +} + +/* + * This is called from s_timer for missing responses. + */ +void hfi1_rc_timeout(unsigned long arg) +{ + struct rvt_qp *qp = (struct rvt_qp *)arg; + struct hfi1_ibport *ibp; + unsigned long flags; + + spin_lock_irqsave(&qp->r_lock, flags); + spin_lock(&qp->s_lock); + if (qp->s_flags & RVT_S_TIMER) { + ibp = to_iport(qp->ibqp.device, qp->port_num); + ibp->rvp.n_rc_timeouts++; + qp->s_flags &= ~RVT_S_TIMER; + del_timer(&qp->s_timer); + trace_hfi1_rc_timeout(qp, qp->s_last_psn + 1); + restart_rc(qp, qp->s_last_psn + 1, 1); + hfi1_schedule_send(qp); + } + spin_unlock(&qp->s_lock); + spin_unlock_irqrestore(&qp->r_lock, flags); +} + +/* + * This is called from s_timer for RNR timeouts. + */ +void hfi1_rc_rnr_retry(unsigned long arg) +{ + struct rvt_qp *qp = (struct rvt_qp *)arg; + unsigned long flags; + + spin_lock_irqsave(&qp->s_lock, flags); + hfi1_stop_rnr_timer(qp); + hfi1_schedule_send(qp); + spin_unlock_irqrestore(&qp->s_lock, flags); +} + +/* + * Set qp->s_sending_psn to the next PSN after the given one. + * This would be psn+1 except when RDMA reads are present. + */ +static void reset_sending_psn(struct rvt_qp *qp, u32 psn) +{ + struct rvt_swqe *wqe; + u32 n = qp->s_last; + + /* Find the work request corresponding to the given PSN. */ + for (;;) { + wqe = rvt_get_swqe_ptr(qp, n); + if (cmp_psn(psn, wqe->lpsn) <= 0) { + if (wqe->wr.opcode == IB_WR_RDMA_READ) + qp->s_sending_psn = wqe->lpsn + 1; + else + qp->s_sending_psn = psn + 1; + break; + } + if (++n == qp->s_size) + n = 0; + if (n == qp->s_tail) + break; + } +} + +/* + * This should be called with the QP s_lock held and interrupts disabled. + */ +void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_ib_header *hdr) +{ + struct hfi1_other_headers *ohdr; + struct rvt_swqe *wqe; + struct ib_wc wc; + unsigned i; + u32 opcode; + u32 psn; + + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND)) + return; + + /* Find out where the BTH is */ + if ((be16_to_cpu(hdr->lrh[0]) & 3) == HFI1_LRH_BTH) + ohdr = &hdr->u.oth; + else + ohdr = &hdr->u.l.oth; + + opcode = be32_to_cpu(ohdr->bth[0]) >> 24; + if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) && + opcode <= OP(ATOMIC_ACKNOWLEDGE)) { + WARN_ON(!qp->s_rdma_ack_cnt); + qp->s_rdma_ack_cnt--; + return; + } + + psn = be32_to_cpu(ohdr->bth[2]); + reset_sending_psn(qp, psn); + + /* + * Start timer after a packet requesting an ACK has been sent and + * there are still requests that haven't been acked. + */ + if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail && + !(qp->s_flags & + (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) && + (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) + hfi1_add_retry_timer(qp); + + while (qp->s_last != qp->s_acked) { + u32 s_last; + + wqe = rvt_get_swqe_ptr(qp, qp->s_last); + if (cmp_psn(wqe->lpsn, qp->s_sending_psn) >= 0 && + cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) + break; + s_last = qp->s_last; + if (++s_last >= qp->s_size) + s_last = 0; + qp->s_last = s_last; + /* see post_send() */ + barrier(); + for (i = 0; i < wqe->wr.num_sge; i++) { + struct rvt_sge *sge = &wqe->sg_list[i]; + + rvt_put_mr(sge->mr); + } + /* Post a send completion queue entry if requested. */ + if (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) || + (wqe->wr.send_flags & IB_SEND_SIGNALED)) { + memset(&wc, 0, sizeof(wc)); + wc.wr_id = wqe->wr.wr_id; + wc.status = IB_WC_SUCCESS; + wc.opcode = ib_hfi1_wc_opcode[wqe->wr.opcode]; + wc.byte_len = wqe->length; + wc.qp = &qp->ibqp; + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &wc, 0); + } + } + /* + * If we were waiting for sends to complete before re-sending, + * and they are now complete, restart sending. + */ + trace_hfi1_rc_sendcomplete(qp, psn); + if (qp->s_flags & RVT_S_WAIT_PSN && + cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) { + qp->s_flags &= ~RVT_S_WAIT_PSN; + qp->s_sending_psn = qp->s_psn; + qp->s_sending_hpsn = qp->s_psn - 1; + hfi1_schedule_send(qp); + } +} + +static inline void update_last_psn(struct rvt_qp *qp, u32 psn) +{ + qp->s_last_psn = psn; +} + +/* + * Generate a SWQE completion. + * This is similar to hfi1_send_complete but has to check to be sure + * that the SGEs are not being referenced if the SWQE is being resent. + */ +static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp, + struct rvt_swqe *wqe, + struct hfi1_ibport *ibp) +{ + struct ib_wc wc; + unsigned i; + + /* + * Don't decrement refcount and don't generate a + * completion if the SWQE is being resent until the send + * is finished. + */ + if (cmp_psn(wqe->lpsn, qp->s_sending_psn) < 0 || + cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) { + u32 s_last; + + for (i = 0; i < wqe->wr.num_sge; i++) { + struct rvt_sge *sge = &wqe->sg_list[i]; + + rvt_put_mr(sge->mr); + } + s_last = qp->s_last; + if (++s_last >= qp->s_size) + s_last = 0; + qp->s_last = s_last; + /* see post_send() */ + barrier(); + /* Post a send completion queue entry if requested. */ + if (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) || + (wqe->wr.send_flags & IB_SEND_SIGNALED)) { + memset(&wc, 0, sizeof(wc)); + wc.wr_id = wqe->wr.wr_id; + wc.status = IB_WC_SUCCESS; + wc.opcode = ib_hfi1_wc_opcode[wqe->wr.opcode]; + wc.byte_len = wqe->length; + wc.qp = &qp->ibqp; + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &wc, 0); + } + } else { + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + + this_cpu_inc(*ibp->rvp.rc_delayed_comp); + /* + * If send progress not running attempt to progress + * SDMA queue. + */ + if (ppd->dd->flags & HFI1_HAS_SEND_DMA) { + struct sdma_engine *engine; + u8 sc5; + + /* For now use sc to find engine */ + sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl]; + engine = qp_to_sdma_engine(qp, sc5); + sdma_engine_progress_schedule(engine); + } + } + + qp->s_retry = qp->s_retry_cnt; + update_last_psn(qp, wqe->lpsn); + + /* + * If we are completing a request which is in the process of + * being resent, we can stop re-sending it since we know the + * responder has already seen it. + */ + if (qp->s_acked == qp->s_cur) { + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + qp->s_acked = qp->s_cur; + wqe = rvt_get_swqe_ptr(qp, qp->s_cur); + if (qp->s_acked != qp->s_tail) { + qp->s_state = OP(SEND_LAST); + qp->s_psn = wqe->psn; + } + } else { + if (++qp->s_acked >= qp->s_size) + qp->s_acked = 0; + if (qp->state == IB_QPS_SQD && qp->s_acked == qp->s_cur) + qp->s_draining = 0; + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + } + return wqe; +} + +/** + * do_rc_ack - process an incoming RC ACK + * @qp: the QP the ACK came in on + * @psn: the packet sequence number of the ACK + * @opcode: the opcode of the request that resulted in the ACK + * + * This is called from rc_rcv_resp() to process an incoming RC ACK + * for the given QP. + * May be called at interrupt level, with the QP s_lock held. + * Returns 1 if OK, 0 if current operation should be aborted (NAK). + */ +static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, + u64 val, struct hfi1_ctxtdata *rcd) +{ + struct hfi1_ibport *ibp; + enum ib_wc_status status; + struct rvt_swqe *wqe; + int ret = 0; + u32 ack_psn; + int diff; + unsigned long to; + + /* + * Note that NAKs implicitly ACK outstanding SEND and RDMA write + * requests and implicitly NAK RDMA read and atomic requests issued + * before the NAK'ed request. The MSN won't include the NAK'ed + * request but will include an ACK'ed request(s). + */ + ack_psn = psn; + if (aeth >> 29) + ack_psn--; + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + ibp = to_iport(qp->ibqp.device, qp->port_num); + + /* + * The MSN might be for a later WQE than the PSN indicates so + * only complete WQEs that the PSN finishes. + */ + while ((diff = delta_psn(ack_psn, wqe->lpsn)) >= 0) { + /* + * RDMA_READ_RESPONSE_ONLY is a special case since + * we want to generate completion events for everything + * before the RDMA read, copy the data, then generate + * the completion for the read. + */ + if (wqe->wr.opcode == IB_WR_RDMA_READ && + opcode == OP(RDMA_READ_RESPONSE_ONLY) && + diff == 0) { + ret = 1; + goto bail_stop; + } + /* + * If this request is a RDMA read or atomic, and the ACK is + * for a later operation, this ACK NAKs the RDMA read or + * atomic. In other words, only a RDMA_READ_LAST or ONLY + * can ACK a RDMA read and likewise for atomic ops. Note + * that the NAK case can only happen if relaxed ordering is + * used and requests are sent after an RDMA read or atomic + * is sent but before the response is received. + */ + if ((wqe->wr.opcode == IB_WR_RDMA_READ && + (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) || + ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) && + (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) { + /* Retry this request. */ + if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) { + qp->r_flags |= RVT_R_RDMAR_SEQ; + restart_rc(qp, qp->s_last_psn + 1, 0); + if (list_empty(&qp->rspwait)) { + qp->r_flags |= RVT_R_RSP_SEND; + atomic_inc(&qp->refcount); + list_add_tail(&qp->rspwait, + &rcd->qp_wait_list); + } + } + /* + * No need to process the ACK/NAK since we are + * restarting an earlier request. + */ + goto bail_stop; + } + if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) { + u64 *vaddr = wqe->sg_list[0].vaddr; + *vaddr = val; + } + if (qp->s_num_rd_atomic && + (wqe->wr.opcode == IB_WR_RDMA_READ || + wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) { + qp->s_num_rd_atomic--; + /* Restart sending task if fence is complete */ + if ((qp->s_flags & RVT_S_WAIT_FENCE) && + !qp->s_num_rd_atomic) { + qp->s_flags &= ~(RVT_S_WAIT_FENCE | + RVT_S_WAIT_ACK); + hfi1_schedule_send(qp); + } else if (qp->s_flags & RVT_S_WAIT_RDMAR) { + qp->s_flags &= ~(RVT_S_WAIT_RDMAR | + RVT_S_WAIT_ACK); + hfi1_schedule_send(qp); + } + } + wqe = do_rc_completion(qp, wqe, ibp); + if (qp->s_acked == qp->s_tail) + break; + } + + switch (aeth >> 29) { + case 0: /* ACK */ + this_cpu_inc(*ibp->rvp.rc_acks); + if (qp->s_acked != qp->s_tail) { + /* + * We are expecting more ACKs so + * mod the retry timer. + */ + hfi1_mod_retry_timer(qp); + /* + * We can stop re-sending the earlier packets and + * continue with the next packet the receiver wants. + */ + if (cmp_psn(qp->s_psn, psn) <= 0) + reset_psn(qp, psn + 1); + } else { + /* No more acks - kill all timers */ + hfi1_stop_rc_timers(qp); + if (cmp_psn(qp->s_psn, psn) <= 0) { + qp->s_state = OP(SEND_LAST); + qp->s_psn = psn + 1; + } + } + if (qp->s_flags & RVT_S_WAIT_ACK) { + qp->s_flags &= ~RVT_S_WAIT_ACK; + hfi1_schedule_send(qp); + } + hfi1_get_credit(qp, aeth); + qp->s_rnr_retry = qp->s_rnr_retry_cnt; + qp->s_retry = qp->s_retry_cnt; + update_last_psn(qp, psn); + return 1; + + case 1: /* RNR NAK */ + ibp->rvp.n_rnr_naks++; + if (qp->s_acked == qp->s_tail) + goto bail_stop; + if (qp->s_flags & RVT_S_WAIT_RNR) + goto bail_stop; + if (qp->s_rnr_retry == 0) { + status = IB_WC_RNR_RETRY_EXC_ERR; + goto class_b; + } + if (qp->s_rnr_retry_cnt < 7) + qp->s_rnr_retry--; + + /* The last valid PSN is the previous PSN. */ + update_last_psn(qp, psn - 1); + + ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn); + + reset_psn(qp, psn); + + qp->s_flags &= ~(RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_ACK); + hfi1_stop_rc_timers(qp); + to = + ib_hfi1_rnr_table[(aeth >> HFI1_AETH_CREDIT_SHIFT) & + HFI1_AETH_CREDIT_MASK]; + hfi1_add_rnr_timer(qp, to); + return 0; + + case 3: /* NAK */ + if (qp->s_acked == qp->s_tail) + goto bail_stop; + /* The last valid PSN is the previous PSN. */ + update_last_psn(qp, psn - 1); + switch ((aeth >> HFI1_AETH_CREDIT_SHIFT) & + HFI1_AETH_CREDIT_MASK) { + case 0: /* PSN sequence error */ + ibp->rvp.n_seq_naks++; + /* + * Back up to the responder's expected PSN. + * Note that we might get a NAK in the middle of an + * RDMA READ response which terminates the RDMA + * READ. + */ + restart_rc(qp, psn, 0); + hfi1_schedule_send(qp); + break; + + case 1: /* Invalid Request */ + status = IB_WC_REM_INV_REQ_ERR; + ibp->rvp.n_other_naks++; + goto class_b; + + case 2: /* Remote Access Error */ + status = IB_WC_REM_ACCESS_ERR; + ibp->rvp.n_other_naks++; + goto class_b; + + case 3: /* Remote Operation Error */ + status = IB_WC_REM_OP_ERR; + ibp->rvp.n_other_naks++; +class_b: + if (qp->s_last == qp->s_acked) { + hfi1_send_complete(qp, wqe, status); + rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); + } + break; + + default: + /* Ignore other reserved NAK error codes */ + goto reserved; + } + qp->s_retry = qp->s_retry_cnt; + qp->s_rnr_retry = qp->s_rnr_retry_cnt; + goto bail_stop; + + default: /* 2: reserved */ +reserved: + /* Ignore reserved NAK codes. */ + goto bail_stop; + } + /* cannot be reached */ +bail_stop: + hfi1_stop_rc_timers(qp); + return ret; +} + +/* + * We have seen an out of sequence RDMA read middle or last packet. + * This ACKs SENDs and RDMA writes up to the first RDMA read or atomic SWQE. + */ +static void rdma_seq_err(struct rvt_qp *qp, struct hfi1_ibport *ibp, u32 psn, + struct hfi1_ctxtdata *rcd) +{ + struct rvt_swqe *wqe; + + /* Remove QP from retry timer */ + hfi1_stop_rc_timers(qp); + + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + + while (cmp_psn(psn, wqe->lpsn) > 0) { + if (wqe->wr.opcode == IB_WR_RDMA_READ || + wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) + break; + wqe = do_rc_completion(qp, wqe, ibp); + } + + ibp->rvp.n_rdma_seq++; + qp->r_flags |= RVT_R_RDMAR_SEQ; + restart_rc(qp, qp->s_last_psn + 1, 0); + if (list_empty(&qp->rspwait)) { + qp->r_flags |= RVT_R_RSP_SEND; + atomic_inc(&qp->refcount); + list_add_tail(&qp->rspwait, &rcd->qp_wait_list); + } +} + +/** + * rc_rcv_resp - process an incoming RC response packet + * @ibp: the port this packet came in on + * @ohdr: the other headers for this packet + * @data: the packet data + * @tlen: the packet length + * @qp: the QP for this packet + * @opcode: the opcode for this packet + * @psn: the packet sequence number for this packet + * @hdrsize: the header length + * @pmtu: the path MTU + * + * This is called from hfi1_rc_rcv() to process an incoming RC response + * packet for the given QP. + * Called at interrupt level. + */ +static void rc_rcv_resp(struct hfi1_ibport *ibp, + struct hfi1_other_headers *ohdr, + void *data, u32 tlen, struct rvt_qp *qp, + u32 opcode, u32 psn, u32 hdrsize, u32 pmtu, + struct hfi1_ctxtdata *rcd) +{ + struct rvt_swqe *wqe; + enum ib_wc_status status; + unsigned long flags; + int diff; + u32 pad; + u32 aeth; + u64 val; + + spin_lock_irqsave(&qp->s_lock, flags); + + trace_hfi1_rc_ack(qp, psn); + + /* Ignore invalid responses. */ + smp_read_barrier_depends(); /* see post_one_send */ + if (cmp_psn(psn, ACCESS_ONCE(qp->s_next_psn)) >= 0) + goto ack_done; + + /* Ignore duplicate responses. */ + diff = cmp_psn(psn, qp->s_last_psn); + if (unlikely(diff <= 0)) { + /* Update credits for "ghost" ACKs */ + if (diff == 0 && opcode == OP(ACKNOWLEDGE)) { + aeth = be32_to_cpu(ohdr->u.aeth); + if ((aeth >> 29) == 0) + hfi1_get_credit(qp, aeth); + } + goto ack_done; + } + + /* + * Skip everything other than the PSN we expect, if we are waiting + * for a reply to a restarted RDMA read or atomic op. + */ + if (qp->r_flags & RVT_R_RDMAR_SEQ) { + if (cmp_psn(psn, qp->s_last_psn + 1) != 0) + goto ack_done; + qp->r_flags &= ~RVT_R_RDMAR_SEQ; + } + + if (unlikely(qp->s_acked == qp->s_tail)) + goto ack_done; + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + status = IB_WC_SUCCESS; + + switch (opcode) { + case OP(ACKNOWLEDGE): + case OP(ATOMIC_ACKNOWLEDGE): + case OP(RDMA_READ_RESPONSE_FIRST): + aeth = be32_to_cpu(ohdr->u.aeth); + if (opcode == OP(ATOMIC_ACKNOWLEDGE)) { + __be32 *p = ohdr->u.at.atomic_ack_eth; + + val = ((u64)be32_to_cpu(p[0]) << 32) | + be32_to_cpu(p[1]); + } else { + val = 0; + } + if (!do_rc_ack(qp, aeth, psn, opcode, val, rcd) || + opcode != OP(RDMA_READ_RESPONSE_FIRST)) + goto ack_done; + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) + goto ack_op_err; + /* + * If this is a response to a resent RDMA read, we + * have to be careful to copy the data to the right + * location. + */ + qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge, + wqe, psn, pmtu); + goto read_middle; + + case OP(RDMA_READ_RESPONSE_MIDDLE): + /* no AETH, no ACK */ + if (unlikely(cmp_psn(psn, qp->s_last_psn + 1))) + goto ack_seq_err; + if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) + goto ack_op_err; +read_middle: + if (unlikely(tlen != (hdrsize + pmtu + 4))) + goto ack_len_err; + if (unlikely(pmtu >= qp->s_rdma_read_len)) + goto ack_len_err; + + /* + * We got a response so update the timeout. + * 4.096 usec. * (1 << qp->timeout) + */ + qp->s_flags |= RVT_S_TIMER; + mod_timer(&qp->s_timer, jiffies + qp->timeout_jiffies); + if (qp->s_flags & RVT_S_WAIT_ACK) { + qp->s_flags &= ~RVT_S_WAIT_ACK; + hfi1_schedule_send(qp); + } + + if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE)) + qp->s_retry = qp->s_retry_cnt; + + /* + * Update the RDMA receive state but do the copy w/o + * holding the locks and blocking interrupts. + */ + qp->s_rdma_read_len -= pmtu; + update_last_psn(qp, psn); + spin_unlock_irqrestore(&qp->s_lock, flags); + hfi1_copy_sge(&qp->s_rdma_read_sge, data, pmtu, 0, 0); + goto bail; + + case OP(RDMA_READ_RESPONSE_ONLY): + aeth = be32_to_cpu(ohdr->u.aeth); + if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd)) + goto ack_done; + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* + * Check that the data size is >= 0 && <= pmtu. + * Remember to account for ICRC (4). + */ + if (unlikely(tlen < (hdrsize + pad + 4))) + goto ack_len_err; + /* + * If this is a response to a resent RDMA read, we + * have to be careful to copy the data to the right + * location. + */ + wqe = rvt_get_swqe_ptr(qp, qp->s_acked); + qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge, + wqe, psn, pmtu); + goto read_last; + + case OP(RDMA_READ_RESPONSE_LAST): + /* ACKs READ req. */ + if (unlikely(cmp_psn(psn, qp->s_last_psn + 1))) + goto ack_seq_err; + if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ)) + goto ack_op_err; + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* + * Check that the data size is >= 1 && <= pmtu. + * Remember to account for ICRC (4). + */ + if (unlikely(tlen <= (hdrsize + pad + 4))) + goto ack_len_err; +read_last: + tlen -= hdrsize + pad + 4; + if (unlikely(tlen != qp->s_rdma_read_len)) + goto ack_len_err; + aeth = be32_to_cpu(ohdr->u.aeth); + hfi1_copy_sge(&qp->s_rdma_read_sge, data, tlen, 0, 0); + WARN_ON(qp->s_rdma_read_sge.num_sge); + (void)do_rc_ack(qp, aeth, psn, + OP(RDMA_READ_RESPONSE_LAST), 0, rcd); + goto ack_done; + } + +ack_op_err: + status = IB_WC_LOC_QP_OP_ERR; + goto ack_err; + +ack_seq_err: + rdma_seq_err(qp, ibp, psn, rcd); + goto ack_done; + +ack_len_err: + status = IB_WC_LOC_LEN_ERR; +ack_err: + if (qp->s_last == qp->s_acked) { + hfi1_send_complete(qp, wqe, status); + rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); + } +ack_done: + spin_unlock_irqrestore(&qp->s_lock, flags); +bail: + return; +} + +static inline void rc_defered_ack(struct hfi1_ctxtdata *rcd, + struct rvt_qp *qp) +{ + if (list_empty(&qp->rspwait)) { + qp->r_flags |= RVT_R_RSP_NAK; + atomic_inc(&qp->refcount); + list_add_tail(&qp->rspwait, &rcd->qp_wait_list); + } +} + +static inline void rc_cancel_ack(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + priv->r_adefered = 0; + if (list_empty(&qp->rspwait)) + return; + list_del_init(&qp->rspwait); + qp->r_flags &= ~RVT_R_RSP_NAK; + if (atomic_dec_and_test(&qp->refcount)) + wake_up(&qp->wait); +} + +/** + * rc_rcv_error - process an incoming duplicate or error RC packet + * @ohdr: the other headers for this packet + * @data: the packet data + * @qp: the QP for this packet + * @opcode: the opcode for this packet + * @psn: the packet sequence number for this packet + * @diff: the difference between the PSN and the expected PSN + * + * This is called from hfi1_rc_rcv() to process an unexpected + * incoming RC packet for the given QP. + * Called at interrupt level. + * Return 1 if no more processing is needed; otherwise return 0 to + * schedule a response to be sent. + */ +static noinline int rc_rcv_error(struct hfi1_other_headers *ohdr, void *data, + struct rvt_qp *qp, u32 opcode, u32 psn, + int diff, struct hfi1_ctxtdata *rcd) +{ + struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + struct rvt_ack_entry *e; + unsigned long flags; + u8 i, prev; + int old_req; + + trace_hfi1_rc_rcv_error(qp, psn); + if (diff > 0) { + /* + * Packet sequence error. + * A NAK will ACK earlier sends and RDMA writes. + * Don't queue the NAK if we already sent one. + */ + if (!qp->r_nak_state) { + ibp->rvp.n_rc_seqnak++; + qp->r_nak_state = IB_NAK_PSN_ERROR; + /* Use the expected PSN. */ + qp->r_ack_psn = qp->r_psn; + /* + * Wait to send the sequence NAK until all packets + * in the receive queue have been processed. + * Otherwise, we end up propagating congestion. + */ + rc_defered_ack(rcd, qp); + } + goto done; + } + + /* + * Handle a duplicate request. Don't re-execute SEND, RDMA + * write or atomic op. Don't NAK errors, just silently drop + * the duplicate request. Note that r_sge, r_len, and + * r_rcv_len may be in use so don't modify them. + * + * We are supposed to ACK the earliest duplicate PSN but we + * can coalesce an outstanding duplicate ACK. We have to + * send the earliest so that RDMA reads can be restarted at + * the requester's expected PSN. + * + * First, find where this duplicate PSN falls within the + * ACKs previously sent. + * old_req is true if there is an older response that is scheduled + * to be sent before sending this one. + */ + e = NULL; + old_req = 1; + ibp->rvp.n_rc_dupreq++; + + spin_lock_irqsave(&qp->s_lock, flags); + + for (i = qp->r_head_ack_queue; ; i = prev) { + if (i == qp->s_tail_ack_queue) + old_req = 0; + if (i) + prev = i - 1; + else + prev = HFI1_MAX_RDMA_ATOMIC; + if (prev == qp->r_head_ack_queue) { + e = NULL; + break; + } + e = &qp->s_ack_queue[prev]; + if (!e->opcode) { + e = NULL; + break; + } + if (cmp_psn(psn, e->psn) >= 0) { + if (prev == qp->s_tail_ack_queue && + cmp_psn(psn, e->lpsn) <= 0) + old_req = 0; + break; + } + } + switch (opcode) { + case OP(RDMA_READ_REQUEST): { + struct ib_reth *reth; + u32 offset; + u32 len; + + /* + * If we didn't find the RDMA read request in the ack queue, + * we can ignore this request. + */ + if (!e || e->opcode != OP(RDMA_READ_REQUEST)) + goto unlock_done; + /* RETH comes after BTH */ + reth = &ohdr->u.rc.reth; + /* + * Address range must be a subset of the original + * request and start on pmtu boundaries. + * We reuse the old ack_queue slot since the requester + * should not back up and request an earlier PSN for the + * same request. + */ + offset = delta_psn(psn, e->psn) * qp->pmtu; + len = be32_to_cpu(reth->length); + if (unlikely(offset + len != e->rdma_sge.sge_length)) + goto unlock_done; + if (e->rdma_sge.mr) { + rvt_put_mr(e->rdma_sge.mr); + e->rdma_sge.mr = NULL; + } + if (len != 0) { + u32 rkey = be32_to_cpu(reth->rkey); + u64 vaddr = be64_to_cpu(reth->vaddr); + int ok; + + ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey, + IB_ACCESS_REMOTE_READ); + if (unlikely(!ok)) + goto unlock_done; + } else { + e->rdma_sge.vaddr = NULL; + e->rdma_sge.length = 0; + e->rdma_sge.sge_length = 0; + } + e->psn = psn; + if (old_req) + goto unlock_done; + qp->s_tail_ack_queue = prev; + break; + } + + case OP(COMPARE_SWAP): + case OP(FETCH_ADD): { + /* + * If we didn't find the atomic request in the ack queue + * or the send tasklet is already backed up to send an + * earlier entry, we can ignore this request. + */ + if (!e || e->opcode != (u8)opcode || old_req) + goto unlock_done; + qp->s_tail_ack_queue = prev; + break; + } + + default: + /* + * Ignore this operation if it doesn't request an ACK + * or an earlier RDMA read or atomic is going to be resent. + */ + if (!(psn & IB_BTH_REQ_ACK) || old_req) + goto unlock_done; + /* + * Resend the most recent ACK if this request is + * after all the previous RDMA reads and atomics. + */ + if (i == qp->r_head_ack_queue) { + spin_unlock_irqrestore(&qp->s_lock, flags); + qp->r_nak_state = 0; + qp->r_ack_psn = qp->r_psn - 1; + goto send_ack; + } + + /* + * Resend the RDMA read or atomic op which + * ACKs this duplicate request. + */ + qp->s_tail_ack_queue = i; + break; + } + qp->s_ack_state = OP(ACKNOWLEDGE); + qp->s_flags |= RVT_S_RESP_PENDING; + qp->r_nak_state = 0; + hfi1_schedule_send(qp); + +unlock_done: + spin_unlock_irqrestore(&qp->s_lock, flags); +done: + return 1; + +send_ack: + return 0; +} + +void hfi1_rc_error(struct rvt_qp *qp, enum ib_wc_status err) +{ + unsigned long flags; + int lastwqe; + + spin_lock_irqsave(&qp->s_lock, flags); + lastwqe = rvt_error_qp(qp, err); + spin_unlock_irqrestore(&qp->s_lock, flags); + + if (lastwqe) { + struct ib_event ev; + + ev.device = qp->ibqp.device; + ev.element.qp = &qp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + qp->ibqp.event_handler(&ev, qp->ibqp.qp_context); + } +} + +static inline void update_ack_queue(struct rvt_qp *qp, unsigned n) +{ + unsigned next; + + next = n + 1; + if (next > HFI1_MAX_RDMA_ATOMIC) + next = 0; + qp->s_tail_ack_queue = next; + qp->s_ack_state = OP(ACKNOWLEDGE); +} + +static void log_cca_event(struct hfi1_pportdata *ppd, u8 sl, u32 rlid, + u32 lqpn, u32 rqpn, u8 svc_type) +{ + struct opa_hfi1_cong_log_event_internal *cc_event; + unsigned long flags; + + if (sl >= OPA_MAX_SLS) + return; + + spin_lock_irqsave(&ppd->cc_log_lock, flags); + + ppd->threshold_cong_event_map[sl / 8] |= 1 << (sl % 8); + ppd->threshold_event_counter++; + + cc_event = &ppd->cc_events[ppd->cc_log_idx++]; + if (ppd->cc_log_idx == OPA_CONG_LOG_ELEMS) + ppd->cc_log_idx = 0; + cc_event->lqpn = lqpn & RVT_QPN_MASK; + cc_event->rqpn = rqpn & RVT_QPN_MASK; + cc_event->sl = sl; + cc_event->svc_type = svc_type; + cc_event->rlid = rlid; + /* keep timestamp in units of 1.024 usec */ + cc_event->timestamp = ktime_to_ns(ktime_get()) / 1024; + + spin_unlock_irqrestore(&ppd->cc_log_lock, flags); +} + +void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn, + u32 rqpn, u8 svc_type) +{ + struct cca_timer *cca_timer; + u16 ccti, ccti_incr, ccti_timer, ccti_limit; + u8 trigger_threshold; + struct cc_state *cc_state; + unsigned long flags; + + if (sl >= OPA_MAX_SLS) + return; + + cc_state = get_cc_state(ppd); + + if (!cc_state) + return; + + /* + * 1) increase CCTI (for this SL) + * 2) select IPG (i.e., call set_link_ipg()) + * 3) start timer + */ + ccti_limit = cc_state->cct.ccti_limit; + ccti_incr = cc_state->cong_setting.entries[sl].ccti_increase; + ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer; + trigger_threshold = + cc_state->cong_setting.entries[sl].trigger_threshold; + + spin_lock_irqsave(&ppd->cca_timer_lock, flags); + + cca_timer = &ppd->cca_timer[sl]; + if (cca_timer->ccti < ccti_limit) { + if (cca_timer->ccti + ccti_incr <= ccti_limit) + cca_timer->ccti += ccti_incr; + else + cca_timer->ccti = ccti_limit; + set_link_ipg(ppd); + } + + ccti = cca_timer->ccti; + + if (!hrtimer_active(&cca_timer->hrtimer)) { + /* ccti_timer is in units of 1.024 usec */ + unsigned long nsec = 1024 * ccti_timer; + + hrtimer_start(&cca_timer->hrtimer, ns_to_ktime(nsec), + HRTIMER_MODE_REL); + } + + spin_unlock_irqrestore(&ppd->cca_timer_lock, flags); + + if ((trigger_threshold != 0) && (ccti >= trigger_threshold)) + log_cca_event(ppd, sl, rlid, lqpn, rqpn, svc_type); +} + +/** + * hfi1_rc_rcv - process an incoming RC packet + * @rcd: the context pointer + * @hdr: the header of this packet + * @rcv_flags: flags relevant to rcv processing + * @data: the packet data + * @tlen: the packet length + * @qp: the QP for this packet + * + * This is called from qp_rcv() to process an incoming RC packet + * for the given QP. + * May be called at interrupt level. + */ +void hfi1_rc_rcv(struct hfi1_packet *packet) +{ + struct hfi1_ctxtdata *rcd = packet->rcd; + struct hfi1_ib_header *hdr = packet->hdr; + u32 rcv_flags = packet->rcv_flags; + void *data = packet->ebuf; + u32 tlen = packet->tlen; + struct rvt_qp *qp = packet->qp; + struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct hfi1_other_headers *ohdr = packet->ohdr; + u32 bth0, opcode; + u32 hdrsize = packet->hlen; + u32 psn; + u32 pad; + struct ib_wc wc; + u32 pmtu = qp->pmtu; + int diff; + struct ib_reth *reth; + unsigned long flags; + u32 bth1; + int ret, is_fecn = 0; + int copy_last = 0; + + bth0 = be32_to_cpu(ohdr->bth[0]); + if (hfi1_ruc_check_hdr(ibp, hdr, rcv_flags & HFI1_HAS_GRH, qp, bth0)) + return; + + bth1 = be32_to_cpu(ohdr->bth[1]); + if (unlikely(bth1 & (HFI1_BECN_SMASK | HFI1_FECN_SMASK))) { + if (bth1 & HFI1_BECN_SMASK) { + u16 rlid = qp->remote_ah_attr.dlid; + u32 lqpn, rqpn; + + lqpn = qp->ibqp.qp_num; + rqpn = qp->remote_qpn; + process_becn( + ppd, + qp->remote_ah_attr.sl, + rlid, lqpn, rqpn, + IB_CC_SVCTYPE_RC); + } + is_fecn = bth1 & HFI1_FECN_SMASK; + } + + psn = be32_to_cpu(ohdr->bth[2]); + opcode = (bth0 >> 24) & 0xff; + + /* + * Process responses (ACKs) before anything else. Note that the + * packet sequence number will be for something in the send work + * queue rather than the expected receive packet sequence number. + * In other words, this QP is the requester. + */ + if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) && + opcode <= OP(ATOMIC_ACKNOWLEDGE)) { + rc_rcv_resp(ibp, ohdr, data, tlen, qp, opcode, psn, + hdrsize, pmtu, rcd); + if (is_fecn) + goto send_ack; + return; + } + + /* Compute 24 bits worth of difference. */ + diff = delta_psn(psn, qp->r_psn); + if (unlikely(diff)) { + if (rc_rcv_error(ohdr, data, qp, opcode, psn, diff, rcd)) + return; + goto send_ack; + } + + /* Check for opcode sequence errors. */ + switch (qp->r_state) { + case OP(SEND_FIRST): + case OP(SEND_MIDDLE): + if (opcode == OP(SEND_MIDDLE) || + opcode == OP(SEND_LAST) || + opcode == OP(SEND_LAST_WITH_IMMEDIATE)) + break; + goto nack_inv; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_MIDDLE): + if (opcode == OP(RDMA_WRITE_MIDDLE) || + opcode == OP(RDMA_WRITE_LAST) || + opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE)) + break; + goto nack_inv; + + default: + if (opcode == OP(SEND_MIDDLE) || + opcode == OP(SEND_LAST) || + opcode == OP(SEND_LAST_WITH_IMMEDIATE) || + opcode == OP(RDMA_WRITE_MIDDLE) || + opcode == OP(RDMA_WRITE_LAST) || + opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE)) + goto nack_inv; + /* + * Note that it is up to the requester to not send a new + * RDMA read or atomic operation before receiving an ACK + * for the previous operation. + */ + break; + } + + if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST)) + qp_comm_est(qp); + + /* OK, process the packet. */ + switch (opcode) { + case OP(SEND_FIRST): + ret = hfi1_rvt_get_rwqe(qp, 0); + if (ret < 0) + goto nack_op_err; + if (!ret) + goto rnr_nak; + qp->r_rcv_len = 0; + /* FALLTHROUGH */ + case OP(SEND_MIDDLE): + case OP(RDMA_WRITE_MIDDLE): +send_middle: + /* Check for invalid length PMTU or posted rwqe len. */ + if (unlikely(tlen != (hdrsize + pmtu + 4))) + goto nack_inv; + qp->r_rcv_len += pmtu; + if (unlikely(qp->r_rcv_len > qp->r_len)) + goto nack_inv; + hfi1_copy_sge(&qp->r_sge, data, pmtu, 1, 0); + break; + + case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): + /* consume RWQE */ + ret = hfi1_rvt_get_rwqe(qp, 1); + if (ret < 0) + goto nack_op_err; + if (!ret) + goto rnr_nak; + goto send_last_imm; + + case OP(SEND_ONLY): + case OP(SEND_ONLY_WITH_IMMEDIATE): + ret = hfi1_rvt_get_rwqe(qp, 0); + if (ret < 0) + goto nack_op_err; + if (!ret) + goto rnr_nak; + qp->r_rcv_len = 0; + if (opcode == OP(SEND_ONLY)) + goto no_immediate_data; + /* FALLTHROUGH for SEND_ONLY_WITH_IMMEDIATE */ + case OP(SEND_LAST_WITH_IMMEDIATE): +send_last_imm: + wc.ex.imm_data = ohdr->u.imm_data; + wc.wc_flags = IB_WC_WITH_IMM; + goto send_last; + case OP(RDMA_WRITE_LAST): + copy_last = ibpd_to_rvtpd(qp->ibqp.pd)->user; + /* fall through */ + case OP(SEND_LAST): +no_immediate_data: + wc.wc_flags = 0; + wc.ex.imm_data = 0; +send_last: + /* Get the number of bytes the message was padded by. */ + pad = (bth0 >> 20) & 3; + /* Check for invalid length. */ + /* LAST len should be >= 1 */ + if (unlikely(tlen < (hdrsize + pad + 4))) + goto nack_inv; + /* Don't count the CRC. */ + tlen -= (hdrsize + pad + 4); + wc.byte_len = tlen + qp->r_rcv_len; + if (unlikely(wc.byte_len > qp->r_len)) + goto nack_inv; + hfi1_copy_sge(&qp->r_sge, data, tlen, 1, copy_last); + rvt_put_ss(&qp->r_sge); + qp->r_msn++; + if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) + break; + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) || + opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) + wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; + else + wc.opcode = IB_WC_RECV; + wc.qp = &qp->ibqp; + wc.src_qp = qp->remote_qpn; + wc.slid = qp->remote_ah_attr.dlid; + /* + * It seems that IB mandates the presence of an SL in a + * work completion only for the UD transport (see section + * 11.4.2 of IBTA Vol. 1). + * + * However, the way the SL is chosen below is consistent + * with the way that IB/qib works and is trying avoid + * introducing incompatibilities. + * + * See also OPA Vol. 1, section 9.7.6, and table 9-17. + */ + wc.sl = qp->remote_ah_attr.sl; + /* zero fields that are N/A */ + wc.vendor_err = 0; + wc.pkey_index = 0; + wc.dlid_path_bits = 0; + wc.port_num = 0; + /* Signal completion event if the solicited bit is set. */ + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, + (bth0 & IB_BTH_SOLICITED) != 0); + break; + + case OP(RDMA_WRITE_ONLY): + copy_last = 1; + /* fall through */ + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) + goto nack_inv; + /* consume RWQE */ + reth = &ohdr->u.rc.reth; + qp->r_len = be32_to_cpu(reth->length); + qp->r_rcv_len = 0; + qp->r_sge.sg_list = NULL; + if (qp->r_len != 0) { + u32 rkey = be32_to_cpu(reth->rkey); + u64 vaddr = be64_to_cpu(reth->vaddr); + int ok; + + /* Check rkey & NAK */ + ok = rvt_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, vaddr, + rkey, IB_ACCESS_REMOTE_WRITE); + if (unlikely(!ok)) + goto nack_acc; + qp->r_sge.num_sge = 1; + } else { + qp->r_sge.num_sge = 0; + qp->r_sge.sge.mr = NULL; + qp->r_sge.sge.vaddr = NULL; + qp->r_sge.sge.length = 0; + qp->r_sge.sge.sge_length = 0; + } + if (opcode == OP(RDMA_WRITE_FIRST)) + goto send_middle; + else if (opcode == OP(RDMA_WRITE_ONLY)) + goto no_immediate_data; + ret = hfi1_rvt_get_rwqe(qp, 1); + if (ret < 0) + goto nack_op_err; + if (!ret) + goto rnr_nak; + wc.ex.imm_data = ohdr->u.rc.imm_data; + wc.wc_flags = IB_WC_WITH_IMM; + goto send_last; + + case OP(RDMA_READ_REQUEST): { + struct rvt_ack_entry *e; + u32 len; + u8 next; + + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) + goto nack_inv; + next = qp->r_head_ack_queue + 1; + /* s_ack_queue is size HFI1_MAX_RDMA_ATOMIC+1 so use > not >= */ + if (next > HFI1_MAX_RDMA_ATOMIC) + next = 0; + spin_lock_irqsave(&qp->s_lock, flags); + if (unlikely(next == qp->s_tail_ack_queue)) { + if (!qp->s_ack_queue[next].sent) + goto nack_inv_unlck; + update_ack_queue(qp, next); + } + e = &qp->s_ack_queue[qp->r_head_ack_queue]; + if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) { + rvt_put_mr(e->rdma_sge.mr); + e->rdma_sge.mr = NULL; + } + reth = &ohdr->u.rc.reth; + len = be32_to_cpu(reth->length); + if (len) { + u32 rkey = be32_to_cpu(reth->rkey); + u64 vaddr = be64_to_cpu(reth->vaddr); + int ok; + + /* Check rkey & NAK */ + ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, + rkey, IB_ACCESS_REMOTE_READ); + if (unlikely(!ok)) + goto nack_acc_unlck; + /* + * Update the next expected PSN. We add 1 later + * below, so only add the remainder here. + */ + if (len > pmtu) + qp->r_psn += (len - 1) / pmtu; + } else { + e->rdma_sge.mr = NULL; + e->rdma_sge.vaddr = NULL; + e->rdma_sge.length = 0; + e->rdma_sge.sge_length = 0; + } + e->opcode = opcode; + e->sent = 0; + e->psn = psn; + e->lpsn = qp->r_psn; + /* + * We need to increment the MSN here instead of when we + * finish sending the result since a duplicate request would + * increment it more than once. + */ + qp->r_msn++; + qp->r_psn++; + qp->r_state = opcode; + qp->r_nak_state = 0; + qp->r_head_ack_queue = next; + + /* Schedule the send tasklet. */ + qp->s_flags |= RVT_S_RESP_PENDING; + hfi1_schedule_send(qp); + + spin_unlock_irqrestore(&qp->s_lock, flags); + if (is_fecn) + goto send_ack; + return; + } + + case OP(COMPARE_SWAP): + case OP(FETCH_ADD): { + struct ib_atomic_eth *ateth; + struct rvt_ack_entry *e; + u64 vaddr; + atomic64_t *maddr; + u64 sdata; + u32 rkey; + u8 next; + + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) + goto nack_inv; + next = qp->r_head_ack_queue + 1; + if (next > HFI1_MAX_RDMA_ATOMIC) + next = 0; + spin_lock_irqsave(&qp->s_lock, flags); + if (unlikely(next == qp->s_tail_ack_queue)) { + if (!qp->s_ack_queue[next].sent) + goto nack_inv_unlck; + update_ack_queue(qp, next); + } + e = &qp->s_ack_queue[qp->r_head_ack_queue]; + if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) { + rvt_put_mr(e->rdma_sge.mr); + e->rdma_sge.mr = NULL; + } + ateth = &ohdr->u.atomic_eth; + vaddr = ((u64)be32_to_cpu(ateth->vaddr[0]) << 32) | + be32_to_cpu(ateth->vaddr[1]); + if (unlikely(vaddr & (sizeof(u64) - 1))) + goto nack_inv_unlck; + rkey = be32_to_cpu(ateth->rkey); + /* Check rkey & NAK */ + if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64), + vaddr, rkey, + IB_ACCESS_REMOTE_ATOMIC))) + goto nack_acc_unlck; + /* Perform atomic OP and save result. */ + maddr = (atomic64_t *)qp->r_sge.sge.vaddr; + sdata = be64_to_cpu(ateth->swap_data); + e->atomic_data = (opcode == OP(FETCH_ADD)) ? + (u64)atomic64_add_return(sdata, maddr) - sdata : + (u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr, + be64_to_cpu(ateth->compare_data), + sdata); + rvt_put_mr(qp->r_sge.sge.mr); + qp->r_sge.num_sge = 0; + e->opcode = opcode; + e->sent = 0; + e->psn = psn; + e->lpsn = psn; + qp->r_msn++; + qp->r_psn++; + qp->r_state = opcode; + qp->r_nak_state = 0; + qp->r_head_ack_queue = next; + + /* Schedule the send tasklet. */ + qp->s_flags |= RVT_S_RESP_PENDING; + hfi1_schedule_send(qp); + + spin_unlock_irqrestore(&qp->s_lock, flags); + if (is_fecn) + goto send_ack; + return; + } + + default: + /* NAK unknown opcodes. */ + goto nack_inv; + } + qp->r_psn++; + qp->r_state = opcode; + qp->r_ack_psn = psn; + qp->r_nak_state = 0; + /* Send an ACK if requested or required. */ + if (psn & IB_BTH_REQ_ACK) { + struct hfi1_qp_priv *priv = qp->priv; + + if (packet->numpkt == 0) { + rc_cancel_ack(qp); + goto send_ack; + } + if (priv->r_adefered >= HFI1_PSN_CREDIT) { + rc_cancel_ack(qp); + goto send_ack; + } + if (unlikely(is_fecn)) { + rc_cancel_ack(qp); + goto send_ack; + } + priv->r_adefered++; + rc_defered_ack(rcd, qp); + } + return; + +rnr_nak: + qp->r_nak_state = qp->r_min_rnr_timer | IB_RNR_NAK; + qp->r_ack_psn = qp->r_psn; + /* Queue RNR NAK for later */ + rc_defered_ack(rcd, qp); + return; + +nack_op_err: + hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR); + qp->r_nak_state = IB_NAK_REMOTE_OPERATIONAL_ERROR; + qp->r_ack_psn = qp->r_psn; + /* Queue NAK for later */ + rc_defered_ack(rcd, qp); + return; + +nack_inv_unlck: + spin_unlock_irqrestore(&qp->s_lock, flags); +nack_inv: + hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR); + qp->r_nak_state = IB_NAK_INVALID_REQUEST; + qp->r_ack_psn = qp->r_psn; + /* Queue NAK for later */ + rc_defered_ack(rcd, qp); + return; + +nack_acc_unlck: + spin_unlock_irqrestore(&qp->s_lock, flags); +nack_acc: + hfi1_rc_error(qp, IB_WC_LOC_PROT_ERR); + qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR; + qp->r_ack_psn = qp->r_psn; +send_ack: + hfi1_send_rc_ack(rcd, qp, is_fecn); +} + +void hfi1_rc_hdrerr( + struct hfi1_ctxtdata *rcd, + struct hfi1_ib_header *hdr, + u32 rcv_flags, + struct rvt_qp *qp) +{ + int has_grh = rcv_flags & HFI1_HAS_GRH; + struct hfi1_other_headers *ohdr; + struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); + int diff; + u32 opcode; + u32 psn, bth0; + + /* Check for GRH */ + ohdr = &hdr->u.oth; + if (has_grh) + ohdr = &hdr->u.l.oth; + + bth0 = be32_to_cpu(ohdr->bth[0]); + if (hfi1_ruc_check_hdr(ibp, hdr, has_grh, qp, bth0)) + return; + + psn = be32_to_cpu(ohdr->bth[2]); + opcode = (bth0 >> 24) & 0xff; + + /* Only deal with RDMA Writes for now */ + if (opcode < IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST) { + diff = delta_psn(psn, qp->r_psn); + if (!qp->r_nak_state && diff >= 0) { + ibp->rvp.n_rc_seqnak++; + qp->r_nak_state = IB_NAK_PSN_ERROR; + /* Use the expected PSN. */ + qp->r_ack_psn = qp->r_psn; + /* + * Wait to send the sequence + * NAK until all packets + * in the receive queue have + * been processed. + * Otherwise, we end up + * propagating congestion. + */ + rc_defered_ack(rcd, qp); + } /* Out of sequence NAK */ + } /* QP Request NAKs */ +} diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c new file mode 100644 index 000000000..a659aec3c --- /dev/null +++ b/drivers/infiniband/hw/hfi1/ruc.c @@ -0,0 +1,979 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <linux/spinlock.h> + +#include "hfi.h" +#include "mad.h" +#include "qp.h" +#include "verbs_txreq.h" +#include "trace.h" + +/* + * Convert the AETH RNR timeout code into the number of microseconds. + */ +const u32 ib_hfi1_rnr_table[32] = { + 655360, /* 00: 655.36 */ + 10, /* 01: .01 */ + 20, /* 02 .02 */ + 30, /* 03: .03 */ + 40, /* 04: .04 */ + 60, /* 05: .06 */ + 80, /* 06: .08 */ + 120, /* 07: .12 */ + 160, /* 08: .16 */ + 240, /* 09: .24 */ + 320, /* 0A: .32 */ + 480, /* 0B: .48 */ + 640, /* 0C: .64 */ + 960, /* 0D: .96 */ + 1280, /* 0E: 1.28 */ + 1920, /* 0F: 1.92 */ + 2560, /* 10: 2.56 */ + 3840, /* 11: 3.84 */ + 5120, /* 12: 5.12 */ + 7680, /* 13: 7.68 */ + 10240, /* 14: 10.24 */ + 15360, /* 15: 15.36 */ + 20480, /* 16: 20.48 */ + 30720, /* 17: 30.72 */ + 40960, /* 18: 40.96 */ + 61440, /* 19: 61.44 */ + 81920, /* 1A: 81.92 */ + 122880, /* 1B: 122.88 */ + 163840, /* 1C: 163.84 */ + 245760, /* 1D: 245.76 */ + 327680, /* 1E: 327.68 */ + 491520 /* 1F: 491.52 */ +}; + +/* + * Validate a RWQE and fill in the SGE state. + * Return 1 if OK. + */ +static int init_sge(struct rvt_qp *qp, struct rvt_rwqe *wqe) +{ + int i, j, ret; + struct ib_wc wc; + struct rvt_lkey_table *rkt; + struct rvt_pd *pd; + struct rvt_sge_state *ss; + + rkt = &to_idev(qp->ibqp.device)->rdi.lkey_table; + pd = ibpd_to_rvtpd(qp->ibqp.srq ? qp->ibqp.srq->pd : qp->ibqp.pd); + ss = &qp->r_sge; + ss->sg_list = qp->r_sg_list; + qp->r_len = 0; + for (i = j = 0; i < wqe->num_sge; i++) { + if (wqe->sg_list[i].length == 0) + continue; + /* Check LKEY */ + if (!rvt_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge, + &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE)) + goto bad_lkey; + qp->r_len += wqe->sg_list[i].length; + j++; + } + ss->num_sge = j; + ss->total_len = qp->r_len; + ret = 1; + goto bail; + +bad_lkey: + while (j) { + struct rvt_sge *sge = --j ? &ss->sg_list[j - 1] : &ss->sge; + + rvt_put_mr(sge->mr); + } + ss->num_sge = 0; + memset(&wc, 0, sizeof(wc)); + wc.wr_id = wqe->wr_id; + wc.status = IB_WC_LOC_PROT_ERR; + wc.opcode = IB_WC_RECV; + wc.qp = &qp->ibqp; + /* Signal solicited completion event. */ + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1); + ret = 0; +bail: + return ret; +} + +/** + * hfi1_rvt_get_rwqe - copy the next RWQE into the QP's RWQE + * @qp: the QP + * @wr_id_only: update qp->r_wr_id only, not qp->r_sge + * + * Return -1 if there is a local error, 0 if no RWQE is available, + * otherwise return 1. + * + * Can be called from interrupt level. + */ +int hfi1_rvt_get_rwqe(struct rvt_qp *qp, int wr_id_only) +{ + unsigned long flags; + struct rvt_rq *rq; + struct rvt_rwq *wq; + struct rvt_srq *srq; + struct rvt_rwqe *wqe; + void (*handler)(struct ib_event *, void *); + u32 tail; + int ret; + + if (qp->ibqp.srq) { + srq = ibsrq_to_rvtsrq(qp->ibqp.srq); + handler = srq->ibsrq.event_handler; + rq = &srq->rq; + } else { + srq = NULL; + handler = NULL; + rq = &qp->r_rq; + } + + spin_lock_irqsave(&rq->lock, flags); + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { + ret = 0; + goto unlock; + } + + wq = rq->wq; + tail = wq->tail; + /* Validate tail before using it since it is user writable. */ + if (tail >= rq->size) + tail = 0; + if (unlikely(tail == wq->head)) { + ret = 0; + goto unlock; + } + /* Make sure entry is read after head index is read. */ + smp_rmb(); + wqe = rvt_get_rwqe_ptr(rq, tail); + /* + * Even though we update the tail index in memory, the verbs + * consumer is not supposed to post more entries until a + * completion is generated. + */ + if (++tail >= rq->size) + tail = 0; + wq->tail = tail; + if (!wr_id_only && !init_sge(qp, wqe)) { + ret = -1; + goto unlock; + } + qp->r_wr_id = wqe->wr_id; + + ret = 1; + set_bit(RVT_R_WRID_VALID, &qp->r_aflags); + if (handler) { + u32 n; + + /* + * Validate head pointer value and compute + * the number of remaining WQEs. + */ + n = wq->head; + if (n >= rq->size) + n = 0; + if (n < tail) + n += rq->size - tail; + else + n -= tail; + if (n < srq->limit) { + struct ib_event ev; + + srq->limit = 0; + spin_unlock_irqrestore(&rq->lock, flags); + ev.device = qp->ibqp.device; + ev.element.srq = qp->ibqp.srq; + ev.event = IB_EVENT_SRQ_LIMIT_REACHED; + handler(&ev, srq->ibsrq.srq_context); + goto bail; + } + } +unlock: + spin_unlock_irqrestore(&rq->lock, flags); +bail: + return ret; +} + +static __be64 get_sguid(struct hfi1_ibport *ibp, unsigned index) +{ + if (!index) { + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + + return cpu_to_be64(ppd->guid); + } + return ibp->guids[index - 1]; +} + +static int gid_ok(union ib_gid *gid, __be64 gid_prefix, __be64 id) +{ + return (gid->global.interface_id == id && + (gid->global.subnet_prefix == gid_prefix || + gid->global.subnet_prefix == IB_DEFAULT_GID_PREFIX)); +} + +/* + * + * This should be called with the QP r_lock held. + * + * The s_lock will be acquired around the hfi1_migrate_qp() call. + */ +int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct hfi1_ib_header *hdr, + int has_grh, struct rvt_qp *qp, u32 bth0) +{ + __be64 guid; + unsigned long flags; + u8 sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl]; + + if (qp->s_mig_state == IB_MIG_ARMED && (bth0 & IB_BTH_MIG_REQ)) { + if (!has_grh) { + if (qp->alt_ah_attr.ah_flags & IB_AH_GRH) + goto err; + } else { + if (!(qp->alt_ah_attr.ah_flags & IB_AH_GRH)) + goto err; + guid = get_sguid(ibp, qp->alt_ah_attr.grh.sgid_index); + if (!gid_ok(&hdr->u.l.grh.dgid, ibp->rvp.gid_prefix, + guid)) + goto err; + if (!gid_ok( + &hdr->u.l.grh.sgid, + qp->alt_ah_attr.grh.dgid.global.subnet_prefix, + qp->alt_ah_attr.grh.dgid.global.interface_id)) + goto err; + } + if (unlikely(rcv_pkey_check(ppd_from_ibp(ibp), (u16)bth0, + sc5, be16_to_cpu(hdr->lrh[3])))) { + hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_P_KEY, + (u16)bth0, + (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF, + 0, qp->ibqp.qp_num, + be16_to_cpu(hdr->lrh[3]), + be16_to_cpu(hdr->lrh[1])); + goto err; + } + /* Validate the SLID. See Ch. 9.6.1.5 and 17.2.8 */ + if (be16_to_cpu(hdr->lrh[3]) != qp->alt_ah_attr.dlid || + ppd_from_ibp(ibp)->port != qp->alt_ah_attr.port_num) + goto err; + spin_lock_irqsave(&qp->s_lock, flags); + hfi1_migrate_qp(qp); + spin_unlock_irqrestore(&qp->s_lock, flags); + } else { + if (!has_grh) { + if (qp->remote_ah_attr.ah_flags & IB_AH_GRH) + goto err; + } else { + if (!(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) + goto err; + guid = get_sguid(ibp, + qp->remote_ah_attr.grh.sgid_index); + if (!gid_ok(&hdr->u.l.grh.dgid, ibp->rvp.gid_prefix, + guid)) + goto err; + if (!gid_ok( + &hdr->u.l.grh.sgid, + qp->remote_ah_attr.grh.dgid.global.subnet_prefix, + qp->remote_ah_attr.grh.dgid.global.interface_id)) + goto err; + } + if (unlikely(rcv_pkey_check(ppd_from_ibp(ibp), (u16)bth0, + sc5, be16_to_cpu(hdr->lrh[3])))) { + hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_P_KEY, + (u16)bth0, + (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF, + 0, qp->ibqp.qp_num, + be16_to_cpu(hdr->lrh[3]), + be16_to_cpu(hdr->lrh[1])); + goto err; + } + /* Validate the SLID. See Ch. 9.6.1.5 */ + if (be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid || + ppd_from_ibp(ibp)->port != qp->port_num) + goto err; + if (qp->s_mig_state == IB_MIG_REARM && + !(bth0 & IB_BTH_MIG_REQ)) + qp->s_mig_state = IB_MIG_ARMED; + } + + return 0; + +err: + return 1; +} + +/** + * ruc_loopback - handle UC and RC loopback requests + * @sqp: the sending QP + * + * This is called from hfi1_do_send() to + * forward a WQE addressed to the same HFI. + * Note that although we are single threaded due to the tasklet, we still + * have to protect against post_send(). We don't have to worry about + * receive interrupts since this is a connected protocol and all packets + * will pass through here. + */ +static void ruc_loopback(struct rvt_qp *sqp) +{ + struct hfi1_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num); + struct rvt_qp *qp; + struct rvt_swqe *wqe; + struct rvt_sge *sge; + unsigned long flags; + struct ib_wc wc; + u64 sdata; + atomic64_t *maddr; + enum ib_wc_status send_status; + int release; + int ret; + int copy_last = 0; + u32 to; + + rcu_read_lock(); + + /* + * Note that we check the responder QP state after + * checking the requester's state. + */ + qp = rvt_lookup_qpn(ib_to_rvt(sqp->ibqp.device), &ibp->rvp, + sqp->remote_qpn); + + spin_lock_irqsave(&sqp->s_lock, flags); + + /* Return if we are already busy processing a work request. */ + if ((sqp->s_flags & (RVT_S_BUSY | RVT_S_ANY_WAIT)) || + !(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_OR_FLUSH_SEND)) + goto unlock; + + sqp->s_flags |= RVT_S_BUSY; + +again: + smp_read_barrier_depends(); /* see post_one_send() */ + if (sqp->s_last == ACCESS_ONCE(sqp->s_head)) + goto clr_busy; + wqe = rvt_get_swqe_ptr(sqp, sqp->s_last); + + /* Return if it is not OK to start a new work request. */ + if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_NEXT_SEND_OK)) { + if (!(ib_rvt_state_ops[sqp->state] & RVT_FLUSH_SEND)) + goto clr_busy; + /* We are in the error state, flush the work request. */ + send_status = IB_WC_WR_FLUSH_ERR; + goto flush_send; + } + + /* + * We can rely on the entry not changing without the s_lock + * being held until we update s_last. + * We increment s_cur to indicate s_last is in progress. + */ + if (sqp->s_last == sqp->s_cur) { + if (++sqp->s_cur >= sqp->s_size) + sqp->s_cur = 0; + } + spin_unlock_irqrestore(&sqp->s_lock, flags); + + if (!qp || !(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) || + qp->ibqp.qp_type != sqp->ibqp.qp_type) { + ibp->rvp.n_pkt_drops++; + /* + * For RC, the requester would timeout and retry so + * shortcut the timeouts and just signal too many retries. + */ + if (sqp->ibqp.qp_type == IB_QPT_RC) + send_status = IB_WC_RETRY_EXC_ERR; + else + send_status = IB_WC_SUCCESS; + goto serr; + } + + memset(&wc, 0, sizeof(wc)); + send_status = IB_WC_SUCCESS; + + release = 1; + sqp->s_sge.sge = wqe->sg_list[0]; + sqp->s_sge.sg_list = wqe->sg_list + 1; + sqp->s_sge.num_sge = wqe->wr.num_sge; + sqp->s_len = wqe->length; + switch (wqe->wr.opcode) { + case IB_WR_SEND_WITH_IMM: + wc.wc_flags = IB_WC_WITH_IMM; + wc.ex.imm_data = wqe->wr.ex.imm_data; + /* FALLTHROUGH */ + case IB_WR_SEND: + ret = hfi1_rvt_get_rwqe(qp, 0); + if (ret < 0) + goto op_err; + if (!ret) + goto rnr_nak; + break; + + case IB_WR_RDMA_WRITE_WITH_IMM: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) + goto inv_err; + wc.wc_flags = IB_WC_WITH_IMM; + wc.ex.imm_data = wqe->wr.ex.imm_data; + ret = hfi1_rvt_get_rwqe(qp, 1); + if (ret < 0) + goto op_err; + if (!ret) + goto rnr_nak; + /* skip copy_last set and qp_access_flags recheck */ + goto do_write; + case IB_WR_RDMA_WRITE: + copy_last = ibpd_to_rvtpd(qp->ibqp.pd)->user; + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) + goto inv_err; +do_write: + if (wqe->length == 0) + break; + if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, wqe->length, + wqe->rdma_wr.remote_addr, + wqe->rdma_wr.rkey, + IB_ACCESS_REMOTE_WRITE))) + goto acc_err; + qp->r_sge.sg_list = NULL; + qp->r_sge.num_sge = 1; + qp->r_sge.total_len = wqe->length; + break; + + case IB_WR_RDMA_READ: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) + goto inv_err; + if (unlikely(!rvt_rkey_ok(qp, &sqp->s_sge.sge, wqe->length, + wqe->rdma_wr.remote_addr, + wqe->rdma_wr.rkey, + IB_ACCESS_REMOTE_READ))) + goto acc_err; + release = 0; + sqp->s_sge.sg_list = NULL; + sqp->s_sge.num_sge = 1; + qp->r_sge.sge = wqe->sg_list[0]; + qp->r_sge.sg_list = wqe->sg_list + 1; + qp->r_sge.num_sge = wqe->wr.num_sge; + qp->r_sge.total_len = wqe->length; + break; + + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) + goto inv_err; + if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64), + wqe->atomic_wr.remote_addr, + wqe->atomic_wr.rkey, + IB_ACCESS_REMOTE_ATOMIC))) + goto acc_err; + /* Perform atomic OP and save result. */ + maddr = (atomic64_t *)qp->r_sge.sge.vaddr; + sdata = wqe->atomic_wr.compare_add; + *(u64 *)sqp->s_sge.sge.vaddr = + (wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ? + (u64)atomic64_add_return(sdata, maddr) - sdata : + (u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr, + sdata, wqe->atomic_wr.swap); + rvt_put_mr(qp->r_sge.sge.mr); + qp->r_sge.num_sge = 0; + goto send_comp; + + default: + send_status = IB_WC_LOC_QP_OP_ERR; + goto serr; + } + + sge = &sqp->s_sge.sge; + while (sqp->s_len) { + u32 len = sqp->s_len; + + if (len > sge->length) + len = sge->length; + if (len > sge->sge_length) + len = sge->sge_length; + WARN_ON_ONCE(len == 0); + hfi1_copy_sge(&qp->r_sge, sge->vaddr, len, release, copy_last); + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (!release) + rvt_put_mr(sge->mr); + if (--sqp->s_sge.num_sge) + *sge = *sqp->s_sge.sg_list++; + } else if (sge->length == 0 && sge->mr->lkey) { + if (++sge->n >= RVT_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + sqp->s_len -= len; + } + if (release) + rvt_put_ss(&qp->r_sge); + + if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) + goto send_comp; + + if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM) + wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; + else + wc.opcode = IB_WC_RECV; + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + wc.byte_len = wqe->length; + wc.qp = &qp->ibqp; + wc.src_qp = qp->remote_qpn; + wc.slid = qp->remote_ah_attr.dlid; + wc.sl = qp->remote_ah_attr.sl; + wc.port_num = 1; + /* Signal completion event if the solicited bit is set. */ + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, + wqe->wr.send_flags & IB_SEND_SOLICITED); + +send_comp: + spin_lock_irqsave(&sqp->s_lock, flags); + ibp->rvp.n_loop_pkts++; +flush_send: + sqp->s_rnr_retry = sqp->s_rnr_retry_cnt; + hfi1_send_complete(sqp, wqe, send_status); + goto again; + +rnr_nak: + /* Handle RNR NAK */ + if (qp->ibqp.qp_type == IB_QPT_UC) + goto send_comp; + ibp->rvp.n_rnr_naks++; + /* + * Note: we don't need the s_lock held since the BUSY flag + * makes this single threaded. + */ + if (sqp->s_rnr_retry == 0) { + send_status = IB_WC_RNR_RETRY_EXC_ERR; + goto serr; + } + if (sqp->s_rnr_retry_cnt < 7) + sqp->s_rnr_retry--; + spin_lock_irqsave(&sqp->s_lock, flags); + if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_RECV_OK)) + goto clr_busy; + to = ib_hfi1_rnr_table[qp->r_min_rnr_timer]; + hfi1_add_rnr_timer(sqp, to); + goto clr_busy; + +op_err: + send_status = IB_WC_REM_OP_ERR; + wc.status = IB_WC_LOC_QP_OP_ERR; + goto err; + +inv_err: + send_status = IB_WC_REM_INV_REQ_ERR; + wc.status = IB_WC_LOC_QP_OP_ERR; + goto err; + +acc_err: + send_status = IB_WC_REM_ACCESS_ERR; + wc.status = IB_WC_LOC_PROT_ERR; +err: + /* responder goes to error state */ + hfi1_rc_error(qp, wc.status); + +serr: + spin_lock_irqsave(&sqp->s_lock, flags); + hfi1_send_complete(sqp, wqe, send_status); + if (sqp->ibqp.qp_type == IB_QPT_RC) { + int lastwqe = rvt_error_qp(sqp, IB_WC_WR_FLUSH_ERR); + + sqp->s_flags &= ~RVT_S_BUSY; + spin_unlock_irqrestore(&sqp->s_lock, flags); + if (lastwqe) { + struct ib_event ev; + + ev.device = sqp->ibqp.device; + ev.element.qp = &sqp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context); + } + goto done; + } +clr_busy: + sqp->s_flags &= ~RVT_S_BUSY; +unlock: + spin_unlock_irqrestore(&sqp->s_lock, flags); +done: + rcu_read_unlock(); +} + +/** + * hfi1_make_grh - construct a GRH header + * @ibp: a pointer to the IB port + * @hdr: a pointer to the GRH header being constructed + * @grh: the global route address to send to + * @hwords: the number of 32 bit words of header being sent + * @nwords: the number of 32 bit words of data being sent + * + * Return the size of the header in 32 bit words. + */ +u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr, + struct ib_global_route *grh, u32 hwords, u32 nwords) +{ + hdr->version_tclass_flow = + cpu_to_be32((IB_GRH_VERSION << IB_GRH_VERSION_SHIFT) | + (grh->traffic_class << IB_GRH_TCLASS_SHIFT) | + (grh->flow_label << IB_GRH_FLOW_SHIFT)); + hdr->paylen = cpu_to_be16((hwords - 2 + nwords + SIZE_OF_CRC) << 2); + /* next_hdr is defined by C8-7 in ch. 8.4.1 */ + hdr->next_hdr = IB_GRH_NEXT_HDR; + hdr->hop_limit = grh->hop_limit; + /* The SGID is 32-bit aligned. */ + hdr->sgid.global.subnet_prefix = ibp->rvp.gid_prefix; + hdr->sgid.global.interface_id = + grh->sgid_index && grh->sgid_index < ARRAY_SIZE(ibp->guids) ? + ibp->guids[grh->sgid_index - 1] : + cpu_to_be64(ppd_from_ibp(ibp)->guid); + hdr->dgid = grh->dgid; + + /* GRH header size in 32-bit words. */ + return sizeof(struct ib_grh) / sizeof(u32); +} + +#define BTH2_OFFSET (offsetof(struct hfi1_pio_header, hdr.u.oth.bth[2]) / 4) + +/** + * build_ahg - create ahg in s_hdr + * @qp: a pointer to QP + * @npsn: the next PSN for the request/response + * + * This routine handles the AHG by allocating an ahg entry and causing the + * copy of the first middle. + * + * Subsequent middles use the copied entry, editing the + * PSN with 1 or 2 edits. + */ +static inline void build_ahg(struct rvt_qp *qp, u32 npsn) +{ + struct hfi1_qp_priv *priv = qp->priv; + + if (unlikely(qp->s_flags & RVT_S_AHG_CLEAR)) + clear_ahg(qp); + if (!(qp->s_flags & RVT_S_AHG_VALID)) { + /* first middle that needs copy */ + if (qp->s_ahgidx < 0) + qp->s_ahgidx = sdma_ahg_alloc(priv->s_sde); + if (qp->s_ahgidx >= 0) { + qp->s_ahgpsn = npsn; + priv->s_hdr->tx_flags |= SDMA_TXREQ_F_AHG_COPY; + /* save to protect a change in another thread */ + priv->s_hdr->sde = priv->s_sde; + priv->s_hdr->ahgidx = qp->s_ahgidx; + qp->s_flags |= RVT_S_AHG_VALID; + } + } else { + /* subsequent middle after valid */ + if (qp->s_ahgidx >= 0) { + priv->s_hdr->tx_flags |= SDMA_TXREQ_F_USE_AHG; + priv->s_hdr->ahgidx = qp->s_ahgidx; + priv->s_hdr->ahgcount++; + priv->s_hdr->ahgdesc[0] = + sdma_build_ahg_descriptor( + (__force u16)cpu_to_be16((u16)npsn), + BTH2_OFFSET, + 16, + 16); + if ((npsn & 0xffff0000) != + (qp->s_ahgpsn & 0xffff0000)) { + priv->s_hdr->ahgcount++; + priv->s_hdr->ahgdesc[1] = + sdma_build_ahg_descriptor( + (__force u16)cpu_to_be16( + (u16)(npsn >> 16)), + BTH2_OFFSET, + 0, + 16); + } + } + } +} + +void hfi1_make_ruc_header(struct rvt_qp *qp, struct hfi1_other_headers *ohdr, + u32 bth0, u32 bth2, int middle, + struct hfi1_pkt_state *ps) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_ibport *ibp = ps->ibp; + u16 lrh0; + u32 nwords; + u32 extra_bytes; + u32 bth1; + + /* Construct the header. */ + extra_bytes = -qp->s_cur_size & 3; + nwords = (qp->s_cur_size + extra_bytes) >> 2; + lrh0 = HFI1_LRH_BTH; + if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) { + qp->s_hdrwords += hfi1_make_grh(ibp, + &ps->s_txreq->phdr.hdr.u.l.grh, + &qp->remote_ah_attr.grh, + qp->s_hdrwords, nwords); + lrh0 = HFI1_LRH_GRH; + middle = 0; + } + lrh0 |= (priv->s_sc & 0xf) << 12 | (qp->remote_ah_attr.sl & 0xf) << 4; + /* + * reset s_hdr/AHG fields + * + * This insures that the ahgentry/ahgcount + * are at a non-AHG default to protect + * build_verbs_tx_desc() from using + * an include ahgidx. + * + * build_ahg() will modify as appropriate + * to use the AHG feature. + */ + priv->s_hdr->tx_flags = 0; + priv->s_hdr->ahgcount = 0; + priv->s_hdr->ahgidx = 0; + priv->s_hdr->sde = NULL; + if (qp->s_mig_state == IB_MIG_MIGRATED) + bth0 |= IB_BTH_MIG_REQ; + else + middle = 0; + if (middle) + build_ahg(qp, bth2); + else + qp->s_flags &= ~RVT_S_AHG_VALID; + ps->s_txreq->phdr.hdr.lrh[0] = cpu_to_be16(lrh0); + ps->s_txreq->phdr.hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid); + ps->s_txreq->phdr.hdr.lrh[2] = + cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC); + ps->s_txreq->phdr.hdr.lrh[3] = cpu_to_be16(ppd_from_ibp(ibp)->lid | + qp->remote_ah_attr.src_path_bits); + bth0 |= hfi1_get_pkey(ibp, qp->s_pkey_index); + bth0 |= extra_bytes << 20; + ohdr->bth[0] = cpu_to_be32(bth0); + bth1 = qp->remote_qpn; + if (qp->s_flags & RVT_S_ECN) { + qp->s_flags &= ~RVT_S_ECN; + /* we recently received a FECN, so return a BECN */ + bth1 |= (HFI1_BECN_MASK << HFI1_BECN_SHIFT); + } + ohdr->bth[1] = cpu_to_be32(bth1); + ohdr->bth[2] = cpu_to_be32(bth2); +} + +/* when sending, force a reschedule every one of these periods */ +#define SEND_RESCHED_TIMEOUT (5 * HZ) /* 5s in jiffies */ + +void _hfi1_do_send(struct work_struct *work) +{ + struct iowait *wait = container_of(work, struct iowait, iowork); + struct rvt_qp *qp = iowait_to_qp(wait); + + hfi1_do_send(qp); +} + +/** + * hfi1_do_send - perform a send on a QP + * @work: contains a pointer to the QP + * + * Process entries in the send work queue until credit or queue is + * exhausted. Only allow one CPU to send a packet per QP (tasklet). + * Otherwise, two threads could send packets out of order. + */ +void hfi1_do_send(struct rvt_qp *qp) +{ + struct hfi1_pkt_state ps; + struct hfi1_qp_priv *priv = qp->priv; + int (*make_req)(struct rvt_qp *qp, struct hfi1_pkt_state *ps); + unsigned long timeout; + unsigned long timeout_int; + int cpu; + + ps.dev = to_idev(qp->ibqp.device); + ps.ibp = to_iport(qp->ibqp.device, qp->port_num); + ps.ppd = ppd_from_ibp(ps.ibp); + + switch (qp->ibqp.qp_type) { + case IB_QPT_RC: + if (!loopback && ((qp->remote_ah_attr.dlid & ~((1 << ps.ppd->lmc + ) - 1)) == + ps.ppd->lid)) { + ruc_loopback(qp); + return; + } + make_req = hfi1_make_rc_req; + timeout_int = (qp->timeout_jiffies); + break; + case IB_QPT_UC: + if (!loopback && ((qp->remote_ah_attr.dlid & ~((1 << ps.ppd->lmc + ) - 1)) == + ps.ppd->lid)) { + ruc_loopback(qp); + return; + } + make_req = hfi1_make_uc_req; + timeout_int = SEND_RESCHED_TIMEOUT; + break; + default: + make_req = hfi1_make_ud_req; + timeout_int = SEND_RESCHED_TIMEOUT; + } + + spin_lock_irqsave(&qp->s_lock, ps.flags); + + /* Return if we are already busy processing a work request. */ + if (!hfi1_send_ok(qp)) { + spin_unlock_irqrestore(&qp->s_lock, ps.flags); + return; + } + + qp->s_flags |= RVT_S_BUSY; + + timeout = jiffies + (timeout_int) / 8; + cpu = priv->s_sde ? priv->s_sde->cpu : + cpumask_first(cpumask_of_node(ps.ppd->dd->node)); + /* insure a pre-built packet is handled */ + ps.s_txreq = get_waiting_verbs_txreq(qp); + do { + /* Check for a constructed packet to be sent. */ + if (qp->s_hdrwords != 0) { + spin_unlock_irqrestore(&qp->s_lock, ps.flags); + /* + * If the packet cannot be sent now, return and + * the send tasklet will be woken up later. + */ + if (hfi1_verbs_send(qp, &ps)) + return; + /* Record that s_hdr is empty. */ + qp->s_hdrwords = 0; + /* allow other tasks to run */ + if (unlikely(time_after(jiffies, timeout))) { + if (workqueue_congested(cpu, + ps.ppd->hfi1_wq)) { + spin_lock_irqsave( + &qp->s_lock, + ps.flags); + qp->s_flags &= ~RVT_S_BUSY; + hfi1_schedule_send(qp); + spin_unlock_irqrestore( + &qp->s_lock, + ps.flags); + this_cpu_inc( + *ps.ppd->dd->send_schedule); + return; + } + if (!irqs_disabled()) { + cond_resched(); + this_cpu_inc( + *ps.ppd->dd->send_schedule); + } + timeout = jiffies + (timeout_int) / 8; + } + spin_lock_irqsave(&qp->s_lock, ps.flags); + } + } while (make_req(qp, &ps)); + + spin_unlock_irqrestore(&qp->s_lock, ps.flags); +} + +/* + * This should be called with s_lock held. + */ +void hfi1_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe, + enum ib_wc_status status) +{ + u32 old_last, last; + unsigned i; + + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND)) + return; + + last = qp->s_last; + old_last = last; + if (++last >= qp->s_size) + last = 0; + qp->s_last = last; + /* See post_send() */ + barrier(); + for (i = 0; i < wqe->wr.num_sge; i++) { + struct rvt_sge *sge = &wqe->sg_list[i]; + + rvt_put_mr(sge->mr); + } + if (qp->ibqp.qp_type == IB_QPT_UD || + qp->ibqp.qp_type == IB_QPT_SMI || + qp->ibqp.qp_type == IB_QPT_GSI) + atomic_dec(&ibah_to_rvtah(wqe->ud_wr.ah)->refcount); + + /* See ch. 11.2.4.1 and 10.7.3.1 */ + if (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) || + (wqe->wr.send_flags & IB_SEND_SIGNALED) || + status != IB_WC_SUCCESS) { + struct ib_wc wc; + + memset(&wc, 0, sizeof(wc)); + wc.wr_id = wqe->wr.wr_id; + wc.status = status; + wc.opcode = ib_hfi1_wc_opcode[wqe->wr.opcode]; + wc.qp = &qp->ibqp; + if (status == IB_WC_SUCCESS) + wc.byte_len = wqe->length; + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &wc, + status != IB_WC_SUCCESS); + } + + if (qp->s_acked == old_last) + qp->s_acked = last; + if (qp->s_cur == old_last) + qp->s_cur = last; + if (qp->s_tail == old_last) + qp->s_tail = last; + if (qp->state == IB_QPS_SQD && last == qp->s_cur) + qp->s_draining = 0; +} diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c new file mode 100644 index 000000000..f9befc05b --- /dev/null +++ b/drivers/infiniband/hw/hfi1/sdma.c @@ -0,0 +1,3054 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <linux/spinlock.h> +#include <linux/seqlock.h> +#include <linux/netdevice.h> +#include <linux/moduleparam.h> +#include <linux/bitops.h> +#include <linux/timer.h> +#include <linux/vmalloc.h> +#include <linux/highmem.h> + +#include "hfi.h" +#include "common.h" +#include "qp.h" +#include "sdma.h" +#include "iowait.h" +#include "trace.h" + +/* must be a power of 2 >= 64 <= 32768 */ +#define SDMA_DESCQ_CNT 2048 +#define SDMA_DESC_INTR 64 +#define INVALID_TAIL 0xffff + +static uint sdma_descq_cnt = SDMA_DESCQ_CNT; +module_param(sdma_descq_cnt, uint, S_IRUGO); +MODULE_PARM_DESC(sdma_descq_cnt, "Number of SDMA descq entries"); + +static uint sdma_idle_cnt = 250; +module_param(sdma_idle_cnt, uint, S_IRUGO); +MODULE_PARM_DESC(sdma_idle_cnt, "sdma interrupt idle delay (ns,default 250)"); + +uint mod_num_sdma; +module_param_named(num_sdma, mod_num_sdma, uint, S_IRUGO); +MODULE_PARM_DESC(num_sdma, "Set max number SDMA engines to use"); + +static uint sdma_desct_intr = SDMA_DESC_INTR; +module_param_named(desct_intr, sdma_desct_intr, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(desct_intr, "Number of SDMA descriptor before interrupt"); + +#define SDMA_WAIT_BATCH_SIZE 20 +/* max wait time for a SDMA engine to indicate it has halted */ +#define SDMA_ERR_HALT_TIMEOUT 10 /* ms */ +/* all SDMA engine errors that cause a halt */ + +#define SD(name) SEND_DMA_##name +#define ALL_SDMA_ENG_HALT_ERRS \ + (SD(ENG_ERR_STATUS_SDMA_WRONG_DW_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_GEN_MISMATCH_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_TOO_LONG_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_TAIL_OUT_OF_BOUNDS_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_FIRST_DESC_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_MEM_READ_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_HALT_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_LENGTH_MISMATCH_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_PACKET_DESC_OVERFLOW_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_HEADER_SELECT_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_HEADER_ADDRESS_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_HEADER_LENGTH_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_TIMEOUT_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_DESC_TABLE_UNC_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_ASSEMBLY_UNC_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_PACKET_TRACKING_UNC_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_HEADER_STORAGE_UNC_ERR_SMASK) \ + | SD(ENG_ERR_STATUS_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SMASK)) + +/* sdma_sendctrl operations */ +#define SDMA_SENDCTRL_OP_ENABLE BIT(0) +#define SDMA_SENDCTRL_OP_INTENABLE BIT(1) +#define SDMA_SENDCTRL_OP_HALT BIT(2) +#define SDMA_SENDCTRL_OP_CLEANUP BIT(3) + +/* handle long defines */ +#define SDMA_EGRESS_PACKET_OCCUPANCY_SMASK \ +SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SMASK +#define SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT \ +SEND_EGRESS_SEND_DMA_STATUS_SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT + +static const char * const sdma_state_names[] = { + [sdma_state_s00_hw_down] = "s00_HwDown", + [sdma_state_s10_hw_start_up_halt_wait] = "s10_HwStartUpHaltWait", + [sdma_state_s15_hw_start_up_clean_wait] = "s15_HwStartUpCleanWait", + [sdma_state_s20_idle] = "s20_Idle", + [sdma_state_s30_sw_clean_up_wait] = "s30_SwCleanUpWait", + [sdma_state_s40_hw_clean_up_wait] = "s40_HwCleanUpWait", + [sdma_state_s50_hw_halt_wait] = "s50_HwHaltWait", + [sdma_state_s60_idle_halt_wait] = "s60_IdleHaltWait", + [sdma_state_s80_hw_freeze] = "s80_HwFreeze", + [sdma_state_s82_freeze_sw_clean] = "s82_FreezeSwClean", + [sdma_state_s99_running] = "s99_Running", +}; + +#ifdef CONFIG_SDMA_VERBOSITY +static const char * const sdma_event_names[] = { + [sdma_event_e00_go_hw_down] = "e00_GoHwDown", + [sdma_event_e10_go_hw_start] = "e10_GoHwStart", + [sdma_event_e15_hw_halt_done] = "e15_HwHaltDone", + [sdma_event_e25_hw_clean_up_done] = "e25_HwCleanUpDone", + [sdma_event_e30_go_running] = "e30_GoRunning", + [sdma_event_e40_sw_cleaned] = "e40_SwCleaned", + [sdma_event_e50_hw_cleaned] = "e50_HwCleaned", + [sdma_event_e60_hw_halted] = "e60_HwHalted", + [sdma_event_e70_go_idle] = "e70_GoIdle", + [sdma_event_e80_hw_freeze] = "e80_HwFreeze", + [sdma_event_e81_hw_frozen] = "e81_HwFrozen", + [sdma_event_e82_hw_unfreeze] = "e82_HwUnfreeze", + [sdma_event_e85_link_down] = "e85_LinkDown", + [sdma_event_e90_sw_halted] = "e90_SwHalted", +}; +#endif + +static const struct sdma_set_state_action sdma_action_table[] = { + [sdma_state_s00_hw_down] = { + .go_s99_running_tofalse = 1, + .op_enable = 0, + .op_intenable = 0, + .op_halt = 0, + .op_cleanup = 0, + }, + [sdma_state_s10_hw_start_up_halt_wait] = { + .op_enable = 0, + .op_intenable = 0, + .op_halt = 1, + .op_cleanup = 0, + }, + [sdma_state_s15_hw_start_up_clean_wait] = { + .op_enable = 0, + .op_intenable = 1, + .op_halt = 0, + .op_cleanup = 1, + }, + [sdma_state_s20_idle] = { + .op_enable = 0, + .op_intenable = 1, + .op_halt = 0, + .op_cleanup = 0, + }, + [sdma_state_s30_sw_clean_up_wait] = { + .op_enable = 0, + .op_intenable = 0, + .op_halt = 0, + .op_cleanup = 0, + }, + [sdma_state_s40_hw_clean_up_wait] = { + .op_enable = 0, + .op_intenable = 0, + .op_halt = 0, + .op_cleanup = 1, + }, + [sdma_state_s50_hw_halt_wait] = { + .op_enable = 0, + .op_intenable = 0, + .op_halt = 0, + .op_cleanup = 0, + }, + [sdma_state_s60_idle_halt_wait] = { + .go_s99_running_tofalse = 1, + .op_enable = 0, + .op_intenable = 0, + .op_halt = 1, + .op_cleanup = 0, + }, + [sdma_state_s80_hw_freeze] = { + .op_enable = 0, + .op_intenable = 0, + .op_halt = 0, + .op_cleanup = 0, + }, + [sdma_state_s82_freeze_sw_clean] = { + .op_enable = 0, + .op_intenable = 0, + .op_halt = 0, + .op_cleanup = 0, + }, + [sdma_state_s99_running] = { + .op_enable = 1, + .op_intenable = 1, + .op_halt = 0, + .op_cleanup = 0, + .go_s99_running_totrue = 1, + }, +}; + +#define SDMA_TAIL_UPDATE_THRESH 0x1F + +/* declare all statics here rather than keep sorting */ +static void sdma_complete(struct kref *); +static void sdma_finalput(struct sdma_state *); +static void sdma_get(struct sdma_state *); +static void sdma_hw_clean_up_task(unsigned long); +static void sdma_put(struct sdma_state *); +static void sdma_set_state(struct sdma_engine *, enum sdma_states); +static void sdma_start_hw_clean_up(struct sdma_engine *); +static void sdma_sw_clean_up_task(unsigned long); +static void sdma_sendctrl(struct sdma_engine *, unsigned); +static void init_sdma_regs(struct sdma_engine *, u32, uint); +static void sdma_process_event( + struct sdma_engine *sde, + enum sdma_events event); +static void __sdma_process_event( + struct sdma_engine *sde, + enum sdma_events event); +static void dump_sdma_state(struct sdma_engine *sde); +static void sdma_make_progress(struct sdma_engine *sde, u64 status); +static void sdma_desc_avail(struct sdma_engine *sde, unsigned avail); +static void sdma_flush_descq(struct sdma_engine *sde); + +/** + * sdma_state_name() - return state string from enum + * @state: state + */ +static const char *sdma_state_name(enum sdma_states state) +{ + return sdma_state_names[state]; +} + +static void sdma_get(struct sdma_state *ss) +{ + kref_get(&ss->kref); +} + +static void sdma_complete(struct kref *kref) +{ + struct sdma_state *ss = + container_of(kref, struct sdma_state, kref); + + complete(&ss->comp); +} + +static void sdma_put(struct sdma_state *ss) +{ + kref_put(&ss->kref, sdma_complete); +} + +static void sdma_finalput(struct sdma_state *ss) +{ + sdma_put(ss); + wait_for_completion(&ss->comp); +} + +static inline void write_sde_csr( + struct sdma_engine *sde, + u32 offset0, + u64 value) +{ + write_kctxt_csr(sde->dd, sde->this_idx, offset0, value); +} + +static inline u64 read_sde_csr( + struct sdma_engine *sde, + u32 offset0) +{ + return read_kctxt_csr(sde->dd, sde->this_idx, offset0); +} + +/* + * sdma_wait_for_packet_egress() - wait for the VL FIFO occupancy for + * sdma engine 'sde' to drop to 0. + */ +static void sdma_wait_for_packet_egress(struct sdma_engine *sde, + int pause) +{ + u64 off = 8 * sde->this_idx; + struct hfi1_devdata *dd = sde->dd; + int lcnt = 0; + u64 reg_prev; + u64 reg = 0; + + while (1) { + reg_prev = reg; + reg = read_csr(dd, off + SEND_EGRESS_SEND_DMA_STATUS); + + reg &= SDMA_EGRESS_PACKET_OCCUPANCY_SMASK; + reg >>= SDMA_EGRESS_PACKET_OCCUPANCY_SHIFT; + if (reg == 0) + break; + /* counter is reest if accupancy count changes */ + if (reg != reg_prev) + lcnt = 0; + if (lcnt++ > 500) { + /* timed out - bounce the link */ + dd_dev_err(dd, "%s: engine %u timeout waiting for packets to egress, remaining count %u, bouncing link\n", + __func__, sde->this_idx, (u32)reg); + queue_work(dd->pport->hfi1_wq, + &dd->pport->link_bounce_work); + break; + } + udelay(1); + } +} + +/* + * sdma_wait() - wait for packet egress to complete for all SDMA engines, + * and pause for credit return. + */ +void sdma_wait(struct hfi1_devdata *dd) +{ + int i; + + for (i = 0; i < dd->num_sdma; i++) { + struct sdma_engine *sde = &dd->per_sdma[i]; + + sdma_wait_for_packet_egress(sde, 0); + } +} + +static inline void sdma_set_desc_cnt(struct sdma_engine *sde, unsigned cnt) +{ + u64 reg; + + if (!(sde->dd->flags & HFI1_HAS_SDMA_TIMEOUT)) + return; + reg = cnt; + reg &= SD(DESC_CNT_CNT_MASK); + reg <<= SD(DESC_CNT_CNT_SHIFT); + write_sde_csr(sde, SD(DESC_CNT), reg); +} + +static inline void complete_tx(struct sdma_engine *sde, + struct sdma_txreq *tx, + int res) +{ + /* protect against complete modifying */ + struct iowait *wait = tx->wait; + callback_t complete = tx->complete; + +#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER + trace_hfi1_sdma_out_sn(sde, tx->sn); + if (WARN_ON_ONCE(sde->head_sn != tx->sn)) + dd_dev_err(sde->dd, "expected %llu got %llu\n", + sde->head_sn, tx->sn); + sde->head_sn++; +#endif + sdma_txclean(sde->dd, tx); + if (complete) + (*complete)(tx, res); + if (wait && iowait_sdma_dec(wait)) + iowait_drain_wakeup(wait); +} + +/* + * Complete all the sdma requests with a SDMA_TXREQ_S_ABORTED status + * + * Depending on timing there can be txreqs in two places: + * - in the descq ring + * - in the flush list + * + * To avoid ordering issues the descq ring needs to be flushed + * first followed by the flush list. + * + * This routine is called from two places + * - From a work queue item + * - Directly from the state machine just before setting the + * state to running + * + * Must be called with head_lock held + * + */ +static void sdma_flush(struct sdma_engine *sde) +{ + struct sdma_txreq *txp, *txp_next; + LIST_HEAD(flushlist); + unsigned long flags; + + /* flush from head to tail */ + sdma_flush_descq(sde); + spin_lock_irqsave(&sde->flushlist_lock, flags); + /* copy flush list */ + list_for_each_entry_safe(txp, txp_next, &sde->flushlist, list) { + list_del_init(&txp->list); + list_add_tail(&txp->list, &flushlist); + } + spin_unlock_irqrestore(&sde->flushlist_lock, flags); + /* flush from flush list */ + list_for_each_entry_safe(txp, txp_next, &flushlist, list) + complete_tx(sde, txp, SDMA_TXREQ_S_ABORTED); +} + +/* + * Fields a work request for flushing the descq ring + * and the flush list + * + * If the engine has been brought to running during + * the scheduling delay, the flush is ignored, assuming + * that the process of bringing the engine to running + * would have done this flush prior to going to running. + * + */ +static void sdma_field_flush(struct work_struct *work) +{ + unsigned long flags; + struct sdma_engine *sde = + container_of(work, struct sdma_engine, flush_worker); + + write_seqlock_irqsave(&sde->head_lock, flags); + if (!__sdma_running(sde)) + sdma_flush(sde); + write_sequnlock_irqrestore(&sde->head_lock, flags); +} + +static void sdma_err_halt_wait(struct work_struct *work) +{ + struct sdma_engine *sde = container_of(work, struct sdma_engine, + err_halt_worker); + u64 statuscsr; + unsigned long timeout; + + timeout = jiffies + msecs_to_jiffies(SDMA_ERR_HALT_TIMEOUT); + while (1) { + statuscsr = read_sde_csr(sde, SD(STATUS)); + statuscsr &= SD(STATUS_ENG_HALTED_SMASK); + if (statuscsr) + break; + if (time_after(jiffies, timeout)) { + dd_dev_err(sde->dd, + "SDMA engine %d - timeout waiting for engine to halt\n", + sde->this_idx); + /* + * Continue anyway. This could happen if there was + * an uncorrectable error in the wrong spot. + */ + break; + } + usleep_range(80, 120); + } + + sdma_process_event(sde, sdma_event_e15_hw_halt_done); +} + +static void sdma_err_progress_check_schedule(struct sdma_engine *sde) +{ + if (!is_bx(sde->dd) && HFI1_CAP_IS_KSET(SDMA_AHG)) { + unsigned index; + struct hfi1_devdata *dd = sde->dd; + + for (index = 0; index < dd->num_sdma; index++) { + struct sdma_engine *curr_sdma = &dd->per_sdma[index]; + + if (curr_sdma != sde) + curr_sdma->progress_check_head = + curr_sdma->descq_head; + } + dd_dev_err(sde->dd, + "SDMA engine %d - check scheduled\n", + sde->this_idx); + mod_timer(&sde->err_progress_check_timer, jiffies + 10); + } +} + +static void sdma_err_progress_check(unsigned long data) +{ + unsigned index; + struct sdma_engine *sde = (struct sdma_engine *)data; + + dd_dev_err(sde->dd, "SDE progress check event\n"); + for (index = 0; index < sde->dd->num_sdma; index++) { + struct sdma_engine *curr_sde = &sde->dd->per_sdma[index]; + unsigned long flags; + + /* check progress on each engine except the current one */ + if (curr_sde == sde) + continue; + /* + * We must lock interrupts when acquiring sde->lock, + * to avoid a deadlock if interrupt triggers and spins on + * the same lock on same CPU + */ + spin_lock_irqsave(&curr_sde->tail_lock, flags); + write_seqlock(&curr_sde->head_lock); + + /* skip non-running queues */ + if (curr_sde->state.current_state != sdma_state_s99_running) { + write_sequnlock(&curr_sde->head_lock); + spin_unlock_irqrestore(&curr_sde->tail_lock, flags); + continue; + } + + if ((curr_sde->descq_head != curr_sde->descq_tail) && + (curr_sde->descq_head == + curr_sde->progress_check_head)) + __sdma_process_event(curr_sde, + sdma_event_e90_sw_halted); + write_sequnlock(&curr_sde->head_lock); + spin_unlock_irqrestore(&curr_sde->tail_lock, flags); + } + schedule_work(&sde->err_halt_worker); +} + +static void sdma_hw_clean_up_task(unsigned long opaque) +{ + struct sdma_engine *sde = (struct sdma_engine *)opaque; + u64 statuscsr; + + while (1) { +#ifdef CONFIG_SDMA_VERBOSITY + dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", + sde->this_idx, slashstrip(__FILE__), __LINE__, + __func__); +#endif + statuscsr = read_sde_csr(sde, SD(STATUS)); + statuscsr &= SD(STATUS_ENG_CLEANED_UP_SMASK); + if (statuscsr) + break; + udelay(10); + } + + sdma_process_event(sde, sdma_event_e25_hw_clean_up_done); +} + +static inline struct sdma_txreq *get_txhead(struct sdma_engine *sde) +{ + smp_read_barrier_depends(); /* see sdma_update_tail() */ + return sde->tx_ring[sde->tx_head & sde->sdma_mask]; +} + +/* + * flush ring for recovery + */ +static void sdma_flush_descq(struct sdma_engine *sde) +{ + u16 head, tail; + int progress = 0; + struct sdma_txreq *txp = get_txhead(sde); + + /* The reason for some of the complexity of this code is that + * not all descriptors have corresponding txps. So, we have to + * be able to skip over descs until we wander into the range of + * the next txp on the list. + */ + head = sde->descq_head & sde->sdma_mask; + tail = sde->descq_tail & sde->sdma_mask; + while (head != tail) { + /* advance head, wrap if needed */ + head = ++sde->descq_head & sde->sdma_mask; + /* if now past this txp's descs, do the callback */ + if (txp && txp->next_descq_idx == head) { + /* remove from list */ + sde->tx_ring[sde->tx_head++ & sde->sdma_mask] = NULL; + complete_tx(sde, txp, SDMA_TXREQ_S_ABORTED); + trace_hfi1_sdma_progress(sde, head, tail, txp); + txp = get_txhead(sde); + } + progress++; + } + if (progress) + sdma_desc_avail(sde, sdma_descq_freecnt(sde)); +} + +static void sdma_sw_clean_up_task(unsigned long opaque) +{ + struct sdma_engine *sde = (struct sdma_engine *)opaque; + unsigned long flags; + + spin_lock_irqsave(&sde->tail_lock, flags); + write_seqlock(&sde->head_lock); + + /* + * At this point, the following should always be true: + * - We are halted, so no more descriptors are getting retired. + * - We are not running, so no one is submitting new work. + * - Only we can send the e40_sw_cleaned, so we can't start + * running again until we say so. So, the active list and + * descq are ours to play with. + */ + + /* + * In the error clean up sequence, software clean must be called + * before the hardware clean so we can use the hardware head in + * the progress routine. A hardware clean or SPC unfreeze will + * reset the hardware head. + * + * Process all retired requests. The progress routine will use the + * latest physical hardware head - we are not running so speed does + * not matter. + */ + sdma_make_progress(sde, 0); + + sdma_flush(sde); + + /* + * Reset our notion of head and tail. + * Note that the HW registers have been reset via an earlier + * clean up. + */ + sde->descq_tail = 0; + sde->descq_head = 0; + sde->desc_avail = sdma_descq_freecnt(sde); + *sde->head_dma = 0; + + __sdma_process_event(sde, sdma_event_e40_sw_cleaned); + + write_sequnlock(&sde->head_lock); + spin_unlock_irqrestore(&sde->tail_lock, flags); +} + +static void sdma_sw_tear_down(struct sdma_engine *sde) +{ + struct sdma_state *ss = &sde->state; + + /* Releasing this reference means the state machine has stopped. */ + sdma_put(ss); + + /* stop waiting for all unfreeze events to complete */ + atomic_set(&sde->dd->sdma_unfreeze_count, -1); + wake_up_interruptible(&sde->dd->sdma_unfreeze_wq); +} + +static void sdma_start_hw_clean_up(struct sdma_engine *sde) +{ + tasklet_hi_schedule(&sde->sdma_hw_clean_up_task); +} + +static void sdma_set_state(struct sdma_engine *sde, + enum sdma_states next_state) +{ + struct sdma_state *ss = &sde->state; + const struct sdma_set_state_action *action = sdma_action_table; + unsigned op = 0; + + trace_hfi1_sdma_state( + sde, + sdma_state_names[ss->current_state], + sdma_state_names[next_state]); + + /* debugging bookkeeping */ + ss->previous_state = ss->current_state; + ss->previous_op = ss->current_op; + ss->current_state = next_state; + + if (ss->previous_state != sdma_state_s99_running && + next_state == sdma_state_s99_running) + sdma_flush(sde); + + if (action[next_state].op_enable) + op |= SDMA_SENDCTRL_OP_ENABLE; + + if (action[next_state].op_intenable) + op |= SDMA_SENDCTRL_OP_INTENABLE; + + if (action[next_state].op_halt) + op |= SDMA_SENDCTRL_OP_HALT; + + if (action[next_state].op_cleanup) + op |= SDMA_SENDCTRL_OP_CLEANUP; + + if (action[next_state].go_s99_running_tofalse) + ss->go_s99_running = 0; + + if (action[next_state].go_s99_running_totrue) + ss->go_s99_running = 1; + + ss->current_op = op; + sdma_sendctrl(sde, ss->current_op); +} + +/** + * sdma_get_descq_cnt() - called when device probed + * + * Return a validated descq count. + * + * This is currently only used in the verbs initialization to build the tx + * list. + * + * This will probably be deleted in favor of a more scalable approach to + * alloc tx's. + * + */ +u16 sdma_get_descq_cnt(void) +{ + u16 count = sdma_descq_cnt; + + if (!count) + return SDMA_DESCQ_CNT; + /* count must be a power of 2 greater than 64 and less than + * 32768. Otherwise return default. + */ + if (!is_power_of_2(count)) + return SDMA_DESCQ_CNT; + if (count < 64 || count > 32768) + return SDMA_DESCQ_CNT; + return count; +} + +/** + * sdma_select_engine_vl() - select sdma engine + * @dd: devdata + * @selector: a spreading factor + * @vl: this vl + * + * + * This function returns an engine based on the selector and a vl. The + * mapping fields are protected by RCU. + */ +struct sdma_engine *sdma_select_engine_vl( + struct hfi1_devdata *dd, + u32 selector, + u8 vl) +{ + struct sdma_vl_map *m; + struct sdma_map_elem *e; + struct sdma_engine *rval; + + /* NOTE This should only happen if SC->VL changed after the initial + * checks on the QP/AH + * Default will return engine 0 below + */ + if (vl >= num_vls) { + rval = NULL; + goto done; + } + + rcu_read_lock(); + m = rcu_dereference(dd->sdma_map); + if (unlikely(!m)) { + rcu_read_unlock(); + return &dd->per_sdma[0]; + } + e = m->map[vl & m->mask]; + rval = e->sde[selector & e->mask]; + rcu_read_unlock(); + +done: + rval = !rval ? &dd->per_sdma[0] : rval; + trace_hfi1_sdma_engine_select(dd, selector, vl, rval->this_idx); + return rval; +} + +/** + * sdma_select_engine_sc() - select sdma engine + * @dd: devdata + * @selector: a spreading factor + * @sc5: the 5 bit sc + * + * + * This function returns an engine based on the selector and an sc. + */ +struct sdma_engine *sdma_select_engine_sc( + struct hfi1_devdata *dd, + u32 selector, + u8 sc5) +{ + u8 vl = sc_to_vlt(dd, sc5); + + return sdma_select_engine_vl(dd, selector, vl); +} + +/* + * Free the indicated map struct + */ +static void sdma_map_free(struct sdma_vl_map *m) +{ + int i; + + for (i = 0; m && i < m->actual_vls; i++) + kfree(m->map[i]); + kfree(m); +} + +/* + * Handle RCU callback + */ +static void sdma_map_rcu_callback(struct rcu_head *list) +{ + struct sdma_vl_map *m = container_of(list, struct sdma_vl_map, list); + + sdma_map_free(m); +} + +/** + * sdma_map_init - called when # vls change + * @dd: hfi1_devdata + * @port: port number + * @num_vls: number of vls + * @vl_engines: per vl engine mapping (optional) + * + * This routine changes the mapping based on the number of vls. + * + * vl_engines is used to specify a non-uniform vl/engine loading. NULL + * implies auto computing the loading and giving each VLs a uniform + * distribution of engines per VL. + * + * The auto algorithm computes the sde_per_vl and the number of extra + * engines. Any extra engines are added from the last VL on down. + * + * rcu locking is used here to control access to the mapping fields. + * + * If either the num_vls or num_sdma are non-power of 2, the array sizes + * in the struct sdma_vl_map and the struct sdma_map_elem are rounded + * up to the next highest power of 2 and the first entry is reused + * in a round robin fashion. + * + * If an error occurs the map change is not done and the mapping is + * not changed. + * + */ +int sdma_map_init(struct hfi1_devdata *dd, u8 port, u8 num_vls, u8 *vl_engines) +{ + int i, j; + int extra, sde_per_vl; + int engine = 0; + u8 lvl_engines[OPA_MAX_VLS]; + struct sdma_vl_map *oldmap, *newmap; + + if (!(dd->flags & HFI1_HAS_SEND_DMA)) + return 0; + + if (!vl_engines) { + /* truncate divide */ + sde_per_vl = dd->num_sdma / num_vls; + /* extras */ + extra = dd->num_sdma % num_vls; + vl_engines = lvl_engines; + /* add extras from last vl down */ + for (i = num_vls - 1; i >= 0; i--, extra--) + vl_engines[i] = sde_per_vl + (extra > 0 ? 1 : 0); + } + /* build new map */ + newmap = kzalloc( + sizeof(struct sdma_vl_map) + + roundup_pow_of_two(num_vls) * + sizeof(struct sdma_map_elem *), + GFP_KERNEL); + if (!newmap) + goto bail; + newmap->actual_vls = num_vls; + newmap->vls = roundup_pow_of_two(num_vls); + newmap->mask = (1 << ilog2(newmap->vls)) - 1; + /* initialize back-map */ + for (i = 0; i < TXE_NUM_SDMA_ENGINES; i++) + newmap->engine_to_vl[i] = -1; + for (i = 0; i < newmap->vls; i++) { + /* save for wrap around */ + int first_engine = engine; + + if (i < newmap->actual_vls) { + int sz = roundup_pow_of_two(vl_engines[i]); + + /* only allocate once */ + newmap->map[i] = kzalloc( + sizeof(struct sdma_map_elem) + + sz * sizeof(struct sdma_engine *), + GFP_KERNEL); + if (!newmap->map[i]) + goto bail; + newmap->map[i]->mask = (1 << ilog2(sz)) - 1; + /* assign engines */ + for (j = 0; j < sz; j++) { + newmap->map[i]->sde[j] = + &dd->per_sdma[engine]; + if (++engine >= first_engine + vl_engines[i]) + /* wrap back to first engine */ + engine = first_engine; + } + /* assign back-map */ + for (j = 0; j < vl_engines[i]; j++) + newmap->engine_to_vl[first_engine + j] = i; + } else { + /* just re-use entry without allocating */ + newmap->map[i] = newmap->map[i % num_vls]; + } + engine = first_engine + vl_engines[i]; + } + /* newmap in hand, save old map */ + spin_lock_irq(&dd->sde_map_lock); + oldmap = rcu_dereference_protected(dd->sdma_map, + lockdep_is_held(&dd->sde_map_lock)); + + /* publish newmap */ + rcu_assign_pointer(dd->sdma_map, newmap); + + spin_unlock_irq(&dd->sde_map_lock); + /* success, free any old map after grace period */ + if (oldmap) + call_rcu(&oldmap->list, sdma_map_rcu_callback); + return 0; +bail: + /* free any partial allocation */ + sdma_map_free(newmap); + return -ENOMEM; +} + +/* + * Clean up allocated memory. + * + * This routine is can be called regardless of the success of sdma_init() + * + */ +static void sdma_clean(struct hfi1_devdata *dd, size_t num_engines) +{ + size_t i; + struct sdma_engine *sde; + + if (dd->sdma_pad_dma) { + dma_free_coherent(&dd->pcidev->dev, 4, + (void *)dd->sdma_pad_dma, + dd->sdma_pad_phys); + dd->sdma_pad_dma = NULL; + dd->sdma_pad_phys = 0; + } + if (dd->sdma_heads_dma) { + dma_free_coherent(&dd->pcidev->dev, dd->sdma_heads_size, + (void *)dd->sdma_heads_dma, + dd->sdma_heads_phys); + dd->sdma_heads_dma = NULL; + dd->sdma_heads_phys = 0; + } + for (i = 0; dd->per_sdma && i < num_engines; ++i) { + sde = &dd->per_sdma[i]; + + sde->head_dma = NULL; + sde->head_phys = 0; + + if (sde->descq) { + dma_free_coherent( + &dd->pcidev->dev, + sde->descq_cnt * sizeof(u64[2]), + sde->descq, + sde->descq_phys + ); + sde->descq = NULL; + sde->descq_phys = 0; + } + kvfree(sde->tx_ring); + sde->tx_ring = NULL; + } + spin_lock_irq(&dd->sde_map_lock); + sdma_map_free(rcu_access_pointer(dd->sdma_map)); + RCU_INIT_POINTER(dd->sdma_map, NULL); + spin_unlock_irq(&dd->sde_map_lock); + synchronize_rcu(); + kfree(dd->per_sdma); + dd->per_sdma = NULL; +} + +/** + * sdma_init() - called when device probed + * @dd: hfi1_devdata + * @port: port number (currently only zero) + * + * sdma_init initializes the specified number of engines. + * + * The code initializes each sde, its csrs. Interrupts + * are not required to be enabled. + * + * Returns: + * 0 - success, -errno on failure + */ +int sdma_init(struct hfi1_devdata *dd, u8 port) +{ + unsigned this_idx; + struct sdma_engine *sde; + u16 descq_cnt; + void *curr_head; + struct hfi1_pportdata *ppd = dd->pport + port; + u32 per_sdma_credits; + uint idle_cnt = sdma_idle_cnt; + size_t num_engines = dd->chip_sdma_engines; + + if (!HFI1_CAP_IS_KSET(SDMA)) { + HFI1_CAP_CLEAR(SDMA_AHG); + return 0; + } + if (mod_num_sdma && + /* can't exceed chip support */ + mod_num_sdma <= dd->chip_sdma_engines && + /* count must be >= vls */ + mod_num_sdma >= num_vls) + num_engines = mod_num_sdma; + + dd_dev_info(dd, "SDMA mod_num_sdma: %u\n", mod_num_sdma); + dd_dev_info(dd, "SDMA chip_sdma_engines: %u\n", dd->chip_sdma_engines); + dd_dev_info(dd, "SDMA chip_sdma_mem_size: %u\n", + dd->chip_sdma_mem_size); + + per_sdma_credits = + dd->chip_sdma_mem_size / (num_engines * SDMA_BLOCK_SIZE); + + /* set up freeze waitqueue */ + init_waitqueue_head(&dd->sdma_unfreeze_wq); + atomic_set(&dd->sdma_unfreeze_count, 0); + + descq_cnt = sdma_get_descq_cnt(); + dd_dev_info(dd, "SDMA engines %zu descq_cnt %u\n", + num_engines, descq_cnt); + + /* alloc memory for array of send engines */ + dd->per_sdma = kcalloc(num_engines, sizeof(*dd->per_sdma), GFP_KERNEL); + if (!dd->per_sdma) + return -ENOMEM; + + idle_cnt = ns_to_cclock(dd, idle_cnt); + if (!sdma_desct_intr) + sdma_desct_intr = SDMA_DESC_INTR; + + /* Allocate memory for SendDMA descriptor FIFOs */ + for (this_idx = 0; this_idx < num_engines; ++this_idx) { + sde = &dd->per_sdma[this_idx]; + sde->dd = dd; + sde->ppd = ppd; + sde->this_idx = this_idx; + sde->descq_cnt = descq_cnt; + sde->desc_avail = sdma_descq_freecnt(sde); + sde->sdma_shift = ilog2(descq_cnt); + sde->sdma_mask = (1 << sde->sdma_shift) - 1; + + /* Create a mask specifically for each interrupt source */ + sde->int_mask = (u64)1 << (0 * TXE_NUM_SDMA_ENGINES + + this_idx); + sde->progress_mask = (u64)1 << (1 * TXE_NUM_SDMA_ENGINES + + this_idx); + sde->idle_mask = (u64)1 << (2 * TXE_NUM_SDMA_ENGINES + + this_idx); + /* Create a combined mask to cover all 3 interrupt sources */ + sde->imask = sde->int_mask | sde->progress_mask | + sde->idle_mask; + + spin_lock_init(&sde->tail_lock); + seqlock_init(&sde->head_lock); + spin_lock_init(&sde->senddmactrl_lock); + spin_lock_init(&sde->flushlist_lock); + /* insure there is always a zero bit */ + sde->ahg_bits = 0xfffffffe00000000ULL; + + sdma_set_state(sde, sdma_state_s00_hw_down); + + /* set up reference counting */ + kref_init(&sde->state.kref); + init_completion(&sde->state.comp); + + INIT_LIST_HEAD(&sde->flushlist); + INIT_LIST_HEAD(&sde->dmawait); + + sde->tail_csr = + get_kctxt_csr_addr(dd, this_idx, SD(TAIL)); + + if (idle_cnt) + dd->default_desc1 = + SDMA_DESC1_HEAD_TO_HOST_FLAG; + else + dd->default_desc1 = + SDMA_DESC1_INT_REQ_FLAG; + + tasklet_init(&sde->sdma_hw_clean_up_task, sdma_hw_clean_up_task, + (unsigned long)sde); + + tasklet_init(&sde->sdma_sw_clean_up_task, sdma_sw_clean_up_task, + (unsigned long)sde); + INIT_WORK(&sde->err_halt_worker, sdma_err_halt_wait); + INIT_WORK(&sde->flush_worker, sdma_field_flush); + + sde->progress_check_head = 0; + + setup_timer(&sde->err_progress_check_timer, + sdma_err_progress_check, (unsigned long)sde); + + sde->descq = dma_zalloc_coherent( + &dd->pcidev->dev, + descq_cnt * sizeof(u64[2]), + &sde->descq_phys, + GFP_KERNEL + ); + if (!sde->descq) + goto bail; + sde->tx_ring = + kcalloc(descq_cnt, sizeof(struct sdma_txreq *), + GFP_KERNEL); + if (!sde->tx_ring) + sde->tx_ring = + vzalloc( + sizeof(struct sdma_txreq *) * + descq_cnt); + if (!sde->tx_ring) + goto bail; + } + + dd->sdma_heads_size = L1_CACHE_BYTES * num_engines; + /* Allocate memory for DMA of head registers to memory */ + dd->sdma_heads_dma = dma_zalloc_coherent( + &dd->pcidev->dev, + dd->sdma_heads_size, + &dd->sdma_heads_phys, + GFP_KERNEL + ); + if (!dd->sdma_heads_dma) { + dd_dev_err(dd, "failed to allocate SendDMA head memory\n"); + goto bail; + } + + /* Allocate memory for pad */ + dd->sdma_pad_dma = dma_zalloc_coherent( + &dd->pcidev->dev, + sizeof(u32), + &dd->sdma_pad_phys, + GFP_KERNEL + ); + if (!dd->sdma_pad_dma) { + dd_dev_err(dd, "failed to allocate SendDMA pad memory\n"); + goto bail; + } + + /* assign each engine to different cacheline and init registers */ + curr_head = (void *)dd->sdma_heads_dma; + for (this_idx = 0; this_idx < num_engines; ++this_idx) { + unsigned long phys_offset; + + sde = &dd->per_sdma[this_idx]; + + sde->head_dma = curr_head; + curr_head += L1_CACHE_BYTES; + phys_offset = (unsigned long)sde->head_dma - + (unsigned long)dd->sdma_heads_dma; + sde->head_phys = dd->sdma_heads_phys + phys_offset; + init_sdma_regs(sde, per_sdma_credits, idle_cnt); + } + dd->flags |= HFI1_HAS_SEND_DMA; + dd->flags |= idle_cnt ? HFI1_HAS_SDMA_TIMEOUT : 0; + dd->num_sdma = num_engines; + if (sdma_map_init(dd, port, ppd->vls_operational, NULL)) + goto bail; + dd_dev_info(dd, "SDMA num_sdma: %u\n", dd->num_sdma); + return 0; + +bail: + sdma_clean(dd, num_engines); + return -ENOMEM; +} + +/** + * sdma_all_running() - called when the link goes up + * @dd: hfi1_devdata + * + * This routine moves all engines to the running state. + */ +void sdma_all_running(struct hfi1_devdata *dd) +{ + struct sdma_engine *sde; + unsigned int i; + + /* move all engines to running */ + for (i = 0; i < dd->num_sdma; ++i) { + sde = &dd->per_sdma[i]; + sdma_process_event(sde, sdma_event_e30_go_running); + } +} + +/** + * sdma_all_idle() - called when the link goes down + * @dd: hfi1_devdata + * + * This routine moves all engines to the idle state. + */ +void sdma_all_idle(struct hfi1_devdata *dd) +{ + struct sdma_engine *sde; + unsigned int i; + + /* idle all engines */ + for (i = 0; i < dd->num_sdma; ++i) { + sde = &dd->per_sdma[i]; + sdma_process_event(sde, sdma_event_e70_go_idle); + } +} + +/** + * sdma_start() - called to kick off state processing for all engines + * @dd: hfi1_devdata + * + * This routine is for kicking off the state processing for all required + * sdma engines. Interrupts need to be working at this point. + * + */ +void sdma_start(struct hfi1_devdata *dd) +{ + unsigned i; + struct sdma_engine *sde; + + /* kick off the engines state processing */ + for (i = 0; i < dd->num_sdma; ++i) { + sde = &dd->per_sdma[i]; + sdma_process_event(sde, sdma_event_e10_go_hw_start); + } +} + +/** + * sdma_exit() - used when module is removed + * @dd: hfi1_devdata + */ +void sdma_exit(struct hfi1_devdata *dd) +{ + unsigned this_idx; + struct sdma_engine *sde; + + for (this_idx = 0; dd->per_sdma && this_idx < dd->num_sdma; + ++this_idx) { + sde = &dd->per_sdma[this_idx]; + if (!list_empty(&sde->dmawait)) + dd_dev_err(dd, "sde %u: dmawait list not empty!\n", + sde->this_idx); + sdma_process_event(sde, sdma_event_e00_go_hw_down); + + del_timer_sync(&sde->err_progress_check_timer); + + /* + * This waits for the state machine to exit so it is not + * necessary to kill the sdma_sw_clean_up_task to make sure + * it is not running. + */ + sdma_finalput(&sde->state); + } + sdma_clean(dd, dd->num_sdma); +} + +/* + * unmap the indicated descriptor + */ +static inline void sdma_unmap_desc( + struct hfi1_devdata *dd, + struct sdma_desc *descp) +{ + switch (sdma_mapping_type(descp)) { + case SDMA_MAP_SINGLE: + dma_unmap_single( + &dd->pcidev->dev, + sdma_mapping_addr(descp), + sdma_mapping_len(descp), + DMA_TO_DEVICE); + break; + case SDMA_MAP_PAGE: + dma_unmap_page( + &dd->pcidev->dev, + sdma_mapping_addr(descp), + sdma_mapping_len(descp), + DMA_TO_DEVICE); + break; + } +} + +/* + * return the mode as indicated by the first + * descriptor in the tx. + */ +static inline u8 ahg_mode(struct sdma_txreq *tx) +{ + return (tx->descp[0].qw[1] & SDMA_DESC1_HEADER_MODE_SMASK) + >> SDMA_DESC1_HEADER_MODE_SHIFT; +} + +/** + * sdma_txclean() - clean tx of mappings, descp *kmalloc's + * @dd: hfi1_devdata for unmapping + * @tx: tx request to clean + * + * This is used in the progress routine to clean the tx or + * by the ULP to toss an in-process tx build. + * + * The code can be called multiple times without issue. + * + */ +void sdma_txclean( + struct hfi1_devdata *dd, + struct sdma_txreq *tx) +{ + u16 i; + + if (tx->num_desc) { + u8 skip = 0, mode = ahg_mode(tx); + + /* unmap first */ + sdma_unmap_desc(dd, &tx->descp[0]); + /* determine number of AHG descriptors to skip */ + if (mode > SDMA_AHG_APPLY_UPDATE1) + skip = mode >> 1; + for (i = 1 + skip; i < tx->num_desc; i++) + sdma_unmap_desc(dd, &tx->descp[i]); + tx->num_desc = 0; + } + kfree(tx->coalesce_buf); + tx->coalesce_buf = NULL; + /* kmalloc'ed descp */ + if (unlikely(tx->desc_limit > ARRAY_SIZE(tx->descs))) { + tx->desc_limit = ARRAY_SIZE(tx->descs); + kfree(tx->descp); + } +} + +static inline u16 sdma_gethead(struct sdma_engine *sde) +{ + struct hfi1_devdata *dd = sde->dd; + int use_dmahead; + u16 hwhead; + +#ifdef CONFIG_SDMA_VERBOSITY + dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", + sde->this_idx, slashstrip(__FILE__), __LINE__, __func__); +#endif + +retry: + use_dmahead = HFI1_CAP_IS_KSET(USE_SDMA_HEAD) && __sdma_running(sde) && + (dd->flags & HFI1_HAS_SDMA_TIMEOUT); + hwhead = use_dmahead ? + (u16)le64_to_cpu(*sde->head_dma) : + (u16)read_sde_csr(sde, SD(HEAD)); + + if (unlikely(HFI1_CAP_IS_KSET(SDMA_HEAD_CHECK))) { + u16 cnt; + u16 swtail; + u16 swhead; + int sane; + + swhead = sde->descq_head & sde->sdma_mask; + /* this code is really bad for cache line trading */ + swtail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask; + cnt = sde->descq_cnt; + + if (swhead < swtail) + /* not wrapped */ + sane = (hwhead >= swhead) & (hwhead <= swtail); + else if (swhead > swtail) + /* wrapped around */ + sane = ((hwhead >= swhead) && (hwhead < cnt)) || + (hwhead <= swtail); + else + /* empty */ + sane = (hwhead == swhead); + + if (unlikely(!sane)) { + dd_dev_err(dd, "SDMA(%u) bad head (%s) hwhd=%hu swhd=%hu swtl=%hu cnt=%hu\n", + sde->this_idx, + use_dmahead ? "dma" : "kreg", + hwhead, swhead, swtail, cnt); + if (use_dmahead) { + /* try one more time, using csr */ + use_dmahead = 0; + goto retry; + } + /* proceed as if no progress */ + hwhead = swhead; + } + } + return hwhead; +} + +/* + * This is called when there are send DMA descriptors that might be + * available. + * + * This is called with head_lock held. + */ +static void sdma_desc_avail(struct sdma_engine *sde, unsigned avail) +{ + struct iowait *wait, *nw; + struct iowait *waits[SDMA_WAIT_BATCH_SIZE]; + unsigned i, n = 0, seq; + struct sdma_txreq *stx; + struct hfi1_ibdev *dev = &sde->dd->verbs_dev; + +#ifdef CONFIG_SDMA_VERBOSITY + dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", sde->this_idx, + slashstrip(__FILE__), __LINE__, __func__); + dd_dev_err(sde->dd, "avail: %u\n", avail); +#endif + + do { + seq = read_seqbegin(&dev->iowait_lock); + if (!list_empty(&sde->dmawait)) { + /* at least one item */ + write_seqlock(&dev->iowait_lock); + /* Harvest waiters wanting DMA descriptors */ + list_for_each_entry_safe( + wait, + nw, + &sde->dmawait, + list) { + u16 num_desc = 0; + + if (!wait->wakeup) + continue; + if (n == ARRAY_SIZE(waits)) + break; + if (!list_empty(&wait->tx_head)) { + stx = list_first_entry( + &wait->tx_head, + struct sdma_txreq, + list); + num_desc = stx->num_desc; + } + if (num_desc > avail) + break; + avail -= num_desc; + list_del_init(&wait->list); + waits[n++] = wait; + } + write_sequnlock(&dev->iowait_lock); + break; + } + } while (read_seqretry(&dev->iowait_lock, seq)); + + for (i = 0; i < n; i++) + waits[i]->wakeup(waits[i], SDMA_AVAIL_REASON); +} + +/* head_lock must be held */ +static void sdma_make_progress(struct sdma_engine *sde, u64 status) +{ + struct sdma_txreq *txp = NULL; + int progress = 0; + u16 hwhead, swhead; + int idle_check_done = 0; + + hwhead = sdma_gethead(sde); + + /* The reason for some of the complexity of this code is that + * not all descriptors have corresponding txps. So, we have to + * be able to skip over descs until we wander into the range of + * the next txp on the list. + */ + +retry: + txp = get_txhead(sde); + swhead = sde->descq_head & sde->sdma_mask; + trace_hfi1_sdma_progress(sde, hwhead, swhead, txp); + while (swhead != hwhead) { + /* advance head, wrap if needed */ + swhead = ++sde->descq_head & sde->sdma_mask; + + /* if now past this txp's descs, do the callback */ + if (txp && txp->next_descq_idx == swhead) { + /* remove from list */ + sde->tx_ring[sde->tx_head++ & sde->sdma_mask] = NULL; + complete_tx(sde, txp, SDMA_TXREQ_S_OK); + /* see if there is another txp */ + txp = get_txhead(sde); + } + trace_hfi1_sdma_progress(sde, hwhead, swhead, txp); + progress++; + } + + /* + * The SDMA idle interrupt is not guaranteed to be ordered with respect + * to updates to the the dma_head location in host memory. The head + * value read might not be fully up to date. If there are pending + * descriptors and the SDMA idle interrupt fired then read from the + * CSR SDMA head instead to get the latest value from the hardware. + * The hardware SDMA head should be read at most once in this invocation + * of sdma_make_progress(..) which is ensured by idle_check_done flag + */ + if ((status & sde->idle_mask) && !idle_check_done) { + u16 swtail; + + swtail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask; + if (swtail != hwhead) { + hwhead = (u16)read_sde_csr(sde, SD(HEAD)); + idle_check_done = 1; + goto retry; + } + } + + sde->last_status = status; + if (progress) + sdma_desc_avail(sde, sdma_descq_freecnt(sde)); +} + +/* + * sdma_engine_interrupt() - interrupt handler for engine + * @sde: sdma engine + * @status: sdma interrupt reason + * + * Status is a mask of the 3 possible interrupts for this engine. It will + * contain bits _only_ for this SDMA engine. It will contain at least one + * bit, it may contain more. + */ +void sdma_engine_interrupt(struct sdma_engine *sde, u64 status) +{ + trace_hfi1_sdma_engine_interrupt(sde, status); + write_seqlock(&sde->head_lock); + sdma_set_desc_cnt(sde, sdma_desct_intr); + if (status & sde->idle_mask) + sde->idle_int_cnt++; + else if (status & sde->progress_mask) + sde->progress_int_cnt++; + else if (status & sde->int_mask) + sde->sdma_int_cnt++; + sdma_make_progress(sde, status); + write_sequnlock(&sde->head_lock); +} + +/** + * sdma_engine_error() - error handler for engine + * @sde: sdma engine + * @status: sdma interrupt reason + */ +void sdma_engine_error(struct sdma_engine *sde, u64 status) +{ + unsigned long flags; + +#ifdef CONFIG_SDMA_VERBOSITY + dd_dev_err(sde->dd, "CONFIG SDMA(%u) error status 0x%llx state %s\n", + sde->this_idx, + (unsigned long long)status, + sdma_state_names[sde->state.current_state]); +#endif + spin_lock_irqsave(&sde->tail_lock, flags); + write_seqlock(&sde->head_lock); + if (status & ALL_SDMA_ENG_HALT_ERRS) + __sdma_process_event(sde, sdma_event_e60_hw_halted); + if (status & ~SD(ENG_ERR_STATUS_SDMA_HALT_ERR_SMASK)) { + dd_dev_err(sde->dd, + "SDMA (%u) engine error: 0x%llx state %s\n", + sde->this_idx, + (unsigned long long)status, + sdma_state_names[sde->state.current_state]); + dump_sdma_state(sde); + } + write_sequnlock(&sde->head_lock); + spin_unlock_irqrestore(&sde->tail_lock, flags); +} + +static void sdma_sendctrl(struct sdma_engine *sde, unsigned op) +{ + u64 set_senddmactrl = 0; + u64 clr_senddmactrl = 0; + unsigned long flags; + +#ifdef CONFIG_SDMA_VERBOSITY + dd_dev_err(sde->dd, "CONFIG SDMA(%u) senddmactrl E=%d I=%d H=%d C=%d\n", + sde->this_idx, + (op & SDMA_SENDCTRL_OP_ENABLE) ? 1 : 0, + (op & SDMA_SENDCTRL_OP_INTENABLE) ? 1 : 0, + (op & SDMA_SENDCTRL_OP_HALT) ? 1 : 0, + (op & SDMA_SENDCTRL_OP_CLEANUP) ? 1 : 0); +#endif + + if (op & SDMA_SENDCTRL_OP_ENABLE) + set_senddmactrl |= SD(CTRL_SDMA_ENABLE_SMASK); + else + clr_senddmactrl |= SD(CTRL_SDMA_ENABLE_SMASK); + + if (op & SDMA_SENDCTRL_OP_INTENABLE) + set_senddmactrl |= SD(CTRL_SDMA_INT_ENABLE_SMASK); + else + clr_senddmactrl |= SD(CTRL_SDMA_INT_ENABLE_SMASK); + + if (op & SDMA_SENDCTRL_OP_HALT) + set_senddmactrl |= SD(CTRL_SDMA_HALT_SMASK); + else + clr_senddmactrl |= SD(CTRL_SDMA_HALT_SMASK); + + spin_lock_irqsave(&sde->senddmactrl_lock, flags); + + sde->p_senddmactrl |= set_senddmactrl; + sde->p_senddmactrl &= ~clr_senddmactrl; + + if (op & SDMA_SENDCTRL_OP_CLEANUP) + write_sde_csr(sde, SD(CTRL), + sde->p_senddmactrl | + SD(CTRL_SDMA_CLEANUP_SMASK)); + else + write_sde_csr(sde, SD(CTRL), sde->p_senddmactrl); + + spin_unlock_irqrestore(&sde->senddmactrl_lock, flags); + +#ifdef CONFIG_SDMA_VERBOSITY + sdma_dumpstate(sde); +#endif +} + +static void sdma_setlengen(struct sdma_engine *sde) +{ +#ifdef CONFIG_SDMA_VERBOSITY + dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", + sde->this_idx, slashstrip(__FILE__), __LINE__, __func__); +#endif + + /* + * Set SendDmaLenGen and clear-then-set the MSB of the generation + * count to enable generation checking and load the internal + * generation counter. + */ + write_sde_csr(sde, SD(LEN_GEN), + (sde->descq_cnt / 64) << SD(LEN_GEN_LENGTH_SHIFT)); + write_sde_csr(sde, SD(LEN_GEN), + ((sde->descq_cnt / 64) << SD(LEN_GEN_LENGTH_SHIFT)) | + (4ULL << SD(LEN_GEN_GENERATION_SHIFT))); +} + +static inline void sdma_update_tail(struct sdma_engine *sde, u16 tail) +{ + /* Commit writes to memory and advance the tail on the chip */ + smp_wmb(); /* see get_txhead() */ + writeq(tail, sde->tail_csr); +} + +/* + * This is called when changing to state s10_hw_start_up_halt_wait as + * a result of send buffer errors or send DMA descriptor errors. + */ +static void sdma_hw_start_up(struct sdma_engine *sde) +{ + u64 reg; + +#ifdef CONFIG_SDMA_VERBOSITY + dd_dev_err(sde->dd, "CONFIG SDMA(%u) %s:%d %s()\n", + sde->this_idx, slashstrip(__FILE__), __LINE__, __func__); +#endif + + sdma_setlengen(sde); + sdma_update_tail(sde, 0); /* Set SendDmaTail */ + *sde->head_dma = 0; + + reg = SD(ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_MASK) << + SD(ENG_ERR_CLEAR_SDMA_HEADER_REQUEST_FIFO_UNC_ERR_SHIFT); + write_sde_csr(sde, SD(ENG_ERR_CLEAR), reg); +} + +#define CLEAR_STATIC_RATE_CONTROL_SMASK(r) \ +(r &= ~SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK) + +#define SET_STATIC_RATE_CONTROL_SMASK(r) \ +(r |= SEND_DMA_CHECK_ENABLE_DISALLOW_PBC_STATIC_RATE_CONTROL_SMASK) +/* + * set_sdma_integrity + * + * Set the SEND_DMA_CHECK_ENABLE register for send DMA engine 'sde'. + */ +static void set_sdma_integrity(struct sdma_engine *sde) +{ + struct hfi1_devdata *dd = sde->dd; + u64 reg; + + if (unlikely(HFI1_CAP_IS_KSET(NO_INTEGRITY))) + return; + + reg = hfi1_pkt_base_sdma_integrity(dd); + + if (HFI1_CAP_IS_KSET(STATIC_RATE_CTRL)) + CLEAR_STATIC_RATE_CONTROL_SMASK(reg); + else + SET_STATIC_RATE_CONTROL_SMASK(reg); + + write_sde_csr(sde, SD(CHECK_ENABLE), reg); +} + +static void init_sdma_regs( + struct sdma_engine *sde, + u32 credits, + uint idle_cnt) +{ + u8 opval, opmask; +#ifdef CONFIG_SDMA_VERBOSITY + struct hfi1_devdata *dd = sde->dd; + + dd_dev_err(dd, "CONFIG SDMA(%u) %s:%d %s()\n", + sde->this_idx, slashstrip(__FILE__), __LINE__, __func__); +#endif + + write_sde_csr(sde, SD(BASE_ADDR), sde->descq_phys); + sdma_setlengen(sde); + sdma_update_tail(sde, 0); /* Set SendDmaTail */ + write_sde_csr(sde, SD(RELOAD_CNT), idle_cnt); + write_sde_csr(sde, SD(DESC_CNT), 0); + write_sde_csr(sde, SD(HEAD_ADDR), sde->head_phys); + write_sde_csr(sde, SD(MEMORY), + ((u64)credits << SD(MEMORY_SDMA_MEMORY_CNT_SHIFT)) | + ((u64)(credits * sde->this_idx) << + SD(MEMORY_SDMA_MEMORY_INDEX_SHIFT))); + write_sde_csr(sde, SD(ENG_ERR_MASK), ~0ull); + set_sdma_integrity(sde); + opmask = OPCODE_CHECK_MASK_DISABLED; + opval = OPCODE_CHECK_VAL_DISABLED; + write_sde_csr(sde, SD(CHECK_OPCODE), + (opmask << SEND_CTXT_CHECK_OPCODE_MASK_SHIFT) | + (opval << SEND_CTXT_CHECK_OPCODE_VALUE_SHIFT)); +} + +#ifdef CONFIG_SDMA_VERBOSITY + +#define sdma_dumpstate_helper0(reg) do { \ + csr = read_csr(sde->dd, reg); \ + dd_dev_err(sde->dd, "%36s 0x%016llx\n", #reg, csr); \ + } while (0) + +#define sdma_dumpstate_helper(reg) do { \ + csr = read_sde_csr(sde, reg); \ + dd_dev_err(sde->dd, "%36s[%02u] 0x%016llx\n", \ + #reg, sde->this_idx, csr); \ + } while (0) + +#define sdma_dumpstate_helper2(reg) do { \ + csr = read_csr(sde->dd, reg + (8 * i)); \ + dd_dev_err(sde->dd, "%33s_%02u 0x%016llx\n", \ + #reg, i, csr); \ + } while (0) + +void sdma_dumpstate(struct sdma_engine *sde) +{ + u64 csr; + unsigned i; + + sdma_dumpstate_helper(SD(CTRL)); + sdma_dumpstate_helper(SD(STATUS)); + sdma_dumpstate_helper0(SD(ERR_STATUS)); + sdma_dumpstate_helper0(SD(ERR_MASK)); + sdma_dumpstate_helper(SD(ENG_ERR_STATUS)); + sdma_dumpstate_helper(SD(ENG_ERR_MASK)); + + for (i = 0; i < CCE_NUM_INT_CSRS; ++i) { + sdma_dumpstate_helper2(CCE_INT_STATUS); + sdma_dumpstate_helper2(CCE_INT_MASK); + sdma_dumpstate_helper2(CCE_INT_BLOCKED); + } + + sdma_dumpstate_helper(SD(TAIL)); + sdma_dumpstate_helper(SD(HEAD)); + sdma_dumpstate_helper(SD(PRIORITY_THLD)); + sdma_dumpstate_helper(SD(IDLE_CNT)); + sdma_dumpstate_helper(SD(RELOAD_CNT)); + sdma_dumpstate_helper(SD(DESC_CNT)); + sdma_dumpstate_helper(SD(DESC_FETCHED_CNT)); + sdma_dumpstate_helper(SD(MEMORY)); + sdma_dumpstate_helper0(SD(ENGINES)); + sdma_dumpstate_helper0(SD(MEM_SIZE)); + /* sdma_dumpstate_helper(SEND_EGRESS_SEND_DMA_STATUS); */ + sdma_dumpstate_helper(SD(BASE_ADDR)); + sdma_dumpstate_helper(SD(LEN_GEN)); + sdma_dumpstate_helper(SD(HEAD_ADDR)); + sdma_dumpstate_helper(SD(CHECK_ENABLE)); + sdma_dumpstate_helper(SD(CHECK_VL)); + sdma_dumpstate_helper(SD(CHECK_JOB_KEY)); + sdma_dumpstate_helper(SD(CHECK_PARTITION_KEY)); + sdma_dumpstate_helper(SD(CHECK_SLID)); + sdma_dumpstate_helper(SD(CHECK_OPCODE)); +} +#endif + +static void dump_sdma_state(struct sdma_engine *sde) +{ + struct hw_sdma_desc *descq; + struct hw_sdma_desc *descqp; + u64 desc[2]; + u64 addr; + u8 gen; + u16 len; + u16 head, tail, cnt; + + head = sde->descq_head & sde->sdma_mask; + tail = sde->descq_tail & sde->sdma_mask; + cnt = sdma_descq_freecnt(sde); + descq = sde->descq; + + dd_dev_err(sde->dd, + "SDMA (%u) descq_head: %u descq_tail: %u freecnt: %u FLE %d\n", + sde->this_idx, head, tail, cnt, + !list_empty(&sde->flushlist)); + + /* print info for each entry in the descriptor queue */ + while (head != tail) { + char flags[6] = { 'x', 'x', 'x', 'x', 0 }; + + descqp = &sde->descq[head]; + desc[0] = le64_to_cpu(descqp->qw[0]); + desc[1] = le64_to_cpu(descqp->qw[1]); + flags[0] = (desc[1] & SDMA_DESC1_INT_REQ_FLAG) ? 'I' : '-'; + flags[1] = (desc[1] & SDMA_DESC1_HEAD_TO_HOST_FLAG) ? + 'H' : '-'; + flags[2] = (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG) ? 'F' : '-'; + flags[3] = (desc[0] & SDMA_DESC0_LAST_DESC_FLAG) ? 'L' : '-'; + addr = (desc[0] >> SDMA_DESC0_PHY_ADDR_SHIFT) + & SDMA_DESC0_PHY_ADDR_MASK; + gen = (desc[1] >> SDMA_DESC1_GENERATION_SHIFT) + & SDMA_DESC1_GENERATION_MASK; + len = (desc[0] >> SDMA_DESC0_BYTE_COUNT_SHIFT) + & SDMA_DESC0_BYTE_COUNT_MASK; + dd_dev_err(sde->dd, + "SDMA sdmadesc[%u]: flags:%s addr:0x%016llx gen:%u len:%u bytes\n", + head, flags, addr, gen, len); + dd_dev_err(sde->dd, + "\tdesc0:0x%016llx desc1 0x%016llx\n", + desc[0], desc[1]); + if (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG) + dd_dev_err(sde->dd, + "\taidx: %u amode: %u alen: %u\n", + (u8)((desc[1] & + SDMA_DESC1_HEADER_INDEX_SMASK) >> + SDMA_DESC1_HEADER_INDEX_SHIFT), + (u8)((desc[1] & + SDMA_DESC1_HEADER_MODE_SMASK) >> + SDMA_DESC1_HEADER_MODE_SHIFT), + (u8)((desc[1] & + SDMA_DESC1_HEADER_DWS_SMASK) >> + SDMA_DESC1_HEADER_DWS_SHIFT)); + head++; + head &= sde->sdma_mask; + } +} + +#define SDE_FMT \ + "SDE %u CPU %d STE %s C 0x%llx S 0x%016llx E 0x%llx T(HW) 0x%llx T(SW) 0x%x H(HW) 0x%llx H(SW) 0x%x H(D) 0x%llx DM 0x%llx GL 0x%llx R 0x%llx LIS 0x%llx AHGI 0x%llx TXT %u TXH %u DT %u DH %u FLNE %d DQF %u SLC 0x%llx\n" +/** + * sdma_seqfile_dump_sde() - debugfs dump of sde + * @s: seq file + * @sde: send dma engine to dump + * + * This routine dumps the sde to the indicated seq file. + */ +void sdma_seqfile_dump_sde(struct seq_file *s, struct sdma_engine *sde) +{ + u16 head, tail; + struct hw_sdma_desc *descqp; + u64 desc[2]; + u64 addr; + u8 gen; + u16 len; + + head = sde->descq_head & sde->sdma_mask; + tail = ACCESS_ONCE(sde->descq_tail) & sde->sdma_mask; + seq_printf(s, SDE_FMT, sde->this_idx, + sde->cpu, + sdma_state_name(sde->state.current_state), + (unsigned long long)read_sde_csr(sde, SD(CTRL)), + (unsigned long long)read_sde_csr(sde, SD(STATUS)), + (unsigned long long)read_sde_csr(sde, SD(ENG_ERR_STATUS)), + (unsigned long long)read_sde_csr(sde, SD(TAIL)), tail, + (unsigned long long)read_sde_csr(sde, SD(HEAD)), head, + (unsigned long long)le64_to_cpu(*sde->head_dma), + (unsigned long long)read_sde_csr(sde, SD(MEMORY)), + (unsigned long long)read_sde_csr(sde, SD(LEN_GEN)), + (unsigned long long)read_sde_csr(sde, SD(RELOAD_CNT)), + (unsigned long long)sde->last_status, + (unsigned long long)sde->ahg_bits, + sde->tx_tail, + sde->tx_head, + sde->descq_tail, + sde->descq_head, + !list_empty(&sde->flushlist), + sde->descq_full_count, + (unsigned long long)read_sde_csr(sde, SEND_DMA_CHECK_SLID)); + + /* print info for each entry in the descriptor queue */ + while (head != tail) { + char flags[6] = { 'x', 'x', 'x', 'x', 0 }; + + descqp = &sde->descq[head]; + desc[0] = le64_to_cpu(descqp->qw[0]); + desc[1] = le64_to_cpu(descqp->qw[1]); + flags[0] = (desc[1] & SDMA_DESC1_INT_REQ_FLAG) ? 'I' : '-'; + flags[1] = (desc[1] & SDMA_DESC1_HEAD_TO_HOST_FLAG) ? + 'H' : '-'; + flags[2] = (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG) ? 'F' : '-'; + flags[3] = (desc[0] & SDMA_DESC0_LAST_DESC_FLAG) ? 'L' : '-'; + addr = (desc[0] >> SDMA_DESC0_PHY_ADDR_SHIFT) + & SDMA_DESC0_PHY_ADDR_MASK; + gen = (desc[1] >> SDMA_DESC1_GENERATION_SHIFT) + & SDMA_DESC1_GENERATION_MASK; + len = (desc[0] >> SDMA_DESC0_BYTE_COUNT_SHIFT) + & SDMA_DESC0_BYTE_COUNT_MASK; + seq_printf(s, + "\tdesc[%u]: flags:%s addr:0x%016llx gen:%u len:%u bytes\n", + head, flags, addr, gen, len); + if (desc[0] & SDMA_DESC0_FIRST_DESC_FLAG) + seq_printf(s, "\t\tahgidx: %u ahgmode: %u\n", + (u8)((desc[1] & + SDMA_DESC1_HEADER_INDEX_SMASK) >> + SDMA_DESC1_HEADER_INDEX_SHIFT), + (u8)((desc[1] & + SDMA_DESC1_HEADER_MODE_SMASK) >> + SDMA_DESC1_HEADER_MODE_SHIFT)); + head = (head + 1) & sde->sdma_mask; + } +} + +/* + * add the generation number into + * the qw1 and return + */ +static inline u64 add_gen(struct sdma_engine *sde, u64 qw1) +{ + u8 generation = (sde->descq_tail >> sde->sdma_shift) & 3; + + qw1 &= ~SDMA_DESC1_GENERATION_SMASK; + qw1 |= ((u64)generation & SDMA_DESC1_GENERATION_MASK) + << SDMA_DESC1_GENERATION_SHIFT; + return qw1; +} + +/* + * This routine submits the indicated tx + * + * Space has already been guaranteed and + * tail side of ring is locked. + * + * The hardware tail update is done + * in the caller and that is facilitated + * by returning the new tail. + * + * There is special case logic for ahg + * to not add the generation number for + * up to 2 descriptors that follow the + * first descriptor. + * + */ +static inline u16 submit_tx(struct sdma_engine *sde, struct sdma_txreq *tx) +{ + int i; + u16 tail; + struct sdma_desc *descp = tx->descp; + u8 skip = 0, mode = ahg_mode(tx); + + tail = sde->descq_tail & sde->sdma_mask; + sde->descq[tail].qw[0] = cpu_to_le64(descp->qw[0]); + sde->descq[tail].qw[1] = cpu_to_le64(add_gen(sde, descp->qw[1])); + trace_hfi1_sdma_descriptor(sde, descp->qw[0], descp->qw[1], + tail, &sde->descq[tail]); + tail = ++sde->descq_tail & sde->sdma_mask; + descp++; + if (mode > SDMA_AHG_APPLY_UPDATE1) + skip = mode >> 1; + for (i = 1; i < tx->num_desc; i++, descp++) { + u64 qw1; + + sde->descq[tail].qw[0] = cpu_to_le64(descp->qw[0]); + if (skip) { + /* edits don't have generation */ + qw1 = descp->qw[1]; + skip--; + } else { + /* replace generation with real one for non-edits */ + qw1 = add_gen(sde, descp->qw[1]); + } + sde->descq[tail].qw[1] = cpu_to_le64(qw1); + trace_hfi1_sdma_descriptor(sde, descp->qw[0], qw1, + tail, &sde->descq[tail]); + tail = ++sde->descq_tail & sde->sdma_mask; + } + tx->next_descq_idx = tail; +#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER + tx->sn = sde->tail_sn++; + trace_hfi1_sdma_in_sn(sde, tx->sn); + WARN_ON_ONCE(sde->tx_ring[sde->tx_tail & sde->sdma_mask]); +#endif + sde->tx_ring[sde->tx_tail++ & sde->sdma_mask] = tx; + sde->desc_avail -= tx->num_desc; + return tail; +} + +/* + * Check for progress + */ +static int sdma_check_progress( + struct sdma_engine *sde, + struct iowait *wait, + struct sdma_txreq *tx) +{ + int ret; + + sde->desc_avail = sdma_descq_freecnt(sde); + if (tx->num_desc <= sde->desc_avail) + return -EAGAIN; + /* pulse the head_lock */ + if (wait && wait->sleep) { + unsigned seq; + + seq = raw_seqcount_begin( + (const seqcount_t *)&sde->head_lock.seqcount); + ret = wait->sleep(sde, wait, tx, seq); + if (ret == -EAGAIN) + sde->desc_avail = sdma_descq_freecnt(sde); + } else { + ret = -EBUSY; + } + return ret; +} + +/** + * sdma_send_txreq() - submit a tx req to ring + * @sde: sdma engine to use + * @wait: wait structure to use when full (may be NULL) + * @tx: sdma_txreq to submit + * + * The call submits the tx into the ring. If a iowait structure is non-NULL + * the packet will be queued to the list in wait. + * + * Return: + * 0 - Success, -EINVAL - sdma_txreq incomplete, -EBUSY - no space in + * ring (wait == NULL) + * -EIOCBQUEUED - tx queued to iowait, -ECOMM bad sdma state + */ +int sdma_send_txreq(struct sdma_engine *sde, + struct iowait *wait, + struct sdma_txreq *tx) +{ + int ret = 0; + u16 tail; + unsigned long flags; + + /* user should have supplied entire packet */ + if (unlikely(tx->tlen)) + return -EINVAL; + tx->wait = wait; + spin_lock_irqsave(&sde->tail_lock, flags); +retry: + if (unlikely(!__sdma_running(sde))) + goto unlock_noconn; + if (unlikely(tx->num_desc > sde->desc_avail)) + goto nodesc; + tail = submit_tx(sde, tx); + if (wait) + iowait_sdma_inc(wait); + sdma_update_tail(sde, tail); +unlock: + spin_unlock_irqrestore(&sde->tail_lock, flags); + return ret; +unlock_noconn: + if (wait) + iowait_sdma_inc(wait); + tx->next_descq_idx = 0; +#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER + tx->sn = sde->tail_sn++; + trace_hfi1_sdma_in_sn(sde, tx->sn); +#endif + spin_lock(&sde->flushlist_lock); + list_add_tail(&tx->list, &sde->flushlist); + spin_unlock(&sde->flushlist_lock); + if (wait) { + wait->tx_count++; + wait->count += tx->num_desc; + } + schedule_work(&sde->flush_worker); + ret = -ECOMM; + goto unlock; +nodesc: + ret = sdma_check_progress(sde, wait, tx); + if (ret == -EAGAIN) { + ret = 0; + goto retry; + } + sde->descq_full_count++; + goto unlock; +} + +/** + * sdma_send_txlist() - submit a list of tx req to ring + * @sde: sdma engine to use + * @wait: wait structure to use when full (may be NULL) + * @tx_list: list of sdma_txreqs to submit + * + * The call submits the list into the ring. + * + * If the iowait structure is non-NULL and not equal to the iowait list + * the unprocessed part of the list will be appended to the list in wait. + * + * In all cases, the tx_list will be updated so the head of the tx_list is + * the list of descriptors that have yet to be transmitted. + * + * The intent of this call is to provide a more efficient + * way of submitting multiple packets to SDMA while holding the tail + * side locking. + * + * Return: + * > 0 - Success (value is number of sdma_txreq's submitted), + * -EINVAL - sdma_txreq incomplete, -EBUSY - no space in ring (wait == NULL) + * -EIOCBQUEUED - tx queued to iowait, -ECOMM bad sdma state + */ +int sdma_send_txlist(struct sdma_engine *sde, struct iowait *wait, + struct list_head *tx_list) +{ + struct sdma_txreq *tx, *tx_next; + int ret = 0; + unsigned long flags; + u16 tail = INVALID_TAIL; + int count = 0; + + spin_lock_irqsave(&sde->tail_lock, flags); +retry: + list_for_each_entry_safe(tx, tx_next, tx_list, list) { + tx->wait = wait; + if (unlikely(!__sdma_running(sde))) + goto unlock_noconn; + if (unlikely(tx->num_desc > sde->desc_avail)) + goto nodesc; + if (unlikely(tx->tlen)) { + ret = -EINVAL; + goto update_tail; + } + list_del_init(&tx->list); + tail = submit_tx(sde, tx); + count++; + if (tail != INVALID_TAIL && + (count & SDMA_TAIL_UPDATE_THRESH) == 0) { + sdma_update_tail(sde, tail); + tail = INVALID_TAIL; + } + } +update_tail: + if (wait) + iowait_sdma_add(wait, count); + if (tail != INVALID_TAIL) + sdma_update_tail(sde, tail); + spin_unlock_irqrestore(&sde->tail_lock, flags); + return ret == 0 ? count : ret; +unlock_noconn: + spin_lock(&sde->flushlist_lock); + list_for_each_entry_safe(tx, tx_next, tx_list, list) { + tx->wait = wait; + list_del_init(&tx->list); + if (wait) + iowait_sdma_inc(wait); + tx->next_descq_idx = 0; +#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER + tx->sn = sde->tail_sn++; + trace_hfi1_sdma_in_sn(sde, tx->sn); +#endif + list_add_tail(&tx->list, &sde->flushlist); + if (wait) { + wait->tx_count++; + wait->count += tx->num_desc; + } + } + spin_unlock(&sde->flushlist_lock); + schedule_work(&sde->flush_worker); + ret = -ECOMM; + goto update_tail; +nodesc: + ret = sdma_check_progress(sde, wait, tx); + if (ret == -EAGAIN) { + ret = 0; + goto retry; + } + sde->descq_full_count++; + goto update_tail; +} + +static void sdma_process_event(struct sdma_engine *sde, enum sdma_events event) +{ + unsigned long flags; + + spin_lock_irqsave(&sde->tail_lock, flags); + write_seqlock(&sde->head_lock); + + __sdma_process_event(sde, event); + + if (sde->state.current_state == sdma_state_s99_running) + sdma_desc_avail(sde, sdma_descq_freecnt(sde)); + + write_sequnlock(&sde->head_lock); + spin_unlock_irqrestore(&sde->tail_lock, flags); +} + +static void __sdma_process_event(struct sdma_engine *sde, + enum sdma_events event) +{ + struct sdma_state *ss = &sde->state; + int need_progress = 0; + + /* CONFIG SDMA temporary */ +#ifdef CONFIG_SDMA_VERBOSITY + dd_dev_err(sde->dd, "CONFIG SDMA(%u) [%s] %s\n", sde->this_idx, + sdma_state_names[ss->current_state], + sdma_event_names[event]); +#endif + + switch (ss->current_state) { + case sdma_state_s00_hw_down: + switch (event) { + case sdma_event_e00_go_hw_down: + break; + case sdma_event_e30_go_running: + /* + * If down, but running requested (usually result + * of link up, then we need to start up. + * This can happen when hw down is requested while + * bringing the link up with traffic active on + * 7220, e.g. + */ + ss->go_s99_running = 1; + /* fall through and start dma engine */ + case sdma_event_e10_go_hw_start: + /* This reference means the state machine is started */ + sdma_get(&sde->state); + sdma_set_state(sde, + sdma_state_s10_hw_start_up_halt_wait); + break; + case sdma_event_e15_hw_halt_done: + break; + case sdma_event_e25_hw_clean_up_done: + break; + case sdma_event_e40_sw_cleaned: + sdma_sw_tear_down(sde); + break; + case sdma_event_e50_hw_cleaned: + break; + case sdma_event_e60_hw_halted: + break; + case sdma_event_e70_go_idle: + break; + case sdma_event_e80_hw_freeze: + break; + case sdma_event_e81_hw_frozen: + break; + case sdma_event_e82_hw_unfreeze: + break; + case sdma_event_e85_link_down: + break; + case sdma_event_e90_sw_halted: + break; + } + break; + + case sdma_state_s10_hw_start_up_halt_wait: + switch (event) { + case sdma_event_e00_go_hw_down: + sdma_set_state(sde, sdma_state_s00_hw_down); + sdma_sw_tear_down(sde); + break; + case sdma_event_e10_go_hw_start: + break; + case sdma_event_e15_hw_halt_done: + sdma_set_state(sde, + sdma_state_s15_hw_start_up_clean_wait); + sdma_start_hw_clean_up(sde); + break; + case sdma_event_e25_hw_clean_up_done: + break; + case sdma_event_e30_go_running: + ss->go_s99_running = 1; + break; + case sdma_event_e40_sw_cleaned: + break; + case sdma_event_e50_hw_cleaned: + break; + case sdma_event_e60_hw_halted: + schedule_work(&sde->err_halt_worker); + break; + case sdma_event_e70_go_idle: + ss->go_s99_running = 0; + break; + case sdma_event_e80_hw_freeze: + break; + case sdma_event_e81_hw_frozen: + break; + case sdma_event_e82_hw_unfreeze: + break; + case sdma_event_e85_link_down: + break; + case sdma_event_e90_sw_halted: + break; + } + break; + + case sdma_state_s15_hw_start_up_clean_wait: + switch (event) { + case sdma_event_e00_go_hw_down: + sdma_set_state(sde, sdma_state_s00_hw_down); + sdma_sw_tear_down(sde); + break; + case sdma_event_e10_go_hw_start: + break; + case sdma_event_e15_hw_halt_done: + break; + case sdma_event_e25_hw_clean_up_done: + sdma_hw_start_up(sde); + sdma_set_state(sde, ss->go_s99_running ? + sdma_state_s99_running : + sdma_state_s20_idle); + break; + case sdma_event_e30_go_running: + ss->go_s99_running = 1; + break; + case sdma_event_e40_sw_cleaned: + break; + case sdma_event_e50_hw_cleaned: + break; + case sdma_event_e60_hw_halted: + break; + case sdma_event_e70_go_idle: + ss->go_s99_running = 0; + break; + case sdma_event_e80_hw_freeze: + break; + case sdma_event_e81_hw_frozen: + break; + case sdma_event_e82_hw_unfreeze: + break; + case sdma_event_e85_link_down: + break; + case sdma_event_e90_sw_halted: + break; + } + break; + + case sdma_state_s20_idle: + switch (event) { + case sdma_event_e00_go_hw_down: + sdma_set_state(sde, sdma_state_s00_hw_down); + sdma_sw_tear_down(sde); + break; + case sdma_event_e10_go_hw_start: + break; + case sdma_event_e15_hw_halt_done: + break; + case sdma_event_e25_hw_clean_up_done: + break; + case sdma_event_e30_go_running: + sdma_set_state(sde, sdma_state_s99_running); + ss->go_s99_running = 1; + break; + case sdma_event_e40_sw_cleaned: + break; + case sdma_event_e50_hw_cleaned: + break; + case sdma_event_e60_hw_halted: + sdma_set_state(sde, sdma_state_s50_hw_halt_wait); + schedule_work(&sde->err_halt_worker); + break; + case sdma_event_e70_go_idle: + break; + case sdma_event_e85_link_down: + /* fall through */ + case sdma_event_e80_hw_freeze: + sdma_set_state(sde, sdma_state_s80_hw_freeze); + atomic_dec(&sde->dd->sdma_unfreeze_count); + wake_up_interruptible(&sde->dd->sdma_unfreeze_wq); + break; + case sdma_event_e81_hw_frozen: + break; + case sdma_event_e82_hw_unfreeze: + break; + case sdma_event_e90_sw_halted: + break; + } + break; + + case sdma_state_s30_sw_clean_up_wait: + switch (event) { + case sdma_event_e00_go_hw_down: + sdma_set_state(sde, sdma_state_s00_hw_down); + break; + case sdma_event_e10_go_hw_start: + break; + case sdma_event_e15_hw_halt_done: + break; + case sdma_event_e25_hw_clean_up_done: + break; + case sdma_event_e30_go_running: + ss->go_s99_running = 1; + break; + case sdma_event_e40_sw_cleaned: + sdma_set_state(sde, sdma_state_s40_hw_clean_up_wait); + sdma_start_hw_clean_up(sde); + break; + case sdma_event_e50_hw_cleaned: + break; + case sdma_event_e60_hw_halted: + break; + case sdma_event_e70_go_idle: + ss->go_s99_running = 0; + break; + case sdma_event_e80_hw_freeze: + break; + case sdma_event_e81_hw_frozen: + break; + case sdma_event_e82_hw_unfreeze: + break; + case sdma_event_e85_link_down: + ss->go_s99_running = 0; + break; + case sdma_event_e90_sw_halted: + break; + } + break; + + case sdma_state_s40_hw_clean_up_wait: + switch (event) { + case sdma_event_e00_go_hw_down: + sdma_set_state(sde, sdma_state_s00_hw_down); + tasklet_hi_schedule(&sde->sdma_sw_clean_up_task); + break; + case sdma_event_e10_go_hw_start: + break; + case sdma_event_e15_hw_halt_done: + break; + case sdma_event_e25_hw_clean_up_done: + sdma_hw_start_up(sde); + sdma_set_state(sde, ss->go_s99_running ? + sdma_state_s99_running : + sdma_state_s20_idle); + break; + case sdma_event_e30_go_running: + ss->go_s99_running = 1; + break; + case sdma_event_e40_sw_cleaned: + break; + case sdma_event_e50_hw_cleaned: + break; + case sdma_event_e60_hw_halted: + break; + case sdma_event_e70_go_idle: + ss->go_s99_running = 0; + break; + case sdma_event_e80_hw_freeze: + break; + case sdma_event_e81_hw_frozen: + break; + case sdma_event_e82_hw_unfreeze: + break; + case sdma_event_e85_link_down: + ss->go_s99_running = 0; + break; + case sdma_event_e90_sw_halted: + break; + } + break; + + case sdma_state_s50_hw_halt_wait: + switch (event) { + case sdma_event_e00_go_hw_down: + sdma_set_state(sde, sdma_state_s00_hw_down); + tasklet_hi_schedule(&sde->sdma_sw_clean_up_task); + break; + case sdma_event_e10_go_hw_start: + break; + case sdma_event_e15_hw_halt_done: + sdma_set_state(sde, sdma_state_s30_sw_clean_up_wait); + tasklet_hi_schedule(&sde->sdma_sw_clean_up_task); + break; + case sdma_event_e25_hw_clean_up_done: + break; + case sdma_event_e30_go_running: + ss->go_s99_running = 1; + break; + case sdma_event_e40_sw_cleaned: + break; + case sdma_event_e50_hw_cleaned: + break; + case sdma_event_e60_hw_halted: + schedule_work(&sde->err_halt_worker); + break; + case sdma_event_e70_go_idle: + ss->go_s99_running = 0; + break; + case sdma_event_e80_hw_freeze: + break; + case sdma_event_e81_hw_frozen: + break; + case sdma_event_e82_hw_unfreeze: + break; + case sdma_event_e85_link_down: + ss->go_s99_running = 0; + break; + case sdma_event_e90_sw_halted: + break; + } + break; + + case sdma_state_s60_idle_halt_wait: + switch (event) { + case sdma_event_e00_go_hw_down: + sdma_set_state(sde, sdma_state_s00_hw_down); + tasklet_hi_schedule(&sde->sdma_sw_clean_up_task); + break; + case sdma_event_e10_go_hw_start: + break; + case sdma_event_e15_hw_halt_done: + sdma_set_state(sde, sdma_state_s30_sw_clean_up_wait); + tasklet_hi_schedule(&sde->sdma_sw_clean_up_task); + break; + case sdma_event_e25_hw_clean_up_done: + break; + case sdma_event_e30_go_running: + ss->go_s99_running = 1; + break; + case sdma_event_e40_sw_cleaned: + break; + case sdma_event_e50_hw_cleaned: + break; + case sdma_event_e60_hw_halted: + schedule_work(&sde->err_halt_worker); + break; + case sdma_event_e70_go_idle: + ss->go_s99_running = 0; + break; + case sdma_event_e80_hw_freeze: + break; + case sdma_event_e81_hw_frozen: + break; + case sdma_event_e82_hw_unfreeze: + break; + case sdma_event_e85_link_down: + break; + case sdma_event_e90_sw_halted: + break; + } + break; + + case sdma_state_s80_hw_freeze: + switch (event) { + case sdma_event_e00_go_hw_down: + sdma_set_state(sde, sdma_state_s00_hw_down); + tasklet_hi_schedule(&sde->sdma_sw_clean_up_task); + break; + case sdma_event_e10_go_hw_start: + break; + case sdma_event_e15_hw_halt_done: + break; + case sdma_event_e25_hw_clean_up_done: + break; + case sdma_event_e30_go_running: + ss->go_s99_running = 1; + break; + case sdma_event_e40_sw_cleaned: + break; + case sdma_event_e50_hw_cleaned: + break; + case sdma_event_e60_hw_halted: + break; + case sdma_event_e70_go_idle: + ss->go_s99_running = 0; + break; + case sdma_event_e80_hw_freeze: + break; + case sdma_event_e81_hw_frozen: + sdma_set_state(sde, sdma_state_s82_freeze_sw_clean); + tasklet_hi_schedule(&sde->sdma_sw_clean_up_task); + break; + case sdma_event_e82_hw_unfreeze: + break; + case sdma_event_e85_link_down: + break; + case sdma_event_e90_sw_halted: + break; + } + break; + + case sdma_state_s82_freeze_sw_clean: + switch (event) { + case sdma_event_e00_go_hw_down: + sdma_set_state(sde, sdma_state_s00_hw_down); + tasklet_hi_schedule(&sde->sdma_sw_clean_up_task); + break; + case sdma_event_e10_go_hw_start: + break; + case sdma_event_e15_hw_halt_done: + break; + case sdma_event_e25_hw_clean_up_done: + break; + case sdma_event_e30_go_running: + ss->go_s99_running = 1; + break; + case sdma_event_e40_sw_cleaned: + /* notify caller this engine is done cleaning */ + atomic_dec(&sde->dd->sdma_unfreeze_count); + wake_up_interruptible(&sde->dd->sdma_unfreeze_wq); + break; + case sdma_event_e50_hw_cleaned: + break; + case sdma_event_e60_hw_halted: + break; + case sdma_event_e70_go_idle: + ss->go_s99_running = 0; + break; + case sdma_event_e80_hw_freeze: + break; + case sdma_event_e81_hw_frozen: + break; + case sdma_event_e82_hw_unfreeze: + sdma_hw_start_up(sde); + sdma_set_state(sde, ss->go_s99_running ? + sdma_state_s99_running : + sdma_state_s20_idle); + break; + case sdma_event_e85_link_down: + break; + case sdma_event_e90_sw_halted: + break; + } + break; + + case sdma_state_s99_running: + switch (event) { + case sdma_event_e00_go_hw_down: + sdma_set_state(sde, sdma_state_s00_hw_down); + tasklet_hi_schedule(&sde->sdma_sw_clean_up_task); + break; + case sdma_event_e10_go_hw_start: + break; + case sdma_event_e15_hw_halt_done: + break; + case sdma_event_e25_hw_clean_up_done: + break; + case sdma_event_e30_go_running: + break; + case sdma_event_e40_sw_cleaned: + break; + case sdma_event_e50_hw_cleaned: + break; + case sdma_event_e60_hw_halted: + need_progress = 1; + sdma_err_progress_check_schedule(sde); + case sdma_event_e90_sw_halted: + /* + * SW initiated halt does not perform engines + * progress check + */ + sdma_set_state(sde, sdma_state_s50_hw_halt_wait); + schedule_work(&sde->err_halt_worker); + break; + case sdma_event_e70_go_idle: + sdma_set_state(sde, sdma_state_s60_idle_halt_wait); + break; + case sdma_event_e85_link_down: + ss->go_s99_running = 0; + /* fall through */ + case sdma_event_e80_hw_freeze: + sdma_set_state(sde, sdma_state_s80_hw_freeze); + atomic_dec(&sde->dd->sdma_unfreeze_count); + wake_up_interruptible(&sde->dd->sdma_unfreeze_wq); + break; + case sdma_event_e81_hw_frozen: + break; + case sdma_event_e82_hw_unfreeze: + break; + } + break; + } + + ss->last_event = event; + if (need_progress) + sdma_make_progress(sde, 0); +} + +/* + * _extend_sdma_tx_descs() - helper to extend txreq + * + * This is called once the initial nominal allocation + * of descriptors in the sdma_txreq is exhausted. + * + * The code will bump the allocation up to the max + * of MAX_DESC (64) descriptors. There doesn't seem + * much point in an interim step. The last descriptor + * is reserved for coalesce buffer in order to support + * cases where input packet has >MAX_DESC iovecs. + * + */ +static int _extend_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx) +{ + int i; + + /* Handle last descriptor */ + if (unlikely((tx->num_desc == (MAX_DESC - 1)))) { + /* if tlen is 0, it is for padding, release last descriptor */ + if (!tx->tlen) { + tx->desc_limit = MAX_DESC; + } else if (!tx->coalesce_buf) { + /* allocate coalesce buffer with space for padding */ + tx->coalesce_buf = kmalloc(tx->tlen + sizeof(u32), + GFP_ATOMIC); + if (!tx->coalesce_buf) + goto enomem; + tx->coalesce_idx = 0; + } + return 0; + } + + if (unlikely(tx->num_desc == MAX_DESC)) + goto enomem; + + tx->descp = kmalloc_array( + MAX_DESC, + sizeof(struct sdma_desc), + GFP_ATOMIC); + if (!tx->descp) + goto enomem; + + /* reserve last descriptor for coalescing */ + tx->desc_limit = MAX_DESC - 1; + /* copy ones already built */ + for (i = 0; i < tx->num_desc; i++) + tx->descp[i] = tx->descs[i]; + return 0; +enomem: + sdma_txclean(dd, tx); + return -ENOMEM; +} + +/* + * ext_coal_sdma_tx_descs() - extend or coalesce sdma tx descriptors + * + * This is called once the initial nominal allocation of descriptors + * in the sdma_txreq is exhausted. + * + * This function calls _extend_sdma_tx_descs to extend or allocate + * coalesce buffer. If there is a allocated coalesce buffer, it will + * copy the input packet data into the coalesce buffer. It also adds + * coalesce buffer descriptor once when whole packet is received. + * + * Return: + * <0 - error + * 0 - coalescing, don't populate descriptor + * 1 - continue with populating descriptor + */ +int ext_coal_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx, + int type, void *kvaddr, struct page *page, + unsigned long offset, u16 len) +{ + int pad_len, rval; + dma_addr_t addr; + + rval = _extend_sdma_tx_descs(dd, tx); + if (rval) { + sdma_txclean(dd, tx); + return rval; + } + + /* If coalesce buffer is allocated, copy data into it */ + if (tx->coalesce_buf) { + if (type == SDMA_MAP_NONE) { + sdma_txclean(dd, tx); + return -EINVAL; + } + + if (type == SDMA_MAP_PAGE) { + kvaddr = kmap(page); + kvaddr += offset; + } else if (WARN_ON(!kvaddr)) { + sdma_txclean(dd, tx); + return -EINVAL; + } + + memcpy(tx->coalesce_buf + tx->coalesce_idx, kvaddr, len); + tx->coalesce_idx += len; + if (type == SDMA_MAP_PAGE) + kunmap(page); + + /* If there is more data, return */ + if (tx->tlen - tx->coalesce_idx) + return 0; + + /* Whole packet is received; add any padding */ + pad_len = tx->packet_len & (sizeof(u32) - 1); + if (pad_len) { + pad_len = sizeof(u32) - pad_len; + memset(tx->coalesce_buf + tx->coalesce_idx, 0, pad_len); + /* padding is taken care of for coalescing case */ + tx->packet_len += pad_len; + tx->tlen += pad_len; + } + + /* dma map the coalesce buffer */ + addr = dma_map_single(&dd->pcidev->dev, + tx->coalesce_buf, + tx->tlen, + DMA_TO_DEVICE); + + if (unlikely(dma_mapping_error(&dd->pcidev->dev, addr))) { + sdma_txclean(dd, tx); + return -ENOSPC; + } + + /* Add descriptor for coalesce buffer */ + tx->desc_limit = MAX_DESC; + return _sdma_txadd_daddr(dd, SDMA_MAP_SINGLE, tx, + addr, tx->tlen); + } + + return 1; +} + +/* Update sdes when the lmc changes */ +void sdma_update_lmc(struct hfi1_devdata *dd, u64 mask, u32 lid) +{ + struct sdma_engine *sde; + int i; + u64 sreg; + + sreg = ((mask & SD(CHECK_SLID_MASK_MASK)) << + SD(CHECK_SLID_MASK_SHIFT)) | + (((lid & mask) & SD(CHECK_SLID_VALUE_MASK)) << + SD(CHECK_SLID_VALUE_SHIFT)); + + for (i = 0; i < dd->num_sdma; i++) { + hfi1_cdbg(LINKVERB, "SendDmaEngine[%d].SLID_CHECK = 0x%x", + i, (u32)sreg); + sde = &dd->per_sdma[i]; + write_sde_csr(sde, SD(CHECK_SLID), sreg); + } +} + +/* tx not dword sized - pad */ +int _pad_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx) +{ + int rval = 0; + + tx->num_desc++; + if ((unlikely(tx->num_desc == tx->desc_limit))) { + rval = _extend_sdma_tx_descs(dd, tx); + if (rval) { + sdma_txclean(dd, tx); + return rval; + } + } + /* finish the one just added */ + make_tx_sdma_desc( + tx, + SDMA_MAP_NONE, + dd->sdma_pad_phys, + sizeof(u32) - (tx->packet_len & (sizeof(u32) - 1))); + _sdma_close_tx(dd, tx); + return rval; +} + +/* + * Add ahg to the sdma_txreq + * + * The logic will consume up to 3 + * descriptors at the beginning of + * sdma_txreq. + */ +void _sdma_txreq_ahgadd( + struct sdma_txreq *tx, + u8 num_ahg, + u8 ahg_entry, + u32 *ahg, + u8 ahg_hlen) +{ + u32 i, shift = 0, desc = 0; + u8 mode; + + WARN_ON_ONCE(num_ahg > 9 || (ahg_hlen & 3) || ahg_hlen == 4); + /* compute mode */ + if (num_ahg == 1) + mode = SDMA_AHG_APPLY_UPDATE1; + else if (num_ahg <= 5) + mode = SDMA_AHG_APPLY_UPDATE2; + else + mode = SDMA_AHG_APPLY_UPDATE3; + tx->num_desc++; + /* initialize to consumed descriptors to zero */ + switch (mode) { + case SDMA_AHG_APPLY_UPDATE3: + tx->num_desc++; + tx->descs[2].qw[0] = 0; + tx->descs[2].qw[1] = 0; + /* FALLTHROUGH */ + case SDMA_AHG_APPLY_UPDATE2: + tx->num_desc++; + tx->descs[1].qw[0] = 0; + tx->descs[1].qw[1] = 0; + break; + } + ahg_hlen >>= 2; + tx->descs[0].qw[1] |= + (((u64)ahg_entry & SDMA_DESC1_HEADER_INDEX_MASK) + << SDMA_DESC1_HEADER_INDEX_SHIFT) | + (((u64)ahg_hlen & SDMA_DESC1_HEADER_DWS_MASK) + << SDMA_DESC1_HEADER_DWS_SHIFT) | + (((u64)mode & SDMA_DESC1_HEADER_MODE_MASK) + << SDMA_DESC1_HEADER_MODE_SHIFT) | + (((u64)ahg[0] & SDMA_DESC1_HEADER_UPDATE1_MASK) + << SDMA_DESC1_HEADER_UPDATE1_SHIFT); + for (i = 0; i < (num_ahg - 1); i++) { + if (!shift && !(i & 2)) + desc++; + tx->descs[desc].qw[!!(i & 2)] |= + (((u64)ahg[i + 1]) + << shift); + shift = (shift + 32) & 63; + } +} + +/** + * sdma_ahg_alloc - allocate an AHG entry + * @sde: engine to allocate from + * + * Return: + * 0-31 when successful, -EOPNOTSUPP if AHG is not enabled, + * -ENOSPC if an entry is not available + */ +int sdma_ahg_alloc(struct sdma_engine *sde) +{ + int nr; + int oldbit; + + if (!sde) { + trace_hfi1_ahg_allocate(sde, -EINVAL); + return -EINVAL; + } + while (1) { + nr = ffz(ACCESS_ONCE(sde->ahg_bits)); + if (nr > 31) { + trace_hfi1_ahg_allocate(sde, -ENOSPC); + return -ENOSPC; + } + oldbit = test_and_set_bit(nr, &sde->ahg_bits); + if (!oldbit) + break; + cpu_relax(); + } + trace_hfi1_ahg_allocate(sde, nr); + return nr; +} + +/** + * sdma_ahg_free - free an AHG entry + * @sde: engine to return AHG entry + * @ahg_index: index to free + * + * This routine frees the indicate AHG entry. + */ +void sdma_ahg_free(struct sdma_engine *sde, int ahg_index) +{ + if (!sde) + return; + trace_hfi1_ahg_deallocate(sde, ahg_index); + if (ahg_index < 0 || ahg_index > 31) + return; + clear_bit(ahg_index, &sde->ahg_bits); +} + +/* + * SPC freeze handling for SDMA engines. Called when the driver knows + * the SPC is going into a freeze but before the freeze is fully + * settled. Generally an error interrupt. + * + * This event will pull the engine out of running so no more entries can be + * added to the engine's queue. + */ +void sdma_freeze_notify(struct hfi1_devdata *dd, int link_down) +{ + int i; + enum sdma_events event = link_down ? sdma_event_e85_link_down : + sdma_event_e80_hw_freeze; + + /* set up the wait but do not wait here */ + atomic_set(&dd->sdma_unfreeze_count, dd->num_sdma); + + /* tell all engines to stop running and wait */ + for (i = 0; i < dd->num_sdma; i++) + sdma_process_event(&dd->per_sdma[i], event); + + /* sdma_freeze() will wait for all engines to have stopped */ +} + +/* + * SPC freeze handling for SDMA engines. Called when the driver knows + * the SPC is fully frozen. + */ +void sdma_freeze(struct hfi1_devdata *dd) +{ + int i; + int ret; + + /* + * Make sure all engines have moved out of the running state before + * continuing. + */ + ret = wait_event_interruptible(dd->sdma_unfreeze_wq, + atomic_read(&dd->sdma_unfreeze_count) <= + 0); + /* interrupted or count is negative, then unloading - just exit */ + if (ret || atomic_read(&dd->sdma_unfreeze_count) < 0) + return; + + /* set up the count for the next wait */ + atomic_set(&dd->sdma_unfreeze_count, dd->num_sdma); + + /* tell all engines that the SPC is frozen, they can start cleaning */ + for (i = 0; i < dd->num_sdma; i++) + sdma_process_event(&dd->per_sdma[i], sdma_event_e81_hw_frozen); + + /* + * Wait for everyone to finish software clean before exiting. The + * software clean will read engine CSRs, so must be completed before + * the next step, which will clear the engine CSRs. + */ + (void)wait_event_interruptible(dd->sdma_unfreeze_wq, + atomic_read(&dd->sdma_unfreeze_count) <= 0); + /* no need to check results - done no matter what */ +} + +/* + * SPC freeze handling for the SDMA engines. Called after the SPC is unfrozen. + * + * The SPC freeze acts like a SDMA halt and a hardware clean combined. All + * that is left is a software clean. We could do it after the SPC is fully + * frozen, but then we'd have to add another state to wait for the unfreeze. + * Instead, just defer the software clean until the unfreeze step. + */ +void sdma_unfreeze(struct hfi1_devdata *dd) +{ + int i; + + /* tell all engines start freeze clean up */ + for (i = 0; i < dd->num_sdma; i++) + sdma_process_event(&dd->per_sdma[i], + sdma_event_e82_hw_unfreeze); +} + +/** + * _sdma_engine_progress_schedule() - schedule progress on engine + * @sde: sdma_engine to schedule progress + * + */ +void _sdma_engine_progress_schedule( + struct sdma_engine *sde) +{ + trace_hfi1_sdma_engine_progress(sde, sde->progress_mask); + /* assume we have selected a good cpu */ + write_csr(sde->dd, + CCE_INT_FORCE + (8 * (IS_SDMA_START / 64)), + sde->progress_mask); +} diff --git a/drivers/infiniband/hw/hfi1/sdma.h b/drivers/infiniband/hw/hfi1/sdma.h new file mode 100644 index 000000000..8f50c99fe --- /dev/null +++ b/drivers/infiniband/hw/hfi1/sdma.h @@ -0,0 +1,1082 @@ +#ifndef _HFI1_SDMA_H +#define _HFI1_SDMA_H +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <linux/types.h> +#include <linux/list.h> +#include <asm/byteorder.h> +#include <linux/workqueue.h> +#include <linux/rculist.h> + +#include "hfi.h" +#include "verbs.h" +#include "sdma_txreq.h" + +/* Hardware limit */ +#define MAX_DESC 64 +/* Hardware limit for SDMA packet size */ +#define MAX_SDMA_PKT_SIZE ((16 * 1024) - 1) + +#define SDMA_TXREQ_S_OK 0 +#define SDMA_TXREQ_S_SENDERROR 1 +#define SDMA_TXREQ_S_ABORTED 2 +#define SDMA_TXREQ_S_SHUTDOWN 3 + +/* flags bits */ +#define SDMA_TXREQ_F_URGENT 0x0001 +#define SDMA_TXREQ_F_AHG_COPY 0x0002 +#define SDMA_TXREQ_F_USE_AHG 0x0004 + +#define SDMA_MAP_NONE 0 +#define SDMA_MAP_SINGLE 1 +#define SDMA_MAP_PAGE 2 + +#define SDMA_AHG_VALUE_MASK 0xffff +#define SDMA_AHG_VALUE_SHIFT 0 +#define SDMA_AHG_INDEX_MASK 0xf +#define SDMA_AHG_INDEX_SHIFT 16 +#define SDMA_AHG_FIELD_LEN_MASK 0xf +#define SDMA_AHG_FIELD_LEN_SHIFT 20 +#define SDMA_AHG_FIELD_START_MASK 0x1f +#define SDMA_AHG_FIELD_START_SHIFT 24 +#define SDMA_AHG_UPDATE_ENABLE_MASK 0x1 +#define SDMA_AHG_UPDATE_ENABLE_SHIFT 31 + +/* AHG modes */ + +/* + * Be aware the ordering and values + * for SDMA_AHG_APPLY_UPDATE[123] + * are assumed in generating a skip + * count in submit_tx() in sdma.c + */ +#define SDMA_AHG_NO_AHG 0 +#define SDMA_AHG_COPY 1 +#define SDMA_AHG_APPLY_UPDATE1 2 +#define SDMA_AHG_APPLY_UPDATE2 3 +#define SDMA_AHG_APPLY_UPDATE3 4 + +/* + * Bits defined in the send DMA descriptor. + */ +#define SDMA_DESC0_FIRST_DESC_FLAG BIT_ULL(63) +#define SDMA_DESC0_LAST_DESC_FLAG BIT_ULL(62) +#define SDMA_DESC0_BYTE_COUNT_SHIFT 48 +#define SDMA_DESC0_BYTE_COUNT_WIDTH 14 +#define SDMA_DESC0_BYTE_COUNT_MASK \ + ((1ULL << SDMA_DESC0_BYTE_COUNT_WIDTH) - 1) +#define SDMA_DESC0_BYTE_COUNT_SMASK \ + (SDMA_DESC0_BYTE_COUNT_MASK << SDMA_DESC0_BYTE_COUNT_SHIFT) +#define SDMA_DESC0_PHY_ADDR_SHIFT 0 +#define SDMA_DESC0_PHY_ADDR_WIDTH 48 +#define SDMA_DESC0_PHY_ADDR_MASK \ + ((1ULL << SDMA_DESC0_PHY_ADDR_WIDTH) - 1) +#define SDMA_DESC0_PHY_ADDR_SMASK \ + (SDMA_DESC0_PHY_ADDR_MASK << SDMA_DESC0_PHY_ADDR_SHIFT) + +#define SDMA_DESC1_HEADER_UPDATE1_SHIFT 32 +#define SDMA_DESC1_HEADER_UPDATE1_WIDTH 32 +#define SDMA_DESC1_HEADER_UPDATE1_MASK \ + ((1ULL << SDMA_DESC1_HEADER_UPDATE1_WIDTH) - 1) +#define SDMA_DESC1_HEADER_UPDATE1_SMASK \ + (SDMA_DESC1_HEADER_UPDATE1_MASK << SDMA_DESC1_HEADER_UPDATE1_SHIFT) +#define SDMA_DESC1_HEADER_MODE_SHIFT 13 +#define SDMA_DESC1_HEADER_MODE_WIDTH 3 +#define SDMA_DESC1_HEADER_MODE_MASK \ + ((1ULL << SDMA_DESC1_HEADER_MODE_WIDTH) - 1) +#define SDMA_DESC1_HEADER_MODE_SMASK \ + (SDMA_DESC1_HEADER_MODE_MASK << SDMA_DESC1_HEADER_MODE_SHIFT) +#define SDMA_DESC1_HEADER_INDEX_SHIFT 8 +#define SDMA_DESC1_HEADER_INDEX_WIDTH 5 +#define SDMA_DESC1_HEADER_INDEX_MASK \ + ((1ULL << SDMA_DESC1_HEADER_INDEX_WIDTH) - 1) +#define SDMA_DESC1_HEADER_INDEX_SMASK \ + (SDMA_DESC1_HEADER_INDEX_MASK << SDMA_DESC1_HEADER_INDEX_SHIFT) +#define SDMA_DESC1_HEADER_DWS_SHIFT 4 +#define SDMA_DESC1_HEADER_DWS_WIDTH 4 +#define SDMA_DESC1_HEADER_DWS_MASK \ + ((1ULL << SDMA_DESC1_HEADER_DWS_WIDTH) - 1) +#define SDMA_DESC1_HEADER_DWS_SMASK \ + (SDMA_DESC1_HEADER_DWS_MASK << SDMA_DESC1_HEADER_DWS_SHIFT) +#define SDMA_DESC1_GENERATION_SHIFT 2 +#define SDMA_DESC1_GENERATION_WIDTH 2 +#define SDMA_DESC1_GENERATION_MASK \ + ((1ULL << SDMA_DESC1_GENERATION_WIDTH) - 1) +#define SDMA_DESC1_GENERATION_SMASK \ + (SDMA_DESC1_GENERATION_MASK << SDMA_DESC1_GENERATION_SHIFT) +#define SDMA_DESC1_INT_REQ_FLAG BIT_ULL(1) +#define SDMA_DESC1_HEAD_TO_HOST_FLAG BIT_ULL(0) + +enum sdma_states { + sdma_state_s00_hw_down, + sdma_state_s10_hw_start_up_halt_wait, + sdma_state_s15_hw_start_up_clean_wait, + sdma_state_s20_idle, + sdma_state_s30_sw_clean_up_wait, + sdma_state_s40_hw_clean_up_wait, + sdma_state_s50_hw_halt_wait, + sdma_state_s60_idle_halt_wait, + sdma_state_s80_hw_freeze, + sdma_state_s82_freeze_sw_clean, + sdma_state_s99_running, +}; + +enum sdma_events { + sdma_event_e00_go_hw_down, + sdma_event_e10_go_hw_start, + sdma_event_e15_hw_halt_done, + sdma_event_e25_hw_clean_up_done, + sdma_event_e30_go_running, + sdma_event_e40_sw_cleaned, + sdma_event_e50_hw_cleaned, + sdma_event_e60_hw_halted, + sdma_event_e70_go_idle, + sdma_event_e80_hw_freeze, + sdma_event_e81_hw_frozen, + sdma_event_e82_hw_unfreeze, + sdma_event_e85_link_down, + sdma_event_e90_sw_halted, +}; + +struct sdma_set_state_action { + unsigned op_enable:1; + unsigned op_intenable:1; + unsigned op_halt:1; + unsigned op_cleanup:1; + unsigned go_s99_running_tofalse:1; + unsigned go_s99_running_totrue:1; +}; + +struct sdma_state { + struct kref kref; + struct completion comp; + enum sdma_states current_state; + unsigned current_op; + unsigned go_s99_running; + /* debugging/development */ + enum sdma_states previous_state; + unsigned previous_op; + enum sdma_events last_event; +}; + +/** + * DOC: sdma exported routines + * + * These sdma routines fit into three categories: + * - The SDMA API for building and submitting packets + * to the ring + * + * - Initialization and tear down routines to buildup + * and tear down SDMA + * + * - ISR entrances to handle interrupts, state changes + * and errors + */ + +/** + * DOC: sdma PSM/verbs API + * + * The sdma API is designed to be used by both PSM + * and verbs to supply packets to the SDMA ring. + * + * The usage of the API is as follows: + * + * Embed a struct iowait in the QP or + * PQ. The iowait should be initialized with a + * call to iowait_init(). + * + * The user of the API should create an allocation method + * for their version of the txreq. slabs, pre-allocated lists, + * and dma pools can be used. Once the user's overload of + * the sdma_txreq has been allocated, the sdma_txreq member + * must be initialized with sdma_txinit() or sdma_txinit_ahg(). + * + * The txreq must be declared with the sdma_txreq first. + * + * The tx request, once initialized, is manipulated with calls to + * sdma_txadd_daddr(), sdma_txadd_page(), or sdma_txadd_kvaddr() + * for each disjoint memory location. It is the user's responsibility + * to understand the packet boundaries and page boundaries to do the + * appropriate number of sdma_txadd_* calls.. The user + * must be prepared to deal with failures from these routines due to + * either memory allocation or dma_mapping failures. + * + * The mapping specifics for each memory location are recorded + * in the tx. Memory locations added with sdma_txadd_page() + * and sdma_txadd_kvaddr() are automatically mapped when added + * to the tx and nmapped as part of the progress processing in the + * SDMA interrupt handling. + * + * sdma_txadd_daddr() is used to add an dma_addr_t memory to the + * tx. An example of a use case would be a pre-allocated + * set of headers allocated via dma_pool_alloc() or + * dma_alloc_coherent(). For these memory locations, it + * is the responsibility of the user to handle that unmapping. + * (This would usually be at an unload or job termination.) + * + * The routine sdma_send_txreq() is used to submit + * a tx to the ring after the appropriate number of + * sdma_txadd_* have been done. + * + * If it is desired to send a burst of sdma_txreqs, sdma_send_txlist() + * can be used to submit a list of packets. + * + * The user is free to use the link overhead in the struct sdma_txreq as + * long as the tx isn't in flight. + * + * The extreme degenerate case of the number of descriptors + * exceeding the ring size is automatically handled as + * memory locations are added. An overflow of the descriptor + * array that is part of the sdma_txreq is also automatically + * handled. + * + */ + +/** + * DOC: Infrastructure calls + * + * sdma_init() is used to initialize data structures and + * CSRs for the desired number of SDMA engines. + * + * sdma_start() is used to kick the SDMA engines initialized + * with sdma_init(). Interrupts must be enabled at this + * point since aspects of the state machine are interrupt + * driven. + * + * sdma_engine_error() and sdma_engine_interrupt() are + * entrances for interrupts. + * + * sdma_map_init() is for the management of the mapping + * table when the number of vls is changed. + * + */ + +/* + * struct hw_sdma_desc - raw 128 bit SDMA descriptor + * + * This is the raw descriptor in the SDMA ring + */ +struct hw_sdma_desc { + /* private: don't use directly */ + __le64 qw[2]; +}; + +/** + * struct sdma_engine - Data pertaining to each SDMA engine. + * @dd: a back-pointer to the device data + * @ppd: per port back-pointer + * @imask: mask for irq manipulation + * @idle_mask: mask for determining if an interrupt is due to sdma_idle + * + * This structure has the state for each sdma_engine. + * + * Accessing to non public fields are not supported + * since the private members are subject to change. + */ +struct sdma_engine { + /* read mostly */ + struct hfi1_devdata *dd; + struct hfi1_pportdata *ppd; + /* private: */ + void __iomem *tail_csr; + u64 imask; /* clear interrupt mask */ + u64 idle_mask; + u64 progress_mask; + u64 int_mask; + /* private: */ + volatile __le64 *head_dma; /* DMA'ed by chip */ + /* private: */ + dma_addr_t head_phys; + /* private: */ + struct hw_sdma_desc *descq; + /* private: */ + unsigned descq_full_count; + struct sdma_txreq **tx_ring; + /* private: */ + dma_addr_t descq_phys; + /* private */ + u32 sdma_mask; + /* private */ + struct sdma_state state; + /* private */ + int cpu; + /* private: */ + u8 sdma_shift; + /* private: */ + u8 this_idx; /* zero relative engine */ + /* protect changes to senddmactrl shadow */ + spinlock_t senddmactrl_lock; + /* private: */ + u64 p_senddmactrl; /* shadow per-engine SendDmaCtrl */ + + /* read/write using tail_lock */ + spinlock_t tail_lock ____cacheline_aligned_in_smp; +#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER + /* private: */ + u64 tail_sn; +#endif + /* private: */ + u32 descq_tail; + /* private: */ + unsigned long ahg_bits; + /* private: */ + u16 desc_avail; + /* private: */ + u16 tx_tail; + /* private: */ + u16 descq_cnt; + + /* read/write using head_lock */ + /* private: */ + seqlock_t head_lock ____cacheline_aligned_in_smp; +#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER + /* private: */ + u64 head_sn; +#endif + /* private: */ + u32 descq_head; + /* private: */ + u16 tx_head; + /* private: */ + u64 last_status; + /* private */ + u64 err_cnt; + /* private */ + u64 sdma_int_cnt; + u64 idle_int_cnt; + u64 progress_int_cnt; + + /* private: */ + struct list_head dmawait; + + /* CONFIG SDMA for now, just blindly duplicate */ + /* private: */ + struct tasklet_struct sdma_hw_clean_up_task + ____cacheline_aligned_in_smp; + + /* private: */ + struct tasklet_struct sdma_sw_clean_up_task + ____cacheline_aligned_in_smp; + /* private: */ + struct work_struct err_halt_worker; + /* private */ + struct timer_list err_progress_check_timer; + u32 progress_check_head; + /* private: */ + struct work_struct flush_worker; + /* protect flush list */ + spinlock_t flushlist_lock; + /* private: */ + struct list_head flushlist; +}; + +int sdma_init(struct hfi1_devdata *dd, u8 port); +void sdma_start(struct hfi1_devdata *dd); +void sdma_exit(struct hfi1_devdata *dd); +void sdma_all_running(struct hfi1_devdata *dd); +void sdma_all_idle(struct hfi1_devdata *dd); +void sdma_freeze_notify(struct hfi1_devdata *dd, int go_idle); +void sdma_freeze(struct hfi1_devdata *dd); +void sdma_unfreeze(struct hfi1_devdata *dd); +void sdma_wait(struct hfi1_devdata *dd); + +/** + * sdma_empty() - idle engine test + * @engine: sdma engine + * + * Currently used by verbs as a latency optimization. + * + * Return: + * 1 - empty, 0 - non-empty + */ +static inline int sdma_empty(struct sdma_engine *sde) +{ + return sde->descq_tail == sde->descq_head; +} + +static inline u16 sdma_descq_freecnt(struct sdma_engine *sde) +{ + return sde->descq_cnt - + (sde->descq_tail - + ACCESS_ONCE(sde->descq_head)) - 1; +} + +static inline u16 sdma_descq_inprocess(struct sdma_engine *sde) +{ + return sde->descq_cnt - sdma_descq_freecnt(sde); +} + +/* + * Either head_lock or tail lock required to see + * a steady state. + */ +static inline int __sdma_running(struct sdma_engine *engine) +{ + return engine->state.current_state == sdma_state_s99_running; +} + +/** + * sdma_running() - state suitability test + * @engine: sdma engine + * + * sdma_running probes the internal state to determine if it is suitable + * for submitting packets. + * + * Return: + * 1 - ok to submit, 0 - not ok to submit + * + */ +static inline int sdma_running(struct sdma_engine *engine) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&engine->tail_lock, flags); + ret = __sdma_running(engine); + spin_unlock_irqrestore(&engine->tail_lock, flags); + return ret; +} + +void _sdma_txreq_ahgadd( + struct sdma_txreq *tx, + u8 num_ahg, + u8 ahg_entry, + u32 *ahg, + u8 ahg_hlen); + +/** + * sdma_txinit_ahg() - initialize an sdma_txreq struct with AHG + * @tx: tx request to initialize + * @flags: flags to key last descriptor additions + * @tlen: total packet length (pbc + headers + data) + * @ahg_entry: ahg entry to use (0 - 31) + * @num_ahg: ahg descriptor for first descriptor (0 - 9) + * @ahg: array of AHG descriptors (up to 9 entries) + * @ahg_hlen: number of bytes from ASIC entry to use + * @cb: callback + * + * The allocation of the sdma_txreq and it enclosing structure is user + * dependent. This routine must be called to initialize the user independent + * fields. + * + * The currently supported flags are SDMA_TXREQ_F_URGENT, + * SDMA_TXREQ_F_AHG_COPY, and SDMA_TXREQ_F_USE_AHG. + * + * SDMA_TXREQ_F_URGENT is used for latency sensitive situations where the + * completion is desired as soon as possible. + * + * SDMA_TXREQ_F_AHG_COPY causes the header in the first descriptor to be + * copied to chip entry. SDMA_TXREQ_F_USE_AHG causes the code to add in + * the AHG descriptors into the first 1 to 3 descriptors. + * + * Completions of submitted requests can be gotten on selected + * txreqs by giving a completion routine callback to sdma_txinit() or + * sdma_txinit_ahg(). The environment in which the callback runs + * can be from an ISR, a tasklet, or a thread, so no sleeping + * kernel routines can be used. Aspects of the sdma ring may + * be locked so care should be taken with locking. + * + * The callback pointer can be NULL to avoid any callback for the packet + * being submitted. The callback will be provided this tx, a status, and a flag. + * + * The status will be one of SDMA_TXREQ_S_OK, SDMA_TXREQ_S_SENDERROR, + * SDMA_TXREQ_S_ABORTED, or SDMA_TXREQ_S_SHUTDOWN. + * + * The flag, if the is the iowait had been used, indicates the iowait + * sdma_busy count has reached zero. + * + * user data portion of tlen should be precise. The sdma_txadd_* entrances + * will pad with a descriptor references 1 - 3 bytes when the number of bytes + * specified in tlen have been supplied to the sdma_txreq. + * + * ahg_hlen is used to determine the number of on-chip entry bytes to + * use as the header. This is for cases where the stored header is + * larger than the header to be used in a packet. This is typical + * for verbs where an RDMA_WRITE_FIRST is larger than the packet in + * and RDMA_WRITE_MIDDLE. + * + */ +static inline int sdma_txinit_ahg( + struct sdma_txreq *tx, + u16 flags, + u16 tlen, + u8 ahg_entry, + u8 num_ahg, + u32 *ahg, + u8 ahg_hlen, + void (*cb)(struct sdma_txreq *, int)) +{ + if (tlen == 0) + return -ENODATA; + if (tlen > MAX_SDMA_PKT_SIZE) + return -EMSGSIZE; + tx->desc_limit = ARRAY_SIZE(tx->descs); + tx->descp = &tx->descs[0]; + INIT_LIST_HEAD(&tx->list); + tx->num_desc = 0; + tx->flags = flags; + tx->complete = cb; + tx->coalesce_buf = NULL; + tx->wait = NULL; + tx->packet_len = tlen; + tx->tlen = tx->packet_len; + tx->descs[0].qw[0] = SDMA_DESC0_FIRST_DESC_FLAG; + tx->descs[0].qw[1] = 0; + if (flags & SDMA_TXREQ_F_AHG_COPY) + tx->descs[0].qw[1] |= + (((u64)ahg_entry & SDMA_DESC1_HEADER_INDEX_MASK) + << SDMA_DESC1_HEADER_INDEX_SHIFT) | + (((u64)SDMA_AHG_COPY & SDMA_DESC1_HEADER_MODE_MASK) + << SDMA_DESC1_HEADER_MODE_SHIFT); + else if (flags & SDMA_TXREQ_F_USE_AHG && num_ahg) + _sdma_txreq_ahgadd(tx, num_ahg, ahg_entry, ahg, ahg_hlen); + return 0; +} + +/** + * sdma_txinit() - initialize an sdma_txreq struct (no AHG) + * @tx: tx request to initialize + * @flags: flags to key last descriptor additions + * @tlen: total packet length (pbc + headers + data) + * @cb: callback pointer + * + * The allocation of the sdma_txreq and it enclosing structure is user + * dependent. This routine must be called to initialize the user + * independent fields. + * + * The currently supported flags is SDMA_TXREQ_F_URGENT. + * + * SDMA_TXREQ_F_URGENT is used for latency sensitive situations where the + * completion is desired as soon as possible. + * + * Completions of submitted requests can be gotten on selected + * txreqs by giving a completion routine callback to sdma_txinit() or + * sdma_txinit_ahg(). The environment in which the callback runs + * can be from an ISR, a tasklet, or a thread, so no sleeping + * kernel routines can be used. The head size of the sdma ring may + * be locked so care should be taken with locking. + * + * The callback pointer can be NULL to avoid any callback for the packet + * being submitted. + * + * The callback, if non-NULL, will be provided this tx and a status. The + * status will be one of SDMA_TXREQ_S_OK, SDMA_TXREQ_S_SENDERROR, + * SDMA_TXREQ_S_ABORTED, or SDMA_TXREQ_S_SHUTDOWN. + * + */ +static inline int sdma_txinit( + struct sdma_txreq *tx, + u16 flags, + u16 tlen, + void (*cb)(struct sdma_txreq *, int)) +{ + return sdma_txinit_ahg(tx, flags, tlen, 0, 0, NULL, 0, cb); +} + +/* helpers - don't use */ +static inline int sdma_mapping_type(struct sdma_desc *d) +{ + return (d->qw[1] & SDMA_DESC1_GENERATION_SMASK) + >> SDMA_DESC1_GENERATION_SHIFT; +} + +static inline size_t sdma_mapping_len(struct sdma_desc *d) +{ + return (d->qw[0] & SDMA_DESC0_BYTE_COUNT_SMASK) + >> SDMA_DESC0_BYTE_COUNT_SHIFT; +} + +static inline dma_addr_t sdma_mapping_addr(struct sdma_desc *d) +{ + return (d->qw[0] & SDMA_DESC0_PHY_ADDR_SMASK) + >> SDMA_DESC0_PHY_ADDR_SHIFT; +} + +static inline void make_tx_sdma_desc( + struct sdma_txreq *tx, + int type, + dma_addr_t addr, + size_t len) +{ + struct sdma_desc *desc = &tx->descp[tx->num_desc]; + + if (!tx->num_desc) { + /* qw[0] zero; qw[1] first, ahg mode already in from init */ + desc->qw[1] |= ((u64)type & SDMA_DESC1_GENERATION_MASK) + << SDMA_DESC1_GENERATION_SHIFT; + } else { + desc->qw[0] = 0; + desc->qw[1] = ((u64)type & SDMA_DESC1_GENERATION_MASK) + << SDMA_DESC1_GENERATION_SHIFT; + } + desc->qw[0] |= (((u64)addr & SDMA_DESC0_PHY_ADDR_MASK) + << SDMA_DESC0_PHY_ADDR_SHIFT) | + (((u64)len & SDMA_DESC0_BYTE_COUNT_MASK) + << SDMA_DESC0_BYTE_COUNT_SHIFT); +} + +/* helper to extend txreq */ +int ext_coal_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx, + int type, void *kvaddr, struct page *page, + unsigned long offset, u16 len); +int _pad_sdma_tx_descs(struct hfi1_devdata *, struct sdma_txreq *); +void sdma_txclean(struct hfi1_devdata *, struct sdma_txreq *); + +/* helpers used by public routines */ +static inline void _sdma_close_tx(struct hfi1_devdata *dd, + struct sdma_txreq *tx) +{ + tx->descp[tx->num_desc].qw[0] |= + SDMA_DESC0_LAST_DESC_FLAG; + tx->descp[tx->num_desc].qw[1] |= + dd->default_desc1; + if (tx->flags & SDMA_TXREQ_F_URGENT) + tx->descp[tx->num_desc].qw[1] |= + (SDMA_DESC1_HEAD_TO_HOST_FLAG | + SDMA_DESC1_INT_REQ_FLAG); +} + +static inline int _sdma_txadd_daddr( + struct hfi1_devdata *dd, + int type, + struct sdma_txreq *tx, + dma_addr_t addr, + u16 len) +{ + int rval = 0; + + make_tx_sdma_desc( + tx, + type, + addr, len); + WARN_ON(len > tx->tlen); + tx->tlen -= len; + /* special cases for last */ + if (!tx->tlen) { + if (tx->packet_len & (sizeof(u32) - 1)) { + rval = _pad_sdma_tx_descs(dd, tx); + if (rval) + return rval; + } else { + _sdma_close_tx(dd, tx); + } + } + tx->num_desc++; + return rval; +} + +/** + * sdma_txadd_page() - add a page to the sdma_txreq + * @dd: the device to use for mapping + * @tx: tx request to which the page is added + * @page: page to map + * @offset: offset within the page + * @len: length in bytes + * + * This is used to add a page/offset/length descriptor. + * + * The mapping/unmapping of the page/offset/len is automatically handled. + * + * Return: + * 0 - success, -ENOSPC - mapping fail, -ENOMEM - couldn't + * extend/coalesce descriptor array + */ +static inline int sdma_txadd_page( + struct hfi1_devdata *dd, + struct sdma_txreq *tx, + struct page *page, + unsigned long offset, + u16 len) +{ + dma_addr_t addr; + int rval; + + if ((unlikely(tx->num_desc == tx->desc_limit))) { + rval = ext_coal_sdma_tx_descs(dd, tx, SDMA_MAP_PAGE, + NULL, page, offset, len); + if (rval <= 0) + return rval; + } + + addr = dma_map_page( + &dd->pcidev->dev, + page, + offset, + len, + DMA_TO_DEVICE); + + if (unlikely(dma_mapping_error(&dd->pcidev->dev, addr))) { + sdma_txclean(dd, tx); + return -ENOSPC; + } + + return _sdma_txadd_daddr( + dd, SDMA_MAP_PAGE, tx, addr, len); +} + +/** + * sdma_txadd_daddr() - add a dma address to the sdma_txreq + * @dd: the device to use for mapping + * @tx: sdma_txreq to which the page is added + * @addr: dma address mapped by caller + * @len: length in bytes + * + * This is used to add a descriptor for memory that is already dma mapped. + * + * In this case, there is no unmapping as part of the progress processing for + * this memory location. + * + * Return: + * 0 - success, -ENOMEM - couldn't extend descriptor array + */ + +static inline int sdma_txadd_daddr( + struct hfi1_devdata *dd, + struct sdma_txreq *tx, + dma_addr_t addr, + u16 len) +{ + int rval; + + if ((unlikely(tx->num_desc == tx->desc_limit))) { + rval = ext_coal_sdma_tx_descs(dd, tx, SDMA_MAP_NONE, + NULL, NULL, 0, 0); + if (rval <= 0) + return rval; + } + + return _sdma_txadd_daddr(dd, SDMA_MAP_NONE, tx, addr, len); +} + +/** + * sdma_txadd_kvaddr() - add a kernel virtual address to sdma_txreq + * @dd: the device to use for mapping + * @tx: sdma_txreq to which the page is added + * @kvaddr: the kernel virtual address + * @len: length in bytes + * + * This is used to add a descriptor referenced by the indicated kvaddr and + * len. + * + * The mapping/unmapping of the kvaddr and len is automatically handled. + * + * Return: + * 0 - success, -ENOSPC - mapping fail, -ENOMEM - couldn't extend/coalesce + * descriptor array + */ +static inline int sdma_txadd_kvaddr( + struct hfi1_devdata *dd, + struct sdma_txreq *tx, + void *kvaddr, + u16 len) +{ + dma_addr_t addr; + int rval; + + if ((unlikely(tx->num_desc == tx->desc_limit))) { + rval = ext_coal_sdma_tx_descs(dd, tx, SDMA_MAP_SINGLE, + kvaddr, NULL, 0, len); + if (rval <= 0) + return rval; + } + + addr = dma_map_single( + &dd->pcidev->dev, + kvaddr, + len, + DMA_TO_DEVICE); + + if (unlikely(dma_mapping_error(&dd->pcidev->dev, addr))) { + sdma_txclean(dd, tx); + return -ENOSPC; + } + + return _sdma_txadd_daddr( + dd, SDMA_MAP_SINGLE, tx, addr, len); +} + +struct iowait; + +int sdma_send_txreq(struct sdma_engine *sde, + struct iowait *wait, + struct sdma_txreq *tx); +int sdma_send_txlist(struct sdma_engine *sde, + struct iowait *wait, + struct list_head *tx_list); + +int sdma_ahg_alloc(struct sdma_engine *sde); +void sdma_ahg_free(struct sdma_engine *sde, int ahg_index); + +/** + * sdma_build_ahg - build ahg descriptor + * @data + * @dwindex + * @startbit + * @bits + * + * Build and return a 32 bit descriptor. + */ +static inline u32 sdma_build_ahg_descriptor( + u16 data, + u8 dwindex, + u8 startbit, + u8 bits) +{ + return (u32)(1UL << SDMA_AHG_UPDATE_ENABLE_SHIFT | + ((startbit & SDMA_AHG_FIELD_START_MASK) << + SDMA_AHG_FIELD_START_SHIFT) | + ((bits & SDMA_AHG_FIELD_LEN_MASK) << + SDMA_AHG_FIELD_LEN_SHIFT) | + ((dwindex & SDMA_AHG_INDEX_MASK) << + SDMA_AHG_INDEX_SHIFT) | + ((data & SDMA_AHG_VALUE_MASK) << + SDMA_AHG_VALUE_SHIFT)); +} + +/** + * sdma_progress - use seq number of detect head progress + * @sde: sdma_engine to check + * @seq: base seq count + * @tx: txreq for which we need to check descriptor availability + * + * This is used in the appropriate spot in the sleep routine + * to check for potential ring progress. This routine gets the + * seqcount before queuing the iowait structure for progress. + * + * If the seqcount indicates that progress needs to be checked, + * re-submission is detected by checking whether the descriptor + * queue has enough descriptor for the txreq. + */ +static inline unsigned sdma_progress(struct sdma_engine *sde, unsigned seq, + struct sdma_txreq *tx) +{ + if (read_seqretry(&sde->head_lock, seq)) { + sde->desc_avail = sdma_descq_freecnt(sde); + if (tx->num_desc > sde->desc_avail) + return 0; + return 1; + } + return 0; +} + +/** + * sdma_iowait_schedule() - initialize wait structure + * @sde: sdma_engine to schedule + * @wait: wait struct to schedule + * + * This function initializes the iowait + * structure embedded in the QP or PQ. + * + */ +static inline void sdma_iowait_schedule( + struct sdma_engine *sde, + struct iowait *wait) +{ + struct hfi1_pportdata *ppd = sde->dd->pport; + + iowait_schedule(wait, ppd->hfi1_wq, sde->cpu); +} + +/* for use by interrupt handling */ +void sdma_engine_error(struct sdma_engine *sde, u64 status); +void sdma_engine_interrupt(struct sdma_engine *sde, u64 status); + +/* + * + * The diagram below details the relationship of the mapping structures + * + * Since the mapping now allows for non-uniform engines per vl, the + * number of engines for a vl is either the vl_engines[vl] or + * a computation based on num_sdma/num_vls: + * + * For example: + * nactual = vl_engines ? vl_engines[vl] : num_sdma/num_vls + * + * n = roundup to next highest power of 2 using nactual + * + * In the case where there are num_sdma/num_vls doesn't divide + * evenly, the extras are added from the last vl downward. + * + * For the case where n > nactual, the engines are assigned + * in a round robin fashion wrapping back to the first engine + * for a particular vl. + * + * dd->sdma_map + * | sdma_map_elem[0] + * | +--------------------+ + * v | mask | + * sdma_vl_map |--------------------| + * +--------------------------+ | sde[0] -> eng 1 | + * | list (RCU) | |--------------------| + * |--------------------------| ->| sde[1] -> eng 2 | + * | mask | --/ |--------------------| + * |--------------------------| -/ | * | + * | actual_vls (max 8) | -/ |--------------------| + * |--------------------------| --/ | sde[n] -> eng n | + * | vls (max 8) | -/ +--------------------+ + * |--------------------------| --/ + * | map[0] |-/ + * |--------------------------| +--------------------+ + * | map[1] |--- | mask | + * |--------------------------| \---- |--------------------| + * | * | \-- | sde[0] -> eng 1+n | + * | * | \---- |--------------------| + * | * | \->| sde[1] -> eng 2+n | + * |--------------------------| |--------------------| + * | map[vls - 1] |- | * | + * +--------------------------+ \- |--------------------| + * \- | sde[m] -> eng m+n | + * \ +--------------------+ + * \- + * \ + * \- +--------------------+ + * \- | mask | + * \ |--------------------| + * \- | sde[0] -> eng 1+m+n| + * \- |--------------------| + * >| sde[1] -> eng 2+m+n| + * |--------------------| + * | * | + * |--------------------| + * | sde[o] -> eng o+m+n| + * +--------------------+ + * + */ + +/** + * struct sdma_map_elem - mapping for a vl + * @mask - selector mask + * @sde - array of engines for this vl + * + * The mask is used to "mod" the selector + * to produce index into the trailing + * array of sdes. + */ +struct sdma_map_elem { + u32 mask; + struct sdma_engine *sde[0]; +}; + +/** + * struct sdma_map_el - mapping for a vl + * @engine_to_vl - map of an engine to a vl + * @list - rcu head for free callback + * @mask - vl mask to "mod" the vl to produce an index to map array + * @actual_vls - number of vls + * @vls - number of vls rounded to next power of 2 + * @map - array of sdma_map_elem entries + * + * This is the parent mapping structure. The trailing + * members of the struct point to sdma_map_elem entries, which + * in turn point to an array of sde's for that vl. + */ +struct sdma_vl_map { + s8 engine_to_vl[TXE_NUM_SDMA_ENGINES]; + struct rcu_head list; + u32 mask; + u8 actual_vls; + u8 vls; + struct sdma_map_elem *map[0]; +}; + +int sdma_map_init( + struct hfi1_devdata *dd, + u8 port, + u8 num_vls, + u8 *vl_engines); + +/* slow path */ +void _sdma_engine_progress_schedule(struct sdma_engine *sde); + +/** + * sdma_engine_progress_schedule() - schedule progress on engine + * @sde: sdma_engine to schedule progress + * + * This is the fast path. + * + */ +static inline void sdma_engine_progress_schedule( + struct sdma_engine *sde) +{ + if (!sde || sdma_descq_inprocess(sde) < (sde->descq_cnt / 8)) + return; + _sdma_engine_progress_schedule(sde); +} + +struct sdma_engine *sdma_select_engine_sc( + struct hfi1_devdata *dd, + u32 selector, + u8 sc5); + +struct sdma_engine *sdma_select_engine_vl( + struct hfi1_devdata *dd, + u32 selector, + u8 vl); + +void sdma_seqfile_dump_sde(struct seq_file *s, struct sdma_engine *); + +#ifdef CONFIG_SDMA_VERBOSITY +void sdma_dumpstate(struct sdma_engine *); +#endif +static inline char *slashstrip(char *s) +{ + char *r = s; + + while (*s) + if (*s++ == '/') + r = s; + return r; +} + +u16 sdma_get_descq_cnt(void); + +extern uint mod_num_sdma; + +void sdma_update_lmc(struct hfi1_devdata *dd, u64 mask, u32 lid); + +#endif diff --git a/drivers/infiniband/hw/hfi1/sdma_txreq.h b/drivers/infiniband/hw/hfi1/sdma_txreq.h new file mode 100644 index 000000000..bf7d777d7 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/sdma_txreq.h @@ -0,0 +1,135 @@ +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef HFI1_SDMA_TXREQ_H +#define HFI1_SDMA_TXREQ_H + +/* increased for AHG */ +#define NUM_DESC 6 + +/* + * struct sdma_desc - canonical fragment descriptor + * + * This is the descriptor carried in the tx request + * corresponding to each fragment. + * + */ +struct sdma_desc { + /* private: don't use directly */ + u64 qw[2]; +}; + +/** + * struct sdma_txreq - the sdma_txreq structure (one per packet) + * @list: for use by user and by queuing for wait + * + * This is the representation of a packet which consists of some + * number of fragments. Storage is provided to within the structure. + * for all fragments. + * + * The storage for the descriptors are automatically extended as needed + * when the currently allocation is exceeded. + * + * The user (Verbs or PSM) may overload this structure with fields + * specific to their use by putting this struct first in their struct. + * The method of allocation of the overloaded structure is user dependent + * + * The list is the only public field in the structure. + * + */ + +#define SDMA_TXREQ_S_OK 0 +#define SDMA_TXREQ_S_SENDERROR 1 +#define SDMA_TXREQ_S_ABORTED 2 +#define SDMA_TXREQ_S_SHUTDOWN 3 + +/* flags bits */ +#define SDMA_TXREQ_F_URGENT 0x0001 +#define SDMA_TXREQ_F_AHG_COPY 0x0002 +#define SDMA_TXREQ_F_USE_AHG 0x0004 + +struct sdma_txreq; +typedef void (*callback_t)(struct sdma_txreq *, int); + +struct iowait; +struct sdma_txreq { + struct list_head list; + /* private: */ + struct sdma_desc *descp; + /* private: */ + void *coalesce_buf; + /* private: */ + struct iowait *wait; + /* private: */ + callback_t complete; +#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER + u64 sn; +#endif + /* private: - used in coalesce/pad processing */ + u16 packet_len; + /* private: - down-counted to trigger last */ + u16 tlen; + /* private: */ + u16 num_desc; + /* private: */ + u16 desc_limit; + /* private: */ + u16 next_descq_idx; + /* private: */ + u16 coalesce_idx; + /* private: flags */ + u16 flags; + /* private: */ + struct sdma_desc descs[NUM_DESC]; +}; + +static inline int sdma_txreq_built(struct sdma_txreq *tx) +{ + return tx->num_desc; +} + +#endif /* HFI1_SDMA_TXREQ_H */ diff --git a/drivers/infiniband/hw/hfi1/sysfs.c b/drivers/infiniband/hw/hfi1/sysfs.c new file mode 100644 index 000000000..91fc2aed6 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/sysfs.c @@ -0,0 +1,785 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#include <linux/ctype.h> + +#include "hfi.h" +#include "mad.h" +#include "trace.h" + +/* + * Start of per-port congestion control structures and support code + */ + +/* + * Congestion control table size followed by table entries + */ +static ssize_t read_cc_table_bin(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t pos, size_t count) +{ + int ret; + struct hfi1_pportdata *ppd = + container_of(kobj, struct hfi1_pportdata, pport_cc_kobj); + struct cc_state *cc_state; + + ret = ppd->total_cct_entry * sizeof(struct ib_cc_table_entry_shadow) + + sizeof(__be16); + + if (pos > ret) + return -EINVAL; + + if (count > ret - pos) + count = ret - pos; + + if (!count) + return count; + + rcu_read_lock(); + cc_state = get_cc_state(ppd); + if (!cc_state) { + rcu_read_unlock(); + return -EINVAL; + } + memcpy(buf, (void *)&cc_state->cct + pos, count); + rcu_read_unlock(); + + return count; +} + +static void port_release(struct kobject *kobj) +{ + /* nothing to do since memory is freed by hfi1_free_devdata() */ +} + +static struct bin_attribute cc_table_bin_attr = { + .attr = {.name = "cc_table_bin", .mode = 0444}, + .read = read_cc_table_bin, + .size = PAGE_SIZE, +}; + +/* + * Congestion settings: port control, control map and an array of 16 + * entries for the congestion entries - increase, timer, event log + * trigger threshold and the minimum injection rate delay. + */ +static ssize_t read_cc_setting_bin(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t pos, size_t count) +{ + int ret; + struct hfi1_pportdata *ppd = + container_of(kobj, struct hfi1_pportdata, pport_cc_kobj); + struct cc_state *cc_state; + + ret = sizeof(struct opa_congestion_setting_attr_shadow); + + if (pos > ret) + return -EINVAL; + if (count > ret - pos) + count = ret - pos; + + if (!count) + return count; + + rcu_read_lock(); + cc_state = get_cc_state(ppd); + if (!cc_state) { + rcu_read_unlock(); + return -EINVAL; + } + memcpy(buf, (void *)&cc_state->cong_setting + pos, count); + rcu_read_unlock(); + + return count; +} + +static struct bin_attribute cc_setting_bin_attr = { + .attr = {.name = "cc_settings_bin", .mode = 0444}, + .read = read_cc_setting_bin, + .size = PAGE_SIZE, +}; + +struct hfi1_port_attr { + struct attribute attr; + ssize_t (*show)(struct hfi1_pportdata *, char *); + ssize_t (*store)(struct hfi1_pportdata *, const char *, size_t); +}; + +static ssize_t cc_prescan_show(struct hfi1_pportdata *ppd, char *buf) +{ + return sprintf(buf, "%s\n", ppd->cc_prescan ? "on" : "off"); +} + +static ssize_t cc_prescan_store(struct hfi1_pportdata *ppd, const char *buf, + size_t count) +{ + if (!memcmp(buf, "on", 2)) + ppd->cc_prescan = true; + else if (!memcmp(buf, "off", 3)) + ppd->cc_prescan = false; + + return count; +} + +static struct hfi1_port_attr cc_prescan_attr = + __ATTR(cc_prescan, 0600, cc_prescan_show, cc_prescan_store); + +static ssize_t cc_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct hfi1_port_attr *port_attr = + container_of(attr, struct hfi1_port_attr, attr); + struct hfi1_pportdata *ppd = + container_of(kobj, struct hfi1_pportdata, pport_cc_kobj); + + return port_attr->show(ppd, buf); +} + +static ssize_t cc_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct hfi1_port_attr *port_attr = + container_of(attr, struct hfi1_port_attr, attr); + struct hfi1_pportdata *ppd = + container_of(kobj, struct hfi1_pportdata, pport_cc_kobj); + + return port_attr->store(ppd, buf, count); +} + +static const struct sysfs_ops port_cc_sysfs_ops = { + .show = cc_attr_show, + .store = cc_attr_store +}; + +static struct attribute *port_cc_default_attributes[] = { + &cc_prescan_attr.attr +}; + +static struct kobj_type port_cc_ktype = { + .release = port_release, + .sysfs_ops = &port_cc_sysfs_ops, + .default_attrs = port_cc_default_attributes +}; + +/* Start sc2vl */ +#define HFI1_SC2VL_ATTR(N) \ + static struct hfi1_sc2vl_attr hfi1_sc2vl_attr_##N = { \ + .attr = { .name = __stringify(N), .mode = 0444 }, \ + .sc = N \ + } + +struct hfi1_sc2vl_attr { + struct attribute attr; + int sc; +}; + +HFI1_SC2VL_ATTR(0); +HFI1_SC2VL_ATTR(1); +HFI1_SC2VL_ATTR(2); +HFI1_SC2VL_ATTR(3); +HFI1_SC2VL_ATTR(4); +HFI1_SC2VL_ATTR(5); +HFI1_SC2VL_ATTR(6); +HFI1_SC2VL_ATTR(7); +HFI1_SC2VL_ATTR(8); +HFI1_SC2VL_ATTR(9); +HFI1_SC2VL_ATTR(10); +HFI1_SC2VL_ATTR(11); +HFI1_SC2VL_ATTR(12); +HFI1_SC2VL_ATTR(13); +HFI1_SC2VL_ATTR(14); +HFI1_SC2VL_ATTR(15); +HFI1_SC2VL_ATTR(16); +HFI1_SC2VL_ATTR(17); +HFI1_SC2VL_ATTR(18); +HFI1_SC2VL_ATTR(19); +HFI1_SC2VL_ATTR(20); +HFI1_SC2VL_ATTR(21); +HFI1_SC2VL_ATTR(22); +HFI1_SC2VL_ATTR(23); +HFI1_SC2VL_ATTR(24); +HFI1_SC2VL_ATTR(25); +HFI1_SC2VL_ATTR(26); +HFI1_SC2VL_ATTR(27); +HFI1_SC2VL_ATTR(28); +HFI1_SC2VL_ATTR(29); +HFI1_SC2VL_ATTR(30); +HFI1_SC2VL_ATTR(31); + +static struct attribute *sc2vl_default_attributes[] = { + &hfi1_sc2vl_attr_0.attr, + &hfi1_sc2vl_attr_1.attr, + &hfi1_sc2vl_attr_2.attr, + &hfi1_sc2vl_attr_3.attr, + &hfi1_sc2vl_attr_4.attr, + &hfi1_sc2vl_attr_5.attr, + &hfi1_sc2vl_attr_6.attr, + &hfi1_sc2vl_attr_7.attr, + &hfi1_sc2vl_attr_8.attr, + &hfi1_sc2vl_attr_9.attr, + &hfi1_sc2vl_attr_10.attr, + &hfi1_sc2vl_attr_11.attr, + &hfi1_sc2vl_attr_12.attr, + &hfi1_sc2vl_attr_13.attr, + &hfi1_sc2vl_attr_14.attr, + &hfi1_sc2vl_attr_15.attr, + &hfi1_sc2vl_attr_16.attr, + &hfi1_sc2vl_attr_17.attr, + &hfi1_sc2vl_attr_18.attr, + &hfi1_sc2vl_attr_19.attr, + &hfi1_sc2vl_attr_20.attr, + &hfi1_sc2vl_attr_21.attr, + &hfi1_sc2vl_attr_22.attr, + &hfi1_sc2vl_attr_23.attr, + &hfi1_sc2vl_attr_24.attr, + &hfi1_sc2vl_attr_25.attr, + &hfi1_sc2vl_attr_26.attr, + &hfi1_sc2vl_attr_27.attr, + &hfi1_sc2vl_attr_28.attr, + &hfi1_sc2vl_attr_29.attr, + &hfi1_sc2vl_attr_30.attr, + &hfi1_sc2vl_attr_31.attr, + NULL +}; + +static ssize_t sc2vl_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct hfi1_sc2vl_attr *sattr = + container_of(attr, struct hfi1_sc2vl_attr, attr); + struct hfi1_pportdata *ppd = + container_of(kobj, struct hfi1_pportdata, sc2vl_kobj); + struct hfi1_devdata *dd = ppd->dd; + + return sprintf(buf, "%u\n", *((u8 *)dd->sc2vl + sattr->sc)); +} + +static const struct sysfs_ops hfi1_sc2vl_ops = { + .show = sc2vl_attr_show, +}; + +static struct kobj_type hfi1_sc2vl_ktype = { + .release = port_release, + .sysfs_ops = &hfi1_sc2vl_ops, + .default_attrs = sc2vl_default_attributes +}; + +/* End sc2vl */ + +/* Start sl2sc */ +#define HFI1_SL2SC_ATTR(N) \ + static struct hfi1_sl2sc_attr hfi1_sl2sc_attr_##N = { \ + .attr = { .name = __stringify(N), .mode = 0444 }, \ + .sl = N \ + } + +struct hfi1_sl2sc_attr { + struct attribute attr; + int sl; +}; + +HFI1_SL2SC_ATTR(0); +HFI1_SL2SC_ATTR(1); +HFI1_SL2SC_ATTR(2); +HFI1_SL2SC_ATTR(3); +HFI1_SL2SC_ATTR(4); +HFI1_SL2SC_ATTR(5); +HFI1_SL2SC_ATTR(6); +HFI1_SL2SC_ATTR(7); +HFI1_SL2SC_ATTR(8); +HFI1_SL2SC_ATTR(9); +HFI1_SL2SC_ATTR(10); +HFI1_SL2SC_ATTR(11); +HFI1_SL2SC_ATTR(12); +HFI1_SL2SC_ATTR(13); +HFI1_SL2SC_ATTR(14); +HFI1_SL2SC_ATTR(15); +HFI1_SL2SC_ATTR(16); +HFI1_SL2SC_ATTR(17); +HFI1_SL2SC_ATTR(18); +HFI1_SL2SC_ATTR(19); +HFI1_SL2SC_ATTR(20); +HFI1_SL2SC_ATTR(21); +HFI1_SL2SC_ATTR(22); +HFI1_SL2SC_ATTR(23); +HFI1_SL2SC_ATTR(24); +HFI1_SL2SC_ATTR(25); +HFI1_SL2SC_ATTR(26); +HFI1_SL2SC_ATTR(27); +HFI1_SL2SC_ATTR(28); +HFI1_SL2SC_ATTR(29); +HFI1_SL2SC_ATTR(30); +HFI1_SL2SC_ATTR(31); + +static struct attribute *sl2sc_default_attributes[] = { + &hfi1_sl2sc_attr_0.attr, + &hfi1_sl2sc_attr_1.attr, + &hfi1_sl2sc_attr_2.attr, + &hfi1_sl2sc_attr_3.attr, + &hfi1_sl2sc_attr_4.attr, + &hfi1_sl2sc_attr_5.attr, + &hfi1_sl2sc_attr_6.attr, + &hfi1_sl2sc_attr_7.attr, + &hfi1_sl2sc_attr_8.attr, + &hfi1_sl2sc_attr_9.attr, + &hfi1_sl2sc_attr_10.attr, + &hfi1_sl2sc_attr_11.attr, + &hfi1_sl2sc_attr_12.attr, + &hfi1_sl2sc_attr_13.attr, + &hfi1_sl2sc_attr_14.attr, + &hfi1_sl2sc_attr_15.attr, + &hfi1_sl2sc_attr_16.attr, + &hfi1_sl2sc_attr_17.attr, + &hfi1_sl2sc_attr_18.attr, + &hfi1_sl2sc_attr_19.attr, + &hfi1_sl2sc_attr_20.attr, + &hfi1_sl2sc_attr_21.attr, + &hfi1_sl2sc_attr_22.attr, + &hfi1_sl2sc_attr_23.attr, + &hfi1_sl2sc_attr_24.attr, + &hfi1_sl2sc_attr_25.attr, + &hfi1_sl2sc_attr_26.attr, + &hfi1_sl2sc_attr_27.attr, + &hfi1_sl2sc_attr_28.attr, + &hfi1_sl2sc_attr_29.attr, + &hfi1_sl2sc_attr_30.attr, + &hfi1_sl2sc_attr_31.attr, + NULL +}; + +static ssize_t sl2sc_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct hfi1_sl2sc_attr *sattr = + container_of(attr, struct hfi1_sl2sc_attr, attr); + struct hfi1_pportdata *ppd = + container_of(kobj, struct hfi1_pportdata, sl2sc_kobj); + struct hfi1_ibport *ibp = &ppd->ibport_data; + + return sprintf(buf, "%u\n", ibp->sl_to_sc[sattr->sl]); +} + +static const struct sysfs_ops hfi1_sl2sc_ops = { + .show = sl2sc_attr_show, +}; + +static struct kobj_type hfi1_sl2sc_ktype = { + .release = port_release, + .sysfs_ops = &hfi1_sl2sc_ops, + .default_attrs = sl2sc_default_attributes +}; + +/* End sl2sc */ + +/* Start vl2mtu */ + +#define HFI1_VL2MTU_ATTR(N) \ + static struct hfi1_vl2mtu_attr hfi1_vl2mtu_attr_##N = { \ + .attr = { .name = __stringify(N), .mode = 0444 }, \ + .vl = N \ + } + +struct hfi1_vl2mtu_attr { + struct attribute attr; + int vl; +}; + +HFI1_VL2MTU_ATTR(0); +HFI1_VL2MTU_ATTR(1); +HFI1_VL2MTU_ATTR(2); +HFI1_VL2MTU_ATTR(3); +HFI1_VL2MTU_ATTR(4); +HFI1_VL2MTU_ATTR(5); +HFI1_VL2MTU_ATTR(6); +HFI1_VL2MTU_ATTR(7); +HFI1_VL2MTU_ATTR(8); +HFI1_VL2MTU_ATTR(9); +HFI1_VL2MTU_ATTR(10); +HFI1_VL2MTU_ATTR(11); +HFI1_VL2MTU_ATTR(12); +HFI1_VL2MTU_ATTR(13); +HFI1_VL2MTU_ATTR(14); +HFI1_VL2MTU_ATTR(15); + +static struct attribute *vl2mtu_default_attributes[] = { + &hfi1_vl2mtu_attr_0.attr, + &hfi1_vl2mtu_attr_1.attr, + &hfi1_vl2mtu_attr_2.attr, + &hfi1_vl2mtu_attr_3.attr, + &hfi1_vl2mtu_attr_4.attr, + &hfi1_vl2mtu_attr_5.attr, + &hfi1_vl2mtu_attr_6.attr, + &hfi1_vl2mtu_attr_7.attr, + &hfi1_vl2mtu_attr_8.attr, + &hfi1_vl2mtu_attr_9.attr, + &hfi1_vl2mtu_attr_10.attr, + &hfi1_vl2mtu_attr_11.attr, + &hfi1_vl2mtu_attr_12.attr, + &hfi1_vl2mtu_attr_13.attr, + &hfi1_vl2mtu_attr_14.attr, + &hfi1_vl2mtu_attr_15.attr, + NULL +}; + +static ssize_t vl2mtu_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct hfi1_vl2mtu_attr *vlattr = + container_of(attr, struct hfi1_vl2mtu_attr, attr); + struct hfi1_pportdata *ppd = + container_of(kobj, struct hfi1_pportdata, vl2mtu_kobj); + struct hfi1_devdata *dd = ppd->dd; + + return sprintf(buf, "%u\n", dd->vld[vlattr->vl].mtu); +} + +static const struct sysfs_ops hfi1_vl2mtu_ops = { + .show = vl2mtu_attr_show, +}; + +static struct kobj_type hfi1_vl2mtu_ktype = { + .release = port_release, + .sysfs_ops = &hfi1_vl2mtu_ops, + .default_attrs = vl2mtu_default_attributes +}; + +/* end of per-port file structures and support code */ + +/* + * Start of per-unit (or driver, in some cases, but replicated + * per unit) functions (these get a device *) + */ +static ssize_t show_rev(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct hfi1_ibdev *dev = + container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); + + return sprintf(buf, "%x\n", dd_from_dev(dev)->minrev); +} + +static ssize_t show_hfi(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct hfi1_ibdev *dev = + container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); + struct hfi1_devdata *dd = dd_from_dev(dev); + int ret; + + if (!dd->boardname) + ret = -EINVAL; + else + ret = scnprintf(buf, PAGE_SIZE, "%s\n", dd->boardname); + return ret; +} + +static ssize_t show_boardversion(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct hfi1_ibdev *dev = + container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); + struct hfi1_devdata *dd = dd_from_dev(dev); + + /* The string printed here is already newline-terminated. */ + return scnprintf(buf, PAGE_SIZE, "%s", dd->boardversion); +} + +static ssize_t show_nctxts(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct hfi1_ibdev *dev = + container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); + struct hfi1_devdata *dd = dd_from_dev(dev); + + /* + * Return the smaller of send and receive contexts. + * Normally, user level applications would require both a send + * and a receive context, so returning the smaller of the two counts + * give a more accurate picture of total contexts available. + */ + return scnprintf(buf, PAGE_SIZE, "%u\n", + min(dd->num_rcv_contexts - dd->first_user_ctxt, + (u32)dd->sc_sizes[SC_USER].count)); +} + +static ssize_t show_nfreectxts(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct hfi1_ibdev *dev = + container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); + struct hfi1_devdata *dd = dd_from_dev(dev); + + /* Return the number of free user ports (contexts) available. */ + return scnprintf(buf, PAGE_SIZE, "%u\n", dd->freectxts); +} + +static ssize_t show_serial(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct hfi1_ibdev *dev = + container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); + struct hfi1_devdata *dd = dd_from_dev(dev); + + return scnprintf(buf, PAGE_SIZE, "%s", dd->serial); +} + +static ssize_t store_chip_reset(struct device *device, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct hfi1_ibdev *dev = + container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); + struct hfi1_devdata *dd = dd_from_dev(dev); + int ret; + + if (count < 5 || memcmp(buf, "reset", 5) || !dd->diag_client) { + ret = -EINVAL; + goto bail; + } + + ret = hfi1_reset_device(dd->unit); +bail: + return ret < 0 ? ret : count; +} + +/* + * Convert the reported temperature from an integer (reported in + * units of 0.25C) to a floating point number. + */ +#define temp2str(temp, buf, size, idx) \ + scnprintf((buf) + (idx), (size) - (idx), "%u.%02u ", \ + ((temp) >> 2), ((temp) & 0x3) * 25) + +/* + * Dump tempsense values, in decimal, to ease shell-scripts. + */ +static ssize_t show_tempsense(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct hfi1_ibdev *dev = + container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); + struct hfi1_devdata *dd = dd_from_dev(dev); + struct hfi1_temp temp; + int ret; + + ret = hfi1_tempsense_rd(dd, &temp); + if (!ret) { + int idx = 0; + + idx += temp2str(temp.curr, buf, PAGE_SIZE, idx); + idx += temp2str(temp.lo_lim, buf, PAGE_SIZE, idx); + idx += temp2str(temp.hi_lim, buf, PAGE_SIZE, idx); + idx += temp2str(temp.crit_lim, buf, PAGE_SIZE, idx); + idx += scnprintf(buf + idx, PAGE_SIZE - idx, + "%u %u %u\n", temp.triggers & 0x1, + temp.triggers & 0x2, temp.triggers & 0x4); + ret = idx; + } + return ret; +} + +/* + * end of per-unit (or driver, in some cases, but replicated + * per unit) functions + */ + +/* start of per-unit file structures and support code */ +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); +static DEVICE_ATTR(board_id, S_IRUGO, show_hfi, NULL); +static DEVICE_ATTR(nctxts, S_IRUGO, show_nctxts, NULL); +static DEVICE_ATTR(nfreectxts, S_IRUGO, show_nfreectxts, NULL); +static DEVICE_ATTR(serial, S_IRUGO, show_serial, NULL); +static DEVICE_ATTR(boardversion, S_IRUGO, show_boardversion, NULL); +static DEVICE_ATTR(tempsense, S_IRUGO, show_tempsense, NULL); +static DEVICE_ATTR(chip_reset, S_IWUSR, NULL, store_chip_reset); + +static struct device_attribute *hfi1_attributes[] = { + &dev_attr_hw_rev, + &dev_attr_board_id, + &dev_attr_nctxts, + &dev_attr_nfreectxts, + &dev_attr_serial, + &dev_attr_boardversion, + &dev_attr_tempsense, + &dev_attr_chip_reset, +}; + +int hfi1_create_port_files(struct ib_device *ibdev, u8 port_num, + struct kobject *kobj) +{ + struct hfi1_pportdata *ppd; + struct hfi1_devdata *dd = dd_from_ibdev(ibdev); + int ret; + + if (!port_num || port_num > dd->num_pports) { + dd_dev_err(dd, + "Skipping infiniband class with invalid port %u\n", + port_num); + return -ENODEV; + } + ppd = &dd->pport[port_num - 1]; + + ret = kobject_init_and_add(&ppd->sc2vl_kobj, &hfi1_sc2vl_ktype, kobj, + "sc2vl"); + if (ret) { + dd_dev_err(dd, + "Skipping sc2vl sysfs info, (err %d) port %u\n", + ret, port_num); + goto bail; + } + kobject_uevent(&ppd->sc2vl_kobj, KOBJ_ADD); + + ret = kobject_init_and_add(&ppd->sl2sc_kobj, &hfi1_sl2sc_ktype, kobj, + "sl2sc"); + if (ret) { + dd_dev_err(dd, + "Skipping sl2sc sysfs info, (err %d) port %u\n", + ret, port_num); + goto bail_sc2vl; + } + kobject_uevent(&ppd->sl2sc_kobj, KOBJ_ADD); + + ret = kobject_init_and_add(&ppd->vl2mtu_kobj, &hfi1_vl2mtu_ktype, kobj, + "vl2mtu"); + if (ret) { + dd_dev_err(dd, + "Skipping vl2mtu sysfs info, (err %d) port %u\n", + ret, port_num); + goto bail_sl2sc; + } + kobject_uevent(&ppd->vl2mtu_kobj, KOBJ_ADD); + + ret = kobject_init_and_add(&ppd->pport_cc_kobj, &port_cc_ktype, + kobj, "CCMgtA"); + if (ret) { + dd_dev_err(dd, + "Skipping Congestion Control sysfs info, (err %d) port %u\n", + ret, port_num); + goto bail_vl2mtu; + } + + kobject_uevent(&ppd->pport_cc_kobj, KOBJ_ADD); + + ret = sysfs_create_bin_file(&ppd->pport_cc_kobj, &cc_setting_bin_attr); + if (ret) { + dd_dev_err(dd, + "Skipping Congestion Control setting sysfs info, (err %d) port %u\n", + ret, port_num); + goto bail_cc; + } + + ret = sysfs_create_bin_file(&ppd->pport_cc_kobj, &cc_table_bin_attr); + if (ret) { + dd_dev_err(dd, + "Skipping Congestion Control table sysfs info, (err %d) port %u\n", + ret, port_num); + goto bail_cc_entry_bin; + } + + dd_dev_info(dd, + "Congestion Control Agent enabled for port %d\n", + port_num); + + return 0; + +bail_cc_entry_bin: + sysfs_remove_bin_file(&ppd->pport_cc_kobj, + &cc_setting_bin_attr); +bail_cc: + kobject_put(&ppd->pport_cc_kobj); +bail_vl2mtu: + kobject_put(&ppd->vl2mtu_kobj); +bail_sl2sc: + kobject_put(&ppd->sl2sc_kobj); +bail_sc2vl: + kobject_put(&ppd->sc2vl_kobj); +bail: + return ret; +} + +/* + * Register and create our files in /sys/class/infiniband. + */ +int hfi1_verbs_register_sysfs(struct hfi1_devdata *dd) +{ + struct ib_device *dev = &dd->verbs_dev.rdi.ibdev; + int i, ret; + + for (i = 0; i < ARRAY_SIZE(hfi1_attributes); ++i) { + ret = device_create_file(&dev->dev, hfi1_attributes[i]); + if (ret) + goto bail; + } + + return 0; +bail: + for (i = 0; i < ARRAY_SIZE(hfi1_attributes); ++i) + device_remove_file(&dev->dev, hfi1_attributes[i]); + return ret; +} + +/* + * Unregister and remove our files in /sys/class/infiniband. + */ +void hfi1_verbs_unregister_sysfs(struct hfi1_devdata *dd) +{ + struct hfi1_pportdata *ppd; + int i; + + for (i = 0; i < dd->num_pports; i++) { + ppd = &dd->pport[i]; + + sysfs_remove_bin_file(&ppd->pport_cc_kobj, + &cc_setting_bin_attr); + sysfs_remove_bin_file(&ppd->pport_cc_kobj, + &cc_table_bin_attr); + kobject_put(&ppd->pport_cc_kobj); + kobject_put(&ppd->vl2mtu_kobj); + kobject_put(&ppd->sl2sc_kobj); + kobject_put(&ppd->sc2vl_kobj); + } +} diff --git a/drivers/infiniband/hw/hfi1/trace.c b/drivers/infiniband/hw/hfi1/trace.c new file mode 100644 index 000000000..4cfb13771 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/trace.c @@ -0,0 +1,230 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#define CREATE_TRACE_POINTS +#include "trace.h" + +u8 ibhdr_exhdr_len(struct hfi1_ib_header *hdr) +{ + struct hfi1_other_headers *ohdr; + u8 opcode; + u8 lnh = (u8)(be16_to_cpu(hdr->lrh[0]) & 3); + + if (lnh == HFI1_LRH_BTH) + ohdr = &hdr->u.oth; + else + ohdr = &hdr->u.l.oth; + opcode = be32_to_cpu(ohdr->bth[0]) >> 24; + return hdr_len_by_opcode[opcode] == 0 ? + 0 : hdr_len_by_opcode[opcode] - (12 + 8); +} + +#define IMM_PRN "imm %d" +#define RETH_PRN "reth vaddr 0x%.16llx rkey 0x%.8x dlen 0x%.8x" +#define AETH_PRN "aeth syn 0x%.2x %s msn 0x%.8x" +#define DETH_PRN "deth qkey 0x%.8x sqpn 0x%.6x" +#define IETH_PRN "ieth rkey 0x%.8x" +#define ATOMICACKETH_PRN "origdata %lld" +#define ATOMICETH_PRN "vaddr 0x%llx rkey 0x%.8x sdata %lld cdata %lld" + +#define OP(transport, op) IB_OPCODE_## transport ## _ ## op + +static u64 ib_u64_get(__be32 *p) +{ + return ((u64)be32_to_cpu(p[0]) << 32) | be32_to_cpu(p[1]); +} + +static const char *parse_syndrome(u8 syndrome) +{ + switch (syndrome >> 5) { + case 0: + return "ACK"; + case 1: + return "RNRNAK"; + case 3: + return "NAK"; + } + return ""; +} + +const char *parse_everbs_hdrs( + struct trace_seq *p, + u8 opcode, + void *ehdrs) +{ + union ib_ehdrs *eh = ehdrs; + const char *ret = trace_seq_buffer_ptr(p); + + switch (opcode) { + /* imm */ + case OP(RC, SEND_LAST_WITH_IMMEDIATE): + case OP(UC, SEND_LAST_WITH_IMMEDIATE): + case OP(RC, SEND_ONLY_WITH_IMMEDIATE): + case OP(UC, SEND_ONLY_WITH_IMMEDIATE): + case OP(RC, RDMA_WRITE_LAST_WITH_IMMEDIATE): + case OP(UC, RDMA_WRITE_LAST_WITH_IMMEDIATE): + trace_seq_printf(p, IMM_PRN, + be32_to_cpu(eh->imm_data)); + break; + /* reth + imm */ + case OP(RC, RDMA_WRITE_ONLY_WITH_IMMEDIATE): + case OP(UC, RDMA_WRITE_ONLY_WITH_IMMEDIATE): + trace_seq_printf(p, RETH_PRN " " IMM_PRN, + (unsigned long long)ib_u64_get( + (__be32 *)&eh->rc.reth.vaddr), + be32_to_cpu(eh->rc.reth.rkey), + be32_to_cpu(eh->rc.reth.length), + be32_to_cpu(eh->rc.imm_data)); + break; + /* reth */ + case OP(RC, RDMA_READ_REQUEST): + case OP(RC, RDMA_WRITE_FIRST): + case OP(UC, RDMA_WRITE_FIRST): + case OP(RC, RDMA_WRITE_ONLY): + case OP(UC, RDMA_WRITE_ONLY): + trace_seq_printf(p, RETH_PRN, + (unsigned long long)ib_u64_get( + (__be32 *)&eh->rc.reth.vaddr), + be32_to_cpu(eh->rc.reth.rkey), + be32_to_cpu(eh->rc.reth.length)); + break; + case OP(RC, RDMA_READ_RESPONSE_FIRST): + case OP(RC, RDMA_READ_RESPONSE_LAST): + case OP(RC, RDMA_READ_RESPONSE_ONLY): + case OP(RC, ACKNOWLEDGE): + trace_seq_printf(p, AETH_PRN, be32_to_cpu(eh->aeth) >> 24, + parse_syndrome(be32_to_cpu(eh->aeth) >> 24), + be32_to_cpu(eh->aeth) & HFI1_MSN_MASK); + break; + /* aeth + atomicacketh */ + case OP(RC, ATOMIC_ACKNOWLEDGE): + trace_seq_printf(p, AETH_PRN " " ATOMICACKETH_PRN, + be32_to_cpu(eh->at.aeth) >> 24, + parse_syndrome(be32_to_cpu(eh->at.aeth) >> 24), + be32_to_cpu(eh->at.aeth) & HFI1_MSN_MASK, + (unsigned long long) + ib_u64_get(eh->at.atomic_ack_eth)); + break; + /* atomiceth */ + case OP(RC, COMPARE_SWAP): + case OP(RC, FETCH_ADD): + trace_seq_printf(p, ATOMICETH_PRN, + (unsigned long long)ib_u64_get( + eh->atomic_eth.vaddr), + eh->atomic_eth.rkey, + (unsigned long long)ib_u64_get( + (__be32 *)&eh->atomic_eth.swap_data), + (unsigned long long)ib_u64_get( + (__be32 *)&eh->atomic_eth.compare_data)); + break; + /* deth */ + case OP(UD, SEND_ONLY): + case OP(UD, SEND_ONLY_WITH_IMMEDIATE): + trace_seq_printf(p, DETH_PRN, + be32_to_cpu(eh->ud.deth[0]), + be32_to_cpu(eh->ud.deth[1]) & RVT_QPN_MASK); + break; + /* ieth */ + case OP(RC, SEND_LAST_WITH_INVALIDATE): + case OP(RC, SEND_ONLY_WITH_INVALIDATE): + trace_seq_printf(p, IETH_PRN, + be32_to_cpu(eh->ieth)); + break; + } + trace_seq_putc(p, 0); + return ret; +} + +const char *parse_sdma_flags( + struct trace_seq *p, + u64 desc0, u64 desc1) +{ + const char *ret = trace_seq_buffer_ptr(p); + char flags[5] = { 'x', 'x', 'x', 'x', 0 }; + + flags[0] = (desc1 & SDMA_DESC1_INT_REQ_FLAG) ? 'I' : '-'; + flags[1] = (desc1 & SDMA_DESC1_HEAD_TO_HOST_FLAG) ? 'H' : '-'; + flags[2] = (desc0 & SDMA_DESC0_FIRST_DESC_FLAG) ? 'F' : '-'; + flags[3] = (desc0 & SDMA_DESC0_LAST_DESC_FLAG) ? 'L' : '-'; + trace_seq_printf(p, "%s", flags); + if (desc0 & SDMA_DESC0_FIRST_DESC_FLAG) + trace_seq_printf(p, " amode:%u aidx:%u alen:%u", + (u8)((desc1 >> SDMA_DESC1_HEADER_MODE_SHIFT) & + SDMA_DESC1_HEADER_MODE_MASK), + (u8)((desc1 >> SDMA_DESC1_HEADER_INDEX_SHIFT) & + SDMA_DESC1_HEADER_INDEX_MASK), + (u8)((desc1 >> SDMA_DESC1_HEADER_DWS_SHIFT) & + SDMA_DESC1_HEADER_DWS_MASK)); + return ret; +} + +const char *print_u32_array( + struct trace_seq *p, + u32 *arr, int len) +{ + int i; + const char *ret = trace_seq_buffer_ptr(p); + + for (i = 0; i < len ; i++) + trace_seq_printf(p, "%s%#x", i == 0 ? "" : " ", arr[i]); + trace_seq_putc(p, 0); + return ret; +} + +__hfi1_trace_fn(PKT); +__hfi1_trace_fn(PROC); +__hfi1_trace_fn(SDMA); +__hfi1_trace_fn(LINKVERB); +__hfi1_trace_fn(DEBUG); +__hfi1_trace_fn(SNOOP); +__hfi1_trace_fn(CNTR); +__hfi1_trace_fn(PIO); +__hfi1_trace_fn(DC8051); +__hfi1_trace_fn(FIRMWARE); +__hfi1_trace_fn(RCVCTRL); +__hfi1_trace_fn(TID); +__hfi1_trace_fn(MMU); +__hfi1_trace_fn(IOCTL); diff --git a/drivers/infiniband/hw/hfi1/trace.h b/drivers/infiniband/hw/hfi1/trace.h new file mode 100644 index 000000000..28c1d0832 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/trace.h @@ -0,0 +1,1372 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#undef TRACE_SYSTEM_VAR +#define TRACE_SYSTEM_VAR hfi1 + +#if !defined(__HFI1_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define __HFI1_TRACE_H + +#include <linux/tracepoint.h> +#include <linux/trace_seq.h> + +#include "hfi.h" +#include "mad.h" +#include "sdma.h" + +#define DD_DEV_ENTRY(dd) __string(dev, dev_name(&(dd)->pcidev->dev)) +#define DD_DEV_ASSIGN(dd) __assign_str(dev, dev_name(&(dd)->pcidev->dev)) + +#define packettype_name(etype) { RHF_RCV_TYPE_##etype, #etype } +#define show_packettype(etype) \ +__print_symbolic(etype, \ + packettype_name(EXPECTED), \ + packettype_name(EAGER), \ + packettype_name(IB), \ + packettype_name(ERROR), \ + packettype_name(BYPASS)) + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_rx + +TRACE_EVENT(hfi1_rcvhdr, + TP_PROTO(struct hfi1_devdata *dd, + u32 ctxt, + u64 eflags, + u32 etype, + u32 hlen, + u32 tlen, + u32 updegr, + u32 etail + ), + TP_ARGS(dd, ctxt, eflags, etype, hlen, tlen, updegr, etail), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __field(u64, eflags) + __field(u32, ctxt) + __field(u32, etype) + __field(u32, hlen) + __field(u32, tlen) + __field(u32, updegr) + __field(u32, etail) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + __entry->eflags = eflags; + __entry->ctxt = ctxt; + __entry->etype = etype; + __entry->hlen = hlen; + __entry->tlen = tlen; + __entry->updegr = updegr; + __entry->etail = etail; + ), + TP_printk( + "[%s] ctxt %d eflags 0x%llx etype %d,%s hlen %d tlen %d updegr %d etail %d", + __get_str(dev), + __entry->ctxt, + __entry->eflags, + __entry->etype, show_packettype(__entry->etype), + __entry->hlen, + __entry->tlen, + __entry->updegr, + __entry->etail + ) +); + +TRACE_EVENT(hfi1_receive_interrupt, + TP_PROTO(struct hfi1_devdata *dd, u32 ctxt), + TP_ARGS(dd, ctxt), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __field(u32, ctxt) + __field(u8, slow_path) + __field(u8, dma_rtail) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + if (dd->rcd[ctxt]->do_interrupt == + &handle_receive_interrupt) { + __entry->slow_path = 1; + __entry->dma_rtail = 0xFF; + } else if (dd->rcd[ctxt]->do_interrupt == + &handle_receive_interrupt_dma_rtail){ + __entry->dma_rtail = 1; + __entry->slow_path = 0; + } else if (dd->rcd[ctxt]->do_interrupt == + &handle_receive_interrupt_nodma_rtail) { + __entry->dma_rtail = 0; + __entry->slow_path = 0; + } + ), + TP_printk("[%s] ctxt %d SlowPath: %d DmaRtail: %d", + __get_str(dev), + __entry->ctxt, + __entry->slow_path, + __entry->dma_rtail + ) +); + +TRACE_EVENT(hfi1_exp_tid_reg, + TP_PROTO(unsigned ctxt, u16 subctxt, u32 rarr, + u32 npages, unsigned long va, unsigned long pa, + dma_addr_t dma), + TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma), + TP_STRUCT__entry( + __field(unsigned, ctxt) + __field(u16, subctxt) + __field(u32, rarr) + __field(u32, npages) + __field(unsigned long, va) + __field(unsigned long, pa) + __field(dma_addr_t, dma) + ), + TP_fast_assign( + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->rarr = rarr; + __entry->npages = npages; + __entry->va = va; + __entry->pa = pa; + __entry->dma = dma; + ), + TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx", + __entry->ctxt, + __entry->subctxt, + __entry->rarr, + __entry->npages, + __entry->pa, + __entry->va, + __entry->dma + ) + ); + +TRACE_EVENT(hfi1_exp_tid_unreg, + TP_PROTO(unsigned ctxt, u16 subctxt, u32 rarr, u32 npages, + unsigned long va, unsigned long pa, dma_addr_t dma), + TP_ARGS(ctxt, subctxt, rarr, npages, va, pa, dma), + TP_STRUCT__entry( + __field(unsigned, ctxt) + __field(u16, subctxt) + __field(u32, rarr) + __field(u32, npages) + __field(unsigned long, va) + __field(unsigned long, pa) + __field(dma_addr_t, dma) + ), + TP_fast_assign( + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->rarr = rarr; + __entry->npages = npages; + __entry->va = va; + __entry->pa = pa; + __entry->dma = dma; + ), + TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx, va:0x%lx dma:0x%llx", + __entry->ctxt, + __entry->subctxt, + __entry->rarr, + __entry->npages, + __entry->pa, + __entry->va, + __entry->dma + ) + ); + +TRACE_EVENT(hfi1_exp_tid_inval, + TP_PROTO(unsigned ctxt, u16 subctxt, unsigned long va, u32 rarr, + u32 npages, dma_addr_t dma), + TP_ARGS(ctxt, subctxt, va, rarr, npages, dma), + TP_STRUCT__entry( + __field(unsigned, ctxt) + __field(u16, subctxt) + __field(unsigned long, va) + __field(u32, rarr) + __field(u32, npages) + __field(dma_addr_t, dma) + ), + TP_fast_assign( + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->va = va; + __entry->rarr = rarr; + __entry->npages = npages; + __entry->dma = dma; + ), + TP_printk("[%u:%u] entry:%u, %u pages @ 0x%lx dma: 0x%llx", + __entry->ctxt, + __entry->subctxt, + __entry->rarr, + __entry->npages, + __entry->va, + __entry->dma + ) + ); + +TRACE_EVENT(hfi1_mmu_invalidate, + TP_PROTO(unsigned ctxt, u16 subctxt, const char *type, + unsigned long start, unsigned long end), + TP_ARGS(ctxt, subctxt, type, start, end), + TP_STRUCT__entry( + __field(unsigned, ctxt) + __field(u16, subctxt) + __string(type, type) + __field(unsigned long, start) + __field(unsigned long, end) + ), + TP_fast_assign( + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __assign_str(type, type); + __entry->start = start; + __entry->end = end; + ), + TP_printk("[%3u:%02u] MMU Invalidate (%s) 0x%lx - 0x%lx", + __entry->ctxt, + __entry->subctxt, + __get_str(type), + __entry->start, + __entry->end + ) + ); + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_tx + +TRACE_EVENT(hfi1_piofree, + TP_PROTO(struct send_context *sc, int extra), + TP_ARGS(sc, extra), + TP_STRUCT__entry(DD_DEV_ENTRY(sc->dd) + __field(u32, sw_index) + __field(u32, hw_context) + __field(int, extra) + ), + TP_fast_assign(DD_DEV_ASSIGN(sc->dd); + __entry->sw_index = sc->sw_index; + __entry->hw_context = sc->hw_context; + __entry->extra = extra; + ), + TP_printk("[%s] ctxt %u(%u) extra %d", + __get_str(dev), + __entry->sw_index, + __entry->hw_context, + __entry->extra + ) +); + +TRACE_EVENT(hfi1_wantpiointr, + TP_PROTO(struct send_context *sc, u32 needint, u64 credit_ctrl), + TP_ARGS(sc, needint, credit_ctrl), + TP_STRUCT__entry(DD_DEV_ENTRY(sc->dd) + __field(u32, sw_index) + __field(u32, hw_context) + __field(u32, needint) + __field(u64, credit_ctrl) + ), + TP_fast_assign(DD_DEV_ASSIGN(sc->dd); + __entry->sw_index = sc->sw_index; + __entry->hw_context = sc->hw_context; + __entry->needint = needint; + __entry->credit_ctrl = credit_ctrl; + ), + TP_printk("[%s] ctxt %u(%u) on %d credit_ctrl 0x%llx", + __get_str(dev), + __entry->sw_index, + __entry->hw_context, + __entry->needint, + (unsigned long long)__entry->credit_ctrl + ) +); + +DECLARE_EVENT_CLASS(hfi1_qpsleepwakeup_template, + TP_PROTO(struct rvt_qp *qp, u32 flags), + TP_ARGS(qp, flags), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u32, flags) + __field(u32, s_flags) + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)) + __entry->flags = flags; + __entry->qpn = qp->ibqp.qp_num; + __entry->s_flags = qp->s_flags; + ), + TP_printk( + "[%s] qpn 0x%x flags 0x%x s_flags 0x%x", + __get_str(dev), + __entry->qpn, + __entry->flags, + __entry->s_flags + ) +); + +DEFINE_EVENT(hfi1_qpsleepwakeup_template, hfi1_qpwakeup, + TP_PROTO(struct rvt_qp *qp, u32 flags), + TP_ARGS(qp, flags)); + +DEFINE_EVENT(hfi1_qpsleepwakeup_template, hfi1_qpsleep, + TP_PROTO(struct rvt_qp *qp, u32 flags), + TP_ARGS(qp, flags)); + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_ibhdrs + +u8 ibhdr_exhdr_len(struct hfi1_ib_header *hdr); +const char *parse_everbs_hdrs(struct trace_seq *p, u8 opcode, void *ehdrs); + +#define __parse_ib_ehdrs(op, ehdrs) parse_everbs_hdrs(p, op, ehdrs) + +const char *parse_sdma_flags(struct trace_seq *p, u64 desc0, u64 desc1); + +#define __parse_sdma_flags(desc0, desc1) parse_sdma_flags(p, desc0, desc1) + +#define lrh_name(lrh) { HFI1_##lrh, #lrh } +#define show_lnh(lrh) \ +__print_symbolic(lrh, \ + lrh_name(LRH_BTH), \ + lrh_name(LRH_GRH)) + +#define ib_opcode_name(opcode) { IB_OPCODE_##opcode, #opcode } +#define show_ib_opcode(opcode) \ +__print_symbolic(opcode, \ + ib_opcode_name(RC_SEND_FIRST), \ + ib_opcode_name(RC_SEND_MIDDLE), \ + ib_opcode_name(RC_SEND_LAST), \ + ib_opcode_name(RC_SEND_LAST_WITH_IMMEDIATE), \ + ib_opcode_name(RC_SEND_ONLY), \ + ib_opcode_name(RC_SEND_ONLY_WITH_IMMEDIATE), \ + ib_opcode_name(RC_RDMA_WRITE_FIRST), \ + ib_opcode_name(RC_RDMA_WRITE_MIDDLE), \ + ib_opcode_name(RC_RDMA_WRITE_LAST), \ + ib_opcode_name(RC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \ + ib_opcode_name(RC_RDMA_WRITE_ONLY), \ + ib_opcode_name(RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \ + ib_opcode_name(RC_RDMA_READ_REQUEST), \ + ib_opcode_name(RC_RDMA_READ_RESPONSE_FIRST), \ + ib_opcode_name(RC_RDMA_READ_RESPONSE_MIDDLE), \ + ib_opcode_name(RC_RDMA_READ_RESPONSE_LAST), \ + ib_opcode_name(RC_RDMA_READ_RESPONSE_ONLY), \ + ib_opcode_name(RC_ACKNOWLEDGE), \ + ib_opcode_name(RC_ATOMIC_ACKNOWLEDGE), \ + ib_opcode_name(RC_COMPARE_SWAP), \ + ib_opcode_name(RC_FETCH_ADD), \ + ib_opcode_name(RC_SEND_LAST_WITH_INVALIDATE), \ + ib_opcode_name(RC_SEND_ONLY_WITH_INVALIDATE), \ + ib_opcode_name(UC_SEND_FIRST), \ + ib_opcode_name(UC_SEND_MIDDLE), \ + ib_opcode_name(UC_SEND_LAST), \ + ib_opcode_name(UC_SEND_LAST_WITH_IMMEDIATE), \ + ib_opcode_name(UC_SEND_ONLY), \ + ib_opcode_name(UC_SEND_ONLY_WITH_IMMEDIATE), \ + ib_opcode_name(UC_RDMA_WRITE_FIRST), \ + ib_opcode_name(UC_RDMA_WRITE_MIDDLE), \ + ib_opcode_name(UC_RDMA_WRITE_LAST), \ + ib_opcode_name(UC_RDMA_WRITE_LAST_WITH_IMMEDIATE), \ + ib_opcode_name(UC_RDMA_WRITE_ONLY), \ + ib_opcode_name(UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE), \ + ib_opcode_name(UD_SEND_ONLY), \ + ib_opcode_name(UD_SEND_ONLY_WITH_IMMEDIATE), \ + ib_opcode_name(CNP)) + +#define LRH_PRN "vl %d lver %d sl %d lnh %d,%s dlid %.4x len %d slid %.4x" +#define BTH_PRN \ + "op 0x%.2x,%s se %d m %d pad %d tver %d pkey 0x%.4x " \ + "f %d b %d qpn 0x%.6x a %d psn 0x%.8x" +#define EHDR_PRN "%s" + +DECLARE_EVENT_CLASS(hfi1_ibhdr_template, + TP_PROTO(struct hfi1_devdata *dd, + struct hfi1_ib_header *hdr), + TP_ARGS(dd, hdr), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd) + /* LRH */ + __field(u8, vl) + __field(u8, lver) + __field(u8, sl) + __field(u8, lnh) + __field(u16, dlid) + __field(u16, len) + __field(u16, slid) + /* BTH */ + __field(u8, opcode) + __field(u8, se) + __field(u8, m) + __field(u8, pad) + __field(u8, tver) + __field(u16, pkey) + __field(u8, f) + __field(u8, b) + __field(u32, qpn) + __field(u8, a) + __field(u32, psn) + /* extended headers */ + __dynamic_array(u8, ehdrs, ibhdr_exhdr_len(hdr)) + ), + TP_fast_assign( + struct hfi1_other_headers *ohdr; + + DD_DEV_ASSIGN(dd); + /* LRH */ + __entry->vl = + (u8)(be16_to_cpu(hdr->lrh[0]) >> 12); + __entry->lver = + (u8)(be16_to_cpu(hdr->lrh[0]) >> 8) & 0xf; + __entry->sl = + (u8)(be16_to_cpu(hdr->lrh[0]) >> 4) & 0xf; + __entry->lnh = + (u8)(be16_to_cpu(hdr->lrh[0]) & 3); + __entry->dlid = + be16_to_cpu(hdr->lrh[1]); + /* allow for larger len */ + __entry->len = + be16_to_cpu(hdr->lrh[2]); + __entry->slid = + be16_to_cpu(hdr->lrh[3]); + /* BTH */ + if (__entry->lnh == HFI1_LRH_BTH) + ohdr = &hdr->u.oth; + else + ohdr = &hdr->u.l.oth; + __entry->opcode = + (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; + __entry->se = + (be32_to_cpu(ohdr->bth[0]) >> 23) & 1; + __entry->m = + (be32_to_cpu(ohdr->bth[0]) >> 22) & 1; + __entry->pad = + (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + __entry->tver = + (be32_to_cpu(ohdr->bth[0]) >> 16) & 0xf; + __entry->pkey = + be32_to_cpu(ohdr->bth[0]) & 0xffff; + __entry->f = + (be32_to_cpu(ohdr->bth[1]) >> HFI1_FECN_SHIFT) & + HFI1_FECN_MASK; + __entry->b = + (be32_to_cpu(ohdr->bth[1]) >> HFI1_BECN_SHIFT) & + HFI1_BECN_MASK; + __entry->qpn = + be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK; + __entry->a = + (be32_to_cpu(ohdr->bth[2]) >> 31) & 1; + /* allow for larger PSN */ + __entry->psn = + be32_to_cpu(ohdr->bth[2]) & 0x7fffffff; + /* extended headers */ + memcpy(__get_dynamic_array(ehdrs), &ohdr->u, + ibhdr_exhdr_len(hdr)); + ), + TP_printk("[%s] " LRH_PRN " " BTH_PRN " " EHDR_PRN, + __get_str(dev), + /* LRH */ + __entry->vl, + __entry->lver, + __entry->sl, + __entry->lnh, show_lnh(__entry->lnh), + __entry->dlid, + __entry->len, + __entry->slid, + /* BTH */ + __entry->opcode, show_ib_opcode(__entry->opcode), + __entry->se, + __entry->m, + __entry->pad, + __entry->tver, + __entry->pkey, + __entry->f, + __entry->b, + __entry->qpn, + __entry->a, + __entry->psn, + /* extended headers */ + __parse_ib_ehdrs( + __entry->opcode, + (void *)__get_dynamic_array(ehdrs)) + ) +); + +DEFINE_EVENT(hfi1_ibhdr_template, input_ibhdr, + TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr), + TP_ARGS(dd, hdr)); + +DEFINE_EVENT(hfi1_ibhdr_template, pio_output_ibhdr, + TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr), + TP_ARGS(dd, hdr)); + +DEFINE_EVENT(hfi1_ibhdr_template, ack_output_ibhdr, + TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr), + TP_ARGS(dd, hdr)); + +DEFINE_EVENT(hfi1_ibhdr_template, sdma_output_ibhdr, + TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ib_header *hdr), + TP_ARGS(dd, hdr)); + +#define SNOOP_PRN \ + "slid %.4x dlid %.4x qpn 0x%.6x opcode 0x%.2x,%s " \ + "svc lvl %d pkey 0x%.4x [header = %d bytes] [data = %d bytes]" + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_snoop + +TRACE_EVENT(snoop_capture, + TP_PROTO(struct hfi1_devdata *dd, + int hdr_len, + struct hfi1_ib_header *hdr, + int data_len, + void *data), + TP_ARGS(dd, hdr_len, hdr, data_len, data), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd) + __field(u16, slid) + __field(u16, dlid) + __field(u32, qpn) + __field(u8, opcode) + __field(u8, sl) + __field(u16, pkey) + __field(u32, hdr_len) + __field(u32, data_len) + __field(u8, lnh) + __dynamic_array(u8, raw_hdr, hdr_len) + __dynamic_array(u8, raw_pkt, data_len) + ), + TP_fast_assign( + struct hfi1_other_headers *ohdr; + + __entry->lnh = (u8)(be16_to_cpu(hdr->lrh[0]) & 3); + if (__entry->lnh == HFI1_LRH_BTH) + ohdr = &hdr->u.oth; + else + ohdr = &hdr->u.l.oth; + DD_DEV_ASSIGN(dd); + __entry->slid = be16_to_cpu(hdr->lrh[3]); + __entry->dlid = be16_to_cpu(hdr->lrh[1]); + __entry->qpn = be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK; + __entry->opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff; + __entry->sl = (u8)(be16_to_cpu(hdr->lrh[0]) >> 4) & 0xf; + __entry->pkey = be32_to_cpu(ohdr->bth[0]) & 0xffff; + __entry->hdr_len = hdr_len; + __entry->data_len = data_len; + memcpy(__get_dynamic_array(raw_hdr), hdr, hdr_len); + memcpy(__get_dynamic_array(raw_pkt), data, data_len); + ), + TP_printk( + "[%s] " SNOOP_PRN, + __get_str(dev), + __entry->slid, + __entry->dlid, + __entry->qpn, + __entry->opcode, + show_ib_opcode(__entry->opcode), + __entry->sl, + __entry->pkey, + __entry->hdr_len, + __entry->data_len + ) +); + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_ctxts + +#define UCTXT_FMT \ + "cred:%u, credaddr:0x%llx, piobase:0x%llx, rcvhdr_cnt:%u, " \ + "rcvbase:0x%llx, rcvegrc:%u, rcvegrb:0x%llx" +TRACE_EVENT(hfi1_uctxtdata, + TP_PROTO(struct hfi1_devdata *dd, struct hfi1_ctxtdata *uctxt), + TP_ARGS(dd, uctxt), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __field(unsigned, ctxt) + __field(u32, credits) + __field(u64, hw_free) + __field(u64, piobase) + __field(u16, rcvhdrq_cnt) + __field(u64, rcvhdrq_phys) + __field(u32, eager_cnt) + __field(u64, rcvegr_phys) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + __entry->ctxt = uctxt->ctxt; + __entry->credits = uctxt->sc->credits; + __entry->hw_free = (u64)uctxt->sc->hw_free; + __entry->piobase = (u64)uctxt->sc->base_addr; + __entry->rcvhdrq_cnt = uctxt->rcvhdrq_cnt; + __entry->rcvhdrq_phys = uctxt->rcvhdrq_phys; + __entry->eager_cnt = uctxt->egrbufs.alloced; + __entry->rcvegr_phys = + uctxt->egrbufs.rcvtids[0].phys; + ), + TP_printk("[%s] ctxt %u " UCTXT_FMT, + __get_str(dev), + __entry->ctxt, + __entry->credits, + __entry->hw_free, + __entry->piobase, + __entry->rcvhdrq_cnt, + __entry->rcvhdrq_phys, + __entry->eager_cnt, + __entry->rcvegr_phys + ) +); + +#define CINFO_FMT \ + "egrtids:%u, egr_size:%u, hdrq_cnt:%u, hdrq_size:%u, sdma_ring_size:%u" +TRACE_EVENT(hfi1_ctxt_info, + TP_PROTO(struct hfi1_devdata *dd, unsigned ctxt, unsigned subctxt, + struct hfi1_ctxt_info cinfo), + TP_ARGS(dd, ctxt, subctxt, cinfo), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __field(unsigned, ctxt) + __field(unsigned, subctxt) + __field(u16, egrtids) + __field(u16, rcvhdrq_cnt) + __field(u16, rcvhdrq_size) + __field(u16, sdma_ring_size) + __field(u32, rcvegr_size) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->egrtids = cinfo.egrtids; + __entry->rcvhdrq_cnt = cinfo.rcvhdrq_cnt; + __entry->rcvhdrq_size = cinfo.rcvhdrq_entsize; + __entry->sdma_ring_size = cinfo.sdma_ring_size; + __entry->rcvegr_size = cinfo.rcvegr_size; + ), + TP_printk("[%s] ctxt %u:%u " CINFO_FMT, + __get_str(dev), + __entry->ctxt, + __entry->subctxt, + __entry->egrtids, + __entry->rcvegr_size, + __entry->rcvhdrq_cnt, + __entry->rcvhdrq_size, + __entry->sdma_ring_size + ) +); + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_sma + +#define BCT_FORMAT \ + "shared_limit %x vls 0-7 [%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x][%x,%x] 15 [%x,%x]" + +#define BCT(field) \ + be16_to_cpu( \ + ((struct buffer_control *)__get_dynamic_array(bct))->field \ + ) + +DECLARE_EVENT_CLASS(hfi1_bct_template, + TP_PROTO(struct hfi1_devdata *dd, + struct buffer_control *bc), + TP_ARGS(dd, bc), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __dynamic_array(u8, bct, sizeof(*bc)) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + memcpy(__get_dynamic_array(bct), bc, + sizeof(*bc)); + ), + TP_printk(BCT_FORMAT, + BCT(overall_shared_limit), + + BCT(vl[0].dedicated), + BCT(vl[0].shared), + + BCT(vl[1].dedicated), + BCT(vl[1].shared), + + BCT(vl[2].dedicated), + BCT(vl[2].shared), + + BCT(vl[3].dedicated), + BCT(vl[3].shared), + + BCT(vl[4].dedicated), + BCT(vl[4].shared), + + BCT(vl[5].dedicated), + BCT(vl[5].shared), + + BCT(vl[6].dedicated), + BCT(vl[6].shared), + + BCT(vl[7].dedicated), + BCT(vl[7].shared), + + BCT(vl[15].dedicated), + BCT(vl[15].shared) + ) +); + +DEFINE_EVENT(hfi1_bct_template, bct_set, + TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc), + TP_ARGS(dd, bc)); + +DEFINE_EVENT(hfi1_bct_template, bct_get, + TP_PROTO(struct hfi1_devdata *dd, struct buffer_control *bc), + TP_ARGS(dd, bc)); + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_sdma + +TRACE_EVENT(hfi1_sdma_descriptor, + TP_PROTO(struct sdma_engine *sde, + u64 desc0, + u64 desc1, + u16 e, + void *descp), + TP_ARGS(sde, desc0, desc1, e, descp), + TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) + __field(void *, descp) + __field(u64, desc0) + __field(u64, desc1) + __field(u16, e) + __field(u8, idx) + ), + TP_fast_assign(DD_DEV_ASSIGN(sde->dd); + __entry->desc0 = desc0; + __entry->desc1 = desc1; + __entry->idx = sde->this_idx; + __entry->descp = descp; + __entry->e = e; + ), + TP_printk( + "[%s] SDE(%u) flags:%s addr:0x%016llx gen:%u len:%u d0:%016llx d1:%016llx to %p,%u", + __get_str(dev), + __entry->idx, + __parse_sdma_flags(__entry->desc0, __entry->desc1), + (__entry->desc0 >> SDMA_DESC0_PHY_ADDR_SHIFT) & + SDMA_DESC0_PHY_ADDR_MASK, + (u8)((__entry->desc1 >> SDMA_DESC1_GENERATION_SHIFT) & + SDMA_DESC1_GENERATION_MASK), + (u16)((__entry->desc0 >> SDMA_DESC0_BYTE_COUNT_SHIFT) & + SDMA_DESC0_BYTE_COUNT_MASK), + __entry->desc0, + __entry->desc1, + __entry->descp, + __entry->e + ) +); + +TRACE_EVENT(hfi1_sdma_engine_select, + TP_PROTO(struct hfi1_devdata *dd, u32 sel, u8 vl, u8 idx), + TP_ARGS(dd, sel, vl, idx), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __field(u32, sel) + __field(u8, vl) + __field(u8, idx) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd); + __entry->sel = sel; + __entry->vl = vl; + __entry->idx = idx; + ), + TP_printk("[%s] selecting SDE %u sel 0x%x vl %u", + __get_str(dev), + __entry->idx, + __entry->sel, + __entry->vl + ) +); + +DECLARE_EVENT_CLASS(hfi1_sdma_engine_class, + TP_PROTO(struct sdma_engine *sde, u64 status), + TP_ARGS(sde, status), + TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) + __field(u64, status) + __field(u8, idx) + ), + TP_fast_assign(DD_DEV_ASSIGN(sde->dd); + __entry->status = status; + __entry->idx = sde->this_idx; + ), + TP_printk("[%s] SDE(%u) status %llx", + __get_str(dev), + __entry->idx, + (unsigned long long)__entry->status + ) +); + +DEFINE_EVENT(hfi1_sdma_engine_class, hfi1_sdma_engine_interrupt, + TP_PROTO(struct sdma_engine *sde, u64 status), + TP_ARGS(sde, status) +); + +DEFINE_EVENT(hfi1_sdma_engine_class, hfi1_sdma_engine_progress, + TP_PROTO(struct sdma_engine *sde, u64 status), + TP_ARGS(sde, status) +); + +DECLARE_EVENT_CLASS(hfi1_sdma_ahg_ad, + TP_PROTO(struct sdma_engine *sde, int aidx), + TP_ARGS(sde, aidx), + TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) + __field(int, aidx) + __field(u8, idx) + ), + TP_fast_assign(DD_DEV_ASSIGN(sde->dd); + __entry->idx = sde->this_idx; + __entry->aidx = aidx; + ), + TP_printk("[%s] SDE(%u) aidx %d", + __get_str(dev), + __entry->idx, + __entry->aidx + ) +); + +DEFINE_EVENT(hfi1_sdma_ahg_ad, hfi1_ahg_allocate, + TP_PROTO(struct sdma_engine *sde, int aidx), + TP_ARGS(sde, aidx)); + +DEFINE_EVENT(hfi1_sdma_ahg_ad, hfi1_ahg_deallocate, + TP_PROTO(struct sdma_engine *sde, int aidx), + TP_ARGS(sde, aidx)); + +#ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER +TRACE_EVENT(hfi1_sdma_progress, + TP_PROTO(struct sdma_engine *sde, + u16 hwhead, + u16 swhead, + struct sdma_txreq *txp + ), + TP_ARGS(sde, hwhead, swhead, txp), + TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) + __field(u64, sn) + __field(u16, hwhead) + __field(u16, swhead) + __field(u16, txnext) + __field(u16, tx_tail) + __field(u16, tx_head) + __field(u8, idx) + ), + TP_fast_assign(DD_DEV_ASSIGN(sde->dd); + __entry->hwhead = hwhead; + __entry->swhead = swhead; + __entry->tx_tail = sde->tx_tail; + __entry->tx_head = sde->tx_head; + __entry->txnext = txp ? txp->next_descq_idx : ~0; + __entry->idx = sde->this_idx; + __entry->sn = txp ? txp->sn : ~0; + ), + TP_printk( + "[%s] SDE(%u) sn %llu hwhead %u swhead %u next_descq_idx %u tx_head %u tx_tail %u", + __get_str(dev), + __entry->idx, + __entry->sn, + __entry->hwhead, + __entry->swhead, + __entry->txnext, + __entry->tx_head, + __entry->tx_tail + ) +); +#else +TRACE_EVENT(hfi1_sdma_progress, + TP_PROTO(struct sdma_engine *sde, + u16 hwhead, u16 swhead, + struct sdma_txreq *txp + ), + TP_ARGS(sde, hwhead, swhead, txp), + TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) + __field(u16, hwhead) + __field(u16, swhead) + __field(u16, txnext) + __field(u16, tx_tail) + __field(u16, tx_head) + __field(u8, idx) + ), + TP_fast_assign(DD_DEV_ASSIGN(sde->dd); + __entry->hwhead = hwhead; + __entry->swhead = swhead; + __entry->tx_tail = sde->tx_tail; + __entry->tx_head = sde->tx_head; + __entry->txnext = txp ? txp->next_descq_idx : ~0; + __entry->idx = sde->this_idx; + ), + TP_printk( + "[%s] SDE(%u) hwhead %u swhead %u next_descq_idx %u tx_head %u tx_tail %u", + __get_str(dev), + __entry->idx, + __entry->hwhead, + __entry->swhead, + __entry->txnext, + __entry->tx_head, + __entry->tx_tail + ) +); +#endif + +DECLARE_EVENT_CLASS(hfi1_sdma_sn, + TP_PROTO(struct sdma_engine *sde, u64 sn), + TP_ARGS(sde, sn), + TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) + __field(u64, sn) + __field(u8, idx) + ), + TP_fast_assign(DD_DEV_ASSIGN(sde->dd); + __entry->sn = sn; + __entry->idx = sde->this_idx; + ), + TP_printk("[%s] SDE(%u) sn %llu", + __get_str(dev), + __entry->idx, + __entry->sn + ) +); + +DEFINE_EVENT(hfi1_sdma_sn, hfi1_sdma_out_sn, + TP_PROTO( + struct sdma_engine *sde, + u64 sn + ), + TP_ARGS(sde, sn) +); + +DEFINE_EVENT(hfi1_sdma_sn, hfi1_sdma_in_sn, + TP_PROTO(struct sdma_engine *sde, u64 sn), + TP_ARGS(sde, sn) +); + +#define USDMA_HDR_FORMAT \ + "[%s:%u:%u:%u] PBC=(0x%x 0x%x) LRH=(0x%x 0x%x) BTH=(0x%x 0x%x 0x%x) KDETH=(0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x) TIDVal=0x%x" + +TRACE_EVENT(hfi1_sdma_user_header, + TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 req, + struct hfi1_pkt_header *hdr, u32 tidval), + TP_ARGS(dd, ctxt, subctxt, req, hdr, tidval), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd) + __field(u16, ctxt) + __field(u8, subctxt) + __field(u16, req) + __field(__le32, pbc0) + __field(__le32, pbc1) + __field(__be32, lrh0) + __field(__be32, lrh1) + __field(__be32, bth0) + __field(__be32, bth1) + __field(__be32, bth2) + __field(__le32, kdeth0) + __field(__le32, kdeth1) + __field(__le32, kdeth2) + __field(__le32, kdeth3) + __field(__le32, kdeth4) + __field(__le32, kdeth5) + __field(__le32, kdeth6) + __field(__le32, kdeth7) + __field(__le32, kdeth8) + __field(u32, tidval) + ), + TP_fast_assign( + __le32 *pbc = (__le32 *)hdr->pbc; + __be32 *lrh = (__be32 *)hdr->lrh; + __be32 *bth = (__be32 *)hdr->bth; + __le32 *kdeth = (__le32 *)&hdr->kdeth; + + DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->req = req; + __entry->pbc0 = pbc[0]; + __entry->pbc1 = pbc[1]; + __entry->lrh0 = be32_to_cpu(lrh[0]); + __entry->lrh1 = be32_to_cpu(lrh[1]); + __entry->bth0 = be32_to_cpu(bth[0]); + __entry->bth1 = be32_to_cpu(bth[1]); + __entry->bth2 = be32_to_cpu(bth[2]); + __entry->kdeth0 = kdeth[0]; + __entry->kdeth1 = kdeth[1]; + __entry->kdeth2 = kdeth[2]; + __entry->kdeth3 = kdeth[3]; + __entry->kdeth4 = kdeth[4]; + __entry->kdeth5 = kdeth[5]; + __entry->kdeth6 = kdeth[6]; + __entry->kdeth7 = kdeth[7]; + __entry->kdeth8 = kdeth[8]; + __entry->tidval = tidval; + ), + TP_printk(USDMA_HDR_FORMAT, + __get_str(dev), + __entry->ctxt, + __entry->subctxt, + __entry->req, + __entry->pbc1, + __entry->pbc0, + __entry->lrh0, + __entry->lrh1, + __entry->bth0, + __entry->bth1, + __entry->bth2, + __entry->kdeth0, + __entry->kdeth1, + __entry->kdeth2, + __entry->kdeth3, + __entry->kdeth4, + __entry->kdeth5, + __entry->kdeth6, + __entry->kdeth7, + __entry->kdeth8, + __entry->tidval + ) + ); + +#define SDMA_UREQ_FMT \ + "[%s:%u:%u] ver/op=0x%x, iovcnt=%u, npkts=%u, frag=%u, idx=%u" +TRACE_EVENT(hfi1_sdma_user_reqinfo, + TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 *i), + TP_ARGS(dd, ctxt, subctxt, i), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd); + __field(u16, ctxt) + __field(u8, subctxt) + __field(u8, ver_opcode) + __field(u8, iovcnt) + __field(u16, npkts) + __field(u16, fragsize) + __field(u16, comp_idx) + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->ver_opcode = i[0] & 0xff; + __entry->iovcnt = (i[0] >> 8) & 0xff; + __entry->npkts = i[1]; + __entry->fragsize = i[2]; + __entry->comp_idx = i[3]; + ), + TP_printk(SDMA_UREQ_FMT, + __get_str(dev), + __entry->ctxt, + __entry->subctxt, + __entry->ver_opcode, + __entry->iovcnt, + __entry->npkts, + __entry->fragsize, + __entry->comp_idx + ) + ); + +#define usdma_complete_name(st) { st, #st } +#define show_usdma_complete_state(st) \ + __print_symbolic(st, \ + usdma_complete_name(FREE), \ + usdma_complete_name(QUEUED), \ + usdma_complete_name(COMPLETE), \ + usdma_complete_name(ERROR)) + +TRACE_EVENT(hfi1_sdma_user_completion, + TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 idx, + u8 state, int code), + TP_ARGS(dd, ctxt, subctxt, idx, state, code), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd) + __field(u16, ctxt) + __field(u8, subctxt) + __field(u16, idx) + __field(u8, state) + __field(int, code) + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->idx = idx; + __entry->state = state; + __entry->code = code; + ), + TP_printk("[%s:%u:%u:%u] SDMA completion state %s (%d)", + __get_str(dev), __entry->ctxt, __entry->subctxt, + __entry->idx, show_usdma_complete_state(__entry->state), + __entry->code) + ); + +const char *print_u32_array(struct trace_seq *, u32 *, int); +#define __print_u32_hex(arr, len) print_u32_array(p, arr, len) + +TRACE_EVENT(hfi1_sdma_user_header_ahg, + TP_PROTO(struct hfi1_devdata *dd, u16 ctxt, u8 subctxt, u16 req, + u8 sde, u8 ahgidx, u32 *ahg, int len, u32 tidval), + TP_ARGS(dd, ctxt, subctxt, req, sde, ahgidx, ahg, len, tidval), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd) + __field(u16, ctxt) + __field(u8, subctxt) + __field(u16, req) + __field(u8, sde) + __field(u8, idx) + __field(int, len) + __field(u32, tidval) + __array(u32, ahg, 10) + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd); + __entry->ctxt = ctxt; + __entry->subctxt = subctxt; + __entry->req = req; + __entry->sde = sde; + __entry->idx = ahgidx; + __entry->len = len; + __entry->tidval = tidval; + memcpy(__entry->ahg, ahg, len * sizeof(u32)); + ), + TP_printk("[%s:%u:%u:%u] (SDE%u/AHG%u) ahg[0-%d]=(%s) TIDVal=0x%x", + __get_str(dev), + __entry->ctxt, + __entry->subctxt, + __entry->req, + __entry->sde, + __entry->idx, + __entry->len - 1, + __print_u32_hex(__entry->ahg, __entry->len), + __entry->tidval + ) + ); + +TRACE_EVENT(hfi1_sdma_state, + TP_PROTO(struct sdma_engine *sde, + const char *cstate, + const char *nstate + ), + TP_ARGS(sde, cstate, nstate), + TP_STRUCT__entry(DD_DEV_ENTRY(sde->dd) + __string(curstate, cstate) + __string(newstate, nstate) + ), + TP_fast_assign(DD_DEV_ASSIGN(sde->dd); + __assign_str(curstate, cstate); + __assign_str(newstate, nstate); + ), + TP_printk("[%s] current state %s new state %s", + __get_str(dev), + __get_str(curstate), + __get_str(newstate) + ) +); + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_rc + +DECLARE_EVENT_CLASS(hfi1_rc_template, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn), + TP_STRUCT__entry( + DD_DEV_ENTRY(dd_from_ibdev(qp->ibqp.device)) + __field(u32, qpn) + __field(u32, s_flags) + __field(u32, psn) + __field(u32, s_psn) + __field(u32, s_next_psn) + __field(u32, s_sending_psn) + __field(u32, s_sending_hpsn) + __field(u32, r_psn) + ), + TP_fast_assign( + DD_DEV_ASSIGN(dd_from_ibdev(qp->ibqp.device)) + __entry->qpn = qp->ibqp.qp_num; + __entry->s_flags = qp->s_flags; + __entry->psn = psn; + __entry->s_psn = qp->s_psn; + __entry->s_next_psn = qp->s_next_psn; + __entry->s_sending_psn = qp->s_sending_psn; + __entry->s_sending_hpsn = qp->s_sending_hpsn; + __entry->r_psn = qp->r_psn; + ), + TP_printk( + "[%s] qpn 0x%x s_flags 0x%x psn 0x%x s_psn 0x%x s_next_psn 0x%x s_sending_psn 0x%x sending_hpsn 0x%x r_psn 0x%x", + __get_str(dev), + __entry->qpn, + __entry->s_flags, + __entry->psn, + __entry->s_psn, + __entry->s_next_psn, + __entry->s_sending_psn, + __entry->s_sending_hpsn, + __entry->r_psn + ) +); + +DEFINE_EVENT(hfi1_rc_template, hfi1_rc_sendcomplete, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +DEFINE_EVENT(hfi1_rc_template, hfi1_rc_ack, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +DEFINE_EVENT(hfi1_rc_template, hfi1_rc_timeout, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +DEFINE_EVENT(hfi1_rc_template, hfi1_rc_rcv_error, + TP_PROTO(struct rvt_qp *qp, u32 psn), + TP_ARGS(qp, psn) +); + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_misc + +TRACE_EVENT(hfi1_interrupt, + TP_PROTO(struct hfi1_devdata *dd, const struct is_table *is_entry, + int src), + TP_ARGS(dd, is_entry, src), + TP_STRUCT__entry(DD_DEV_ENTRY(dd) + __array(char, buf, 64) + __field(int, src) + ), + TP_fast_assign(DD_DEV_ASSIGN(dd) + is_entry->is_name(__entry->buf, 64, + src - is_entry->start); + __entry->src = src; + ), + TP_printk("[%s] source: %s [%d]", __get_str(dev), __entry->buf, + __entry->src) +); + +/* + * Note: + * This produces a REALLY ugly trace in the console output when the string is + * too long. + */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_trace + +#define MAX_MSG_LEN 512 + +DECLARE_EVENT_CLASS(hfi1_trace_template, + TP_PROTO(const char *function, struct va_format *vaf), + TP_ARGS(function, vaf), + TP_STRUCT__entry(__string(function, function) + __dynamic_array(char, msg, MAX_MSG_LEN) + ), + TP_fast_assign(__assign_str(function, function); + WARN_ON_ONCE(vsnprintf + (__get_dynamic_array(msg), + MAX_MSG_LEN, vaf->fmt, + *vaf->va) >= + MAX_MSG_LEN); + ), + TP_printk("(%s) %s", + __get_str(function), + __get_str(msg)) +); + +/* + * It may be nice to macroize the __hfi1_trace but the va_* stuff requires an + * actual function to work and can not be in a macro. + */ +#define __hfi1_trace_def(lvl) \ +void __hfi1_trace_##lvl(const char *funct, char *fmt, ...); \ + \ +DEFINE_EVENT(hfi1_trace_template, hfi1_ ##lvl, \ + TP_PROTO(const char *function, struct va_format *vaf), \ + TP_ARGS(function, vaf)) + +#define __hfi1_trace_fn(lvl) \ +void __hfi1_trace_##lvl(const char *func, char *fmt, ...) \ +{ \ + struct va_format vaf = { \ + .fmt = fmt, \ + }; \ + va_list args; \ + \ + va_start(args, fmt); \ + vaf.va = &args; \ + trace_hfi1_ ##lvl(func, &vaf); \ + va_end(args); \ + return; \ +} + +/* + * To create a new trace level simply define it below and as a __hfi1_trace_fn + * in trace.c. This will create all the hooks for calling + * hfi1_cdbg(LVL, fmt, ...); as well as take care of all + * the debugfs stuff. + */ +__hfi1_trace_def(PKT); +__hfi1_trace_def(PROC); +__hfi1_trace_def(SDMA); +__hfi1_trace_def(LINKVERB); +__hfi1_trace_def(DEBUG); +__hfi1_trace_def(SNOOP); +__hfi1_trace_def(CNTR); +__hfi1_trace_def(PIO); +__hfi1_trace_def(DC8051); +__hfi1_trace_def(FIRMWARE); +__hfi1_trace_def(RCVCTRL); +__hfi1_trace_def(TID); +__hfi1_trace_def(MMU); +__hfi1_trace_def(IOCTL); + +#define hfi1_cdbg(which, fmt, ...) \ + __hfi1_trace_##which(__func__, fmt, ##__VA_ARGS__) + +#define hfi1_dbg(fmt, ...) \ + hfi1_cdbg(DEBUG, fmt, ##__VA_ARGS__) + +/* + * Define HFI1_EARLY_DBG at compile time or here to enable early trace + * messages. Do not check in an enablement for this. + */ + +#ifdef HFI1_EARLY_DBG +#define hfi1_dbg_early(fmt, ...) \ + trace_printk(fmt, ##__VA_ARGS__) +#else +#define hfi1_dbg_early(fmt, ...) +#endif + +#endif /* __HFI1_TRACE_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace +#include <trace/define_trace.h> diff --git a/drivers/infiniband/hw/hfi1/twsi.c b/drivers/infiniband/hw/hfi1/twsi.c new file mode 100644 index 000000000..e82e52a63 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/twsi.c @@ -0,0 +1,489 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <linux/delay.h> +#include <linux/pci.h> +#include <linux/vmalloc.h> + +#include "hfi.h" +#include "twsi.h" + +/* + * "Two Wire Serial Interface" support. + * + * Originally written for a not-quite-i2c serial eeprom, which is + * still used on some supported boards. Later boards have added a + * variety of other uses, most board-specific, so the bit-boffing + * part has been split off to this file, while the other parts + * have been moved to chip-specific files. + * + * We have also dropped all pretense of fully generic (e.g. pretend + * we don't know whether '1' is the higher voltage) interface, as + * the restrictions of the generic i2c interface (e.g. no access from + * driver itself) make it unsuitable for this use. + */ + +#define READ_CMD 1 +#define WRITE_CMD 0 + +/** + * i2c_wait_for_writes - wait for a write + * @dd: the hfi1_ib device + * + * We use this instead of udelay directly, so we can make sure + * that previous register writes have been flushed all the way + * to the chip. Since we are delaying anyway, the cost doesn't + * hurt, and makes the bit twiddling more regular + */ +static void i2c_wait_for_writes(struct hfi1_devdata *dd, u32 target) +{ + /* + * implicit read of EXTStatus is as good as explicit + * read of scratch, if all we want to do is flush + * writes. + */ + hfi1_gpio_mod(dd, target, 0, 0, 0); + rmb(); /* inlined, so prevent compiler reordering */ +} + +/* + * QSFP modules are allowed to hold SCL low for 500uSec. Allow twice that + * for "almost compliant" modules + */ +#define SCL_WAIT_USEC 1000 + +/* BUF_WAIT is time bus must be free between STOP or ACK and to next START. + * Should be 20, but some chips need more. + */ +#define TWSI_BUF_WAIT_USEC 60 + +static void scl_out(struct hfi1_devdata *dd, u32 target, u8 bit) +{ + u32 mask; + + udelay(1); + + mask = QSFP_HFI0_I2CCLK; + + /* SCL is meant to be bare-drain, so never set "OUT", just DIR */ + hfi1_gpio_mod(dd, target, 0, bit ? 0 : mask, mask); + + /* + * Allow for slow slaves by simple + * delay for falling edge, sampling on rise. + */ + if (!bit) { + udelay(2); + } else { + int rise_usec; + + for (rise_usec = SCL_WAIT_USEC; rise_usec > 0; rise_usec -= 2) { + if (mask & hfi1_gpio_mod(dd, target, 0, 0, 0)) + break; + udelay(2); + } + if (rise_usec <= 0) + dd_dev_err(dd, "SCL interface stuck low > %d uSec\n", + SCL_WAIT_USEC); + } + i2c_wait_for_writes(dd, target); +} + +static u8 scl_in(struct hfi1_devdata *dd, u32 target, int wait) +{ + u32 read_val, mask; + + mask = QSFP_HFI0_I2CCLK; + /* SCL is meant to be bare-drain, so never set "OUT", just DIR */ + hfi1_gpio_mod(dd, target, 0, 0, mask); + read_val = hfi1_gpio_mod(dd, target, 0, 0, 0); + if (wait) + i2c_wait_for_writes(dd, target); + return (read_val & mask) >> GPIO_SCL_NUM; +} + +static void sda_out(struct hfi1_devdata *dd, u32 target, u8 bit) +{ + u32 mask; + + mask = QSFP_HFI0_I2CDAT; + + /* SDA is meant to be bare-drain, so never set "OUT", just DIR */ + hfi1_gpio_mod(dd, target, 0, bit ? 0 : mask, mask); + + i2c_wait_for_writes(dd, target); + udelay(2); +} + +static u8 sda_in(struct hfi1_devdata *dd, u32 target, int wait) +{ + u32 read_val, mask; + + mask = QSFP_HFI0_I2CDAT; + /* SDA is meant to be bare-drain, so never set "OUT", just DIR */ + hfi1_gpio_mod(dd, target, 0, 0, mask); + read_val = hfi1_gpio_mod(dd, target, 0, 0, 0); + if (wait) + i2c_wait_for_writes(dd, target); + return (read_val & mask) >> GPIO_SDA_NUM; +} + +/** + * i2c_ackrcv - see if ack following write is true + * @dd: the hfi1_ib device + */ +static int i2c_ackrcv(struct hfi1_devdata *dd, u32 target) +{ + u8 ack_received; + + /* AT ENTRY SCL = LOW */ + /* change direction, ignore data */ + ack_received = sda_in(dd, target, 1); + scl_out(dd, target, 1); + ack_received = sda_in(dd, target, 1) == 0; + scl_out(dd, target, 0); + return ack_received; +} + +static void stop_cmd(struct hfi1_devdata *dd, u32 target); + +/** + * rd_byte - read a byte, sending STOP on last, else ACK + * @dd: the hfi1_ib device + * + * Returns byte shifted out of device + */ +static int rd_byte(struct hfi1_devdata *dd, u32 target, int last) +{ + int bit_cntr, data; + + data = 0; + + for (bit_cntr = 7; bit_cntr >= 0; --bit_cntr) { + data <<= 1; + scl_out(dd, target, 1); + data |= sda_in(dd, target, 0); + scl_out(dd, target, 0); + } + if (last) { + scl_out(dd, target, 1); + stop_cmd(dd, target); + } else { + sda_out(dd, target, 0); + scl_out(dd, target, 1); + scl_out(dd, target, 0); + sda_out(dd, target, 1); + } + return data; +} + +/** + * wr_byte - write a byte, one bit at a time + * @dd: the hfi1_ib device + * @data: the byte to write + * + * Returns 0 if we got the following ack, otherwise 1 + */ +static int wr_byte(struct hfi1_devdata *dd, u32 target, u8 data) +{ + int bit_cntr; + u8 bit; + + for (bit_cntr = 7; bit_cntr >= 0; bit_cntr--) { + bit = (data >> bit_cntr) & 1; + sda_out(dd, target, bit); + scl_out(dd, target, 1); + scl_out(dd, target, 0); + } + return (!i2c_ackrcv(dd, target)) ? 1 : 0; +} + +/* + * issue TWSI start sequence: + * (both clock/data high, clock high, data low while clock is high) + */ +static void start_seq(struct hfi1_devdata *dd, u32 target) +{ + sda_out(dd, target, 1); + scl_out(dd, target, 1); + sda_out(dd, target, 0); + udelay(1); + scl_out(dd, target, 0); +} + +/** + * stop_seq - transmit the stop sequence + * @dd: the hfi1_ib device + * + * (both clock/data low, clock high, data high while clock is high) + */ +static void stop_seq(struct hfi1_devdata *dd, u32 target) +{ + scl_out(dd, target, 0); + sda_out(dd, target, 0); + scl_out(dd, target, 1); + sda_out(dd, target, 1); +} + +/** + * stop_cmd - transmit the stop condition + * @dd: the hfi1_ib device + * + * (both clock/data low, clock high, data high while clock is high) + */ +static void stop_cmd(struct hfi1_devdata *dd, u32 target) +{ + stop_seq(dd, target); + udelay(TWSI_BUF_WAIT_USEC); +} + +/** + * hfi1_twsi_reset - reset I2C communication + * @dd: the hfi1_ib device + * returns 0 if ok, -EIO on error + */ +int hfi1_twsi_reset(struct hfi1_devdata *dd, u32 target) +{ + int clock_cycles_left = 9; + u32 mask; + + /* Both SCL and SDA should be high. If not, there + * is something wrong. + */ + mask = QSFP_HFI0_I2CCLK | QSFP_HFI0_I2CDAT; + + /* + * Force pins to desired innocuous state. + * This is the default power-on state with out=0 and dir=0, + * So tri-stated and should be floating high (barring HW problems) + */ + hfi1_gpio_mod(dd, target, 0, 0, mask); + + /* Check if SCL is low, if it is low then we have a slave device + * misbehaving and there is not much we can do. + */ + if (!scl_in(dd, target, 0)) + return -EIO; + + /* Check if SDA is low, if it is low then we have to clock SDA + * up to 9 times for the device to release the bus + */ + while (clock_cycles_left--) { + if (sda_in(dd, target, 0)) + return 0; + scl_out(dd, target, 0); + scl_out(dd, target, 1); + } + + return -EIO; +} + +#define HFI1_TWSI_START 0x100 +#define HFI1_TWSI_STOP 0x200 + +/* Write byte to TWSI, optionally prefixed with START or suffixed with + * STOP. + * returns 0 if OK (ACK received), else != 0 + */ +static int twsi_wr(struct hfi1_devdata *dd, u32 target, int data, int flags) +{ + int ret = 1; + + if (flags & HFI1_TWSI_START) + start_seq(dd, target); + + /* Leaves SCL low (from i2c_ackrcv()) */ + ret = wr_byte(dd, target, data); + + if (flags & HFI1_TWSI_STOP) + stop_cmd(dd, target); + return ret; +} + +/* Added functionality for IBA7220-based cards */ +#define HFI1_TEMP_DEV 0x98 + +/* + * hfi1_twsi_blk_rd + * General interface for data transfer from twsi devices. + * One vestige of its former role is that it recognizes a device + * HFI1_TWSI_NO_DEV and does the correct operation for the legacy part, + * which responded to all TWSI device codes, interpreting them as + * address within device. On all other devices found on board handled by + * this driver, the device is followed by a N-byte "address" which selects + * the "register" or "offset" within the device from which data should + * be read. + */ +int hfi1_twsi_blk_rd(struct hfi1_devdata *dd, u32 target, int dev, int addr, + void *buffer, int len) +{ + u8 *bp = buffer; + int ret = 1; + int i; + int offset_size; + + /* obtain the offset size, strip it from the device address */ + offset_size = (dev >> 8) & 0xff; + dev &= 0xff; + + /* allow at most a 2 byte offset */ + if (offset_size > 2) + goto bail; + + if (dev == HFI1_TWSI_NO_DEV) { + /* legacy not-really-I2C */ + addr = (addr << 1) | READ_CMD; + ret = twsi_wr(dd, target, addr, HFI1_TWSI_START); + } else { + /* Actual I2C */ + if (offset_size) { + ret = twsi_wr(dd, target, + dev | WRITE_CMD, HFI1_TWSI_START); + if (ret) { + stop_cmd(dd, target); + goto bail; + } + + for (i = 0; i < offset_size; i++) { + ret = twsi_wr(dd, target, + (addr >> (i * 8)) & 0xff, 0); + udelay(TWSI_BUF_WAIT_USEC); + if (ret) { + dd_dev_err(dd, "Failed to write byte %d of offset 0x%04X\n", + i, addr); + goto bail; + } + } + } + ret = twsi_wr(dd, target, dev | READ_CMD, HFI1_TWSI_START); + } + if (ret) { + stop_cmd(dd, target); + goto bail; + } + + /* + * block devices keeps clocking data out as long as we ack, + * automatically incrementing the address. Some have "pages" + * whose boundaries will not be crossed, but the handling + * of these is left to the caller, who is in a better + * position to know. + */ + while (len-- > 0) { + /* + * Get and store data, sending ACK if length remaining, + * else STOP + */ + *bp++ = rd_byte(dd, target, !len); + } + + ret = 0; + +bail: + return ret; +} + +/* + * hfi1_twsi_blk_wr + * General interface for data transfer to twsi devices. + * One vestige of its former role is that it recognizes a device + * HFI1_TWSI_NO_DEV and does the correct operation for the legacy part, + * which responded to all TWSI device codes, interpreting them as + * address within device. On all other devices found on board handled by + * this driver, the device is followed by a N-byte "address" which selects + * the "register" or "offset" within the device to which data should + * be written. + */ +int hfi1_twsi_blk_wr(struct hfi1_devdata *dd, u32 target, int dev, int addr, + const void *buffer, int len) +{ + const u8 *bp = buffer; + int ret = 1; + int i; + int offset_size; + + /* obtain the offset size, strip it from the device address */ + offset_size = (dev >> 8) & 0xff; + dev &= 0xff; + + /* allow at most a 2 byte offset */ + if (offset_size > 2) + goto bail; + + if (dev == HFI1_TWSI_NO_DEV) { + if (twsi_wr(dd, target, (addr << 1) | WRITE_CMD, + HFI1_TWSI_START)) { + goto failed_write; + } + } else { + /* Real I2C */ + if (twsi_wr(dd, target, dev | WRITE_CMD, HFI1_TWSI_START)) + goto failed_write; + } + + for (i = 0; i < offset_size; i++) { + ret = twsi_wr(dd, target, (addr >> (i * 8)) & 0xff, 0); + udelay(TWSI_BUF_WAIT_USEC); + if (ret) { + dd_dev_err(dd, "Failed to write byte %d of offset 0x%04X\n", + i, addr); + goto bail; + } + } + + for (i = 0; i < len; i++) + if (twsi_wr(dd, target, *bp++, 0)) + goto failed_write; + + ret = 0; + +failed_write: + stop_cmd(dd, target); + +bail: + return ret; +} diff --git a/drivers/infiniband/hw/hfi1/twsi.h b/drivers/infiniband/hw/hfi1/twsi.h new file mode 100644 index 000000000..5b8a5b5e7 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/twsi.h @@ -0,0 +1,65 @@ +#ifndef _TWSI_H +#define _TWSI_H +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#define HFI1_TWSI_NO_DEV 0xFF + +struct hfi1_devdata; + +/* Bit position of SDA/SCL pins in ASIC_QSFP* registers */ +#define GPIO_SDA_NUM 1 +#define GPIO_SCL_NUM 0 + +/* these functions must be called with qsfp_lock held */ +int hfi1_twsi_reset(struct hfi1_devdata *dd, u32 target); +int hfi1_twsi_blk_rd(struct hfi1_devdata *dd, u32 target, int dev, int addr, + void *buffer, int len); +int hfi1_twsi_blk_wr(struct hfi1_devdata *dd, u32 target, int dev, int addr, + const void *buffer, int len); + +#endif /* _TWSI_H */ diff --git a/drivers/infiniband/hw/hfi1/uc.c b/drivers/infiniband/hw/hfi1/uc.c new file mode 100644 index 000000000..df773d433 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/uc.c @@ -0,0 +1,604 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include "hfi.h" +#include "verbs_txreq.h" +#include "qp.h" + +/* cut down ridiculously long IB macro names */ +#define OP(x) IB_OPCODE_UC_##x + +/* only opcode mask for adaptive pio */ +const u32 uc_only_opcode = + BIT(OP(SEND_ONLY) & 0x1f) | + BIT(OP(SEND_ONLY_WITH_IMMEDIATE & 0x1f)) | + BIT(OP(RDMA_WRITE_ONLY & 0x1f)) | + BIT(OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE & 0x1f)); + +/** + * hfi1_make_uc_req - construct a request packet (SEND, RDMA write) + * @qp: a pointer to the QP + * + * Assume s_lock is held. + * + * Return 1 if constructed; otherwise, return 0. + */ +int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_other_headers *ohdr; + struct rvt_swqe *wqe; + u32 hwords = 5; + u32 bth0 = 0; + u32 len; + u32 pmtu = qp->pmtu; + int middle = 0; + + ps->s_txreq = get_txreq(ps->dev, qp); + if (IS_ERR(ps->s_txreq)) + goto bail_no_tx; + + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) { + if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND)) + goto bail; + /* We are in the error state, flush the work request. */ + smp_read_barrier_depends(); /* see post_one_send() */ + if (qp->s_last == ACCESS_ONCE(qp->s_head)) + goto bail; + /* If DMAs are in progress, we can't flush immediately. */ + if (iowait_sdma_pending(&priv->s_iowait)) { + qp->s_flags |= RVT_S_WAIT_DMA; + goto bail; + } + clear_ahg(qp); + wqe = rvt_get_swqe_ptr(qp, qp->s_last); + hfi1_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); + goto done_free_tx; + } + + ohdr = &ps->s_txreq->phdr.hdr.u.oth; + if (qp->remote_ah_attr.ah_flags & IB_AH_GRH) + ohdr = &ps->s_txreq->phdr.hdr.u.l.oth; + + /* Get the next send request. */ + wqe = rvt_get_swqe_ptr(qp, qp->s_cur); + qp->s_wqe = NULL; + switch (qp->s_state) { + default: + if (!(ib_rvt_state_ops[qp->state] & + RVT_PROCESS_NEXT_SEND_OK)) + goto bail; + /* Check if send work queue is empty. */ + smp_read_barrier_depends(); /* see post_one_send() */ + if (qp->s_cur == ACCESS_ONCE(qp->s_head)) { + clear_ahg(qp); + goto bail; + } + /* + * Start a new request. + */ + qp->s_psn = wqe->psn; + qp->s_sge.sge = wqe->sg_list[0]; + qp->s_sge.sg_list = wqe->sg_list + 1; + qp->s_sge.num_sge = wqe->wr.num_sge; + qp->s_sge.total_len = wqe->length; + len = wqe->length; + qp->s_len = len; + switch (wqe->wr.opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + if (len > pmtu) { + qp->s_state = OP(SEND_FIRST); + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_SEND) { + qp->s_state = OP(SEND_ONLY); + } else { + qp->s_state = + OP(SEND_ONLY_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + ohdr->u.rc.reth.vaddr = + cpu_to_be64(wqe->rdma_wr.remote_addr); + ohdr->u.rc.reth.rkey = + cpu_to_be32(wqe->rdma_wr.rkey); + ohdr->u.rc.reth.length = cpu_to_be32(len); + hwords += sizeof(struct ib_reth) / 4; + if (len > pmtu) { + qp->s_state = OP(RDMA_WRITE_FIRST); + len = pmtu; + break; + } + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) { + qp->s_state = OP(RDMA_WRITE_ONLY); + } else { + qp->s_state = + OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE); + /* Immediate data comes after the RETH */ + ohdr->u.rc.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + } + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + default: + goto bail; + } + break; + + case OP(SEND_FIRST): + qp->s_state = OP(SEND_MIDDLE); + /* FALLTHROUGH */ + case OP(SEND_MIDDLE): + len = qp->s_len; + if (len > pmtu) { + len = pmtu; + middle = HFI1_CAP_IS_KSET(SDMA_AHG); + break; + } + if (wqe->wr.opcode == IB_WR_SEND) { + qp->s_state = OP(SEND_LAST); + } else { + qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + + case OP(RDMA_WRITE_FIRST): + qp->s_state = OP(RDMA_WRITE_MIDDLE); + /* FALLTHROUGH */ + case OP(RDMA_WRITE_MIDDLE): + len = qp->s_len; + if (len > pmtu) { + len = pmtu; + middle = HFI1_CAP_IS_KSET(SDMA_AHG); + break; + } + if (wqe->wr.opcode == IB_WR_RDMA_WRITE) { + qp->s_state = OP(RDMA_WRITE_LAST); + } else { + qp->s_state = + OP(RDMA_WRITE_LAST_WITH_IMMEDIATE); + /* Immediate data comes after the BTH */ + ohdr->u.imm_data = wqe->wr.ex.imm_data; + hwords += 1; + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + } + qp->s_wqe = wqe; + if (++qp->s_cur >= qp->s_size) + qp->s_cur = 0; + break; + } + qp->s_len -= len; + qp->s_hdrwords = hwords; + ps->s_txreq->sde = priv->s_sde; + qp->s_cur_sge = &qp->s_sge; + qp->s_cur_size = len; + hfi1_make_ruc_header(qp, ohdr, bth0 | (qp->s_state << 24), + mask_psn(qp->s_psn++), middle, ps); + /* pbc */ + ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2; + return 1; + +done_free_tx: + hfi1_put_txreq(ps->s_txreq); + ps->s_txreq = NULL; + return 1; + +bail: + hfi1_put_txreq(ps->s_txreq); + +bail_no_tx: + ps->s_txreq = NULL; + qp->s_flags &= ~RVT_S_BUSY; + qp->s_hdrwords = 0; + return 0; +} + +/** + * hfi1_uc_rcv - handle an incoming UC packet + * @ibp: the port the packet came in on + * @hdr: the header of the packet + * @rcv_flags: flags relevant to rcv processing + * @data: the packet data + * @tlen: the length of the packet + * @qp: the QP for this packet. + * + * This is called from qp_rcv() to process an incoming UC packet + * for the given QP. + * Called at interrupt level. + */ +void hfi1_uc_rcv(struct hfi1_packet *packet) +{ + struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data; + struct hfi1_ib_header *hdr = packet->hdr; + u32 rcv_flags = packet->rcv_flags; + void *data = packet->ebuf; + u32 tlen = packet->tlen; + struct rvt_qp *qp = packet->qp; + struct hfi1_other_headers *ohdr = packet->ohdr; + u32 bth0, opcode; + u32 hdrsize = packet->hlen; + u32 psn; + u32 pad; + struct ib_wc wc; + u32 pmtu = qp->pmtu; + struct ib_reth *reth; + int has_grh = rcv_flags & HFI1_HAS_GRH; + int ret; + u32 bth1; + + bth0 = be32_to_cpu(ohdr->bth[0]); + if (hfi1_ruc_check_hdr(ibp, hdr, has_grh, qp, bth0)) + return; + + bth1 = be32_to_cpu(ohdr->bth[1]); + if (unlikely(bth1 & (HFI1_BECN_SMASK | HFI1_FECN_SMASK))) { + if (bth1 & HFI1_BECN_SMASK) { + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u32 rqpn, lqpn; + u16 rlid = be16_to_cpu(hdr->lrh[3]); + u8 sl, sc5; + + lqpn = bth1 & RVT_QPN_MASK; + rqpn = qp->remote_qpn; + + sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl]; + sl = ibp->sc_to_sl[sc5]; + + process_becn(ppd, sl, rlid, lqpn, rqpn, + IB_CC_SVCTYPE_UC); + } + + if (bth1 & HFI1_FECN_SMASK) { + struct ib_grh *grh = NULL; + u16 pkey = (u16)be32_to_cpu(ohdr->bth[0]); + u16 slid = be16_to_cpu(hdr->lrh[3]); + u16 dlid = be16_to_cpu(hdr->lrh[1]); + u32 src_qp = qp->remote_qpn; + u8 sc5; + + sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl]; + if (has_grh) + grh = &hdr->u.l.grh; + + return_cnp(ibp, qp, src_qp, pkey, dlid, slid, sc5, + grh); + } + } + + psn = be32_to_cpu(ohdr->bth[2]); + opcode = (bth0 >> 24) & 0xff; + + /* Compare the PSN verses the expected PSN. */ + if (unlikely(cmp_psn(psn, qp->r_psn) != 0)) { + /* + * Handle a sequence error. + * Silently drop any current message. + */ + qp->r_psn = psn; +inv: + if (qp->r_state == OP(SEND_FIRST) || + qp->r_state == OP(SEND_MIDDLE)) { + set_bit(RVT_R_REWIND_SGE, &qp->r_aflags); + qp->r_sge.num_sge = 0; + } else { + rvt_put_ss(&qp->r_sge); + } + qp->r_state = OP(SEND_LAST); + switch (opcode) { + case OP(SEND_FIRST): + case OP(SEND_ONLY): + case OP(SEND_ONLY_WITH_IMMEDIATE): + goto send_first; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_ONLY): + case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): + goto rdma_first; + + default: + goto drop; + } + } + + /* Check for opcode sequence errors. */ + switch (qp->r_state) { + case OP(SEND_FIRST): + case OP(SEND_MIDDLE): + if (opcode == OP(SEND_MIDDLE) || + opcode == OP(SEND_LAST) || + opcode == OP(SEND_LAST_WITH_IMMEDIATE)) + break; + goto inv; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_MIDDLE): + if (opcode == OP(RDMA_WRITE_MIDDLE) || + opcode == OP(RDMA_WRITE_LAST) || + opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE)) + break; + goto inv; + + default: + if (opcode == OP(SEND_FIRST) || + opcode == OP(SEND_ONLY) || + opcode == OP(SEND_ONLY_WITH_IMMEDIATE) || + opcode == OP(RDMA_WRITE_FIRST) || + opcode == OP(RDMA_WRITE_ONLY) || + opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) + break; + goto inv; + } + + if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST)) + qp_comm_est(qp); + + /* OK, process the packet. */ + switch (opcode) { + case OP(SEND_FIRST): + case OP(SEND_ONLY): + case OP(SEND_ONLY_WITH_IMMEDIATE): +send_first: + if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags)) { + qp->r_sge = qp->s_rdma_read_sge; + } else { + ret = hfi1_rvt_get_rwqe(qp, 0); + if (ret < 0) + goto op_err; + if (!ret) + goto drop; + /* + * qp->s_rdma_read_sge will be the owner + * of the mr references. + */ + qp->s_rdma_read_sge = qp->r_sge; + } + qp->r_rcv_len = 0; + if (opcode == OP(SEND_ONLY)) + goto no_immediate_data; + else if (opcode == OP(SEND_ONLY_WITH_IMMEDIATE)) + goto send_last_imm; + /* FALLTHROUGH */ + case OP(SEND_MIDDLE): + /* Check for invalid length PMTU or posted rwqe len. */ + if (unlikely(tlen != (hdrsize + pmtu + 4))) + goto rewind; + qp->r_rcv_len += pmtu; + if (unlikely(qp->r_rcv_len > qp->r_len)) + goto rewind; + hfi1_copy_sge(&qp->r_sge, data, pmtu, 0, 0); + break; + + case OP(SEND_LAST_WITH_IMMEDIATE): +send_last_imm: + wc.ex.imm_data = ohdr->u.imm_data; + wc.wc_flags = IB_WC_WITH_IMM; + goto send_last; + case OP(SEND_LAST): +no_immediate_data: + wc.ex.imm_data = 0; + wc.wc_flags = 0; +send_last: + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* Check for invalid length. */ + /* LAST len should be >= 1 */ + if (unlikely(tlen < (hdrsize + pad + 4))) + goto rewind; + /* Don't count the CRC. */ + tlen -= (hdrsize + pad + 4); + wc.byte_len = tlen + qp->r_rcv_len; + if (unlikely(wc.byte_len > qp->r_len)) + goto rewind; + wc.opcode = IB_WC_RECV; + hfi1_copy_sge(&qp->r_sge, data, tlen, 0, 0); + rvt_put_ss(&qp->s_rdma_read_sge); +last_imm: + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + wc.qp = &qp->ibqp; + wc.src_qp = qp->remote_qpn; + wc.slid = qp->remote_ah_attr.dlid; + /* + * It seems that IB mandates the presence of an SL in a + * work completion only for the UD transport (see section + * 11.4.2 of IBTA Vol. 1). + * + * However, the way the SL is chosen below is consistent + * with the way that IB/qib works and is trying avoid + * introducing incompatibilities. + * + * See also OPA Vol. 1, section 9.7.6, and table 9-17. + */ + wc.sl = qp->remote_ah_attr.sl; + /* zero fields that are N/A */ + wc.vendor_err = 0; + wc.pkey_index = 0; + wc.dlid_path_bits = 0; + wc.port_num = 0; + /* Signal completion event if the solicited bit is set. */ + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, + (ohdr->bth[0] & + cpu_to_be32(IB_BTH_SOLICITED)) != 0); + break; + + case OP(RDMA_WRITE_FIRST): + case OP(RDMA_WRITE_ONLY): + case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): /* consume RWQE */ +rdma_first: + if (unlikely(!(qp->qp_access_flags & + IB_ACCESS_REMOTE_WRITE))) { + goto drop; + } + reth = &ohdr->u.rc.reth; + qp->r_len = be32_to_cpu(reth->length); + qp->r_rcv_len = 0; + qp->r_sge.sg_list = NULL; + if (qp->r_len != 0) { + u32 rkey = be32_to_cpu(reth->rkey); + u64 vaddr = be64_to_cpu(reth->vaddr); + int ok; + + /* Check rkey */ + ok = rvt_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, + vaddr, rkey, IB_ACCESS_REMOTE_WRITE); + if (unlikely(!ok)) + goto drop; + qp->r_sge.num_sge = 1; + } else { + qp->r_sge.num_sge = 0; + qp->r_sge.sge.mr = NULL; + qp->r_sge.sge.vaddr = NULL; + qp->r_sge.sge.length = 0; + qp->r_sge.sge.sge_length = 0; + } + if (opcode == OP(RDMA_WRITE_ONLY)) { + goto rdma_last; + } else if (opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) { + wc.ex.imm_data = ohdr->u.rc.imm_data; + goto rdma_last_imm; + } + /* FALLTHROUGH */ + case OP(RDMA_WRITE_MIDDLE): + /* Check for invalid length PMTU or posted rwqe len. */ + if (unlikely(tlen != (hdrsize + pmtu + 4))) + goto drop; + qp->r_rcv_len += pmtu; + if (unlikely(qp->r_rcv_len > qp->r_len)) + goto drop; + hfi1_copy_sge(&qp->r_sge, data, pmtu, 1, 0); + break; + + case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): + wc.ex.imm_data = ohdr->u.imm_data; +rdma_last_imm: + wc.wc_flags = IB_WC_WITH_IMM; + + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* Check for invalid length. */ + /* LAST len should be >= 1 */ + if (unlikely(tlen < (hdrsize + pad + 4))) + goto drop; + /* Don't count the CRC. */ + tlen -= (hdrsize + pad + 4); + if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) + goto drop; + if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags)) { + rvt_put_ss(&qp->s_rdma_read_sge); + } else { + ret = hfi1_rvt_get_rwqe(qp, 1); + if (ret < 0) + goto op_err; + if (!ret) + goto drop; + } + wc.byte_len = qp->r_len; + wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; + hfi1_copy_sge(&qp->r_sge, data, tlen, 1, 0); + rvt_put_ss(&qp->r_sge); + goto last_imm; + + case OP(RDMA_WRITE_LAST): +rdma_last: + /* Get the number of bytes the message was padded by. */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + /* Check for invalid length. */ + /* LAST len should be >= 1 */ + if (unlikely(tlen < (hdrsize + pad + 4))) + goto drop; + /* Don't count the CRC. */ + tlen -= (hdrsize + pad + 4); + if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) + goto drop; + hfi1_copy_sge(&qp->r_sge, data, tlen, 1, 0); + rvt_put_ss(&qp->r_sge); + break; + + default: + /* Drop packet for unknown opcodes. */ + goto drop; + } + qp->r_psn++; + qp->r_state = opcode; + return; + +rewind: + set_bit(RVT_R_REWIND_SGE, &qp->r_aflags); + qp->r_sge.num_sge = 0; +drop: + ibp->rvp.n_pkt_drops++; + return; + +op_err: + hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR); +} diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c new file mode 100644 index 000000000..be91f6fa1 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/ud.c @@ -0,0 +1,894 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <linux/net.h> +#include <rdma/ib_smi.h> + +#include "hfi.h" +#include "mad.h" +#include "verbs_txreq.h" +#include "qp.h" + +/** + * ud_loopback - handle send on loopback QPs + * @sqp: the sending QP + * @swqe: the send work request + * + * This is called from hfi1_make_ud_req() to forward a WQE addressed + * to the same HFI. + * Note that the receive interrupt handler may be calling hfi1_ud_rcv() + * while this is being called. + */ +static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) +{ + struct hfi1_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num); + struct hfi1_pportdata *ppd; + struct rvt_qp *qp; + struct ib_ah_attr *ah_attr; + unsigned long flags; + struct rvt_sge_state ssge; + struct rvt_sge *sge; + struct ib_wc wc; + u32 length; + enum ib_qp_type sqptype, dqptype; + + rcu_read_lock(); + + qp = rvt_lookup_qpn(ib_to_rvt(sqp->ibqp.device), &ibp->rvp, + swqe->ud_wr.remote_qpn); + if (!qp) { + ibp->rvp.n_pkt_drops++; + rcu_read_unlock(); + return; + } + + sqptype = sqp->ibqp.qp_type == IB_QPT_GSI ? + IB_QPT_UD : sqp->ibqp.qp_type; + dqptype = qp->ibqp.qp_type == IB_QPT_GSI ? + IB_QPT_UD : qp->ibqp.qp_type; + + if (dqptype != sqptype || + !(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) { + ibp->rvp.n_pkt_drops++; + goto drop; + } + + ah_attr = &ibah_to_rvtah(swqe->ud_wr.ah)->attr; + ppd = ppd_from_ibp(ibp); + + if (qp->ibqp.qp_num > 1) { + u16 pkey; + u16 slid; + u8 sc5 = ibp->sl_to_sc[ah_attr->sl]; + + pkey = hfi1_get_pkey(ibp, sqp->s_pkey_index); + slid = ppd->lid | (ah_attr->src_path_bits & + ((1 << ppd->lmc) - 1)); + if (unlikely(ingress_pkey_check(ppd, pkey, sc5, + qp->s_pkey_index, slid))) { + hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_P_KEY, pkey, + ah_attr->sl, + sqp->ibqp.qp_num, qp->ibqp.qp_num, + slid, ah_attr->dlid); + goto drop; + } + } + + /* + * Check that the qkey matches (except for QP0, see 9.6.1.4.1). + * Qkeys with the high order bit set mean use the + * qkey from the QP context instead of the WR (see 10.2.5). + */ + if (qp->ibqp.qp_num) { + u32 qkey; + + qkey = (int)swqe->ud_wr.remote_qkey < 0 ? + sqp->qkey : swqe->ud_wr.remote_qkey; + if (unlikely(qkey != qp->qkey)) { + u16 lid; + + lid = ppd->lid | (ah_attr->src_path_bits & + ((1 << ppd->lmc) - 1)); + hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_Q_KEY, qkey, + ah_attr->sl, + sqp->ibqp.qp_num, qp->ibqp.qp_num, + lid, + ah_attr->dlid); + goto drop; + } + } + + /* + * A GRH is expected to precede the data even if not + * present on the wire. + */ + length = swqe->length; + memset(&wc, 0, sizeof(wc)); + wc.byte_len = length + sizeof(struct ib_grh); + + if (swqe->wr.opcode == IB_WR_SEND_WITH_IMM) { + wc.wc_flags = IB_WC_WITH_IMM; + wc.ex.imm_data = swqe->wr.ex.imm_data; + } + + spin_lock_irqsave(&qp->r_lock, flags); + + /* + * Get the next work request entry to find where to put the data. + */ + if (qp->r_flags & RVT_R_REUSE_SGE) { + qp->r_flags &= ~RVT_R_REUSE_SGE; + } else { + int ret; + + ret = hfi1_rvt_get_rwqe(qp, 0); + if (ret < 0) { + hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR); + goto bail_unlock; + } + if (!ret) { + if (qp->ibqp.qp_num == 0) + ibp->rvp.n_vl15_dropped++; + goto bail_unlock; + } + } + /* Silently drop packets which are too big. */ + if (unlikely(wc.byte_len > qp->r_len)) { + qp->r_flags |= RVT_R_REUSE_SGE; + ibp->rvp.n_pkt_drops++; + goto bail_unlock; + } + + if (ah_attr->ah_flags & IB_AH_GRH) { + hfi1_copy_sge(&qp->r_sge, &ah_attr->grh, + sizeof(struct ib_grh), 1, 0); + wc.wc_flags |= IB_WC_GRH; + } else { + hfi1_skip_sge(&qp->r_sge, sizeof(struct ib_grh), 1); + } + ssge.sg_list = swqe->sg_list + 1; + ssge.sge = *swqe->sg_list; + ssge.num_sge = swqe->wr.num_sge; + sge = &ssge.sge; + while (length) { + u32 len = sge->length; + + if (len > length) + len = length; + if (len > sge->sge_length) + len = sge->sge_length; + WARN_ON_ONCE(len == 0); + hfi1_copy_sge(&qp->r_sge, sge->vaddr, len, 1, 0); + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (--ssge.num_sge) + *sge = *ssge.sg_list++; + } else if (sge->length == 0 && sge->mr->lkey) { + if (++sge->n >= RVT_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + length -= len; + } + rvt_put_ss(&qp->r_sge); + if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) + goto bail_unlock; + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + wc.opcode = IB_WC_RECV; + wc.qp = &qp->ibqp; + wc.src_qp = sqp->ibqp.qp_num; + if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI) { + if (sqp->ibqp.qp_type == IB_QPT_GSI || + sqp->ibqp.qp_type == IB_QPT_SMI) + wc.pkey_index = swqe->ud_wr.pkey_index; + else + wc.pkey_index = sqp->s_pkey_index; + } else { + wc.pkey_index = 0; + } + wc.slid = ppd->lid | (ah_attr->src_path_bits & ((1 << ppd->lmc) - 1)); + /* Check for loopback when the port lid is not set */ + if (wc.slid == 0 && sqp->ibqp.qp_type == IB_QPT_GSI) + wc.slid = be16_to_cpu(IB_LID_PERMISSIVE); + wc.sl = ah_attr->sl; + wc.dlid_path_bits = ah_attr->dlid & ((1 << ppd->lmc) - 1); + wc.port_num = qp->port_num; + /* Signal completion event if the solicited bit is set. */ + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, + swqe->wr.send_flags & IB_SEND_SOLICITED); + ibp->rvp.n_loop_pkts++; +bail_unlock: + spin_unlock_irqrestore(&qp->r_lock, flags); +drop: + rcu_read_unlock(); +} + +/** + * hfi1_make_ud_req - construct a UD request packet + * @qp: the QP + * + * Assume s_lock is held. + * + * Return 1 if constructed; otherwise, return 0. + */ +int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_other_headers *ohdr; + struct ib_ah_attr *ah_attr; + struct hfi1_pportdata *ppd; + struct hfi1_ibport *ibp; + struct rvt_swqe *wqe; + u32 nwords; + u32 extra_bytes; + u32 bth0; + u16 lrh0; + u16 lid; + int next_cur; + u8 sc5; + + ps->s_txreq = get_txreq(ps->dev, qp); + if (IS_ERR(ps->s_txreq)) + goto bail_no_tx; + + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK)) { + if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND)) + goto bail; + /* We are in the error state, flush the work request. */ + smp_read_barrier_depends(); /* see post_one_send */ + if (qp->s_last == ACCESS_ONCE(qp->s_head)) + goto bail; + /* If DMAs are in progress, we can't flush immediately. */ + if (iowait_sdma_pending(&priv->s_iowait)) { + qp->s_flags |= RVT_S_WAIT_DMA; + goto bail; + } + wqe = rvt_get_swqe_ptr(qp, qp->s_last); + hfi1_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); + goto done_free_tx; + } + + /* see post_one_send() */ + smp_read_barrier_depends(); + if (qp->s_cur == ACCESS_ONCE(qp->s_head)) + goto bail; + + wqe = rvt_get_swqe_ptr(qp, qp->s_cur); + next_cur = qp->s_cur + 1; + if (next_cur >= qp->s_size) + next_cur = 0; + + /* Construct the header. */ + ibp = to_iport(qp->ibqp.device, qp->port_num); + ppd = ppd_from_ibp(ibp); + ah_attr = &ibah_to_rvtah(wqe->ud_wr.ah)->attr; + if (ah_attr->dlid < be16_to_cpu(IB_MULTICAST_LID_BASE) || + ah_attr->dlid == be16_to_cpu(IB_LID_PERMISSIVE)) { + lid = ah_attr->dlid & ~((1 << ppd->lmc) - 1); + if (unlikely(!loopback && + (lid == ppd->lid || + (lid == be16_to_cpu(IB_LID_PERMISSIVE) && + qp->ibqp.qp_type == IB_QPT_GSI)))) { + unsigned long tflags = ps->flags; + /* + * If DMAs are in progress, we can't generate + * a completion for the loopback packet since + * it would be out of order. + * Instead of waiting, we could queue a + * zero length descriptor so we get a callback. + */ + if (iowait_sdma_pending(&priv->s_iowait)) { + qp->s_flags |= RVT_S_WAIT_DMA; + goto bail; + } + qp->s_cur = next_cur; + spin_unlock_irqrestore(&qp->s_lock, tflags); + ud_loopback(qp, wqe); + spin_lock_irqsave(&qp->s_lock, tflags); + ps->flags = tflags; + hfi1_send_complete(qp, wqe, IB_WC_SUCCESS); + goto done_free_tx; + } + } + + qp->s_cur = next_cur; + extra_bytes = -wqe->length & 3; + nwords = (wqe->length + extra_bytes) >> 2; + + /* header size in 32-bit words LRH+BTH+DETH = (8+12+8)/4. */ + qp->s_hdrwords = 7; + qp->s_cur_size = wqe->length; + qp->s_cur_sge = &qp->s_sge; + qp->s_srate = ah_attr->static_rate; + qp->srate_mbps = ib_rate_to_mbps(qp->s_srate); + qp->s_wqe = wqe; + qp->s_sge.sge = wqe->sg_list[0]; + qp->s_sge.sg_list = wqe->sg_list + 1; + qp->s_sge.num_sge = wqe->wr.num_sge; + qp->s_sge.total_len = wqe->length; + + if (ah_attr->ah_flags & IB_AH_GRH) { + /* Header size in 32-bit words. */ + qp->s_hdrwords += hfi1_make_grh(ibp, + &ps->s_txreq->phdr.hdr.u.l.grh, + &ah_attr->grh, + qp->s_hdrwords, nwords); + lrh0 = HFI1_LRH_GRH; + ohdr = &ps->s_txreq->phdr.hdr.u.l.oth; + /* + * Don't worry about sending to locally attached multicast + * QPs. It is unspecified by the spec. what happens. + */ + } else { + /* Header size in 32-bit words. */ + lrh0 = HFI1_LRH_BTH; + ohdr = &ps->s_txreq->phdr.hdr.u.oth; + } + if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) { + qp->s_hdrwords++; + ohdr->u.ud.imm_data = wqe->wr.ex.imm_data; + bth0 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE << 24; + } else { + bth0 = IB_OPCODE_UD_SEND_ONLY << 24; + } + sc5 = ibp->sl_to_sc[ah_attr->sl]; + lrh0 |= (ah_attr->sl & 0xf) << 4; + if (qp->ibqp.qp_type == IB_QPT_SMI) { + lrh0 |= 0xF000; /* Set VL (see ch. 13.5.3.1) */ + priv->s_sc = 0xf; + } else { + lrh0 |= (sc5 & 0xf) << 12; + priv->s_sc = sc5; + } + priv->s_sde = qp_to_sdma_engine(qp, priv->s_sc); + ps->s_txreq->sde = priv->s_sde; + priv->s_sendcontext = qp_to_send_context(qp, priv->s_sc); + ps->s_txreq->psc = priv->s_sendcontext; + ps->s_txreq->phdr.hdr.lrh[0] = cpu_to_be16(lrh0); + ps->s_txreq->phdr.hdr.lrh[1] = cpu_to_be16(ah_attr->dlid); + ps->s_txreq->phdr.hdr.lrh[2] = + cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC); + if (ah_attr->dlid == be16_to_cpu(IB_LID_PERMISSIVE)) { + ps->s_txreq->phdr.hdr.lrh[3] = IB_LID_PERMISSIVE; + } else { + lid = ppd->lid; + if (lid) { + lid |= ah_attr->src_path_bits & ((1 << ppd->lmc) - 1); + ps->s_txreq->phdr.hdr.lrh[3] = cpu_to_be16(lid); + } else { + ps->s_txreq->phdr.hdr.lrh[3] = IB_LID_PERMISSIVE; + } + } + if (wqe->wr.send_flags & IB_SEND_SOLICITED) + bth0 |= IB_BTH_SOLICITED; + bth0 |= extra_bytes << 20; + if (qp->ibqp.qp_type == IB_QPT_GSI || qp->ibqp.qp_type == IB_QPT_SMI) + bth0 |= hfi1_get_pkey(ibp, wqe->ud_wr.pkey_index); + else + bth0 |= hfi1_get_pkey(ibp, qp->s_pkey_index); + ohdr->bth[0] = cpu_to_be32(bth0); + ohdr->bth[1] = cpu_to_be32(wqe->ud_wr.remote_qpn); + ohdr->bth[2] = cpu_to_be32(mask_psn(wqe->psn)); + /* + * Qkeys with the high order bit set mean use the + * qkey from the QP context instead of the WR (see 10.2.5). + */ + ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->ud_wr.remote_qkey < 0 ? + qp->qkey : wqe->ud_wr.remote_qkey); + ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num); + /* disarm any ahg */ + priv->s_hdr->ahgcount = 0; + priv->s_hdr->ahgidx = 0; + priv->s_hdr->tx_flags = 0; + priv->s_hdr->sde = NULL; + /* pbc */ + ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2; + + return 1; + +done_free_tx: + hfi1_put_txreq(ps->s_txreq); + ps->s_txreq = NULL; + return 1; + +bail: + hfi1_put_txreq(ps->s_txreq); + +bail_no_tx: + ps->s_txreq = NULL; + qp->s_flags &= ~RVT_S_BUSY; + qp->s_hdrwords = 0; + return 0; +} + +/* + * Hardware can't check this so we do it here. + * + * This is a slightly different algorithm than the standard pkey check. It + * special cases the management keys and allows for 0x7fff and 0xffff to be in + * the table at the same time. + * + * @returns the index found or -1 if not found + */ +int hfi1_lookup_pkey_idx(struct hfi1_ibport *ibp, u16 pkey) +{ + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + unsigned i; + + if (pkey == FULL_MGMT_P_KEY || pkey == LIM_MGMT_P_KEY) { + unsigned lim_idx = -1; + + for (i = 0; i < ARRAY_SIZE(ppd->pkeys); ++i) { + /* here we look for an exact match */ + if (ppd->pkeys[i] == pkey) + return i; + if (ppd->pkeys[i] == LIM_MGMT_P_KEY) + lim_idx = i; + } + + /* did not find 0xffff return 0x7fff idx if found */ + if (pkey == FULL_MGMT_P_KEY) + return lim_idx; + + /* no match... */ + return -1; + } + + pkey &= 0x7fff; /* remove limited/full membership bit */ + + for (i = 0; i < ARRAY_SIZE(ppd->pkeys); ++i) + if ((ppd->pkeys[i] & 0x7fff) == pkey) + return i; + + /* + * Should not get here, this means hardware failed to validate pkeys. + */ + return -1; +} + +void return_cnp(struct hfi1_ibport *ibp, struct rvt_qp *qp, u32 remote_qpn, + u32 pkey, u32 slid, u32 dlid, u8 sc5, + const struct ib_grh *old_grh) +{ + u64 pbc, pbc_flags = 0; + u32 bth0, plen, vl, hwords = 5; + u16 lrh0; + u8 sl = ibp->sc_to_sl[sc5]; + struct hfi1_ib_header hdr; + struct hfi1_other_headers *ohdr; + struct pio_buf *pbuf; + struct send_context *ctxt = qp_to_send_context(qp, sc5); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + + if (old_grh) { + struct ib_grh *grh = &hdr.u.l.grh; + + grh->version_tclass_flow = old_grh->version_tclass_flow; + grh->paylen = cpu_to_be16((hwords - 2 + SIZE_OF_CRC) << 2); + grh->hop_limit = 0xff; + grh->sgid = old_grh->dgid; + grh->dgid = old_grh->sgid; + ohdr = &hdr.u.l.oth; + lrh0 = HFI1_LRH_GRH; + hwords += sizeof(struct ib_grh) / sizeof(u32); + } else { + ohdr = &hdr.u.oth; + lrh0 = HFI1_LRH_BTH; + } + + lrh0 |= (sc5 & 0xf) << 12 | sl << 4; + + bth0 = pkey | (IB_OPCODE_CNP << 24); + ohdr->bth[0] = cpu_to_be32(bth0); + + ohdr->bth[1] = cpu_to_be32(remote_qpn | (1 << HFI1_BECN_SHIFT)); + ohdr->bth[2] = 0; /* PSN 0 */ + + hdr.lrh[0] = cpu_to_be16(lrh0); + hdr.lrh[1] = cpu_to_be16(dlid); + hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC); + hdr.lrh[3] = cpu_to_be16(slid); + + plen = 2 /* PBC */ + hwords; + pbc_flags |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT; + vl = sc_to_vlt(ppd->dd, sc5); + pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen); + if (ctxt) { + pbuf = sc_buffer_alloc(ctxt, plen, NULL, NULL); + if (pbuf) + ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc, + &hdr, hwords); + } +} + +/* + * opa_smp_check() - Do the regular pkey checking, and the additional + * checks for SMPs specified in OPAv1 rev 0.90, section 9.10.26 + * ("SMA Packet Checks"). + * + * Note that: + * - Checks are done using the pkey directly from the packet's BTH, + * and specifically _not_ the pkey that we attach to the completion, + * which may be different. + * - These checks are specifically for "non-local" SMPs (i.e., SMPs + * which originated on another node). SMPs which are sent from, and + * destined to this node are checked in opa_local_smp_check(). + * + * At the point where opa_smp_check() is called, we know: + * - destination QP is QP0 + * + * opa_smp_check() returns 0 if all checks succeed, 1 otherwise. + */ +static int opa_smp_check(struct hfi1_ibport *ibp, u16 pkey, u8 sc5, + struct rvt_qp *qp, u16 slid, struct opa_smp *smp) +{ + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + + /* + * I don't think it's possible for us to get here with sc != 0xf, + * but check it to be certain. + */ + if (sc5 != 0xf) + return 1; + + if (rcv_pkey_check(ppd, pkey, sc5, slid)) + return 1; + + /* + * At this point we know (and so don't need to check again) that + * the pkey is either LIM_MGMT_P_KEY, or FULL_MGMT_P_KEY + * (see ingress_pkey_check). + */ + if (smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE && + smp->mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED) { + ingress_pkey_table_fail(ppd, pkey, slid); + return 1; + } + + /* + * SMPs fall into one of four (disjoint) categories: + * SMA request, SMA response, trap, or trap repress. + * Our response depends, in part, on which type of + * SMP we're processing. + * + * If this is not an SMA request, or trap repress: + * - accept MAD if the port is running an SM + * - pkey == FULL_MGMT_P_KEY => + * reply with unsupported method (i.e., just mark + * the smp's status field here, and let it be + * processed normally) + * - pkey != LIM_MGMT_P_KEY => + * increment port recv constraint errors, drop MAD + * If this is an SMA request or trap repress: + * - pkey != FULL_MGMT_P_KEY => + * increment port recv constraint errors, drop MAD + */ + switch (smp->method) { + case IB_MGMT_METHOD_GET: + case IB_MGMT_METHOD_SET: + case IB_MGMT_METHOD_REPORT: + case IB_MGMT_METHOD_TRAP_REPRESS: + if (pkey != FULL_MGMT_P_KEY) { + ingress_pkey_table_fail(ppd, pkey, slid); + return 1; + } + break; + case IB_MGMT_METHOD_SEND: + case IB_MGMT_METHOD_TRAP: + case IB_MGMT_METHOD_GET_RESP: + case IB_MGMT_METHOD_REPORT_RESP: + if (ibp->rvp.port_cap_flags & IB_PORT_SM) + return 0; + if (pkey == FULL_MGMT_P_KEY) { + smp->status |= IB_SMP_UNSUP_METHOD; + return 0; + } + if (pkey != LIM_MGMT_P_KEY) { + ingress_pkey_table_fail(ppd, pkey, slid); + return 1; + } + break; + default: + break; + } + return 0; +} + +/** + * hfi1_ud_rcv - receive an incoming UD packet + * @ibp: the port the packet came in on + * @hdr: the packet header + * @rcv_flags: flags relevant to rcv processing + * @data: the packet data + * @tlen: the packet length + * @qp: the QP the packet came on + * + * This is called from qp_rcv() to process an incoming UD packet + * for the given QP. + * Called at interrupt level. + */ +void hfi1_ud_rcv(struct hfi1_packet *packet) +{ + struct hfi1_other_headers *ohdr = packet->ohdr; + int opcode; + u32 hdrsize = packet->hlen; + u32 pad; + struct ib_wc wc; + u32 qkey; + u32 src_qp; + u16 dlid, pkey; + int mgmt_pkey_idx = -1; + struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data; + struct hfi1_ib_header *hdr = packet->hdr; + u32 rcv_flags = packet->rcv_flags; + void *data = packet->ebuf; + u32 tlen = packet->tlen; + struct rvt_qp *qp = packet->qp; + bool has_grh = rcv_flags & HFI1_HAS_GRH; + u8 sc5 = hdr2sc((struct hfi1_message_header *)hdr, packet->rhf); + u32 bth1; + int is_mcast; + struct ib_grh *grh = NULL; + + qkey = be32_to_cpu(ohdr->u.ud.deth[0]); + src_qp = be32_to_cpu(ohdr->u.ud.deth[1]) & RVT_QPN_MASK; + dlid = be16_to_cpu(hdr->lrh[1]); + is_mcast = (dlid > be16_to_cpu(IB_MULTICAST_LID_BASE)) && + (dlid != be16_to_cpu(IB_LID_PERMISSIVE)); + bth1 = be32_to_cpu(ohdr->bth[1]); + if (unlikely(bth1 & HFI1_BECN_SMASK)) { + /* + * In pre-B0 h/w the CNP_OPCODE is handled via an + * error path. + */ + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u32 lqpn = be32_to_cpu(ohdr->bth[1]) & RVT_QPN_MASK; + u8 sl; + + sl = ibp->sc_to_sl[sc5]; + + process_becn(ppd, sl, 0, lqpn, 0, IB_CC_SVCTYPE_UD); + } + + /* + * The opcode is in the low byte when its in network order + * (top byte when in host order). + */ + opcode = be32_to_cpu(ohdr->bth[0]) >> 24; + opcode &= 0xff; + + pkey = (u16)be32_to_cpu(ohdr->bth[0]); + + if (!is_mcast && (opcode != IB_OPCODE_CNP) && bth1 & HFI1_FECN_SMASK) { + u16 slid = be16_to_cpu(hdr->lrh[3]); + + return_cnp(ibp, qp, src_qp, pkey, dlid, slid, sc5, grh); + } + /* + * Get the number of bytes the message was padded by + * and drop incomplete packets. + */ + pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3; + if (unlikely(tlen < (hdrsize + pad + 4))) + goto drop; + + tlen -= hdrsize + pad + 4; + + /* + * Check that the permissive LID is only used on QP0 + * and the QKEY matches (see 9.6.1.4.1 and 9.6.1.5.1). + */ + if (qp->ibqp.qp_num) { + if (unlikely(hdr->lrh[1] == IB_LID_PERMISSIVE || + hdr->lrh[3] == IB_LID_PERMISSIVE)) + goto drop; + if (qp->ibqp.qp_num > 1) { + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + u16 slid; + + slid = be16_to_cpu(hdr->lrh[3]); + if (unlikely(rcv_pkey_check(ppd, pkey, sc5, slid))) { + /* + * Traps will not be sent for packets dropped + * by the HW. This is fine, as sending trap + * for invalid pkeys is optional according to + * IB spec (release 1.3, section 10.9.4) + */ + hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_P_KEY, + pkey, + (be16_to_cpu(hdr->lrh[0]) >> 4) & + 0xF, + src_qp, qp->ibqp.qp_num, + be16_to_cpu(hdr->lrh[3]), + be16_to_cpu(hdr->lrh[1])); + return; + } + } else { + /* GSI packet */ + mgmt_pkey_idx = hfi1_lookup_pkey_idx(ibp, pkey); + if (mgmt_pkey_idx < 0) + goto drop; + } + if (unlikely(qkey != qp->qkey)) { + hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_Q_KEY, qkey, + (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF, + src_qp, qp->ibqp.qp_num, + be16_to_cpu(hdr->lrh[3]), + be16_to_cpu(hdr->lrh[1])); + return; + } + /* Drop invalid MAD packets (see 13.5.3.1). */ + if (unlikely(qp->ibqp.qp_num == 1 && + (tlen > 2048 || + (be16_to_cpu(hdr->lrh[0]) >> 12) == 15))) + goto drop; + } else { + /* Received on QP0, and so by definition, this is an SMP */ + struct opa_smp *smp = (struct opa_smp *)data; + u16 slid = be16_to_cpu(hdr->lrh[3]); + + if (opa_smp_check(ibp, pkey, sc5, qp, slid, smp)) + goto drop; + + if (tlen > 2048) + goto drop; + if ((hdr->lrh[1] == IB_LID_PERMISSIVE || + hdr->lrh[3] == IB_LID_PERMISSIVE) && + smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + goto drop; + + /* look up SMI pkey */ + mgmt_pkey_idx = hfi1_lookup_pkey_idx(ibp, pkey); + if (mgmt_pkey_idx < 0) + goto drop; + } + + if (qp->ibqp.qp_num > 1 && + opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) { + wc.ex.imm_data = ohdr->u.ud.imm_data; + wc.wc_flags = IB_WC_WITH_IMM; + tlen -= sizeof(u32); + } else if (opcode == IB_OPCODE_UD_SEND_ONLY) { + wc.ex.imm_data = 0; + wc.wc_flags = 0; + } else { + goto drop; + } + + /* + * A GRH is expected to precede the data even if not + * present on the wire. + */ + wc.byte_len = tlen + sizeof(struct ib_grh); + + /* + * Get the next work request entry to find where to put the data. + */ + if (qp->r_flags & RVT_R_REUSE_SGE) { + qp->r_flags &= ~RVT_R_REUSE_SGE; + } else { + int ret; + + ret = hfi1_rvt_get_rwqe(qp, 0); + if (ret < 0) { + hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR); + return; + } + if (!ret) { + if (qp->ibqp.qp_num == 0) + ibp->rvp.n_vl15_dropped++; + return; + } + } + /* Silently drop packets which are too big. */ + if (unlikely(wc.byte_len > qp->r_len)) { + qp->r_flags |= RVT_R_REUSE_SGE; + goto drop; + } + if (has_grh) { + hfi1_copy_sge(&qp->r_sge, &hdr->u.l.grh, + sizeof(struct ib_grh), 1, 0); + wc.wc_flags |= IB_WC_GRH; + } else { + hfi1_skip_sge(&qp->r_sge, sizeof(struct ib_grh), 1); + } + hfi1_copy_sge(&qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh), + 1, 0); + rvt_put_ss(&qp->r_sge); + if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) + return; + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + wc.opcode = IB_WC_RECV; + wc.vendor_err = 0; + wc.qp = &qp->ibqp; + wc.src_qp = src_qp; + + if (qp->ibqp.qp_type == IB_QPT_GSI || + qp->ibqp.qp_type == IB_QPT_SMI) { + if (mgmt_pkey_idx < 0) { + if (net_ratelimit()) { + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct hfi1_devdata *dd = ppd->dd; + + dd_dev_err(dd, "QP type %d mgmt_pkey_idx < 0 and packet not dropped???\n", + qp->ibqp.qp_type); + mgmt_pkey_idx = 0; + } + } + wc.pkey_index = (unsigned)mgmt_pkey_idx; + } else { + wc.pkey_index = 0; + } + + wc.slid = be16_to_cpu(hdr->lrh[3]); + wc.sl = ibp->sc_to_sl[sc5]; + + /* + * Save the LMC lower bits if the destination LID is a unicast LID. + */ + wc.dlid_path_bits = dlid >= be16_to_cpu(IB_MULTICAST_LID_BASE) ? 0 : + dlid & ((1 << ppd_from_ibp(ibp)->lmc) - 1); + wc.port_num = qp->port_num; + /* Signal completion event if the solicited bit is set. */ + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, + (ohdr->bth[0] & + cpu_to_be32(IB_BTH_SOLICITED)) != 0); + return; + +drop: + ibp->rvp.n_pkt_drops++; +} diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c new file mode 100644 index 000000000..1b640a35b --- /dev/null +++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c @@ -0,0 +1,1050 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#include <asm/page.h> + +#include "user_exp_rcv.h" +#include "trace.h" +#include "mmu_rb.h" + +struct tid_group { + struct list_head list; + unsigned base; + u8 size; + u8 used; + u8 map; +}; + +struct tid_rb_node { + struct mmu_rb_node mmu; + unsigned long phys; + struct tid_group *grp; + u32 rcventry; + dma_addr_t dma_addr; + bool freed; + unsigned npages; + struct page *pages[0]; +}; + +struct tid_pageset { + u16 idx; + u16 count; +}; + +#define EXP_TID_SET_EMPTY(set) (set.count == 0 && list_empty(&set.list)) + +#define num_user_pages(vaddr, len) \ + (1 + (((((unsigned long)(vaddr) + \ + (unsigned long)(len) - 1) & PAGE_MASK) - \ + ((unsigned long)vaddr & PAGE_MASK)) >> PAGE_SHIFT)) + +static void unlock_exp_tids(struct hfi1_ctxtdata *, struct exp_tid_set *, + struct rb_root *); +static u32 find_phys_blocks(struct page **, unsigned, struct tid_pageset *); +static int set_rcvarray_entry(struct file *, unsigned long, u32, + struct tid_group *, struct page **, unsigned); +static int mmu_rb_insert(struct rb_root *, struct mmu_rb_node *); +static void mmu_rb_remove(struct rb_root *, struct mmu_rb_node *, + struct mm_struct *); +static int mmu_rb_invalidate(struct rb_root *, struct mmu_rb_node *); +static int program_rcvarray(struct file *, unsigned long, struct tid_group *, + struct tid_pageset *, unsigned, u16, struct page **, + u32 *, unsigned *, unsigned *); +static int unprogram_rcvarray(struct file *, u32, struct tid_group **); +static void clear_tid_node(struct hfi1_filedata *, u16, struct tid_rb_node *); + +static struct mmu_rb_ops tid_rb_ops = { + .insert = mmu_rb_insert, + .remove = mmu_rb_remove, + .invalidate = mmu_rb_invalidate +}; + +static inline u32 rcventry2tidinfo(u32 rcventry) +{ + u32 pair = rcventry & ~0x1; + + return EXP_TID_SET(IDX, pair >> 1) | + EXP_TID_SET(CTRL, 1 << (rcventry - pair)); +} + +static inline void exp_tid_group_init(struct exp_tid_set *set) +{ + INIT_LIST_HEAD(&set->list); + set->count = 0; +} + +static inline void tid_group_remove(struct tid_group *grp, + struct exp_tid_set *set) +{ + list_del_init(&grp->list); + set->count--; +} + +static inline void tid_group_add_tail(struct tid_group *grp, + struct exp_tid_set *set) +{ + list_add_tail(&grp->list, &set->list); + set->count++; +} + +static inline struct tid_group *tid_group_pop(struct exp_tid_set *set) +{ + struct tid_group *grp = + list_first_entry(&set->list, struct tid_group, list); + list_del_init(&grp->list); + set->count--; + return grp; +} + +static inline void tid_group_move(struct tid_group *group, + struct exp_tid_set *s1, + struct exp_tid_set *s2) +{ + tid_group_remove(group, s1); + tid_group_add_tail(group, s2); +} + +/* + * Initialize context and file private data needed for Expected + * receive caching. This needs to be done after the context has + * been configured with the eager/expected RcvEntry counts. + */ +int hfi1_user_exp_rcv_init(struct file *fp) +{ + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_devdata *dd = uctxt->dd; + unsigned tidbase; + int i, ret = 0; + + spin_lock_init(&fd->tid_lock); + spin_lock_init(&fd->invalid_lock); + fd->tid_rb_root = RB_ROOT; + + if (!uctxt->subctxt_cnt || !fd->subctxt) { + exp_tid_group_init(&uctxt->tid_group_list); + exp_tid_group_init(&uctxt->tid_used_list); + exp_tid_group_init(&uctxt->tid_full_list); + + tidbase = uctxt->expected_base; + for (i = 0; i < uctxt->expected_count / + dd->rcv_entries.group_size; i++) { + struct tid_group *grp; + + grp = kzalloc(sizeof(*grp), GFP_KERNEL); + if (!grp) { + /* + * If we fail here, the groups already + * allocated will be freed by the close + * call. + */ + ret = -ENOMEM; + goto done; + } + grp->size = dd->rcv_entries.group_size; + grp->base = tidbase; + tid_group_add_tail(grp, &uctxt->tid_group_list); + tidbase += dd->rcv_entries.group_size; + } + } + + fd->entry_to_rb = kcalloc(uctxt->expected_count, + sizeof(struct rb_node *), + GFP_KERNEL); + if (!fd->entry_to_rb) + return -ENOMEM; + + if (!HFI1_CAP_IS_USET(TID_UNMAP)) { + fd->invalid_tid_idx = 0; + fd->invalid_tids = kzalloc(uctxt->expected_count * + sizeof(u32), GFP_KERNEL); + if (!fd->invalid_tids) { + ret = -ENOMEM; + goto done; + } + + /* + * Register MMU notifier callbacks. If the registration + * fails, continue but turn off the TID caching for + * all user contexts. + */ + ret = hfi1_mmu_rb_register(&fd->tid_rb_root, &tid_rb_ops); + if (ret) { + dd_dev_info(dd, + "Failed MMU notifier registration %d\n", + ret); + HFI1_CAP_USET(TID_UNMAP); + ret = 0; + } + } + + /* + * PSM does not have a good way to separate, count, and + * effectively enforce a limit on RcvArray entries used by + * subctxts (when context sharing is used) when TID caching + * is enabled. To help with that, we calculate a per-process + * RcvArray entry share and enforce that. + * If TID caching is not in use, PSM deals with usage on its + * own. In that case, we allow any subctxt to take all of the + * entries. + * + * Make sure that we set the tid counts only after successful + * init. + */ + spin_lock(&fd->tid_lock); + if (uctxt->subctxt_cnt && !HFI1_CAP_IS_USET(TID_UNMAP)) { + u16 remainder; + + fd->tid_limit = uctxt->expected_count / uctxt->subctxt_cnt; + remainder = uctxt->expected_count % uctxt->subctxt_cnt; + if (remainder && fd->subctxt < remainder) + fd->tid_limit++; + } else { + fd->tid_limit = uctxt->expected_count; + } + spin_unlock(&fd->tid_lock); +done: + return ret; +} + +int hfi1_user_exp_rcv_free(struct hfi1_filedata *fd) +{ + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct tid_group *grp, *gptr; + + if (!test_bit(HFI1_CTXT_SETUP_DONE, &uctxt->event_flags)) + return 0; + /* + * The notifier would have been removed when the process'es mm + * was freed. + */ + if (!HFI1_CAP_IS_USET(TID_UNMAP)) + hfi1_mmu_rb_unregister(&fd->tid_rb_root); + + kfree(fd->invalid_tids); + + if (!uctxt->cnt) { + if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list)) + unlock_exp_tids(uctxt, &uctxt->tid_full_list, + &fd->tid_rb_root); + if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list)) + unlock_exp_tids(uctxt, &uctxt->tid_used_list, + &fd->tid_rb_root); + list_for_each_entry_safe(grp, gptr, &uctxt->tid_group_list.list, + list) { + list_del_init(&grp->list); + kfree(grp); + } + hfi1_clear_tids(uctxt); + } + + kfree(fd->entry_to_rb); + return 0; +} + +/* + * Write an "empty" RcvArray entry. + * This function exists so the TID registaration code can use it + * to write to unused/unneeded entries and still take advantage + * of the WC performance improvements. The HFI will ignore this + * write to the RcvArray entry. + */ +static inline void rcv_array_wc_fill(struct hfi1_devdata *dd, u32 index) +{ + /* + * Doing the WC fill writes only makes sense if the device is + * present and the RcvArray has been mapped as WC memory. + */ + if ((dd->flags & HFI1_PRESENT) && dd->rcvarray_wc) + writeq(0, dd->rcvarray_wc + (index * 8)); +} + +/* + * RcvArray entry allocation for Expected Receives is done by the + * following algorithm: + * + * The context keeps 3 lists of groups of RcvArray entries: + * 1. List of empty groups - tid_group_list + * This list is created during user context creation and + * contains elements which describe sets (of 8) of empty + * RcvArray entries. + * 2. List of partially used groups - tid_used_list + * This list contains sets of RcvArray entries which are + * not completely used up. Another mapping request could + * use some of all of the remaining entries. + * 3. List of full groups - tid_full_list + * This is the list where sets that are completely used + * up go. + * + * An attempt to optimize the usage of RcvArray entries is + * made by finding all sets of physically contiguous pages in a + * user's buffer. + * These physically contiguous sets are further split into + * sizes supported by the receive engine of the HFI. The + * resulting sets of pages are stored in struct tid_pageset, + * which describes the sets as: + * * .count - number of pages in this set + * * .idx - starting index into struct page ** array + * of this set + * + * From this point on, the algorithm deals with the page sets + * described above. The number of pagesets is divided by the + * RcvArray group size to produce the number of full groups + * needed. + * + * Groups from the 3 lists are manipulated using the following + * rules: + * 1. For each set of 8 pagesets, a complete group from + * tid_group_list is taken, programmed, and moved to + * the tid_full_list list. + * 2. For all remaining pagesets: + * 2.1 If the tid_used_list is empty and the tid_group_list + * is empty, stop processing pageset and return only + * what has been programmed up to this point. + * 2.2 If the tid_used_list is empty and the tid_group_list + * is not empty, move a group from tid_group_list to + * tid_used_list. + * 2.3 For each group is tid_used_group, program as much as + * can fit into the group. If the group becomes fully + * used, move it to tid_full_list. + */ +int hfi1_user_exp_rcv_setup(struct file *fp, struct hfi1_tid_info *tinfo) +{ + int ret = 0, need_group = 0, pinned; + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_devdata *dd = uctxt->dd; + unsigned npages, ngroups, pageidx = 0, pageset_count, npagesets, + tididx = 0, mapped, mapped_pages = 0; + unsigned long vaddr = tinfo->vaddr; + struct page **pages = NULL; + u32 *tidlist = NULL; + struct tid_pageset *pagesets = NULL; + + /* Get the number of pages the user buffer spans */ + npages = num_user_pages(vaddr, tinfo->length); + if (!npages) + return -EINVAL; + + if (npages > uctxt->expected_count) { + dd_dev_err(dd, "Expected buffer too big\n"); + return -EINVAL; + } + + /* Verify that access is OK for the user buffer */ + if (!access_ok(VERIFY_WRITE, (void __user *)vaddr, + npages * PAGE_SIZE)) { + dd_dev_err(dd, "Fail vaddr %p, %u pages, !access_ok\n", + (void *)vaddr, npages); + return -EFAULT; + } + + pagesets = kcalloc(uctxt->expected_count, sizeof(*pagesets), + GFP_KERNEL); + if (!pagesets) + return -ENOMEM; + + /* Allocate the array of struct page pointers needed for pinning */ + pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL); + if (!pages) { + ret = -ENOMEM; + goto bail; + } + + /* + * Pin all the pages of the user buffer. If we can't pin all the + * pages, accept the amount pinned so far and program only that. + * User space knows how to deal with partially programmed buffers. + */ + if (!hfi1_can_pin_pages(dd, fd->tid_n_pinned, npages)) { + ret = -ENOMEM; + goto bail; + } + + pinned = hfi1_acquire_user_pages(vaddr, npages, true, pages); + if (pinned <= 0) { + ret = pinned; + goto bail; + } + fd->tid_n_pinned += npages; + + /* Find sets of physically contiguous pages */ + npagesets = find_phys_blocks(pages, pinned, pagesets); + + /* + * We don't need to access this under a lock since tid_used is per + * process and the same process cannot be in hfi1_user_exp_rcv_clear() + * and hfi1_user_exp_rcv_setup() at the same time. + */ + spin_lock(&fd->tid_lock); + if (fd->tid_used + npagesets > fd->tid_limit) + pageset_count = fd->tid_limit - fd->tid_used; + else + pageset_count = npagesets; + spin_unlock(&fd->tid_lock); + + if (!pageset_count) + goto bail; + + ngroups = pageset_count / dd->rcv_entries.group_size; + tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL); + if (!tidlist) { + ret = -ENOMEM; + goto nomem; + } + + tididx = 0; + + /* + * From this point on, we are going to be using shared (between master + * and subcontexts) context resources. We need to take the lock. + */ + mutex_lock(&uctxt->exp_lock); + /* + * The first step is to program the RcvArray entries which are complete + * groups. + */ + while (ngroups && uctxt->tid_group_list.count) { + struct tid_group *grp = + tid_group_pop(&uctxt->tid_group_list); + + ret = program_rcvarray(fp, vaddr, grp, pagesets, + pageidx, dd->rcv_entries.group_size, + pages, tidlist, &tididx, &mapped); + /* + * If there was a failure to program the RcvArray + * entries for the entire group, reset the grp fields + * and add the grp back to the free group list. + */ + if (ret <= 0) { + tid_group_add_tail(grp, &uctxt->tid_group_list); + hfi1_cdbg(TID, + "Failed to program RcvArray group %d", ret); + goto unlock; + } + + tid_group_add_tail(grp, &uctxt->tid_full_list); + ngroups--; + pageidx += ret; + mapped_pages += mapped; + } + + while (pageidx < pageset_count) { + struct tid_group *grp, *ptr; + /* + * If we don't have any partially used tid groups, check + * if we have empty groups. If so, take one from there and + * put in the partially used list. + */ + if (!uctxt->tid_used_list.count || need_group) { + if (!uctxt->tid_group_list.count) + goto unlock; + + grp = tid_group_pop(&uctxt->tid_group_list); + tid_group_add_tail(grp, &uctxt->tid_used_list); + need_group = 0; + } + /* + * There is an optimization opportunity here - instead of + * fitting as many page sets as we can, check for a group + * later on in the list that could fit all of them. + */ + list_for_each_entry_safe(grp, ptr, &uctxt->tid_used_list.list, + list) { + unsigned use = min_t(unsigned, pageset_count - pageidx, + grp->size - grp->used); + + ret = program_rcvarray(fp, vaddr, grp, pagesets, + pageidx, use, pages, tidlist, + &tididx, &mapped); + if (ret < 0) { + hfi1_cdbg(TID, + "Failed to program RcvArray entries %d", + ret); + ret = -EFAULT; + goto unlock; + } else if (ret > 0) { + if (grp->used == grp->size) + tid_group_move(grp, + &uctxt->tid_used_list, + &uctxt->tid_full_list); + pageidx += ret; + mapped_pages += mapped; + need_group = 0; + /* Check if we are done so we break out early */ + if (pageidx >= pageset_count) + break; + } else if (WARN_ON(ret == 0)) { + /* + * If ret is 0, we did not program any entries + * into this group, which can only happen if + * we've screwed up the accounting somewhere. + * Warn and try to continue. + */ + need_group = 1; + } + } + } +unlock: + mutex_unlock(&uctxt->exp_lock); +nomem: + hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx, + mapped_pages, ret); + if (tididx) { + spin_lock(&fd->tid_lock); + fd->tid_used += tididx; + spin_unlock(&fd->tid_lock); + tinfo->tidcnt = tididx; + tinfo->length = mapped_pages * PAGE_SIZE; + + if (copy_to_user((void __user *)(unsigned long)tinfo->tidlist, + tidlist, sizeof(tidlist[0]) * tididx)) { + /* + * On failure to copy to the user level, we need to undo + * everything done so far so we don't leak resources. + */ + tinfo->tidlist = (unsigned long)&tidlist; + hfi1_user_exp_rcv_clear(fp, tinfo); + tinfo->tidlist = 0; + ret = -EFAULT; + goto bail; + } + } + + /* + * If not everything was mapped (due to insufficient RcvArray entries, + * for example), unpin all unmapped pages so we can pin them nex time. + */ + if (mapped_pages != pinned) { + hfi1_release_user_pages(current->mm, &pages[mapped_pages], + pinned - mapped_pages, + false); + fd->tid_n_pinned -= pinned - mapped_pages; + } +bail: + kfree(pagesets); + kfree(pages); + kfree(tidlist); + return ret > 0 ? 0 : ret; +} + +int hfi1_user_exp_rcv_clear(struct file *fp, struct hfi1_tid_info *tinfo) +{ + int ret = 0; + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + u32 *tidinfo; + unsigned tididx; + + tidinfo = kcalloc(tinfo->tidcnt, sizeof(*tidinfo), GFP_KERNEL); + if (!tidinfo) + return -ENOMEM; + + if (copy_from_user(tidinfo, (void __user *)(unsigned long) + tinfo->tidlist, sizeof(tidinfo[0]) * + tinfo->tidcnt)) { + ret = -EFAULT; + goto done; + } + + mutex_lock(&uctxt->exp_lock); + for (tididx = 0; tididx < tinfo->tidcnt; tididx++) { + ret = unprogram_rcvarray(fp, tidinfo[tididx], NULL); + if (ret) { + hfi1_cdbg(TID, "Failed to unprogram rcv array %d", + ret); + break; + } + } + spin_lock(&fd->tid_lock); + fd->tid_used -= tididx; + spin_unlock(&fd->tid_lock); + tinfo->tidcnt = tididx; + mutex_unlock(&uctxt->exp_lock); +done: + kfree(tidinfo); + return ret; +} + +int hfi1_user_exp_rcv_invalid(struct file *fp, struct hfi1_tid_info *tinfo) +{ + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + unsigned long *ev = uctxt->dd->events + + (((uctxt->ctxt - uctxt->dd->first_user_ctxt) * + HFI1_MAX_SHARED_CTXTS) + fd->subctxt); + u32 *array; + int ret = 0; + + if (!fd->invalid_tids) + return -EINVAL; + + /* + * copy_to_user() can sleep, which will leave the invalid_lock + * locked and cause the MMU notifier to be blocked on the lock + * for a long time. + * Copy the data to a local buffer so we can release the lock. + */ + array = kcalloc(uctxt->expected_count, sizeof(*array), GFP_KERNEL); + if (!array) + return -EFAULT; + + spin_lock(&fd->invalid_lock); + if (fd->invalid_tid_idx) { + memcpy(array, fd->invalid_tids, sizeof(*array) * + fd->invalid_tid_idx); + memset(fd->invalid_tids, 0, sizeof(*fd->invalid_tids) * + fd->invalid_tid_idx); + tinfo->tidcnt = fd->invalid_tid_idx; + fd->invalid_tid_idx = 0; + /* + * Reset the user flag while still holding the lock. + * Otherwise, PSM can miss events. + */ + clear_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev); + } else { + tinfo->tidcnt = 0; + } + spin_unlock(&fd->invalid_lock); + + if (tinfo->tidcnt) { + if (copy_to_user((void __user *)tinfo->tidlist, + array, sizeof(*array) * tinfo->tidcnt)) + ret = -EFAULT; + } + kfree(array); + + return ret; +} + +static u32 find_phys_blocks(struct page **pages, unsigned npages, + struct tid_pageset *list) +{ + unsigned pagecount, pageidx, setcount = 0, i; + unsigned long pfn, this_pfn; + + if (!npages) + return 0; + + /* + * Look for sets of physically contiguous pages in the user buffer. + * This will allow us to optimize Expected RcvArray entry usage by + * using the bigger supported sizes. + */ + pfn = page_to_pfn(pages[0]); + for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) { + this_pfn = i < npages ? page_to_pfn(pages[i]) : 0; + + /* + * If the pfn's are not sequential, pages are not physically + * contiguous. + */ + if (this_pfn != ++pfn) { + /* + * At this point we have to loop over the set of + * physically contiguous pages and break them down it + * sizes supported by the HW. + * There are two main constraints: + * 1. The max buffer size is MAX_EXPECTED_BUFFER. + * If the total set size is bigger than that + * program only a MAX_EXPECTED_BUFFER chunk. + * 2. The buffer size has to be a power of two. If + * it is not, round down to the closes power of + * 2 and program that size. + */ + while (pagecount) { + int maxpages = pagecount; + u32 bufsize = pagecount * PAGE_SIZE; + + if (bufsize > MAX_EXPECTED_BUFFER) + maxpages = + MAX_EXPECTED_BUFFER >> + PAGE_SHIFT; + else if (!is_power_of_2(bufsize)) + maxpages = + rounddown_pow_of_two(bufsize) >> + PAGE_SHIFT; + + list[setcount].idx = pageidx; + list[setcount].count = maxpages; + pagecount -= maxpages; + pageidx += maxpages; + setcount++; + } + pageidx = i; + pagecount = 1; + pfn = this_pfn; + } else { + pagecount++; + } + } + return setcount; +} + +/** + * program_rcvarray() - program an RcvArray group with receive buffers + * @fp: file pointer + * @vaddr: starting user virtual address + * @grp: RcvArray group + * @sets: array of struct tid_pageset holding information on physically + * contiguous chunks from the user buffer + * @start: starting index into sets array + * @count: number of struct tid_pageset's to program + * @pages: an array of struct page * for the user buffer + * @tidlist: the array of u32 elements when the information about the + * programmed RcvArray entries is to be encoded. + * @tididx: starting offset into tidlist + * @pmapped: (output parameter) number of pages programmed into the RcvArray + * entries. + * + * This function will program up to 'count' number of RcvArray entries from the + * group 'grp'. To make best use of write-combining writes, the function will + * perform writes to the unused RcvArray entries which will be ignored by the + * HW. Each RcvArray entry will be programmed with a physically contiguous + * buffer chunk from the user's virtual buffer. + * + * Return: + * -EINVAL if the requested count is larger than the size of the group, + * -ENOMEM or -EFAULT on error from set_rcvarray_entry(), or + * number of RcvArray entries programmed. + */ +static int program_rcvarray(struct file *fp, unsigned long vaddr, + struct tid_group *grp, + struct tid_pageset *sets, + unsigned start, u16 count, struct page **pages, + u32 *tidlist, unsigned *tididx, unsigned *pmapped) +{ + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_devdata *dd = uctxt->dd; + u16 idx; + u32 tidinfo = 0, rcventry, useidx = 0; + int mapped = 0; + + /* Count should never be larger than the group size */ + if (count > grp->size) + return -EINVAL; + + /* Find the first unused entry in the group */ + for (idx = 0; idx < grp->size; idx++) { + if (!(grp->map & (1 << idx))) { + useidx = idx; + break; + } + rcv_array_wc_fill(dd, grp->base + idx); + } + + idx = 0; + while (idx < count) { + u16 npages, pageidx, setidx = start + idx; + int ret = 0; + + /* + * If this entry in the group is used, move to the next one. + * If we go past the end of the group, exit the loop. + */ + if (useidx >= grp->size) { + break; + } else if (grp->map & (1 << useidx)) { + rcv_array_wc_fill(dd, grp->base + useidx); + useidx++; + continue; + } + + rcventry = grp->base + useidx; + npages = sets[setidx].count; + pageidx = sets[setidx].idx; + + ret = set_rcvarray_entry(fp, vaddr + (pageidx * PAGE_SIZE), + rcventry, grp, pages + pageidx, + npages); + if (ret) + return ret; + mapped += npages; + + tidinfo = rcventry2tidinfo(rcventry - uctxt->expected_base) | + EXP_TID_SET(LEN, npages); + tidlist[(*tididx)++] = tidinfo; + grp->used++; + grp->map |= 1 << useidx++; + idx++; + } + + /* Fill the rest of the group with "blank" writes */ + for (; useidx < grp->size; useidx++) + rcv_array_wc_fill(dd, grp->base + useidx); + *pmapped = mapped; + return idx; +} + +static int set_rcvarray_entry(struct file *fp, unsigned long vaddr, + u32 rcventry, struct tid_group *grp, + struct page **pages, unsigned npages) +{ + int ret; + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct tid_rb_node *node; + struct hfi1_devdata *dd = uctxt->dd; + struct rb_root *root = &fd->tid_rb_root; + dma_addr_t phys; + + /* + * Allocate the node first so we can handle a potential + * failure before we've programmed anything. + */ + node = kzalloc(sizeof(*node) + (sizeof(struct page *) * npages), + GFP_KERNEL); + if (!node) + return -ENOMEM; + + phys = pci_map_single(dd->pcidev, + __va(page_to_phys(pages[0])), + npages * PAGE_SIZE, PCI_DMA_FROMDEVICE); + if (dma_mapping_error(&dd->pcidev->dev, phys)) { + dd_dev_err(dd, "Failed to DMA map Exp Rcv pages 0x%llx\n", + phys); + kfree(node); + return -EFAULT; + } + + node->mmu.addr = vaddr; + node->mmu.len = npages * PAGE_SIZE; + node->phys = page_to_phys(pages[0]); + node->npages = npages; + node->rcventry = rcventry; + node->dma_addr = phys; + node->grp = grp; + node->freed = false; + memcpy(node->pages, pages, sizeof(struct page *) * npages); + + if (HFI1_CAP_IS_USET(TID_UNMAP)) + ret = mmu_rb_insert(root, &node->mmu); + else + ret = hfi1_mmu_rb_insert(root, &node->mmu); + + if (ret) { + hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d", + node->rcventry, node->mmu.addr, node->phys, ret); + pci_unmap_single(dd->pcidev, phys, npages * PAGE_SIZE, + PCI_DMA_FROMDEVICE); + kfree(node); + return -EFAULT; + } + hfi1_put_tid(dd, rcventry, PT_EXPECTED, phys, ilog2(npages) + 1); + trace_hfi1_exp_tid_reg(uctxt->ctxt, fd->subctxt, rcventry, npages, + node->mmu.addr, node->phys, phys); + return 0; +} + +static int unprogram_rcvarray(struct file *fp, u32 tidinfo, + struct tid_group **grp) +{ + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_devdata *dd = uctxt->dd; + struct tid_rb_node *node; + u8 tidctrl = EXP_TID_GET(tidinfo, CTRL); + u32 tididx = EXP_TID_GET(tidinfo, IDX) << 1, rcventry; + + if (tididx >= uctxt->expected_count) { + dd_dev_err(dd, "Invalid RcvArray entry (%u) index for ctxt %u\n", + tididx, uctxt->ctxt); + return -EINVAL; + } + + if (tidctrl == 0x3) + return -EINVAL; + + rcventry = tididx + (tidctrl - 1); + + node = fd->entry_to_rb[rcventry]; + if (!node || node->rcventry != (uctxt->expected_base + rcventry)) + return -EBADF; + if (HFI1_CAP_IS_USET(TID_UNMAP)) + mmu_rb_remove(&fd->tid_rb_root, &node->mmu, NULL); + else + hfi1_mmu_rb_remove(&fd->tid_rb_root, &node->mmu); + + if (grp) + *grp = node->grp; + clear_tid_node(fd, fd->subctxt, node); + return 0; +} + +static void clear_tid_node(struct hfi1_filedata *fd, u16 subctxt, + struct tid_rb_node *node) +{ + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_devdata *dd = uctxt->dd; + + trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry, + node->npages, node->mmu.addr, node->phys, + node->dma_addr); + + hfi1_put_tid(dd, node->rcventry, PT_INVALID, 0, 0); + /* + * Make sure device has seen the write before we unpin the + * pages. + */ + flush_wc(); + + pci_unmap_single(dd->pcidev, node->dma_addr, node->mmu.len, + PCI_DMA_FROMDEVICE); + hfi1_release_user_pages(current->mm, node->pages, node->npages, true); + fd->tid_n_pinned -= node->npages; + + node->grp->used--; + node->grp->map &= ~(1 << (node->rcventry - node->grp->base)); + + if (node->grp->used == node->grp->size - 1) + tid_group_move(node->grp, &uctxt->tid_full_list, + &uctxt->tid_used_list); + else if (!node->grp->used) + tid_group_move(node->grp, &uctxt->tid_used_list, + &uctxt->tid_group_list); + kfree(node); +} + +static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt, + struct exp_tid_set *set, struct rb_root *root) +{ + struct tid_group *grp, *ptr; + struct hfi1_filedata *fd = container_of(root, struct hfi1_filedata, + tid_rb_root); + int i; + + list_for_each_entry_safe(grp, ptr, &set->list, list) { + list_del_init(&grp->list); + + for (i = 0; i < grp->size; i++) { + if (grp->map & (1 << i)) { + u16 rcventry = grp->base + i; + struct tid_rb_node *node; + + node = fd->entry_to_rb[rcventry - + uctxt->expected_base]; + if (!node || node->rcventry != rcventry) + continue; + if (HFI1_CAP_IS_USET(TID_UNMAP)) + mmu_rb_remove(&fd->tid_rb_root, + &node->mmu, NULL); + else + hfi1_mmu_rb_remove(&fd->tid_rb_root, + &node->mmu); + clear_tid_node(fd, -1, node); + } + } + } +} + +static int mmu_rb_invalidate(struct rb_root *root, struct mmu_rb_node *mnode) +{ + struct hfi1_filedata *fdata = + container_of(root, struct hfi1_filedata, tid_rb_root); + struct hfi1_ctxtdata *uctxt = fdata->uctxt; + struct tid_rb_node *node = + container_of(mnode, struct tid_rb_node, mmu); + + if (node->freed) + return 0; + + trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt, node->mmu.addr, + node->rcventry, node->npages, node->dma_addr); + node->freed = true; + + spin_lock(&fdata->invalid_lock); + if (fdata->invalid_tid_idx < uctxt->expected_count) { + fdata->invalid_tids[fdata->invalid_tid_idx] = + rcventry2tidinfo(node->rcventry - uctxt->expected_base); + fdata->invalid_tids[fdata->invalid_tid_idx] |= + EXP_TID_SET(LEN, node->npages); + if (!fdata->invalid_tid_idx) { + unsigned long *ev; + + /* + * hfi1_set_uevent_bits() sets a user event flag + * for all processes. Because calling into the + * driver to process TID cache invalidations is + * expensive and TID cache invalidations are + * handled on a per-process basis, we can + * optimize this to set the flag only for the + * process in question. + */ + ev = uctxt->dd->events + + (((uctxt->ctxt - uctxt->dd->first_user_ctxt) * + HFI1_MAX_SHARED_CTXTS) + fdata->subctxt); + set_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev); + } + fdata->invalid_tid_idx++; + } + spin_unlock(&fdata->invalid_lock); + return 0; +} + +static int mmu_rb_insert(struct rb_root *root, struct mmu_rb_node *node) +{ + struct hfi1_filedata *fdata = + container_of(root, struct hfi1_filedata, tid_rb_root); + struct tid_rb_node *tnode = + container_of(node, struct tid_rb_node, mmu); + u32 base = fdata->uctxt->expected_base; + + fdata->entry_to_rb[tnode->rcventry - base] = tnode; + return 0; +} + +static void mmu_rb_remove(struct rb_root *root, struct mmu_rb_node *node, + struct mm_struct *mm) +{ + struct hfi1_filedata *fdata = + container_of(root, struct hfi1_filedata, tid_rb_root); + struct tid_rb_node *tnode = + container_of(node, struct tid_rb_node, mmu); + u32 base = fdata->uctxt->expected_base; + + fdata->entry_to_rb[tnode->rcventry - base] = NULL; +} diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.h b/drivers/infiniband/hw/hfi1/user_exp_rcv.h new file mode 100644 index 000000000..9bc8d9fba --- /dev/null +++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.h @@ -0,0 +1,79 @@ +#ifndef _HFI1_USER_EXP_RCV_H +#define _HFI1_USER_EXP_RCV_H +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include "hfi.h" + +#define EXP_TID_TIDLEN_MASK 0x7FFULL +#define EXP_TID_TIDLEN_SHIFT 0 +#define EXP_TID_TIDCTRL_MASK 0x3ULL +#define EXP_TID_TIDCTRL_SHIFT 20 +#define EXP_TID_TIDIDX_MASK 0x3FFULL +#define EXP_TID_TIDIDX_SHIFT 22 +#define EXP_TID_GET(tid, field) \ + (((tid) >> EXP_TID_TID##field##_SHIFT) & EXP_TID_TID##field##_MASK) + +#define EXP_TID_SET(field, value) \ + (((value) & EXP_TID_TID##field##_MASK) << \ + EXP_TID_TID##field##_SHIFT) +#define EXP_TID_CLEAR(tid, field) ({ \ + (tid) &= ~(EXP_TID_TID##field##_MASK << \ + EXP_TID_TID##field##_SHIFT); \ + }) +#define EXP_TID_RESET(tid, field, value) do { \ + EXP_TID_CLEAR(tid, field); \ + (tid) |= EXP_TID_SET(field, (value)); \ + } while (0) + +int hfi1_user_exp_rcv_init(struct file *); +int hfi1_user_exp_rcv_free(struct hfi1_filedata *); +int hfi1_user_exp_rcv_setup(struct file *, struct hfi1_tid_info *); +int hfi1_user_exp_rcv_clear(struct file *, struct hfi1_tid_info *); +int hfi1_user_exp_rcv_invalid(struct file *, struct hfi1_tid_info *); + +#endif /* _HFI1_USER_EXP_RCV_H */ diff --git a/drivers/infiniband/hw/hfi1/user_pages.c b/drivers/infiniband/hw/hfi1/user_pages.c new file mode 100644 index 000000000..88e10b5f5 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/user_pages.c @@ -0,0 +1,135 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/device.h> +#include <linux/module.h> + +#include "hfi.h" + +static unsigned long cache_size = 256; +module_param(cache_size, ulong, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(cache_size, "Send and receive side cache size limit (in MB)"); + +/* + * Determine whether the caller can pin pages. + * + * This function should be used in the implementation of buffer caches. + * The cache implementation should call this function prior to attempting + * to pin buffer pages in order to determine whether they should do so. + * The function computes cache limits based on the configured ulimit and + * cache size. Use of this function is especially important for caches + * which are not limited in any other way (e.g. by HW resources) and, thus, + * could keeping caching buffers. + * + */ +bool hfi1_can_pin_pages(struct hfi1_devdata *dd, u32 nlocked, u32 npages) +{ + unsigned long ulimit = rlimit(RLIMIT_MEMLOCK), pinned, cache_limit, + size = (cache_size * (1UL << 20)); /* convert to bytes */ + unsigned usr_ctxts = dd->num_rcv_contexts - dd->first_user_ctxt; + bool can_lock = capable(CAP_IPC_LOCK); + + /* + * Calculate per-cache size. The calculation below uses only a quarter + * of the available per-context limit. This leaves space for other + * pinning. Should we worry about shared ctxts? + */ + cache_limit = (ulimit / usr_ctxts) / 4; + + /* If ulimit isn't set to "unlimited" and is smaller than cache_size. */ + if (ulimit != (-1UL) && size > cache_limit) + size = cache_limit; + + /* Convert to number of pages */ + size = DIV_ROUND_UP(size, PAGE_SIZE); + + down_read(¤t->mm->mmap_sem); + pinned = current->mm->pinned_vm; + up_read(¤t->mm->mmap_sem); + + /* First, check the absolute limit against all pinned pages. */ + if (pinned + npages >= ulimit && !can_lock) + return false; + + return ((nlocked + npages) <= size) || can_lock; +} + +int hfi1_acquire_user_pages(unsigned long vaddr, size_t npages, bool writable, + struct page **pages) +{ + int ret; + + ret = get_user_pages_fast(vaddr, npages, writable, pages); + if (ret < 0) + return ret; + + down_write(¤t->mm->mmap_sem); + current->mm->pinned_vm += ret; + up_write(¤t->mm->mmap_sem); + + return ret; +} + +void hfi1_release_user_pages(struct mm_struct *mm, struct page **p, + size_t npages, bool dirty) +{ + size_t i; + + for (i = 0; i < npages; i++) { + if (dirty) + set_page_dirty_lock(p[i]); + put_page(p[i]); + } + + if (mm) { /* during close after signal, mm can be NULL */ + down_write(&mm->mmap_sem); + mm->pinned_vm -= npages; + up_write(&mm->mmap_sem); + } +} diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c new file mode 100644 index 000000000..47ffd273e --- /dev/null +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -0,0 +1,1625 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#include <linux/mm.h> +#include <linux/types.h> +#include <linux/device.h> +#include <linux/dmapool.h> +#include <linux/slab.h> +#include <linux/list.h> +#include <linux/highmem.h> +#include <linux/io.h> +#include <linux/uio.h> +#include <linux/rbtree.h> +#include <linux/spinlock.h> +#include <linux/delay.h> +#include <linux/kthread.h> +#include <linux/mmu_context.h> +#include <linux/module.h> +#include <linux/vmalloc.h> + +#include "hfi.h" +#include "sdma.h" +#include "user_sdma.h" +#include "verbs.h" /* for the headers */ +#include "common.h" /* for struct hfi1_tid_info */ +#include "trace.h" +#include "mmu_rb.h" + +static uint hfi1_sdma_comp_ring_size = 128; +module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO); +MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128"); + +/* The maximum number of Data io vectors per message/request */ +#define MAX_VECTORS_PER_REQ 8 +/* + * Maximum number of packet to send from each message/request + * before moving to the next one. + */ +#define MAX_PKTS_PER_QUEUE 16 + +#define num_pages(x) (1 + ((((x) - 1) & PAGE_MASK) >> PAGE_SHIFT)) + +#define req_opcode(x) \ + (((x) >> HFI1_SDMA_REQ_OPCODE_SHIFT) & HFI1_SDMA_REQ_OPCODE_MASK) +#define req_version(x) \ + (((x) >> HFI1_SDMA_REQ_VERSION_SHIFT) & HFI1_SDMA_REQ_OPCODE_MASK) +#define req_iovcnt(x) \ + (((x) >> HFI1_SDMA_REQ_IOVCNT_SHIFT) & HFI1_SDMA_REQ_IOVCNT_MASK) + +/* Number of BTH.PSN bits used for sequence number in expected rcvs */ +#define BTH_SEQ_MASK 0x7ffull + +/* + * Define fields in the KDETH header so we can update the header + * template. + */ +#define KDETH_OFFSET_SHIFT 0 +#define KDETH_OFFSET_MASK 0x7fff +#define KDETH_OM_SHIFT 15 +#define KDETH_OM_MASK 0x1 +#define KDETH_TID_SHIFT 16 +#define KDETH_TID_MASK 0x3ff +#define KDETH_TIDCTRL_SHIFT 26 +#define KDETH_TIDCTRL_MASK 0x3 +#define KDETH_INTR_SHIFT 28 +#define KDETH_INTR_MASK 0x1 +#define KDETH_SH_SHIFT 29 +#define KDETH_SH_MASK 0x1 +#define KDETH_HCRC_UPPER_SHIFT 16 +#define KDETH_HCRC_UPPER_MASK 0xff +#define KDETH_HCRC_LOWER_SHIFT 24 +#define KDETH_HCRC_LOWER_MASK 0xff + +#define PBC2LRH(x) ((((x) & 0xfff) << 2) - 4) +#define LRH2PBC(x) ((((x) >> 2) + 1) & 0xfff) + +#define KDETH_GET(val, field) \ + (((le32_to_cpu((val))) >> KDETH_##field##_SHIFT) & KDETH_##field##_MASK) +#define KDETH_SET(dw, field, val) do { \ + u32 dwval = le32_to_cpu(dw); \ + dwval &= ~(KDETH_##field##_MASK << KDETH_##field##_SHIFT); \ + dwval |= (((val) & KDETH_##field##_MASK) << \ + KDETH_##field##_SHIFT); \ + dw = cpu_to_le32(dwval); \ + } while (0) + +#define AHG_HEADER_SET(arr, idx, dw, bit, width, value) \ + do { \ + if ((idx) < ARRAY_SIZE((arr))) \ + (arr)[(idx++)] = sdma_build_ahg_descriptor( \ + (__force u16)(value), (dw), (bit), \ + (width)); \ + else \ + return -ERANGE; \ + } while (0) + +/* KDETH OM multipliers and switch over point */ +#define KDETH_OM_SMALL 4 +#define KDETH_OM_LARGE 64 +#define KDETH_OM_MAX_SIZE (1 << ((KDETH_OM_LARGE / KDETH_OM_SMALL) + 1)) + +/* Last packet in the request */ +#define TXREQ_FLAGS_REQ_LAST_PKT BIT(0) + +#define SDMA_REQ_IN_USE 0 +#define SDMA_REQ_FOR_THREAD 1 +#define SDMA_REQ_SEND_DONE 2 +#define SDMA_REQ_HAVE_AHG 3 +#define SDMA_REQ_HAS_ERROR 4 +#define SDMA_REQ_DONE_ERROR 5 + +#define SDMA_PKT_Q_INACTIVE BIT(0) +#define SDMA_PKT_Q_ACTIVE BIT(1) +#define SDMA_PKT_Q_DEFERRED BIT(2) + +/* + * Maximum retry attempts to submit a TX request + * before putting the process to sleep. + */ +#define MAX_DEFER_RETRY_COUNT 1 + +static unsigned initial_pkt_count = 8; + +#define SDMA_IOWAIT_TIMEOUT 1000 /* in milliseconds */ + +struct sdma_mmu_node; + +struct user_sdma_iovec { + struct list_head list; + struct iovec iov; + /* number of pages in this vector */ + unsigned npages; + /* array of pinned pages for this vector */ + struct page **pages; + /* + * offset into the virtual address space of the vector at + * which we last left off. + */ + u64 offset; + struct sdma_mmu_node *node; +}; + +#define SDMA_CACHE_NODE_EVICT 0 + +struct sdma_mmu_node { + struct mmu_rb_node rb; + struct list_head list; + struct hfi1_user_sdma_pkt_q *pq; + atomic_t refcount; + struct page **pages; + unsigned npages; + unsigned long flags; +}; + +struct user_sdma_request { + struct sdma_req_info info; + struct hfi1_user_sdma_pkt_q *pq; + struct hfi1_user_sdma_comp_q *cq; + /* This is the original header from user space */ + struct hfi1_pkt_header hdr; + /* + * Pointer to the SDMA engine for this request. + * Since different request could be on different VLs, + * each request will need it's own engine pointer. + */ + struct sdma_engine *sde; + u8 ahg_idx; + u32 ahg[9]; + /* + * KDETH.Offset (Eager) field + * We need to remember the initial value so the headers + * can be updated properly. + */ + u32 koffset; + /* + * KDETH.OFFSET (TID) field + * The offset can cover multiple packets, depending on the + * size of the TID entry. + */ + u32 tidoffset; + /* + * KDETH.OM + * Remember this because the header template always sets it + * to 0. + */ + u8 omfactor; + /* + * We copy the iovs for this request (based on + * info.iovcnt). These are only the data vectors + */ + unsigned data_iovs; + /* total length of the data in the request */ + u32 data_len; + /* progress index moving along the iovs array */ + unsigned iov_idx; + struct user_sdma_iovec iovs[MAX_VECTORS_PER_REQ]; + /* number of elements copied to the tids array */ + u16 n_tids; + /* TID array values copied from the tid_iov vector */ + u32 *tids; + u16 tididx; + u32 sent; + u64 seqnum; + u64 seqcomp; + u64 seqsubmitted; + struct list_head txps; + unsigned long flags; + /* status of the last txreq completed */ + int status; +}; + +/* + * A single txreq could span up to 3 physical pages when the MTU + * is sufficiently large (> 4K). Each of the IOV pointers also + * needs it's own set of flags so the vector has been handled + * independently of each other. + */ +struct user_sdma_txreq { + /* Packet header for the txreq */ + struct hfi1_pkt_header hdr; + struct sdma_txreq txreq; + struct list_head list; + struct user_sdma_request *req; + u16 flags; + unsigned busycount; + u64 seqnum; +}; + +#define SDMA_DBG(req, fmt, ...) \ + hfi1_cdbg(SDMA, "[%u:%u:%u:%u] " fmt, (req)->pq->dd->unit, \ + (req)->pq->ctxt, (req)->pq->subctxt, (req)->info.comp_idx, \ + ##__VA_ARGS__) +#define SDMA_Q_DBG(pq, fmt, ...) \ + hfi1_cdbg(SDMA, "[%u:%u:%u] " fmt, (pq)->dd->unit, (pq)->ctxt, \ + (pq)->subctxt, ##__VA_ARGS__) + +static int user_sdma_send_pkts(struct user_sdma_request *, unsigned); +static int num_user_pages(const struct iovec *); +static void user_sdma_txreq_cb(struct sdma_txreq *, int); +static inline void pq_update(struct hfi1_user_sdma_pkt_q *); +static void user_sdma_free_request(struct user_sdma_request *, bool); +static int pin_vector_pages(struct user_sdma_request *, + struct user_sdma_iovec *); +static void unpin_vector_pages(struct mm_struct *, struct page **, unsigned, + unsigned); +static int check_header_template(struct user_sdma_request *, + struct hfi1_pkt_header *, u32, u32); +static int set_txreq_header(struct user_sdma_request *, + struct user_sdma_txreq *, u32); +static int set_txreq_header_ahg(struct user_sdma_request *, + struct user_sdma_txreq *, u32); +static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *, + struct hfi1_user_sdma_comp_q *, + u16, enum hfi1_sdma_comp_state, int); +static inline u32 set_pkt_bth_psn(__be32, u8, u32); +static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len); + +static int defer_packet_queue( + struct sdma_engine *, + struct iowait *, + struct sdma_txreq *, + unsigned seq); +static void activate_packet_queue(struct iowait *, int); +static bool sdma_rb_filter(struct mmu_rb_node *, unsigned long, unsigned long); +static int sdma_rb_insert(struct rb_root *, struct mmu_rb_node *); +static void sdma_rb_remove(struct rb_root *, struct mmu_rb_node *, + struct mm_struct *); +static int sdma_rb_invalidate(struct rb_root *, struct mmu_rb_node *); + +static struct mmu_rb_ops sdma_rb_ops = { + .filter = sdma_rb_filter, + .insert = sdma_rb_insert, + .remove = sdma_rb_remove, + .invalidate = sdma_rb_invalidate +}; + +static int defer_packet_queue( + struct sdma_engine *sde, + struct iowait *wait, + struct sdma_txreq *txreq, + unsigned seq) +{ + struct hfi1_user_sdma_pkt_q *pq = + container_of(wait, struct hfi1_user_sdma_pkt_q, busy); + struct hfi1_ibdev *dev = &pq->dd->verbs_dev; + struct user_sdma_txreq *tx = + container_of(txreq, struct user_sdma_txreq, txreq); + + if (sdma_progress(sde, seq, txreq)) { + if (tx->busycount++ < MAX_DEFER_RETRY_COUNT) + goto eagain; + } + /* + * We are assuming that if the list is enqueued somewhere, it + * is to the dmawait list since that is the only place where + * it is supposed to be enqueued. + */ + xchg(&pq->state, SDMA_PKT_Q_DEFERRED); + write_seqlock(&dev->iowait_lock); + if (list_empty(&pq->busy.list)) + list_add_tail(&pq->busy.list, &sde->dmawait); + write_sequnlock(&dev->iowait_lock); + return -EBUSY; +eagain: + return -EAGAIN; +} + +static void activate_packet_queue(struct iowait *wait, int reason) +{ + struct hfi1_user_sdma_pkt_q *pq = + container_of(wait, struct hfi1_user_sdma_pkt_q, busy); + xchg(&pq->state, SDMA_PKT_Q_ACTIVE); + wake_up(&wait->wait_dma); +}; + +static void sdma_kmem_cache_ctor(void *obj) +{ + struct user_sdma_txreq *tx = obj; + + memset(tx, 0, sizeof(*tx)); +} + +int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct file *fp) +{ + struct hfi1_filedata *fd; + int ret = 0; + unsigned memsize; + char buf[64]; + struct hfi1_devdata *dd; + struct hfi1_user_sdma_comp_q *cq; + struct hfi1_user_sdma_pkt_q *pq; + unsigned long flags; + + if (!uctxt || !fp) { + ret = -EBADF; + goto done; + } + + fd = fp->private_data; + + if (!hfi1_sdma_comp_ring_size) { + ret = -EINVAL; + goto done; + } + + dd = uctxt->dd; + + pq = kzalloc(sizeof(*pq), GFP_KERNEL); + if (!pq) + goto pq_nomem; + + memsize = sizeof(*pq->reqs) * hfi1_sdma_comp_ring_size; + pq->reqs = kzalloc(memsize, GFP_KERNEL); + if (!pq->reqs) + goto pq_reqs_nomem; + + INIT_LIST_HEAD(&pq->list); + pq->dd = dd; + pq->ctxt = uctxt->ctxt; + pq->subctxt = fd->subctxt; + pq->n_max_reqs = hfi1_sdma_comp_ring_size; + pq->state = SDMA_PKT_Q_INACTIVE; + atomic_set(&pq->n_reqs, 0); + init_waitqueue_head(&pq->wait); + pq->sdma_rb_root = RB_ROOT; + INIT_LIST_HEAD(&pq->evict); + spin_lock_init(&pq->evict_lock); + + iowait_init(&pq->busy, 0, NULL, defer_packet_queue, + activate_packet_queue, NULL); + pq->reqidx = 0; + snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt, + fd->subctxt); + pq->txreq_cache = kmem_cache_create(buf, + sizeof(struct user_sdma_txreq), + L1_CACHE_BYTES, + SLAB_HWCACHE_ALIGN, + sdma_kmem_cache_ctor); + if (!pq->txreq_cache) { + dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n", + uctxt->ctxt); + goto pq_txreq_nomem; + } + fd->pq = pq; + cq = kzalloc(sizeof(*cq), GFP_KERNEL); + if (!cq) + goto cq_nomem; + + memsize = PAGE_ALIGN(sizeof(*cq->comps) * hfi1_sdma_comp_ring_size); + cq->comps = vmalloc_user(memsize); + if (!cq->comps) + goto cq_comps_nomem; + + cq->nentries = hfi1_sdma_comp_ring_size; + fd->cq = cq; + + ret = hfi1_mmu_rb_register(&pq->sdma_rb_root, &sdma_rb_ops); + if (ret) { + dd_dev_err(dd, "Failed to register with MMU %d", ret); + goto done; + } + + spin_lock_irqsave(&uctxt->sdma_qlock, flags); + list_add(&pq->list, &uctxt->sdma_queues); + spin_unlock_irqrestore(&uctxt->sdma_qlock, flags); + goto done; + +cq_comps_nomem: + kfree(cq); +cq_nomem: + kmem_cache_destroy(pq->txreq_cache); +pq_txreq_nomem: + kfree(pq->reqs); +pq_reqs_nomem: + kfree(pq); + fd->pq = NULL; +pq_nomem: + ret = -ENOMEM; +done: + return ret; +} + +int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd) +{ + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_user_sdma_pkt_q *pq; + unsigned long flags; + + hfi1_cdbg(SDMA, "[%u:%u:%u] Freeing user SDMA queues", uctxt->dd->unit, + uctxt->ctxt, fd->subctxt); + pq = fd->pq; + hfi1_mmu_rb_unregister(&pq->sdma_rb_root); + if (pq) { + spin_lock_irqsave(&uctxt->sdma_qlock, flags); + if (!list_empty(&pq->list)) + list_del_init(&pq->list); + spin_unlock_irqrestore(&uctxt->sdma_qlock, flags); + iowait_sdma_drain(&pq->busy); + /* Wait until all requests have been freed. */ + wait_event_interruptible( + pq->wait, + (ACCESS_ONCE(pq->state) == SDMA_PKT_Q_INACTIVE)); + kfree(pq->reqs); + kmem_cache_destroy(pq->txreq_cache); + kfree(pq); + fd->pq = NULL; + } + if (fd->cq) { + vfree(fd->cq->comps); + kfree(fd->cq); + fd->cq = NULL; + } + return 0; +} + +int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec, + unsigned long dim, unsigned long *count) +{ + int ret = 0, i = 0; + struct hfi1_filedata *fd = fp->private_data; + struct hfi1_ctxtdata *uctxt = fd->uctxt; + struct hfi1_user_sdma_pkt_q *pq = fd->pq; + struct hfi1_user_sdma_comp_q *cq = fd->cq; + struct hfi1_devdata *dd = pq->dd; + unsigned long idx = 0; + u8 pcount = initial_pkt_count; + struct sdma_req_info info; + struct user_sdma_request *req; + u8 opcode, sc, vl; + int req_queued = 0; + + if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) { + hfi1_cdbg( + SDMA, + "[%u:%u:%u] First vector not big enough for header %lu/%lu", + dd->unit, uctxt->ctxt, fd->subctxt, + iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr)); + return -EINVAL; + } + ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info)); + if (ret) { + hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)", + dd->unit, uctxt->ctxt, fd->subctxt, ret); + return -EFAULT; + } + + trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt, + (u16 *)&info); + if (cq->comps[info.comp_idx].status == QUEUED || + test_bit(SDMA_REQ_IN_USE, &pq->reqs[info.comp_idx].flags)) { + hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in QUEUED state", + dd->unit, uctxt->ctxt, fd->subctxt, + info.comp_idx); + return -EBADSLT; + } + if (!info.fragsize) { + hfi1_cdbg(SDMA, + "[%u:%u:%u:%u] Request does not specify fragsize", + dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx); + return -EINVAL; + } + /* + * We've done all the safety checks that we can up to this point, + * "allocate" the request entry. + */ + hfi1_cdbg(SDMA, "[%u:%u:%u] Using req/comp entry %u\n", dd->unit, + uctxt->ctxt, fd->subctxt, info.comp_idx); + req = pq->reqs + info.comp_idx; + memset(req, 0, sizeof(*req)); + /* Mark the request as IN_USE before we start filling it in. */ + set_bit(SDMA_REQ_IN_USE, &req->flags); + req->data_iovs = req_iovcnt(info.ctrl) - 1; + req->pq = pq; + req->cq = cq; + req->status = -1; + INIT_LIST_HEAD(&req->txps); + + memcpy(&req->info, &info, sizeof(info)); + + if (req_opcode(info.ctrl) == EXPECTED) + req->data_iovs--; + + if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) { + SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs, + MAX_VECTORS_PER_REQ); + return -EINVAL; + } + /* Copy the header from the user buffer */ + ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info), + sizeof(req->hdr)); + if (ret) { + SDMA_DBG(req, "Failed to copy header template (%d)", ret); + ret = -EFAULT; + goto free_req; + } + + /* If Static rate control is not enabled, sanitize the header. */ + if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL)) + req->hdr.pbc[2] = 0; + + /* Validate the opcode. Do not trust packets from user space blindly. */ + opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff; + if ((opcode & USER_OPCODE_CHECK_MASK) != + USER_OPCODE_CHECK_VAL) { + SDMA_DBG(req, "Invalid opcode (%d)", opcode); + ret = -EINVAL; + goto free_req; + } + /* + * Validate the vl. Do not trust packets from user space blindly. + * VL comes from PBC, SC comes from LRH, and the VL needs to + * match the SC look up. + */ + vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF; + sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) | + (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4)); + if (vl >= dd->pport->vls_operational || + vl != sc_to_vlt(dd, sc)) { + SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl); + ret = -EINVAL; + goto free_req; + } + + /* Checking P_KEY for requests from user-space */ + if (egress_pkey_check(dd->pport, req->hdr.lrh, req->hdr.bth, sc, + PKEY_CHECK_INVALID)) { + ret = -EINVAL; + goto free_req; + } + + /* + * Also should check the BTH.lnh. If it says the next header is GRH then + * the RXE parsing will be off and will land in the middle of the KDETH + * or miss it entirely. + */ + if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) { + SDMA_DBG(req, "User tried to pass in a GRH"); + ret = -EINVAL; + goto free_req; + } + + req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]); + /* + * Calculate the initial TID offset based on the values of + * KDETH.OFFSET and KDETH.OM that are passed in. + */ + req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) * + (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ? + KDETH_OM_LARGE : KDETH_OM_SMALL); + SDMA_DBG(req, "Initial TID offset %u", req->tidoffset); + idx++; + + /* Save all the IO vector structures */ + while (i < req->data_iovs) { + INIT_LIST_HEAD(&req->iovs[i].list); + memcpy(&req->iovs[i].iov, iovec + idx++, sizeof(struct iovec)); + ret = pin_vector_pages(req, &req->iovs[i]); + if (ret) { + req->status = ret; + goto free_req; + } + req->data_len += req->iovs[i++].iov.iov_len; + } + SDMA_DBG(req, "total data length %u", req->data_len); + + if (pcount > req->info.npkts) + pcount = req->info.npkts; + /* + * Copy any TID info + * User space will provide the TID info only when the + * request type is EXPECTED. This is true even if there is + * only one packet in the request and the header is already + * setup. The reason for the singular TID case is that the + * driver needs to perform safety checks. + */ + if (req_opcode(req->info.ctrl) == EXPECTED) { + u16 ntids = iovec[idx].iov_len / sizeof(*req->tids); + + if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) { + ret = -EINVAL; + goto free_req; + } + req->tids = kcalloc(ntids, sizeof(*req->tids), GFP_KERNEL); + if (!req->tids) { + ret = -ENOMEM; + goto free_req; + } + /* + * We have to copy all of the tids because they may vary + * in size and, therefore, the TID count might not be + * equal to the pkt count. However, there is no way to + * tell at this point. + */ + ret = copy_from_user(req->tids, iovec[idx].iov_base, + ntids * sizeof(*req->tids)); + if (ret) { + SDMA_DBG(req, "Failed to copy %d TIDs (%d)", + ntids, ret); + ret = -EFAULT; + goto free_req; + } + req->n_tids = ntids; + idx++; + } + + /* Have to select the engine */ + req->sde = sdma_select_engine_vl(dd, + (u32)(uctxt->ctxt + fd->subctxt), + vl); + if (!req->sde || !sdma_running(req->sde)) { + ret = -ECOMM; + goto free_req; + } + + /* We don't need an AHG entry if the request contains only one packet */ + if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG)) { + int ahg = sdma_ahg_alloc(req->sde); + + if (likely(ahg >= 0)) { + req->ahg_idx = (u8)ahg; + set_bit(SDMA_REQ_HAVE_AHG, &req->flags); + } + } + + set_comp_state(pq, cq, info.comp_idx, QUEUED, 0); + atomic_inc(&pq->n_reqs); + req_queued = 1; + /* Send the first N packets in the request to buy us some time */ + ret = user_sdma_send_pkts(req, pcount); + if (unlikely(ret < 0 && ret != -EBUSY)) { + req->status = ret; + goto free_req; + } + + /* + * It is possible that the SDMA engine would have processed all the + * submitted packets by the time we get here. Therefore, only set + * packet queue state to ACTIVE if there are still uncompleted + * requests. + */ + if (atomic_read(&pq->n_reqs)) + xchg(&pq->state, SDMA_PKT_Q_ACTIVE); + + /* + * This is a somewhat blocking send implementation. + * The driver will block the caller until all packets of the + * request have been submitted to the SDMA engine. However, it + * will not wait for send completions. + */ + while (!test_bit(SDMA_REQ_SEND_DONE, &req->flags)) { + ret = user_sdma_send_pkts(req, pcount); + if (ret < 0) { + if (ret != -EBUSY) { + req->status = ret; + set_bit(SDMA_REQ_DONE_ERROR, &req->flags); + if (ACCESS_ONCE(req->seqcomp) == + req->seqsubmitted - 1) + goto free_req; + return ret; + } + wait_event_interruptible_timeout( + pq->busy.wait_dma, + (pq->state == SDMA_PKT_Q_ACTIVE), + msecs_to_jiffies( + SDMA_IOWAIT_TIMEOUT)); + } + } + *count += idx; + return 0; +free_req: + user_sdma_free_request(req, true); + if (req_queued) + pq_update(pq); + set_comp_state(pq, cq, info.comp_idx, ERROR, req->status); + return ret; +} + +static inline u32 compute_data_length(struct user_sdma_request *req, + struct user_sdma_txreq *tx) +{ + /* + * Determine the proper size of the packet data. + * The size of the data of the first packet is in the header + * template. However, it includes the header and ICRC, which need + * to be subtracted. + * The size of the remaining packets is the minimum of the frag + * size (MTU) or remaining data in the request. + */ + u32 len; + + if (!req->seqnum) { + len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) - + (sizeof(tx->hdr) - 4)); + } else if (req_opcode(req->info.ctrl) == EXPECTED) { + u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) * + PAGE_SIZE; + /* + * Get the data length based on the remaining space in the + * TID pair. + */ + len = min(tidlen - req->tidoffset, (u32)req->info.fragsize); + /* If we've filled up the TID pair, move to the next one. */ + if (unlikely(!len) && ++req->tididx < req->n_tids && + req->tids[req->tididx]) { + tidlen = EXP_TID_GET(req->tids[req->tididx], + LEN) * PAGE_SIZE; + req->tidoffset = 0; + len = min_t(u32, tidlen, req->info.fragsize); + } + /* + * Since the TID pairs map entire pages, make sure that we + * are not going to try to send more data that we have + * remaining. + */ + len = min(len, req->data_len - req->sent); + } else { + len = min(req->data_len - req->sent, (u32)req->info.fragsize); + } + SDMA_DBG(req, "Data Length = %u", len); + return len; +} + +static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len) +{ + /* (Size of complete header - size of PBC) + 4B ICRC + data length */ + return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len); +} + +static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts) +{ + int ret = 0; + unsigned npkts = 0; + struct user_sdma_txreq *tx = NULL; + struct hfi1_user_sdma_pkt_q *pq = NULL; + struct user_sdma_iovec *iovec = NULL; + + if (!req->pq) + return -EINVAL; + + pq = req->pq; + + /* If tx completion has reported an error, we are done. */ + if (test_bit(SDMA_REQ_HAS_ERROR, &req->flags)) { + set_bit(SDMA_REQ_DONE_ERROR, &req->flags); + return -EFAULT; + } + + /* + * Check if we might have sent the entire request already + */ + if (unlikely(req->seqnum == req->info.npkts)) { + if (!list_empty(&req->txps)) + goto dosend; + return ret; + } + + if (!maxpkts || maxpkts > req->info.npkts - req->seqnum) + maxpkts = req->info.npkts - req->seqnum; + + while (npkts < maxpkts) { + u32 datalen = 0, queued = 0, data_sent = 0; + u64 iov_offset = 0; + + /* + * Check whether any of the completions have come back + * with errors. If so, we are not going to process any + * more packets from this request. + */ + if (test_bit(SDMA_REQ_HAS_ERROR, &req->flags)) { + set_bit(SDMA_REQ_DONE_ERROR, &req->flags); + return -EFAULT; + } + + tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL); + if (!tx) + return -ENOMEM; + + tx->flags = 0; + tx->req = req; + tx->busycount = 0; + INIT_LIST_HEAD(&tx->list); + + if (req->seqnum == req->info.npkts - 1) + tx->flags |= TXREQ_FLAGS_REQ_LAST_PKT; + + /* + * Calculate the payload size - this is min of the fragment + * (MTU) size or the remaining bytes in the request but only + * if we have payload data. + */ + if (req->data_len) { + iovec = &req->iovs[req->iov_idx]; + if (ACCESS_ONCE(iovec->offset) == iovec->iov.iov_len) { + if (++req->iov_idx == req->data_iovs) { + ret = -EFAULT; + goto free_txreq; + } + iovec = &req->iovs[req->iov_idx]; + WARN_ON(iovec->offset); + } + + datalen = compute_data_length(req, tx); + if (!datalen) { + SDMA_DBG(req, + "Request has data but pkt len is 0"); + ret = -EFAULT; + goto free_tx; + } + } + + if (test_bit(SDMA_REQ_HAVE_AHG, &req->flags)) { + if (!req->seqnum) { + u16 pbclen = le16_to_cpu(req->hdr.pbc[0]); + u32 lrhlen = get_lrh_len(req->hdr, datalen); + /* + * Copy the request header into the tx header + * because the HW needs a cacheline-aligned + * address. + * This copy can be optimized out if the hdr + * member of user_sdma_request were also + * cacheline aligned. + */ + memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr)); + if (PBC2LRH(pbclen) != lrhlen) { + pbclen = (pbclen & 0xf000) | + LRH2PBC(lrhlen); + tx->hdr.pbc[0] = cpu_to_le16(pbclen); + } + ret = sdma_txinit_ahg(&tx->txreq, + SDMA_TXREQ_F_AHG_COPY, + sizeof(tx->hdr) + datalen, + req->ahg_idx, 0, NULL, 0, + user_sdma_txreq_cb); + if (ret) + goto free_tx; + ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq, + &tx->hdr, + sizeof(tx->hdr)); + if (ret) + goto free_txreq; + } else { + int changes; + + changes = set_txreq_header_ahg(req, tx, + datalen); + if (changes < 0) + goto free_tx; + sdma_txinit_ahg(&tx->txreq, + SDMA_TXREQ_F_USE_AHG, + datalen, req->ahg_idx, changes, + req->ahg, sizeof(req->hdr), + user_sdma_txreq_cb); + } + } else { + ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) + + datalen, user_sdma_txreq_cb); + if (ret) + goto free_tx; + /* + * Modify the header for this packet. This only needs + * to be done if we are not going to use AHG. Otherwise, + * the HW will do it based on the changes we gave it + * during sdma_txinit_ahg(). + */ + ret = set_txreq_header(req, tx, datalen); + if (ret) + goto free_txreq; + } + + /* + * If the request contains any data vectors, add up to + * fragsize bytes to the descriptor. + */ + while (queued < datalen && + (req->sent + data_sent) < req->data_len) { + unsigned long base, offset; + unsigned pageidx, len; + + base = (unsigned long)iovec->iov.iov_base; + offset = offset_in_page(base + iovec->offset + + iov_offset); + pageidx = (((iovec->offset + iov_offset + + base) - (base & PAGE_MASK)) >> PAGE_SHIFT); + len = offset + req->info.fragsize > PAGE_SIZE ? + PAGE_SIZE - offset : req->info.fragsize; + len = min((datalen - queued), len); + ret = sdma_txadd_page(pq->dd, &tx->txreq, + iovec->pages[pageidx], + offset, len); + if (ret) { + SDMA_DBG(req, "SDMA txreq add page failed %d\n", + ret); + goto free_txreq; + } + iov_offset += len; + queued += len; + data_sent += len; + if (unlikely(queued < datalen && + pageidx == iovec->npages && + req->iov_idx < req->data_iovs - 1)) { + iovec->offset += iov_offset; + iovec = &req->iovs[++req->iov_idx]; + iov_offset = 0; + } + } + /* + * The txreq was submitted successfully so we can update + * the counters. + */ + req->koffset += datalen; + if (req_opcode(req->info.ctrl) == EXPECTED) + req->tidoffset += datalen; + req->sent += data_sent; + if (req->data_len) + iovec->offset += iov_offset; + list_add_tail(&tx->txreq.list, &req->txps); + /* + * It is important to increment this here as it is used to + * generate the BTH.PSN and, therefore, can't be bulk-updated + * outside of the loop. + */ + tx->seqnum = req->seqnum++; + npkts++; + } +dosend: + ret = sdma_send_txlist(req->sde, &pq->busy, &req->txps); + if (list_empty(&req->txps)) { + req->seqsubmitted = req->seqnum; + if (req->seqnum == req->info.npkts) { + set_bit(SDMA_REQ_SEND_DONE, &req->flags); + /* + * The txreq has already been submitted to the HW queue + * so we can free the AHG entry now. Corruption will not + * happen due to the sequential manner in which + * descriptors are processed. + */ + if (test_bit(SDMA_REQ_HAVE_AHG, &req->flags)) + sdma_ahg_free(req->sde, req->ahg_idx); + } + } else if (ret > 0) { + req->seqsubmitted += ret; + ret = 0; + } + return ret; + +free_txreq: + sdma_txclean(pq->dd, &tx->txreq); +free_tx: + kmem_cache_free(pq->txreq_cache, tx); + return ret; +} + +/* + * How many pages in this iovec element? + */ +static inline int num_user_pages(const struct iovec *iov) +{ + const unsigned long addr = (unsigned long)iov->iov_base; + const unsigned long len = iov->iov_len; + const unsigned long spage = addr & PAGE_MASK; + const unsigned long epage = (addr + len - 1) & PAGE_MASK; + + return 1 + ((epage - spage) >> PAGE_SHIFT); +} + +static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages) +{ + u32 cleared = 0; + struct sdma_mmu_node *node, *ptr; + struct list_head to_evict = LIST_HEAD_INIT(to_evict); + + spin_lock(&pq->evict_lock); + list_for_each_entry_safe_reverse(node, ptr, &pq->evict, list) { + /* Make sure that no one is still using the node. */ + if (!atomic_read(&node->refcount)) { + set_bit(SDMA_CACHE_NODE_EVICT, &node->flags); + list_del_init(&node->list); + list_add(&node->list, &to_evict); + cleared += node->npages; + if (cleared >= npages) + break; + } + } + spin_unlock(&pq->evict_lock); + + list_for_each_entry_safe(node, ptr, &to_evict, list) + hfi1_mmu_rb_remove(&pq->sdma_rb_root, &node->rb); + + return cleared; +} + +static int pin_vector_pages(struct user_sdma_request *req, + struct user_sdma_iovec *iovec) { + int ret = 0, pinned, npages, cleared; + struct page **pages; + struct hfi1_user_sdma_pkt_q *pq = req->pq; + struct sdma_mmu_node *node = NULL; + struct mmu_rb_node *rb_node; + + rb_node = hfi1_mmu_rb_extract(&pq->sdma_rb_root, + (unsigned long)iovec->iov.iov_base, + iovec->iov.iov_len); + if (rb_node && !IS_ERR(rb_node)) + node = container_of(rb_node, struct sdma_mmu_node, rb); + else + rb_node = NULL; + + if (!node) { + node = kzalloc(sizeof(*node), GFP_KERNEL); + if (!node) + return -ENOMEM; + + node->rb.addr = (unsigned long)iovec->iov.iov_base; + node->pq = pq; + atomic_set(&node->refcount, 0); + INIT_LIST_HEAD(&node->list); + } + + npages = num_user_pages(&iovec->iov); + if (node->npages < npages) { + pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL); + if (!pages) { + SDMA_DBG(req, "Failed page array alloc"); + ret = -ENOMEM; + goto bail; + } + memcpy(pages, node->pages, node->npages * sizeof(*pages)); + + npages -= node->npages; + + /* + * If rb_node is NULL, it means that this is brand new node + * and, therefore not on the eviction list. + * If, however, the rb_node is non-NULL, it means that the + * node is already in RB tree and, therefore on the eviction + * list (nodes are unconditionally inserted in the eviction + * list). In that case, we have to remove the node prior to + * calling the eviction function in order to prevent it from + * freeing this node. + */ + if (rb_node) { + spin_lock(&pq->evict_lock); + list_del_init(&node->list); + spin_unlock(&pq->evict_lock); + } +retry: + if (!hfi1_can_pin_pages(pq->dd, pq->n_locked, npages)) { + cleared = sdma_cache_evict(pq, npages); + if (cleared >= npages) + goto retry; + } + pinned = hfi1_acquire_user_pages( + ((unsigned long)iovec->iov.iov_base + + (node->npages * PAGE_SIZE)), npages, 0, + pages + node->npages); + if (pinned < 0) { + kfree(pages); + ret = pinned; + goto bail; + } + if (pinned != npages) { + unpin_vector_pages(current->mm, pages, node->npages, + pinned); + ret = -EFAULT; + goto bail; + } + kfree(node->pages); + node->rb.len = iovec->iov.iov_len; + node->pages = pages; + node->npages += pinned; + npages = node->npages; + spin_lock(&pq->evict_lock); + list_add(&node->list, &pq->evict); + pq->n_locked += pinned; + spin_unlock(&pq->evict_lock); + } + iovec->pages = node->pages; + iovec->npages = npages; + iovec->node = node; + + ret = hfi1_mmu_rb_insert(&req->pq->sdma_rb_root, &node->rb); + if (ret) { + spin_lock(&pq->evict_lock); + if (!list_empty(&node->list)) + list_del(&node->list); + pq->n_locked -= node->npages; + spin_unlock(&pq->evict_lock); + goto bail; + } + return 0; +bail: + if (rb_node) + unpin_vector_pages(current->mm, node->pages, 0, node->npages); + kfree(node); + return ret; +} + +static void unpin_vector_pages(struct mm_struct *mm, struct page **pages, + unsigned start, unsigned npages) +{ + hfi1_release_user_pages(mm, pages + start, npages, 0); + kfree(pages); +} + +static int check_header_template(struct user_sdma_request *req, + struct hfi1_pkt_header *hdr, u32 lrhlen, + u32 datalen) +{ + /* + * Perform safety checks for any type of packet: + * - transfer size is multiple of 64bytes + * - packet length is multiple of 4bytes + * - entire request length is multiple of 4bytes + * - packet length is not larger than MTU size + * + * These checks are only done for the first packet of the + * transfer since the header is "given" to us by user space. + * For the remainder of the packets we compute the values. + */ + if (req->info.fragsize % PIO_BLOCK_SIZE || + lrhlen & 0x3 || req->data_len & 0x3 || + lrhlen > get_lrh_len(*hdr, req->info.fragsize)) + return -EINVAL; + + if (req_opcode(req->info.ctrl) == EXPECTED) { + /* + * The header is checked only on the first packet. Furthermore, + * we ensure that at least one TID entry is copied when the + * request is submitted. Therefore, we don't have to verify that + * tididx points to something sane. + */ + u32 tidval = req->tids[req->tididx], + tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE, + tididx = EXP_TID_GET(tidval, IDX), + tidctrl = EXP_TID_GET(tidval, CTRL), + tidoff; + __le32 kval = hdr->kdeth.ver_tid_offset; + + tidoff = KDETH_GET(kval, OFFSET) * + (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ? + KDETH_OM_LARGE : KDETH_OM_SMALL); + /* + * Expected receive packets have the following + * additional checks: + * - offset is not larger than the TID size + * - TIDCtrl values match between header and TID array + * - TID indexes match between header and TID array + */ + if ((tidoff + datalen > tidlen) || + KDETH_GET(kval, TIDCTRL) != tidctrl || + KDETH_GET(kval, TID) != tididx) + return -EINVAL; + } + return 0; +} + +/* + * Correctly set the BTH.PSN field based on type of + * transfer - eager packets can just increment the PSN but + * expected packets encode generation and sequence in the + * BTH.PSN field so just incrementing will result in errors. + */ +static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags) +{ + u32 val = be32_to_cpu(bthpsn), + mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull : + 0xffffffull), + psn = val & mask; + if (expct) + psn = (psn & ~BTH_SEQ_MASK) | ((psn + frags) & BTH_SEQ_MASK); + else + psn = psn + frags; + return psn & mask; +} + +static int set_txreq_header(struct user_sdma_request *req, + struct user_sdma_txreq *tx, u32 datalen) +{ + struct hfi1_user_sdma_pkt_q *pq = req->pq; + struct hfi1_pkt_header *hdr = &tx->hdr; + u16 pbclen; + int ret; + u32 tidval = 0, lrhlen = get_lrh_len(*hdr, datalen); + + /* Copy the header template to the request before modification */ + memcpy(hdr, &req->hdr, sizeof(*hdr)); + + /* + * Check if the PBC and LRH length are mismatched. If so + * adjust both in the header. + */ + pbclen = le16_to_cpu(hdr->pbc[0]); + if (PBC2LRH(pbclen) != lrhlen) { + pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen); + hdr->pbc[0] = cpu_to_le16(pbclen); + hdr->lrh[2] = cpu_to_be16(lrhlen >> 2); + /* + * Third packet + * This is the first packet in the sequence that has + * a "static" size that can be used for the rest of + * the packets (besides the last one). + */ + if (unlikely(req->seqnum == 2)) { + /* + * From this point on the lengths in both the + * PBC and LRH are the same until the last + * packet. + * Adjust the template so we don't have to update + * every packet + */ + req->hdr.pbc[0] = hdr->pbc[0]; + req->hdr.lrh[2] = hdr->lrh[2]; + } + } + /* + * We only have to modify the header if this is not the + * first packet in the request. Otherwise, we use the + * header given to us. + */ + if (unlikely(!req->seqnum)) { + ret = check_header_template(req, hdr, lrhlen, datalen); + if (ret) + return ret; + goto done; + } + + hdr->bth[2] = cpu_to_be32( + set_pkt_bth_psn(hdr->bth[2], + (req_opcode(req->info.ctrl) == EXPECTED), + req->seqnum)); + + /* Set ACK request on last packet */ + if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT)) + hdr->bth[2] |= cpu_to_be32(1UL << 31); + + /* Set the new offset */ + hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset); + /* Expected packets have to fill in the new TID information */ + if (req_opcode(req->info.ctrl) == EXPECTED) { + tidval = req->tids[req->tididx]; + /* + * If the offset puts us at the end of the current TID, + * advance everything. + */ + if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) * + PAGE_SIZE)) { + req->tidoffset = 0; + /* + * Since we don't copy all the TIDs, all at once, + * we have to check again. + */ + if (++req->tididx > req->n_tids - 1 || + !req->tids[req->tididx]) { + return -EINVAL; + } + tidval = req->tids[req->tididx]; + } + req->omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >= + KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE : KDETH_OM_SMALL; + /* Set KDETH.TIDCtrl based on value for this TID. */ + KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL, + EXP_TID_GET(tidval, CTRL)); + /* Set KDETH.TID based on value for this TID */ + KDETH_SET(hdr->kdeth.ver_tid_offset, TID, + EXP_TID_GET(tidval, IDX)); + /* Clear KDETH.SH only on the last packet */ + if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT)) + KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0); + /* + * Set the KDETH.OFFSET and KDETH.OM based on size of + * transfer. + */ + SDMA_DBG(req, "TID offset %ubytes %uunits om%u", + req->tidoffset, req->tidoffset / req->omfactor, + req->omfactor != KDETH_OM_SMALL); + KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET, + req->tidoffset / req->omfactor); + KDETH_SET(hdr->kdeth.ver_tid_offset, OM, + req->omfactor != KDETH_OM_SMALL); + } +done: + trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt, + req->info.comp_idx, hdr, tidval); + return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr)); +} + +static int set_txreq_header_ahg(struct user_sdma_request *req, + struct user_sdma_txreq *tx, u32 len) +{ + int diff = 0; + struct hfi1_user_sdma_pkt_q *pq = req->pq; + struct hfi1_pkt_header *hdr = &req->hdr; + u16 pbclen = le16_to_cpu(hdr->pbc[0]); + u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, len); + + if (PBC2LRH(pbclen) != lrhlen) { + /* PBC.PbcLengthDWs */ + AHG_HEADER_SET(req->ahg, diff, 0, 0, 12, + cpu_to_le16(LRH2PBC(lrhlen))); + /* LRH.PktLen (we need the full 16 bits due to byte swap) */ + AHG_HEADER_SET(req->ahg, diff, 3, 0, 16, + cpu_to_be16(lrhlen >> 2)); + } + + /* + * Do the common updates + */ + /* BTH.PSN and BTH.A */ + val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) & + (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff); + if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT)) + val32 |= 1UL << 31; + AHG_HEADER_SET(req->ahg, diff, 6, 0, 16, cpu_to_be16(val32 >> 16)); + AHG_HEADER_SET(req->ahg, diff, 6, 16, 16, cpu_to_be16(val32 & 0xffff)); + /* KDETH.Offset */ + AHG_HEADER_SET(req->ahg, diff, 15, 0, 16, + cpu_to_le16(req->koffset & 0xffff)); + AHG_HEADER_SET(req->ahg, diff, 15, 16, 16, + cpu_to_le16(req->koffset >> 16)); + if (req_opcode(req->info.ctrl) == EXPECTED) { + __le16 val; + + tidval = req->tids[req->tididx]; + + /* + * If the offset puts us at the end of the current TID, + * advance everything. + */ + if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) * + PAGE_SIZE)) { + req->tidoffset = 0; + /* + * Since we don't copy all the TIDs, all at once, + * we have to check again. + */ + if (++req->tididx > req->n_tids - 1 || + !req->tids[req->tididx]) { + return -EINVAL; + } + tidval = req->tids[req->tididx]; + } + req->omfactor = ((EXP_TID_GET(tidval, LEN) * + PAGE_SIZE) >= + KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE : + KDETH_OM_SMALL; + /* KDETH.OM and KDETH.OFFSET (TID) */ + AHG_HEADER_SET(req->ahg, diff, 7, 0, 16, + ((!!(req->omfactor - KDETH_OM_SMALL)) << 15 | + ((req->tidoffset / req->omfactor) & 0x7fff))); + /* KDETH.TIDCtrl, KDETH.TID */ + val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) | + (EXP_TID_GET(tidval, IDX) & 0x3ff)); + /* Clear KDETH.SH on last packet */ + if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT)) { + val |= cpu_to_le16(KDETH_GET(hdr->kdeth.ver_tid_offset, + INTR) >> 16); + val &= cpu_to_le16(~(1U << 13)); + AHG_HEADER_SET(req->ahg, diff, 7, 16, 14, val); + } else { + AHG_HEADER_SET(req->ahg, diff, 7, 16, 12, val); + } + } + + trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt, + req->info.comp_idx, req->sde->this_idx, + req->ahg_idx, req->ahg, diff, tidval); + return diff; +} + +/* + * SDMA tx request completion callback. Called when the SDMA progress + * state machine gets notification that the SDMA descriptors for this + * tx request have been processed by the DMA engine. Called in + * interrupt context. + */ +static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status) +{ + struct user_sdma_txreq *tx = + container_of(txreq, struct user_sdma_txreq, txreq); + struct user_sdma_request *req; + struct hfi1_user_sdma_pkt_q *pq; + struct hfi1_user_sdma_comp_q *cq; + u16 idx; + + if (!tx->req) + return; + + req = tx->req; + pq = req->pq; + cq = req->cq; + + if (status != SDMA_TXREQ_S_OK) { + SDMA_DBG(req, "SDMA completion with error %d", + status); + set_bit(SDMA_REQ_HAS_ERROR, &req->flags); + } + + req->seqcomp = tx->seqnum; + kmem_cache_free(pq->txreq_cache, tx); + tx = NULL; + + idx = req->info.comp_idx; + if (req->status == -1 && status == SDMA_TXREQ_S_OK) { + if (req->seqcomp == req->info.npkts - 1) { + req->status = 0; + user_sdma_free_request(req, false); + pq_update(pq); + set_comp_state(pq, cq, idx, COMPLETE, 0); + } + } else { + if (status != SDMA_TXREQ_S_OK) + req->status = status; + if (req->seqcomp == (ACCESS_ONCE(req->seqsubmitted) - 1) && + (test_bit(SDMA_REQ_SEND_DONE, &req->flags) || + test_bit(SDMA_REQ_DONE_ERROR, &req->flags))) { + user_sdma_free_request(req, false); + pq_update(pq); + set_comp_state(pq, cq, idx, ERROR, req->status); + } + } +} + +static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq) +{ + if (atomic_dec_and_test(&pq->n_reqs)) { + xchg(&pq->state, SDMA_PKT_Q_INACTIVE); + wake_up(&pq->wait); + } +} + +static void user_sdma_free_request(struct user_sdma_request *req, bool unpin) +{ + if (!list_empty(&req->txps)) { + struct sdma_txreq *t, *p; + + list_for_each_entry_safe(t, p, &req->txps, list) { + struct user_sdma_txreq *tx = + container_of(t, struct user_sdma_txreq, txreq); + list_del_init(&t->list); + sdma_txclean(req->pq->dd, t); + kmem_cache_free(req->pq->txreq_cache, tx); + } + } + if (req->data_iovs) { + struct sdma_mmu_node *node; + int i; + + for (i = 0; i < req->data_iovs; i++) { + node = req->iovs[i].node; + if (!node) + continue; + + if (unpin) + hfi1_mmu_rb_remove(&req->pq->sdma_rb_root, + &node->rb); + else + atomic_dec(&node->refcount); + } + } + kfree(req->tids); + clear_bit(SDMA_REQ_IN_USE, &req->flags); +} + +static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq, + struct hfi1_user_sdma_comp_q *cq, + u16 idx, enum hfi1_sdma_comp_state state, + int ret) +{ + hfi1_cdbg(SDMA, "[%u:%u:%u:%u] Setting completion status %u %d", + pq->dd->unit, pq->ctxt, pq->subctxt, idx, state, ret); + cq->comps[idx].status = state; + if (state == ERROR) + cq->comps[idx].errcode = -ret; + trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt, + idx, state, ret); +} + +static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr, + unsigned long len) +{ + return (bool)(node->addr == addr); +} + +static int sdma_rb_insert(struct rb_root *root, struct mmu_rb_node *mnode) +{ + struct sdma_mmu_node *node = + container_of(mnode, struct sdma_mmu_node, rb); + + atomic_inc(&node->refcount); + return 0; +} + +static void sdma_rb_remove(struct rb_root *root, struct mmu_rb_node *mnode, + struct mm_struct *mm) +{ + struct sdma_mmu_node *node = + container_of(mnode, struct sdma_mmu_node, rb); + + spin_lock(&node->pq->evict_lock); + /* + * We've been called by the MMU notifier but this node has been + * scheduled for eviction. The eviction function will take care + * of freeing this node. + * We have to take the above lock first because we are racing + * against the setting of the bit in the eviction function. + */ + if (mm && test_bit(SDMA_CACHE_NODE_EVICT, &node->flags)) { + spin_unlock(&node->pq->evict_lock); + return; + } + + if (!list_empty(&node->list)) + list_del(&node->list); + node->pq->n_locked -= node->npages; + spin_unlock(&node->pq->evict_lock); + + /* + * If mm is set, we are being called by the MMU notifier and we + * should not pass a mm_struct to unpin_vector_page(). This is to + * prevent a deadlock when hfi1_release_user_pages() attempts to + * take the mmap_sem, which the MMU notifier has already taken. + */ + unpin_vector_pages(mm ? NULL : current->mm, node->pages, 0, + node->npages); + /* + * If called by the MMU notifier, we have to adjust the pinned + * page count ourselves. + */ + if (mm) + mm->pinned_vm -= node->npages; + kfree(node); +} + +static int sdma_rb_invalidate(struct rb_root *root, struct mmu_rb_node *mnode) +{ + struct sdma_mmu_node *node = + container_of(mnode, struct sdma_mmu_node, rb); + + if (!atomic_read(&node->refcount)) + return 1; + return 0; +} diff --git a/drivers/infiniband/hw/hfi1/user_sdma.h b/drivers/infiniband/hw/hfi1/user_sdma.h new file mode 100644 index 000000000..b9240e351 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/user_sdma.h @@ -0,0 +1,84 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#include <linux/device.h> +#include <linux/wait.h> + +#include "common.h" +#include "iowait.h" +#include "user_exp_rcv.h" + +extern uint extended_psn; + +struct hfi1_user_sdma_pkt_q { + struct list_head list; + unsigned ctxt; + unsigned subctxt; + u16 n_max_reqs; + atomic_t n_reqs; + u16 reqidx; + struct hfi1_devdata *dd; + struct kmem_cache *txreq_cache; + struct user_sdma_request *reqs; + struct iowait busy; + unsigned state; + wait_queue_head_t wait; + unsigned long unpinned; + struct rb_root sdma_rb_root; + u32 n_locked; + struct list_head evict; + spinlock_t evict_lock; /* protect evict and n_locked */ +}; + +struct hfi1_user_sdma_comp_q { + u16 nentries; + struct hfi1_sdma_comp_entry *comps; +}; + +int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *, struct file *); +int hfi1_user_sdma_free_queues(struct hfi1_filedata *); +int hfi1_user_sdma_process_request(struct file *, struct iovec *, unsigned long, + unsigned long *); diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c new file mode 100644 index 000000000..849c4b939 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -0,0 +1,1764 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <rdma/ib_mad.h> +#include <rdma/ib_user_verbs.h> +#include <linux/io.h> +#include <linux/module.h> +#include <linux/utsname.h> +#include <linux/rculist.h> +#include <linux/mm.h> +#include <linux/vmalloc.h> + +#include "hfi.h" +#include "common.h" +#include "device.h" +#include "trace.h" +#include "qp.h" +#include "verbs_txreq.h" + +static unsigned int hfi1_lkey_table_size = 16; +module_param_named(lkey_table_size, hfi1_lkey_table_size, uint, + S_IRUGO); +MODULE_PARM_DESC(lkey_table_size, + "LKEY table size in bits (2^n, 1 <= n <= 23)"); + +static unsigned int hfi1_max_pds = 0xFFFF; +module_param_named(max_pds, hfi1_max_pds, uint, S_IRUGO); +MODULE_PARM_DESC(max_pds, + "Maximum number of protection domains to support"); + +static unsigned int hfi1_max_ahs = 0xFFFF; +module_param_named(max_ahs, hfi1_max_ahs, uint, S_IRUGO); +MODULE_PARM_DESC(max_ahs, "Maximum number of address handles to support"); + +unsigned int hfi1_max_cqes = 0x2FFFF; +module_param_named(max_cqes, hfi1_max_cqes, uint, S_IRUGO); +MODULE_PARM_DESC(max_cqes, + "Maximum number of completion queue entries to support"); + +unsigned int hfi1_max_cqs = 0x1FFFF; +module_param_named(max_cqs, hfi1_max_cqs, uint, S_IRUGO); +MODULE_PARM_DESC(max_cqs, "Maximum number of completion queues to support"); + +unsigned int hfi1_max_qp_wrs = 0x3FFF; +module_param_named(max_qp_wrs, hfi1_max_qp_wrs, uint, S_IRUGO); +MODULE_PARM_DESC(max_qp_wrs, "Maximum number of QP WRs to support"); + +unsigned int hfi1_max_qps = 16384; +module_param_named(max_qps, hfi1_max_qps, uint, S_IRUGO); +MODULE_PARM_DESC(max_qps, "Maximum number of QPs to support"); + +unsigned int hfi1_max_sges = 0x60; +module_param_named(max_sges, hfi1_max_sges, uint, S_IRUGO); +MODULE_PARM_DESC(max_sges, "Maximum number of SGEs to support"); + +unsigned int hfi1_max_mcast_grps = 16384; +module_param_named(max_mcast_grps, hfi1_max_mcast_grps, uint, S_IRUGO); +MODULE_PARM_DESC(max_mcast_grps, + "Maximum number of multicast groups to support"); + +unsigned int hfi1_max_mcast_qp_attached = 16; +module_param_named(max_mcast_qp_attached, hfi1_max_mcast_qp_attached, + uint, S_IRUGO); +MODULE_PARM_DESC(max_mcast_qp_attached, + "Maximum number of attached QPs to support"); + +unsigned int hfi1_max_srqs = 1024; +module_param_named(max_srqs, hfi1_max_srqs, uint, S_IRUGO); +MODULE_PARM_DESC(max_srqs, "Maximum number of SRQs to support"); + +unsigned int hfi1_max_srq_sges = 128; +module_param_named(max_srq_sges, hfi1_max_srq_sges, uint, S_IRUGO); +MODULE_PARM_DESC(max_srq_sges, "Maximum number of SRQ SGEs to support"); + +unsigned int hfi1_max_srq_wrs = 0x1FFFF; +module_param_named(max_srq_wrs, hfi1_max_srq_wrs, uint, S_IRUGO); +MODULE_PARM_DESC(max_srq_wrs, "Maximum number of SRQ WRs support"); + +unsigned short piothreshold = 256; +module_param(piothreshold, ushort, S_IRUGO); +MODULE_PARM_DESC(piothreshold, "size used to determine sdma vs. pio"); + +#define COPY_CACHELESS 1 +#define COPY_ADAPTIVE 2 +static unsigned int sge_copy_mode; +module_param(sge_copy_mode, uint, S_IRUGO); +MODULE_PARM_DESC(sge_copy_mode, + "Verbs copy mode: 0 use memcpy, 1 use cacheless copy, 2 adapt based on WSS"); + +static void verbs_sdma_complete( + struct sdma_txreq *cookie, + int status); + +static int pio_wait(struct rvt_qp *qp, + struct send_context *sc, + struct hfi1_pkt_state *ps, + u32 flag); + +/* Length of buffer to create verbs txreq cache name */ +#define TXREQ_NAME_LEN 24 + +static uint wss_threshold; +module_param(wss_threshold, uint, S_IRUGO); +MODULE_PARM_DESC(wss_threshold, "Percentage (1-100) of LLC to use as a threshold for a cacheless copy"); +static uint wss_clean_period = 256; +module_param(wss_clean_period, uint, S_IRUGO); +MODULE_PARM_DESC(wss_clean_period, "Count of verbs copies before an entry in the page copy table is cleaned"); + +/* memory working set size */ +struct hfi1_wss { + unsigned long *entries; + atomic_t total_count; + atomic_t clean_counter; + atomic_t clean_entry; + + int threshold; + int num_entries; + long pages_mask; +}; + +static struct hfi1_wss wss; + +int hfi1_wss_init(void) +{ + long llc_size; + long llc_bits; + long table_size; + long table_bits; + + /* check for a valid percent range - default to 80 if none or invalid */ + if (wss_threshold < 1 || wss_threshold > 100) + wss_threshold = 80; + /* reject a wildly large period */ + if (wss_clean_period > 1000000) + wss_clean_period = 256; + /* reject a zero period */ + if (wss_clean_period == 0) + wss_clean_period = 1; + + /* + * Calculate the table size - the next power of 2 larger than the + * LLC size. LLC size is in KiB. + */ + llc_size = wss_llc_size() * 1024; + table_size = roundup_pow_of_two(llc_size); + + /* one bit per page in rounded up table */ + llc_bits = llc_size / PAGE_SIZE; + table_bits = table_size / PAGE_SIZE; + wss.pages_mask = table_bits - 1; + wss.num_entries = table_bits / BITS_PER_LONG; + + wss.threshold = (llc_bits * wss_threshold) / 100; + if (wss.threshold == 0) + wss.threshold = 1; + + atomic_set(&wss.clean_counter, wss_clean_period); + + wss.entries = kcalloc(wss.num_entries, sizeof(*wss.entries), + GFP_KERNEL); + if (!wss.entries) { + hfi1_wss_exit(); + return -ENOMEM; + } + + return 0; +} + +void hfi1_wss_exit(void) +{ + /* coded to handle partially initialized and repeat callers */ + kfree(wss.entries); + wss.entries = NULL; +} + +/* + * Advance the clean counter. When the clean period has expired, + * clean an entry. + * + * This is implemented in atomics to avoid locking. Because multiple + * variables are involved, it can be racy which can lead to slightly + * inaccurate information. Since this is only a heuristic, this is + * OK. Any innaccuracies will clean themselves out as the counter + * advances. That said, it is unlikely the entry clean operation will + * race - the next possible racer will not start until the next clean + * period. + * + * The clean counter is implemented as a decrement to zero. When zero + * is reached an entry is cleaned. + */ +static void wss_advance_clean_counter(void) +{ + int entry; + int weight; + unsigned long bits; + + /* become the cleaner if we decrement the counter to zero */ + if (atomic_dec_and_test(&wss.clean_counter)) { + /* + * Set, not add, the clean period. This avoids an issue + * where the counter could decrement below the clean period. + * Doing a set can result in lost decrements, slowing the + * clean advance. Since this a heuristic, this possible + * slowdown is OK. + * + * An alternative is to loop, advancing the counter by a + * clean period until the result is > 0. However, this could + * lead to several threads keeping another in the clean loop. + * This could be mitigated by limiting the number of times + * we stay in the loop. + */ + atomic_set(&wss.clean_counter, wss_clean_period); + + /* + * Uniquely grab the entry to clean and move to next. + * The current entry is always the lower bits of + * wss.clean_entry. The table size, wss.num_entries, + * is always a power-of-2. + */ + entry = (atomic_inc_return(&wss.clean_entry) - 1) + & (wss.num_entries - 1); + + /* clear the entry and count the bits */ + bits = xchg(&wss.entries[entry], 0); + weight = hweight64((u64)bits); + /* only adjust the contended total count if needed */ + if (weight) + atomic_sub(weight, &wss.total_count); + } +} + +/* + * Insert the given address into the working set array. + */ +static void wss_insert(void *address) +{ + u32 page = ((unsigned long)address >> PAGE_SHIFT) & wss.pages_mask; + u32 entry = page / BITS_PER_LONG; /* assumes this ends up a shift */ + u32 nr = page & (BITS_PER_LONG - 1); + + if (!test_and_set_bit(nr, &wss.entries[entry])) + atomic_inc(&wss.total_count); + + wss_advance_clean_counter(); +} + +/* + * Is the working set larger than the threshold? + */ +static inline int wss_exceeds_threshold(void) +{ + return atomic_read(&wss.total_count) >= wss.threshold; +} + +/* + * Translate ib_wr_opcode into ib_wc_opcode. + */ +const enum ib_wc_opcode ib_hfi1_wc_opcode[] = { + [IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE, + [IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE, + [IB_WR_SEND] = IB_WC_SEND, + [IB_WR_SEND_WITH_IMM] = IB_WC_SEND, + [IB_WR_RDMA_READ] = IB_WC_RDMA_READ, + [IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP, + [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD +}; + +/* + * Length of header by opcode, 0 --> not supported + */ +const u8 hdr_len_by_opcode[256] = { + /* RC */ + [IB_OPCODE_RC_SEND_FIRST] = 12 + 8, + [IB_OPCODE_RC_SEND_MIDDLE] = 12 + 8, + [IB_OPCODE_RC_SEND_LAST] = 12 + 8, + [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE] = 12 + 8 + 4, + [IB_OPCODE_RC_SEND_ONLY] = 12 + 8, + [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE] = 12 + 8 + 4, + [IB_OPCODE_RC_RDMA_WRITE_FIRST] = 12 + 8 + 16, + [IB_OPCODE_RC_RDMA_WRITE_MIDDLE] = 12 + 8, + [IB_OPCODE_RC_RDMA_WRITE_LAST] = 12 + 8, + [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = 12 + 8 + 4, + [IB_OPCODE_RC_RDMA_WRITE_ONLY] = 12 + 8 + 16, + [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = 12 + 8 + 20, + [IB_OPCODE_RC_RDMA_READ_REQUEST] = 12 + 8 + 16, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST] = 12 + 8 + 4, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE] = 12 + 8, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST] = 12 + 8 + 4, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY] = 12 + 8 + 4, + [IB_OPCODE_RC_ACKNOWLEDGE] = 12 + 8 + 4, + [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE] = 12 + 8 + 4, + [IB_OPCODE_RC_COMPARE_SWAP] = 12 + 8 + 28, + [IB_OPCODE_RC_FETCH_ADD] = 12 + 8 + 28, + [IB_OPCODE_RC_SEND_LAST_WITH_INVALIDATE] = 12 + 8 + 4, + [IB_OPCODE_RC_SEND_ONLY_WITH_INVALIDATE] = 12 + 8 + 4, + /* UC */ + [IB_OPCODE_UC_SEND_FIRST] = 12 + 8, + [IB_OPCODE_UC_SEND_MIDDLE] = 12 + 8, + [IB_OPCODE_UC_SEND_LAST] = 12 + 8, + [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE] = 12 + 8 + 4, + [IB_OPCODE_UC_SEND_ONLY] = 12 + 8, + [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE] = 12 + 8 + 4, + [IB_OPCODE_UC_RDMA_WRITE_FIRST] = 12 + 8 + 16, + [IB_OPCODE_UC_RDMA_WRITE_MIDDLE] = 12 + 8, + [IB_OPCODE_UC_RDMA_WRITE_LAST] = 12 + 8, + [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = 12 + 8 + 4, + [IB_OPCODE_UC_RDMA_WRITE_ONLY] = 12 + 8 + 16, + [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = 12 + 8 + 20, + /* UD */ + [IB_OPCODE_UD_SEND_ONLY] = 12 + 8 + 8, + [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE] = 12 + 8 + 12 +}; + +static const opcode_handler opcode_handler_tbl[256] = { + /* RC */ + [IB_OPCODE_RC_SEND_FIRST] = &hfi1_rc_rcv, + [IB_OPCODE_RC_SEND_MIDDLE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_SEND_LAST] = &hfi1_rc_rcv, + [IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_SEND_ONLY] = &hfi1_rc_rcv, + [IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_WRITE_FIRST] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_WRITE_MIDDLE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_WRITE_LAST] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_WRITE_ONLY] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_READ_REQUEST] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST] = &hfi1_rc_rcv, + [IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY] = &hfi1_rc_rcv, + [IB_OPCODE_RC_ACKNOWLEDGE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE] = &hfi1_rc_rcv, + [IB_OPCODE_RC_COMPARE_SWAP] = &hfi1_rc_rcv, + [IB_OPCODE_RC_FETCH_ADD] = &hfi1_rc_rcv, + /* UC */ + [IB_OPCODE_UC_SEND_FIRST] = &hfi1_uc_rcv, + [IB_OPCODE_UC_SEND_MIDDLE] = &hfi1_uc_rcv, + [IB_OPCODE_UC_SEND_LAST] = &hfi1_uc_rcv, + [IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE] = &hfi1_uc_rcv, + [IB_OPCODE_UC_SEND_ONLY] = &hfi1_uc_rcv, + [IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE] = &hfi1_uc_rcv, + [IB_OPCODE_UC_RDMA_WRITE_FIRST] = &hfi1_uc_rcv, + [IB_OPCODE_UC_RDMA_WRITE_MIDDLE] = &hfi1_uc_rcv, + [IB_OPCODE_UC_RDMA_WRITE_LAST] = &hfi1_uc_rcv, + [IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE] = &hfi1_uc_rcv, + [IB_OPCODE_UC_RDMA_WRITE_ONLY] = &hfi1_uc_rcv, + [IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE] = &hfi1_uc_rcv, + /* UD */ + [IB_OPCODE_UD_SEND_ONLY] = &hfi1_ud_rcv, + [IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE] = &hfi1_ud_rcv, + /* CNP */ + [IB_OPCODE_CNP] = &hfi1_cnp_rcv +}; + +/* + * System image GUID. + */ +__be64 ib_hfi1_sys_image_guid; + +/** + * hfi1_copy_sge - copy data to SGE memory + * @ss: the SGE state + * @data: the data to copy + * @length: the length of the data + * @copy_last: do a separate copy of the last 8 bytes + */ +void hfi1_copy_sge( + struct rvt_sge_state *ss, + void *data, u32 length, + int release, + int copy_last) +{ + struct rvt_sge *sge = &ss->sge; + int in_last = 0; + int i; + int cacheless_copy = 0; + + if (sge_copy_mode == COPY_CACHELESS) { + cacheless_copy = length >= PAGE_SIZE; + } else if (sge_copy_mode == COPY_ADAPTIVE) { + if (length >= PAGE_SIZE) { + /* + * NOTE: this *assumes*: + * o The first vaddr is the dest. + * o If multiple pages, then vaddr is sequential. + */ + wss_insert(sge->vaddr); + if (length >= (2 * PAGE_SIZE)) + wss_insert(sge->vaddr + PAGE_SIZE); + + cacheless_copy = wss_exceeds_threshold(); + } else { + wss_advance_clean_counter(); + } + } + if (copy_last) { + if (length > 8) { + length -= 8; + } else { + copy_last = 0; + in_last = 1; + } + } + +again: + while (length) { + u32 len = sge->length; + + if (len > length) + len = length; + if (len > sge->sge_length) + len = sge->sge_length; + WARN_ON_ONCE(len == 0); + if (unlikely(in_last)) { + /* enforce byte transfer ordering */ + for (i = 0; i < len; i++) + ((u8 *)sge->vaddr)[i] = ((u8 *)data)[i]; + } else if (cacheless_copy) { + cacheless_memcpy(sge->vaddr, data, len); + } else { + memcpy(sge->vaddr, data, len); + } + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (release) + rvt_put_mr(sge->mr); + if (--ss->num_sge) + *sge = *ss->sg_list++; + } else if (sge->length == 0 && sge->mr->lkey) { + if (++sge->n >= RVT_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + data += len; + length -= len; + } + + if (copy_last) { + copy_last = 0; + in_last = 1; + length = 8; + goto again; + } +} + +/** + * hfi1_skip_sge - skip over SGE memory + * @ss: the SGE state + * @length: the number of bytes to skip + */ +void hfi1_skip_sge(struct rvt_sge_state *ss, u32 length, int release) +{ + struct rvt_sge *sge = &ss->sge; + + while (length) { + u32 len = sge->length; + + if (len > length) + len = length; + if (len > sge->sge_length) + len = sge->sge_length; + WARN_ON_ONCE(len == 0); + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (release) + rvt_put_mr(sge->mr); + if (--ss->num_sge) + *sge = *ss->sg_list++; + } else if (sge->length == 0 && sge->mr->lkey) { + if (++sge->n >= RVT_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + length -= len; + } +} + +/* + * Make sure the QP is ready and able to accept the given opcode. + */ +static inline int qp_ok(int opcode, struct hfi1_packet *packet) +{ + struct hfi1_ibport *ibp; + + if (!(ib_rvt_state_ops[packet->qp->state] & RVT_PROCESS_RECV_OK)) + goto dropit; + if (((opcode & RVT_OPCODE_QP_MASK) == packet->qp->allowed_ops) || + (opcode == IB_OPCODE_CNP)) + return 1; +dropit: + ibp = &packet->rcd->ppd->ibport_data; + ibp->rvp.n_pkt_drops++; + return 0; +} + +/** + * hfi1_ib_rcv - process an incoming packet + * @packet: data packet information + * + * This is called to process an incoming packet at interrupt level. + * + * Tlen is the length of the header + data + CRC in bytes. + */ +void hfi1_ib_rcv(struct hfi1_packet *packet) +{ + struct hfi1_ctxtdata *rcd = packet->rcd; + struct hfi1_ib_header *hdr = packet->hdr; + u32 tlen = packet->tlen; + struct hfi1_pportdata *ppd = rcd->ppd; + struct hfi1_ibport *ibp = &ppd->ibport_data; + struct rvt_dev_info *rdi = &ppd->dd->verbs_dev.rdi; + unsigned long flags; + u32 qp_num; + int lnh; + u8 opcode; + u16 lid; + + /* Check for GRH */ + lnh = be16_to_cpu(hdr->lrh[0]) & 3; + if (lnh == HFI1_LRH_BTH) { + packet->ohdr = &hdr->u.oth; + } else if (lnh == HFI1_LRH_GRH) { + u32 vtf; + + packet->ohdr = &hdr->u.l.oth; + if (hdr->u.l.grh.next_hdr != IB_GRH_NEXT_HDR) + goto drop; + vtf = be32_to_cpu(hdr->u.l.grh.version_tclass_flow); + if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION) + goto drop; + packet->rcv_flags |= HFI1_HAS_GRH; + } else { + goto drop; + } + + trace_input_ibhdr(rcd->dd, hdr); + + opcode = (be32_to_cpu(packet->ohdr->bth[0]) >> 24); + inc_opstats(tlen, &rcd->opstats->stats[opcode]); + + /* Get the destination QP number. */ + qp_num = be32_to_cpu(packet->ohdr->bth[1]) & RVT_QPN_MASK; + lid = be16_to_cpu(hdr->lrh[1]); + if (unlikely((lid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) && + (lid != be16_to_cpu(IB_LID_PERMISSIVE)))) { + struct rvt_mcast *mcast; + struct rvt_mcast_qp *p; + + if (lnh != HFI1_LRH_GRH) + goto drop; + mcast = rvt_mcast_find(&ibp->rvp, &hdr->u.l.grh.dgid); + if (!mcast) + goto drop; + list_for_each_entry_rcu(p, &mcast->qp_list, list) { + packet->qp = p->qp; + spin_lock_irqsave(&packet->qp->r_lock, flags); + if (likely((qp_ok(opcode, packet)))) + opcode_handler_tbl[opcode](packet); + spin_unlock_irqrestore(&packet->qp->r_lock, flags); + } + /* + * Notify rvt_multicast_detach() if it is waiting for us + * to finish. + */ + if (atomic_dec_return(&mcast->refcount) <= 1) + wake_up(&mcast->wait); + } else { + rcu_read_lock(); + packet->qp = rvt_lookup_qpn(rdi, &ibp->rvp, qp_num); + if (!packet->qp) { + rcu_read_unlock(); + goto drop; + } + spin_lock_irqsave(&packet->qp->r_lock, flags); + if (likely((qp_ok(opcode, packet)))) + opcode_handler_tbl[opcode](packet); + spin_unlock_irqrestore(&packet->qp->r_lock, flags); + rcu_read_unlock(); + } + return; + +drop: + ibp->rvp.n_pkt_drops++; +} + +/* + * This is called from a timer to check for QPs + * which need kernel memory in order to send a packet. + */ +static void mem_timer(unsigned long data) +{ + struct hfi1_ibdev *dev = (struct hfi1_ibdev *)data; + struct list_head *list = &dev->memwait; + struct rvt_qp *qp = NULL; + struct iowait *wait; + unsigned long flags; + struct hfi1_qp_priv *priv; + + write_seqlock_irqsave(&dev->iowait_lock, flags); + if (!list_empty(list)) { + wait = list_first_entry(list, struct iowait, list); + qp = iowait_to_qp(wait); + priv = qp->priv; + list_del_init(&priv->s_iowait.list); + /* refcount held until actual wake up */ + if (!list_empty(list)) + mod_timer(&dev->mem_timer, jiffies + 1); + } + write_sequnlock_irqrestore(&dev->iowait_lock, flags); + + if (qp) + hfi1_qp_wakeup(qp, RVT_S_WAIT_KMEM); +} + +void update_sge(struct rvt_sge_state *ss, u32 length) +{ + struct rvt_sge *sge = &ss->sge; + + sge->vaddr += length; + sge->length -= length; + sge->sge_length -= length; + if (sge->sge_length == 0) { + if (--ss->num_sge) + *sge = *ss->sg_list++; + } else if (sge->length == 0 && sge->mr->lkey) { + if (++sge->n >= RVT_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + return; + sge->n = 0; + } + sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = sge->mr->map[sge->m]->segs[sge->n].length; + } +} + +/* + * This is called with progress side lock held. + */ +/* New API */ +static void verbs_sdma_complete( + struct sdma_txreq *cookie, + int status) +{ + struct verbs_txreq *tx = + container_of(cookie, struct verbs_txreq, txreq); + struct rvt_qp *qp = tx->qp; + + spin_lock(&qp->s_lock); + if (tx->wqe) { + hfi1_send_complete(qp, tx->wqe, IB_WC_SUCCESS); + } else if (qp->ibqp.qp_type == IB_QPT_RC) { + struct hfi1_ib_header *hdr; + + hdr = &tx->phdr.hdr; + hfi1_rc_send_complete(qp, hdr); + } + spin_unlock(&qp->s_lock); + + hfi1_put_txreq(tx); +} + +static int wait_kmem(struct hfi1_ibdev *dev, + struct rvt_qp *qp, + struct hfi1_pkt_state *ps) +{ + struct hfi1_qp_priv *priv = qp->priv; + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&qp->s_lock, flags); + if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) { + write_seqlock(&dev->iowait_lock); + list_add_tail(&ps->s_txreq->txreq.list, + &priv->s_iowait.tx_head); + if (list_empty(&priv->s_iowait.list)) { + if (list_empty(&dev->memwait)) + mod_timer(&dev->mem_timer, jiffies + 1); + qp->s_flags |= RVT_S_WAIT_KMEM; + list_add_tail(&priv->s_iowait.list, &dev->memwait); + trace_hfi1_qpsleep(qp, RVT_S_WAIT_KMEM); + atomic_inc(&qp->refcount); + } + write_sequnlock(&dev->iowait_lock); + qp->s_flags &= ~RVT_S_BUSY; + ret = -EBUSY; + } + spin_unlock_irqrestore(&qp->s_lock, flags); + + return ret; +} + +/* + * This routine calls txadds for each sg entry. + * + * Add failures will revert the sge cursor + */ +static noinline int build_verbs_ulp_payload( + struct sdma_engine *sde, + struct rvt_sge_state *ss, + u32 length, + struct verbs_txreq *tx) +{ + struct rvt_sge *sg_list = ss->sg_list; + struct rvt_sge sge = ss->sge; + u8 num_sge = ss->num_sge; + u32 len; + int ret = 0; + + while (length) { + len = ss->sge.length; + if (len > length) + len = length; + if (len > ss->sge.sge_length) + len = ss->sge.sge_length; + WARN_ON_ONCE(len == 0); + ret = sdma_txadd_kvaddr( + sde->dd, + &tx->txreq, + ss->sge.vaddr, + len); + if (ret) + goto bail_txadd; + update_sge(ss, len); + length -= len; + } + return ret; +bail_txadd: + /* unwind cursor */ + ss->sge = sge; + ss->num_sge = num_sge; + ss->sg_list = sg_list; + return ret; +} + +/* + * Build the number of DMA descriptors needed to send length bytes of data. + * + * NOTE: DMA mapping is held in the tx until completed in the ring or + * the tx desc is freed without having been submitted to the ring + * + * This routine ensures all the helper routine calls succeed. + */ +/* New API */ +static int build_verbs_tx_desc( + struct sdma_engine *sde, + struct rvt_sge_state *ss, + u32 length, + struct verbs_txreq *tx, + struct ahg_ib_header *ahdr, + u64 pbc) +{ + int ret = 0; + struct hfi1_pio_header *phdr = &tx->phdr; + u16 hdrbytes = tx->hdr_dwords << 2; + + if (!ahdr->ahgcount) { + ret = sdma_txinit_ahg( + &tx->txreq, + ahdr->tx_flags, + hdrbytes + length, + ahdr->ahgidx, + 0, + NULL, + 0, + verbs_sdma_complete); + if (ret) + goto bail_txadd; + phdr->pbc = cpu_to_le64(pbc); + ret = sdma_txadd_kvaddr( + sde->dd, + &tx->txreq, + phdr, + hdrbytes); + if (ret) + goto bail_txadd; + } else { + ret = sdma_txinit_ahg( + &tx->txreq, + ahdr->tx_flags, + length, + ahdr->ahgidx, + ahdr->ahgcount, + ahdr->ahgdesc, + hdrbytes, + verbs_sdma_complete); + if (ret) + goto bail_txadd; + } + + /* add the ulp payload - if any. ss can be NULL for acks */ + if (ss) + ret = build_verbs_ulp_payload(sde, ss, length, tx); +bail_txadd: + return ret; +} + +int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps, + u64 pbc) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct ahg_ib_header *ahdr = priv->s_hdr; + u32 hdrwords = qp->s_hdrwords; + struct rvt_sge_state *ss = qp->s_cur_sge; + u32 len = qp->s_cur_size; + u32 plen = hdrwords + ((len + 3) >> 2) + 2; /* includes pbc */ + struct hfi1_ibdev *dev = ps->dev; + struct hfi1_pportdata *ppd = ps->ppd; + struct verbs_txreq *tx; + u64 pbc_flags = 0; + u8 sc5 = priv->s_sc; + + int ret; + + tx = ps->s_txreq; + if (!sdma_txreq_built(&tx->txreq)) { + if (likely(pbc == 0)) { + u32 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5); + /* No vl15 here */ + /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */ + pbc_flags |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT; + + pbc = create_pbc(ppd, + pbc_flags, + qp->srate_mbps, + vl, + plen); + } + tx->wqe = qp->s_wqe; + ret = build_verbs_tx_desc(tx->sde, ss, len, tx, ahdr, pbc); + if (unlikely(ret)) + goto bail_build; + } + ret = sdma_send_txreq(tx->sde, &priv->s_iowait, &tx->txreq); + if (unlikely(ret < 0)) { + if (ret == -ECOMM) + goto bail_ecomm; + return ret; + } + trace_sdma_output_ibhdr(dd_from_ibdev(qp->ibqp.device), + &ps->s_txreq->phdr.hdr); + return ret; + +bail_ecomm: + /* The current one got "sent" */ + return 0; +bail_build: + ret = wait_kmem(dev, qp, ps); + if (!ret) { + /* free txreq - bad state */ + hfi1_put_txreq(ps->s_txreq); + ps->s_txreq = NULL; + } + return ret; +} + +/* + * If we are now in the error state, return zero to flush the + * send work request. + */ +static int pio_wait(struct rvt_qp *qp, + struct send_context *sc, + struct hfi1_pkt_state *ps, + u32 flag) +{ + struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_devdata *dd = sc->dd; + struct hfi1_ibdev *dev = &dd->verbs_dev; + unsigned long flags; + int ret = 0; + + /* + * Note that as soon as want_buffer() is called and + * possibly before it returns, sc_piobufavail() + * could be called. Therefore, put QP on the I/O wait list before + * enabling the PIO avail interrupt. + */ + spin_lock_irqsave(&qp->s_lock, flags); + if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) { + write_seqlock(&dev->iowait_lock); + list_add_tail(&ps->s_txreq->txreq.list, + &priv->s_iowait.tx_head); + if (list_empty(&priv->s_iowait.list)) { + struct hfi1_ibdev *dev = &dd->verbs_dev; + int was_empty; + + dev->n_piowait += !!(flag & RVT_S_WAIT_PIO); + dev->n_piodrain += !!(flag & RVT_S_WAIT_PIO_DRAIN); + qp->s_flags |= flag; + was_empty = list_empty(&sc->piowait); + list_add_tail(&priv->s_iowait.list, &sc->piowait); + trace_hfi1_qpsleep(qp, RVT_S_WAIT_PIO); + atomic_inc(&qp->refcount); + /* counting: only call wantpiobuf_intr if first user */ + if (was_empty) + hfi1_sc_wantpiobuf_intr(sc, 1); + } + write_sequnlock(&dev->iowait_lock); + qp->s_flags &= ~RVT_S_BUSY; + ret = -EBUSY; + } + spin_unlock_irqrestore(&qp->s_lock, flags); + return ret; +} + +static void verbs_pio_complete(void *arg, int code) +{ + struct rvt_qp *qp = (struct rvt_qp *)arg; + struct hfi1_qp_priv *priv = qp->priv; + + if (iowait_pio_dec(&priv->s_iowait)) + iowait_drain_wakeup(&priv->s_iowait); +} + +int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps, + u64 pbc) +{ + struct hfi1_qp_priv *priv = qp->priv; + u32 hdrwords = qp->s_hdrwords; + struct rvt_sge_state *ss = qp->s_cur_sge; + u32 len = qp->s_cur_size; + u32 dwords = (len + 3) >> 2; + u32 plen = hdrwords + dwords + 2; /* includes pbc */ + struct hfi1_pportdata *ppd = ps->ppd; + u32 *hdr = (u32 *)&ps->s_txreq->phdr.hdr; + u64 pbc_flags = 0; + u8 sc5; + unsigned long flags = 0; + struct send_context *sc; + struct pio_buf *pbuf; + int wc_status = IB_WC_SUCCESS; + int ret = 0; + pio_release_cb cb = NULL; + + /* only RC/UC use complete */ + switch (qp->ibqp.qp_type) { + case IB_QPT_RC: + case IB_QPT_UC: + cb = verbs_pio_complete; + break; + default: + break; + } + + /* vl15 special case taken care of in ud.c */ + sc5 = priv->s_sc; + sc = ps->s_txreq->psc; + + if (likely(pbc == 0)) { + u8 vl = sc_to_vlt(dd_from_ibdev(qp->ibqp.device), sc5); + /* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */ + pbc_flags |= (!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT; + pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen); + } + if (cb) + iowait_pio_inc(&priv->s_iowait); + pbuf = sc_buffer_alloc(sc, plen, cb, qp); + if (unlikely(!pbuf)) { + if (cb) + verbs_pio_complete(qp, 0); + if (ppd->host_link_state != HLS_UP_ACTIVE) { + /* + * If we have filled the PIO buffers to capacity and are + * not in an active state this request is not going to + * go out to so just complete it with an error or else a + * ULP or the core may be stuck waiting. + */ + hfi1_cdbg( + PIO, + "alloc failed. state not active, completing"); + wc_status = IB_WC_GENERAL_ERR; + goto pio_bail; + } else { + /* + * This is a normal occurrence. The PIO buffs are full + * up but we are still happily sending, well we could be + * so lets continue to queue the request. + */ + hfi1_cdbg(PIO, "alloc failed. state active, queuing"); + ret = pio_wait(qp, sc, ps, RVT_S_WAIT_PIO); + if (!ret) + /* txreq not queued - free */ + goto bail; + /* tx consumed in wait */ + return ret; + } + } + + if (len == 0) { + pio_copy(ppd->dd, pbuf, pbc, hdr, hdrwords); + } else { + if (ss) { + seg_pio_copy_start(pbuf, pbc, hdr, hdrwords * 4); + while (len) { + void *addr = ss->sge.vaddr; + u32 slen = ss->sge.length; + + if (slen > len) + slen = len; + update_sge(ss, slen); + seg_pio_copy_mid(pbuf, addr, slen); + len -= slen; + } + seg_pio_copy_end(pbuf); + } + } + + trace_pio_output_ibhdr(dd_from_ibdev(qp->ibqp.device), + &ps->s_txreq->phdr.hdr); + +pio_bail: + if (qp->s_wqe) { + spin_lock_irqsave(&qp->s_lock, flags); + hfi1_send_complete(qp, qp->s_wqe, wc_status); + spin_unlock_irqrestore(&qp->s_lock, flags); + } else if (qp->ibqp.qp_type == IB_QPT_RC) { + spin_lock_irqsave(&qp->s_lock, flags); + hfi1_rc_send_complete(qp, &ps->s_txreq->phdr.hdr); + spin_unlock_irqrestore(&qp->s_lock, flags); + } + + ret = 0; + +bail: + hfi1_put_txreq(ps->s_txreq); + return ret; +} + +/* + * egress_pkey_matches_entry - return 1 if the pkey matches ent (ent + * being an entry from the partition key table), return 0 + * otherwise. Use the matching criteria for egress partition keys + * specified in the OPAv1 spec., section 9.1l.7. + */ +static inline int egress_pkey_matches_entry(u16 pkey, u16 ent) +{ + u16 mkey = pkey & PKEY_LOW_15_MASK; + u16 mentry = ent & PKEY_LOW_15_MASK; + + if (mkey == mentry) { + /* + * If pkey[15] is set (full partition member), + * is bit 15 in the corresponding table element + * clear (limited member)? + */ + if (pkey & PKEY_MEMBER_MASK) + return !!(ent & PKEY_MEMBER_MASK); + return 1; + } + return 0; +} + +/** + * egress_pkey_check - check P_KEY of a packet + * @ppd: Physical IB port data + * @lrh: Local route header + * @bth: Base transport header + * @sc5: SC for packet + * @s_pkey_index: It will be used for look up optimization for kernel contexts + * only. If it is negative value, then it means user contexts is calling this + * function. + * + * It checks if hdr's pkey is valid. + * + * Return: 0 on success, otherwise, 1 + */ +int egress_pkey_check(struct hfi1_pportdata *ppd, __be16 *lrh, __be32 *bth, + u8 sc5, int8_t s_pkey_index) +{ + struct hfi1_devdata *dd; + int i; + u16 pkey; + int is_user_ctxt_mechanism = (s_pkey_index < 0); + + if (!(ppd->part_enforce & HFI1_PART_ENFORCE_OUT)) + return 0; + + pkey = (u16)be32_to_cpu(bth[0]); + + /* If SC15, pkey[0:14] must be 0x7fff */ + if ((sc5 == 0xf) && ((pkey & PKEY_LOW_15_MASK) != PKEY_LOW_15_MASK)) + goto bad; + + /* Is the pkey = 0x0, or 0x8000? */ + if ((pkey & PKEY_LOW_15_MASK) == 0) + goto bad; + + /* + * For the kernel contexts only, if a qp is passed into the function, + * the most likely matching pkey has index qp->s_pkey_index + */ + if (!is_user_ctxt_mechanism && + egress_pkey_matches_entry(pkey, ppd->pkeys[s_pkey_index])) { + return 0; + } + + for (i = 0; i < MAX_PKEY_VALUES; i++) { + if (egress_pkey_matches_entry(pkey, ppd->pkeys[i])) + return 0; + } +bad: + /* + * For the user-context mechanism, the P_KEY check would only happen + * once per SDMA request, not once per packet. Therefore, there's no + * need to increment the counter for the user-context mechanism. + */ + if (!is_user_ctxt_mechanism) { + incr_cntr64(&ppd->port_xmit_constraint_errors); + dd = ppd->dd; + if (!(dd->err_info_xmit_constraint.status & + OPA_EI_STATUS_SMASK)) { + u16 slid = be16_to_cpu(lrh[3]); + + dd->err_info_xmit_constraint.status |= + OPA_EI_STATUS_SMASK; + dd->err_info_xmit_constraint.slid = slid; + dd->err_info_xmit_constraint.pkey = pkey; + } + } + return 1; +} + +/** + * get_send_routine - choose an egress routine + * + * Choose an egress routine based on QP type + * and size + */ +static inline send_routine get_send_routine(struct rvt_qp *qp, + struct verbs_txreq *tx) +{ + struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); + struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_ib_header *h = &tx->phdr.hdr; + + if (unlikely(!(dd->flags & HFI1_HAS_SEND_DMA))) + return dd->process_pio_send; + switch (qp->ibqp.qp_type) { + case IB_QPT_SMI: + return dd->process_pio_send; + case IB_QPT_GSI: + case IB_QPT_UD: + break; + case IB_QPT_RC: + if (piothreshold && + qp->s_cur_size <= min(piothreshold, qp->pmtu) && + (BIT(get_opcode(h) & 0x1f) & rc_only_opcode) && + iowait_sdma_pending(&priv->s_iowait) == 0 && + !sdma_txreq_built(&tx->txreq)) + return dd->process_pio_send; + break; + case IB_QPT_UC: + if (piothreshold && + qp->s_cur_size <= min(piothreshold, qp->pmtu) && + (BIT(get_opcode(h) & 0x1f) & uc_only_opcode) && + iowait_sdma_pending(&priv->s_iowait) == 0 && + !sdma_txreq_built(&tx->txreq)) + return dd->process_pio_send; + break; + default: + break; + } + return dd->process_dma_send; +} + +/** + * hfi1_verbs_send - send a packet + * @qp: the QP to send on + * @ps: the state of the packet to send + * + * Return zero if packet is sent or queued OK. + * Return non-zero and clear qp->s_flags RVT_S_BUSY otherwise. + */ +int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps) +{ + struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); + struct hfi1_qp_priv *priv = qp->priv; + struct hfi1_other_headers *ohdr; + struct hfi1_ib_header *hdr; + send_routine sr; + int ret; + u8 lnh; + + hdr = &ps->s_txreq->phdr.hdr; + /* locate the pkey within the headers */ + lnh = be16_to_cpu(hdr->lrh[0]) & 3; + if (lnh == HFI1_LRH_GRH) + ohdr = &hdr->u.l.oth; + else + ohdr = &hdr->u.oth; + + sr = get_send_routine(qp, ps->s_txreq); + ret = egress_pkey_check(dd->pport, + hdr->lrh, + ohdr->bth, + priv->s_sc, + qp->s_pkey_index); + if (unlikely(ret)) { + /* + * The value we are returning here does not get propagated to + * the verbs caller. Thus we need to complete the request with + * error otherwise the caller could be sitting waiting on the + * completion event. Only do this for PIO. SDMA has its own + * mechanism for handling the errors. So for SDMA we can just + * return. + */ + if (sr == dd->process_pio_send) { + unsigned long flags; + + hfi1_cdbg(PIO, "%s() Failed. Completing with err", + __func__); + spin_lock_irqsave(&qp->s_lock, flags); + hfi1_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR); + spin_unlock_irqrestore(&qp->s_lock, flags); + } + return -EINVAL; + } + if (sr == dd->process_dma_send && iowait_pio_pending(&priv->s_iowait)) + return pio_wait(qp, + ps->s_txreq->psc, + ps, + RVT_S_WAIT_PIO_DRAIN); + return sr(qp, ps, 0); +} + +/** + * hfi1_fill_device_attr - Fill in rvt dev info device attributes. + * @dd: the device data structure + */ +static void hfi1_fill_device_attr(struct hfi1_devdata *dd) +{ + struct rvt_dev_info *rdi = &dd->verbs_dev.rdi; + + memset(&rdi->dparms.props, 0, sizeof(rdi->dparms.props)); + + rdi->dparms.props.device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR | + IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT | + IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN | + IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE; + rdi->dparms.props.page_size_cap = PAGE_SIZE; + rdi->dparms.props.vendor_id = dd->oui1 << 16 | dd->oui2 << 8 | dd->oui3; + rdi->dparms.props.vendor_part_id = dd->pcidev->device; + rdi->dparms.props.hw_ver = dd->minrev; + rdi->dparms.props.sys_image_guid = ib_hfi1_sys_image_guid; + rdi->dparms.props.max_mr_size = ~0ULL; + rdi->dparms.props.max_qp = hfi1_max_qps; + rdi->dparms.props.max_qp_wr = hfi1_max_qp_wrs; + rdi->dparms.props.max_sge = hfi1_max_sges; + rdi->dparms.props.max_sge_rd = hfi1_max_sges; + rdi->dparms.props.max_cq = hfi1_max_cqs; + rdi->dparms.props.max_ah = hfi1_max_ahs; + rdi->dparms.props.max_cqe = hfi1_max_cqes; + rdi->dparms.props.max_mr = rdi->lkey_table.max; + rdi->dparms.props.max_fmr = rdi->lkey_table.max; + rdi->dparms.props.max_map_per_fmr = 32767; + rdi->dparms.props.max_pd = hfi1_max_pds; + rdi->dparms.props.max_qp_rd_atom = HFI1_MAX_RDMA_ATOMIC; + rdi->dparms.props.max_qp_init_rd_atom = 255; + rdi->dparms.props.max_srq = hfi1_max_srqs; + rdi->dparms.props.max_srq_wr = hfi1_max_srq_wrs; + rdi->dparms.props.max_srq_sge = hfi1_max_srq_sges; + rdi->dparms.props.atomic_cap = IB_ATOMIC_GLOB; + rdi->dparms.props.max_pkeys = hfi1_get_npkeys(dd); + rdi->dparms.props.max_mcast_grp = hfi1_max_mcast_grps; + rdi->dparms.props.max_mcast_qp_attach = hfi1_max_mcast_qp_attached; + rdi->dparms.props.max_total_mcast_qp_attach = + rdi->dparms.props.max_mcast_qp_attach * + rdi->dparms.props.max_mcast_grp; +} + +static inline u16 opa_speed_to_ib(u16 in) +{ + u16 out = 0; + + if (in & OPA_LINK_SPEED_25G) + out |= IB_SPEED_EDR; + if (in & OPA_LINK_SPEED_12_5G) + out |= IB_SPEED_FDR; + + return out; +} + +/* + * Convert a single OPA link width (no multiple flags) to an IB value. + * A zero OPA link width means link down, which means the IB width value + * is a don't care. + */ +static inline u16 opa_width_to_ib(u16 in) +{ + switch (in) { + case OPA_LINK_WIDTH_1X: + /* map 2x and 3x to 1x as they don't exist in IB */ + case OPA_LINK_WIDTH_2X: + case OPA_LINK_WIDTH_3X: + return IB_WIDTH_1X; + default: /* link down or unknown, return our largest width */ + case OPA_LINK_WIDTH_4X: + return IB_WIDTH_4X; + } +} + +static int query_port(struct rvt_dev_info *rdi, u8 port_num, + struct ib_port_attr *props) +{ + struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi); + struct hfi1_devdata *dd = dd_from_dev(verbs_dev); + struct hfi1_pportdata *ppd = &dd->pport[port_num - 1]; + u16 lid = ppd->lid; + + props->lid = lid ? lid : 0; + props->lmc = ppd->lmc; + /* OPA logical states match IB logical states */ + props->state = driver_lstate(ppd); + props->phys_state = hfi1_ibphys_portstate(ppd); + props->gid_tbl_len = HFI1_GUIDS_PER_PORT; + props->active_width = (u8)opa_width_to_ib(ppd->link_width_active); + /* see rate_show() in ib core/sysfs.c */ + props->active_speed = (u8)opa_speed_to_ib(ppd->link_speed_active); + props->max_vl_num = ppd->vls_supported; + + /* Once we are a "first class" citizen and have added the OPA MTUs to + * the core we can advertise the larger MTU enum to the ULPs, for now + * advertise only 4K. + * + * Those applications which are either OPA aware or pass the MTU enum + * from the Path Records to us will get the new 8k MTU. Those that + * attempt to process the MTU enum may fail in various ways. + */ + props->max_mtu = mtu_to_enum((!valid_ib_mtu(hfi1_max_mtu) ? + 4096 : hfi1_max_mtu), IB_MTU_4096); + props->active_mtu = !valid_ib_mtu(ppd->ibmtu) ? props->max_mtu : + mtu_to_enum(ppd->ibmtu, IB_MTU_2048); + + return 0; +} + +static int modify_device(struct ib_device *device, + int device_modify_mask, + struct ib_device_modify *device_modify) +{ + struct hfi1_devdata *dd = dd_from_ibdev(device); + unsigned i; + int ret; + + if (device_modify_mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID | + IB_DEVICE_MODIFY_NODE_DESC)) { + ret = -EOPNOTSUPP; + goto bail; + } + + if (device_modify_mask & IB_DEVICE_MODIFY_NODE_DESC) { + memcpy(device->node_desc, device_modify->node_desc, 64); + for (i = 0; i < dd->num_pports; i++) { + struct hfi1_ibport *ibp = &dd->pport[i].ibport_data; + + hfi1_node_desc_chg(ibp); + } + } + + if (device_modify_mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID) { + ib_hfi1_sys_image_guid = + cpu_to_be64(device_modify->sys_image_guid); + for (i = 0; i < dd->num_pports; i++) { + struct hfi1_ibport *ibp = &dd->pport[i].ibport_data; + + hfi1_sys_guid_chg(ibp); + } + } + + ret = 0; + +bail: + return ret; +} + +static int shut_down_port(struct rvt_dev_info *rdi, u8 port_num) +{ + struct hfi1_ibdev *verbs_dev = dev_from_rdi(rdi); + struct hfi1_devdata *dd = dd_from_dev(verbs_dev); + struct hfi1_pportdata *ppd = &dd->pport[port_num - 1]; + int ret; + + set_link_down_reason(ppd, OPA_LINKDOWN_REASON_UNKNOWN, 0, + OPA_LINKDOWN_REASON_UNKNOWN); + ret = set_link_state(ppd, HLS_DN_DOWNDEF); + return ret; +} + +static int hfi1_get_guid_be(struct rvt_dev_info *rdi, struct rvt_ibport *rvp, + int guid_index, __be64 *guid) +{ + struct hfi1_ibport *ibp = container_of(rvp, struct hfi1_ibport, rvp); + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + + if (guid_index == 0) + *guid = cpu_to_be64(ppd->guid); + else if (guid_index < HFI1_GUIDS_PER_PORT) + *guid = ibp->guids[guid_index - 1]; + else + return -EINVAL; + + return 0; +} + +/* + * convert ah port,sl to sc + */ +u8 ah_to_sc(struct ib_device *ibdev, struct ib_ah_attr *ah) +{ + struct hfi1_ibport *ibp = to_iport(ibdev, ah->port_num); + + return ibp->sl_to_sc[ah->sl]; +} + +static int hfi1_check_ah(struct ib_device *ibdev, struct ib_ah_attr *ah_attr) +{ + struct hfi1_ibport *ibp; + struct hfi1_pportdata *ppd; + struct hfi1_devdata *dd; + u8 sc5; + + /* test the mapping for validity */ + ibp = to_iport(ibdev, ah_attr->port_num); + ppd = ppd_from_ibp(ibp); + sc5 = ibp->sl_to_sc[ah_attr->sl]; + dd = dd_from_ppd(ppd); + if (sc_to_vlt(dd, sc5) > num_vls && sc_to_vlt(dd, sc5) != 0xf) + return -EINVAL; + return 0; +} + +static void hfi1_notify_new_ah(struct ib_device *ibdev, + struct ib_ah_attr *ah_attr, + struct rvt_ah *ah) +{ + struct hfi1_ibport *ibp; + struct hfi1_pportdata *ppd; + struct hfi1_devdata *dd; + u8 sc5; + + /* + * Do not trust reading anything from rvt_ah at this point as it is not + * done being setup. We can however modify things which we need to set. + */ + + ibp = to_iport(ibdev, ah_attr->port_num); + ppd = ppd_from_ibp(ibp); + sc5 = ibp->sl_to_sc[ah->attr.sl]; + dd = dd_from_ppd(ppd); + ah->vl = sc_to_vlt(dd, sc5); + if (ah->vl < num_vls || ah->vl == 15) + ah->log_pmtu = ilog2(dd->vld[ah->vl].mtu); +} + +struct ib_ah *hfi1_create_qp0_ah(struct hfi1_ibport *ibp, u16 dlid) +{ + struct ib_ah_attr attr; + struct ib_ah *ah = ERR_PTR(-EINVAL); + struct rvt_qp *qp0; + + memset(&attr, 0, sizeof(attr)); + attr.dlid = dlid; + attr.port_num = ppd_from_ibp(ibp)->port; + rcu_read_lock(); + qp0 = rcu_dereference(ibp->rvp.qp[0]); + if (qp0) + ah = ib_create_ah(qp0->ibqp.pd, &attr); + rcu_read_unlock(); + return ah; +} + +/** + * hfi1_get_npkeys - return the size of the PKEY table for context 0 + * @dd: the hfi1_ib device + */ +unsigned hfi1_get_npkeys(struct hfi1_devdata *dd) +{ + return ARRAY_SIZE(dd->pport[0].pkeys); +} + +static void init_ibport(struct hfi1_pportdata *ppd) +{ + struct hfi1_ibport *ibp = &ppd->ibport_data; + size_t sz = ARRAY_SIZE(ibp->sl_to_sc); + int i; + + for (i = 0; i < sz; i++) { + ibp->sl_to_sc[i] = i; + ibp->sc_to_sl[i] = i; + } + + spin_lock_init(&ibp->rvp.lock); + /* Set the prefix to the default value (see ch. 4.1.1) */ + ibp->rvp.gid_prefix = IB_DEFAULT_GID_PREFIX; + ibp->rvp.sm_lid = 0; + /* Below should only set bits defined in OPA PortInfo.CapabilityMask */ + ibp->rvp.port_cap_flags = IB_PORT_AUTO_MIGR_SUP | + IB_PORT_CAP_MASK_NOTICE_SUP; + ibp->rvp.pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA; + ibp->rvp.pma_counter_select[1] = IB_PMA_PORT_RCV_DATA; + ibp->rvp.pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS; + ibp->rvp.pma_counter_select[3] = IB_PMA_PORT_RCV_PKTS; + ibp->rvp.pma_counter_select[4] = IB_PMA_PORT_XMIT_WAIT; + + RCU_INIT_POINTER(ibp->rvp.qp[0], NULL); + RCU_INIT_POINTER(ibp->rvp.qp[1], NULL); +} + +/** + * hfi1_register_ib_device - register our device with the infiniband core + * @dd: the device data structure + * Return 0 if successful, errno if unsuccessful. + */ +int hfi1_register_ib_device(struct hfi1_devdata *dd) +{ + struct hfi1_ibdev *dev = &dd->verbs_dev; + struct ib_device *ibdev = &dev->rdi.ibdev; + struct hfi1_pportdata *ppd = dd->pport; + unsigned i; + int ret; + size_t lcpysz = IB_DEVICE_NAME_MAX; + + for (i = 0; i < dd->num_pports; i++) + init_ibport(ppd + i); + + /* Only need to initialize non-zero fields. */ + + setup_timer(&dev->mem_timer, mem_timer, (unsigned long)dev); + + seqlock_init(&dev->iowait_lock); + INIT_LIST_HEAD(&dev->txwait); + INIT_LIST_HEAD(&dev->memwait); + + ret = verbs_txreq_init(dev); + if (ret) + goto err_verbs_txreq; + + /* + * The system image GUID is supposed to be the same for all + * HFIs in a single system but since there can be other + * device types in the system, we can't be sure this is unique. + */ + if (!ib_hfi1_sys_image_guid) + ib_hfi1_sys_image_guid = cpu_to_be64(ppd->guid); + lcpysz = strlcpy(ibdev->name, class_name(), lcpysz); + strlcpy(ibdev->name + lcpysz, "_%d", IB_DEVICE_NAME_MAX - lcpysz); + ibdev->owner = THIS_MODULE; + ibdev->node_guid = cpu_to_be64(ppd->guid); + ibdev->phys_port_cnt = dd->num_pports; + ibdev->dma_device = &dd->pcidev->dev; + ibdev->modify_device = modify_device; + + /* keep process mad in the driver */ + ibdev->process_mad = hfi1_process_mad; + + strncpy(ibdev->node_desc, init_utsname()->nodename, + sizeof(ibdev->node_desc)); + + /* + * Fill in rvt info object. + */ + dd->verbs_dev.rdi.driver_f.port_callback = hfi1_create_port_files; + dd->verbs_dev.rdi.driver_f.get_card_name = get_card_name; + dd->verbs_dev.rdi.driver_f.get_pci_dev = get_pci_dev; + dd->verbs_dev.rdi.driver_f.check_ah = hfi1_check_ah; + dd->verbs_dev.rdi.driver_f.notify_new_ah = hfi1_notify_new_ah; + dd->verbs_dev.rdi.driver_f.get_guid_be = hfi1_get_guid_be; + dd->verbs_dev.rdi.driver_f.query_port_state = query_port; + dd->verbs_dev.rdi.driver_f.shut_down_port = shut_down_port; + dd->verbs_dev.rdi.driver_f.cap_mask_chg = hfi1_cap_mask_chg; + /* + * Fill in rvt info device attributes. + */ + hfi1_fill_device_attr(dd); + + /* queue pair */ + dd->verbs_dev.rdi.dparms.qp_table_size = hfi1_qp_table_size; + dd->verbs_dev.rdi.dparms.qpn_start = 0; + dd->verbs_dev.rdi.dparms.qpn_inc = 1; + dd->verbs_dev.rdi.dparms.qos_shift = dd->qos_shift; + dd->verbs_dev.rdi.dparms.qpn_res_start = kdeth_qp << 16; + dd->verbs_dev.rdi.dparms.qpn_res_end = + dd->verbs_dev.rdi.dparms.qpn_res_start + 65535; + dd->verbs_dev.rdi.dparms.max_rdma_atomic = HFI1_MAX_RDMA_ATOMIC; + dd->verbs_dev.rdi.dparms.psn_mask = PSN_MASK; + dd->verbs_dev.rdi.dparms.psn_shift = PSN_SHIFT; + dd->verbs_dev.rdi.dparms.psn_modify_mask = PSN_MODIFY_MASK; + dd->verbs_dev.rdi.dparms.core_cap_flags = RDMA_CORE_PORT_INTEL_OPA; + dd->verbs_dev.rdi.dparms.max_mad_size = OPA_MGMT_MAD_SIZE; + + dd->verbs_dev.rdi.driver_f.qp_priv_alloc = qp_priv_alloc; + dd->verbs_dev.rdi.driver_f.qp_priv_free = qp_priv_free; + dd->verbs_dev.rdi.driver_f.free_all_qps = free_all_qps; + dd->verbs_dev.rdi.driver_f.notify_qp_reset = notify_qp_reset; + dd->verbs_dev.rdi.driver_f.do_send = hfi1_do_send; + dd->verbs_dev.rdi.driver_f.schedule_send = hfi1_schedule_send; + dd->verbs_dev.rdi.driver_f.schedule_send_no_lock = _hfi1_schedule_send; + dd->verbs_dev.rdi.driver_f.get_pmtu_from_attr = get_pmtu_from_attr; + dd->verbs_dev.rdi.driver_f.notify_error_qp = notify_error_qp; + dd->verbs_dev.rdi.driver_f.flush_qp_waiters = flush_qp_waiters; + dd->verbs_dev.rdi.driver_f.stop_send_queue = stop_send_queue; + dd->verbs_dev.rdi.driver_f.quiesce_qp = quiesce_qp; + dd->verbs_dev.rdi.driver_f.notify_error_qp = notify_error_qp; + dd->verbs_dev.rdi.driver_f.mtu_from_qp = mtu_from_qp; + dd->verbs_dev.rdi.driver_f.mtu_to_path_mtu = mtu_to_path_mtu; + dd->verbs_dev.rdi.driver_f.check_modify_qp = hfi1_check_modify_qp; + dd->verbs_dev.rdi.driver_f.modify_qp = hfi1_modify_qp; + dd->verbs_dev.rdi.driver_f.check_send_wqe = hfi1_check_send_wqe; + + /* completeion queue */ + snprintf(dd->verbs_dev.rdi.dparms.cq_name, + sizeof(dd->verbs_dev.rdi.dparms.cq_name), + "hfi1_cq%d", dd->unit); + dd->verbs_dev.rdi.dparms.node = dd->node; + + /* misc settings */ + dd->verbs_dev.rdi.flags = 0; /* Let rdmavt handle it all */ + dd->verbs_dev.rdi.dparms.lkey_table_size = hfi1_lkey_table_size; + dd->verbs_dev.rdi.dparms.nports = dd->num_pports; + dd->verbs_dev.rdi.dparms.npkeys = hfi1_get_npkeys(dd); + + ppd = dd->pport; + for (i = 0; i < dd->num_pports; i++, ppd++) + rvt_init_port(&dd->verbs_dev.rdi, + &ppd->ibport_data.rvp, + i, + ppd->pkeys); + + ret = rvt_register_device(&dd->verbs_dev.rdi); + if (ret) + goto err_verbs_txreq; + + ret = hfi1_verbs_register_sysfs(dd); + if (ret) + goto err_class; + + return ret; + +err_class: + rvt_unregister_device(&dd->verbs_dev.rdi); +err_verbs_txreq: + verbs_txreq_exit(dev); + dd_dev_err(dd, "cannot register verbs: %d!\n", -ret); + return ret; +} + +void hfi1_unregister_ib_device(struct hfi1_devdata *dd) +{ + struct hfi1_ibdev *dev = &dd->verbs_dev; + + hfi1_verbs_unregister_sysfs(dd); + + rvt_unregister_device(&dd->verbs_dev.rdi); + + if (!list_empty(&dev->txwait)) + dd_dev_err(dd, "txwait list not empty!\n"); + if (!list_empty(&dev->memwait)) + dd_dev_err(dd, "memwait list not empty!\n"); + + del_timer_sync(&dev->mem_timer); + verbs_txreq_exit(dev); +} + +void hfi1_cnp_rcv(struct hfi1_packet *packet) +{ + struct hfi1_ibport *ibp = &packet->rcd->ppd->ibport_data; + struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); + struct hfi1_ib_header *hdr = packet->hdr; + struct rvt_qp *qp = packet->qp; + u32 lqpn, rqpn = 0; + u16 rlid = 0; + u8 sl, sc5, sc4_bit, svc_type; + bool sc4_set = has_sc4_bit(packet); + + switch (packet->qp->ibqp.qp_type) { + case IB_QPT_UC: + rlid = qp->remote_ah_attr.dlid; + rqpn = qp->remote_qpn; + svc_type = IB_CC_SVCTYPE_UC; + break; + case IB_QPT_RC: + rlid = qp->remote_ah_attr.dlid; + rqpn = qp->remote_qpn; + svc_type = IB_CC_SVCTYPE_RC; + break; + case IB_QPT_SMI: + case IB_QPT_GSI: + case IB_QPT_UD: + svc_type = IB_CC_SVCTYPE_UD; + break; + default: + ibp->rvp.n_pkt_drops++; + return; + } + + sc4_bit = sc4_set << 4; + sc5 = (be16_to_cpu(hdr->lrh[0]) >> 12) & 0xf; + sc5 |= sc4_bit; + sl = ibp->sc_to_sl[sc5]; + lqpn = qp->ibqp.qp_num; + + process_becn(ppd, sl, rlid, lqpn, rqpn, svc_type); +} diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h new file mode 100644 index 000000000..488356775 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -0,0 +1,531 @@ +/* + * Copyright(c) 2015, 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef HFI1_VERBS_H +#define HFI1_VERBS_H + +#include <linux/types.h> +#include <linux/seqlock.h> +#include <linux/kernel.h> +#include <linux/interrupt.h> +#include <linux/kref.h> +#include <linux/workqueue.h> +#include <linux/kthread.h> +#include <linux/completion.h> +#include <linux/slab.h> +#include <rdma/ib_pack.h> +#include <rdma/ib_user_verbs.h> +#include <rdma/ib_mad.h> +#include <rdma/rdma_vt.h> +#include <rdma/rdmavt_qp.h> +#include <rdma/rdmavt_cq.h> + +struct hfi1_ctxtdata; +struct hfi1_pportdata; +struct hfi1_devdata; +struct hfi1_packet; + +#include "iowait.h" + +#define HFI1_MAX_RDMA_ATOMIC 16 +#define HFI1_GUIDS_PER_PORT 5 + +/* + * Increment this value if any changes that break userspace ABI + * compatibility are made. + */ +#define HFI1_UVERBS_ABI_VERSION 2 + +#define IB_SEQ_NAK (3 << 29) + +/* AETH NAK opcode values */ +#define IB_RNR_NAK 0x20 +#define IB_NAK_PSN_ERROR 0x60 +#define IB_NAK_INVALID_REQUEST 0x61 +#define IB_NAK_REMOTE_ACCESS_ERROR 0x62 +#define IB_NAK_REMOTE_OPERATIONAL_ERROR 0x63 +#define IB_NAK_INVALID_RD_REQUEST 0x64 + +/* IB Performance Manager status values */ +#define IB_PMA_SAMPLE_STATUS_DONE 0x00 +#define IB_PMA_SAMPLE_STATUS_STARTED 0x01 +#define IB_PMA_SAMPLE_STATUS_RUNNING 0x02 + +/* Mandatory IB performance counter select values. */ +#define IB_PMA_PORT_XMIT_DATA cpu_to_be16(0x0001) +#define IB_PMA_PORT_RCV_DATA cpu_to_be16(0x0002) +#define IB_PMA_PORT_XMIT_PKTS cpu_to_be16(0x0003) +#define IB_PMA_PORT_RCV_PKTS cpu_to_be16(0x0004) +#define IB_PMA_PORT_XMIT_WAIT cpu_to_be16(0x0005) + +#define HFI1_VENDOR_IPG cpu_to_be16(0xFFA0) + +#define IB_BTH_REQ_ACK BIT(31) +#define IB_BTH_SOLICITED BIT(23) +#define IB_BTH_MIG_REQ BIT(22) + +#define IB_GRH_VERSION 6 +#define IB_GRH_VERSION_MASK 0xF +#define IB_GRH_VERSION_SHIFT 28 +#define IB_GRH_TCLASS_MASK 0xFF +#define IB_GRH_TCLASS_SHIFT 20 +#define IB_GRH_FLOW_MASK 0xFFFFF +#define IB_GRH_FLOW_SHIFT 0 +#define IB_GRH_NEXT_HDR 0x1B + +#define IB_DEFAULT_GID_PREFIX cpu_to_be64(0xfe80000000000000ULL) + +/* flags passed by hfi1_ib_rcv() */ +enum { + HFI1_HAS_GRH = (1 << 0), +}; + +struct ib_reth { + __be64 vaddr; + __be32 rkey; + __be32 length; +} __packed; + +struct ib_atomic_eth { + __be32 vaddr[2]; /* unaligned so access as 2 32-bit words */ + __be32 rkey; + __be64 swap_data; + __be64 compare_data; +} __packed; + +union ib_ehdrs { + struct { + __be32 deth[2]; + __be32 imm_data; + } ud; + struct { + struct ib_reth reth; + __be32 imm_data; + } rc; + struct { + __be32 aeth; + __be32 atomic_ack_eth[2]; + } at; + __be32 imm_data; + __be32 aeth; + __be32 ieth; + struct ib_atomic_eth atomic_eth; +} __packed; + +struct hfi1_other_headers { + __be32 bth[3]; + union ib_ehdrs u; +} __packed; + +/* + * Note that UD packets with a GRH header are 8+40+12+8 = 68 bytes + * long (72 w/ imm_data). Only the first 56 bytes of the IB header + * will be in the eager header buffer. The remaining 12 or 16 bytes + * are in the data buffer. + */ +struct hfi1_ib_header { + __be16 lrh[4]; + union { + struct { + struct ib_grh grh; + struct hfi1_other_headers oth; + } l; + struct hfi1_other_headers oth; + } u; +} __packed; + +struct ahg_ib_header { + struct sdma_engine *sde; + u32 ahgdesc[2]; + u16 tx_flags; + u8 ahgcount; + u8 ahgidx; + struct hfi1_ib_header ibh; +}; + +struct hfi1_pio_header { + __le64 pbc; + struct hfi1_ib_header hdr; +} __packed; + +/* + * hfi1 specific data structures that will be hidden from rvt after the queue + * pair is made common + */ +struct hfi1_qp_priv { + struct ahg_ib_header *s_hdr; /* next header to send */ + struct sdma_engine *s_sde; /* current sde */ + struct send_context *s_sendcontext; /* current sendcontext */ + u8 s_sc; /* SC[0..4] for next packet */ + u8 r_adefered; /* number of acks defered */ + struct iowait s_iowait; + struct timer_list s_rnr_timer; + struct rvt_qp *owner; +}; + +/* + * This structure is used to hold commonly lookedup and computed values during + * the send engine progress. + */ +struct hfi1_pkt_state { + struct hfi1_ibdev *dev; + struct hfi1_ibport *ibp; + struct hfi1_pportdata *ppd; + struct verbs_txreq *s_txreq; + unsigned long flags; +}; + +#define HFI1_PSN_CREDIT 16 + +struct hfi1_opcode_stats { + u64 n_packets; /* number of packets */ + u64 n_bytes; /* total number of bytes */ +}; + +struct hfi1_opcode_stats_perctx { + struct hfi1_opcode_stats stats[256]; +}; + +static inline void inc_opstats( + u32 tlen, + struct hfi1_opcode_stats *stats) +{ +#ifdef CONFIG_DEBUG_FS + stats->n_bytes += tlen; + stats->n_packets++; +#endif +} + +struct hfi1_ibport { + struct rvt_qp __rcu *qp[2]; + struct rvt_ibport rvp; + + __be64 guids[HFI1_GUIDS_PER_PORT - 1]; /* writable GUIDs */ + + /* the first 16 entries are sl_to_vl for !OPA */ + u8 sl_to_sc[32]; + u8 sc_to_sl[32]; +}; + +struct hfi1_ibdev { + struct rvt_dev_info rdi; /* Must be first */ + + /* QP numbers are shared by all IB ports */ + /* protect wait lists */ + seqlock_t iowait_lock; + struct list_head txwait; /* list for wait verbs_txreq */ + struct list_head memwait; /* list for wait kernel memory */ + struct list_head txreq_free; + struct kmem_cache *verbs_txreq_cache; + struct timer_list mem_timer; + + u64 n_piowait; + u64 n_piodrain; + u64 n_txwait; + u64 n_kmem_wait; + +#ifdef CONFIG_DEBUG_FS + /* per HFI debugfs */ + struct dentry *hfi1_ibdev_dbg; + /* per HFI symlinks to above */ + struct dentry *hfi1_ibdev_link; +#endif +}; + +static inline struct hfi1_ibdev *to_idev(struct ib_device *ibdev) +{ + struct rvt_dev_info *rdi; + + rdi = container_of(ibdev, struct rvt_dev_info, ibdev); + return container_of(rdi, struct hfi1_ibdev, rdi); +} + +static inline struct rvt_qp *iowait_to_qp(struct iowait *s_iowait) +{ + struct hfi1_qp_priv *priv; + + priv = container_of(s_iowait, struct hfi1_qp_priv, s_iowait); + return priv->owner; +} + +/* + * Send if not busy or waiting for I/O and either + * a RC response is pending or we can process send work requests. + */ +static inline int hfi1_send_ok(struct rvt_qp *qp) +{ + return !(qp->s_flags & (RVT_S_BUSY | RVT_S_ANY_WAIT_IO)) && + (qp->s_hdrwords || (qp->s_flags & RVT_S_RESP_PENDING) || + !(qp->s_flags & RVT_S_ANY_WAIT_SEND)); +} + +/* + * This must be called with s_lock held. + */ +void hfi1_bad_pqkey(struct hfi1_ibport *ibp, __be16 trap_num, u32 key, u32 sl, + u32 qp1, u32 qp2, u16 lid1, u16 lid2); +void hfi1_cap_mask_chg(struct rvt_dev_info *rdi, u8 port_num); +void hfi1_sys_guid_chg(struct hfi1_ibport *ibp); +void hfi1_node_desc_chg(struct hfi1_ibport *ibp); +int hfi1_process_mad(struct ib_device *ibdev, int mad_flags, u8 port, + const struct ib_wc *in_wc, const struct ib_grh *in_grh, + const struct ib_mad_hdr *in_mad, size_t in_mad_size, + struct ib_mad_hdr *out_mad, size_t *out_mad_size, + u16 *out_mad_pkey_index); + +/* + * The PSN_MASK and PSN_SHIFT allow for + * 1) comparing two PSNs + * 2) returning the PSN with any upper bits masked + * 3) returning the difference between to PSNs + * + * The number of significant bits in the PSN must + * necessarily be at least one bit less than + * the container holding the PSN. + */ +#ifndef CONFIG_HFI1_VERBS_31BIT_PSN +#define PSN_MASK 0xFFFFFF +#define PSN_SHIFT 8 +#else +#define PSN_MASK 0x7FFFFFFF +#define PSN_SHIFT 1 +#endif +#define PSN_MODIFY_MASK 0xFFFFFF + +/* + * Compare the lower 24 bits of the msn values. + * Returns an integer <, ==, or > than zero. + */ +static inline int cmp_msn(u32 a, u32 b) +{ + return (((int)a) - ((int)b)) << 8; +} + +/* + * Compare two PSNs + * Returns an integer <, ==, or > than zero. + */ +static inline int cmp_psn(u32 a, u32 b) +{ + return (((int)a) - ((int)b)) << PSN_SHIFT; +} + +/* + * Return masked PSN + */ +static inline u32 mask_psn(u32 a) +{ + return a & PSN_MASK; +} + +/* + * Return delta between two PSNs + */ +static inline u32 delta_psn(u32 a, u32 b) +{ + return (((int)a - (int)b) << PSN_SHIFT) >> PSN_SHIFT; +} + +struct verbs_txreq; +void hfi1_put_txreq(struct verbs_txreq *tx); + +int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps); + +void hfi1_copy_sge(struct rvt_sge_state *ss, void *data, u32 length, + int release, int copy_last); + +void hfi1_skip_sge(struct rvt_sge_state *ss, u32 length, int release); + +void hfi1_cnp_rcv(struct hfi1_packet *packet); + +void hfi1_uc_rcv(struct hfi1_packet *packet); + +void hfi1_rc_rcv(struct hfi1_packet *packet); + +void hfi1_rc_hdrerr( + struct hfi1_ctxtdata *rcd, + struct hfi1_ib_header *hdr, + u32 rcv_flags, + struct rvt_qp *qp); + +u8 ah_to_sc(struct ib_device *ibdev, struct ib_ah_attr *ah_attr); + +struct ib_ah *hfi1_create_qp0_ah(struct hfi1_ibport *ibp, u16 dlid); + +void hfi1_rc_rnr_retry(unsigned long arg); +void hfi1_add_rnr_timer(struct rvt_qp *qp, u32 to); +void hfi1_rc_timeout(unsigned long arg); +void hfi1_del_timers_sync(struct rvt_qp *qp); +void hfi1_stop_rc_timers(struct rvt_qp *qp); + +void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_ib_header *hdr); + +void hfi1_rc_error(struct rvt_qp *qp, enum ib_wc_status err); + +void hfi1_ud_rcv(struct hfi1_packet *packet); + +int hfi1_lookup_pkey_idx(struct hfi1_ibport *ibp, u16 pkey); + +int hfi1_rvt_get_rwqe(struct rvt_qp *qp, int wr_id_only); + +void hfi1_migrate_qp(struct rvt_qp *qp); + +int hfi1_check_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); + +void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); + +int hfi1_check_send_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe); + +extern const u32 rc_only_opcode; +extern const u32 uc_only_opcode; + +static inline u8 get_opcode(struct hfi1_ib_header *h) +{ + u16 lnh = be16_to_cpu(h->lrh[0]) & 3; + + if (lnh == IB_LNH_IBA_LOCAL) + return be32_to_cpu(h->u.oth.bth[0]) >> 24; + else + return be32_to_cpu(h->u.l.oth.bth[0]) >> 24; +} + +int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct hfi1_ib_header *hdr, + int has_grh, struct rvt_qp *qp, u32 bth0); + +u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr, + struct ib_global_route *grh, u32 hwords, u32 nwords); + +void hfi1_make_ruc_header(struct rvt_qp *qp, struct hfi1_other_headers *ohdr, + u32 bth0, u32 bth2, int middle, + struct hfi1_pkt_state *ps); + +void _hfi1_do_send(struct work_struct *work); + +void hfi1_do_send(struct rvt_qp *qp); + +void hfi1_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe, + enum ib_wc_status status); + +void hfi1_send_rc_ack(struct hfi1_ctxtdata *, struct rvt_qp *qp, int is_fecn); + +int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps); + +int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps); + +int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps); + +int hfi1_register_ib_device(struct hfi1_devdata *); + +void hfi1_unregister_ib_device(struct hfi1_devdata *); + +void hfi1_ib_rcv(struct hfi1_packet *packet); + +unsigned hfi1_get_npkeys(struct hfi1_devdata *); + +int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps, + u64 pbc); + +int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps, + u64 pbc); + +int hfi1_wss_init(void); +void hfi1_wss_exit(void); + +/* platform specific: return the lowest level cache (llc) size, in KiB */ +static inline int wss_llc_size(void) +{ + /* assume that the boot CPU value is universal for all CPUs */ + return boot_cpu_data.x86_cache_size; +} + +/* platform specific: cacheless copy */ +static inline void cacheless_memcpy(void *dst, void *src, size_t n) +{ + /* + * Use the only available X64 cacheless copy. Add a __user cast + * to quiet sparse. The src agument is already in the kernel so + * there are no security issues. The extra fault recovery machinery + * is not invoked. + */ + __copy_user_nocache(dst, (void __user *)src, n, 0); +} + +extern const enum ib_wc_opcode ib_hfi1_wc_opcode[]; + +extern const u8 hdr_len_by_opcode[]; + +extern const int ib_rvt_state_ops[]; + +extern __be64 ib_hfi1_sys_image_guid; /* in network order */ + +extern unsigned int hfi1_max_cqes; + +extern unsigned int hfi1_max_cqs; + +extern unsigned int hfi1_max_qp_wrs; + +extern unsigned int hfi1_max_qps; + +extern unsigned int hfi1_max_sges; + +extern unsigned int hfi1_max_mcast_grps; + +extern unsigned int hfi1_max_mcast_qp_attached; + +extern unsigned int hfi1_max_srqs; + +extern unsigned int hfi1_max_srq_sges; + +extern unsigned int hfi1_max_srq_wrs; + +extern unsigned short piothreshold; + +extern const u32 ib_hfi1_rnr_table[]; + +#endif /* HFI1_VERBS_H */ diff --git a/drivers/infiniband/hw/hfi1/verbs_txreq.c b/drivers/infiniband/hw/hfi1/verbs_txreq.c new file mode 100644 index 000000000..d8fb05652 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/verbs_txreq.c @@ -0,0 +1,147 @@ +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include "hfi.h" +#include "verbs_txreq.h" +#include "qp.h" +#include "trace.h" + +#define TXREQ_LEN 24 + +void hfi1_put_txreq(struct verbs_txreq *tx) +{ + struct hfi1_ibdev *dev; + struct rvt_qp *qp; + unsigned long flags; + unsigned int seq; + struct hfi1_qp_priv *priv; + + qp = tx->qp; + dev = to_idev(qp->ibqp.device); + + if (tx->mr) + rvt_put_mr(tx->mr); + + sdma_txclean(dd_from_dev(dev), &tx->txreq); + + /* Free verbs_txreq and return to slab cache */ + kmem_cache_free(dev->verbs_txreq_cache, tx); + + do { + seq = read_seqbegin(&dev->iowait_lock); + if (!list_empty(&dev->txwait)) { + struct iowait *wait; + + write_seqlock_irqsave(&dev->iowait_lock, flags); + wait = list_first_entry(&dev->txwait, struct iowait, + list); + qp = iowait_to_qp(wait); + priv = qp->priv; + list_del_init(&priv->s_iowait.list); + /* refcount held until actual wake up */ + write_sequnlock_irqrestore(&dev->iowait_lock, flags); + hfi1_qp_wakeup(qp, RVT_S_WAIT_TX); + break; + } + } while (read_seqretry(&dev->iowait_lock, seq)); +} + +struct verbs_txreq *__get_txreq(struct hfi1_ibdev *dev, + struct rvt_qp *qp) + __must_hold(&qp->s_lock) +{ + struct verbs_txreq *tx = ERR_PTR(-EBUSY); + + write_seqlock(&dev->iowait_lock); + if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) { + struct hfi1_qp_priv *priv; + + tx = kmem_cache_alloc(dev->verbs_txreq_cache, GFP_ATOMIC); + if (tx) + goto out; + priv = qp->priv; + if (list_empty(&priv->s_iowait.list)) { + dev->n_txwait++; + qp->s_flags |= RVT_S_WAIT_TX; + list_add_tail(&priv->s_iowait.list, &dev->txwait); + trace_hfi1_qpsleep(qp, RVT_S_WAIT_TX); + atomic_inc(&qp->refcount); + } + qp->s_flags &= ~RVT_S_BUSY; + } +out: + write_sequnlock(&dev->iowait_lock); + return tx; +} + +static void verbs_txreq_kmem_cache_ctor(void *obj) +{ + struct verbs_txreq *tx = (struct verbs_txreq *)obj; + + memset(tx, 0, sizeof(*tx)); +} + +int verbs_txreq_init(struct hfi1_ibdev *dev) +{ + char buf[TXREQ_LEN]; + struct hfi1_devdata *dd = dd_from_dev(dev); + + snprintf(buf, sizeof(buf), "hfi1_%u_vtxreq_cache", dd->unit); + dev->verbs_txreq_cache = kmem_cache_create(buf, + sizeof(struct verbs_txreq), + 0, SLAB_HWCACHE_ALIGN, + verbs_txreq_kmem_cache_ctor); + if (!dev->verbs_txreq_cache) + return -ENOMEM; + return 0; +} + +void verbs_txreq_exit(struct hfi1_ibdev *dev) +{ + kmem_cache_destroy(dev->verbs_txreq_cache); + dev->verbs_txreq_cache = NULL; +} diff --git a/drivers/infiniband/hw/hfi1/verbs_txreq.h b/drivers/infiniband/hw/hfi1/verbs_txreq.h new file mode 100644 index 000000000..a1d6e0807 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/verbs_txreq.h @@ -0,0 +1,117 @@ +/* + * Copyright(c) 2016 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef HFI1_VERBS_TXREQ_H +#define HFI1_VERBS_TXREQ_H + +#include <linux/types.h> +#include <linux/slab.h> + +#include "verbs.h" +#include "sdma_txreq.h" +#include "iowait.h" + +struct verbs_txreq { + struct hfi1_pio_header phdr; + struct sdma_txreq txreq; + struct rvt_qp *qp; + struct rvt_swqe *wqe; + struct rvt_mregion *mr; + struct rvt_sge_state *ss; + struct sdma_engine *sde; + struct send_context *psc; + u16 hdr_dwords; +}; + +struct hfi1_ibdev; +struct verbs_txreq *__get_txreq(struct hfi1_ibdev *dev, + struct rvt_qp *qp); + +static inline struct verbs_txreq *get_txreq(struct hfi1_ibdev *dev, + struct rvt_qp *qp) + __must_hold(&qp->slock) +{ + struct verbs_txreq *tx; + struct hfi1_qp_priv *priv = qp->priv; + + tx = kmem_cache_alloc(dev->verbs_txreq_cache, GFP_ATOMIC); + if (unlikely(!tx)) { + /* call slow path to get the lock */ + tx = __get_txreq(dev, qp); + if (IS_ERR(tx)) + return tx; + } + tx->qp = qp; + tx->mr = NULL; + tx->sde = priv->s_sde; + tx->psc = priv->s_sendcontext; + /* so that we can test if the sdma decriptors are there */ + tx->txreq.num_desc = 0; + return tx; +} + +static inline struct sdma_txreq *get_sdma_txreq(struct verbs_txreq *tx) +{ + return &tx->txreq; +} + +static inline struct verbs_txreq *get_waiting_verbs_txreq(struct rvt_qp *qp) +{ + struct sdma_txreq *stx; + struct hfi1_qp_priv *priv = qp->priv; + + stx = iowait_get_txhead(&priv->s_iowait); + if (stx) + return container_of(stx, struct verbs_txreq, txreq); + return NULL; +} + +void hfi1_put_txreq(struct verbs_txreq *tx); +int verbs_txreq_init(struct hfi1_ibdev *dev); +void verbs_txreq_exit(struct hfi1_ibdev *dev); + +#endif /* HFI1_VERBS_TXREQ_H */ diff --git a/drivers/infiniband/hw/i40iw/i40iw.h b/drivers/infiniband/hw/i40iw/i40iw.h index 819767681..b738acdb9 100644 --- a/drivers/infiniband/hw/i40iw/i40iw.h +++ b/drivers/infiniband/hw/i40iw/i40iw.h @@ -50,8 +50,6 @@ #include <rdma/ib_pack.h> #include <rdma/rdma_cm.h> #include <rdma/iw_cm.h> -#include <rdma/iw_portmap.h> -#include <rdma/rdma_netlink.h> #include <crypto/hash.h> #include "i40iw_status.h" @@ -115,6 +113,8 @@ #define IW_HMC_OBJ_TYPE_NUM ARRAY_SIZE(iw_hmc_obj_types) #define IW_CFG_FPM_QP_COUNT 32768 +#define I40IW_MAX_PAGES_PER_FMR 512 +#define I40IW_MIN_PAGES_PER_FMR 1 #define I40IW_MTU_TO_MSS 40 #define I40IW_DEFAULT_MSS 1460 @@ -254,6 +254,7 @@ struct i40iw_device { u32 arp_table_size; u32 next_arp_index; spinlock_t resource_lock; /* hw resource access */ + spinlock_t qptable_lock; u32 vendor_id; u32 vendor_part_id; u32 of_device_registered; @@ -392,7 +393,7 @@ void i40iw_flush_wqes(struct i40iw_device *iwdev, void i40iw_manage_arp_cache(struct i40iw_device *iwdev, unsigned char *mac_addr, - __be32 *ip_addr, + u32 *ip_addr, bool ipv4, u32 action); @@ -550,7 +551,7 @@ enum i40iw_status_code i40iw_hw_flush_wqes(struct i40iw_device *iwdev, struct i40iw_qp_flush_info *info, bool wait); -void i40iw_copy_ip_ntohl(u32 *dst, u32 *src); +void i40iw_copy_ip_ntohl(u32 *dst, __be32 *src); struct ib_mr *i40iw_reg_phys_mr(struct ib_pd *ib_pd, u64 addr, u64 size, diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c b/drivers/infiniband/hw/i40iw/i40iw_cm.c index 38f917a6c..d2fa72516 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_cm.c +++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c @@ -771,6 +771,7 @@ static void i40iw_build_mpa_v2(struct i40iw_cm_node *cm_node, { struct ietf_mpa_v2 *mpa_frame = (struct ietf_mpa_v2 *)start_addr; struct ietf_rtr_msg *rtr_msg = &mpa_frame->rtr_msg; + u16 ctrl_ird, ctrl_ord; /* initialize the upper 5 bytes of the frame */ i40iw_build_mpa_v1(cm_node, start_addr, mpa_key); @@ -779,38 +780,38 @@ static void i40iw_build_mpa_v2(struct i40iw_cm_node *cm_node, /* initialize RTR msg */ if (cm_node->mpav2_ird_ord == IETF_NO_IRD_ORD) { - rtr_msg->ctrl_ird = IETF_NO_IRD_ORD; - rtr_msg->ctrl_ord = IETF_NO_IRD_ORD; + ctrl_ird = IETF_NO_IRD_ORD; + ctrl_ord = IETF_NO_IRD_ORD; } else { - rtr_msg->ctrl_ird = (cm_node->ird_size > IETF_NO_IRD_ORD) ? + ctrl_ird = (cm_node->ird_size > IETF_NO_IRD_ORD) ? IETF_NO_IRD_ORD : cm_node->ird_size; - rtr_msg->ctrl_ord = (cm_node->ord_size > IETF_NO_IRD_ORD) ? + ctrl_ord = (cm_node->ord_size > IETF_NO_IRD_ORD) ? IETF_NO_IRD_ORD : cm_node->ord_size; } - rtr_msg->ctrl_ird |= IETF_PEER_TO_PEER; - rtr_msg->ctrl_ird |= IETF_FLPDU_ZERO_LEN; + ctrl_ird |= IETF_PEER_TO_PEER; + ctrl_ird |= IETF_FLPDU_ZERO_LEN; switch (mpa_key) { case MPA_KEY_REQUEST: - rtr_msg->ctrl_ord |= IETF_RDMA0_WRITE; - rtr_msg->ctrl_ord |= IETF_RDMA0_READ; + ctrl_ord |= IETF_RDMA0_WRITE; + ctrl_ord |= IETF_RDMA0_READ; break; case MPA_KEY_REPLY: switch (cm_node->send_rdma0_op) { case SEND_RDMA_WRITE_ZERO: - rtr_msg->ctrl_ord |= IETF_RDMA0_WRITE; + ctrl_ord |= IETF_RDMA0_WRITE; break; case SEND_RDMA_READ_ZERO: - rtr_msg->ctrl_ord |= IETF_RDMA0_READ; + ctrl_ord |= IETF_RDMA0_READ; break; } break; default: break; } - rtr_msg->ctrl_ird = htons(rtr_msg->ctrl_ird); - rtr_msg->ctrl_ord = htons(rtr_msg->ctrl_ord); + rtr_msg->ctrl_ird = htons(ctrl_ird); + rtr_msg->ctrl_ord = htons(ctrl_ord); } /** @@ -2107,7 +2108,7 @@ static bool i40iw_ipv6_is_loopback(u32 *loc_addr, u32 *rem_addr) struct in6_addr raddr6; i40iw_copy_ip_htonl(raddr6.in6_u.u6_addr32, rem_addr); - return (!memcmp(loc_addr, rem_addr, 16) || ipv6_addr_loopback(&raddr6)); + return !memcmp(loc_addr, rem_addr, 16) || ipv6_addr_loopback(&raddr6); } /** @@ -2160,7 +2161,7 @@ static struct i40iw_cm_node *i40iw_make_cm_node( cm_node->tcp_cntxt.rcv_wnd = I40IW_CM_DEFAULT_RCV_WND_SCALED >> I40IW_CM_DEFAULT_RCV_WND_SCALE; ts = current_kernel_time(); - cm_node->tcp_cntxt.loc_seq_num = htonl(ts.tv_nsec); + cm_node->tcp_cntxt.loc_seq_num = ts.tv_nsec; cm_node->tcp_cntxt.mss = iwdev->mss; cm_node->iwdev = iwdev; @@ -2234,7 +2235,7 @@ static void i40iw_rem_ref_cm_node(struct i40iw_cm_node *cm_node) if (cm_node->listener) { i40iw_dec_refcnt_listen(cm_core, cm_node->listener, 0, true); } else { - if (!i40iw_listen_port_in_use(cm_core, htons(cm_node->loc_port)) && + if (!i40iw_listen_port_in_use(cm_core, cm_node->loc_port) && cm_node->apbvt_set && cm_node->iwdev) { i40iw_manage_apbvt(cm_node->iwdev, cm_node->loc_port, @@ -2852,7 +2853,6 @@ static struct i40iw_cm_node *i40iw_create_cm_node( void *private_data, struct i40iw_cm_info *cm_info) { - int ret; struct i40iw_cm_node *cm_node; struct i40iw_cm_listener *loopback_remotelistener; struct i40iw_cm_node *loopback_remotenode; @@ -2922,30 +2922,6 @@ static struct i40iw_cm_node *i40iw_create_cm_node( memcpy(cm_node->pdata_buf, private_data, private_data_len); cm_node->state = I40IW_CM_STATE_SYN_SENT; - ret = i40iw_send_syn(cm_node, 0); - - if (ret) { - if (cm_node->ipv4) - i40iw_debug(cm_node->dev, - I40IW_DEBUG_CM, - "Api - connect() FAILED: dest addr=%pI4", - cm_node->rem_addr); - else - i40iw_debug(cm_node->dev, I40IW_DEBUG_CM, - "Api - connect() FAILED: dest addr=%pI6", - cm_node->rem_addr); - i40iw_rem_ref_cm_node(cm_node); - cm_node = NULL; - } - - if (cm_node) - i40iw_debug(cm_node->dev, - I40IW_DEBUG_CM, - "Api - connect(): port=0x%04x, cm_node=%p, cm_id = %p.\n", - cm_node->rem_port, - cm_node, - cm_node->cm_id); - return cm_node; } @@ -3266,11 +3242,13 @@ static void i40iw_init_tcp_ctx(struct i40iw_cm_node *cm_node, tcp_info->dest_ip_addr3 = cpu_to_le32(cm_node->rem_addr[0]); tcp_info->local_ipaddr3 = cpu_to_le32(cm_node->loc_addr[0]); - tcp_info->arp_idx = cpu_to_le32(i40iw_arp_table(iwqp->iwdev, - &tcp_info->dest_ip_addr3, - true, - NULL, - I40IW_ARP_RESOLVE)); + tcp_info->arp_idx = + cpu_to_le16((u16)i40iw_arp_table( + iwqp->iwdev, + &tcp_info->dest_ip_addr3, + true, + NULL, + I40IW_ARP_RESOLVE)); } else { tcp_info->src_port = cpu_to_le16(cm_node->loc_port); tcp_info->dst_port = cpu_to_le16(cm_node->rem_port); @@ -3282,12 +3260,13 @@ static void i40iw_init_tcp_ctx(struct i40iw_cm_node *cm_node, tcp_info->local_ipaddr1 = cpu_to_le32(cm_node->loc_addr[1]); tcp_info->local_ipaddr2 = cpu_to_le32(cm_node->loc_addr[2]); tcp_info->local_ipaddr3 = cpu_to_le32(cm_node->loc_addr[3]); - tcp_info->arp_idx = cpu_to_le32(i40iw_arp_table( - iwqp->iwdev, - &tcp_info->dest_ip_addr0, - false, - NULL, - I40IW_ARP_RESOLVE)); + tcp_info->arp_idx = + cpu_to_le16((u16)i40iw_arp_table( + iwqp->iwdev, + &tcp_info->dest_ip_addr0, + false, + NULL, + I40IW_ARP_RESOLVE)); } } @@ -3564,7 +3543,6 @@ int i40iw_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) struct i40iw_cm_node *cm_node; struct ib_qp_attr attr; int passive_state; - struct i40iw_ib_device *iwibdev; struct ib_mr *ibmr; struct i40iw_pd *iwpd; u16 buf_len = 0; @@ -3627,7 +3605,6 @@ int i40iw_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) !i40iw_ipv4_is_loopback(cm_node->loc_addr[0], cm_node->rem_addr[0])) || (!cm_node->ipv4 && !i40iw_ipv6_is_loopback(cm_node->loc_addr, cm_node->rem_addr))) { - iwibdev = iwdev->iwibdev; iwpd = iwqp->iwpd; tagged_offset = (uintptr_t)iwqp->ietf_mem.va; ibmr = i40iw_reg_phys_mr(&iwpd->ibpd, @@ -3752,6 +3729,7 @@ int i40iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) struct sockaddr_in *raddr; struct sockaddr_in6 *laddr6; struct sockaddr_in6 *raddr6; + bool qhash_set = false; int apbvt_set = 0; enum i40iw_status_code status; @@ -3810,6 +3788,7 @@ int i40iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) true); if (status) return -EINVAL; + qhash_set = true; } status = i40iw_manage_apbvt(iwdev, cm_info.loc_port, I40IW_MANAGE_APBVT_ADD); if (status) { @@ -3828,23 +3807,8 @@ int i40iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) conn_param->private_data_len, (void *)conn_param->private_data, &cm_info); - if (!cm_node) { - i40iw_manage_qhash(iwdev, - &cm_info, - I40IW_QHASH_TYPE_TCP_ESTABLISHED, - I40IW_QHASH_MANAGE_TYPE_DELETE, - NULL, - false); - - if (apbvt_set && !i40iw_listen_port_in_use(&iwdev->cm_core, - cm_info.loc_port)) - i40iw_manage_apbvt(iwdev, - cm_info.loc_port, - I40IW_MANAGE_APBVT_DEL); - cm_id->rem_ref(cm_id); - iwdev->cm_core.stats_connect_errs++; - return -ENOMEM; - } + if (!cm_node) + goto err; i40iw_record_ird_ord(cm_node, (u16)conn_param->ird, (u16)conn_param->ord); if (cm_node->send_rdma0_op == SEND_RDMA_READ_ZERO && @@ -3852,12 +3816,54 @@ int i40iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) cm_node->ord_size = 1; cm_node->apbvt_set = apbvt_set; - cm_node->qhash_set = true; + cm_node->qhash_set = qhash_set; iwqp->cm_node = cm_node; cm_node->iwqp = iwqp; iwqp->cm_id = cm_id; i40iw_add_ref(&iwqp->ibqp); + + if (cm_node->state == I40IW_CM_STATE_SYN_SENT) { + if (i40iw_send_syn(cm_node, 0)) { + i40iw_rem_ref_cm_node(cm_node); + goto err; + } + } + + i40iw_debug(cm_node->dev, + I40IW_DEBUG_CM, + "Api - connect(): port=0x%04x, cm_node=%p, cm_id = %p.\n", + cm_node->rem_port, + cm_node, + cm_node->cm_id); return 0; + +err: + if (cm_node) { + if (cm_node->ipv4) + i40iw_debug(cm_node->dev, + I40IW_DEBUG_CM, + "Api - connect() FAILED: dest addr=%pI4", + cm_node->rem_addr); + else + i40iw_debug(cm_node->dev, I40IW_DEBUG_CM, + "Api - connect() FAILED: dest addr=%pI6", + cm_node->rem_addr); + } + i40iw_manage_qhash(iwdev, + &cm_info, + I40IW_QHASH_TYPE_TCP_ESTABLISHED, + I40IW_QHASH_MANAGE_TYPE_DELETE, + NULL, + false); + + if (apbvt_set && !i40iw_listen_port_in_use(&iwdev->cm_core, + cm_info.loc_port)) + i40iw_manage_apbvt(iwdev, + cm_info.loc_port, + I40IW_MANAGE_APBVT_DEL); + cm_id->rem_ref(cm_id); + iwdev->cm_core.stats_connect_errs++; + return -ENOMEM; } /** diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.h b/drivers/infiniband/hw/i40iw/i40iw_cm.h index 5f8ceb4a8..e9046d9f9 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_cm.h +++ b/drivers/infiniband/hw/i40iw/i40iw_cm.h @@ -1,6 +1,6 @@ /******************************************************************************* * -* Copyright (c) 2015 Intel Corporation. All rights reserved. +* Copyright (c) 2015-2016 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -291,8 +291,6 @@ struct i40iw_cm_listener { u8 loc_mac[ETH_ALEN]; u32 loc_addr[4]; u16 loc_port; - u32 map_loc_addr[4]; - u16 map_loc_port; struct iw_cm_id *cm_id; atomic_t ref_count; struct i40iw_device *iwdev; @@ -317,8 +315,6 @@ struct i40iw_kmem_info { struct i40iw_cm_node { u32 loc_addr[4], rem_addr[4]; u16 loc_port, rem_port; - u32 map_loc_addr[4], map_rem_addr[4]; - u16 map_loc_port, map_rem_port; u16 vlan_id; enum i40iw_cm_node_state state; u8 loc_mac[ETH_ALEN]; @@ -370,10 +366,6 @@ struct i40iw_cm_info { u16 rem_port; u32 loc_addr[4]; u32 rem_addr[4]; - u16 map_loc_port; - u16 map_rem_port; - u32 map_loc_addr[4]; - u32 map_rem_addr[4]; u16 vlan_id; int backlog; u16 user_pri; diff --git a/drivers/infiniband/hw/i40iw/i40iw_ctrl.c b/drivers/infiniband/hw/i40iw/i40iw_ctrl.c index f05802bf6..2c4b4d072 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_ctrl.c +++ b/drivers/infiniband/hw/i40iw/i40iw_ctrl.c @@ -114,16 +114,21 @@ static enum i40iw_status_code i40iw_cqp_poll_registers( * i40iw_sc_parse_fpm_commit_buf - parse fpm commit buffer * @buf: ptr to fpm commit buffer * @info: ptr to i40iw_hmc_obj_info struct + * @sd: number of SDs for HMC objects * * parses fpm commit info and copy base value * of hmc objects in hmc_info */ static enum i40iw_status_code i40iw_sc_parse_fpm_commit_buf( u64 *buf, - struct i40iw_hmc_obj_info *info) + struct i40iw_hmc_obj_info *info, + u32 *sd) { u64 temp; + u64 size; + u64 base = 0; u32 i, j; + u32 k = 0; u32 low; /* copy base values in obj_info */ @@ -131,10 +136,20 @@ static enum i40iw_status_code i40iw_sc_parse_fpm_commit_buf( i <= I40IW_HMC_IW_PBLE; i++, j += 8) { get_64bit_val(buf, j, &temp); info[i].base = RS_64_1(temp, 32) * 512; + if (info[i].base > base) { + base = info[i].base; + k = i; + } low = (u32)(temp); if (low) info[i].cnt = low; } + size = info[k].cnt * info[k].size + info[k].base; + if (size & 0x1FFFFF) + *sd = (u32)((size >> 21) + 1); /* add 1 for remainder */ + else + *sd = (u32)(size >> 21); + return 0; } @@ -2909,6 +2924,65 @@ static enum i40iw_status_code i40iw_sc_mw_alloc( } /** + * i40iw_sc_mr_fast_register - Posts RDMA fast register mr WR to iwarp qp + * @qp: sc qp struct + * @info: fast mr info + * @post_sq: flag for cqp db to ring + */ +enum i40iw_status_code i40iw_sc_mr_fast_register( + struct i40iw_sc_qp *qp, + struct i40iw_fast_reg_stag_info *info, + bool post_sq) +{ + u64 temp, header; + u64 *wqe; + u32 wqe_idx; + + wqe = i40iw_qp_get_next_send_wqe(&qp->qp_uk, &wqe_idx, I40IW_QP_WQE_MIN_SIZE, + 0, info->wr_id); + if (!wqe) + return I40IW_ERR_QP_TOOMANY_WRS_POSTED; + + i40iw_debug(qp->dev, I40IW_DEBUG_MR, "%s: wr_id[%llxh] wqe_idx[%04d] location[%p]\n", + __func__, info->wr_id, wqe_idx, + &qp->qp_uk.sq_wrtrk_array[wqe_idx].wrid); + temp = (info->addr_type == I40IW_ADDR_TYPE_VA_BASED) ? (uintptr_t)info->va : info->fbo; + set_64bit_val(wqe, 0, temp); + + temp = RS_64(info->first_pm_pbl_index >> 16, I40IWQPSQ_FIRSTPMPBLIDXHI); + set_64bit_val(wqe, + 8, + LS_64(temp, I40IWQPSQ_FIRSTPMPBLIDXHI) | + LS_64(info->reg_addr_pa >> I40IWQPSQ_PBLADDR_SHIFT, I40IWQPSQ_PBLADDR)); + + set_64bit_val(wqe, + 16, + info->total_len | + LS_64(info->first_pm_pbl_index, I40IWQPSQ_FIRSTPMPBLIDXLO)); + + header = LS_64(info->stag_key, I40IWQPSQ_STAGKEY) | + LS_64(info->stag_idx, I40IWQPSQ_STAGINDEX) | + LS_64(I40IWQP_OP_FAST_REGISTER, I40IWQPSQ_OPCODE) | + LS_64(info->chunk_size, I40IWQPSQ_LPBLSIZE) | + LS_64(info->page_size, I40IWQPSQ_HPAGESIZE) | + LS_64(info->access_rights, I40IWQPSQ_STAGRIGHTS) | + LS_64(info->addr_type, I40IWQPSQ_VABASEDTO) | + LS_64(info->read_fence, I40IWQPSQ_READFENCE) | + LS_64(info->local_fence, I40IWQPSQ_LOCALFENCE) | + LS_64(info->signaled, I40IWQPSQ_SIGCOMPL) | + LS_64(qp->qp_uk.swqe_polarity, I40IWQPSQ_VALID); + + i40iw_insert_wqe_hdr(wqe, header); + + i40iw_debug_buf(qp->dev, I40IW_DEBUG_WQE, "FAST_REG WQE", + wqe, I40IW_QP_WQE_MIN_SIZE); + + if (post_sq) + i40iw_qp_post_wr(&qp->qp_uk); + return 0; +} + +/** * i40iw_sc_send_lsmm - send last streaming mode message * @qp: sc qp struct * @lsmm_buf: buffer with lsmm message @@ -3147,7 +3221,7 @@ enum i40iw_status_code i40iw_sc_init_iw_hmc(struct i40iw_sc_dev *dev, u8 hmc_fn_ i40iw_cqp_commit_fpm_values_cmd(dev, &query_fpm_mem, hmc_fn_id); /* parse the fpm_commit_buf and fill hmc obj info */ - i40iw_sc_parse_fpm_commit_buf((u64 *)query_fpm_mem.va, hmc_info->hmc_obj); + i40iw_sc_parse_fpm_commit_buf((u64 *)query_fpm_mem.va, hmc_info->hmc_obj, &hmc_info->sd_table.sd_cnt); mem_size = sizeof(struct i40iw_hmc_sd_entry) * (hmc_info->sd_table.sd_cnt + hmc_info->first_sd_index); ret_code = i40iw_allocate_virt_mem(dev->hw, &virt_mem, mem_size); @@ -3221,7 +3295,9 @@ static enum i40iw_status_code i40iw_sc_configure_iw_fpm(struct i40iw_sc_dev *dev /* parse the fpm_commit_buf and fill hmc obj info */ if (!ret_code) - ret_code = i40iw_sc_parse_fpm_commit_buf(dev->fpm_commit_buf, hmc_info->hmc_obj); + ret_code = i40iw_sc_parse_fpm_commit_buf(dev->fpm_commit_buf, + hmc_info->hmc_obj, + &hmc_info->sd_table.sd_cnt); i40iw_debug_buf(dev, I40IW_DEBUG_HMC, "COMMIT FPM BUFFER", commit_fpm_mem.va, I40IW_COMMIT_FPM_BUF_SIZE); @@ -3469,6 +3545,40 @@ static bool i40iw_ring_full(struct i40iw_sc_cqp *cqp) } /** + * i40iw_est_sd - returns approximate number of SDs for HMC + * @dev: sc device struct + * @hmc_info: hmc structure, size and count for HMC objects + */ +static u64 i40iw_est_sd(struct i40iw_sc_dev *dev, struct i40iw_hmc_info *hmc_info) +{ + int i; + u64 size = 0; + u64 sd; + + for (i = I40IW_HMC_IW_QP; i < I40IW_HMC_IW_PBLE; i++) + size += hmc_info->hmc_obj[i].cnt * hmc_info->hmc_obj[i].size; + + if (dev->is_pf) + size += hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].cnt * hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].size; + + if (size & 0x1FFFFF) + sd = (size >> 21) + 1; /* add 1 for remainder */ + else + sd = size >> 21; + + if (!dev->is_pf) { + /* 2MB alignment for VF PBLE HMC */ + size = hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].cnt * hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].size; + if (size & 0x1FFFFF) + sd += (size >> 21) + 1; /* add 1 for remainder */ + else + sd += size >> 21; + } + + return sd; +} + +/** * i40iw_config_fpm_values - configure HMC objects * @dev: sc device struct * @qp_count: desired qp count @@ -3479,7 +3589,7 @@ enum i40iw_status_code i40iw_config_fpm_values(struct i40iw_sc_dev *dev, u32 qp_ u32 i, mem_size; u32 qpwantedoriginal, qpwanted, mrwanted, pblewanted; u32 powerof2; - u64 sd_needed, bytes_needed; + u64 sd_needed; u32 loop_count = 0; struct i40iw_hmc_info *hmc_info; @@ -3497,23 +3607,15 @@ enum i40iw_status_code i40iw_config_fpm_values(struct i40iw_sc_dev *dev, u32 qp_ return ret_code; } - bytes_needed = 0; - for (i = I40IW_HMC_IW_QP; i < I40IW_HMC_IW_MAX; i++) { + for (i = I40IW_HMC_IW_QP; i < I40IW_HMC_IW_MAX; i++) hmc_info->hmc_obj[i].cnt = hmc_info->hmc_obj[i].max_cnt; - bytes_needed += - (hmc_info->hmc_obj[i].max_cnt) * (hmc_info->hmc_obj[i].size); - i40iw_debug(dev, I40IW_DEBUG_HMC, - "%s i[%04d] max_cnt[0x%04X] size[0x%04llx]\n", - __func__, i, hmc_info->hmc_obj[i].max_cnt, - hmc_info->hmc_obj[i].size); - } - sd_needed = (bytes_needed / I40IW_HMC_DIRECT_BP_SIZE) + 1; /* round up */ + sd_needed = i40iw_est_sd(dev, hmc_info); i40iw_debug(dev, I40IW_DEBUG_HMC, "%s: FW initial max sd_count[%08lld] first_sd_index[%04d]\n", __func__, sd_needed, hmc_info->first_sd_index); i40iw_debug(dev, I40IW_DEBUG_HMC, - "%s: bytes_needed=0x%llx sd count %d where max sd is %d\n", - __func__, bytes_needed, hmc_info->sd_table.sd_cnt, + "%s: sd count %d where max sd is %d\n", + __func__, hmc_info->sd_table.sd_cnt, hmc_fpm_misc->max_sds); qpwanted = min(qp_count, hmc_info->hmc_obj[I40IW_HMC_IW_QP].max_cnt); @@ -3555,11 +3657,7 @@ enum i40iw_status_code i40iw_config_fpm_values(struct i40iw_sc_dev *dev, u32 qp_ hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].cnt = pblewanted; /* How much memory is needed for all the objects. */ - bytes_needed = 0; - for (i = I40IW_HMC_IW_QP; i < I40IW_HMC_IW_MAX; i++) - bytes_needed += - (hmc_info->hmc_obj[i].cnt) * (hmc_info->hmc_obj[i].size); - sd_needed = (bytes_needed / I40IW_HMC_DIRECT_BP_SIZE) + 1; + sd_needed = i40iw_est_sd(dev, hmc_info); if ((loop_count > 1000) || ((!(loop_count % 10)) && (qpwanted > qpwantedoriginal * 2 / 3))) { @@ -3580,15 +3678,7 @@ enum i40iw_status_code i40iw_config_fpm_values(struct i40iw_sc_dev *dev, u32 qp_ pblewanted -= FPM_MULTIPLIER * 1000; } while (sd_needed > hmc_fpm_misc->max_sds && loop_count < 2000); - bytes_needed = 0; - for (i = I40IW_HMC_IW_QP; i < I40IW_HMC_IW_MAX; i++) { - bytes_needed += (hmc_info->hmc_obj[i].cnt) * (hmc_info->hmc_obj[i].size); - i40iw_debug(dev, I40IW_DEBUG_HMC, - "%s i[%04d] cnt[0x%04x] size[0x%04llx]\n", - __func__, i, hmc_info->hmc_obj[i].cnt, - hmc_info->hmc_obj[i].size); - } - sd_needed = (bytes_needed / I40IW_HMC_DIRECT_BP_SIZE) + 1; /* round up not truncate. */ + sd_needed = i40iw_est_sd(dev, hmc_info); i40iw_debug(dev, I40IW_DEBUG_HMC, "loop_cnt=%d, sd_needed=%lld, qpcnt = %d, cqcnt=%d, mrcnt=%d, pblecnt=%d\n", @@ -3606,8 +3696,6 @@ enum i40iw_status_code i40iw_config_fpm_values(struct i40iw_sc_dev *dev, u32 qp_ return ret_code; } - hmc_info->sd_table.sd_cnt = (u32)sd_needed; - mem_size = sizeof(struct i40iw_hmc_sd_entry) * (hmc_info->sd_table.sd_cnt + hmc_info->first_sd_index + 1); ret_code = i40iw_allocate_virt_mem(dev->hw, &virt_mem, mem_size); @@ -3911,11 +3999,11 @@ enum i40iw_status_code i40iw_process_bh(struct i40iw_sc_dev *dev) */ static u32 i40iw_iwarp_opcode(struct i40iw_aeqe_info *info, u8 *pkt) { - u16 *mpa; + __be16 *mpa; u32 opcode = 0xffffffff; if (info->q2_data_written) { - mpa = (u16 *)pkt; + mpa = (__be16 *)pkt; opcode = ntohs(mpa[1]) & 0xf; } return opcode; @@ -3977,7 +4065,7 @@ static int i40iw_bld_terminate_hdr(struct i40iw_sc_qp *qp, if (info->q2_data_written) { /* Use data from offending packet to fill in ddp & rdma hdrs */ pkt = i40iw_locate_mpa(pkt); - ddp_seg_len = ntohs(*(u16 *)pkt); + ddp_seg_len = ntohs(*(__be16 *)pkt); if (ddp_seg_len) { copy_len = 2; termhdr->hdrct = DDP_LEN_FLAG; @@ -4188,13 +4276,13 @@ void i40iw_terminate_connection(struct i40iw_sc_qp *qp, struct i40iw_aeqe_info * void i40iw_terminate_received(struct i40iw_sc_qp *qp, struct i40iw_aeqe_info *info) { u8 *pkt = qp->q2_buf + Q2_BAD_FRAME_OFFSET; - u32 *mpa; + __be32 *mpa; u8 ddp_ctl; u8 rdma_ctl; u16 aeq_id = 0; struct i40iw_terminate_hdr *termhdr; - mpa = (u32 *)i40iw_locate_mpa(pkt); + mpa = (__be32 *)i40iw_locate_mpa(pkt); if (info->q2_data_written) { /* did not validate the frame - do it now */ ddp_ctl = (ntohl(mpa[0]) >> 8) & 0xff; @@ -4559,17 +4647,18 @@ static struct i40iw_pd_ops iw_pd_ops = { }; static struct i40iw_priv_qp_ops iw_priv_qp_ops = { - i40iw_sc_qp_init, - i40iw_sc_qp_create, - i40iw_sc_qp_modify, - i40iw_sc_qp_destroy, - i40iw_sc_qp_flush_wqes, - i40iw_sc_qp_upload_context, - i40iw_sc_qp_setctx, - i40iw_sc_send_lsmm, - i40iw_sc_send_lsmm_nostag, - i40iw_sc_send_rtt, - i40iw_sc_post_wqe0, + .qp_init = i40iw_sc_qp_init, + .qp_create = i40iw_sc_qp_create, + .qp_modify = i40iw_sc_qp_modify, + .qp_destroy = i40iw_sc_qp_destroy, + .qp_flush_wqes = i40iw_sc_qp_flush_wqes, + .qp_upload_context = i40iw_sc_qp_upload_context, + .qp_setctx = i40iw_sc_qp_setctx, + .qp_send_lsmm = i40iw_sc_send_lsmm, + .qp_send_lsmm_nostag = i40iw_sc_send_lsmm_nostag, + .qp_send_rtt = i40iw_sc_send_rtt, + .qp_post_wqe0 = i40iw_sc_post_wqe0, + .iw_mr_fast_register = i40iw_sc_mr_fast_register }; static struct i40iw_priv_cq_ops iw_priv_cq_ops = { diff --git a/drivers/infiniband/hw/i40iw/i40iw_d.h b/drivers/infiniband/hw/i40iw/i40iw_d.h index aab88d65f..bd942da91 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_d.h +++ b/drivers/infiniband/hw/i40iw/i40iw_d.h @@ -1290,7 +1290,7 @@ /* wqe size considering 32 bytes per wqe*/ #define I40IWQP_SW_MIN_WQSIZE 4 /* 128 bytes */ -#define I40IWQP_SW_MAX_WQSIZE 16384 /* 524288 bytes */ +#define I40IWQP_SW_MAX_WQSIZE 2048 /* 2048 bytes */ #define I40IWQP_OP_RDMA_WRITE 0 #define I40IWQP_OP_RDMA_READ 1 @@ -1512,6 +1512,8 @@ enum i40iw_alignment { I40IW_SD_BUF_ALIGNMENT = 0x100 }; +#define I40IW_WQE_SIZE_64 64 + #define I40IW_QP_WQE_MIN_SIZE 32 #define I40IW_QP_WQE_MAX_SIZE 128 diff --git a/drivers/infiniband/hw/i40iw/i40iw_hw.c b/drivers/infiniband/hw/i40iw/i40iw_hw.c index 9fd302425..3ee0cad96 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_hw.c +++ b/drivers/infiniband/hw/i40iw/i40iw_hw.c @@ -106,7 +106,9 @@ u32 i40iw_initialize_hw_resources(struct i40iw_device *iwdev) set_bit(2, iwdev->allocated_pds); spin_lock_init(&iwdev->resource_lock); - mrdrvbits = 24 - get_count_order(iwdev->max_mr); + spin_lock_init(&iwdev->qptable_lock); + /* stag index mask has a minimum of 14 bits */ + mrdrvbits = 24 - max(get_count_order(iwdev->max_mr), 14); iwdev->mr_stagmask = ~(((1 << mrdrvbits) - 1) << (32 - mrdrvbits)); return 0; } @@ -301,11 +303,15 @@ void i40iw_process_aeq(struct i40iw_device *iwdev) "%s ae_id = 0x%x bool qp=%d qp_id = %d\n", __func__, info->ae_id, info->qp, info->qp_cq_id); if (info->qp) { + spin_lock_irqsave(&iwdev->qptable_lock, flags); iwqp = iwdev->qp_table[info->qp_cq_id]; if (!iwqp) { + spin_unlock_irqrestore(&iwdev->qptable_lock, flags); i40iw_pr_err("qp_id %d is already freed\n", info->qp_cq_id); continue; } + i40iw_add_ref(&iwqp->ibqp); + spin_unlock_irqrestore(&iwdev->qptable_lock, flags); qp = &iwqp->sc_qp; spin_lock_irqsave(&iwqp->lock, flags); iwqp->hw_tcp_state = info->tcp_state; @@ -411,6 +417,8 @@ void i40iw_process_aeq(struct i40iw_device *iwdev) i40iw_terminate_connection(qp, info); break; } + if (info->qp) + i40iw_rem_ref(&iwqp->ibqp); } while (1); if (aeqcnt) @@ -460,7 +468,7 @@ int i40iw_manage_apbvt(struct i40iw_device *iwdev, u16 accel_local_port, bool ad */ void i40iw_manage_arp_cache(struct i40iw_device *iwdev, unsigned char *mac_addr, - __be32 *ip_addr, + u32 *ip_addr, bool ipv4, u32 action) { @@ -481,7 +489,7 @@ void i40iw_manage_arp_cache(struct i40iw_device *iwdev, cqp_info->cqp_cmd = OP_ADD_ARP_CACHE_ENTRY; info = &cqp_info->in.u.add_arp_cache_entry.info; memset(info, 0, sizeof(*info)); - info->arp_index = cpu_to_le32(arp_index); + info->arp_index = cpu_to_le16((u16)arp_index); info->permanent = true; ether_addr_copy(info->mac_addr, mac_addr); cqp_info->in.u.add_arp_cache_entry.scratch = (uintptr_t)cqp_request; diff --git a/drivers/infiniband/hw/i40iw/i40iw_main.c b/drivers/infiniband/hw/i40iw/i40iw_main.c index 90e5af217..6e9081380 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_main.c +++ b/drivers/infiniband/hw/i40iw/i40iw_main.c @@ -270,7 +270,6 @@ static void i40iw_disable_irq(struct i40iw_sc_dev *dev, i40iw_wr32(dev->hw, I40E_PFINT_DYN_CTLN(msix_vec->idx - 1), 0); else i40iw_wr32(dev->hw, I40E_VFINT_DYN_CTLN1(msix_vec->idx - 1), 0); - synchronize_irq(msix_vec->irq); free_irq(msix_vec->irq, dev_id); } @@ -601,8 +600,7 @@ static enum i40iw_status_code i40iw_create_cqp(struct i40iw_device *iwdev) cqp_init_info.scratch_array = cqp->scratch_array; status = dev->cqp_ops->cqp_init(dev->cqp, &cqp_init_info); if (status) { - i40iw_pr_err("cqp init status %d maj_err %d min_err %d\n", - status, maj_err, min_err); + i40iw_pr_err("cqp init status %d\n", status); goto exit; } status = dev->cqp_ops->cqp_create(dev->cqp, true, &maj_err, &min_err); @@ -1147,10 +1145,7 @@ static enum i40iw_status_code i40iw_alloc_set_mac_ipaddr(struct i40iw_device *iw if (!status) { status = i40iw_add_mac_ipaddr_entry(iwdev, macaddr, (u8)iwdev->mac_ip_table_idx); - if (!status) - status = i40iw_add_mac_ipaddr_entry(iwdev, macaddr, - (u8)iwdev->mac_ip_table_idx); - else + if (status) i40iw_del_macip_entry(iwdev, (u8)iwdev->mac_ip_table_idx); } return status; @@ -1165,7 +1160,7 @@ static void i40iw_add_ipv6_addr(struct i40iw_device *iwdev) struct net_device *ip_dev; struct inet6_dev *idev; struct inet6_ifaddr *ifp; - __be32 local_ipaddr6[4]; + u32 local_ipaddr6[4]; rcu_read_lock(); for_each_netdev_rcu(&init_net, ip_dev) { @@ -1512,6 +1507,7 @@ static enum i40iw_status_code i40iw_setup_init_state(struct i40iw_handler *hdl, I40IW_HMC_PROFILE_DEFAULT; iwdev->max_rdma_vfs = (iwdev->resource_profile != I40IW_HMC_PROFILE_DEFAULT) ? max_rdma_vfs : 0; + iwdev->max_enabled_vfs = iwdev->max_rdma_vfs; iwdev->netdev = ldev->netdev; hdl->client = client; iwdev->mss = (!ldev->params.mtu) ? I40IW_DEFAULT_MSS : ldev->params.mtu - I40IW_MTU_TO_MSS; @@ -1531,7 +1527,10 @@ static enum i40iw_status_code i40iw_setup_init_state(struct i40iw_handler *hdl, goto exit; iwdev->obj_next = iwdev->obj_mem; iwdev->push_mode = push_mode; + init_waitqueue_head(&iwdev->vchnl_waitq); + init_waitqueue_head(&dev->vf_reqs); + status = i40iw_initialize_dev(iwdev, ldev); exit: if (status) { @@ -1710,7 +1709,6 @@ static void i40iw_vf_reset(struct i40e_info *ldev, struct i40e_client *client, u for (i = 0; i < I40IW_MAX_PE_ENABLED_VF_COUNT; i++) { if (!dev->vf_dev[i] || (dev->vf_dev[i]->vf_id != vf_id)) continue; - /* free all resources allocated on behalf of vf */ tmp_vfdev = dev->vf_dev[i]; spin_lock_irqsave(&dev->dev_pestat.stats_lock, flags); @@ -1819,8 +1817,6 @@ static int i40iw_virtchnl_receive(struct i40e_info *ldev, dev = &hdl->device.sc_dev; iwdev = dev->back_dev; - i40iw_debug(dev, I40IW_DEBUG_VIRT, "msg %p, message length %u\n", msg, len); - if (dev->vchnl_if.vchnl_recv) { ret_code = dev->vchnl_if.vchnl_recv(dev, vf_id, msg, len); if (!dev->is_pf) { @@ -1832,6 +1828,39 @@ static int i40iw_virtchnl_receive(struct i40e_info *ldev, } /** + * i40iw_vf_clear_to_send - wait to send virtual channel message + * @dev: iwarp device * + * Wait for until virtual channel is clear + * before sending the next message + * + * Returns false if error + * Returns true if clear to send + */ +bool i40iw_vf_clear_to_send(struct i40iw_sc_dev *dev) +{ + struct i40iw_device *iwdev; + wait_queue_t wait; + + iwdev = dev->back_dev; + + if (!wq_has_sleeper(&dev->vf_reqs) && + (atomic_read(&iwdev->vchnl_msgs) == 0)) + return true; /* virtual channel is clear */ + + init_wait(&wait); + add_wait_queue_exclusive(&dev->vf_reqs, &wait); + + if (!wait_event_timeout(dev->vf_reqs, + (atomic_read(&iwdev->vchnl_msgs) == 0), + I40IW_VCHNL_EVENT_TIMEOUT)) + dev->vchnl_up = false; + + remove_wait_queue(&dev->vf_reqs, &wait); + + return dev->vchnl_up; +} + +/** * i40iw_virtchnl_send - send a message through the virtual channel * @dev: iwarp device * @vf_id: virtual function id associated with the message @@ -1848,22 +1877,20 @@ static enum i40iw_status_code i40iw_virtchnl_send(struct i40iw_sc_dev *dev, { struct i40iw_device *iwdev; struct i40e_info *ldev; - enum i40iw_status_code ret_code = I40IW_ERR_BAD_PTR; if (!dev || !dev->back_dev) - return ret_code; + return I40IW_ERR_BAD_PTR; iwdev = dev->back_dev; ldev = iwdev->ldev; if (ldev && ldev->ops && ldev->ops->virtchnl_send) - ret_code = ldev->ops->virtchnl_send(ldev, &i40iw_client, vf_id, msg, len); - - return ret_code; + return ldev->ops->virtchnl_send(ldev, &i40iw_client, vf_id, msg, len); + return I40IW_ERR_BAD_PTR; } /* client interface functions */ -static struct i40e_client_ops i40e_ops = { +static const struct i40e_client_ops i40e_ops = { .open = i40iw_open, .close = i40iw_close, .l2_param_change = i40iw_l2param_change, diff --git a/drivers/infiniband/hw/i40iw/i40iw_osdep.h b/drivers/infiniband/hw/i40iw/i40iw_osdep.h index 7e2049351..80f422bf3 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_osdep.h +++ b/drivers/infiniband/hw/i40iw/i40iw_osdep.h @@ -172,6 +172,7 @@ struct i40iw_hw; u8 __iomem *i40iw_get_hw_addr(void *dev); void i40iw_ieq_mpa_crc_ae(struct i40iw_sc_dev *dev, struct i40iw_sc_qp *qp); enum i40iw_status_code i40iw_vf_wait_vchnl_resp(struct i40iw_sc_dev *dev); +bool i40iw_vf_clear_to_send(struct i40iw_sc_dev *dev); enum i40iw_status_code i40iw_ieq_check_mpacrc(struct shash_desc *desc, void *addr, u32 length, u32 value); struct i40iw_sc_qp *i40iw_ieq_get_qp(struct i40iw_sc_dev *dev, struct i40iw_puda_buf *buf); diff --git a/drivers/infiniband/hw/i40iw/i40iw_pble.c b/drivers/infiniband/hw/i40iw/i40iw_pble.c index ded853d2f..85993dc44 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_pble.c +++ b/drivers/infiniband/hw/i40iw/i40iw_pble.c @@ -404,13 +404,14 @@ static enum i40iw_status_code add_pble_pool(struct i40iw_sc_dev *dev, sd_entry->u.pd_table.pd_page_addr.pa : sd_entry->u.bp.addr.pa; if (sd_entry->valid) return 0; - if (dev->is_pf) + if (dev->is_pf) { ret_code = i40iw_hmc_sd_one(dev, hmc_info->hmc_fn_id, sd_reg_val, idx->sd_idx, sd_entry->entry_type, true); - if (ret_code) { - i40iw_pr_err("cqp cmd failed for sd (pbles)\n"); - goto error; + if (ret_code) { + i40iw_pr_err("cqp cmd failed for sd (pbles)\n"); + goto error; + } } sd_entry->valid = true; diff --git a/drivers/infiniband/hw/i40iw/i40iw_puda.c b/drivers/infiniband/hw/i40iw/i40iw_puda.c index 8eb400d8a..e9c6e82af 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_puda.c +++ b/drivers/infiniband/hw/i40iw/i40iw_puda.c @@ -1194,7 +1194,7 @@ static enum i40iw_status_code i40iw_ieq_process_buf(struct i40iw_puda_rsrc *ieq, ioffset = (u16)(buf->data - (u8 *)buf->mem.va); while (datalen) { - fpdu_len = i40iw_ieq_get_fpdu_length(ntohs(*(u16 *)datap)); + fpdu_len = i40iw_ieq_get_fpdu_length(ntohs(*(__be16 *)datap)); if (fpdu_len > pfpdu->max_fpdu_data) { i40iw_debug(ieq->dev, I40IW_DEBUG_IEQ, "%s: error bad fpdu_len\n", __func__); diff --git a/drivers/infiniband/hw/i40iw/i40iw_status.h b/drivers/infiniband/hw/i40iw/i40iw_status.h index b0110c15e..91c421762 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_status.h +++ b/drivers/infiniband/hw/i40iw/i40iw_status.h @@ -95,6 +95,7 @@ enum i40iw_status_code { I40IW_ERR_INVALID_MAC_ADDR = -65, I40IW_ERR_BAD_STAG = -66, I40IW_ERR_CQ_COMPL_ERROR = -67, + I40IW_ERR_QUEUE_DESTROYED = -68 }; #endif diff --git a/drivers/infiniband/hw/i40iw/i40iw_type.h b/drivers/infiniband/hw/i40iw/i40iw_type.h index edb3a8c82..16cc61720 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_type.h +++ b/drivers/infiniband/hw/i40iw/i40iw_type.h @@ -479,16 +479,17 @@ struct i40iw_sc_dev { struct i40iw_virt_mem ieq_mem; struct i40iw_puda_rsrc *ieq; - struct i40iw_vf_cqp_ops *iw_vf_cqp_ops; + const struct i40iw_vf_cqp_ops *iw_vf_cqp_ops; struct i40iw_hmc_fpm_misc hmc_fpm_misc; u16 qs_handle; - u32 debug_mask; + u32 debug_mask; u16 exception_lan_queue; u8 hmc_fn_id; bool is_pf; bool vchnl_up; u8 vf_id; + wait_queue_head_t vf_reqs; u64 cqp_cmd_stats[OP_SIZE_CQP_STAT_ARRAY]; struct i40iw_vchnl_vf_msg_buffer vchnl_vf_msg_buf; u8 hw_rev; @@ -889,8 +890,8 @@ struct i40iw_qhash_table_info { u32 qp_num; u32 dest_ip[4]; u32 src_ip[4]; - u32 dest_port; - u32 src_port; + u16 dest_port; + u16 src_port; }; struct i40iw_local_mac_ipaddr_entry_info { @@ -1040,6 +1041,9 @@ struct i40iw_priv_qp_ops { void (*qp_send_lsmm_nostag)(struct i40iw_sc_qp *, void *, u32); void (*qp_send_rtt)(struct i40iw_sc_qp *, bool); enum i40iw_status_code (*qp_post_wqe0)(struct i40iw_sc_qp *, u8); + enum i40iw_status_code (*iw_mr_fast_register)(struct i40iw_sc_qp *, + struct i40iw_fast_reg_stag_info *, + bool); }; struct i40iw_priv_cq_ops { @@ -1108,7 +1112,7 @@ struct i40iw_hmc_ops { enum i40iw_status_code (*parse_fpm_query_buf)(u64 *, struct i40iw_hmc_info *, struct i40iw_hmc_fpm_misc *); enum i40iw_status_code (*configure_iw_fpm)(struct i40iw_sc_dev *, u8); - enum i40iw_status_code (*parse_fpm_commit_buf)(u64 *, struct i40iw_hmc_obj_info *); + enum i40iw_status_code (*parse_fpm_commit_buf)(u64 *, struct i40iw_hmc_obj_info *, u32 *sd); enum i40iw_status_code (*create_hmc_object)(struct i40iw_sc_dev *dev, struct i40iw_hmc_create_obj_info *); enum i40iw_status_code (*del_hmc_object)(struct i40iw_sc_dev *dev, diff --git a/drivers/infiniband/hw/i40iw/i40iw_uk.c b/drivers/infiniband/hw/i40iw/i40iw_uk.c index f78c3dc8b..e35faea88 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_uk.c +++ b/drivers/infiniband/hw/i40iw/i40iw_uk.c @@ -56,6 +56,9 @@ static enum i40iw_status_code i40iw_nop_1(struct i40iw_qp_uk *qp) wqe_idx = I40IW_RING_GETCURRENT_HEAD(qp->sq_ring); wqe = qp->sq_base[wqe_idx].elem; + + qp->sq_wrtrk_array[wqe_idx].wqe_size = I40IW_QP_WQE_MIN_SIZE; + peek_head = (qp->sq_ring.head + 1) % qp->sq_ring.size; wqe_0 = qp->sq_base[peek_head].elem; if (peek_head) @@ -130,7 +133,10 @@ static void i40iw_qp_ring_push_db(struct i40iw_qp_uk *qp, u32 wqe_idx) */ u64 *i40iw_qp_get_next_send_wqe(struct i40iw_qp_uk *qp, u32 *wqe_idx, - u8 wqe_size) + u8 wqe_size, + u32 total_size, + u64 wr_id + ) { u64 *wqe = NULL; u64 wqe_ptr; @@ -159,6 +165,17 @@ u64 *i40iw_qp_get_next_send_wqe(struct i40iw_qp_uk *qp, if (!*wqe_idx) qp->swqe_polarity = !qp->swqe_polarity; } + + if (((*wqe_idx & 3) == 1) && (wqe_size == I40IW_WQE_SIZE_64)) { + i40iw_nop_1(qp); + I40IW_RING_MOVE_HEAD(qp->sq_ring, ret_code); + if (ret_code) + return NULL; + *wqe_idx = I40IW_RING_GETCURRENT_HEAD(qp->sq_ring); + if (!*wqe_idx) + qp->swqe_polarity = !qp->swqe_polarity; + } + for (i = 0; i < wqe_size / I40IW_QP_WQE_MIN_SIZE; i++) { I40IW_RING_MOVE_HEAD(qp->sq_ring, ret_code); if (ret_code) @@ -169,8 +186,15 @@ u64 *i40iw_qp_get_next_send_wqe(struct i40iw_qp_uk *qp, peek_head = I40IW_RING_GETCURRENT_HEAD(qp->sq_ring); wqe_0 = qp->sq_base[peek_head].elem; - if (peek_head & 0x3) - wqe_0[3] = LS_64(!qp->swqe_polarity, I40IWQPSQ_VALID); + + if (((peek_head & 3) == 1) || ((peek_head & 3) == 3)) { + if (RS_64(wqe_0[3], I40IWQPSQ_VALID) != !qp->swqe_polarity) + wqe_0[3] = LS_64(!qp->swqe_polarity, I40IWQPSQ_VALID); + } + + qp->sq_wrtrk_array[*wqe_idx].wrid = wr_id; + qp->sq_wrtrk_array[*wqe_idx].wr_len = total_size; + qp->sq_wrtrk_array[*wqe_idx].wqe_size = wqe_size; return wqe; } @@ -249,12 +273,9 @@ static enum i40iw_status_code i40iw_rdma_write(struct i40iw_qp_uk *qp, if (ret_code) return ret_code; - wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size); + wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size, total_size, info->wr_id); if (!wqe) return I40IW_ERR_QP_TOOMANY_WRS_POSTED; - - qp->sq_wrtrk_array[wqe_idx].wrid = info->wr_id; - qp->sq_wrtrk_array[wqe_idx].wr_len = total_size; set_64bit_val(wqe, 16, LS_64(op_info->rem_addr.tag_off, I40IWQPSQ_FRAG_TO)); if (!op_info->rem_addr.stag) @@ -309,12 +330,9 @@ static enum i40iw_status_code i40iw_rdma_read(struct i40iw_qp_uk *qp, ret_code = i40iw_fragcnt_to_wqesize_sq(1, &wqe_size); if (ret_code) return ret_code; - wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size); + wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size, op_info->lo_addr.len, info->wr_id); if (!wqe) return I40IW_ERR_QP_TOOMANY_WRS_POSTED; - - qp->sq_wrtrk_array[wqe_idx].wrid = info->wr_id; - qp->sq_wrtrk_array[wqe_idx].wr_len = op_info->lo_addr.len; local_fence |= info->local_fence; set_64bit_val(wqe, 16, LS_64(op_info->rem_addr.tag_off, I40IWQPSQ_FRAG_TO)); @@ -366,13 +384,11 @@ static enum i40iw_status_code i40iw_send(struct i40iw_qp_uk *qp, if (ret_code) return ret_code; - wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size); + wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size, total_size, info->wr_id); if (!wqe) return I40IW_ERR_QP_TOOMANY_WRS_POSTED; read_fence |= info->read_fence; - qp->sq_wrtrk_array[wqe_idx].wrid = info->wr_id; - qp->sq_wrtrk_array[wqe_idx].wr_len = total_size; set_64bit_val(wqe, 16, 0); header = LS_64(stag_to_inv, I40IWQPSQ_REMSTAG) | LS_64(info->op_type, I40IWQPSQ_OPCODE) | @@ -427,13 +443,11 @@ static enum i40iw_status_code i40iw_inline_rdma_write(struct i40iw_qp_uk *qp, if (ret_code) return ret_code; - wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size); + wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size, op_info->len, info->wr_id); if (!wqe) return I40IW_ERR_QP_TOOMANY_WRS_POSTED; read_fence |= info->read_fence; - qp->sq_wrtrk_array[wqe_idx].wrid = info->wr_id; - qp->sq_wrtrk_array[wqe_idx].wr_len = op_info->len; set_64bit_val(wqe, 16, LS_64(op_info->rem_addr.tag_off, I40IWQPSQ_FRAG_TO)); @@ -507,14 +521,11 @@ static enum i40iw_status_code i40iw_inline_send(struct i40iw_qp_uk *qp, if (ret_code) return ret_code; - wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size); + wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, wqe_size, op_info->len, info->wr_id); if (!wqe) return I40IW_ERR_QP_TOOMANY_WRS_POSTED; read_fence |= info->read_fence; - - qp->sq_wrtrk_array[wqe_idx].wrid = info->wr_id; - qp->sq_wrtrk_array[wqe_idx].wr_len = op_info->len; header = LS_64(stag_to_inv, I40IWQPSQ_REMSTAG) | LS_64(info->op_type, I40IWQPSQ_OPCODE) | LS_64(op_info->len, I40IWQPSQ_INLINEDATALEN) | @@ -574,12 +585,9 @@ static enum i40iw_status_code i40iw_stag_local_invalidate(struct i40iw_qp_uk *qp op_info = &info->op.inv_local_stag; local_fence = info->local_fence; - wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, I40IW_QP_WQE_MIN_SIZE); + wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, I40IW_QP_WQE_MIN_SIZE, 0, info->wr_id); if (!wqe) return I40IW_ERR_QP_TOOMANY_WRS_POSTED; - - qp->sq_wrtrk_array[wqe_idx].wrid = info->wr_id; - qp->sq_wrtrk_array[wqe_idx].wr_len = 0; set_64bit_val(wqe, 0, 0); set_64bit_val(wqe, 8, LS_64(op_info->target_stag, I40IWQPSQ_LOCSTAG)); @@ -619,12 +627,9 @@ static enum i40iw_status_code i40iw_mw_bind(struct i40iw_qp_uk *qp, op_info = &info->op.bind_window; local_fence |= info->local_fence; - wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, I40IW_QP_WQE_MIN_SIZE); + wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, I40IW_QP_WQE_MIN_SIZE, 0, info->wr_id); if (!wqe) return I40IW_ERR_QP_TOOMANY_WRS_POSTED; - - qp->sq_wrtrk_array[wqe_idx].wrid = info->wr_id; - qp->sq_wrtrk_array[wqe_idx].wr_len = 0; set_64bit_val(wqe, 0, (uintptr_t)op_info->va); set_64bit_val(wqe, 8, LS_64(op_info->mr_stag, I40IWQPSQ_PARENTMRSTAG) | @@ -760,7 +765,7 @@ static enum i40iw_status_code i40iw_cq_poll_completion(struct i40iw_cq_uk *cq, enum i40iw_status_code ret_code2 = 0; bool move_cq_head = true; u8 polarity; - u8 addl_frag_cnt, addl_wqes = 0; + u8 addl_wqes = 0; if (cq->avoid_mem_cflct) cqe = (u64 *)I40IW_GET_CURRENT_EXTENDED_CQ_ELEMENT(cq); @@ -797,6 +802,10 @@ static enum i40iw_status_code i40iw_cq_poll_completion(struct i40iw_cq_uk *cq, info->is_srq = (bool)RS_64(qword3, I40IWCQ_SRQ); qp = (struct i40iw_qp_uk *)(unsigned long)comp_ctx; + if (!qp) { + ret_code = I40IW_ERR_QUEUE_DESTROYED; + goto exit; + } wqe_idx = (u32)RS_64(qword3, I40IW_CQ_WQEIDX); info->qp_handle = (i40iw_qp_handle)(unsigned long)qp; @@ -827,11 +836,8 @@ static enum i40iw_status_code i40iw_cq_poll_completion(struct i40iw_cq_uk *cq, info->op_type = (u8)RS_64(qword3, I40IWCQ_OP); sw_wqe = qp->sq_base[wqe_idx].elem; get_64bit_val(sw_wqe, 24, &wqe_qword); - addl_frag_cnt = - (u8)RS_64(wqe_qword, I40IWQPSQ_ADDFRAGCNT); - i40iw_fragcnt_to_wqesize_sq(addl_frag_cnt + 1, &addl_wqes); - addl_wqes = (addl_wqes / I40IW_QP_WQE_MIN_SIZE); + addl_wqes = qp->sq_wrtrk_array[wqe_idx].wqe_size / I40IW_QP_WQE_MIN_SIZE; I40IW_RING_SET_TAIL(qp->sq_ring, (wqe_idx + addl_wqes)); } else { do { @@ -843,9 +849,7 @@ static enum i40iw_status_code i40iw_cq_poll_completion(struct i40iw_cq_uk *cq, get_64bit_val(sw_wqe, 24, &wqe_qword); op_type = (u8)RS_64(wqe_qword, I40IWQPSQ_OPCODE); info->op_type = op_type; - addl_frag_cnt = (u8)RS_64(wqe_qword, I40IWQPSQ_ADDFRAGCNT); - i40iw_fragcnt_to_wqesize_sq(addl_frag_cnt + 1, &addl_wqes); - addl_wqes = (addl_wqes / I40IW_QP_WQE_MIN_SIZE); + addl_wqes = qp->sq_wrtrk_array[tail].wqe_size / I40IW_QP_WQE_MIN_SIZE; I40IW_RING_SET_TAIL(qp->sq_ring, (tail + addl_wqes)); if (op_type != I40IWQP_OP_NOP) { info->wr_id = qp->sq_wrtrk_array[tail].wrid; @@ -859,6 +863,7 @@ static enum i40iw_status_code i40iw_cq_poll_completion(struct i40iw_cq_uk *cq, ret_code = 0; +exit: if (!ret_code && (info->comp_status == I40IW_COMPL_STATUS_FLUSHED)) if (pring && (I40IW_RING_MORE_WORK(*pring))) @@ -893,19 +898,21 @@ static enum i40iw_status_code i40iw_cq_poll_completion(struct i40iw_cq_uk *cq, * i40iw_get_wqe_shift - get shift count for maximum wqe size * @wqdepth: depth of wq required. * @sge: Maximum Scatter Gather Elements wqe + * @inline_data: Maximum inline data size * @shift: Returns the shift needed based on sge * - * Shift can be used to left shift the wqe size based on sge. - * If sge, == 1, shift =0 (wqe_size of 32 bytes), for sge=2 and 3, shift =1 - * (64 bytes wqes) and 2 otherwise (128 bytes wqe). + * Shift can be used to left shift the wqe size based on number of SGEs and inlind data size. + * For 1 SGE or inline data <= 16, shift = 0 (wqe size of 32 bytes). + * For 2 or 3 SGEs or inline data <= 48, shift = 1 (wqe size of 64 bytes). + * Shift of 2 otherwise (wqe size of 128 bytes). */ -enum i40iw_status_code i40iw_get_wqe_shift(u32 wqdepth, u8 sge, u8 *shift) +enum i40iw_status_code i40iw_get_wqe_shift(u32 wqdepth, u32 sge, u32 inline_data, u8 *shift) { u32 size; *shift = 0; - if (sge > 1) - *shift = (sge < 4) ? 1 : 2; + if (sge > 1 || inline_data > 16) + *shift = (sge < 4 && inline_data <= 48) ? 1 : 2; /* check if wqdepth is multiple of 2 or not */ @@ -968,11 +975,11 @@ enum i40iw_status_code i40iw_qp_uk_init(struct i40iw_qp_uk *qp, if (info->max_rq_frag_cnt > I40IW_MAX_WQ_FRAGMENT_COUNT) return I40IW_ERR_INVALID_FRAG_COUNT; - ret_code = i40iw_get_wqe_shift(info->sq_size, info->max_sq_frag_cnt, &sqshift); + ret_code = i40iw_get_wqe_shift(info->sq_size, info->max_sq_frag_cnt, info->max_inline_data, &sqshift); if (ret_code) return ret_code; - ret_code = i40iw_get_wqe_shift(info->rq_size, info->max_rq_frag_cnt, &rqshift); + ret_code = i40iw_get_wqe_shift(info->rq_size, info->max_rq_frag_cnt, 0, &rqshift); if (ret_code) return ret_code; @@ -1097,12 +1104,9 @@ enum i40iw_status_code i40iw_nop(struct i40iw_qp_uk *qp, u64 header, *wqe; u32 wqe_idx; - wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, I40IW_QP_WQE_MIN_SIZE); + wqe = i40iw_qp_get_next_send_wqe(qp, &wqe_idx, I40IW_QP_WQE_MIN_SIZE, 0, wr_id); if (!wqe) return I40IW_ERR_QP_TOOMANY_WRS_POSTED; - - qp->sq_wrtrk_array[wqe_idx].wrid = wr_id; - qp->sq_wrtrk_array[wqe_idx].wr_len = 0; set_64bit_val(wqe, 0, 0); set_64bit_val(wqe, 8, 0); set_64bit_val(wqe, 16, 0); @@ -1125,7 +1129,7 @@ enum i40iw_status_code i40iw_nop(struct i40iw_qp_uk *qp, * @frag_cnt: number of fragments * @wqe_size: size of sq wqe returned */ -enum i40iw_status_code i40iw_fragcnt_to_wqesize_sq(u8 frag_cnt, u8 *wqe_size) +enum i40iw_status_code i40iw_fragcnt_to_wqesize_sq(u32 frag_cnt, u8 *wqe_size) { switch (frag_cnt) { case 0: @@ -1156,7 +1160,7 @@ enum i40iw_status_code i40iw_fragcnt_to_wqesize_sq(u8 frag_cnt, u8 *wqe_size) * @frag_cnt: number of fragments * @wqe_size: size of rq wqe returned */ -enum i40iw_status_code i40iw_fragcnt_to_wqesize_rq(u8 frag_cnt, u8 *wqe_size) +enum i40iw_status_code i40iw_fragcnt_to_wqesize_rq(u32 frag_cnt, u8 *wqe_size) { switch (frag_cnt) { case 0: diff --git a/drivers/infiniband/hw/i40iw/i40iw_user.h b/drivers/infiniband/hw/i40iw/i40iw_user.h index 5cd971bb8..4627646fe 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_user.h +++ b/drivers/infiniband/hw/i40iw/i40iw_user.h @@ -61,7 +61,7 @@ enum i40iw_device_capabilities_const { I40IW_MAX_CQ_SIZE = 1048575, I40IW_MAX_AEQ_ALLOCATE_COUNT = 255, I40IW_DB_ID_ZERO = 0, - I40IW_MAX_WQ_FRAGMENT_COUNT = 6, + I40IW_MAX_WQ_FRAGMENT_COUNT = 3, I40IW_MAX_SGE_RD = 1, I40IW_MAX_OUTBOUND_MESSAGE_SIZE = 2147483647, I40IW_MAX_INBOUND_MESSAGE_SIZE = 2147483647, @@ -70,8 +70,8 @@ enum i40iw_device_capabilities_const { I40IW_MAX_VF_FPM_ID = 47, I40IW_MAX_VF_PER_PF = 127, I40IW_MAX_SQ_PAYLOAD_SIZE = 2145386496, - I40IW_MAX_INLINE_DATA_SIZE = 112, - I40IW_MAX_PUSHMODE_INLINE_DATA_SIZE = 112, + I40IW_MAX_INLINE_DATA_SIZE = 48, + I40IW_MAX_PUSHMODE_INLINE_DATA_SIZE = 48, I40IW_MAX_IRD_SIZE = 32, I40IW_QPCTX_ENCD_MAXIRD = 3, I40IW_MAX_WQ_ENTRIES = 2048, @@ -102,6 +102,8 @@ enum i40iw_device_capabilities_const { #define I40IW_STAG_INDEX_FROM_STAG(stag) (((stag) && 0xFFFFFF00) >> 8) +#define I40IW_MAX_MR_SIZE 0x10000000000L + struct i40iw_qp_uk; struct i40iw_cq_uk; struct i40iw_srq_uk; @@ -198,7 +200,7 @@ enum i40iw_completion_notify { struct i40iw_post_send { i40iw_sgl sg_list; - u8 num_sges; + u32 num_sges; }; struct i40iw_post_inline_send { @@ -220,7 +222,7 @@ struct i40iw_post_inline_send_w_inv { struct i40iw_rdma_write { i40iw_sgl lo_sg_list; - u8 num_lo_sges; + u32 num_lo_sges; struct i40iw_sge rem_addr; }; @@ -345,7 +347,9 @@ struct i40iw_dev_uk { struct i40iw_sq_uk_wr_trk_info { u64 wrid; - u64 wr_len; + u32 wr_len; + u8 wqe_size; + u8 reserved[3]; }; struct i40iw_qp_quanta { @@ -367,6 +371,8 @@ struct i40iw_qp_uk { u32 qp_id; u32 sq_size; u32 rq_size; + u32 max_sq_frag_cnt; + u32 max_rq_frag_cnt; struct i40iw_qp_uk_ops ops; bool use_srq; u8 swqe_polarity; @@ -374,8 +380,6 @@ struct i40iw_qp_uk { u8 rwqe_polarity; u8 rq_wqe_size; u8 rq_wqe_size_multiplier; - u8 max_sq_frag_cnt; - u8 max_rq_frag_cnt; bool deferred_flag; }; @@ -404,8 +408,9 @@ struct i40iw_qp_uk_init_info { u32 qp_id; u32 sq_size; u32 rq_size; - u8 max_sq_frag_cnt; - u8 max_rq_frag_cnt; + u32 max_sq_frag_cnt; + u32 max_rq_frag_cnt; + u32 max_inline_data; }; @@ -422,7 +427,10 @@ void i40iw_device_init_uk(struct i40iw_dev_uk *dev); void i40iw_qp_post_wr(struct i40iw_qp_uk *qp); u64 *i40iw_qp_get_next_send_wqe(struct i40iw_qp_uk *qp, u32 *wqe_idx, - u8 wqe_size); + u8 wqe_size, + u32 total_size, + u64 wr_id + ); u64 *i40iw_qp_get_next_recv_wqe(struct i40iw_qp_uk *qp, u32 *wqe_idx); u64 *i40iw_qp_get_next_srq_wqe(struct i40iw_srq_uk *srq, u32 *wqe_idx); @@ -434,9 +442,9 @@ enum i40iw_status_code i40iw_qp_uk_init(struct i40iw_qp_uk *qp, void i40iw_clean_cq(void *queue, struct i40iw_cq_uk *cq); enum i40iw_status_code i40iw_nop(struct i40iw_qp_uk *qp, u64 wr_id, bool signaled, bool post_sq); -enum i40iw_status_code i40iw_fragcnt_to_wqesize_sq(u8 frag_cnt, u8 *wqe_size); -enum i40iw_status_code i40iw_fragcnt_to_wqesize_rq(u8 frag_cnt, u8 *wqe_size); +enum i40iw_status_code i40iw_fragcnt_to_wqesize_sq(u32 frag_cnt, u8 *wqe_size); +enum i40iw_status_code i40iw_fragcnt_to_wqesize_rq(u32 frag_cnt, u8 *wqe_size); enum i40iw_status_code i40iw_inline_data_size_to_wqesize(u32 data_size, u8 *wqe_size); -enum i40iw_status_code i40iw_get_wqe_shift(u32 wqdepth, u8 sge, u8 *shift); +enum i40iw_status_code i40iw_get_wqe_shift(u32 wqdepth, u32 sge, u32 inline_data, u8 *shift); #endif diff --git a/drivers/infiniband/hw/i40iw/i40iw_utils.c b/drivers/infiniband/hw/i40iw/i40iw_utils.c index 1ceec81bd..0e8db0a35 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_utils.c +++ b/drivers/infiniband/hw/i40iw/i40iw_utils.c @@ -59,7 +59,7 @@ * @action: modify, delete or add */ int i40iw_arp_table(struct i40iw_device *iwdev, - __be32 *ip_addr, + u32 *ip_addr, bool ipv4, u8 *mac_addr, u32 action) @@ -152,7 +152,7 @@ int i40iw_inetaddr_event(struct notifier_block *notifier, struct net_device *upper_dev; struct i40iw_device *iwdev; struct i40iw_handler *hdl; - __be32 local_ipaddr; + u32 local_ipaddr; hdl = i40iw_find_netdev(event_netdev); if (!hdl) @@ -167,11 +167,10 @@ int i40iw_inetaddr_event(struct notifier_block *notifier, switch (event) { case NETDEV_DOWN: if (upper_dev) - local_ipaddr = - ((struct in_device *)upper_dev->ip_ptr)->ifa_list->ifa_address; + local_ipaddr = ntohl( + ((struct in_device *)upper_dev->ip_ptr)->ifa_list->ifa_address); else - local_ipaddr = ifa->ifa_address; - local_ipaddr = ntohl(local_ipaddr); + local_ipaddr = ntohl(ifa->ifa_address); i40iw_manage_arp_cache(iwdev, netdev->dev_addr, &local_ipaddr, @@ -180,11 +179,10 @@ int i40iw_inetaddr_event(struct notifier_block *notifier, return NOTIFY_OK; case NETDEV_UP: if (upper_dev) - local_ipaddr = - ((struct in_device *)upper_dev->ip_ptr)->ifa_list->ifa_address; + local_ipaddr = ntohl( + ((struct in_device *)upper_dev->ip_ptr)->ifa_list->ifa_address); else - local_ipaddr = ifa->ifa_address; - local_ipaddr = ntohl(local_ipaddr); + local_ipaddr = ntohl(ifa->ifa_address); i40iw_manage_arp_cache(iwdev, netdev->dev_addr, &local_ipaddr, @@ -194,12 +192,11 @@ int i40iw_inetaddr_event(struct notifier_block *notifier, case NETDEV_CHANGEADDR: /* Add the address to the IP table */ if (upper_dev) - local_ipaddr = - ((struct in_device *)upper_dev->ip_ptr)->ifa_list->ifa_address; + local_ipaddr = ntohl( + ((struct in_device *)upper_dev->ip_ptr)->ifa_list->ifa_address); else - local_ipaddr = ifa->ifa_address; + local_ipaddr = ntohl(ifa->ifa_address); - local_ipaddr = ntohl(local_ipaddr); i40iw_manage_arp_cache(iwdev, netdev->dev_addr, &local_ipaddr, @@ -227,7 +224,7 @@ int i40iw_inet6addr_event(struct notifier_block *notifier, struct net_device *netdev; struct i40iw_device *iwdev; struct i40iw_handler *hdl; - __be32 local_ipaddr6[4]; + u32 local_ipaddr6[4]; hdl = i40iw_find_netdev(event_netdev); if (!hdl) @@ -506,14 +503,19 @@ void i40iw_rem_ref(struct ib_qp *ibqp) struct cqp_commands_info *cqp_info; struct i40iw_device *iwdev; u32 qp_num; + unsigned long flags; iwqp = to_iwqp(ibqp); - if (!atomic_dec_and_test(&iwqp->refcount)) + iwdev = iwqp->iwdev; + spin_lock_irqsave(&iwdev->qptable_lock, flags); + if (!atomic_dec_and_test(&iwqp->refcount)) { + spin_unlock_irqrestore(&iwdev->qptable_lock, flags); return; + } - iwdev = iwqp->iwdev; qp_num = iwqp->ibqp.qp_num; iwdev->qp_table[qp_num] = NULL; + spin_unlock_irqrestore(&iwdev->qptable_lock, flags); cqp_request = i40iw_get_cqp_request(&iwdev->cqp, false); if (!cqp_request) return; @@ -985,21 +987,24 @@ enum i40iw_status_code i40iw_cqp_commit_fpm_values_cmd(struct i40iw_sc_dev *dev, enum i40iw_status_code i40iw_vf_wait_vchnl_resp(struct i40iw_sc_dev *dev) { struct i40iw_device *iwdev = dev->back_dev; - enum i40iw_status_code err_code = 0; int timeout_ret; i40iw_debug(dev, I40IW_DEBUG_VIRT, "%s[%u] dev %p, iwdev %p\n", __func__, __LINE__, dev, iwdev); - atomic_add(2, &iwdev->vchnl_msgs); + + atomic_set(&iwdev->vchnl_msgs, 2); timeout_ret = wait_event_timeout(iwdev->vchnl_waitq, (atomic_read(&iwdev->vchnl_msgs) == 1), I40IW_VCHNL_EVENT_TIMEOUT); atomic_dec(&iwdev->vchnl_msgs); if (!timeout_ret) { i40iw_pr_err("virt channel completion timeout = 0x%x\n", timeout_ret); - err_code = I40IW_ERR_TIMEOUT; + atomic_set(&iwdev->vchnl_msgs, 0); + dev->vchnl_up = false; + return I40IW_ERR_TIMEOUT; } - return err_code; + wake_up(&dev->vf_reqs); + return 0; } /** diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index 1fe3b84a0..283b64c94 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -63,8 +63,8 @@ static int i40iw_query_device(struct ib_device *ibdev, ether_addr_copy((u8 *)&props->sys_image_guid, iwdev->netdev->dev_addr); props->fw_ver = I40IW_FW_VERSION; props->device_cap_flags = iwdev->device_cap_flags; - props->vendor_id = iwdev->vendor_id; - props->vendor_part_id = iwdev->vendor_part_id; + props->vendor_id = iwdev->ldev->pcidev->vendor; + props->vendor_part_id = iwdev->ldev->pcidev->device; props->hw_ver = (u32)iwdev->sc_dev.hw_rev; props->max_mr_size = I40IW_MAX_OUTBOUND_MESSAGE_SIZE; props->max_qp = iwdev->max_qp; @@ -74,11 +74,12 @@ static int i40iw_query_device(struct ib_device *ibdev, props->max_cqe = iwdev->max_cqe; props->max_mr = iwdev->max_mr; props->max_pd = iwdev->max_pd; - props->max_sge_rd = 1; + props->max_sge_rd = I40IW_MAX_SGE_RD; props->max_qp_rd_atom = I40IW_MAX_IRD_SIZE; props->max_qp_init_rd_atom = props->max_qp_rd_atom; props->atomic_cap = IB_ATOMIC_NONE; props->max_map_per_fmr = 1; + props->max_fast_reg_page_list_len = I40IW_MAX_PAGES_PER_FMR; return 0; } @@ -120,7 +121,7 @@ static int i40iw_query_port(struct ib_device *ibdev, props->pkey_tbl_len = 1; props->active_width = IB_WIDTH_4X; props->active_speed = 1; - props->max_msg_sz = 0x80000000; + props->max_msg_sz = I40IW_MAX_OUTBOUND_MESSAGE_SIZE; return 0; } @@ -437,7 +438,6 @@ void i40iw_free_qp_resources(struct i40iw_device *iwdev, kfree(iwqp->kqp.wrid_mem); iwqp->kqp.wrid_mem = NULL; kfree(iwqp->allocated_buffer); - iwqp->allocated_buffer = NULL; } /** @@ -521,14 +521,12 @@ static int i40iw_setup_kmode_qp(struct i40iw_device *iwdev, enum i40iw_status_code status; struct i40iw_qp_uk_init_info *ukinfo = &info->qp_uk_init_info; - ukinfo->max_sq_frag_cnt = I40IW_MAX_WQ_FRAGMENT_COUNT; - sq_size = i40iw_qp_roundup(ukinfo->sq_size + 1); rq_size = i40iw_qp_roundup(ukinfo->rq_size + 1); - status = i40iw_get_wqe_shift(sq_size, ukinfo->max_sq_frag_cnt, &sqshift); + status = i40iw_get_wqe_shift(sq_size, ukinfo->max_sq_frag_cnt, ukinfo->max_inline_data, &sqshift); if (!status) - status = i40iw_get_wqe_shift(rq_size, ukinfo->max_rq_frag_cnt, &rqshift); + status = i40iw_get_wqe_shift(rq_size, ukinfo->max_rq_frag_cnt, 0, &rqshift); if (status) return -ENOSYS; @@ -609,6 +607,9 @@ static struct ib_qp *i40iw_create_qp(struct ib_pd *ibpd, if (init_attr->cap.max_inline_data > I40IW_MAX_INLINE_DATA_SIZE) init_attr->cap.max_inline_data = I40IW_MAX_INLINE_DATA_SIZE; + if (init_attr->cap.max_send_sge > I40IW_MAX_WQ_FRAGMENT_COUNT) + init_attr->cap.max_send_sge = I40IW_MAX_WQ_FRAGMENT_COUNT; + memset(&init_info, 0, sizeof(init_info)); sq_size = init_attr->cap.max_send_wr; @@ -618,6 +619,7 @@ static struct ib_qp *i40iw_create_qp(struct ib_pd *ibpd, init_info.qp_uk_init_info.rq_size = rq_size; init_info.qp_uk_init_info.max_sq_frag_cnt = init_attr->cap.max_send_sge; init_info.qp_uk_init_info.max_rq_frag_cnt = init_attr->cap.max_recv_sge; + init_info.qp_uk_init_info.max_inline_data = init_attr->cap.max_inline_data; mem = kzalloc(sizeof(*iwqp), GFP_KERNEL); if (!mem) @@ -722,8 +724,10 @@ static struct ib_qp *i40iw_create_qp(struct ib_pd *ibpd, iwarp_info = &iwqp->iwarp_info; iwarp_info->rd_enable = true; iwarp_info->wr_rdresp_en = true; - if (!iwqp->user_mode) + if (!iwqp->user_mode) { + iwarp_info->fast_reg_en = true; iwarp_info->priv_mode_en = true; + } iwarp_info->ddp_ver = 1; iwarp_info->rdmap_ver = 1; @@ -784,6 +788,8 @@ static struct ib_qp *i40iw_create_qp(struct ib_pd *ibpd, return ERR_PTR(err_code); } } + init_completion(&iwqp->sq_drained); + init_completion(&iwqp->rq_drained); return &iwqp->ibqp; error: @@ -1444,6 +1450,167 @@ static int i40iw_handle_q_mem(struct i40iw_device *iwdev, } /** + * i40iw_hw_alloc_stag - cqp command to allocate stag + * @iwdev: iwarp device + * @iwmr: iwarp mr pointer + */ +static int i40iw_hw_alloc_stag(struct i40iw_device *iwdev, struct i40iw_mr *iwmr) +{ + struct i40iw_allocate_stag_info *info; + struct i40iw_pd *iwpd = to_iwpd(iwmr->ibmr.pd); + enum i40iw_status_code status; + int err = 0; + struct i40iw_cqp_request *cqp_request; + struct cqp_commands_info *cqp_info; + + cqp_request = i40iw_get_cqp_request(&iwdev->cqp, true); + if (!cqp_request) + return -ENOMEM; + + cqp_info = &cqp_request->info; + info = &cqp_info->in.u.alloc_stag.info; + memset(info, 0, sizeof(*info)); + info->page_size = PAGE_SIZE; + info->stag_idx = iwmr->stag >> I40IW_CQPSQ_STAG_IDX_SHIFT; + info->pd_id = iwpd->sc_pd.pd_id; + info->total_len = iwmr->length; + info->remote_access = true; + cqp_info->cqp_cmd = OP_ALLOC_STAG; + cqp_info->post_sq = 1; + cqp_info->in.u.alloc_stag.dev = &iwdev->sc_dev; + cqp_info->in.u.alloc_stag.scratch = (uintptr_t)cqp_request; + + status = i40iw_handle_cqp_op(iwdev, cqp_request); + if (status) { + err = -ENOMEM; + i40iw_pr_err("CQP-OP MR Reg fail"); + } + return err; +} + +/** + * i40iw_alloc_mr - register stag for fast memory registration + * @pd: ibpd pointer + * @mr_type: memory for stag registrion + * @max_num_sg: man number of pages + */ +static struct ib_mr *i40iw_alloc_mr(struct ib_pd *pd, + enum ib_mr_type mr_type, + u32 max_num_sg) +{ + struct i40iw_pd *iwpd = to_iwpd(pd); + struct i40iw_device *iwdev = to_iwdev(pd->device); + struct i40iw_pble_alloc *palloc; + struct i40iw_pbl *iwpbl; + struct i40iw_mr *iwmr; + enum i40iw_status_code status; + u32 stag; + int err_code = -ENOMEM; + + iwmr = kzalloc(sizeof(*iwmr), GFP_KERNEL); + if (!iwmr) + return ERR_PTR(-ENOMEM); + + stag = i40iw_create_stag(iwdev); + if (!stag) { + err_code = -EOVERFLOW; + goto err; + } + iwmr->stag = stag; + iwmr->ibmr.rkey = stag; + iwmr->ibmr.lkey = stag; + iwmr->ibmr.pd = pd; + iwmr->ibmr.device = pd->device; + iwpbl = &iwmr->iwpbl; + iwpbl->iwmr = iwmr; + iwmr->type = IW_MEMREG_TYPE_MEM; + palloc = &iwpbl->pble_alloc; + iwmr->page_cnt = max_num_sg; + mutex_lock(&iwdev->pbl_mutex); + status = i40iw_get_pble(&iwdev->sc_dev, iwdev->pble_rsrc, palloc, iwmr->page_cnt); + mutex_unlock(&iwdev->pbl_mutex); + if (status) + goto err1; + + if (palloc->level != I40IW_LEVEL_1) + goto err2; + err_code = i40iw_hw_alloc_stag(iwdev, iwmr); + if (err_code) + goto err2; + iwpbl->pbl_allocated = true; + i40iw_add_pdusecount(iwpd); + return &iwmr->ibmr; +err2: + i40iw_free_pble(iwdev->pble_rsrc, palloc); +err1: + i40iw_free_stag(iwdev, stag); +err: + kfree(iwmr); + return ERR_PTR(err_code); +} + +/** + * i40iw_set_page - populate pbl list for fmr + * @ibmr: ib mem to access iwarp mr pointer + * @addr: page dma address fro pbl list + */ +static int i40iw_set_page(struct ib_mr *ibmr, u64 addr) +{ + struct i40iw_mr *iwmr = to_iwmr(ibmr); + struct i40iw_pbl *iwpbl = &iwmr->iwpbl; + struct i40iw_pble_alloc *palloc = &iwpbl->pble_alloc; + u64 *pbl; + + if (unlikely(iwmr->npages == iwmr->page_cnt)) + return -ENOMEM; + + pbl = (u64 *)palloc->level1.addr; + pbl[iwmr->npages++] = cpu_to_le64(addr); + return 0; +} + +/** + * i40iw_map_mr_sg - map of sg list for fmr + * @ibmr: ib mem to access iwarp mr pointer + * @sg: scatter gather list for fmr + * @sg_nents: number of sg pages + */ +static int i40iw_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, + int sg_nents, unsigned int *sg_offset) +{ + struct i40iw_mr *iwmr = to_iwmr(ibmr); + + iwmr->npages = 0; + return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, i40iw_set_page); +} + +/** + * i40iw_drain_sq - drain the send queue + * @ibqp: ib qp pointer + */ +static void i40iw_drain_sq(struct ib_qp *ibqp) +{ + struct i40iw_qp *iwqp = to_iwqp(ibqp); + struct i40iw_sc_qp *qp = &iwqp->sc_qp; + + if (I40IW_RING_MORE_WORK(qp->qp_uk.sq_ring)) + wait_for_completion(&iwqp->sq_drained); +} + +/** + * i40iw_drain_rq - drain the receive queue + * @ibqp: ib qp pointer + */ +static void i40iw_drain_rq(struct ib_qp *ibqp) +{ + struct i40iw_qp *iwqp = to_iwqp(ibqp); + struct i40iw_sc_qp *qp = &iwqp->sc_qp; + + if (I40IW_RING_MORE_WORK(qp->qp_uk.rq_ring)) + wait_for_completion(&iwqp->rq_drained); +} + +/** * i40iw_hwreg_mr - send cqp command for memory registration * @iwdev: iwarp device * @iwmr: iwarp mr pointer @@ -1526,14 +1693,16 @@ static struct ib_mr *i40iw_reg_user_mr(struct ib_pd *pd, struct i40iw_mr *iwmr; struct ib_umem *region; struct i40iw_mem_reg_req req; - u32 pbl_depth = 0; + u64 pbl_depth = 0; u32 stag = 0; u16 access; - u32 region_length; + u64 region_length; bool use_pbles = false; unsigned long flags; int err = -ENOSYS; + if (length > I40IW_MAX_MR_SIZE) + return ERR_PTR(-EINVAL); region = ib_umem_get(pd->uobject->context, start, length, acc, 0); if (IS_ERR(region)) return (struct ib_mr *)region; @@ -1564,7 +1733,7 @@ static struct ib_mr *i40iw_reg_user_mr(struct ib_pd *pd, palloc = &iwpbl->pble_alloc; iwmr->type = req.reg_type; - iwmr->page_cnt = pbl_depth; + iwmr->page_cnt = (u32)pbl_depth; switch (req.reg_type) { case IW_MEMREG_TYPE_QP: @@ -1881,12 +2050,14 @@ static int i40iw_post_send(struct ib_qp *ibqp, enum i40iw_status_code ret; int err = 0; unsigned long flags; + bool inv_stag; iwqp = (struct i40iw_qp *)ibqp; ukqp = &iwqp->sc_qp.qp_uk; spin_lock_irqsave(&iwqp->lock, flags); while (ib_wr) { + inv_stag = false; memset(&info, 0, sizeof(info)); info.wr_id = (u64)(ib_wr->wr_id); if ((ib_wr->send_flags & IB_SEND_SIGNALED) || iwqp->sig_all) @@ -1896,19 +2067,28 @@ static int i40iw_post_send(struct ib_qp *ibqp, switch (ib_wr->opcode) { case IB_WR_SEND: - if (ib_wr->send_flags & IB_SEND_SOLICITED) - info.op_type = I40IW_OP_TYPE_SEND_SOL; - else - info.op_type = I40IW_OP_TYPE_SEND; + /* fall-through */ + case IB_WR_SEND_WITH_INV: + if (ib_wr->opcode == IB_WR_SEND) { + if (ib_wr->send_flags & IB_SEND_SOLICITED) + info.op_type = I40IW_OP_TYPE_SEND_SOL; + else + info.op_type = I40IW_OP_TYPE_SEND; + } else { + if (ib_wr->send_flags & IB_SEND_SOLICITED) + info.op_type = I40IW_OP_TYPE_SEND_SOL_INV; + else + info.op_type = I40IW_OP_TYPE_SEND_INV; + } if (ib_wr->send_flags & IB_SEND_INLINE) { info.op.inline_send.data = (void *)(unsigned long)ib_wr->sg_list[0].addr; info.op.inline_send.len = ib_wr->sg_list[0].length; - ret = ukqp->ops.iw_inline_send(ukqp, &info, rdma_wr(ib_wr)->rkey, false); + ret = ukqp->ops.iw_inline_send(ukqp, &info, ib_wr->ex.invalidate_rkey, false); } else { info.op.send.num_sges = ib_wr->num_sge; info.op.send.sg_list = (struct i40iw_sge *)ib_wr->sg_list; - ret = ukqp->ops.iw_send(ukqp, &info, rdma_wr(ib_wr)->rkey, false); + ret = ukqp->ops.iw_send(ukqp, &info, ib_wr->ex.invalidate_rkey, false); } if (ret) @@ -1936,7 +2116,14 @@ static int i40iw_post_send(struct ib_qp *ibqp, if (ret) err = -EIO; break; + case IB_WR_RDMA_READ_WITH_INV: + inv_stag = true; + /* fall-through*/ case IB_WR_RDMA_READ: + if (ib_wr->num_sge > I40IW_MAX_SGE_RD) { + err = -EINVAL; + break; + } info.op_type = I40IW_OP_TYPE_RDMA_READ; info.op.rdma_read.rem_addr.tag_off = rdma_wr(ib_wr)->remote_addr; info.op.rdma_read.rem_addr.stag = rdma_wr(ib_wr)->rkey; @@ -1944,10 +2131,52 @@ static int i40iw_post_send(struct ib_qp *ibqp, info.op.rdma_read.lo_addr.tag_off = ib_wr->sg_list->addr; info.op.rdma_read.lo_addr.stag = ib_wr->sg_list->lkey; info.op.rdma_read.lo_addr.len = ib_wr->sg_list->length; - ret = ukqp->ops.iw_rdma_read(ukqp, &info, false, false); + ret = ukqp->ops.iw_rdma_read(ukqp, &info, inv_stag, false); if (ret) err = -EIO; break; + case IB_WR_LOCAL_INV: + info.op_type = I40IW_OP_TYPE_INV_STAG; + info.op.inv_local_stag.target_stag = ib_wr->ex.invalidate_rkey; + ret = ukqp->ops.iw_stag_local_invalidate(ukqp, &info, true); + if (ret) + err = -EIO; + break; + case IB_WR_REG_MR: + { + struct i40iw_mr *iwmr = to_iwmr(reg_wr(ib_wr)->mr); + int page_shift = ilog2(reg_wr(ib_wr)->mr->page_size); + int flags = reg_wr(ib_wr)->access; + struct i40iw_pble_alloc *palloc = &iwmr->iwpbl.pble_alloc; + struct i40iw_sc_dev *dev = &iwqp->iwdev->sc_dev; + struct i40iw_fast_reg_stag_info info; + + memset(&info, 0, sizeof(info)); + info.access_rights = I40IW_ACCESS_FLAGS_LOCALREAD; + info.access_rights |= i40iw_get_user_access(flags); + info.stag_key = reg_wr(ib_wr)->key & 0xff; + info.stag_idx = reg_wr(ib_wr)->key >> 8; + info.wr_id = ib_wr->wr_id; + + info.addr_type = I40IW_ADDR_TYPE_VA_BASED; + info.va = (void *)(uintptr_t)iwmr->ibmr.iova; + info.total_len = iwmr->ibmr.length; + info.reg_addr_pa = *(u64 *)palloc->level1.addr; + info.first_pm_pbl_index = palloc->level1.idx; + info.local_fence = ib_wr->send_flags & IB_SEND_FENCE; + info.signaled = ib_wr->send_flags & IB_SEND_SIGNALED; + + if (iwmr->npages > I40IW_MIN_PAGES_PER_FMR) + info.chunk_size = 1; + + if (page_shift == 21) + info.page_size = 1; /* 2M page */ + + ret = dev->iw_priv_qp_ops->iw_mr_fast_register(&iwqp->sc_qp, &info, true); + if (ret) + err = -EIO; + break; + } default: err = -EINVAL; i40iw_pr_err(" upost_send bad opcode = 0x%x\n", @@ -2027,6 +2256,7 @@ static int i40iw_poll_cq(struct ib_cq *ibcq, enum i40iw_status_code ret; struct i40iw_cq_uk *ukcq; struct i40iw_sc_qp *qp; + struct i40iw_qp *iwqp; unsigned long flags; iwcq = (struct i40iw_cq *)ibcq; @@ -2037,6 +2267,8 @@ static int i40iw_poll_cq(struct ib_cq *ibcq, ret = ukcq->ops.iw_cq_poll_completion(ukcq, &cq_poll_info, true); if (ret == I40IW_ERR_QUEUE_EMPTY) { break; + } else if (ret == I40IW_ERR_QUEUE_DESTROYED) { + continue; } else if (ret) { if (!cqe_count) cqe_count = -1; @@ -2044,10 +2276,12 @@ static int i40iw_poll_cq(struct ib_cq *ibcq, } entry->wc_flags = 0; entry->wr_id = cq_poll_info.wr_id; - if (!cq_poll_info.error) - entry->status = IB_WC_SUCCESS; - else + if (cq_poll_info.error) { entry->status = IB_WC_WR_FLUSH_ERR; + entry->vendor_err = cq_poll_info.major_err << 16 | cq_poll_info.minor_err; + } else { + entry->status = IB_WC_SUCCESS; + } switch (cq_poll_info.op_type) { case I40IW_OP_TYPE_RDMA_WRITE: @@ -2071,12 +2305,17 @@ static int i40iw_poll_cq(struct ib_cq *ibcq, break; } - entry->vendor_err = - cq_poll_info.major_err << 16 | cq_poll_info.minor_err; entry->ex.imm_data = 0; qp = (struct i40iw_sc_qp *)cq_poll_info.qp_handle; entry->qp = (struct ib_qp *)qp->back_qp; entry->src_qp = cq_poll_info.qp_id; + iwqp = (struct i40iw_qp *)qp->back_qp; + if (iwqp->iwarp_state > I40IW_QP_STATE_RTS) { + if (!I40IW_RING_MORE_WORK(qp->qp_uk.sq_ring)) + complete(&iwqp->sq_drained); + if (!I40IW_RING_MORE_WORK(qp->qp_uk.rq_ring)) + complete(&iwqp->rq_drained); + } entry->byte_len = cq_poll_info.bytes_xfered; entry++; cqe_count++; @@ -2095,13 +2334,16 @@ static int i40iw_req_notify_cq(struct ib_cq *ibcq, { struct i40iw_cq *iwcq; struct i40iw_cq_uk *ukcq; - enum i40iw_completion_notify cq_notify = IW_CQ_COMPL_SOLICITED; + unsigned long flags; + enum i40iw_completion_notify cq_notify = IW_CQ_COMPL_EVENT; iwcq = (struct i40iw_cq *)ibcq; ukcq = &iwcq->sc_cq.cq_uk; - if (notify_flags == IB_CQ_NEXT_COMP) - cq_notify = IW_CQ_COMPL_EVENT; + if (notify_flags == IB_CQ_SOLICITED) + cq_notify = IW_CQ_COMPL_SOLICITED; + spin_lock_irqsave(&iwcq->lock, flags); ukcq->ops.iw_cq_request_notification(ukcq, cq_notify); + spin_unlock_irqrestore(&iwcq->lock, flags); return 0; } @@ -2129,62 +2371,130 @@ static int i40iw_port_immutable(struct ib_device *ibdev, u8 port_num, return 0; } +static const char * const i40iw_hw_stat_names[] = { + // 32bit names + [I40IW_HW_STAT_INDEX_IP4RXDISCARD] = "ip4InDiscards", + [I40IW_HW_STAT_INDEX_IP4RXTRUNC] = "ip4InTruncatedPkts", + [I40IW_HW_STAT_INDEX_IP4TXNOROUTE] = "ip4OutNoRoutes", + [I40IW_HW_STAT_INDEX_IP6RXDISCARD] = "ip6InDiscards", + [I40IW_HW_STAT_INDEX_IP6RXTRUNC] = "ip6InTruncatedPkts", + [I40IW_HW_STAT_INDEX_IP6TXNOROUTE] = "ip6OutNoRoutes", + [I40IW_HW_STAT_INDEX_TCPRTXSEG] = "tcpRetransSegs", + [I40IW_HW_STAT_INDEX_TCPRXOPTERR] = "tcpInOptErrors", + [I40IW_HW_STAT_INDEX_TCPRXPROTOERR] = "tcpInProtoErrors", + // 64bit names + [I40IW_HW_STAT_INDEX_IP4RXOCTS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip4InOctets", + [I40IW_HW_STAT_INDEX_IP4RXPKTS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip4InPkts", + [I40IW_HW_STAT_INDEX_IP4RXFRAGS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip4InReasmRqd", + [I40IW_HW_STAT_INDEX_IP4RXMCPKTS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip4InMcastPkts", + [I40IW_HW_STAT_INDEX_IP4TXOCTS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip4OutOctets", + [I40IW_HW_STAT_INDEX_IP4TXPKTS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip4OutPkts", + [I40IW_HW_STAT_INDEX_IP4TXFRAGS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip4OutSegRqd", + [I40IW_HW_STAT_INDEX_IP4TXMCPKTS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip4OutMcastPkts", + [I40IW_HW_STAT_INDEX_IP6RXOCTS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip6InOctets", + [I40IW_HW_STAT_INDEX_IP6RXPKTS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip6InPkts", + [I40IW_HW_STAT_INDEX_IP6RXFRAGS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip6InReasmRqd", + [I40IW_HW_STAT_INDEX_IP6RXMCPKTS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip6InMcastPkts", + [I40IW_HW_STAT_INDEX_IP6TXOCTS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip6OutOctets", + [I40IW_HW_STAT_INDEX_IP6TXPKTS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip6OutPkts", + [I40IW_HW_STAT_INDEX_IP6TXFRAGS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip6OutSegRqd", + [I40IW_HW_STAT_INDEX_IP6TXMCPKTS + I40IW_HW_STAT_INDEX_MAX_32] = + "ip6OutMcastPkts", + [I40IW_HW_STAT_INDEX_TCPRXSEGS + I40IW_HW_STAT_INDEX_MAX_32] = + "tcpInSegs", + [I40IW_HW_STAT_INDEX_TCPTXSEG + I40IW_HW_STAT_INDEX_MAX_32] = + "tcpOutSegs", + [I40IW_HW_STAT_INDEX_RDMARXRDS + I40IW_HW_STAT_INDEX_MAX_32] = + "iwInRdmaReads", + [I40IW_HW_STAT_INDEX_RDMARXSNDS + I40IW_HW_STAT_INDEX_MAX_32] = + "iwInRdmaSends", + [I40IW_HW_STAT_INDEX_RDMARXWRS + I40IW_HW_STAT_INDEX_MAX_32] = + "iwInRdmaWrites", + [I40IW_HW_STAT_INDEX_RDMATXRDS + I40IW_HW_STAT_INDEX_MAX_32] = + "iwOutRdmaReads", + [I40IW_HW_STAT_INDEX_RDMATXSNDS + I40IW_HW_STAT_INDEX_MAX_32] = + "iwOutRdmaSends", + [I40IW_HW_STAT_INDEX_RDMATXWRS + I40IW_HW_STAT_INDEX_MAX_32] = + "iwOutRdmaWrites", + [I40IW_HW_STAT_INDEX_RDMAVBND + I40IW_HW_STAT_INDEX_MAX_32] = + "iwRdmaBnd", + [I40IW_HW_STAT_INDEX_RDMAVINV + I40IW_HW_STAT_INDEX_MAX_32] = + "iwRdmaInv" +}; + +/** + * i40iw_alloc_hw_stats - Allocate a hw stats structure + * @ibdev: device pointer from stack + * @port_num: port number + */ +static struct rdma_hw_stats *i40iw_alloc_hw_stats(struct ib_device *ibdev, + u8 port_num) +{ + struct i40iw_device *iwdev = to_iwdev(ibdev); + struct i40iw_sc_dev *dev = &iwdev->sc_dev; + int num_counters = I40IW_HW_STAT_INDEX_MAX_32 + + I40IW_HW_STAT_INDEX_MAX_64; + unsigned long lifespan = RDMA_HW_STATS_DEFAULT_LIFESPAN; + + BUILD_BUG_ON(ARRAY_SIZE(i40iw_hw_stat_names) != + (I40IW_HW_STAT_INDEX_MAX_32 + + I40IW_HW_STAT_INDEX_MAX_64)); + + /* + * PFs get the default update lifespan, but VFs only update once + * per second + */ + if (!dev->is_pf) + lifespan = 1000; + return rdma_alloc_hw_stats_struct(i40iw_hw_stat_names, num_counters, + lifespan); +} + /** - * i40iw_get_protocol_stats - Populates the rdma_stats structure - * @ibdev: ib dev struct - * @stats: iw protocol stats struct + * i40iw_get_hw_stats - Populates the rdma_hw_stats structure + * @ibdev: device pointer from stack + * @stats: stats pointer from stack + * @port_num: port number + * @index: which hw counter the stack is requesting we update */ -static int i40iw_get_protocol_stats(struct ib_device *ibdev, - union rdma_protocol_stats *stats) +static int i40iw_get_hw_stats(struct ib_device *ibdev, + struct rdma_hw_stats *stats, + u8 port_num, int index) { struct i40iw_device *iwdev = to_iwdev(ibdev); struct i40iw_sc_dev *dev = &iwdev->sc_dev; struct i40iw_dev_pestat *devstat = &dev->dev_pestat; struct i40iw_dev_hw_stats *hw_stats = &devstat->hw_stats; - struct timespec curr_time; - static struct timespec last_rd_time = {0, 0}; - enum i40iw_status_code status = 0; unsigned long flags; - curr_time = current_kernel_time(); - memset(stats, 0, sizeof(*stats)); - if (dev->is_pf) { spin_lock_irqsave(&devstat->stats_lock, flags); devstat->ops.iw_hw_stat_read_all(devstat, &devstat->hw_stats); spin_unlock_irqrestore(&devstat->stats_lock, flags); } else { - if (((u64)curr_time.tv_sec - (u64)last_rd_time.tv_sec) > 1) - status = i40iw_vchnl_vf_get_pe_stats(dev, - &devstat->hw_stats); - - if (status) + if (i40iw_vchnl_vf_get_pe_stats(dev, &devstat->hw_stats)) return -ENOSYS; } - stats->iw.ipInReceives = hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP4RXPKTS] + - hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP6RXPKTS]; - stats->iw.ipInTruncatedPkts = hw_stats->stat_value_32[I40IW_HW_STAT_INDEX_IP4RXTRUNC] + - hw_stats->stat_value_32[I40IW_HW_STAT_INDEX_IP6RXTRUNC]; - stats->iw.ipInDiscards = hw_stats->stat_value_32[I40IW_HW_STAT_INDEX_IP4RXDISCARD] + - hw_stats->stat_value_32[I40IW_HW_STAT_INDEX_IP6RXDISCARD]; - stats->iw.ipOutNoRoutes = hw_stats->stat_value_32[I40IW_HW_STAT_INDEX_IP4TXNOROUTE] + - hw_stats->stat_value_32[I40IW_HW_STAT_INDEX_IP6TXNOROUTE]; - stats->iw.ipReasmReqds = hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP4RXFRAGS] + - hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP6RXFRAGS]; - stats->iw.ipFragCreates = hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP4TXFRAGS] + - hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP6TXFRAGS]; - stats->iw.ipInMcastPkts = hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP4RXMCPKTS] + - hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP6RXMCPKTS]; - stats->iw.ipOutMcastPkts = hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP4TXMCPKTS] + - hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_IP6TXMCPKTS]; - stats->iw.tcpOutSegs = hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_TCPTXSEG]; - stats->iw.tcpInSegs = hw_stats->stat_value_64[I40IW_HW_STAT_INDEX_TCPRXSEGS]; - stats->iw.tcpRetransSegs = hw_stats->stat_value_32[I40IW_HW_STAT_INDEX_TCPRTXSEG]; - - last_rd_time = curr_time; - return 0; + memcpy(&stats->value[0], &hw_stats, sizeof(*hw_stats)); + + return stats->num_counters; } /** @@ -2323,10 +2633,15 @@ static struct i40iw_ib_device *i40iw_init_rdma_device(struct i40iw_device *iwdev iwibdev->ibdev.get_dma_mr = i40iw_get_dma_mr; iwibdev->ibdev.reg_user_mr = i40iw_reg_user_mr; iwibdev->ibdev.dereg_mr = i40iw_dereg_mr; - iwibdev->ibdev.get_protocol_stats = i40iw_get_protocol_stats; + iwibdev->ibdev.alloc_hw_stats = i40iw_alloc_hw_stats; + iwibdev->ibdev.get_hw_stats = i40iw_get_hw_stats; iwibdev->ibdev.query_device = i40iw_query_device; iwibdev->ibdev.create_ah = i40iw_create_ah; iwibdev->ibdev.destroy_ah = i40iw_destroy_ah; + iwibdev->ibdev.drain_sq = i40iw_drain_sq; + iwibdev->ibdev.drain_rq = i40iw_drain_rq; + iwibdev->ibdev.alloc_mr = i40iw_alloc_mr; + iwibdev->ibdev.map_mr_sg = i40iw_map_mr_sg; iwibdev->ibdev.iwcm = kzalloc(sizeof(*iwibdev->ibdev.iwcm), GFP_KERNEL); if (!iwibdev->ibdev.iwcm) { ib_dealloc_device(&iwibdev->ibdev); diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.h b/drivers/infiniband/hw/i40iw/i40iw_verbs.h index 1101f7708..0069be8a5 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.h +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.h @@ -92,6 +92,7 @@ struct i40iw_mr { struct ib_umem *region; u16 type; u32 page_cnt; + u32 npages; u32 stag; u64 length; u64 pgaddrmem[MAX_SAVE_PAGE_ADDRS]; @@ -169,5 +170,7 @@ struct i40iw_qp { struct i40iw_pbl *iwpbl; struct i40iw_dma_mem q2_ctx_mem; struct i40iw_dma_mem ietf_mem; + struct completion sq_drained; + struct completion rq_drained; }; #endif diff --git a/drivers/infiniband/hw/i40iw/i40iw_vf.c b/drivers/infiniband/hw/i40iw/i40iw_vf.c index cb0f18340..e33d48109 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_vf.c +++ b/drivers/infiniband/hw/i40iw/i40iw_vf.c @@ -80,6 +80,6 @@ enum i40iw_status_code i40iw_manage_vf_pble_bp(struct i40iw_sc_cqp *cqp, return 0; } -struct i40iw_vf_cqp_ops iw_vf_cqp_ops = { +const struct i40iw_vf_cqp_ops iw_vf_cqp_ops = { i40iw_manage_vf_pble_bp }; diff --git a/drivers/infiniband/hw/i40iw/i40iw_vf.h b/drivers/infiniband/hw/i40iw/i40iw_vf.h index f649f3a62..4359559ec 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_vf.h +++ b/drivers/infiniband/hw/i40iw/i40iw_vf.h @@ -57,6 +57,6 @@ enum i40iw_status_code i40iw_manage_vf_pble_bp(struct i40iw_sc_cqp *cqp, u64 scratch, bool post_sq); -extern struct i40iw_vf_cqp_ops iw_vf_cqp_ops; +extern const struct i40iw_vf_cqp_ops iw_vf_cqp_ops; #endif diff --git a/drivers/infiniband/hw/i40iw/i40iw_virtchnl.c b/drivers/infiniband/hw/i40iw/i40iw_virtchnl.c index 6b68f7890..3041003c9 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_virtchnl.c +++ b/drivers/infiniband/hw/i40iw/i40iw_virtchnl.c @@ -254,7 +254,7 @@ static void vchnl_pf_send_get_hmc_fcn_resp(struct i40iw_sc_dev *dev, static void vchnl_pf_send_get_pe_stats_resp(struct i40iw_sc_dev *dev, u32 vf_id, struct i40iw_virtchnl_op_buf *vchnl_msg, - struct i40iw_dev_hw_stats hw_stats) + struct i40iw_dev_hw_stats *hw_stats) { enum i40iw_status_code ret_code; u8 resp_buffer[sizeof(struct i40iw_virtchnl_resp_buf) + sizeof(struct i40iw_dev_hw_stats) - 1]; @@ -264,7 +264,7 @@ static void vchnl_pf_send_get_pe_stats_resp(struct i40iw_sc_dev *dev, vchnl_msg_resp->iw_chnl_op_ctx = vchnl_msg->iw_chnl_op_ctx; vchnl_msg_resp->iw_chnl_buf_len = sizeof(resp_buffer); vchnl_msg_resp->iw_op_ret_code = I40IW_SUCCESS; - *((struct i40iw_dev_hw_stats *)vchnl_msg_resp->iw_chnl_buf) = hw_stats; + *((struct i40iw_dev_hw_stats *)vchnl_msg_resp->iw_chnl_buf) = *hw_stats; ret_code = dev->vchnl_if.vchnl_send(dev, vf_id, resp_buffer, sizeof(resp_buffer)); if (ret_code) i40iw_debug(dev, I40IW_DEBUG_VIRT, @@ -437,11 +437,9 @@ enum i40iw_status_code i40iw_vchnl_recv_pf(struct i40iw_sc_dev *dev, vchnl_pf_send_get_ver_resp(dev, vf_id, vchnl_msg); return I40IW_SUCCESS; } - for (iw_vf_idx = 0; iw_vf_idx < I40IW_MAX_PE_ENABLED_VF_COUNT; - iw_vf_idx++) { + for (iw_vf_idx = 0; iw_vf_idx < I40IW_MAX_PE_ENABLED_VF_COUNT; iw_vf_idx++) { if (!dev->vf_dev[iw_vf_idx]) { - if (first_avail_iw_vf == - I40IW_MAX_PE_ENABLED_VF_COUNT) + if (first_avail_iw_vf == I40IW_MAX_PE_ENABLED_VF_COUNT) first_avail_iw_vf = iw_vf_idx; continue; } @@ -541,7 +539,7 @@ enum i40iw_status_code i40iw_vchnl_recv_pf(struct i40iw_sc_dev *dev, devstat->ops.iw_hw_stat_read_all(devstat, &devstat->hw_stats); spin_unlock_irqrestore(&dev->dev_pestat.stats_lock, flags); vf_dev->msg_count--; - vchnl_pf_send_get_pe_stats_resp(dev, vf_id, vchnl_msg, devstat->hw_stats); + vchnl_pf_send_get_pe_stats_resp(dev, vf_id, vchnl_msg, &devstat->hw_stats); break; default: i40iw_debug(dev, I40IW_DEBUG_VIRT, @@ -596,23 +594,25 @@ enum i40iw_status_code i40iw_vchnl_vf_get_ver(struct i40iw_sc_dev *dev, struct i40iw_virtchnl_req vchnl_req; enum i40iw_status_code ret_code; + if (!i40iw_vf_clear_to_send(dev)) + return I40IW_ERR_TIMEOUT; memset(&vchnl_req, 0, sizeof(vchnl_req)); vchnl_req.dev = dev; vchnl_req.parm = vchnl_ver; vchnl_req.parm_len = sizeof(*vchnl_ver); vchnl_req.vchnl_msg = &dev->vchnl_vf_msg_buf.vchnl_msg; + ret_code = vchnl_vf_send_get_ver_req(dev, &vchnl_req); - if (!ret_code) { - ret_code = i40iw_vf_wait_vchnl_resp(dev); - if (!ret_code) - ret_code = vchnl_req.ret_code; - else - dev->vchnl_up = false; - } else { + if (ret_code) { i40iw_debug(dev, I40IW_DEBUG_VIRT, "%s Send message failed 0x%0x\n", __func__, ret_code); + return ret_code; } - return ret_code; + ret_code = i40iw_vf_wait_vchnl_resp(dev); + if (ret_code) + return ret_code; + else + return vchnl_req.ret_code; } /** @@ -626,23 +626,25 @@ enum i40iw_status_code i40iw_vchnl_vf_get_hmc_fcn(struct i40iw_sc_dev *dev, struct i40iw_virtchnl_req vchnl_req; enum i40iw_status_code ret_code; + if (!i40iw_vf_clear_to_send(dev)) + return I40IW_ERR_TIMEOUT; memset(&vchnl_req, 0, sizeof(vchnl_req)); vchnl_req.dev = dev; vchnl_req.parm = hmc_fcn; vchnl_req.parm_len = sizeof(*hmc_fcn); vchnl_req.vchnl_msg = &dev->vchnl_vf_msg_buf.vchnl_msg; + ret_code = vchnl_vf_send_get_hmc_fcn_req(dev, &vchnl_req); - if (!ret_code) { - ret_code = i40iw_vf_wait_vchnl_resp(dev); - if (!ret_code) - ret_code = vchnl_req.ret_code; - else - dev->vchnl_up = false; - } else { + if (ret_code) { i40iw_debug(dev, I40IW_DEBUG_VIRT, "%s Send message failed 0x%0x\n", __func__, ret_code); + return ret_code; } - return ret_code; + ret_code = i40iw_vf_wait_vchnl_resp(dev); + if (ret_code) + return ret_code; + else + return vchnl_req.ret_code; } /** @@ -660,25 +662,27 @@ enum i40iw_status_code i40iw_vchnl_vf_add_hmc_objs(struct i40iw_sc_dev *dev, struct i40iw_virtchnl_req vchnl_req; enum i40iw_status_code ret_code; + if (!i40iw_vf_clear_to_send(dev)) + return I40IW_ERR_TIMEOUT; memset(&vchnl_req, 0, sizeof(vchnl_req)); vchnl_req.dev = dev; vchnl_req.vchnl_msg = &dev->vchnl_vf_msg_buf.vchnl_msg; + ret_code = vchnl_vf_send_add_hmc_objs_req(dev, &vchnl_req, rsrc_type, start_index, rsrc_count); - if (!ret_code) { - ret_code = i40iw_vf_wait_vchnl_resp(dev); - if (!ret_code) - ret_code = vchnl_req.ret_code; - else - dev->vchnl_up = false; - } else { + if (ret_code) { i40iw_debug(dev, I40IW_DEBUG_VIRT, "%s Send message failed 0x%0x\n", __func__, ret_code); + return ret_code; } - return ret_code; + ret_code = i40iw_vf_wait_vchnl_resp(dev); + if (ret_code) + return ret_code; + else + return vchnl_req.ret_code; } /** @@ -696,25 +700,27 @@ enum i40iw_status_code i40iw_vchnl_vf_del_hmc_obj(struct i40iw_sc_dev *dev, struct i40iw_virtchnl_req vchnl_req; enum i40iw_status_code ret_code; + if (!i40iw_vf_clear_to_send(dev)) + return I40IW_ERR_TIMEOUT; memset(&vchnl_req, 0, sizeof(vchnl_req)); vchnl_req.dev = dev; vchnl_req.vchnl_msg = &dev->vchnl_vf_msg_buf.vchnl_msg; + ret_code = vchnl_vf_send_del_hmc_objs_req(dev, &vchnl_req, rsrc_type, start_index, rsrc_count); - if (!ret_code) { - ret_code = i40iw_vf_wait_vchnl_resp(dev); - if (!ret_code) - ret_code = vchnl_req.ret_code; - else - dev->vchnl_up = false; - } else { + if (ret_code) { i40iw_debug(dev, I40IW_DEBUG_VIRT, "%s Send message failed 0x%0x\n", __func__, ret_code); + return ret_code; } - return ret_code; + ret_code = i40iw_vf_wait_vchnl_resp(dev); + if (ret_code) + return ret_code; + else + return vchnl_req.ret_code; } /** @@ -728,21 +734,23 @@ enum i40iw_status_code i40iw_vchnl_vf_get_pe_stats(struct i40iw_sc_dev *dev, struct i40iw_virtchnl_req vchnl_req; enum i40iw_status_code ret_code; + if (!i40iw_vf_clear_to_send(dev)) + return I40IW_ERR_TIMEOUT; memset(&vchnl_req, 0, sizeof(vchnl_req)); vchnl_req.dev = dev; vchnl_req.parm = hw_stats; vchnl_req.parm_len = sizeof(*hw_stats); vchnl_req.vchnl_msg = &dev->vchnl_vf_msg_buf.vchnl_msg; + ret_code = vchnl_vf_send_get_pe_stats_req(dev, &vchnl_req); - if (!ret_code) { - ret_code = i40iw_vf_wait_vchnl_resp(dev); - if (!ret_code) - ret_code = vchnl_req.ret_code; - else - dev->vchnl_up = false; - } else { + if (ret_code) { i40iw_debug(dev, I40IW_DEBUG_VIRT, "%s Send message failed 0x%0x\n", __func__, ret_code); + return ret_code; } - return ret_code; + ret_code = i40iw_vf_wait_vchnl_resp(dev); + if (ret_code) + return ret_code; + else + return vchnl_req.ret_code; } diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index d68f506c1..9c2e53d28 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -527,7 +527,7 @@ int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, tun_tx_ix = (++tun_qp->tx_ix_head) & (MLX4_NUM_TUNNEL_BUFS - 1); spin_unlock(&tun_qp->tx_lock); if (ret) - goto out; + goto end; tun_mad = (struct mlx4_rcv_tunnel_mad *) (tun_qp->tx_ring[tun_tx_ix].buf.addr); if (tun_qp->tx_ring[tun_tx_ix].ah) @@ -596,9 +596,15 @@ int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port, wr.wr.send_flags = IB_SEND_SIGNALED; ret = ib_post_send(src_qp, &wr.wr, &bad_wr); -out: - if (ret) - ib_destroy_ah(ah); + if (!ret) + return 0; + out: + spin_lock(&tun_qp->tx_lock); + tun_qp->tx_ix_tail++; + spin_unlock(&tun_qp->tx_lock); + tun_qp->tx_ring[tun_tx_ix].ah = NULL; +end: + ib_destroy_ah(ah); return ret; } @@ -1326,9 +1332,15 @@ int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port, ret = ib_post_send(send_qp, &wr.wr, &bad_wr); + if (!ret) + return 0; + + spin_lock(&sqp->tx_lock); + sqp->tx_ix_tail++; + spin_unlock(&sqp->tx_lock); + sqp->tx_ring[wire_tx_ix].ah = NULL; out: - if (ret) - ib_destroy_ah(ah); + ib_destroy_ah(ah); return ret; } diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index f014eaf59..42a46078d 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -505,9 +505,9 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, props->device_cap_flags |= IB_DEVICE_MEM_WINDOW_TYPE_2B; else props->device_cap_flags |= IB_DEVICE_MEM_WINDOW_TYPE_2A; - if (dev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED) - props->device_cap_flags |= IB_DEVICE_MANAGED_FLOW_STEERING; } + if (dev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED) + props->device_cap_flags |= IB_DEVICE_MANAGED_FLOW_STEERING; props->device_cap_flags |= IB_DEVICE_RAW_IP_CSUM; @@ -1601,7 +1601,7 @@ static int __mlx4_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_att else if (ret == -ENXIO) pr_err("Device managed flow steering is disabled. Fail to register network rule.\n"); else if (ret) - pr_err("Invalid argumant. Fail to register network rule.\n"); + pr_err("Invalid argument. Fail to register network rule.\n"); mlx4_free_cmd_mailbox(mdev->dev, mailbox); return ret; @@ -1704,6 +1704,9 @@ static struct ib_flow *mlx4_ib_create_flow(struct ib_qp *qp, struct mlx4_dev *dev = (to_mdev(qp->device))->dev; int is_bonded = mlx4_is_bonded(dev); + if (flow_attr->port < 1 || flow_attr->port > qp->device->phys_port_cnt) + return ERR_PTR(-EINVAL); + if ((flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP) && (flow_attr->type != IB_FLOW_ATTR_NORMAL)) return ERR_PTR(-EOPNOTSUPP); diff --git a/drivers/infiniband/hw/mlx4/mcg.c b/drivers/infiniband/hw/mlx4/mcg.c index 99451d887..8f7ad0791 100644 --- a/drivers/infiniband/hw/mlx4/mcg.c +++ b/drivers/infiniband/hw/mlx4/mcg.c @@ -96,7 +96,7 @@ struct ib_sa_mcmember_data { u8 scope_join_state; u8 proxy_join; u8 reserved[2]; -}; +} __packed __aligned(4); struct mcast_group { struct ib_sa_mcmember_data rec; @@ -747,14 +747,11 @@ static struct mcast_group *search_relocate_mgid0_group(struct mlx4_ib_demux_ctx __be64 tid, union ib_gid *new_mgid) { - struct mcast_group *group = NULL, *cur_group; + struct mcast_group *group = NULL, *cur_group, *n; struct mcast_req *req; - struct list_head *pos; - struct list_head *n; mutex_lock(&ctx->mcg_table_lock); - list_for_each_safe(pos, n, &ctx->mcg_mgid0_list) { - group = list_entry(pos, struct mcast_group, mgid0_list); + list_for_each_entry_safe(group, n, &ctx->mcg_mgid0_list, mgid0_list) { mutex_lock(&group->lock); if (group->last_req_tid == tid) { if (memcmp(new_mgid, &mgid0, sizeof mgid0)) { diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index 1eca01ceb..29acda249 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -139,7 +139,7 @@ struct mlx4_ib_mr { u32 max_pages; struct mlx4_mr mmr; struct ib_umem *umem; - void *pages_alloc; + size_t page_map_size; }; struct mlx4_ib_mw { @@ -717,9 +717,8 @@ int mlx4_ib_dealloc_mw(struct ib_mw *mw); struct ib_mr *mlx4_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg); -int mlx4_ib_map_mr_sg(struct ib_mr *ibmr, - struct scatterlist *sg, - int sg_nents); +int mlx4_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset); int mlx4_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period); int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata); struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index ce0b5aa8e..5d73989d9 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -277,20 +277,23 @@ mlx4_alloc_priv_pages(struct ib_device *device, struct mlx4_ib_mr *mr, int max_pages) { - int size = max_pages * sizeof(u64); - int add_size; int ret; - add_size = max_t(int, MLX4_MR_PAGES_ALIGN - ARCH_KMALLOC_MINALIGN, 0); + /* Ensure that size is aligned to DMA cacheline + * requirements. + * max_pages is limited to MLX4_MAX_FAST_REG_PAGES + * so page_map_size will never cross PAGE_SIZE. + */ + mr->page_map_size = roundup(max_pages * sizeof(u64), + MLX4_MR_PAGES_ALIGN); - mr->pages_alloc = kzalloc(size + add_size, GFP_KERNEL); - if (!mr->pages_alloc) + /* Prevent cross page boundary allocation. */ + mr->pages = (__be64 *)get_zeroed_page(GFP_KERNEL); + if (!mr->pages) return -ENOMEM; - mr->pages = PTR_ALIGN(mr->pages_alloc, MLX4_MR_PAGES_ALIGN); - mr->page_map = dma_map_single(device->dma_device, mr->pages, - size, DMA_TO_DEVICE); + mr->page_map_size, DMA_TO_DEVICE); if (dma_mapping_error(device->dma_device, mr->page_map)) { ret = -ENOMEM; @@ -298,9 +301,9 @@ mlx4_alloc_priv_pages(struct ib_device *device, } return 0; -err: - kfree(mr->pages_alloc); +err: + free_page((unsigned long)mr->pages); return ret; } @@ -309,11 +312,10 @@ mlx4_free_priv_pages(struct mlx4_ib_mr *mr) { if (mr->pages) { struct ib_device *device = mr->ibmr.device; - int size = mr->max_pages * sizeof(u64); dma_unmap_single(device->dma_device, mr->page_map, - size, DMA_TO_DEVICE); - kfree(mr->pages_alloc); + mr->page_map_size, DMA_TO_DEVICE); + free_page((unsigned long)mr->pages); mr->pages = NULL; } } @@ -528,9 +530,8 @@ static int mlx4_set_page(struct ib_mr *ibmr, u64 addr) return 0; } -int mlx4_ib_map_mr_sg(struct ib_mr *ibmr, - struct scatterlist *sg, - int sg_nents) +int mlx4_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset) { struct mlx4_ib_mr *mr = to_mmr(ibmr); int rc; @@ -538,14 +539,12 @@ int mlx4_ib_map_mr_sg(struct ib_mr *ibmr, mr->npages = 0; ib_dma_sync_single_for_cpu(ibmr->device, mr->page_map, - sizeof(u64) * mr->max_pages, - DMA_TO_DEVICE); + mr->page_map_size, DMA_TO_DEVICE); - rc = ib_sg_to_pages(ibmr, sg, sg_nents, mlx4_set_page); + rc = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, mlx4_set_page); ib_dma_sync_single_for_device(ibmr->device, mr->page_map, - sizeof(u64) * mr->max_pages, - DMA_TO_DEVICE); + mr->page_map_size, DMA_TO_DEVICE); return rc; } diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index fd9753476..8db8405c1 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -362,7 +362,7 @@ static int send_wqe_overhead(enum mlx4_ib_qp_type type, u32 flags) sizeof (struct mlx4_wqe_raddr_seg); case MLX4_IB_QPT_RC: return sizeof (struct mlx4_wqe_ctrl_seg) + - sizeof (struct mlx4_wqe_atomic_seg) + + sizeof (struct mlx4_wqe_masked_atomic_seg) + sizeof (struct mlx4_wqe_raddr_seg); case MLX4_IB_QPT_SMI: case MLX4_IB_QPT_GSI: @@ -419,7 +419,8 @@ static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, } static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, - enum mlx4_ib_qp_type type, struct mlx4_ib_qp *qp) + enum mlx4_ib_qp_type type, struct mlx4_ib_qp *qp, + bool shrink_wqe) { int s; @@ -477,7 +478,7 @@ static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap, * We set WQE size to at least 64 bytes, this way stamping * invalidates each WQE. */ - if (dev->dev->caps.fw_ver >= MLX4_FW_VER_WQE_CTRL_NEC && + if (shrink_wqe && dev->dev->caps.fw_ver >= MLX4_FW_VER_WQE_CTRL_NEC && qp->sq_signal_bits && BITS_PER_LONG == 64 && type != MLX4_IB_QPT_SMI && type != MLX4_IB_QPT_GSI && !(type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_SMI | @@ -642,6 +643,7 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, { int qpn; int err; + struct ib_qp_cap backup_cap; struct mlx4_ib_sqp *sqp; struct mlx4_ib_qp *qp; enum mlx4_ib_qp_type qp_type = (enum mlx4_ib_qp_type) init_attr->qp_type; @@ -775,7 +777,9 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, goto err; } - err = set_kernel_sq_size(dev, &init_attr->cap, qp_type, qp); + memcpy(&backup_cap, &init_attr->cap, sizeof(backup_cap)); + err = set_kernel_sq_size(dev, &init_attr->cap, + qp_type, qp, true); if (err) goto err; @@ -787,9 +791,20 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, *qp->db.db = 0; } - if (mlx4_buf_alloc(dev->dev, qp->buf_size, PAGE_SIZE * 2, &qp->buf, gfp)) { - err = -ENOMEM; - goto err_db; + if (mlx4_buf_alloc(dev->dev, qp->buf_size, qp->buf_size, + &qp->buf, gfp)) { + memcpy(&init_attr->cap, &backup_cap, + sizeof(backup_cap)); + err = set_kernel_sq_size(dev, &init_attr->cap, qp_type, + qp, false); + if (err) + goto err_db; + + if (mlx4_buf_alloc(dev->dev, qp->buf_size, + PAGE_SIZE * 2, &qp->buf, gfp)) { + err = -ENOMEM; + goto err_db; + } } err = mlx4_mtt_init(dev->dev, qp->buf.npages, qp->buf.page_shift, @@ -1176,8 +1191,10 @@ static struct ib_qp *_mlx4_ib_create_qp(struct ib_pd *pd, { err = create_qp_common(to_mdev(pd->device), pd, init_attr, udata, 0, &qp, gfp); - if (err) + if (err) { + kfree(qp); return ERR_PTR(err); + } qp->ibqp.qp_num = qp->mqp.qpn; qp->xrcdn = xrcdn; diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index a00ba4418..9c0e67bd2 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -822,7 +822,8 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, int eqn; int err; - if (entries < 0) + if (entries < 0 || + (entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz)))) return ERR_PTR(-EINVAL); if (check_cq_create_flags(attr->flags)) @@ -879,7 +880,10 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, mlx5_ib_dbg(dev, "cqn 0x%x\n", cq->mcq.cqn); cq->mcq.irqn = irqn; - cq->mcq.comp = mlx5_ib_cq_comp; + if (context) + cq->mcq.tasklet_ctx.comp = mlx5_ib_cq_comp; + else + cq->mcq.comp = mlx5_ib_cq_comp; cq->mcq.event = mlx5_ib_cq_event; INIT_LIST_HEAD(&cq->wc_list); @@ -1165,11 +1169,16 @@ int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) return -ENOSYS; } - if (entries < 1) + if (entries < 1 || + entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz))) { + mlx5_ib_warn(dev, "wrong entries number %d, max %d\n", + entries, + 1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz)); return -EINVAL; + } entries = roundup_pow_of_two(entries + 1); - if (entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz)) + 1) + if (entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz)) + 1) return -EINVAL; if (entries == ibcq->cqe + 1) diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c index 1534af113..364aab9f3 100644 --- a/drivers/infiniband/hw/mlx5/mad.c +++ b/drivers/infiniband/hw/mlx5/mad.c @@ -121,7 +121,7 @@ static void pma_cnt_ext_assign(struct ib_pma_portcounters_ext *pma_cnt_ext, pma_cnt_ext->port_xmit_data = cpu_to_be64(MLX5_SUM_CNT(out, transmitted_ib_unicast.octets, transmitted_ib_multicast.octets) >> 2); - pma_cnt_ext->port_xmit_data = + pma_cnt_ext->port_rcv_data = cpu_to_be64(MLX5_SUM_CNT(out, received_ib_unicast.octets, received_ib_multicast.octets) >> 2); pma_cnt_ext->port_xmit_packets = diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 6ad0489cb..b48ad8531 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -38,6 +38,9 @@ #include <linux/dma-mapping.h> #include <linux/slab.h> #include <linux/io-mapping.h> +#if defined(CONFIG_X86) +#include <asm/pat.h> +#endif #include <linux/sched.h> #include <rdma/ib_user_verbs.h> #include <rdma/ib_addr.h> @@ -517,6 +520,13 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->device_cap_flags |= IB_DEVICE_UD_TSO; } + if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && + MLX5_CAP_ETH(dev->mdev, scatter_fcs)) + props->device_cap_flags |= IB_DEVICE_RAW_SCATTER_FCS; + + if (mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_BYPASS)) + props->device_cap_flags |= IB_DEVICE_MANAGED_FLOW_STEERING; + props->vendor_part_id = mdev->pdev->device; props->hw_ver = mdev->pdev->revision; @@ -908,7 +918,8 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, num_uars = req.total_num_uuars / MLX5_NON_FP_BF_REGS_PER_PAGE; gross_uuars = num_uars * MLX5_BF_REGS_PER_PAGE; resp.qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp); - resp.bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size); + if (mlx5_core_is_pf(dev->mdev) && MLX5_CAP_GEN(dev->mdev, bf)) + resp.bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size); resp.cache_line_size = L1_CACHE_BYTES; resp.max_sq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq); resp.max_rq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq); @@ -981,7 +992,14 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, if (field_avail(typeof(resp), cqe_version, udata->outlen)) resp.response_length += sizeof(resp.cqe_version); - if (field_avail(typeof(resp), hca_core_clock_offset, udata->outlen)) { + /* + * We don't want to expose information from the PCI bar that is located + * after 4096 bytes, so if the arch only supports larger pages, let's + * pretend we don't support reading the HCA's core clock. This is also + * forced by mmap function. + */ + if (PAGE_SIZE <= 4096 && + field_avail(typeof(resp), hca_core_clock_offset, udata->outlen)) { resp.comp_mask |= MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET; resp.hca_core_clock_offset = @@ -1068,38 +1086,89 @@ static int get_index(unsigned long offset) return get_arg(offset); } +static inline char *mmap_cmd2str(enum mlx5_ib_mmap_cmd cmd) +{ + switch (cmd) { + case MLX5_IB_MMAP_WC_PAGE: + return "WC"; + case MLX5_IB_MMAP_REGULAR_PAGE: + return "best effort WC"; + case MLX5_IB_MMAP_NC_PAGE: + return "NC"; + default: + return NULL; + } +} + +static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd, + struct vm_area_struct *vma, struct mlx5_uuar_info *uuari) +{ + int err; + unsigned long idx; + phys_addr_t pfn, pa; + pgprot_t prot; + + switch (cmd) { + case MLX5_IB_MMAP_WC_PAGE: +/* Some architectures don't support WC memory */ +#if defined(CONFIG_X86) + if (!pat_enabled()) + return -EPERM; +#elif !(defined(CONFIG_PPC) || (defined(CONFIG_ARM) && defined(CONFIG_MMU))) + return -EPERM; +#endif + /* fall through */ + case MLX5_IB_MMAP_REGULAR_PAGE: + /* For MLX5_IB_MMAP_REGULAR_PAGE do the best effort to get WC */ + prot = pgprot_writecombine(vma->vm_page_prot); + break; + case MLX5_IB_MMAP_NC_PAGE: + prot = pgprot_noncached(vma->vm_page_prot); + break; + default: + return -EINVAL; + } + + if (vma->vm_end - vma->vm_start != PAGE_SIZE) + return -EINVAL; + + idx = get_index(vma->vm_pgoff); + if (idx >= uuari->num_uars) + return -EINVAL; + + pfn = uar_index2pfn(dev, uuari->uars[idx].index); + mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn %pa\n", idx, &pfn); + + vma->vm_page_prot = prot; + err = io_remap_pfn_range(vma, vma->vm_start, pfn, + PAGE_SIZE, vma->vm_page_prot); + if (err) { + mlx5_ib_err(dev, "io_remap_pfn_range failed with error=%d, vm_start=0x%lx, pfn=%pa, mmap_cmd=%s\n", + err, vma->vm_start, &pfn, mmap_cmd2str(cmd)); + return -EAGAIN; + } + + pa = pfn << PAGE_SHIFT; + mlx5_ib_dbg(dev, "mapped %s at 0x%lx, PA %pa\n", mmap_cmd2str(cmd), + vma->vm_start, &pa); + + return 0; +} + static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) { struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); struct mlx5_uuar_info *uuari = &context->uuari; unsigned long command; - unsigned long idx; phys_addr_t pfn; command = get_command(vma->vm_pgoff); switch (command) { + case MLX5_IB_MMAP_WC_PAGE: + case MLX5_IB_MMAP_NC_PAGE: case MLX5_IB_MMAP_REGULAR_PAGE: - if (vma->vm_end - vma->vm_start != PAGE_SIZE) - return -EINVAL; - - idx = get_index(vma->vm_pgoff); - if (idx >= uuari->num_uars) - return -EINVAL; - - pfn = uar_index2pfn(dev, uuari->uars[idx].index); - mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn 0x%llx\n", idx, - (unsigned long long)pfn); - - vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); - if (io_remap_pfn_range(vma, vma->vm_start, pfn, - PAGE_SIZE, vma->vm_page_prot)) - return -EAGAIN; - - mlx5_ib_dbg(dev, "mapped WC at 0x%lx, PA 0x%llx\n", - vma->vm_start, - (unsigned long long)pfn << PAGE_SHIFT); - break; + return uar_mmap(dev, command, vma, uuari); case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES: return -ENOSYS; @@ -1108,7 +1177,7 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm if (vma->vm_end - vma->vm_start != PAGE_SIZE) return -EINVAL; - if (vma->vm_flags & (VM_WRITE | VM_EXEC)) + if (vma->vm_flags & VM_WRITE) return -EPERM; /* Don't expose to user-space information it shouldn't have */ @@ -1438,7 +1507,8 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, if (!ft) { ft = mlx5_create_auto_grouped_flow_table(ns, priority, num_entries, - num_groups); + num_groups, + 0); if (!IS_ERR(ft)) { prio->refcount = 0; @@ -1739,7 +1809,7 @@ static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, { struct mlx5_ib_dev *dev = container_of(device, struct mlx5_ib_dev, ib_dev.dev); - return sprintf(buf, "%d.%d.%d\n", fw_rev_maj(dev->mdev), + return sprintf(buf, "%d.%d.%04d\n", fw_rev_maj(dev->mdev), fw_rev_min(dev->mdev), fw_rev_sub(dev->mdev)); } @@ -1807,14 +1877,11 @@ static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, break; case MLX5_DEV_EVENT_PORT_DOWN: + case MLX5_DEV_EVENT_PORT_INITIALIZED: ibev.event = IB_EVENT_PORT_ERR; port = (u8)param; break; - case MLX5_DEV_EVENT_PORT_INITIALIZED: - /* not used by ULPs */ - return; - case MLX5_DEV_EVENT_LID_CHANGE: ibev.event = IB_EVENT_LID_CHANGE; port = (u8)param; diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index b46c25542..c4a982582 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -70,6 +70,8 @@ enum { enum mlx5_ib_mmap_cmd { MLX5_IB_MMAP_REGULAR_PAGE = 0, MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES = 1, + MLX5_IB_MMAP_WC_PAGE = 2, + MLX5_IB_MMAP_NC_PAGE = 3, /* 5 is chosen in order to be compatible with old versions of libmlx5 */ MLX5_IB_MMAP_CORE_CLOCK = 5, }; @@ -356,6 +358,7 @@ enum mlx5_ib_qp_flags { MLX5_IB_QP_SIGNATURE_HANDLING = 1 << 5, /* QP uses 1 as its source QP number */ MLX5_IB_QP_SQPN_QP1 = 1 << 6, + MLX5_IB_QP_CAP_SCATTER_FCS = 1 << 7, }; struct mlx5_umr_wr { @@ -712,9 +715,8 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr); struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg); -int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, - struct scatterlist *sg, - int sg_nents); +int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset); int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, const struct ib_wc *in_wc, const struct ib_grh *in_grh, const struct ib_mad_hdr *in, size_t in_mad_size, diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 4d5bff151..8cf2ce505 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1751,26 +1751,33 @@ done: static int mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr, struct scatterlist *sgl, - unsigned short sg_nents) + unsigned short sg_nents, + unsigned int *sg_offset_p) { struct scatterlist *sg = sgl; struct mlx5_klm *klms = mr->descs; + unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0; u32 lkey = mr->ibmr.pd->local_dma_lkey; int i; - mr->ibmr.iova = sg_dma_address(sg); + mr->ibmr.iova = sg_dma_address(sg) + sg_offset; mr->ibmr.length = 0; mr->ndescs = sg_nents; for_each_sg(sgl, sg, sg_nents, i) { if (unlikely(i > mr->max_descs)) break; - klms[i].va = cpu_to_be64(sg_dma_address(sg)); - klms[i].bcount = cpu_to_be32(sg_dma_len(sg)); + klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset); + klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset); klms[i].key = cpu_to_be32(lkey); mr->ibmr.length += sg_dma_len(sg); + + sg_offset = 0; } + if (sg_offset_p) + *sg_offset_p = sg_offset; + return i; } @@ -1788,9 +1795,8 @@ static int mlx5_set_page(struct ib_mr *ibmr, u64 addr) return 0; } -int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, - struct scatterlist *sg, - int sg_nents) +int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset) { struct mlx5_ib_mr *mr = to_mmr(ibmr); int n; @@ -1802,9 +1808,10 @@ int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, DMA_TO_DEVICE); if (mr->access_mode == MLX5_ACCESS_MODE_KLM) - n = mlx5_ib_sg_to_klms(mr, sg, sg_nents); + n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset); else - n = ib_sg_to_pages(ibmr, sg, sg_nents, mlx5_set_page); + n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, + mlx5_set_page); ib_dma_sync_single_for_device(ibmr->device, mr->desc_map, mr->desc_size * mr->max_descs, diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 8dee8bc1e..ce0a7ab35 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -235,6 +235,8 @@ static int set_rq_size(struct mlx5_ib_dev *dev, struct ib_qp_cap *cap, qp->rq.max_gs = 0; qp->rq.wqe_cnt = 0; qp->rq.wqe_shift = 0; + cap->max_recv_wr = 0; + cap->max_recv_sge = 0; } else { if (ucmd) { qp->rq.wqe_cnt = ucmd->rq_wqe_count; @@ -1028,6 +1030,7 @@ static int get_rq_pas_size(void *qpc) static int create_raw_packet_qp_rq(struct mlx5_ib_dev *dev, struct mlx5_ib_rq *rq, void *qpin) { + struct mlx5_ib_qp *mqp = rq->base.container_mibqp; __be64 *pas; __be64 *qp_pas; void *in; @@ -1051,6 +1054,9 @@ static int create_raw_packet_qp_rq(struct mlx5_ib_dev *dev, MLX5_SET(rqc, rqc, user_index, MLX5_GET(qpc, qpc, user_index)); MLX5_SET(rqc, rqc, cqn, MLX5_GET(qpc, qpc, cqn_rcv)); + if (mqp->flags & MLX5_IB_QP_CAP_SCATTER_FCS) + MLX5_SET(rqc, rqc, scatter_fcs, 1); + wq = MLX5_ADDR_OF(rqc, rqc, wq); MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); MLX5_SET(wq, wq, end_padding_mode, @@ -1136,11 +1142,12 @@ static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, } if (qp->rq.wqe_cnt) { + rq->base.container_mibqp = qp; + err = create_raw_packet_qp_rq(dev, rq, in); if (err) goto err_destroy_sq; - rq->base.container_mibqp = qp; err = create_raw_packet_qp_tir(dev, rq, tdn); if (err) @@ -1252,6 +1259,19 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, return -EOPNOTSUPP; } + if (init_attr->create_flags & IB_QP_CREATE_SCATTER_FCS) { + if (init_attr->qp_type != IB_QPT_RAW_PACKET) { + mlx5_ib_dbg(dev, "Scatter FCS is supported only for Raw Packet QPs"); + return -EOPNOTSUPP; + } + if (!MLX5_CAP_GEN(dev->mdev, eth_net_offloads) || + !MLX5_CAP_ETH(dev->mdev, scatter_fcs)) { + mlx5_ib_dbg(dev, "Scatter FCS isn't supported\n"); + return -EOPNOTSUPP; + } + qp->flags |= MLX5_IB_QP_CAP_SCATTER_FCS; + } + if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE; @@ -1833,13 +1853,15 @@ static int modify_raw_packet_eth_prio(struct mlx5_core_dev *dev, static int mlx5_set_path(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, const struct ib_ah_attr *ah, struct mlx5_qp_path *path, u8 port, int attr_mask, - u32 path_flags, const struct ib_qp_attr *attr) + u32 path_flags, const struct ib_qp_attr *attr, + bool alt) { enum rdma_link_layer ll = rdma_port_get_link_layer(&dev->ib_dev, port); int err; if (attr_mask & IB_QP_PKEY_INDEX) - path->pkey_index = attr->pkey_index; + path->pkey_index = cpu_to_be16(alt ? attr->alt_pkey_index : + attr->pkey_index); if (ah->ah_flags & IB_AH_GRH) { if (ah->grh.sgid_index >= @@ -1859,9 +1881,9 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, ah->grh.sgid_index); path->dci_cfi_prio_sl = (ah->sl & 0x7) << 4; } else { - path->fl = (path_flags & MLX5_PATH_FLAG_FL) ? 0x80 : 0; - path->free_ar = (path_flags & MLX5_PATH_FLAG_FREE_AR) ? 0x80 : - 0; + path->fl_free_ar = (path_flags & MLX5_PATH_FLAG_FL) ? 0x80 : 0; + path->fl_free_ar |= + (path_flags & MLX5_PATH_FLAG_FREE_AR) ? 0x40 : 0; path->rlid = cpu_to_be16(ah->dlid); path->grh_mlid = ah->src_path_bits & 0x7f; if (ah->ah_flags & IB_AH_GRH) @@ -1885,7 +1907,7 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, path->port = port; if (attr_mask & IB_QP_TIMEOUT) - path->ackto_lt = attr->timeout << 3; + path->ackto_lt = (alt ? attr->alt_timeout : attr->timeout) << 3; if ((qp->ibqp.qp_type == IB_QPT_RAW_PACKET) && qp->sq.wqe_cnt) return modify_raw_packet_eth_prio(dev->mdev, @@ -2246,7 +2268,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, context->log_pg_sz_remote_qpn = cpu_to_be32(attr->dest_qp_num); if (attr_mask & IB_QP_PKEY_INDEX) - context->pri_path.pkey_index = attr->pkey_index; + context->pri_path.pkey_index = cpu_to_be16(attr->pkey_index); /* todo implement counter_index functionality */ @@ -2259,7 +2281,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, if (attr_mask & IB_QP_AV) { err = mlx5_set_path(dev, qp, &attr->ah_attr, &context->pri_path, attr_mask & IB_QP_PORT ? attr->port_num : qp->port, - attr_mask, 0, attr); + attr_mask, 0, attr, false); if (err) goto out; } @@ -2270,7 +2292,9 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, if (attr_mask & IB_QP_ALT_PATH) { err = mlx5_set_path(dev, qp, &attr->alt_ah_attr, &context->alt_path, - attr->alt_port_num, attr_mask, 0, attr); + attr->alt_port_num, + attr_mask | IB_QP_PKEY_INDEX | IB_QP_TIMEOUT, + 0, attr, true); if (err) goto out; } @@ -3308,10 +3332,11 @@ static u8 get_fence(u8 fence, struct ib_send_wr *wr) return MLX5_FENCE_MODE_SMALL_AND_FENCE; else return fence; - - } else { - return 0; + } else if (unlikely(wr->send_flags & IB_SEND_FENCE)) { + return MLX5_FENCE_MODE_FENCE; } + + return 0; } static int begin_wqe(struct mlx5_ib_qp *qp, void **seg, @@ -3995,11 +4020,12 @@ static int query_qp_attr(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) { to_ib_ah_attr(dev, &qp_attr->ah_attr, &context->pri_path); to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context->alt_path); - qp_attr->alt_pkey_index = context->alt_path.pkey_index & 0x7f; + qp_attr->alt_pkey_index = + be16_to_cpu(context->alt_path.pkey_index); qp_attr->alt_port_num = qp_attr->alt_ah_attr.port_num; } - qp_attr->pkey_index = context->pri_path.pkey_index & 0x7f; + qp_attr->pkey_index = be16_to_cpu(context->pri_path.pkey_index); qp_attr->port_num = context->pri_path.port; /* qp_attr->en_sqd_async_notify is only applicable in modify qp */ @@ -4061,17 +4087,19 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, qp_attr->cap.max_recv_sge = qp->rq.max_gs; if (!ibqp->uobject) { - qp_attr->cap.max_send_wr = qp->sq.wqe_cnt; + qp_attr->cap.max_send_wr = qp->sq.max_post; qp_attr->cap.max_send_sge = qp->sq.max_gs; + qp_init_attr->qp_context = ibqp->qp_context; } else { qp_attr->cap.max_send_wr = 0; qp_attr->cap.max_send_sge = 0; } - /* We don't support inline sends for kernel QPs (yet), and we - * don't know what userspace's value should be. - */ - qp_attr->cap.max_inline_data = 0; + qp_init_attr->qp_type = ibqp->qp_type; + qp_init_attr->recv_cq = ibqp->recv_cq; + qp_init_attr->send_cq = ibqp->send_cq; + qp_init_attr->srq = ibqp->srq; + qp_attr->cap.max_inline_data = qp->max_inline_data; qp_init_attr->cap = qp_attr->cap; diff --git a/drivers/infiniband/hw/nes/nes_nic.c b/drivers/infiniband/hw/nes/nes_nic.c index 92914539e..2b27d1351 100644 --- a/drivers/infiniband/hw/nes/nes_nic.c +++ b/drivers/infiniband/hw/nes/nes_nic.c @@ -356,7 +356,7 @@ static int nes_netdev_stop(struct net_device *netdev) /** * nes_nic_send */ -static int nes_nic_send(struct sk_buff *skb, struct net_device *netdev) +static bool nes_nic_send(struct sk_buff *skb, struct net_device *netdev) { struct nes_vnic *nesvnic = netdev_priv(netdev); struct nes_device *nesdev = nesvnic->nesdev; @@ -413,7 +413,7 @@ static int nes_nic_send(struct sk_buff *skb, struct net_device *netdev) netdev->name, skb_shinfo(skb)->nr_frags + 2, skb_headlen(skb)); kfree_skb(skb); nesvnic->tx_sw_dropped++; - return NETDEV_TX_LOCKED; + return false; } set_bit(nesnic->sq_head, nesnic->first_frag_overflow); bus_address = pci_map_single(nesdev->pcidev, skb->data + NES_FIRST_FRAG_SIZE, @@ -454,8 +454,7 @@ static int nes_nic_send(struct sk_buff *skb, struct net_device *netdev) set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_MISC_IDX, wqe_misc); nesnic->sq_head++; nesnic->sq_head &= nesnic->sq_size - 1; - - return NETDEV_TX_OK; + return true; } @@ -479,7 +478,6 @@ static int nes_netdev_start_xmit(struct sk_buff *skb, struct net_device *netdev) u32 tso_wqe_length; u32 curr_tcp_seq; u32 wqe_count=1; - u32 send_rc; struct iphdr *iph; __le16 *wqe_fragment_length; u32 nr_frags; @@ -670,13 +668,11 @@ tso_sq_no_longer_full: skb_linearize(skb); skb_set_transport_header(skb, hoffset); skb_set_network_header(skb, nhoffset); - send_rc = nes_nic_send(skb, netdev); - if (send_rc != NETDEV_TX_OK) + if (!nes_nic_send(skb, netdev)) return NETDEV_TX_OK; } } else { - send_rc = nes_nic_send(skb, netdev); - if (send_rc != NETDEV_TX_OK) + if (!nes_nic_send(skb, netdev)) return NETDEV_TX_OK; } @@ -686,7 +682,7 @@ tso_sq_no_longer_full: nes_write32(nesdev->regs+NES_WQE_ALLOC, (wqe_count << 24) | (1 << 23) | nesvnic->nic.qp_id); - netdev->trans_start = jiffies; + netif_trans_update(netdev); return NETDEV_TX_OK; } diff --git a/drivers/infiniband/hw/nes/nes_utils.c b/drivers/infiniband/hw/nes/nes_utils.c index 6d3a169c0..37331e2fd 100644 --- a/drivers/infiniband/hw/nes/nes_utils.c +++ b/drivers/infiniband/hw/nes/nes_utils.c @@ -44,6 +44,7 @@ #include <linux/ip.h> #include <linux/tcp.h> #include <linux/init.h> +#include <linux/kernel.h> #include <asm/io.h> #include <asm/irq.h> @@ -903,70 +904,15 @@ void nes_clc(unsigned long parm) */ void nes_dump_mem(unsigned int dump_debug_level, void *addr, int length) { - char xlate[] = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', - 'a', 'b', 'c', 'd', 'e', 'f'}; - char *ptr; - char hex_buf[80]; - char ascii_buf[20]; - int num_char; - int num_ascii; - int num_hex; - if (!(nes_debug_level & dump_debug_level)) { return; } - ptr = addr; if (length > 0x100) { nes_debug(dump_debug_level, "Length truncated from %x to %x\n", length, 0x100); length = 0x100; } - nes_debug(dump_debug_level, "Address=0x%p, length=0x%x (%d)\n", ptr, length, length); - - memset(ascii_buf, 0, 20); - memset(hex_buf, 0, 80); - - num_ascii = 0; - num_hex = 0; - for (num_char = 0; num_char < length; num_char++) { - if (num_ascii == 8) { - ascii_buf[num_ascii++] = ' '; - hex_buf[num_hex++] = '-'; - hex_buf[num_hex++] = ' '; - } - - if (*ptr < 0x20 || *ptr > 0x7e) - ascii_buf[num_ascii++] = '.'; - else - ascii_buf[num_ascii++] = *ptr; - hex_buf[num_hex++] = xlate[((*ptr & 0xf0) >> 4)]; - hex_buf[num_hex++] = xlate[*ptr & 0x0f]; - hex_buf[num_hex++] = ' '; - ptr++; - - if (num_ascii >= 17) { - /* output line and reset */ - nes_debug(dump_debug_level, " %s | %s\n", hex_buf, ascii_buf); - memset(ascii_buf, 0, 20); - memset(hex_buf, 0, 80); - num_ascii = 0; - num_hex = 0; - } - } + nes_debug(dump_debug_level, "Address=0x%p, length=0x%x (%d)\n", addr, length, length); - /* output the rest */ - if (num_ascii) { - while (num_ascii < 17) { - if (num_ascii == 8) { - hex_buf[num_hex++] = ' '; - hex_buf[num_hex++] = ' '; - } - hex_buf[num_hex++] = ' '; - hex_buf[num_hex++] = ' '; - hex_buf[num_hex++] = ' '; - num_ascii++; - } - - nes_debug(dump_debug_level, " %s | %s\n", hex_buf, ascii_buf); - } + print_hex_dump(KERN_ERR, PFX, DUMP_PREFIX_NONE, 16, 1, addr, length, true); } diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index fba69a39a..464d6da5f 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -402,15 +402,14 @@ static int nes_set_page(struct ib_mr *ibmr, u64 addr) return 0; } -static int nes_map_mr_sg(struct ib_mr *ibmr, - struct scatterlist *sg, - int sg_nents) +static int nes_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, + int sg_nents, unsigned int *sg_offset) { struct nes_mr *nesmr = to_nesmr(ibmr); nesmr->npages = 0; - return ib_sg_to_pages(ibmr, sg, sg_nents, nes_set_page); + return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, nes_set_page); } /** @@ -981,7 +980,7 @@ static int nes_setup_mmap_qp(struct nes_qp *nesqp, struct nes_vnic *nesvnic, /** * nes_free_qp_mem() is to free up the qp's pci_alloc_consistent() memory. */ -static inline void nes_free_qp_mem(struct nes_device *nesdev, +static void nes_free_qp_mem(struct nes_device *nesdev, struct nes_qp *nesqp, int virt_wqs) { unsigned long flags; @@ -1315,6 +1314,8 @@ static struct ib_qp *nes_create_qp(struct ib_pd *ibpd, nes_debug(NES_DBG_QP, "Invalid QP type: %d\n", init_attr->qp_type); return ERR_PTR(-EINVAL); } + init_completion(&nesqp->sq_drained); + init_completion(&nesqp->rq_drained); nesqp->sig_all = (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR); init_timer(&nesqp->terminate_timer); @@ -3452,6 +3453,29 @@ out: return err; } +/** + * nes_drain_sq - drain sq + * @ibqp: pointer to ibqp + */ +static void nes_drain_sq(struct ib_qp *ibqp) +{ + struct nes_qp *nesqp = to_nesqp(ibqp); + + if (nesqp->hwqp.sq_tail != nesqp->hwqp.sq_head) + wait_for_completion(&nesqp->sq_drained); +} + +/** + * nes_drain_rq - drain rq + * @ibqp: pointer to ibqp + */ +static void nes_drain_rq(struct ib_qp *ibqp) +{ + struct nes_qp *nesqp = to_nesqp(ibqp); + + if (nesqp->hwqp.rq_tail != nesqp->hwqp.rq_head) + wait_for_completion(&nesqp->rq_drained); +} /** * nes_poll_cq @@ -3582,6 +3606,13 @@ static int nes_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry) } } + if (nesqp->iwarp_state > NES_CQP_QP_IWARP_STATE_RTS) { + if (nesqp->hwqp.sq_tail == nesqp->hwqp.sq_head) + complete(&nesqp->sq_drained); + if (nesqp->hwqp.rq_tail == nesqp->hwqp.rq_head) + complete(&nesqp->rq_drained); + } + entry->wr_id = wrid; entry++; cqe_count++; @@ -3754,6 +3785,8 @@ struct nes_ib_device *nes_init_ofa_device(struct net_device *netdev) nesibdev->ibdev.req_notify_cq = nes_req_notify_cq; nesibdev->ibdev.post_send = nes_post_send; nesibdev->ibdev.post_recv = nes_post_recv; + nesibdev->ibdev.drain_sq = nes_drain_sq; + nesibdev->ibdev.drain_rq = nes_drain_rq; nesibdev->ibdev.iwcm = kzalloc(sizeof(*nesibdev->ibdev.iwcm), GFP_KERNEL); if (nesibdev->ibdev.iwcm == NULL) { diff --git a/drivers/infiniband/hw/nes/nes_verbs.h b/drivers/infiniband/hw/nes/nes_verbs.h index 70290883d..e02a5662d 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.h +++ b/drivers/infiniband/hw/nes/nes_verbs.h @@ -189,6 +189,8 @@ struct nes_qp { u8 pau_pending; u8 pau_state; __u64 nesuqp_addr; + struct completion sq_drained; + struct completion rq_drained; }; struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd, diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index a8496a18e..b1a3d91fe 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -3081,13 +3081,12 @@ static int ocrdma_set_page(struct ib_mr *ibmr, u64 addr) return 0; } -int ocrdma_map_mr_sg(struct ib_mr *ibmr, - struct scatterlist *sg, - int sg_nents) +int ocrdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset) { struct ocrdma_mr *mr = get_ocrdma_mr(ibmr); mr->npages = 0; - return ib_sg_to_pages(ibmr, sg, sg_nents, ocrdma_set_page); + return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, ocrdma_set_page); } diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h index 8b517fd36..704ef1e92 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h @@ -122,8 +122,7 @@ struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *, u64 start, u64 length, struct ib_mr *ocrdma_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg); -int ocrdma_map_mr_sg(struct ib_mr *ibmr, - struct scatterlist *sg, - int sg_nents); +int ocrdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset); #endif /* __OCRDMA_VERBS_H__ */ diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index 24f4a782e..382466a90 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c @@ -824,10 +824,7 @@ static int mmap_piobufs(struct vm_area_struct *vma, phys = dd->physaddr + piobufs; #if defined(__powerpc__) - /* There isn't a generic way to specify writethrough mappings */ - pgprot_val(vma->vm_page_prot) |= _PAGE_NO_CACHE; - pgprot_val(vma->vm_page_prot) |= _PAGE_WRITETHRU; - pgprot_val(vma->vm_page_prot) &= ~_PAGE_GUARDED; + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); #endif /* @@ -2181,6 +2178,11 @@ static ssize_t qib_write(struct file *fp, const char __user *data, switch (cmd.type) { case QIB_CMD_ASSIGN_CTXT: + if (rcd) { + ret = -EINVAL; + goto bail; + } + ret = qib_assign_ctxt(fp, &cmd.cmd.user_info); if (ret) goto bail; diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c index 82d7c4bf5..ce4034071 100644 --- a/drivers/infiniband/hw/qib/qib_iba7322.c +++ b/drivers/infiniband/hw/qib/qib_iba7322.c @@ -1308,21 +1308,6 @@ static const struct qib_hwerror_msgs qib_7322p_error_msgs[] = { SYM_LSB(IntMask, fldname##17IntMask)), \ .msg = #fldname "_C", .sz = sizeof(#fldname "_C") } -static const struct qib_hwerror_msgs qib_7322_intr_msgs[] = { - INTR_AUTO_P(SDmaInt), - INTR_AUTO_P(SDmaProgressInt), - INTR_AUTO_P(SDmaIdleInt), - INTR_AUTO_P(SDmaCleanupDone), - INTR_AUTO_C(RcvUrg), - INTR_AUTO_P(ErrInt), - INTR_AUTO(ErrInt), /* non-port-specific errs */ - INTR_AUTO(AssertGPIOInt), - INTR_AUTO_P(SendDoneInt), - INTR_AUTO(SendBufAvailInt), - INTR_AUTO_C(RcvAvail), - { .mask = 0, .sz = 0 } -}; - #define TXSYMPTOM_AUTO_P(fldname) \ { .mask = SYM_MASK(SendHdrErrSymptom_0, fldname), \ .msg = #fldname, .sz = sizeof(#fldname) } diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c index 3f062f0dd..f253111e6 100644 --- a/drivers/infiniband/hw/qib/qib_init.c +++ b/drivers/infiniband/hw/qib/qib_init.c @@ -1090,7 +1090,7 @@ void qib_free_devdata(struct qib_devdata *dd) qib_dbg_ibdev_exit(&dd->verbs_dev); #endif free_percpu(dd->int_counter); - ib_dealloc_device(&dd->verbs_dev.rdi.ibdev); + rvt_dealloc_device(&dd->verbs_dev.rdi); } u64 qib_int_counter(struct qib_devdata *dd) @@ -1183,7 +1183,7 @@ struct qib_devdata *qib_alloc_devdata(struct pci_dev *pdev, size_t extra) bail: if (!list_empty(&dd->list)) list_del_init(&dd->list); - ib_dealloc_device(&dd->verbs_dev.rdi.ibdev); + rvt_dealloc_device(&dd->verbs_dev.rdi); return ERR_PTR(ret); } diff --git a/drivers/infiniband/hw/qib/qib_mad.c b/drivers/infiniband/hw/qib/qib_mad.c index 0bd18375d..d2ac29861 100644 --- a/drivers/infiniband/hw/qib/qib_mad.c +++ b/drivers/infiniband/hw/qib/qib_mad.c @@ -1172,11 +1172,13 @@ static int pma_get_classportinfo(struct ib_pma_mad *pmp, * Set the most significant bit of CM2 to indicate support for * congestion statistics */ - p->reserved[0] = dd->psxmitwait_supported << 7; + ib_set_cpi_capmask2(p, + dd->psxmitwait_supported << + (31 - IB_CLASS_PORT_INFO_RESP_TIME_FIELD_SIZE)); /* * Expected response time is 4.096 usec. * 2^18 == 1.073741824 sec. */ - p->resp_time_value = 18; + ib_set_cpi_resp_time(p, 18); return reply((struct ib_smp *) pmp); } diff --git a/drivers/infiniband/hw/qib/qib_pcie.c b/drivers/infiniband/hw/qib/qib_pcie.c index 4758a3801..6abe1c621 100644 --- a/drivers/infiniband/hw/qib/qib_pcie.c +++ b/drivers/infiniband/hw/qib/qib_pcie.c @@ -144,13 +144,7 @@ int qib_pcie_ddinit(struct qib_devdata *dd, struct pci_dev *pdev, addr = pci_resource_start(pdev, 0); len = pci_resource_len(pdev, 0); -#if defined(__powerpc__) - /* There isn't a generic way to specify writethrough mappings */ - dd->kregbase = __ioremap(addr, len, _PAGE_NO_CACHE | _PAGE_WRITETHRU); -#else dd->kregbase = ioremap_nocache(addr, len); -#endif - if (!dd->kregbase) return -ENOMEM; diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c index 9088e26d3..444028a35 100644 --- a/drivers/infiniband/hw/qib/qib_rc.c +++ b/drivers/infiniband/hw/qib/qib_rc.c @@ -230,7 +230,7 @@ bail: * * Return 1 if constructed; otherwise, return 0. */ -int qib_make_rc_req(struct rvt_qp *qp) +int qib_make_rc_req(struct rvt_qp *qp, unsigned long *flags) { struct qib_qp_priv *priv = qp->priv; struct qib_ibdev *dev = to_idev(qp->ibqp.device); diff --git a/drivers/infiniband/hw/qib/qib_ruc.c b/drivers/infiniband/hw/qib/qib_ruc.c index a5f07a64b..b67779256 100644 --- a/drivers/infiniband/hw/qib/qib_ruc.c +++ b/drivers/infiniband/hw/qib/qib_ruc.c @@ -739,7 +739,7 @@ void qib_do_send(struct rvt_qp *qp) struct qib_qp_priv *priv = qp->priv; struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); struct qib_pportdata *ppd = ppd_from_ibp(ibp); - int (*make_req)(struct rvt_qp *qp); + int (*make_req)(struct rvt_qp *qp, unsigned long *flags); unsigned long flags; if ((qp->ibqp.qp_type == IB_QPT_RC || @@ -781,7 +781,7 @@ void qib_do_send(struct rvt_qp *qp) qp->s_hdrwords = 0; spin_lock_irqsave(&qp->s_lock, flags); } - } while (make_req(qp)); + } while (make_req(qp, &flags)); spin_unlock_irqrestore(&qp->s_lock, flags); } diff --git a/drivers/infiniband/hw/qib/qib_uc.c b/drivers/infiniband/hw/qib/qib_uc.c index 7bdbc79ce..1d61bd04f 100644 --- a/drivers/infiniband/hw/qib/qib_uc.c +++ b/drivers/infiniband/hw/qib/qib_uc.c @@ -45,7 +45,7 @@ * * Return 1 if constructed; otherwise, return 0. */ -int qib_make_uc_req(struct rvt_qp *qp) +int qib_make_uc_req(struct rvt_qp *qp, unsigned long *flags) { struct qib_qp_priv *priv = qp->priv; struct qib_other_headers *ohdr; diff --git a/drivers/infiniband/hw/qib/qib_ud.c b/drivers/infiniband/hw/qib/qib_ud.c index d9502137d..846e6c726 100644 --- a/drivers/infiniband/hw/qib/qib_ud.c +++ b/drivers/infiniband/hw/qib/qib_ud.c @@ -238,7 +238,7 @@ drop: * * Return 1 if constructed; otherwise, return 0. */ -int qib_make_ud_req(struct rvt_qp *qp) +int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags) { struct qib_qp_priv *priv = qp->priv; struct qib_other_headers *ohdr; @@ -294,7 +294,7 @@ int qib_make_ud_req(struct rvt_qp *qp) this_cpu_inc(ibp->pmastats->n_unicast_xmit); lid = ah_attr->dlid & ~((1 << ppd->lmc) - 1); if (unlikely(lid == ppd->lid)) { - unsigned long flags; + unsigned long tflags = *flags; /* * If DMAs are in progress, we can't generate * a completion for the loopback packet since @@ -307,10 +307,10 @@ int qib_make_ud_req(struct rvt_qp *qp) goto bail; } qp->s_cur = next_cur; - local_irq_save(flags); - spin_unlock_irqrestore(&qp->s_lock, flags); + spin_unlock_irqrestore(&qp->s_lock, tflags); qib_ud_loopback(qp, wqe); - spin_lock_irqsave(&qp->s_lock, flags); + spin_lock_irqsave(&qp->s_lock, tflags); + *flags = tflags; qib_send_complete(qp, wqe, IB_WC_SUCCESS); goto done; } diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h index 4b76a8d59..4f878151f 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.h +++ b/drivers/infiniband/hw/qib/qib_verbs.h @@ -159,6 +159,7 @@ struct qib_other_headers { } at; __be32 imm_data; __be32 aeth; + __be32 ieth; struct ib_atomic_eth atomic_eth; } u; } __packed; @@ -430,11 +431,11 @@ void qib_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe, void qib_send_rc_ack(struct rvt_qp *qp); -int qib_make_rc_req(struct rvt_qp *qp); +int qib_make_rc_req(struct rvt_qp *qp, unsigned long *flags); -int qib_make_uc_req(struct rvt_qp *qp); +int qib_make_uc_req(struct rvt_qp *qp, unsigned long *flags); -int qib_make_ud_req(struct rvt_qp *qp); +int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags); int qib_register_ib_device(struct qib_devdata *); diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c index 7209fbc03..a0b6ebee4 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.c +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -36,7 +36,6 @@ #include <linux/dma-mapping.h> #include <linux/sched.h> #include <linux/hugetlb.h> -#include <linux/dma-attrs.h> #include <linux/iommu.h> #include <linux/workqueue.h> #include <linux/list.h> @@ -112,10 +111,6 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable, int i; int flags; dma_addr_t pa; - DEFINE_DMA_ATTRS(attrs); - - if (dmasync) - dma_set_attr(DMA_ATTR_WRITE_BARRIER, &attrs); if (!can_do_mlock()) return -EPERM; diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c index b1ffc8b4a..6ca6fa80d 100644 --- a/drivers/infiniband/sw/rdmavt/cq.c +++ b/drivers/infiniband/sw/rdmavt/cq.c @@ -525,6 +525,7 @@ int rvt_driver_cq_init(struct rvt_dev_info *rdi) return PTR_ERR(task); } + set_user_nice(task, MIN_NICE); cpu = cpumask_first(cpumask_of_node(rdi->dparms.node)); kthread_bind(task, cpu); wake_up_process(task); diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c index 0ff765bfd..0f4d4500f 100644 --- a/drivers/infiniband/sw/rdmavt/mr.c +++ b/drivers/infiniband/sw/rdmavt/mr.c @@ -124,11 +124,13 @@ static int rvt_init_mregion(struct rvt_mregion *mr, struct ib_pd *pd, int count) { int m, i = 0; + struct rvt_dev_info *dev = ib_to_rvt(pd->device); mr->mapsz = 0; m = (count + RVT_SEGSZ - 1) / RVT_SEGSZ; for (; i < m; i++) { - mr->map[i] = kzalloc(sizeof(*mr->map[0]), GFP_KERNEL); + mr->map[i] = kzalloc_node(sizeof(*mr->map[0]), GFP_KERNEL, + dev->dparms.node); if (!mr->map[i]) { rvt_deinit_mregion(mr); return -ENOMEM; diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index a0ecf08b2..41ba7e9ca 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -369,8 +369,8 @@ static int alloc_qpn(struct rvt_dev_info *rdi, struct rvt_qpn_table *qpt, /* wrap to first map page, invert bit 0 */ offset = qpt->incr | ((offset & 1) ^ 1); } - /* there can be no bits at shift and below */ - WARN_ON(offset & (rdi->dparms.qos_shift - 1)); + /* there can be no set bits in low-order QoS bits */ + WARN_ON(offset & (BIT(rdi->dparms.qos_shift) - 1)); qpn = mk_qpn(qpt, map, offset); } @@ -397,6 +397,7 @@ static void free_qpn(struct rvt_qpn_table *qpt, u32 qpn) static void rvt_clear_mr_refs(struct rvt_qp *qp, int clr_sends) { unsigned n; + struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); if (test_and_clear_bit(RVT_R_REWIND_SGE, &qp->r_aflags)) rvt_put_ss(&qp->s_rdma_read_sge); @@ -431,7 +432,7 @@ static void rvt_clear_mr_refs(struct rvt_qp *qp, int clr_sends) if (qp->ibqp.qp_type != IB_QPT_RC) return; - for (n = 0; n < ARRAY_SIZE(qp->s_ack_queue); n++) { + for (n = 0; n < rvt_max_atomic(rdi); n++) { struct rvt_ack_entry *e = &qp->s_ack_queue[n]; if (e->opcode == IB_OPCODE_RC_RDMA_READ_REQUEST && @@ -501,6 +502,12 @@ static void rvt_remove_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp) */ static void rvt_reset_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, enum ib_qp_type type) + __releases(&qp->s_lock) + __releases(&qp->s_hlock) + __releases(&qp->r_lock) + __acquires(&qp->r_lock) + __acquires(&qp->s_hlock) + __acquires(&qp->s_lock) { if (qp->state != IB_QPS_RESET) { qp->state = IB_QPS_RESET; @@ -569,7 +576,6 @@ static void rvt_reset_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, qp->s_ssn = 1; qp->s_lsn = 0; qp->s_mig_state = IB_MIG_MIGRATED; - memset(qp->s_ack_queue, 0, sizeof(qp->s_ack_queue)); qp->r_head_ack_queue = 0; qp->s_tail_ack_queue = 0; qp->s_num_rd_atomic = 0; @@ -653,9 +659,9 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, if (gfp == GFP_NOIO) swq = __vmalloc( (init_attr->cap.max_send_wr + 1) * sz, - gfp, PAGE_KERNEL); + gfp | __GFP_ZERO, PAGE_KERNEL); else - swq = vmalloc_node( + swq = vzalloc_node( (init_attr->cap.max_send_wr + 1) * sz, rdi->dparms.node); if (!swq) @@ -677,6 +683,16 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, goto bail_swq; RCU_INIT_POINTER(qp->next, NULL); + if (init_attr->qp_type == IB_QPT_RC) { + qp->s_ack_queue = + kzalloc_node( + sizeof(*qp->s_ack_queue) * + rvt_max_atomic(rdi), + gfp, + rdi->dparms.node); + if (!qp->s_ack_queue) + goto bail_qp; + } /* * Driver needs to set up it's private QP structure and do any @@ -706,9 +722,9 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, qp->r_rq.wq = __vmalloc( sizeof(struct rvt_rwq) + qp->r_rq.size * sz, - gfp, PAGE_KERNEL); + gfp | __GFP_ZERO, PAGE_KERNEL); else - qp->r_rq.wq = vmalloc_node( + qp->r_rq.wq = vzalloc_node( sizeof(struct rvt_rwq) + qp->r_rq.size * sz, rdi->dparms.node); @@ -831,13 +847,13 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd, case IB_QPT_SMI: case IB_QPT_GSI: case IB_QPT_UD: - qp->allowed_ops = IB_OPCODE_UD_SEND_ONLY & RVT_OPCODE_QP_MASK; + qp->allowed_ops = IB_OPCODE_UD; break; case IB_QPT_RC: - qp->allowed_ops = IB_OPCODE_RC_SEND_ONLY & RVT_OPCODE_QP_MASK; + qp->allowed_ops = IB_OPCODE_RC; break; case IB_QPT_UC: - qp->allowed_ops = IB_OPCODE_UC_SEND_ONLY & RVT_OPCODE_QP_MASK; + qp->allowed_ops = IB_OPCODE_UC; break; default: ret = ERR_PTR(-EINVAL); @@ -859,6 +875,7 @@ bail_driver_priv: rdi->driver_f.qp_priv_free(rdi, qp); bail_qp: + kfree(qp->s_ack_queue); kfree(qp); bail_swq: @@ -1286,6 +1303,7 @@ int rvt_destroy_qp(struct ib_qp *ibqp) vfree(qp->r_rq.wq); vfree(qp->s_wq); rdi->driver_f.qp_priv_free(rdi, qp); + kfree(qp->s_ack_queue); kfree(qp); return 0; } diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c index 6caf5272b..30c4fda7a 100644 --- a/drivers/infiniband/sw/rdmavt/vt.c +++ b/drivers/infiniband/sw/rdmavt/vt.c @@ -106,6 +106,19 @@ struct rvt_dev_info *rvt_alloc_device(size_t size, int nports) } EXPORT_SYMBOL(rvt_alloc_device); +/** + * rvt_dealloc_device - deallocate rdi + * @rdi: structure to free + * + * Free a structure allocated with rvt_alloc_device() + */ +void rvt_dealloc_device(struct rvt_dev_info *rdi) +{ + kfree(rdi->ports); + ib_dealloc_device(&rdi->ibdev); +} +EXPORT_SYMBOL(rvt_dealloc_device); + static int rvt_query_device(struct ib_device *ibdev, struct ib_device_attr *props, struct ib_udata *uhw) @@ -488,9 +501,7 @@ static noinline int check_support(struct rvt_dev_info *rdi, int verb) !rdi->driver_f.quiesce_qp || !rdi->driver_f.notify_error_qp || !rdi->driver_f.mtu_from_qp || - !rdi->driver_f.mtu_to_path_mtu || - !rdi->driver_f.shut_down_port || - !rdi->driver_f.cap_mask_chg) + !rdi->driver_f.mtu_to_path_mtu) return -EINVAL; break; diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index caec8e9c4..4f7d9b48d 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -92,6 +92,9 @@ enum { IPOIB_FLAG_UMCAST = 10, IPOIB_STOP_NEIGH_GC = 11, IPOIB_NEIGH_TBL_FLUSH = 12, + IPOIB_FLAG_DEV_ADDR_SET = 13, + IPOIB_FLAG_DEV_ADDR_CTRL = 14, + IPOIB_FLAG_GOING_DOWN = 15, IPOIB_MAX_BACKOFF_SECONDS = 16, @@ -392,6 +395,7 @@ struct ipoib_dev_priv { struct ipoib_ethtool_st ethtool; struct timer_list poll_timer; unsigned max_send_sge; + bool sm_fullmember_sendonly_support; }; struct ipoib_ah { @@ -476,6 +480,7 @@ void ipoib_reap_ah(struct work_struct *work); void ipoib_mark_paths_invalid(struct net_device *dev); void ipoib_flush_paths(struct net_device *dev); +int ipoib_check_sm_sendonly_fullmember_support(struct ipoib_dev_priv *priv); struct ipoib_dev_priv *ipoib_intf_alloc(const char *format); int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index c8ed53562..951d9abcc 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -766,7 +766,7 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_ ipoib_dma_unmap_tx(priv, tx_req); dev_kfree_skb_any(skb); } else { - dev->trans_start = jiffies; + netif_trans_update(dev); ++tx->tx_head; if (++priv->tx_outstanding == ipoib_sendq_size) { @@ -1486,6 +1486,10 @@ static ssize_t set_mode(struct device *d, struct device_attribute *attr, { struct net_device *dev = to_net_dev(d); int ret; + struct ipoib_dev_priv *priv = netdev_priv(dev); + + if (test_bit(IPOIB_FLAG_GOING_DOWN, &priv->flags)) + return -EPERM; if (!rtnl_trylock()) return restart_syscall(); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c index a53fa5fc0..1502199c8 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c @@ -36,6 +36,27 @@ #include "ipoib.h" +struct ipoib_stats { + char stat_string[ETH_GSTRING_LEN]; + int stat_offset; +}; + +#define IPOIB_NETDEV_STAT(m) { \ + .stat_string = #m, \ + .stat_offset = offsetof(struct rtnl_link_stats64, m) } + +static const struct ipoib_stats ipoib_gstrings_stats[] = { + IPOIB_NETDEV_STAT(rx_packets), + IPOIB_NETDEV_STAT(tx_packets), + IPOIB_NETDEV_STAT(rx_bytes), + IPOIB_NETDEV_STAT(tx_bytes), + IPOIB_NETDEV_STAT(tx_errors), + IPOIB_NETDEV_STAT(rx_dropped), + IPOIB_NETDEV_STAT(tx_dropped) +}; + +#define IPOIB_GLOBAL_STATS_LEN ARRAY_SIZE(ipoib_gstrings_stats) + static void ipoib_get_drvinfo(struct net_device *netdev, struct ethtool_drvinfo *drvinfo) { @@ -92,11 +113,57 @@ static int ipoib_set_coalesce(struct net_device *dev, return 0; } +static void ipoib_get_ethtool_stats(struct net_device *dev, + struct ethtool_stats __always_unused *stats, + u64 *data) +{ + int i; + struct net_device_stats *net_stats = &dev->stats; + u8 *p = (u8 *)net_stats; + + for (i = 0; i < IPOIB_GLOBAL_STATS_LEN; i++) + data[i] = *(u64 *)(p + ipoib_gstrings_stats[i].stat_offset); + +} +static void ipoib_get_strings(struct net_device __always_unused *dev, + u32 stringset, u8 *data) +{ + u8 *p = data; + int i; + + switch (stringset) { + case ETH_SS_STATS: + for (i = 0; i < IPOIB_GLOBAL_STATS_LEN; i++) { + memcpy(p, ipoib_gstrings_stats[i].stat_string, + ETH_GSTRING_LEN); + p += ETH_GSTRING_LEN; + } + break; + case ETH_SS_TEST: + default: + break; + } +} +static int ipoib_get_sset_count(struct net_device __always_unused *dev, + int sset) +{ + switch (sset) { + case ETH_SS_STATS: + return IPOIB_GLOBAL_STATS_LEN; + case ETH_SS_TEST: + default: + break; + } + return -EOPNOTSUPP; +} static const struct ethtool_ops ipoib_ethtool_ops = { .get_drvinfo = ipoib_get_drvinfo, .get_coalesce = ipoib_get_coalesce, .set_coalesce = ipoib_set_coalesce, + .get_strings = ipoib_get_strings, + .get_ethtool_stats = ipoib_get_ethtool_stats, + .get_sset_count = ipoib_get_sset_count, }; void ipoib_set_ethtool_ops(struct net_device *dev) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index f0e55e47e..dc6d241b9 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -51,8 +51,6 @@ MODULE_PARM_DESC(data_debug_level, "Enable data path debug tracing if > 0"); #endif -static DEFINE_MUTEX(pkey_mutex); - struct ipoib_ah *ipoib_create_ah(struct net_device *dev, struct ib_pd *pd, struct ib_ah_attr *attr) { @@ -637,7 +635,7 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb, if (netif_queue_stopped(dev)) netif_wake_queue(dev); } else { - dev->trans_start = jiffies; + netif_trans_update(dev); address->last_send = priv->tx_head; ++priv->tx_head; @@ -999,6 +997,106 @@ static inline int update_child_pkey(struct ipoib_dev_priv *priv) return 0; } +/* + * returns true if the device address of the ipoib interface has changed and the + * new address is a valid one (i.e in the gid table), return false otherwise. + */ +static bool ipoib_dev_addr_changed_valid(struct ipoib_dev_priv *priv) +{ + union ib_gid search_gid; + union ib_gid gid0; + union ib_gid *netdev_gid; + int err; + u16 index; + u8 port; + bool ret = false; + + netdev_gid = (union ib_gid *)(priv->dev->dev_addr + 4); + if (ib_query_gid(priv->ca, priv->port, 0, &gid0, NULL)) + return false; + + netif_addr_lock_bh(priv->dev); + + /* The subnet prefix may have changed, update it now so we won't have + * to do it later + */ + priv->local_gid.global.subnet_prefix = gid0.global.subnet_prefix; + netdev_gid->global.subnet_prefix = gid0.global.subnet_prefix; + search_gid.global.subnet_prefix = gid0.global.subnet_prefix; + + search_gid.global.interface_id = priv->local_gid.global.interface_id; + + netif_addr_unlock_bh(priv->dev); + + err = ib_find_gid(priv->ca, &search_gid, IB_GID_TYPE_IB, + priv->dev, &port, &index); + + netif_addr_lock_bh(priv->dev); + + if (search_gid.global.interface_id != + priv->local_gid.global.interface_id) + /* There was a change while we were looking up the gid, bail + * here and let the next work sort this out + */ + goto out; + + /* The next section of code needs some background: + * Per IB spec the port GUID can't change if the HCA is powered on. + * port GUID is the basis for GID at index 0 which is the basis for + * the default device address of a ipoib interface. + * + * so it seems the flow should be: + * if user_changed_dev_addr && gid in gid tbl + * set bit dev_addr_set + * return true + * else + * return false + * + * The issue is that there are devices that don't follow the spec, + * they change the port GUID when the HCA is powered, so in order + * not to break userspace applications, We need to check if the + * user wanted to control the device address and we assume that + * if he sets the device address back to be based on GID index 0, + * he no longer wishs to control it. + * + * If the user doesn't control the the device address, + * IPOIB_FLAG_DEV_ADDR_SET is set and ib_find_gid failed it means + * the port GUID has changed and GID at index 0 has changed + * so we need to change priv->local_gid and priv->dev->dev_addr + * to reflect the new GID. + */ + if (!test_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags)) { + if (!err && port == priv->port) { + set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags); + if (index == 0) + clear_bit(IPOIB_FLAG_DEV_ADDR_CTRL, + &priv->flags); + else + set_bit(IPOIB_FLAG_DEV_ADDR_CTRL, &priv->flags); + ret = true; + } else { + ret = false; + } + } else { + if (!err && port == priv->port) { + ret = true; + } else { + if (!test_bit(IPOIB_FLAG_DEV_ADDR_CTRL, &priv->flags)) { + memcpy(&priv->local_gid, &gid0, + sizeof(priv->local_gid)); + memcpy(priv->dev->dev_addr + 4, &gid0, + sizeof(priv->local_gid)); + ret = true; + } + } + } + +out: + netif_addr_unlock_bh(priv->dev); + + return ret; +} + static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, enum ipoib_flush_level level, int nesting) @@ -1020,6 +1118,9 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags) && level != IPOIB_FLUSH_HEAVY) { + /* Make sure the dev_addr is set even if not flushing */ + if (level == IPOIB_FLUSH_LIGHT) + ipoib_dev_addr_changed_valid(priv); ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n"); return; } @@ -1031,7 +1132,8 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, update_parent_pkey(priv); else update_child_pkey(priv); - } + } else if (level == IPOIB_FLUSH_LIGHT) + ipoib_dev_addr_changed_valid(priv); ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_ADMIN_UP not set.\n"); return; } @@ -1083,7 +1185,8 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) { if (level >= IPOIB_FLUSH_NORMAL) ipoib_ib_dev_up(dev); - ipoib_mcast_restart_task(&priv->restart_task); + if (ipoib_dev_addr_changed_valid(priv)) + ipoib_mcast_restart_task(&priv->restart_task); } } diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 80807d6e5..5f58c41ef 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -99,6 +99,7 @@ static struct net_device *ipoib_get_net_dev_by_params( struct ib_device *dev, u8 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr, void *client_data); +static int ipoib_set_mac(struct net_device *dev, void *addr); static struct ib_client ipoib_client = { .name = "ipoib", @@ -117,6 +118,8 @@ int ipoib_open(struct net_device *dev) set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); + priv->sm_fullmember_sendonly_support = false; + if (ipoib_ib_dev_open(dev)) { if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) return 0; @@ -629,6 +632,77 @@ void ipoib_mark_paths_invalid(struct net_device *dev) spin_unlock_irq(&priv->lock); } +struct classport_info_context { + struct ipoib_dev_priv *priv; + struct completion done; + struct ib_sa_query *sa_query; +}; + +static void classport_info_query_cb(int status, struct ib_class_port_info *rec, + void *context) +{ + struct classport_info_context *cb_ctx = context; + struct ipoib_dev_priv *priv; + + WARN_ON(!context); + + priv = cb_ctx->priv; + + if (status || !rec) { + pr_debug("device: %s failed query classport_info status: %d\n", + priv->dev->name, status); + /* keeps the default, will try next mcast_restart */ + priv->sm_fullmember_sendonly_support = false; + goto out; + } + + if (ib_get_cpi_capmask2(rec) & + IB_SA_CAP_MASK2_SENDONLY_FULL_MEM_SUPPORT) { + pr_debug("device: %s enabled fullmember-sendonly for sendonly MCG\n", + priv->dev->name); + priv->sm_fullmember_sendonly_support = true; + } else { + pr_debug("device: %s disabled fullmember-sendonly for sendonly MCG\n", + priv->dev->name); + priv->sm_fullmember_sendonly_support = false; + } + +out: + complete(&cb_ctx->done); +} + +int ipoib_check_sm_sendonly_fullmember_support(struct ipoib_dev_priv *priv) +{ + struct classport_info_context *callback_context; + int ret; + + callback_context = kmalloc(sizeof(*callback_context), GFP_KERNEL); + if (!callback_context) + return -ENOMEM; + + callback_context->priv = priv; + init_completion(&callback_context->done); + + ret = ib_sa_classport_info_rec_query(&ipoib_sa_client, + priv->ca, priv->port, 3000, + GFP_KERNEL, + classport_info_query_cb, + callback_context, + &callback_context->sa_query); + if (ret < 0) { + pr_info("%s failed to send ib_sa_classport_info query, ret: %d\n", + priv->dev->name, ret); + kfree(callback_context); + return ret; + } + + /* waiting for the callback to finish before returnning */ + wait_for_completion(&callback_context->done); + kfree(callback_context); + + return ret; +} + void ipoib_flush_paths(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); @@ -1036,7 +1110,7 @@ static void ipoib_timeout(struct net_device *dev) struct ipoib_dev_priv *priv = netdev_priv(dev); ipoib_warn(priv, "transmit timeout: latency %d msecs\n", - jiffies_to_msecs(jiffies - dev->trans_start)); + jiffies_to_msecs(jiffies - dev_trans_start(dev))); ipoib_warn(priv, "queue stopped %d, tx_head %u, tx_tail %u\n", netif_queue_stopped(dev), priv->tx_head, priv->tx_tail); @@ -1132,7 +1206,9 @@ struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr) neigh = NULL; goto out_unlock; } - neigh->alive = jiffies; + + if (likely(skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE)) + neigh->alive = jiffies; goto out_unlock; } } @@ -1649,6 +1725,7 @@ static const struct net_device_ops ipoib_netdev_ops_pf = { .ndo_get_vf_config = ipoib_get_vf_config, .ndo_get_vf_stats = ipoib_get_vf_stats, .ndo_set_vf_guid = ipoib_set_vf_guid, + .ndo_set_mac_address = ipoib_set_mac, }; static const struct net_device_ops ipoib_netdev_ops_vf = { @@ -1771,6 +1848,70 @@ int ipoib_add_umcast_attr(struct net_device *dev) return device_create_file(&dev->dev, &dev_attr_umcast); } +static void set_base_guid(struct ipoib_dev_priv *priv, union ib_gid *gid) +{ + struct ipoib_dev_priv *child_priv; + struct net_device *netdev = priv->dev; + + netif_addr_lock_bh(netdev); + + memcpy(&priv->local_gid.global.interface_id, + &gid->global.interface_id, + sizeof(gid->global.interface_id)); + memcpy(netdev->dev_addr + 4, &priv->local_gid, sizeof(priv->local_gid)); + clear_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags); + + netif_addr_unlock_bh(netdev); + + if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { + down_read(&priv->vlan_rwsem); + list_for_each_entry(child_priv, &priv->child_intfs, list) + set_base_guid(child_priv, gid); + up_read(&priv->vlan_rwsem); + } +} + +static int ipoib_check_lladdr(struct net_device *dev, + struct sockaddr_storage *ss) +{ + union ib_gid *gid = (union ib_gid *)(ss->__data + 4); + int ret = 0; + + netif_addr_lock_bh(dev); + + /* Make sure the QPN, reserved and subnet prefix match the current + * lladdr, it also makes sure the lladdr is unicast. + */ + if (memcmp(dev->dev_addr, ss->__data, + 4 + sizeof(gid->global.subnet_prefix)) || + gid->global.interface_id == 0) + ret = -EINVAL; + + netif_addr_unlock_bh(dev); + + return ret; +} + +static int ipoib_set_mac(struct net_device *dev, void *addr) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct sockaddr_storage *ss = addr; + int ret; + + if (!(dev->priv_flags & IFF_LIVE_ADDR_CHANGE) && netif_running(dev)) + return -EBUSY; + + ret = ipoib_check_lladdr(dev, ss); + if (ret) + return ret; + + set_base_guid(priv, (union ib_gid *)(ss->__data + 4)); + + queue_work(ipoib_workqueue, &priv->flush_light); + + return 0; +} + static ssize_t create_child(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) @@ -1894,6 +2035,7 @@ static struct net_device *ipoib_add_port(const char *format, goto device_init_failed; } else memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid)); + set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags); result = ipoib_dev_init(priv->dev, hca, port); if (result < 0) { @@ -2001,6 +2143,9 @@ static void ipoib_remove_one(struct ib_device *device, void *client_data) ib_unregister_event_handler(&priv->event_handler); flush_workqueue(ipoib_workqueue); + /* mark interface in the middle of destruction */ + set_bit(IPOIB_FLAG_GOING_DOWN, &priv->flags); + rtnl_lock(); dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP); rtnl_unlock(); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index 25889311b..d3394b6ad 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -64,6 +64,9 @@ struct ipoib_mcast_iter { unsigned int send_only; }; +/* join state that allows creating mcg with sendonly member request */ +#define SENDONLY_FULLMEMBER_JOIN 8 + /* * This should be called with the priv->lock held */ @@ -326,12 +329,23 @@ void ipoib_mcast_carrier_on_task(struct work_struct *work) struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, carrier_on_task); struct ib_port_attr attr; + int ret; if (ib_query_port(priv->ca, priv->port, &attr) || attr.state != IB_PORT_ACTIVE) { ipoib_dbg(priv, "Keeping carrier off until IB port is active\n"); return; } + /* + * Check if can send sendonly MCG's with sendonly-fullmember join state. + * It done here after the successfully join to the broadcast group, + * because the broadcast group must always be joined first and is always + * re-joined if the SM changes substantially. + */ + ret = ipoib_check_sm_sendonly_fullmember_support(priv); + if (ret < 0) + pr_debug("%s failed query sm support for sendonly-fullmember (ret: %d)\n", + priv->dev->name, ret); /* * Take rtnl_lock to avoid racing with ipoib_stop() and @@ -515,22 +529,20 @@ static int ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast) rec.hop_limit = priv->broadcast->mcmember.hop_limit; /* - * Send-only IB Multicast joins do not work at the core - * IB layer yet, so we can't use them here. However, - * we are emulating an Ethernet multicast send, which - * does not require a multicast subscription and will - * still send properly. The most appropriate thing to + * Send-only IB Multicast joins work at the core IB layer but + * require specific SM support. + * We can use such joins here only if the current SM supports that feature. + * However, if not, we emulate an Ethernet multicast send, + * which does not require a multicast subscription and will + * still send properly. The most appropriate thing to * do is to create the group if it doesn't exist as that * most closely emulates the behavior, from a user space - * application perspecitive, of Ethernet multicast - * operation. For now, we do a full join, maybe later - * when the core IB layers support send only joins we - * will use them. + * application perspective, of Ethernet multicast operation. */ -#if 0 - if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) - rec.join_state = 4; -#endif + if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) && + priv->sm_fullmember_sendonly_support) + /* SM supports sendonly-fullmember, otherwise fallback to full-member */ + rec.join_state = SENDONLY_FULLMEMBER_JOIN; } spin_unlock_irq(&priv->lock); @@ -570,11 +582,13 @@ void ipoib_mcast_join_task(struct work_struct *work) return; } priv->local_lid = port_attr.lid; + netif_addr_lock_bh(dev); - if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid, NULL)) - ipoib_warn(priv, "ib_query_gid() failed\n"); - else - memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid)); + if (!test_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags)) { + netif_addr_unlock_bh(dev); + return; + } + netif_addr_unlock_bh(dev); spin_lock_irq(&priv->lock); if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c index b809c373e..1e7cbbaa1 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -307,5 +307,8 @@ void ipoib_event(struct ib_event_handler *handler, queue_work(ipoib_workqueue, &priv->flush_normal); } else if (record->event == IB_EVENT_PKEY_CHANGE) { queue_work(ipoib_workqueue, &priv->flush_heavy); + } else if (record->event == IB_EVENT_GID_CHANGE && + !test_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags)) { + queue_work(ipoib_workqueue, &priv->flush_light); } } diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c index fca1a882d..a2f9f29c6 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c @@ -68,6 +68,8 @@ int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv, priv->pkey = pkey; memcpy(priv->dev->dev_addr, ppriv->dev->dev_addr, INFINIBAND_ALEN); + memcpy(&priv->local_gid, &ppriv->local_gid, sizeof(priv->local_gid)); + set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags); priv->dev->broadcast[8] = pkey >> 8; priv->dev->broadcast[9] = pkey & 0xff; @@ -129,6 +131,9 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) ppriv = netdev_priv(pdev); + if (test_bit(IPOIB_FLAG_GOING_DOWN, &ppriv->flags)) + return -EPERM; + snprintf(intf_name, sizeof intf_name, "%s.%04x", ppriv->dev->name, pkey); priv = ipoib_intf_alloc(intf_name); @@ -181,6 +186,9 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey) ppriv = netdev_priv(pdev); + if (test_bit(IPOIB_FLAG_GOING_DOWN, &ppriv->flags)) + return -EPERM; + if (!rtnl_trylock()) return restart_syscall(); diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c index 9a391cc5b..90be56893 100644 --- a/drivers/infiniband/ulp/iser/iser_memory.c +++ b/drivers/infiniband/ulp/iser/iser_memory.c @@ -236,7 +236,7 @@ int iser_fast_reg_fmr(struct iscsi_iser_task *iser_task, page_vec->npages = 0; page_vec->fake_mr.page_size = SIZE_4K; plen = ib_sg_to_pages(&page_vec->fake_mr, mem->sg, - mem->size, iser_set_page); + mem->size, NULL, iser_set_page); if (unlikely(plen < mem->size)) { iser_err("page vec too short to hold this SG\n"); iser_data_buf_dump(mem, device->ib_device); @@ -446,7 +446,7 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task, ib_update_fast_reg_key(mr, ib_inc_rkey(mr->rkey)); - n = ib_map_mr_sg(mr, mem->sg, mem->size, SIZE_4K); + n = ib_map_mr_sg(mr, mem->sg, mem->size, NULL, SIZE_4K); if (unlikely(n != mem->size)) { iser_err("failed to map sg (%d/%d)\n", n, mem->size); diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index 411e4464c..a990c0420 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -33,7 +33,8 @@ #define ISERT_MAX_CONN 8 #define ISER_MAX_RX_CQ_LEN (ISERT_QP_MAX_RECV_DTOS * ISERT_MAX_CONN) -#define ISER_MAX_TX_CQ_LEN (ISERT_QP_MAX_REQ_DTOS * ISERT_MAX_CONN) +#define ISER_MAX_TX_CQ_LEN \ + ((ISERT_QP_MAX_REQ_DTOS + ISCSI_DEF_XMIT_CMDS_MAX) * ISERT_MAX_CONN) #define ISER_MAX_CQ_LEN (ISER_MAX_RX_CQ_LEN + ISER_MAX_TX_CQ_LEN + \ ISERT_MAX_CONN) @@ -46,14 +47,6 @@ static LIST_HEAD(device_list); static struct workqueue_struct *isert_comp_wq; static struct workqueue_struct *isert_release_wq; -static void -isert_unmap_cmd(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn); -static int -isert_map_rdma(struct isert_cmd *isert_cmd, struct iscsi_conn *conn); -static void -isert_unreg_rdma(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn); -static int -isert_reg_rdma(struct isert_cmd *isert_cmd, struct iscsi_conn *conn); static int isert_put_response(struct iscsi_conn *conn, struct iscsi_cmd *cmd); static int @@ -142,6 +135,7 @@ isert_create_qp(struct isert_conn *isert_conn, attr.recv_cq = comp->cq; attr.cap.max_send_wr = ISERT_QP_MAX_REQ_DTOS + 1; attr.cap.max_recv_wr = ISERT_QP_MAX_RECV_DTOS + 1; + attr.cap.max_rdma_ctxs = ISCSI_DEF_XMIT_CMDS_MAX; attr.cap.max_send_sge = device->ib_device->attrs.max_sge; isert_conn->max_sge = min(device->ib_device->attrs.max_sge, device->ib_device->attrs.max_sge_rd); @@ -270,9 +264,9 @@ isert_alloc_comps(struct isert_device *device) device->ib_device->num_comp_vectors)); isert_info("Using %d CQs, %s supports %d vectors support " - "Fast registration %d pi_capable %d\n", + "pi_capable %d\n", device->comps_used, device->ib_device->name, - device->ib_device->num_comp_vectors, device->use_fastreg, + device->ib_device->num_comp_vectors, device->pi_capable); device->comps = kcalloc(device->comps_used, sizeof(struct isert_comp), @@ -313,18 +307,6 @@ isert_create_device_ib_res(struct isert_device *device) isert_dbg("devattr->max_sge: %d\n", ib_dev->attrs.max_sge); isert_dbg("devattr->max_sge_rd: %d\n", ib_dev->attrs.max_sge_rd); - /* asign function handlers */ - if (ib_dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS && - ib_dev->attrs.device_cap_flags & IB_DEVICE_SIGNATURE_HANDOVER) { - device->use_fastreg = 1; - device->reg_rdma_mem = isert_reg_rdma; - device->unreg_rdma_mem = isert_unreg_rdma; - } else { - device->use_fastreg = 0; - device->reg_rdma_mem = isert_map_rdma; - device->unreg_rdma_mem = isert_unmap_cmd; - } - ret = isert_alloc_comps(device); if (ret) goto out; @@ -417,146 +399,6 @@ isert_device_get(struct rdma_cm_id *cma_id) } static void -isert_conn_free_fastreg_pool(struct isert_conn *isert_conn) -{ - struct fast_reg_descriptor *fr_desc, *tmp; - int i = 0; - - if (list_empty(&isert_conn->fr_pool)) - return; - - isert_info("Freeing conn %p fastreg pool", isert_conn); - - list_for_each_entry_safe(fr_desc, tmp, - &isert_conn->fr_pool, list) { - list_del(&fr_desc->list); - ib_dereg_mr(fr_desc->data_mr); - if (fr_desc->pi_ctx) { - ib_dereg_mr(fr_desc->pi_ctx->prot_mr); - ib_dereg_mr(fr_desc->pi_ctx->sig_mr); - kfree(fr_desc->pi_ctx); - } - kfree(fr_desc); - ++i; - } - - if (i < isert_conn->fr_pool_size) - isert_warn("Pool still has %d regions registered\n", - isert_conn->fr_pool_size - i); -} - -static int -isert_create_pi_ctx(struct fast_reg_descriptor *desc, - struct ib_device *device, - struct ib_pd *pd) -{ - struct pi_context *pi_ctx; - int ret; - - pi_ctx = kzalloc(sizeof(*desc->pi_ctx), GFP_KERNEL); - if (!pi_ctx) { - isert_err("Failed to allocate pi context\n"); - return -ENOMEM; - } - - pi_ctx->prot_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, - ISCSI_ISER_SG_TABLESIZE); - if (IS_ERR(pi_ctx->prot_mr)) { - isert_err("Failed to allocate prot frmr err=%ld\n", - PTR_ERR(pi_ctx->prot_mr)); - ret = PTR_ERR(pi_ctx->prot_mr); - goto err_pi_ctx; - } - desc->ind |= ISERT_PROT_KEY_VALID; - - pi_ctx->sig_mr = ib_alloc_mr(pd, IB_MR_TYPE_SIGNATURE, 2); - if (IS_ERR(pi_ctx->sig_mr)) { - isert_err("Failed to allocate signature enabled mr err=%ld\n", - PTR_ERR(pi_ctx->sig_mr)); - ret = PTR_ERR(pi_ctx->sig_mr); - goto err_prot_mr; - } - - desc->pi_ctx = pi_ctx; - desc->ind |= ISERT_SIG_KEY_VALID; - desc->ind &= ~ISERT_PROTECTED; - - return 0; - -err_prot_mr: - ib_dereg_mr(pi_ctx->prot_mr); -err_pi_ctx: - kfree(pi_ctx); - - return ret; -} - -static int -isert_create_fr_desc(struct ib_device *ib_device, struct ib_pd *pd, - struct fast_reg_descriptor *fr_desc) -{ - fr_desc->data_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, - ISCSI_ISER_SG_TABLESIZE); - if (IS_ERR(fr_desc->data_mr)) { - isert_err("Failed to allocate data frmr err=%ld\n", - PTR_ERR(fr_desc->data_mr)); - return PTR_ERR(fr_desc->data_mr); - } - fr_desc->ind |= ISERT_DATA_KEY_VALID; - - isert_dbg("Created fr_desc %p\n", fr_desc); - - return 0; -} - -static int -isert_conn_create_fastreg_pool(struct isert_conn *isert_conn) -{ - struct fast_reg_descriptor *fr_desc; - struct isert_device *device = isert_conn->device; - struct se_session *se_sess = isert_conn->conn->sess->se_sess; - struct se_node_acl *se_nacl = se_sess->se_node_acl; - int i, ret, tag_num; - /* - * Setup the number of FRMRs based upon the number of tags - * available to session in iscsi_target_locate_portal(). - */ - tag_num = max_t(u32, ISCSIT_MIN_TAGS, se_nacl->queue_depth); - tag_num = (tag_num * 2) + ISCSIT_EXTRA_TAGS; - - isert_conn->fr_pool_size = 0; - for (i = 0; i < tag_num; i++) { - fr_desc = kzalloc(sizeof(*fr_desc), GFP_KERNEL); - if (!fr_desc) { - isert_err("Failed to allocate fast_reg descriptor\n"); - ret = -ENOMEM; - goto err; - } - - ret = isert_create_fr_desc(device->ib_device, - device->pd, fr_desc); - if (ret) { - isert_err("Failed to create fastreg descriptor err=%d\n", - ret); - kfree(fr_desc); - goto err; - } - - list_add_tail(&fr_desc->list, &isert_conn->fr_pool); - isert_conn->fr_pool_size++; - } - - isert_dbg("Creating conn %p fastreg pool size=%d", - isert_conn, isert_conn->fr_pool_size); - - return 0; - -err: - isert_conn_free_fastreg_pool(isert_conn); - return ret; -} - -static void isert_init_conn(struct isert_conn *isert_conn) { isert_conn->state = ISER_CONN_INIT; @@ -565,8 +407,6 @@ isert_init_conn(struct isert_conn *isert_conn) init_completion(&isert_conn->login_req_comp); kref_init(&isert_conn->kref); mutex_init(&isert_conn->mutex); - spin_lock_init(&isert_conn->pool_lock); - INIT_LIST_HEAD(&isert_conn->fr_pool); INIT_WORK(&isert_conn->release_work, isert_release_work); } @@ -739,9 +579,6 @@ isert_connect_release(struct isert_conn *isert_conn) BUG_ON(!device); - if (device->use_fastreg) - isert_conn_free_fastreg_pool(isert_conn); - isert_free_rx_descriptors(isert_conn); if (isert_conn->cm_id) rdma_destroy_id(isert_conn->cm_id); @@ -1080,7 +917,6 @@ isert_init_send_wr(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, { struct iser_tx_desc *tx_desc = &isert_cmd->tx_desc; - isert_cmd->iser_ib_op = ISER_IB_SEND; tx_desc->tx_cqe.done = isert_send_done; send_wr->wr_cqe = &tx_desc->tx_cqe; @@ -1160,16 +996,6 @@ isert_put_login_tx(struct iscsi_conn *conn, struct iscsi_login *login, } if (!login->login_failed) { if (login->login_complete) { - if (!conn->sess->sess_ops->SessionType && - isert_conn->device->use_fastreg) { - ret = isert_conn_create_fastreg_pool(isert_conn); - if (ret) { - isert_err("Conn: %p failed to create" - " fastreg pool\n", isert_conn); - return ret; - } - } - ret = isert_alloc_rx_descriptors(isert_conn); if (ret) return ret; @@ -1633,97 +1459,26 @@ isert_login_recv_done(struct ib_cq *cq, struct ib_wc *wc) ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE); } -static int -isert_map_data_buf(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, - struct scatterlist *sg, u32 nents, u32 length, u32 offset, - enum iser_ib_op_code op, struct isert_data_buf *data) -{ - struct ib_device *ib_dev = isert_conn->cm_id->device; - - data->dma_dir = op == ISER_IB_RDMA_WRITE ? - DMA_TO_DEVICE : DMA_FROM_DEVICE; - - data->len = length - offset; - data->offset = offset; - data->sg_off = data->offset / PAGE_SIZE; - - data->sg = &sg[data->sg_off]; - data->nents = min_t(unsigned int, nents - data->sg_off, - ISCSI_ISER_SG_TABLESIZE); - data->len = min_t(unsigned int, data->len, ISCSI_ISER_SG_TABLESIZE * - PAGE_SIZE); - - data->dma_nents = ib_dma_map_sg(ib_dev, data->sg, data->nents, - data->dma_dir); - if (unlikely(!data->dma_nents)) { - isert_err("Cmd: unable to dma map SGs %p\n", sg); - return -EINVAL; - } - - isert_dbg("Mapped cmd: %p count: %u sg: %p sg_nents: %u rdma_len %d\n", - isert_cmd, data->dma_nents, data->sg, data->nents, data->len); - - return 0; -} - -static void -isert_unmap_data_buf(struct isert_conn *isert_conn, struct isert_data_buf *data) -{ - struct ib_device *ib_dev = isert_conn->cm_id->device; - - ib_dma_unmap_sg(ib_dev, data->sg, data->nents, data->dma_dir); - memset(data, 0, sizeof(*data)); -} - - - static void -isert_unmap_cmd(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn) +isert_rdma_rw_ctx_destroy(struct isert_cmd *cmd, struct isert_conn *conn) { - isert_dbg("Cmd %p\n", isert_cmd); + struct se_cmd *se_cmd = &cmd->iscsi_cmd->se_cmd; + enum dma_data_direction dir = target_reverse_dma_direction(se_cmd); - if (isert_cmd->data.sg) { - isert_dbg("Cmd %p unmap_sg op\n", isert_cmd); - isert_unmap_data_buf(isert_conn, &isert_cmd->data); - } - - if (isert_cmd->rdma_wr) { - isert_dbg("Cmd %p free send_wr\n", isert_cmd); - kfree(isert_cmd->rdma_wr); - isert_cmd->rdma_wr = NULL; - } - - if (isert_cmd->ib_sge) { - isert_dbg("Cmd %p free ib_sge\n", isert_cmd); - kfree(isert_cmd->ib_sge); - isert_cmd->ib_sge = NULL; - } -} - -static void -isert_unreg_rdma(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn) -{ - isert_dbg("Cmd %p\n", isert_cmd); - - if (isert_cmd->fr_desc) { - isert_dbg("Cmd %p free fr_desc %p\n", isert_cmd, isert_cmd->fr_desc); - if (isert_cmd->fr_desc->ind & ISERT_PROTECTED) { - isert_unmap_data_buf(isert_conn, &isert_cmd->prot); - isert_cmd->fr_desc->ind &= ~ISERT_PROTECTED; - } - spin_lock_bh(&isert_conn->pool_lock); - list_add_tail(&isert_cmd->fr_desc->list, &isert_conn->fr_pool); - spin_unlock_bh(&isert_conn->pool_lock); - isert_cmd->fr_desc = NULL; - } + if (!cmd->rw.nr_ops) + return; - if (isert_cmd->data.sg) { - isert_dbg("Cmd %p unmap_sg op\n", isert_cmd); - isert_unmap_data_buf(isert_conn, &isert_cmd->data); + if (isert_prot_cmd(conn, se_cmd)) { + rdma_rw_ctx_destroy_signature(&cmd->rw, conn->qp, + conn->cm_id->port_num, se_cmd->t_data_sg, + se_cmd->t_data_nents, se_cmd->t_prot_sg, + se_cmd->t_prot_nents, dir); + } else { + rdma_rw_ctx_destroy(&cmd->rw, conn->qp, conn->cm_id->port_num, + se_cmd->t_data_sg, se_cmd->t_data_nents, dir); } - isert_cmd->ib_sge = NULL; - isert_cmd->rdma_wr = NULL; + cmd->rw.nr_ops = 0; } static void @@ -1732,7 +1487,6 @@ isert_put_cmd(struct isert_cmd *isert_cmd, bool comp_err) struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; struct isert_conn *isert_conn = isert_cmd->conn; struct iscsi_conn *conn = isert_conn->conn; - struct isert_device *device = isert_conn->device; struct iscsi_text_rsp *hdr; isert_dbg("Cmd %p\n", isert_cmd); @@ -1760,7 +1514,7 @@ isert_put_cmd(struct isert_cmd *isert_cmd, bool comp_err) } } - device->unreg_rdma_mem(isert_cmd, isert_conn); + isert_rdma_rw_ctx_destroy(isert_cmd, isert_conn); transport_generic_free_cmd(&cmd->se_cmd, 0); break; case ISCSI_OP_SCSI_TMFUNC: @@ -1894,14 +1648,9 @@ isert_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) isert_dbg("Cmd %p\n", isert_cmd); - if (isert_cmd->fr_desc && isert_cmd->fr_desc->ind & ISERT_PROTECTED) { - ret = isert_check_pi_status(cmd, - isert_cmd->fr_desc->pi_ctx->sig_mr); - isert_cmd->fr_desc->ind &= ~ISERT_PROTECTED; - } + ret = isert_check_pi_status(cmd, isert_cmd->rw.sig->sig_mr); + isert_rdma_rw_ctx_destroy(isert_cmd, isert_conn); - device->unreg_rdma_mem(isert_cmd, isert_conn); - isert_cmd->rdma_wr_num = 0; if (ret) transport_send_check_condition_and_sense(cmd, cmd->pi_err, 0); else @@ -1929,16 +1678,12 @@ isert_rdma_read_done(struct ib_cq *cq, struct ib_wc *wc) isert_dbg("Cmd %p\n", isert_cmd); - if (isert_cmd->fr_desc && isert_cmd->fr_desc->ind & ISERT_PROTECTED) { - ret = isert_check_pi_status(se_cmd, - isert_cmd->fr_desc->pi_ctx->sig_mr); - isert_cmd->fr_desc->ind &= ~ISERT_PROTECTED; - } - iscsit_stop_dataout_timer(cmd); - device->unreg_rdma_mem(isert_cmd, isert_conn); - cmd->write_data_done = isert_cmd->data.len; - isert_cmd->rdma_wr_num = 0; + + if (isert_prot_cmd(isert_conn, se_cmd)) + ret = isert_check_pi_status(se_cmd, isert_cmd->rw.sig->sig_mr); + isert_rdma_rw_ctx_destroy(isert_cmd, isert_conn); + cmd->write_data_done = 0; isert_dbg("Cmd: %p RDMA_READ comp calling execute_cmd\n", isert_cmd); spin_lock_bh(&cmd->istate_lock); @@ -2111,7 +1856,6 @@ isert_aborted_task(struct iscsi_conn *conn, struct iscsi_cmd *cmd) { struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); struct isert_conn *isert_conn = conn->context; - struct isert_device *device = isert_conn->device; spin_lock_bh(&conn->cmd_lock); if (!list_empty(&cmd->i_conn_node)) @@ -2120,8 +1864,7 @@ isert_aborted_task(struct iscsi_conn *conn, struct iscsi_cmd *cmd) if (cmd->data_direction == DMA_TO_DEVICE) iscsit_stop_dataout_timer(cmd); - - device->unreg_rdma_mem(isert_cmd, isert_conn); + isert_rdma_rw_ctx_destroy(isert_cmd, isert_conn); } static enum target_prot_op @@ -2274,234 +2017,6 @@ isert_put_text_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn) return isert_post_response(isert_conn, isert_cmd); } -static int -isert_build_rdma_wr(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, - struct ib_sge *ib_sge, struct ib_rdma_wr *rdma_wr, - u32 data_left, u32 offset) -{ - struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; - struct scatterlist *sg_start, *tmp_sg; - struct isert_device *device = isert_conn->device; - struct ib_device *ib_dev = device->ib_device; - u32 sg_off, page_off; - int i = 0, sg_nents; - - sg_off = offset / PAGE_SIZE; - sg_start = &cmd->se_cmd.t_data_sg[sg_off]; - sg_nents = min(cmd->se_cmd.t_data_nents - sg_off, isert_conn->max_sge); - page_off = offset % PAGE_SIZE; - - rdma_wr->wr.sg_list = ib_sge; - rdma_wr->wr.wr_cqe = &isert_cmd->tx_desc.tx_cqe; - - /* - * Perform mapping of TCM scatterlist memory ib_sge dma_addr. - */ - for_each_sg(sg_start, tmp_sg, sg_nents, i) { - isert_dbg("RDMA from SGL dma_addr: 0x%llx dma_len: %u, " - "page_off: %u\n", - (unsigned long long)tmp_sg->dma_address, - tmp_sg->length, page_off); - - ib_sge->addr = ib_sg_dma_address(ib_dev, tmp_sg) + page_off; - ib_sge->length = min_t(u32, data_left, - ib_sg_dma_len(ib_dev, tmp_sg) - page_off); - ib_sge->lkey = device->pd->local_dma_lkey; - - isert_dbg("RDMA ib_sge: addr: 0x%llx length: %u lkey: %x\n", - ib_sge->addr, ib_sge->length, ib_sge->lkey); - page_off = 0; - data_left -= ib_sge->length; - if (!data_left) - break; - ib_sge++; - isert_dbg("Incrementing ib_sge pointer to %p\n", ib_sge); - } - - rdma_wr->wr.num_sge = ++i; - isert_dbg("Set outgoing sg_list: %p num_sg: %u from TCM SGLs\n", - rdma_wr->wr.sg_list, rdma_wr->wr.num_sge); - - return rdma_wr->wr.num_sge; -} - -static int -isert_map_rdma(struct isert_cmd *isert_cmd, struct iscsi_conn *conn) -{ - struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; - struct se_cmd *se_cmd = &cmd->se_cmd; - struct isert_conn *isert_conn = conn->context; - struct isert_data_buf *data = &isert_cmd->data; - struct ib_rdma_wr *rdma_wr; - struct ib_sge *ib_sge; - u32 offset, data_len, data_left, rdma_write_max, va_offset = 0; - int ret = 0, i, ib_sge_cnt; - - offset = isert_cmd->iser_ib_op == ISER_IB_RDMA_READ ? - cmd->write_data_done : 0; - ret = isert_map_data_buf(isert_conn, isert_cmd, se_cmd->t_data_sg, - se_cmd->t_data_nents, se_cmd->data_length, - offset, isert_cmd->iser_ib_op, - &isert_cmd->data); - if (ret) - return ret; - - data_left = data->len; - offset = data->offset; - - ib_sge = kzalloc(sizeof(struct ib_sge) * data->nents, GFP_KERNEL); - if (!ib_sge) { - isert_warn("Unable to allocate ib_sge\n"); - ret = -ENOMEM; - goto unmap_cmd; - } - isert_cmd->ib_sge = ib_sge; - - isert_cmd->rdma_wr_num = DIV_ROUND_UP(data->nents, isert_conn->max_sge); - isert_cmd->rdma_wr = kzalloc(sizeof(struct ib_rdma_wr) * - isert_cmd->rdma_wr_num, GFP_KERNEL); - if (!isert_cmd->rdma_wr) { - isert_dbg("Unable to allocate isert_cmd->rdma_wr\n"); - ret = -ENOMEM; - goto unmap_cmd; - } - - rdma_write_max = isert_conn->max_sge * PAGE_SIZE; - - for (i = 0; i < isert_cmd->rdma_wr_num; i++) { - rdma_wr = &isert_cmd->rdma_wr[i]; - data_len = min(data_left, rdma_write_max); - - rdma_wr->wr.send_flags = 0; - if (isert_cmd->iser_ib_op == ISER_IB_RDMA_WRITE) { - isert_cmd->tx_desc.tx_cqe.done = isert_rdma_write_done; - - rdma_wr->wr.opcode = IB_WR_RDMA_WRITE; - rdma_wr->remote_addr = isert_cmd->read_va + offset; - rdma_wr->rkey = isert_cmd->read_stag; - if (i + 1 == isert_cmd->rdma_wr_num) - rdma_wr->wr.next = &isert_cmd->tx_desc.send_wr; - else - rdma_wr->wr.next = &isert_cmd->rdma_wr[i + 1].wr; - } else { - isert_cmd->tx_desc.tx_cqe.done = isert_rdma_read_done; - - rdma_wr->wr.opcode = IB_WR_RDMA_READ; - rdma_wr->remote_addr = isert_cmd->write_va + va_offset; - rdma_wr->rkey = isert_cmd->write_stag; - if (i + 1 == isert_cmd->rdma_wr_num) - rdma_wr->wr.send_flags = IB_SEND_SIGNALED; - else - rdma_wr->wr.next = &isert_cmd->rdma_wr[i + 1].wr; - } - - ib_sge_cnt = isert_build_rdma_wr(isert_conn, isert_cmd, ib_sge, - rdma_wr, data_len, offset); - ib_sge += ib_sge_cnt; - - offset += data_len; - va_offset += data_len; - data_left -= data_len; - } - - return 0; -unmap_cmd: - isert_unmap_data_buf(isert_conn, data); - - return ret; -} - -static inline void -isert_inv_rkey(struct ib_send_wr *inv_wr, struct ib_mr *mr) -{ - u32 rkey; - - memset(inv_wr, 0, sizeof(*inv_wr)); - inv_wr->wr_cqe = NULL; - inv_wr->opcode = IB_WR_LOCAL_INV; - inv_wr->ex.invalidate_rkey = mr->rkey; - - /* Bump the key */ - rkey = ib_inc_rkey(mr->rkey); - ib_update_fast_reg_key(mr, rkey); -} - -static int -isert_fast_reg_mr(struct isert_conn *isert_conn, - struct fast_reg_descriptor *fr_desc, - struct isert_data_buf *mem, - enum isert_indicator ind, - struct ib_sge *sge) -{ - struct isert_device *device = isert_conn->device; - struct ib_device *ib_dev = device->ib_device; - struct ib_mr *mr; - struct ib_reg_wr reg_wr; - struct ib_send_wr inv_wr, *bad_wr, *wr = NULL; - int ret, n; - - if (mem->dma_nents == 1) { - sge->lkey = device->pd->local_dma_lkey; - sge->addr = ib_sg_dma_address(ib_dev, &mem->sg[0]); - sge->length = ib_sg_dma_len(ib_dev, &mem->sg[0]); - isert_dbg("sge: addr: 0x%llx length: %u lkey: %x\n", - sge->addr, sge->length, sge->lkey); - return 0; - } - - if (ind == ISERT_DATA_KEY_VALID) - /* Registering data buffer */ - mr = fr_desc->data_mr; - else - /* Registering protection buffer */ - mr = fr_desc->pi_ctx->prot_mr; - - if (!(fr_desc->ind & ind)) { - isert_inv_rkey(&inv_wr, mr); - wr = &inv_wr; - } - - n = ib_map_mr_sg(mr, mem->sg, mem->nents, PAGE_SIZE); - if (unlikely(n != mem->nents)) { - isert_err("failed to map mr sg (%d/%d)\n", - n, mem->nents); - return n < 0 ? n : -EINVAL; - } - - isert_dbg("Use fr_desc %p sg_nents %d offset %u\n", - fr_desc, mem->nents, mem->offset); - - reg_wr.wr.next = NULL; - reg_wr.wr.opcode = IB_WR_REG_MR; - reg_wr.wr.wr_cqe = NULL; - reg_wr.wr.send_flags = 0; - reg_wr.wr.num_sge = 0; - reg_wr.mr = mr; - reg_wr.key = mr->lkey; - reg_wr.access = IB_ACCESS_LOCAL_WRITE; - - if (!wr) - wr = ®_wr.wr; - else - wr->next = ®_wr.wr; - - ret = ib_post_send(isert_conn->qp, wr, &bad_wr); - if (ret) { - isert_err("fast registration failed, ret:%d\n", ret); - return ret; - } - fr_desc->ind &= ~ind; - - sge->lkey = mr->lkey; - sge->addr = mr->iova; - sge->length = mr->length; - - isert_dbg("sge: addr: 0x%llx length: %u lkey: %x\n", - sge->addr, sge->length, sge->lkey); - - return ret; -} - static inline void isert_set_dif_domain(struct se_cmd *se_cmd, struct ib_sig_attrs *sig_attrs, struct ib_sig_domain *domain) @@ -2526,6 +2041,8 @@ isert_set_dif_domain(struct se_cmd *se_cmd, struct ib_sig_attrs *sig_attrs, static int isert_set_sig_attrs(struct se_cmd *se_cmd, struct ib_sig_attrs *sig_attrs) { + memset(sig_attrs, 0, sizeof(*sig_attrs)); + switch (se_cmd->prot_op) { case TARGET_PROT_DIN_INSERT: case TARGET_PROT_DOUT_STRIP: @@ -2547,228 +2064,59 @@ isert_set_sig_attrs(struct se_cmd *se_cmd, struct ib_sig_attrs *sig_attrs) return -EINVAL; } + sig_attrs->check_mask = + (se_cmd->prot_checks & TARGET_DIF_CHECK_GUARD ? 0xc0 : 0) | + (se_cmd->prot_checks & TARGET_DIF_CHECK_REFTAG ? 0x30 : 0) | + (se_cmd->prot_checks & TARGET_DIF_CHECK_REFTAG ? 0x0f : 0); return 0; } -static inline u8 -isert_set_prot_checks(u8 prot_checks) -{ - return (prot_checks & TARGET_DIF_CHECK_GUARD ? 0xc0 : 0) | - (prot_checks & TARGET_DIF_CHECK_REFTAG ? 0x30 : 0) | - (prot_checks & TARGET_DIF_CHECK_REFTAG ? 0x0f : 0); -} - static int -isert_reg_sig_mr(struct isert_conn *isert_conn, - struct isert_cmd *isert_cmd, - struct fast_reg_descriptor *fr_desc) -{ - struct se_cmd *se_cmd = &isert_cmd->iscsi_cmd->se_cmd; - struct ib_sig_handover_wr sig_wr; - struct ib_send_wr inv_wr, *bad_wr, *wr = NULL; - struct pi_context *pi_ctx = fr_desc->pi_ctx; - struct ib_sig_attrs sig_attrs; +isert_rdma_rw_ctx_post(struct isert_cmd *cmd, struct isert_conn *conn, + struct ib_cqe *cqe, struct ib_send_wr *chain_wr) +{ + struct se_cmd *se_cmd = &cmd->iscsi_cmd->se_cmd; + enum dma_data_direction dir = target_reverse_dma_direction(se_cmd); + u8 port_num = conn->cm_id->port_num; + u64 addr; + u32 rkey, offset; int ret; - memset(&sig_attrs, 0, sizeof(sig_attrs)); - ret = isert_set_sig_attrs(se_cmd, &sig_attrs); - if (ret) - goto err; - - sig_attrs.check_mask = isert_set_prot_checks(se_cmd->prot_checks); - - if (!(fr_desc->ind & ISERT_SIG_KEY_VALID)) { - isert_inv_rkey(&inv_wr, pi_ctx->sig_mr); - wr = &inv_wr; - } - - memset(&sig_wr, 0, sizeof(sig_wr)); - sig_wr.wr.opcode = IB_WR_REG_SIG_MR; - sig_wr.wr.wr_cqe = NULL; - sig_wr.wr.sg_list = &isert_cmd->ib_sg[DATA]; - sig_wr.wr.num_sge = 1; - sig_wr.access_flags = IB_ACCESS_LOCAL_WRITE; - sig_wr.sig_attrs = &sig_attrs; - sig_wr.sig_mr = pi_ctx->sig_mr; - if (se_cmd->t_prot_sg) - sig_wr.prot = &isert_cmd->ib_sg[PROT]; - - if (!wr) - wr = &sig_wr.wr; - else - wr->next = &sig_wr.wr; - - ret = ib_post_send(isert_conn->qp, wr, &bad_wr); - if (ret) { - isert_err("fast registration failed, ret:%d\n", ret); - goto err; - } - fr_desc->ind &= ~ISERT_SIG_KEY_VALID; - - isert_cmd->ib_sg[SIG].lkey = pi_ctx->sig_mr->lkey; - isert_cmd->ib_sg[SIG].addr = 0; - isert_cmd->ib_sg[SIG].length = se_cmd->data_length; - if (se_cmd->prot_op != TARGET_PROT_DIN_STRIP && - se_cmd->prot_op != TARGET_PROT_DOUT_INSERT) - /* - * We have protection guards on the wire - * so we need to set a larget transfer - */ - isert_cmd->ib_sg[SIG].length += se_cmd->prot_length; - - isert_dbg("sig_sge: addr: 0x%llx length: %u lkey: %x\n", - isert_cmd->ib_sg[SIG].addr, isert_cmd->ib_sg[SIG].length, - isert_cmd->ib_sg[SIG].lkey); -err: - return ret; -} - -static int -isert_handle_prot_cmd(struct isert_conn *isert_conn, - struct isert_cmd *isert_cmd) -{ - struct isert_device *device = isert_conn->device; - struct se_cmd *se_cmd = &isert_cmd->iscsi_cmd->se_cmd; - int ret; - - if (!isert_cmd->fr_desc->pi_ctx) { - ret = isert_create_pi_ctx(isert_cmd->fr_desc, - device->ib_device, - device->pd); - if (ret) { - isert_err("conn %p failed to allocate pi_ctx\n", - isert_conn); - return ret; - } - } - - if (se_cmd->t_prot_sg) { - ret = isert_map_data_buf(isert_conn, isert_cmd, - se_cmd->t_prot_sg, - se_cmd->t_prot_nents, - se_cmd->prot_length, - 0, - isert_cmd->iser_ib_op, - &isert_cmd->prot); - if (ret) { - isert_err("conn %p failed to map protection buffer\n", - isert_conn); - return ret; - } - - memset(&isert_cmd->ib_sg[PROT], 0, sizeof(isert_cmd->ib_sg[PROT])); - ret = isert_fast_reg_mr(isert_conn, isert_cmd->fr_desc, - &isert_cmd->prot, - ISERT_PROT_KEY_VALID, - &isert_cmd->ib_sg[PROT]); - if (ret) { - isert_err("conn %p failed to fast reg mr\n", - isert_conn); - goto unmap_prot_cmd; - } - } - - ret = isert_reg_sig_mr(isert_conn, isert_cmd, isert_cmd->fr_desc); - if (ret) { - isert_err("conn %p failed to fast reg mr\n", - isert_conn); - goto unmap_prot_cmd; - } - isert_cmd->fr_desc->ind |= ISERT_PROTECTED; - - return 0; - -unmap_prot_cmd: - if (se_cmd->t_prot_sg) - isert_unmap_data_buf(isert_conn, &isert_cmd->prot); - - return ret; -} - -static int -isert_reg_rdma(struct isert_cmd *isert_cmd, struct iscsi_conn *conn) -{ - struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd; - struct se_cmd *se_cmd = &cmd->se_cmd; - struct isert_conn *isert_conn = conn->context; - struct fast_reg_descriptor *fr_desc = NULL; - struct ib_rdma_wr *rdma_wr; - struct ib_sge *ib_sg; - u32 offset; - int ret = 0; - unsigned long flags; - - offset = isert_cmd->iser_ib_op == ISER_IB_RDMA_READ ? - cmd->write_data_done : 0; - ret = isert_map_data_buf(isert_conn, isert_cmd, se_cmd->t_data_sg, - se_cmd->t_data_nents, se_cmd->data_length, - offset, isert_cmd->iser_ib_op, - &isert_cmd->data); - if (ret) - return ret; - - if (isert_cmd->data.dma_nents != 1 || - isert_prot_cmd(isert_conn, se_cmd)) { - spin_lock_irqsave(&isert_conn->pool_lock, flags); - fr_desc = list_first_entry(&isert_conn->fr_pool, - struct fast_reg_descriptor, list); - list_del(&fr_desc->list); - spin_unlock_irqrestore(&isert_conn->pool_lock, flags); - isert_cmd->fr_desc = fr_desc; - } - - ret = isert_fast_reg_mr(isert_conn, fr_desc, &isert_cmd->data, - ISERT_DATA_KEY_VALID, &isert_cmd->ib_sg[DATA]); - if (ret) - goto unmap_cmd; - - if (isert_prot_cmd(isert_conn, se_cmd)) { - ret = isert_handle_prot_cmd(isert_conn, isert_cmd); - if (ret) - goto unmap_cmd; - - ib_sg = &isert_cmd->ib_sg[SIG]; + if (dir == DMA_FROM_DEVICE) { + addr = cmd->write_va; + rkey = cmd->write_stag; + offset = cmd->iscsi_cmd->write_data_done; } else { - ib_sg = &isert_cmd->ib_sg[DATA]; + addr = cmd->read_va; + rkey = cmd->read_stag; + offset = 0; } - memcpy(&isert_cmd->s_ib_sge, ib_sg, sizeof(*ib_sg)); - isert_cmd->ib_sge = &isert_cmd->s_ib_sge; - isert_cmd->rdma_wr_num = 1; - memset(&isert_cmd->s_rdma_wr, 0, sizeof(isert_cmd->s_rdma_wr)); - isert_cmd->rdma_wr = &isert_cmd->s_rdma_wr; + if (isert_prot_cmd(conn, se_cmd)) { + struct ib_sig_attrs sig_attrs; - rdma_wr = &isert_cmd->s_rdma_wr; - rdma_wr->wr.sg_list = &isert_cmd->s_ib_sge; - rdma_wr->wr.num_sge = 1; - rdma_wr->wr.wr_cqe = &isert_cmd->tx_desc.tx_cqe; - if (isert_cmd->iser_ib_op == ISER_IB_RDMA_WRITE) { - isert_cmd->tx_desc.tx_cqe.done = isert_rdma_write_done; + ret = isert_set_sig_attrs(se_cmd, &sig_attrs); + if (ret) + return ret; - rdma_wr->wr.opcode = IB_WR_RDMA_WRITE; - rdma_wr->remote_addr = isert_cmd->read_va; - rdma_wr->rkey = isert_cmd->read_stag; - rdma_wr->wr.send_flags = !isert_prot_cmd(isert_conn, se_cmd) ? - 0 : IB_SEND_SIGNALED; + WARN_ON_ONCE(offset); + ret = rdma_rw_ctx_signature_init(&cmd->rw, conn->qp, port_num, + se_cmd->t_data_sg, se_cmd->t_data_nents, + se_cmd->t_prot_sg, se_cmd->t_prot_nents, + &sig_attrs, addr, rkey, dir); } else { - isert_cmd->tx_desc.tx_cqe.done = isert_rdma_read_done; - - rdma_wr->wr.opcode = IB_WR_RDMA_READ; - rdma_wr->remote_addr = isert_cmd->write_va; - rdma_wr->rkey = isert_cmd->write_stag; - rdma_wr->wr.send_flags = IB_SEND_SIGNALED; + ret = rdma_rw_ctx_init(&cmd->rw, conn->qp, port_num, + se_cmd->t_data_sg, se_cmd->t_data_nents, + offset, addr, rkey, dir); } - - return 0; - -unmap_cmd: - if (fr_desc) { - spin_lock_irqsave(&isert_conn->pool_lock, flags); - list_add_tail(&fr_desc->list, &isert_conn->fr_pool); - spin_unlock_irqrestore(&isert_conn->pool_lock, flags); + if (ret < 0) { + isert_err("Cmd: %p failed to prepare RDMA res\n", cmd); + return ret; } - isert_unmap_data_buf(isert_conn, &isert_cmd->data); + ret = rdma_rw_ctx_post(&cmd->rw, conn->qp, port_num, cqe, chain_wr); + if (ret < 0) + isert_err("Cmd: %p failed to post RDMA res\n", cmd); return ret; } @@ -2778,21 +2126,17 @@ isert_put_datain(struct iscsi_conn *conn, struct iscsi_cmd *cmd) struct se_cmd *se_cmd = &cmd->se_cmd; struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); struct isert_conn *isert_conn = conn->context; - struct isert_device *device = isert_conn->device; - struct ib_send_wr *wr_failed; + struct ib_cqe *cqe = NULL; + struct ib_send_wr *chain_wr = NULL; int rc; isert_dbg("Cmd: %p RDMA_WRITE data_length: %u\n", isert_cmd, se_cmd->data_length); - isert_cmd->iser_ib_op = ISER_IB_RDMA_WRITE; - rc = device->reg_rdma_mem(isert_cmd, conn); - if (rc) { - isert_err("Cmd: %p failed to prepare RDMA res\n", isert_cmd); - return rc; - } - - if (!isert_prot_cmd(isert_conn, se_cmd)) { + if (isert_prot_cmd(isert_conn, se_cmd)) { + isert_cmd->tx_desc.tx_cqe.done = isert_rdma_write_done; + cqe = &isert_cmd->tx_desc.tx_cqe; + } else { /* * Build isert_conn->tx_desc for iSCSI response PDU and attach */ @@ -2803,56 +2147,35 @@ isert_put_datain(struct iscsi_conn *conn, struct iscsi_cmd *cmd) isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); isert_init_send_wr(isert_conn, isert_cmd, &isert_cmd->tx_desc.send_wr); - isert_cmd->s_rdma_wr.wr.next = &isert_cmd->tx_desc.send_wr; - isert_cmd->rdma_wr_num += 1; rc = isert_post_recv(isert_conn, isert_cmd->rx_desc); if (rc) { isert_err("ib_post_recv failed with %d\n", rc); return rc; } - } - rc = ib_post_send(isert_conn->qp, &isert_cmd->rdma_wr->wr, &wr_failed); - if (rc) - isert_warn("ib_post_send() failed for IB_WR_RDMA_WRITE\n"); - - if (!isert_prot_cmd(isert_conn, se_cmd)) - isert_dbg("Cmd: %p posted RDMA_WRITE + Response for iSER Data " - "READ\n", isert_cmd); - else - isert_dbg("Cmd: %p posted RDMA_WRITE for iSER Data READ\n", - isert_cmd); + chain_wr = &isert_cmd->tx_desc.send_wr; + } + isert_rdma_rw_ctx_post(isert_cmd, isert_conn, cqe, chain_wr); + isert_dbg("Cmd: %p posted RDMA_WRITE for iSER Data READ\n", isert_cmd); return 1; } static int isert_get_dataout(struct iscsi_conn *conn, struct iscsi_cmd *cmd, bool recovery) { - struct se_cmd *se_cmd = &cmd->se_cmd; struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd); - struct isert_conn *isert_conn = conn->context; - struct isert_device *device = isert_conn->device; - struct ib_send_wr *wr_failed; - int rc; isert_dbg("Cmd: %p RDMA_READ data_length: %u write_data_done: %u\n", - isert_cmd, se_cmd->data_length, cmd->write_data_done); - isert_cmd->iser_ib_op = ISER_IB_RDMA_READ; - rc = device->reg_rdma_mem(isert_cmd, conn); - if (rc) { - isert_err("Cmd: %p failed to prepare RDMA res\n", isert_cmd); - return rc; - } + isert_cmd, cmd->se_cmd.data_length, cmd->write_data_done); - rc = ib_post_send(isert_conn->qp, &isert_cmd->rdma_wr->wr, &wr_failed); - if (rc) - isert_warn("ib_post_send() failed for IB_WR_RDMA_READ\n"); + isert_cmd->tx_desc.tx_cqe.done = isert_rdma_read_done; + isert_rdma_rw_ctx_post(isert_cmd, conn->context, + &isert_cmd->tx_desc.tx_cqe, NULL); isert_dbg("Cmd: %p posted RDMA_READ memory for ISER Data WRITE\n", isert_cmd); - return 0; } @@ -3273,9 +2596,19 @@ static void isert_free_conn(struct iscsi_conn *conn) isert_put_conn(isert_conn); } +static void isert_get_rx_pdu(struct iscsi_conn *conn) +{ + struct completion comp; + + init_completion(&comp); + + wait_for_completion_interruptible(&comp); +} + static struct iscsit_transport iser_target_transport = { .name = "IB/iSER", .transport_type = ISCSI_INFINIBAND, + .rdma_shutdown = true, .priv_size = sizeof(struct isert_cmd), .owner = THIS_MODULE, .iscsit_setup_np = isert_setup_np, @@ -3291,6 +2624,7 @@ static struct iscsit_transport iser_target_transport = { .iscsit_queue_data_in = isert_put_datain, .iscsit_queue_status = isert_put_response, .iscsit_aborted_task = isert_aborted_task, + .iscsit_get_rx_pdu = isert_get_rx_pdu, .iscsit_get_sup_prot_ops = isert_get_sup_prot_ops, }; diff --git a/drivers/infiniband/ulp/isert/ib_isert.h b/drivers/infiniband/ulp/isert/ib_isert.h index 147900cbb..e512ba941 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.h +++ b/drivers/infiniband/ulp/isert/ib_isert.h @@ -3,6 +3,7 @@ #include <linux/in6.h> #include <rdma/ib_verbs.h> #include <rdma/rdma_cm.h> +#include <rdma/rw.h> #include <scsi/iser.h> @@ -53,10 +54,7 @@ #define ISERT_MIN_POSTED_RX (ISCSI_DEF_XMIT_CMDS_MAX >> 2) -#define ISERT_INFLIGHT_DATAOUTS 8 - -#define ISERT_QP_MAX_REQ_DTOS (ISCSI_DEF_XMIT_CMDS_MAX * \ - (1 + ISERT_INFLIGHT_DATAOUTS) + \ +#define ISERT_QP_MAX_REQ_DTOS (ISCSI_DEF_XMIT_CMDS_MAX + \ ISERT_MAX_TX_MISC_PDUS + \ ISERT_MAX_RX_MISC_PDUS) @@ -71,13 +69,6 @@ enum isert_desc_type { ISCSI_TX_DATAIN }; -enum iser_ib_op_code { - ISER_IB_RECV, - ISER_IB_SEND, - ISER_IB_RDMA_WRITE, - ISER_IB_RDMA_READ, -}; - enum iser_conn_state { ISER_CONN_INIT, ISER_CONN_UP, @@ -118,42 +109,6 @@ static inline struct iser_tx_desc *cqe_to_tx_desc(struct ib_cqe *cqe) return container_of(cqe, struct iser_tx_desc, tx_cqe); } - -enum isert_indicator { - ISERT_PROTECTED = 1 << 0, - ISERT_DATA_KEY_VALID = 1 << 1, - ISERT_PROT_KEY_VALID = 1 << 2, - ISERT_SIG_KEY_VALID = 1 << 3, -}; - -struct pi_context { - struct ib_mr *prot_mr; - struct ib_mr *sig_mr; -}; - -struct fast_reg_descriptor { - struct list_head list; - struct ib_mr *data_mr; - u8 ind; - struct pi_context *pi_ctx; -}; - -struct isert_data_buf { - struct scatterlist *sg; - int nents; - u32 sg_off; - u32 len; /* cur_rdma_length */ - u32 offset; - unsigned int dma_nents; - enum dma_data_direction dma_dir; -}; - -enum { - DATA = 0, - PROT = 1, - SIG = 2, -}; - struct isert_cmd { uint32_t read_stag; uint32_t write_stag; @@ -166,16 +121,7 @@ struct isert_cmd { struct iscsi_cmd *iscsi_cmd; struct iser_tx_desc tx_desc; struct iser_rx_desc *rx_desc; - enum iser_ib_op_code iser_ib_op; - struct ib_sge *ib_sge; - struct ib_sge s_ib_sge; - int rdma_wr_num; - struct ib_rdma_wr *rdma_wr; - struct ib_rdma_wr s_rdma_wr; - struct ib_sge ib_sg[3]; - struct isert_data_buf data; - struct isert_data_buf prot; - struct fast_reg_descriptor *fr_desc; + struct rdma_rw_ctx rw; struct work_struct comp_work; struct scatterlist sg; }; @@ -210,10 +156,6 @@ struct isert_conn { struct isert_device *device; struct mutex mutex; struct kref kref; - struct list_head fr_pool; - int fr_pool_size; - /* lock to protect fastreg pool */ - spinlock_t pool_lock; struct work_struct release_work; bool logout_posted; bool snd_w_inv; @@ -236,7 +178,6 @@ struct isert_comp { }; struct isert_device { - int use_fastreg; bool pi_capable; int refcount; struct ib_device *ib_device; @@ -244,10 +185,6 @@ struct isert_device { struct isert_comp *comps; int comps_used; struct list_head dev_node; - int (*reg_rdma_mem)(struct isert_cmd *isert_cmd, - struct iscsi_conn *conn); - void (*unreg_rdma_mem)(struct isert_cmd *isert_cmd, - struct isert_conn *isert_conn); }; struct isert_np { diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 845ce90c2..3322ed750 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -70,6 +70,7 @@ static unsigned int indirect_sg_entries; static bool allow_ext_sg; static bool prefer_fr = true; static bool register_always = true; +static bool never_register; static int topspin_workarounds = 1; module_param(srp_sg_tablesize, uint, 0444); @@ -81,7 +82,7 @@ MODULE_PARM_DESC(cmd_sg_entries, module_param(indirect_sg_entries, uint, 0444); MODULE_PARM_DESC(indirect_sg_entries, - "Default max number of gather/scatter entries (default is 12, max is " __stringify(SCSI_MAX_SG_CHAIN_SEGMENTS) ")"); + "Default max number of gather/scatter entries (default is 12, max is " __stringify(SG_MAX_SEGMENTS) ")"); module_param(allow_ext_sg, bool, 0444); MODULE_PARM_DESC(allow_ext_sg, @@ -99,6 +100,9 @@ module_param(register_always, bool, 0444); MODULE_PARM_DESC(register_always, "Use memory registration even for contiguous memory regions"); +module_param(never_register, bool, 0444); +MODULE_PARM_DESC(never_register, "Never register memory"); + static const struct kernel_param_ops srp_tmo_ops; static int srp_reconnect_delay = 10; @@ -316,7 +320,7 @@ static struct ib_fmr_pool *srp_alloc_fmr_pool(struct srp_target_port *target) struct ib_fmr_pool_param fmr_param; memset(&fmr_param, 0, sizeof(fmr_param)); - fmr_param.pool_size = target->scsi_host->can_queue; + fmr_param.pool_size = target->mr_pool_size; fmr_param.dirty_watermark = fmr_param.pool_size / 4; fmr_param.cache = 1; fmr_param.max_pages_per_fmr = dev->max_pages_per_mr; @@ -441,8 +445,7 @@ static struct srp_fr_pool *srp_alloc_fr_pool(struct srp_target_port *target) { struct srp_device *dev = target->srp_host->srp_dev; - return srp_create_fr_pool(dev->dev, dev->pd, - target->scsi_host->can_queue, + return srp_create_fr_pool(dev->dev, dev->pd, target->mr_pool_size, dev->max_pages_per_mr); } @@ -469,7 +472,7 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch) struct ib_qp *qp; struct ib_fmr_pool *fmr_pool = NULL; struct srp_fr_pool *fr_pool = NULL; - const int m = dev->use_fast_reg ? 3 : 1; + const int m = 1 + dev->use_fast_reg * target->mr_per_cmd * 2; int ret; init_attr = kzalloc(sizeof *init_attr, GFP_KERNEL); @@ -850,7 +853,7 @@ static int srp_alloc_req_data(struct srp_rdma_ch *ch) for (i = 0; i < target->req_ring_size; ++i) { req = &ch->req_ring[i]; - mr_list = kmalloc(target->cmd_sg_cnt * sizeof(void *), + mr_list = kmalloc(target->mr_per_cmd * sizeof(void *), GFP_KERNEL); if (!mr_list) goto out; @@ -1112,7 +1115,7 @@ static struct scsi_cmnd *srp_claim_req(struct srp_rdma_ch *ch, } /** - * srp_free_req() - Unmap data and add request to the free request list. + * srp_free_req() - Unmap data and adjust ch->req_lim. * @ch: SRP RDMA channel. * @req: Request to be freed. * @scmnd: SCSI command associated with @req. @@ -1299,9 +1302,16 @@ static void srp_reg_mr_err_done(struct ib_cq *cq, struct ib_wc *wc) srp_handle_qp_err(cq, wc, "FAST REG"); } +/* + * Map up to sg_nents elements of state->sg where *sg_offset_p is the offset + * where to start in the first element. If sg_offset_p != NULL then + * *sg_offset_p is updated to the offset in state->sg[retval] of the first + * byte that has not yet been mapped. + */ static int srp_map_finish_fr(struct srp_map_state *state, struct srp_request *req, - struct srp_rdma_ch *ch, int sg_nents) + struct srp_rdma_ch *ch, int sg_nents, + unsigned int *sg_offset_p) { struct srp_target_port *target = ch->target; struct srp_device *dev = target->srp_host->srp_dev; @@ -1316,13 +1326,14 @@ static int srp_map_finish_fr(struct srp_map_state *state, WARN_ON_ONCE(!dev->use_fast_reg); - if (sg_nents == 0) - return 0; - if (sg_nents == 1 && target->global_mr) { - srp_map_desc(state, sg_dma_address(state->sg), - sg_dma_len(state->sg), + unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0; + + srp_map_desc(state, sg_dma_address(state->sg) + sg_offset, + sg_dma_len(state->sg) - sg_offset, target->global_mr->rkey); + if (sg_offset_p) + *sg_offset_p = 0; return 1; } @@ -1333,9 +1344,17 @@ static int srp_map_finish_fr(struct srp_map_state *state, rkey = ib_inc_rkey(desc->mr->rkey); ib_update_fast_reg_key(desc->mr, rkey); - n = ib_map_mr_sg(desc->mr, state->sg, sg_nents, dev->mr_page_size); - if (unlikely(n < 0)) + n = ib_map_mr_sg(desc->mr, state->sg, sg_nents, sg_offset_p, + dev->mr_page_size); + if (unlikely(n < 0)) { + srp_fr_pool_put(ch->fr_pool, &desc, 1); + pr_debug("%s: ib_map_mr_sg(%d, %d) returned %d.\n", + dev_name(&req->scmnd->device->sdev_gendev), sg_nents, + sg_offset_p ? *sg_offset_p : -1, n); return n; + } + + WARN_ON_ONCE(desc->mr->length == 0); req->reg_cqe.done = srp_reg_mr_err_done; @@ -1357,8 +1376,10 @@ static int srp_map_finish_fr(struct srp_map_state *state, desc->mr->length, desc->mr->rkey); err = ib_post_send(ch->qp, &wr.wr, &bad_wr); - if (unlikely(err)) + if (unlikely(err)) { + WARN_ON_ONCE(err == -ENOMEM); return err; + } return n; } @@ -1398,7 +1419,7 @@ static int srp_map_sg_entry(struct srp_map_state *state, /* * If the last entry of the MR wasn't a full page, then we need to * close it out and start a new one -- we can only merge at page - * boundries. + * boundaries. */ ret = 0; if (len != dev->mr_page_size) @@ -1413,10 +1434,9 @@ static int srp_map_sg_fmr(struct srp_map_state *state, struct srp_rdma_ch *ch, struct scatterlist *sg; int i, ret; - state->desc = req->indirect_desc; state->pages = req->map_page; state->fmr.next = req->fmr_list; - state->fmr.end = req->fmr_list + ch->target->cmd_sg_cnt; + state->fmr.end = req->fmr_list + ch->target->mr_per_cmd; for_each_sg(scat, sg, count, i) { ret = srp_map_sg_entry(state, ch, sg, i); @@ -1428,8 +1448,6 @@ static int srp_map_sg_fmr(struct srp_map_state *state, struct srp_rdma_ch *ch, if (ret) return ret; - req->nmdesc = state->nmdesc; - return 0; } @@ -1437,15 +1455,19 @@ static int srp_map_sg_fr(struct srp_map_state *state, struct srp_rdma_ch *ch, struct srp_request *req, struct scatterlist *scat, int count) { - state->desc = req->indirect_desc; + unsigned int sg_offset = 0; + state->fr.next = req->fr_list; - state->fr.end = req->fr_list + ch->target->cmd_sg_cnt; + state->fr.end = req->fr_list + ch->target->mr_per_cmd; state->sg = scat; + if (count == 0) + return 0; + while (count) { int i, n; - n = srp_map_finish_fr(state, req, ch, count); + n = srp_map_finish_fr(state, req, ch, count, &sg_offset); if (unlikely(n < 0)) return n; @@ -1454,8 +1476,6 @@ static int srp_map_sg_fr(struct srp_map_state *state, struct srp_rdma_ch *ch, state->sg = sg_next(state->sg); } - req->nmdesc = state->nmdesc; - return 0; } @@ -1468,15 +1488,12 @@ static int srp_map_sg_dma(struct srp_map_state *state, struct srp_rdma_ch *ch, struct scatterlist *sg; int i; - state->desc = req->indirect_desc; for_each_sg(scat, sg, count, i) { srp_map_desc(state, ib_sg_dma_address(dev->dev, sg), ib_sg_dma_len(dev->dev, sg), target->global_mr->rkey); } - req->nmdesc = state->nmdesc; - return 0; } @@ -1514,9 +1531,10 @@ static int srp_map_idb(struct srp_rdma_ch *ch, struct srp_request *req, #ifdef CONFIG_NEED_SG_DMA_LENGTH idb_sg->dma_length = idb_sg->length; /* hack^2 */ #endif - ret = srp_map_finish_fr(&state, req, ch, 1); + ret = srp_map_finish_fr(&state, req, ch, 1, NULL); if (ret < 0) return ret; + WARN_ON_ONCE(ret < 1); } else if (dev->use_fmr) { state.pages = idb_pages; state.pages[0] = (req->indirect_dma_addr & @@ -1534,6 +1552,41 @@ static int srp_map_idb(struct srp_rdma_ch *ch, struct srp_request *req, return 0; } +#if defined(DYNAMIC_DATA_DEBUG) +static void srp_check_mapping(struct srp_map_state *state, + struct srp_rdma_ch *ch, struct srp_request *req, + struct scatterlist *scat, int count) +{ + struct srp_device *dev = ch->target->srp_host->srp_dev; + struct srp_fr_desc **pfr; + u64 desc_len = 0, mr_len = 0; + int i; + + for (i = 0; i < state->ndesc; i++) + desc_len += be32_to_cpu(req->indirect_desc[i].len); + if (dev->use_fast_reg) + for (i = 0, pfr = req->fr_list; i < state->nmdesc; i++, pfr++) + mr_len += (*pfr)->mr->length; + else if (dev->use_fmr) + for (i = 0; i < state->nmdesc; i++) + mr_len += be32_to_cpu(req->indirect_desc[i].len); + if (desc_len != scsi_bufflen(req->scmnd) || + mr_len > scsi_bufflen(req->scmnd)) + pr_err("Inconsistent: scsi len %d <> desc len %lld <> mr len %lld; ndesc %d; nmdesc = %d\n", + scsi_bufflen(req->scmnd), desc_len, mr_len, + state->ndesc, state->nmdesc); +} +#endif + +/** + * srp_map_data() - map SCSI data buffer onto an SRP request + * @scmnd: SCSI command to map + * @ch: SRP RDMA channel + * @req: SRP request + * + * Returns the length in bytes of the SRP_CMD IU or a negative value if + * mapping failed. + */ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch, struct srp_request *req) { @@ -1600,12 +1653,25 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch, target->indirect_size, DMA_TO_DEVICE); memset(&state, 0, sizeof(state)); + state.desc = req->indirect_desc; if (dev->use_fast_reg) - srp_map_sg_fr(&state, ch, req, scat, count); + ret = srp_map_sg_fr(&state, ch, req, scat, count); else if (dev->use_fmr) - srp_map_sg_fmr(&state, ch, req, scat, count); + ret = srp_map_sg_fmr(&state, ch, req, scat, count); else - srp_map_sg_dma(&state, ch, req, scat, count); + ret = srp_map_sg_dma(&state, ch, req, scat, count); + req->nmdesc = state.nmdesc; + if (ret < 0) + goto unmap; + +#if defined(DYNAMIC_DEBUG) + { + DEFINE_DYNAMIC_DEBUG_METADATA(ddm, + "Memory mapping consistency check"); + if (unlikely(ddm.flags & _DPRINTK_FLAGS_PRINT)) + srp_check_mapping(&state, ch, req, scat, count); + } +#endif /* We've mapped the request, now pull as much of the indirect * descriptor table as we can into the command buffer. If this @@ -1628,7 +1694,8 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch, !target->allow_ext_sg)) { shost_printk(KERN_ERR, target->scsi_host, "Could not fit S/G list into SRP_CMD\n"); - return -EIO; + ret = -EIO; + goto unmap; } count = min(state.ndesc, target->cmd_sg_cnt); @@ -1646,7 +1713,7 @@ static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch, ret = srp_map_idb(ch, req, state.gen.next, state.gen.end, idb_len, &idb_rkey); if (ret < 0) - return ret; + goto unmap; req->nmdesc++; } else { idb_rkey = cpu_to_be32(target->global_mr->rkey); @@ -1672,6 +1739,12 @@ map_complete: cmd->buf_fmt = fmt; return len; + +unmap: + srp_unmap_data(scmnd, ch, req); + if (ret == -ENOMEM && req->nmdesc >= target->mr_pool_size) + ret = -E2BIG; + return ret; } /* @@ -2564,6 +2637,20 @@ static int srp_reset_host(struct scsi_cmnd *scmnd) return srp_reconnect_rport(target->rport) == 0 ? SUCCESS : FAILED; } +static int srp_slave_alloc(struct scsi_device *sdev) +{ + struct Scsi_Host *shost = sdev->host; + struct srp_target_port *target = host_to_target(shost); + struct srp_device *srp_dev = target->srp_host->srp_dev; + struct ib_device *ibdev = srp_dev->dev; + + if (!(ibdev->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG)) + blk_queue_virt_boundary(sdev->request_queue, + ~srp_dev->mr_page_mask); + + return 0; +} + static int srp_slave_configure(struct scsi_device *sdev) { struct Scsi_Host *shost = sdev->host; @@ -2755,6 +2842,7 @@ static struct scsi_host_template srp_template = { .module = THIS_MODULE, .name = "InfiniBand SRP initiator", .proc_name = DRV_NAME, + .slave_alloc = srp_slave_alloc, .slave_configure = srp_slave_configure, .info = srp_target_info, .queuecommand = srp_queuecommand, @@ -2819,7 +2907,7 @@ static int srp_add_target(struct srp_host *host, struct srp_target_port *target) spin_unlock(&host->target_lock); scsi_scan_target(&target->scsi_host->shost_gendev, - 0, target->scsi_id, SCAN_WILD_CARD, 0); + 0, target->scsi_id, SCAN_WILD_CARD, SCSI_SCAN_INITIAL); if (srp_connected_ch(target) < target->ch_count || target->qp_in_error) { @@ -2829,7 +2917,7 @@ static int srp_add_target(struct srp_host *host, struct srp_target_port *target) goto out; } - pr_debug(PFX "%s: SCSI scan succeeded - detected %d LUNs\n", + pr_debug("%s: SCSI scan succeeded - detected %d LUNs\n", dev_name(&target->scsi_host->shost_gendev), srp_sdev_count(target->scsi_host)); @@ -3097,7 +3185,7 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) case SRP_OPT_SG_TABLESIZE: if (match_int(args, &token) || token < 1 || - token > SCSI_MAX_SG_CHAIN_SEGMENTS) { + token > SG_MAX_SEGMENTS) { pr_warn("bad max sg_tablesize parameter '%s'\n", p); goto out; @@ -3161,6 +3249,7 @@ static ssize_t srp_create_target(struct device *dev, struct srp_device *srp_dev = host->srp_dev; struct ib_device *ibdev = srp_dev->dev; int ret, node_idx, node, cpu, i; + unsigned int max_sectors_per_mr, mr_per_cmd = 0; bool multich = false; target_host = scsi_host_alloc(&srp_template, @@ -3217,7 +3306,33 @@ static ssize_t srp_create_target(struct device *dev, target->sg_tablesize = target->cmd_sg_cnt; } + if (srp_dev->use_fast_reg || srp_dev->use_fmr) { + /* + * FR and FMR can only map one HCA page per entry. If the + * start address is not aligned on a HCA page boundary two + * entries will be used for the head and the tail although + * these two entries combined contain at most one HCA page of + * data. Hence the "+ 1" in the calculation below. + * + * The indirect data buffer descriptor is contiguous so the + * memory for that buffer will only be registered if + * register_always is true. Hence add one to mr_per_cmd if + * register_always has been set. + */ + max_sectors_per_mr = srp_dev->max_pages_per_mr << + (ilog2(srp_dev->mr_page_size) - 9); + mr_per_cmd = register_always + + (target->scsi_host->max_sectors + 1 + + max_sectors_per_mr - 1) / max_sectors_per_mr; + pr_debug("max_sectors = %u; max_pages_per_mr = %u; mr_page_size = %u; max_sectors_per_mr = %u; mr_per_cmd = %u\n", + target->scsi_host->max_sectors, + srp_dev->max_pages_per_mr, srp_dev->mr_page_size, + max_sectors_per_mr, mr_per_cmd); + } + target_host->sg_tablesize = target->sg_tablesize; + target->mr_pool_size = target->scsi_host->can_queue * mr_per_cmd; + target->mr_per_cmd = mr_per_cmd; target->indirect_size = target->sg_tablesize * sizeof (struct srp_direct_buf); target->max_iu_len = sizeof (struct srp_cmd) + @@ -3410,21 +3525,10 @@ static void srp_add_one(struct ib_device *device) int mr_page_shift, p; u64 max_pages_per_mr; - srp_dev = kmalloc(sizeof *srp_dev, GFP_KERNEL); + srp_dev = kzalloc(sizeof(*srp_dev), GFP_KERNEL); if (!srp_dev) return; - srp_dev->has_fmr = (device->alloc_fmr && device->dealloc_fmr && - device->map_phys_fmr && device->unmap_fmr); - srp_dev->has_fr = (device->attrs.device_cap_flags & - IB_DEVICE_MEM_MGT_EXTENSIONS); - if (!srp_dev->has_fmr && !srp_dev->has_fr) - dev_warn(&device->dev, "neither FMR nor FR is supported\n"); - - srp_dev->use_fast_reg = (srp_dev->has_fr && - (!srp_dev->has_fmr || prefer_fr)); - srp_dev->use_fmr = !srp_dev->use_fast_reg && srp_dev->has_fmr; - /* * Use the smallest page size supported by the HCA, down to a * minimum of 4096 bytes. We're unlikely to build large sglists @@ -3435,8 +3539,25 @@ static void srp_add_one(struct ib_device *device) srp_dev->mr_page_mask = ~((u64) srp_dev->mr_page_size - 1); max_pages_per_mr = device->attrs.max_mr_size; do_div(max_pages_per_mr, srp_dev->mr_page_size); + pr_debug("%s: %llu / %u = %llu <> %u\n", __func__, + device->attrs.max_mr_size, srp_dev->mr_page_size, + max_pages_per_mr, SRP_MAX_PAGES_PER_MR); srp_dev->max_pages_per_mr = min_t(u64, SRP_MAX_PAGES_PER_MR, max_pages_per_mr); + + srp_dev->has_fmr = (device->alloc_fmr && device->dealloc_fmr && + device->map_phys_fmr && device->unmap_fmr); + srp_dev->has_fr = (device->attrs.device_cap_flags & + IB_DEVICE_MEM_MGT_EXTENSIONS); + if (!never_register && !srp_dev->has_fmr && !srp_dev->has_fr) { + dev_warn(&device->dev, "neither FMR nor FR is supported\n"); + } else if (!never_register && + device->attrs.max_mr_size >= 2 * srp_dev->mr_page_size) { + srp_dev->use_fast_reg = (srp_dev->has_fr && + (!srp_dev->has_fmr || prefer_fr)); + srp_dev->use_fmr = !srp_dev->use_fast_reg && srp_dev->has_fmr; + } + if (srp_dev->use_fast_reg) { srp_dev->max_pages_per_mr = min_t(u32, srp_dev->max_pages_per_mr, @@ -3456,15 +3577,14 @@ static void srp_add_one(struct ib_device *device) if (IS_ERR(srp_dev->pd)) goto free_dev; - if (!register_always || (!srp_dev->has_fmr && !srp_dev->has_fr)) { + if (never_register || !register_always || + (!srp_dev->has_fmr && !srp_dev->has_fr)) { srp_dev->global_mr = ib_get_dma_mr(srp_dev->pd, IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE); if (IS_ERR(srp_dev->global_mr)) goto err_pd; - } else { - srp_dev->global_mr = NULL; } for (p = rdma_start_port(device); p <= rdma_end_port(device); ++p) { diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index 9e05ce4a0..26bb9b0a7 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -202,6 +202,8 @@ struct srp_target_port { char target_name[32]; unsigned int scsi_id; unsigned int sg_tablesize; + int mr_pool_size; + int mr_per_cmd; int queue_size; int req_ring_size; int comp_vector; diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 8b42401d4..4a4155640 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -254,8 +254,8 @@ static void srpt_get_class_port_info(struct ib_dm_mad *mad) memset(cif, 0, sizeof(*cif)); cif->base_version = 1; cif->class_version = 1; - cif->resp_time_value = 20; + ib_set_cpi_resp_time(cif, 20); mad->mad_hdr.status = 0; } @@ -765,52 +765,6 @@ static int srpt_post_recv(struct srpt_device *sdev, } /** - * srpt_post_send() - Post an IB send request. - * - * Returns zero upon success and a non-zero value upon failure. - */ -static int srpt_post_send(struct srpt_rdma_ch *ch, - struct srpt_send_ioctx *ioctx, int len) -{ - struct ib_sge list; - struct ib_send_wr wr, *bad_wr; - struct srpt_device *sdev = ch->sport->sdev; - int ret; - - atomic_inc(&ch->req_lim); - - ret = -ENOMEM; - if (unlikely(atomic_dec_return(&ch->sq_wr_avail) < 0)) { - pr_warn("IB send queue full (needed 1)\n"); - goto out; - } - - ib_dma_sync_single_for_device(sdev->device, ioctx->ioctx.dma, len, - DMA_TO_DEVICE); - - list.addr = ioctx->ioctx.dma; - list.length = len; - list.lkey = sdev->pd->local_dma_lkey; - - ioctx->ioctx.cqe.done = srpt_send_done; - wr.next = NULL; - wr.wr_cqe = &ioctx->ioctx.cqe; - wr.sg_list = &list; - wr.num_sge = 1; - wr.opcode = IB_WR_SEND; - wr.send_flags = IB_SEND_SIGNALED; - - ret = ib_post_send(ch->qp, &wr, &bad_wr); - -out: - if (ret < 0) { - atomic_inc(&ch->sq_wr_avail); - atomic_dec(&ch->req_lim); - } - return ret; -} - -/** * srpt_zerolength_write() - Perform a zero-length RDMA write. * * A quote from the InfiniBand specification: C9-88: For an HCA responder @@ -843,6 +797,110 @@ static void srpt_zerolength_write_done(struct ib_cq *cq, struct ib_wc *wc) } } +static int srpt_alloc_rw_ctxs(struct srpt_send_ioctx *ioctx, + struct srp_direct_buf *db, int nbufs, struct scatterlist **sg, + unsigned *sg_cnt) +{ + enum dma_data_direction dir = target_reverse_dma_direction(&ioctx->cmd); + struct srpt_rdma_ch *ch = ioctx->ch; + struct scatterlist *prev = NULL; + unsigned prev_nents; + int ret, i; + + if (nbufs == 1) { + ioctx->rw_ctxs = &ioctx->s_rw_ctx; + } else { + ioctx->rw_ctxs = kmalloc_array(nbufs, sizeof(*ioctx->rw_ctxs), + GFP_KERNEL); + if (!ioctx->rw_ctxs) + return -ENOMEM; + } + + for (i = ioctx->n_rw_ctx; i < nbufs; i++, db++) { + struct srpt_rw_ctx *ctx = &ioctx->rw_ctxs[i]; + u64 remote_addr = be64_to_cpu(db->va); + u32 size = be32_to_cpu(db->len); + u32 rkey = be32_to_cpu(db->key); + + ret = target_alloc_sgl(&ctx->sg, &ctx->nents, size, false, + i < nbufs - 1); + if (ret) + goto unwind; + + ret = rdma_rw_ctx_init(&ctx->rw, ch->qp, ch->sport->port, + ctx->sg, ctx->nents, 0, remote_addr, rkey, dir); + if (ret < 0) { + target_free_sgl(ctx->sg, ctx->nents); + goto unwind; + } + + ioctx->n_rdma += ret; + ioctx->n_rw_ctx++; + + if (prev) { + sg_unmark_end(&prev[prev_nents - 1]); + sg_chain(prev, prev_nents + 1, ctx->sg); + } else { + *sg = ctx->sg; + } + + prev = ctx->sg; + prev_nents = ctx->nents; + + *sg_cnt += ctx->nents; + } + + return 0; + +unwind: + while (--i >= 0) { + struct srpt_rw_ctx *ctx = &ioctx->rw_ctxs[i]; + + rdma_rw_ctx_destroy(&ctx->rw, ch->qp, ch->sport->port, + ctx->sg, ctx->nents, dir); + target_free_sgl(ctx->sg, ctx->nents); + } + if (ioctx->rw_ctxs != &ioctx->s_rw_ctx) + kfree(ioctx->rw_ctxs); + return ret; +} + +static void srpt_free_rw_ctxs(struct srpt_rdma_ch *ch, + struct srpt_send_ioctx *ioctx) +{ + enum dma_data_direction dir = target_reverse_dma_direction(&ioctx->cmd); + int i; + + for (i = 0; i < ioctx->n_rw_ctx; i++) { + struct srpt_rw_ctx *ctx = &ioctx->rw_ctxs[i]; + + rdma_rw_ctx_destroy(&ctx->rw, ch->qp, ch->sport->port, + ctx->sg, ctx->nents, dir); + target_free_sgl(ctx->sg, ctx->nents); + } + + if (ioctx->rw_ctxs != &ioctx->s_rw_ctx) + kfree(ioctx->rw_ctxs); +} + +static inline void *srpt_get_desc_buf(struct srp_cmd *srp_cmd) +{ + /* + * The pointer computations below will only be compiled correctly + * if srp_cmd::add_data is declared as s8*, u8*, s8[] or u8[], so check + * whether srp_cmd::add_data has been declared as a byte pointer. + */ + BUILD_BUG_ON(!__same_type(srp_cmd->add_data[0], (s8)0) && + !__same_type(srp_cmd->add_data[0], (u8)0)); + + /* + * According to the SRP spec, the lower two bits of the 'ADDITIONAL + * CDB LENGTH' field are reserved and the size in bytes of this field + * is four times the value specified in bits 3..7. Hence the "& ~3". + */ + return srp_cmd->add_data + (srp_cmd->add_cdb_len & ~3); +} + /** * srpt_get_desc_tbl() - Parse the data descriptors of an SRP_CMD request. * @ioctx: Pointer to the I/O context associated with the request. @@ -858,94 +916,59 @@ static void srpt_zerolength_write_done(struct ib_cq *cq, struct ib_wc *wc) * -ENOMEM when memory allocation fails and zero upon success. */ static int srpt_get_desc_tbl(struct srpt_send_ioctx *ioctx, - struct srp_cmd *srp_cmd, - enum dma_data_direction *dir, u64 *data_len) + struct srp_cmd *srp_cmd, enum dma_data_direction *dir, + struct scatterlist **sg, unsigned *sg_cnt, u64 *data_len) { - struct srp_indirect_buf *idb; - struct srp_direct_buf *db; - unsigned add_cdb_offset; - int ret; - - /* - * The pointer computations below will only be compiled correctly - * if srp_cmd::add_data is declared as s8*, u8*, s8[] or u8[], so check - * whether srp_cmd::add_data has been declared as a byte pointer. - */ - BUILD_BUG_ON(!__same_type(srp_cmd->add_data[0], (s8)0) - && !__same_type(srp_cmd->add_data[0], (u8)0)); - BUG_ON(!dir); BUG_ON(!data_len); - ret = 0; - *data_len = 0; - /* * The lower four bits of the buffer format field contain the DATA-IN * buffer descriptor format, and the highest four bits contain the * DATA-OUT buffer descriptor format. */ - *dir = DMA_NONE; if (srp_cmd->buf_fmt & 0xf) /* DATA-IN: transfer data from target to initiator (read). */ *dir = DMA_FROM_DEVICE; else if (srp_cmd->buf_fmt >> 4) /* DATA-OUT: transfer data from initiator to target (write). */ *dir = DMA_TO_DEVICE; + else + *dir = DMA_NONE; + + /* initialize data_direction early as srpt_alloc_rw_ctxs needs it */ + ioctx->cmd.data_direction = *dir; - /* - * According to the SRP spec, the lower two bits of the 'ADDITIONAL - * CDB LENGTH' field are reserved and the size in bytes of this field - * is four times the value specified in bits 3..7. Hence the "& ~3". - */ - add_cdb_offset = srp_cmd->add_cdb_len & ~3; if (((srp_cmd->buf_fmt & 0xf) == SRP_DATA_DESC_DIRECT) || ((srp_cmd->buf_fmt >> 4) == SRP_DATA_DESC_DIRECT)) { - ioctx->n_rbuf = 1; - ioctx->rbufs = &ioctx->single_rbuf; + struct srp_direct_buf *db = srpt_get_desc_buf(srp_cmd); - db = (struct srp_direct_buf *)(srp_cmd->add_data - + add_cdb_offset); - memcpy(ioctx->rbufs, db, sizeof(*db)); *data_len = be32_to_cpu(db->len); + return srpt_alloc_rw_ctxs(ioctx, db, 1, sg, sg_cnt); } else if (((srp_cmd->buf_fmt & 0xf) == SRP_DATA_DESC_INDIRECT) || ((srp_cmd->buf_fmt >> 4) == SRP_DATA_DESC_INDIRECT)) { - idb = (struct srp_indirect_buf *)(srp_cmd->add_data - + add_cdb_offset); - - ioctx->n_rbuf = be32_to_cpu(idb->table_desc.len) / sizeof(*db); + struct srp_indirect_buf *idb = srpt_get_desc_buf(srp_cmd); + int nbufs = be32_to_cpu(idb->table_desc.len) / + sizeof(struct srp_direct_buf); - if (ioctx->n_rbuf > + if (nbufs > (srp_cmd->data_out_desc_cnt + srp_cmd->data_in_desc_cnt)) { pr_err("received unsupported SRP_CMD request" " type (%u out + %u in != %u / %zu)\n", srp_cmd->data_out_desc_cnt, srp_cmd->data_in_desc_cnt, be32_to_cpu(idb->table_desc.len), - sizeof(*db)); - ioctx->n_rbuf = 0; - ret = -EINVAL; - goto out; + sizeof(struct srp_direct_buf)); + return -EINVAL; } - if (ioctx->n_rbuf == 1) - ioctx->rbufs = &ioctx->single_rbuf; - else { - ioctx->rbufs = - kmalloc(ioctx->n_rbuf * sizeof(*db), GFP_ATOMIC); - if (!ioctx->rbufs) { - ioctx->n_rbuf = 0; - ret = -ENOMEM; - goto out; - } - } - - db = idb->desc_list; - memcpy(ioctx->rbufs, db, ioctx->n_rbuf * sizeof(*db)); *data_len = be32_to_cpu(idb->len); + return srpt_alloc_rw_ctxs(ioctx, idb->desc_list, nbufs, + sg, sg_cnt); + } else { + *data_len = 0; + return 0; } -out: - return ret; } /** @@ -1049,217 +1072,6 @@ static int srpt_ch_qp_err(struct srpt_rdma_ch *ch) } /** - * srpt_unmap_sg_to_ib_sge() - Unmap an IB SGE list. - */ -static void srpt_unmap_sg_to_ib_sge(struct srpt_rdma_ch *ch, - struct srpt_send_ioctx *ioctx) -{ - struct scatterlist *sg; - enum dma_data_direction dir; - - BUG_ON(!ch); - BUG_ON(!ioctx); - BUG_ON(ioctx->n_rdma && !ioctx->rdma_wrs); - - while (ioctx->n_rdma) - kfree(ioctx->rdma_wrs[--ioctx->n_rdma].wr.sg_list); - - kfree(ioctx->rdma_wrs); - ioctx->rdma_wrs = NULL; - - if (ioctx->mapped_sg_count) { - sg = ioctx->sg; - WARN_ON(!sg); - dir = ioctx->cmd.data_direction; - BUG_ON(dir == DMA_NONE); - ib_dma_unmap_sg(ch->sport->sdev->device, sg, ioctx->sg_cnt, - target_reverse_dma_direction(&ioctx->cmd)); - ioctx->mapped_sg_count = 0; - } -} - -/** - * srpt_map_sg_to_ib_sge() - Map an SG list to an IB SGE list. - */ -static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch, - struct srpt_send_ioctx *ioctx) -{ - struct ib_device *dev = ch->sport->sdev->device; - struct se_cmd *cmd; - struct scatterlist *sg, *sg_orig; - int sg_cnt; - enum dma_data_direction dir; - struct ib_rdma_wr *riu; - struct srp_direct_buf *db; - dma_addr_t dma_addr; - struct ib_sge *sge; - u64 raddr; - u32 rsize; - u32 tsize; - u32 dma_len; - int count, nrdma; - int i, j, k; - - BUG_ON(!ch); - BUG_ON(!ioctx); - cmd = &ioctx->cmd; - dir = cmd->data_direction; - BUG_ON(dir == DMA_NONE); - - ioctx->sg = sg = sg_orig = cmd->t_data_sg; - ioctx->sg_cnt = sg_cnt = cmd->t_data_nents; - - count = ib_dma_map_sg(ch->sport->sdev->device, sg, sg_cnt, - target_reverse_dma_direction(cmd)); - if (unlikely(!count)) - return -EAGAIN; - - ioctx->mapped_sg_count = count; - - if (ioctx->rdma_wrs && ioctx->n_rdma_wrs) - nrdma = ioctx->n_rdma_wrs; - else { - nrdma = (count + SRPT_DEF_SG_PER_WQE - 1) / SRPT_DEF_SG_PER_WQE - + ioctx->n_rbuf; - - ioctx->rdma_wrs = kcalloc(nrdma, sizeof(*ioctx->rdma_wrs), - GFP_KERNEL); - if (!ioctx->rdma_wrs) - goto free_mem; - - ioctx->n_rdma_wrs = nrdma; - } - - db = ioctx->rbufs; - tsize = cmd->data_length; - dma_len = ib_sg_dma_len(dev, &sg[0]); - riu = ioctx->rdma_wrs; - - /* - * For each remote desc - calculate the #ib_sge. - * If #ib_sge < SRPT_DEF_SG_PER_WQE per rdma operation then - * each remote desc rdma_iu is required a rdma wr; - * else - * we need to allocate extra rdma_iu to carry extra #ib_sge in - * another rdma wr - */ - for (i = 0, j = 0; - j < count && i < ioctx->n_rbuf && tsize > 0; ++i, ++riu, ++db) { - rsize = be32_to_cpu(db->len); - raddr = be64_to_cpu(db->va); - riu->remote_addr = raddr; - riu->rkey = be32_to_cpu(db->key); - riu->wr.num_sge = 0; - - /* calculate how many sge required for this remote_buf */ - while (rsize > 0 && tsize > 0) { - - if (rsize >= dma_len) { - tsize -= dma_len; - rsize -= dma_len; - raddr += dma_len; - - if (tsize > 0) { - ++j; - if (j < count) { - sg = sg_next(sg); - dma_len = ib_sg_dma_len( - dev, sg); - } - } - } else { - tsize -= rsize; - dma_len -= rsize; - rsize = 0; - } - - ++riu->wr.num_sge; - - if (rsize > 0 && - riu->wr.num_sge == SRPT_DEF_SG_PER_WQE) { - ++ioctx->n_rdma; - riu->wr.sg_list = kmalloc_array(riu->wr.num_sge, - sizeof(*riu->wr.sg_list), - GFP_KERNEL); - if (!riu->wr.sg_list) - goto free_mem; - - ++riu; - riu->wr.num_sge = 0; - riu->remote_addr = raddr; - riu->rkey = be32_to_cpu(db->key); - } - } - - ++ioctx->n_rdma; - riu->wr.sg_list = kmalloc_array(riu->wr.num_sge, - sizeof(*riu->wr.sg_list), - GFP_KERNEL); - if (!riu->wr.sg_list) - goto free_mem; - } - - db = ioctx->rbufs; - tsize = cmd->data_length; - riu = ioctx->rdma_wrs; - sg = sg_orig; - dma_len = ib_sg_dma_len(dev, &sg[0]); - dma_addr = ib_sg_dma_address(dev, &sg[0]); - - /* this second loop is really mapped sg_addres to rdma_iu->ib_sge */ - for (i = 0, j = 0; - j < count && i < ioctx->n_rbuf && tsize > 0; ++i, ++riu, ++db) { - rsize = be32_to_cpu(db->len); - sge = riu->wr.sg_list; - k = 0; - - while (rsize > 0 && tsize > 0) { - sge->addr = dma_addr; - sge->lkey = ch->sport->sdev->pd->local_dma_lkey; - - if (rsize >= dma_len) { - sge->length = - (tsize < dma_len) ? tsize : dma_len; - tsize -= dma_len; - rsize -= dma_len; - - if (tsize > 0) { - ++j; - if (j < count) { - sg = sg_next(sg); - dma_len = ib_sg_dma_len( - dev, sg); - dma_addr = ib_sg_dma_address( - dev, sg); - } - } - } else { - sge->length = (tsize < rsize) ? tsize : rsize; - tsize -= rsize; - dma_len -= rsize; - dma_addr += rsize; - rsize = 0; - } - - ++k; - if (k == riu->wr.num_sge && rsize > 0 && tsize > 0) { - ++riu; - sge = riu->wr.sg_list; - k = 0; - } else if (rsize > 0 && tsize > 0) - ++sge; - } - } - - return 0; - -free_mem: - srpt_unmap_sg_to_ib_sge(ch, ioctx); - - return -ENOMEM; -} - -/** * srpt_get_send_ioctx() - Obtain an I/O context for sending to the initiator. */ static struct srpt_send_ioctx *srpt_get_send_ioctx(struct srpt_rdma_ch *ch) @@ -1284,12 +1096,8 @@ static struct srpt_send_ioctx *srpt_get_send_ioctx(struct srpt_rdma_ch *ch) BUG_ON(ioctx->ch != ch); spin_lock_init(&ioctx->spinlock); ioctx->state = SRPT_STATE_NEW; - ioctx->n_rbuf = 0; - ioctx->rbufs = NULL; ioctx->n_rdma = 0; - ioctx->n_rdma_wrs = 0; - ioctx->rdma_wrs = NULL; - ioctx->mapped_sg_count = 0; + ioctx->n_rw_ctx = 0; init_completion(&ioctx->tx_done); ioctx->queue_status_only = false; /* @@ -1359,7 +1167,6 @@ static int srpt_abort_cmd(struct srpt_send_ioctx *ioctx) * SRP_RSP sending failed or the SRP_RSP send completion has * not been received in time. */ - srpt_unmap_sg_to_ib_sge(ioctx->ch, ioctx); transport_generic_free_cmd(&ioctx->cmd, 0); break; case SRPT_STATE_MGMT_RSP_SENT: @@ -1387,6 +1194,7 @@ static void srpt_rdma_read_done(struct ib_cq *cq, struct ib_wc *wc) WARN_ON(ioctx->n_rdma <= 0); atomic_add(ioctx->n_rdma, &ch->sq_wr_avail); + ioctx->n_rdma = 0; if (unlikely(wc->status != IB_WC_SUCCESS)) { pr_info("RDMA_READ for ioctx 0x%p failed with status %d\n", @@ -1403,23 +1211,6 @@ static void srpt_rdma_read_done(struct ib_cq *cq, struct ib_wc *wc) __LINE__, srpt_get_cmd_state(ioctx)); } -static void srpt_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) -{ - struct srpt_send_ioctx *ioctx = - container_of(wc->wr_cqe, struct srpt_send_ioctx, rdma_cqe); - - if (unlikely(wc->status != IB_WC_SUCCESS)) { - /* - * Note: if an RDMA write error completion is received that - * means that a SEND also has been posted. Defer further - * processing of the associated command until the send error - * completion has been received. - */ - pr_info("RDMA_WRITE for ioctx 0x%p failed with status %d\n", - ioctx, wc->status); - } -} - /** * srpt_build_cmd_rsp() - Build an SRP_RSP response. * @ch: RDMA channel through which the request has been received. @@ -1537,6 +1328,8 @@ static void srpt_handle_cmd(struct srpt_rdma_ch *ch, { struct se_cmd *cmd; struct srp_cmd *srp_cmd; + struct scatterlist *sg = NULL; + unsigned sg_cnt = 0; u64 data_len; enum dma_data_direction dir; int rc; @@ -1563,16 +1356,21 @@ static void srpt_handle_cmd(struct srpt_rdma_ch *ch, break; } - if (srpt_get_desc_tbl(send_ioctx, srp_cmd, &dir, &data_len)) { - pr_err("0x%llx: parsing SRP descriptor table failed.\n", - srp_cmd->tag); + rc = srpt_get_desc_tbl(send_ioctx, srp_cmd, &dir, &sg, &sg_cnt, + &data_len); + if (rc) { + if (rc != -EAGAIN) { + pr_err("0x%llx: parsing SRP descriptor table failed.\n", + srp_cmd->tag); + } goto release_ioctx; } - rc = target_submit_cmd(cmd, ch->sess, srp_cmd->cdb, + rc = target_submit_cmd_map_sgls(cmd, ch->sess, srp_cmd->cdb, &send_ioctx->sense_data[0], scsilun_to_int(&srp_cmd->lun), data_len, - TCM_SIMPLE_TAG, dir, TARGET_SCF_ACK_KREF); + TCM_SIMPLE_TAG, dir, TARGET_SCF_ACK_KREF, + sg, sg_cnt, NULL, 0, NULL, 0); if (rc != 0) { pr_debug("target_submit_cmd() returned %d for tag %#llx\n", rc, srp_cmd->tag); @@ -1664,23 +1462,21 @@ static void srpt_handle_new_iu(struct srpt_rdma_ch *ch, recv_ioctx->ioctx.dma, srp_max_req_size, DMA_FROM_DEVICE); - if (unlikely(ch->state == CH_CONNECTING)) { - list_add_tail(&recv_ioctx->wait_list, &ch->cmd_wait_list); - goto out; - } + if (unlikely(ch->state == CH_CONNECTING)) + goto out_wait; if (unlikely(ch->state != CH_LIVE)) - goto out; + return; srp_cmd = recv_ioctx->ioctx.buf; if (srp_cmd->opcode == SRP_CMD || srp_cmd->opcode == SRP_TSK_MGMT) { - if (!send_ioctx) + if (!send_ioctx) { + if (!list_empty(&ch->cmd_wait_list)) + goto out_wait; send_ioctx = srpt_get_send_ioctx(ch); - if (unlikely(!send_ioctx)) { - list_add_tail(&recv_ioctx->wait_list, - &ch->cmd_wait_list); - goto out; } + if (unlikely(!send_ioctx)) + goto out_wait; } switch (srp_cmd->opcode) { @@ -1709,8 +1505,10 @@ static void srpt_handle_new_iu(struct srpt_rdma_ch *ch, } srpt_post_recv(ch->sport->sdev, recv_ioctx); -out: return; + +out_wait: + list_add_tail(&recv_ioctx->wait_list, &ch->cmd_wait_list); } static void srpt_recv_done(struct ib_cq *cq, struct ib_wc *wc) @@ -1779,14 +1577,13 @@ static void srpt_send_done(struct ib_cq *cq, struct ib_wc *wc) WARN_ON(state != SRPT_STATE_CMD_RSP_SENT && state != SRPT_STATE_MGMT_RSP_SENT); - atomic_inc(&ch->sq_wr_avail); + atomic_add(1 + ioctx->n_rdma, &ch->sq_wr_avail); if (wc->status != IB_WC_SUCCESS) pr_info("sending response for ioctx 0x%p failed" " with status %d\n", ioctx, wc->status); if (state != SRPT_STATE_DONE) { - srpt_unmap_sg_to_ib_sge(ch, ioctx); transport_generic_free_cmd(&ioctx->cmd, 0); } else { pr_err("IB completion has been received too late for" @@ -1832,8 +1629,17 @@ retry: qp_init->srq = sdev->srq; qp_init->sq_sig_type = IB_SIGNAL_REQ_WR; qp_init->qp_type = IB_QPT_RC; - qp_init->cap.max_send_wr = srp_sq_size; + /* + * We divide up our send queue size into half SEND WRs to send the + * completions, and half R/W contexts to actually do the RDMA + * READ/WRITE transfers. Note that we need to allocate CQ slots for + * both both, as RDMA contexts will also post completions for the + * RDMA READ case. + */ + qp_init->cap.max_send_wr = srp_sq_size / 2; + qp_init->cap.max_rdma_ctxs = srp_sq_size / 2; qp_init->cap.max_send_sge = SRPT_DEF_SG_PER_WQE; + qp_init->port_num = ch->sport->port; ch->qp = ib_create_qp(sdev->pd, qp_init); if (IS_ERR(ch->qp)) { @@ -1960,14 +1766,6 @@ static void __srpt_close_all_ch(struct srpt_device *sdev) } } -/** - * srpt_shutdown_session() - Whether or not a session may be shut down. - */ -static int srpt_shutdown_session(struct se_session *se_sess) -{ - return 1; -} - static void srpt_free_ch(struct kref *kref) { struct srpt_rdma_ch *ch = container_of(kref, struct srpt_rdma_ch, kref); @@ -2386,95 +2184,6 @@ static int srpt_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) return ret; } -/** - * srpt_perform_rdmas() - Perform IB RDMA. - * - * Returns zero upon success or a negative number upon failure. - */ -static int srpt_perform_rdmas(struct srpt_rdma_ch *ch, - struct srpt_send_ioctx *ioctx) -{ - struct ib_send_wr *bad_wr; - int sq_wr_avail, ret, i; - enum dma_data_direction dir; - const int n_rdma = ioctx->n_rdma; - - dir = ioctx->cmd.data_direction; - if (dir == DMA_TO_DEVICE) { - /* write */ - ret = -ENOMEM; - sq_wr_avail = atomic_sub_return(n_rdma, &ch->sq_wr_avail); - if (sq_wr_avail < 0) { - pr_warn("IB send queue full (needed %d)\n", - n_rdma); - goto out; - } - } - - for (i = 0; i < n_rdma; i++) { - struct ib_send_wr *wr = &ioctx->rdma_wrs[i].wr; - - wr->opcode = (dir == DMA_FROM_DEVICE) ? - IB_WR_RDMA_WRITE : IB_WR_RDMA_READ; - - if (i == n_rdma - 1) { - /* only get completion event for the last rdma read */ - if (dir == DMA_TO_DEVICE) { - wr->send_flags = IB_SEND_SIGNALED; - ioctx->rdma_cqe.done = srpt_rdma_read_done; - } else { - ioctx->rdma_cqe.done = srpt_rdma_write_done; - } - wr->wr_cqe = &ioctx->rdma_cqe; - wr->next = NULL; - } else { - wr->wr_cqe = NULL; - wr->next = &ioctx->rdma_wrs[i + 1].wr; - } - } - - ret = ib_post_send(ch->qp, &ioctx->rdma_wrs->wr, &bad_wr); - if (ret) - pr_err("%s[%d]: ib_post_send() returned %d for %d/%d\n", - __func__, __LINE__, ret, i, n_rdma); -out: - if (unlikely(dir == DMA_TO_DEVICE && ret < 0)) - atomic_add(n_rdma, &ch->sq_wr_avail); - return ret; -} - -/** - * srpt_xfer_data() - Start data transfer from initiator to target. - */ -static int srpt_xfer_data(struct srpt_rdma_ch *ch, - struct srpt_send_ioctx *ioctx) -{ - int ret; - - ret = srpt_map_sg_to_ib_sge(ch, ioctx); - if (ret) { - pr_err("%s[%d] ret=%d\n", __func__, __LINE__, ret); - goto out; - } - - ret = srpt_perform_rdmas(ch, ioctx); - if (ret) { - if (ret == -EAGAIN || ret == -ENOMEM) - pr_info("%s[%d] queue full -- ret=%d\n", - __func__, __LINE__, ret); - else - pr_err("%s[%d] fatal error -- ret=%d\n", - __func__, __LINE__, ret); - goto out_unmap; - } - -out: - return ret; -out_unmap: - srpt_unmap_sg_to_ib_sge(ch, ioctx); - goto out; -} - static int srpt_write_pending_status(struct se_cmd *se_cmd) { struct srpt_send_ioctx *ioctx; @@ -2491,11 +2200,42 @@ static int srpt_write_pending(struct se_cmd *se_cmd) struct srpt_send_ioctx *ioctx = container_of(se_cmd, struct srpt_send_ioctx, cmd); struct srpt_rdma_ch *ch = ioctx->ch; + struct ib_send_wr *first_wr = NULL, *bad_wr; + struct ib_cqe *cqe = &ioctx->rdma_cqe; enum srpt_command_state new_state; + int ret, i; new_state = srpt_set_cmd_state(ioctx, SRPT_STATE_NEED_DATA); WARN_ON(new_state == SRPT_STATE_DONE); - return srpt_xfer_data(ch, ioctx); + + if (atomic_sub_return(ioctx->n_rdma, &ch->sq_wr_avail) < 0) { + pr_warn("%s: IB send queue full (needed %d)\n", + __func__, ioctx->n_rdma); + ret = -ENOMEM; + goto out_undo; + } + + cqe->done = srpt_rdma_read_done; + for (i = ioctx->n_rw_ctx - 1; i >= 0; i--) { + struct srpt_rw_ctx *ctx = &ioctx->rw_ctxs[i]; + + first_wr = rdma_rw_ctx_wrs(&ctx->rw, ch->qp, ch->sport->port, + cqe, first_wr); + cqe = NULL; + } + + ret = ib_post_send(ch->qp, first_wr, &bad_wr); + if (ret) { + pr_err("%s: ib_post_send() returned %d for %d (avail: %d)\n", + __func__, ret, ioctx->n_rdma, + atomic_read(&ch->sq_wr_avail)); + goto out_undo; + } + + return 0; +out_undo: + atomic_add(ioctx->n_rdma, &ch->sq_wr_avail); + return ret; } static u8 tcm_to_srp_tsk_mgmt_status(const int tcm_mgmt_status) @@ -2517,17 +2257,17 @@ static u8 tcm_to_srp_tsk_mgmt_status(const int tcm_mgmt_status) */ static void srpt_queue_response(struct se_cmd *cmd) { - struct srpt_rdma_ch *ch; - struct srpt_send_ioctx *ioctx; + struct srpt_send_ioctx *ioctx = + container_of(cmd, struct srpt_send_ioctx, cmd); + struct srpt_rdma_ch *ch = ioctx->ch; + struct srpt_device *sdev = ch->sport->sdev; + struct ib_send_wr send_wr, *first_wr = NULL, *bad_wr; + struct ib_sge sge; enum srpt_command_state state; unsigned long flags; - int ret; - enum dma_data_direction dir; - int resp_len; + int resp_len, ret, i; u8 srp_tm_status; - ioctx = container_of(cmd, struct srpt_send_ioctx, cmd); - ch = ioctx->ch; BUG_ON(!ch); spin_lock_irqsave(&ioctx->spinlock, flags); @@ -2554,17 +2294,19 @@ static void srpt_queue_response(struct se_cmd *cmd) return; } - dir = ioctx->cmd.data_direction; - /* For read commands, transfer the data to the initiator. */ - if (dir == DMA_FROM_DEVICE && ioctx->cmd.data_length && + if (ioctx->cmd.data_direction == DMA_FROM_DEVICE && + ioctx->cmd.data_length && !ioctx->queue_status_only) { - ret = srpt_xfer_data(ch, ioctx); - if (ret) { - pr_err("xfer_data failed for tag %llu\n", - ioctx->cmd.tag); - return; + for (i = ioctx->n_rw_ctx - 1; i >= 0; i--) { + struct srpt_rw_ctx *ctx = &ioctx->rw_ctxs[i]; + + first_wr = rdma_rw_ctx_wrs(&ctx->rw, ch->qp, + ch->sport->port, NULL, + first_wr ? first_wr : &send_wr); } + } else { + first_wr = &send_wr; } if (state != SRPT_STATE_MGMT) @@ -2576,14 +2318,46 @@ static void srpt_queue_response(struct se_cmd *cmd) resp_len = srpt_build_tskmgmt_rsp(ch, ioctx, srp_tm_status, ioctx->cmd.tag); } - ret = srpt_post_send(ch, ioctx, resp_len); - if (ret) { - pr_err("sending cmd response failed for tag %llu\n", - ioctx->cmd.tag); - srpt_unmap_sg_to_ib_sge(ch, ioctx); - srpt_set_cmd_state(ioctx, SRPT_STATE_DONE); - target_put_sess_cmd(&ioctx->cmd); + + atomic_inc(&ch->req_lim); + + if (unlikely(atomic_sub_return(1 + ioctx->n_rdma, + &ch->sq_wr_avail) < 0)) { + pr_warn("%s: IB send queue full (needed %d)\n", + __func__, ioctx->n_rdma); + ret = -ENOMEM; + goto out; + } + + ib_dma_sync_single_for_device(sdev->device, ioctx->ioctx.dma, resp_len, + DMA_TO_DEVICE); + + sge.addr = ioctx->ioctx.dma; + sge.length = resp_len; + sge.lkey = sdev->pd->local_dma_lkey; + + ioctx->ioctx.cqe.done = srpt_send_done; + send_wr.next = NULL; + send_wr.wr_cqe = &ioctx->ioctx.cqe; + send_wr.sg_list = &sge; + send_wr.num_sge = 1; + send_wr.opcode = IB_WR_SEND; + send_wr.send_flags = IB_SEND_SIGNALED; + + ret = ib_post_send(ch->qp, first_wr, &bad_wr); + if (ret < 0) { + pr_err("%s: sending cmd response failed for tag %llu (%d)\n", + __func__, ioctx->cmd.tag, ret); + goto out; } + + return; + +out: + atomic_add(1 + ioctx->n_rdma, &ch->sq_wr_avail); + atomic_dec(&ch->req_lim); + srpt_set_cmd_state(ioctx, SRPT_STATE_DONE); + target_put_sess_cmd(&ioctx->cmd); } static int srpt_queue_data_in(struct se_cmd *cmd) @@ -2599,10 +2373,6 @@ static void srpt_queue_tm_rsp(struct se_cmd *cmd) static void srpt_aborted_task(struct se_cmd *cmd) { - struct srpt_send_ioctx *ioctx = container_of(cmd, - struct srpt_send_ioctx, cmd); - - srpt_unmap_sg_to_ib_sge(ioctx->ch, ioctx); } static int srpt_queue_status(struct se_cmd *cmd) @@ -2903,12 +2673,10 @@ static void srpt_release_cmd(struct se_cmd *se_cmd) unsigned long flags; WARN_ON(ioctx->state != SRPT_STATE_DONE); - WARN_ON(ioctx->mapped_sg_count != 0); - if (ioctx->n_rbuf > 1) { - kfree(ioctx->rbufs); - ioctx->rbufs = NULL; - ioctx->n_rbuf = 0; + if (ioctx->n_rw_ctx) { + srpt_free_rw_ctxs(ch, ioctx); + ioctx->n_rw_ctx = 0; } spin_lock_irqsave(&ch->spinlock, flags); @@ -3287,7 +3055,6 @@ static const struct target_core_fabric_ops srpt_template = { .tpg_get_inst_index = srpt_tpg_get_inst_index, .release_cmd = srpt_release_cmd, .check_stop_free = srpt_check_stop_free, - .shutdown_session = srpt_shutdown_session, .close_session = srpt_close_session, .sess_get_index = srpt_sess_get_index, .sess_get_initiator_sid = NULL, diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h index af9b8b527..389030487 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.h +++ b/drivers/infiniband/ulp/srpt/ib_srpt.h @@ -42,6 +42,7 @@ #include <rdma/ib_verbs.h> #include <rdma/ib_sa.h> #include <rdma/ib_cm.h> +#include <rdma/rw.h> #include <scsi/srp.h> @@ -174,21 +175,17 @@ struct srpt_recv_ioctx { struct srpt_ioctx ioctx; struct list_head wait_list; }; + +struct srpt_rw_ctx { + struct rdma_rw_ctx rw; + struct scatterlist *sg; + unsigned int nents; +}; /** * struct srpt_send_ioctx - SRPT send I/O context. * @ioctx: See above. * @ch: Channel pointer. - * @free_list: Node in srpt_rdma_ch.free_list. - * @n_rbuf: Number of data buffers in the received SRP command. - * @rbufs: Pointer to SRP data buffer array. - * @single_rbuf: SRP data buffer if the command has only a single buffer. - * @sg: Pointer to sg-list associated with this I/O context. - * @sg_cnt: SG-list size. - * @mapped_sg_count: ib_dma_map_sg() return value. - * @n_rdma_wrs: Number of elements in the rdma_wrs array. - * @rdma_wrs: Array with information about the RDMA mapping. - * @tag: Tag of the received SRP information unit. * @spinlock: Protects 'state'. * @state: I/O context state. * @cmd: Target core command data structure. @@ -197,21 +194,18 @@ struct srpt_recv_ioctx { struct srpt_send_ioctx { struct srpt_ioctx ioctx; struct srpt_rdma_ch *ch; - struct ib_rdma_wr *rdma_wrs; + + struct srpt_rw_ctx s_rw_ctx; + struct srpt_rw_ctx *rw_ctxs; + struct ib_cqe rdma_cqe; - struct srp_direct_buf *rbufs; - struct srp_direct_buf single_rbuf; - struct scatterlist *sg; struct list_head free_list; spinlock_t spinlock; enum srpt_command_state state; struct se_cmd cmd; struct completion tx_done; - int sg_cnt; - int mapped_sg_count; - u16 n_rdma_wrs; u8 n_rdma; - u8 n_rbuf; + u8 n_rw_ctx; bool queue_status_only; u8 sense_data[TRANSPORT_SENSE_BUFFER]; }; |