Diffstat (limited to 'drivers/infiniband/hw/mlx5')
-rw-r--r--   drivers/infiniband/hw/mlx5/ah.c        |   32
-rw-r--r--   drivers/infiniband/hw/mlx5/cq.c        |   31
-rw-r--r--   drivers/infiniband/hw/mlx5/main.c      |  912
-rw-r--r--   drivers/infiniband/hw/mlx5/mlx5_ib.h   |  160
-rw-r--r--   drivers/infiniband/hw/mlx5/odp.c       |   29
-rw-r--r--   drivers/infiniband/hw/mlx5/qp.c        | 1026
-rw-r--r--   drivers/infiniband/hw/mlx5/srq.c       |   54
-rw-r--r--   drivers/infiniband/hw/mlx5/user.h      |   63
8 files changed, 2093 insertions, 214 deletions
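The ah.c hunk at the top of this diff makes the address-vector stat_rate_sl packing depend on the port link layer: on Ethernet (RoCE) only three SL bits are kept and they are shifted up by one, while on InfiniBand all four SL bits stay in the low nibble and the rate always occupies bits 7:4. A minimal userspace sketch of just that bit packing, assuming a hypothetical pack_stat_rate_sl() helper and plain enum (the driver itself fills struct mlx5_av inside create_ib_ah()):

/*
 * Illustrative only -- mirrors the bit layout from the create_ib_ah()
 * change below; the enum and helper names are not part of the driver.
 */
#include <stdint.h>
#include <stdio.h>

enum link_layer { LL_INFINIBAND, LL_ETHERNET };

static uint8_t pack_stat_rate_sl(uint8_t static_rate, uint8_t sl,
                                 enum link_layer ll)
{
        uint8_t v = (uint8_t)(static_rate << 4);   /* rate in bits 7:4 */

        if (ll == LL_ETHERNET)
                v |= (sl & 0x7) << 1;              /* 3 SL bits, shifted up by one */
        else
                v |= sl & 0xf;                     /* 4 SL bits in bits 3:0 */

        return v;
}

int main(void)
{
        printf("IB:  rate=2 sl=5 -> 0x%02x\n",
               pack_stat_rate_sl(2, 5, LL_INFINIBAND)); /* prints 0x25 */
        printf("Eth: rate=2 sl=5 -> 0x%02x\n",
               pack_stat_rate_sl(2, 5, LL_ETHERNET));   /* prints 0x2a */
        return 0;
}

On the Ethernet path the same hunk also copies the destination MAC into the address vector and derives the RoCE v2 UDP source port via mlx5_get_roce_udp_sport(), which is added in main.c further down in this diff.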
diff --git a/drivers/infiniband/hw/mlx5/ah.c b/drivers/infiniband/hw/mlx5/ah.c index 66080580e..745efa4cf 100644 --- a/drivers/infiniband/hw/mlx5/ah.c +++ b/drivers/infiniband/hw/mlx5/ah.c @@ -32,8 +32,10 @@ #include "mlx5_ib.h" -struct ib_ah *create_ib_ah(struct ib_ah_attr *ah_attr, - struct mlx5_ib_ah *ah) +static struct ib_ah *create_ib_ah(struct mlx5_ib_dev *dev, + struct mlx5_ib_ah *ah, + struct ib_ah_attr *ah_attr, + enum rdma_link_layer ll) { if (ah_attr->ah_flags & IB_AH_GRH) { memcpy(ah->av.rgid, &ah_attr->grh.dgid, 16); @@ -44,9 +46,20 @@ struct ib_ah *create_ib_ah(struct ib_ah_attr *ah_attr, ah->av.tclass = ah_attr->grh.traffic_class; } - ah->av.rlid = cpu_to_be16(ah_attr->dlid); - ah->av.fl_mlid = ah_attr->src_path_bits & 0x7f; - ah->av.stat_rate_sl = (ah_attr->static_rate << 4) | (ah_attr->sl & 0xf); + ah->av.stat_rate_sl = (ah_attr->static_rate << 4); + + if (ll == IB_LINK_LAYER_ETHERNET) { + memcpy(ah->av.rmac, ah_attr->dmac, sizeof(ah_attr->dmac)); + ah->av.udp_sport = + mlx5_get_roce_udp_sport(dev, + ah_attr->port_num, + ah_attr->grh.sgid_index); + ah->av.stat_rate_sl |= (ah_attr->sl & 0x7) << 1; + } else { + ah->av.rlid = cpu_to_be16(ah_attr->dlid); + ah->av.fl_mlid = ah_attr->src_path_bits & 0x7f; + ah->av.stat_rate_sl |= (ah_attr->sl & 0xf); + } return &ah->ibah; } @@ -54,12 +67,19 @@ struct ib_ah *create_ib_ah(struct ib_ah_attr *ah_attr, struct ib_ah *mlx5_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr) { struct mlx5_ib_ah *ah; + struct mlx5_ib_dev *dev = to_mdev(pd->device); + enum rdma_link_layer ll; + + ll = pd->device->get_link_layer(pd->device, ah_attr->port_num); + + if (ll == IB_LINK_LAYER_ETHERNET && !(ah_attr->ah_flags & IB_AH_GRH)) + return ERR_PTR(-EINVAL); ah = kzalloc(sizeof(*ah), GFP_ATOMIC); if (!ah) return ERR_PTR(-ENOMEM); - return create_ib_ah(ah_attr, ah); /* never fails */ + return create_ib_ah(dev, ah, ah_attr, ll); /* never fails */ } int mlx5_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr) diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 92ddae101..fd1de31e0 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -154,9 +154,6 @@ static void handle_good_req(struct ib_wc *wc, struct mlx5_cqe64 *cqe, wc->opcode = IB_WC_MASKED_FETCH_ADD; wc->byte_len = 8; break; - case MLX5_OPCODE_BIND_MW: - wc->opcode = IB_WC_BIND_MW; - break; case MLX5_OPCODE_UMR: wc->opcode = get_umr_comp(wq, idx); break; @@ -171,6 +168,7 @@ enum { static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe, struct mlx5_ib_qp *qp) { + enum rdma_link_layer ll = rdma_port_get_link_layer(qp->ibqp.device, 1); struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device); struct mlx5_ib_srq *srq; struct mlx5_ib_wq *wq; @@ -236,6 +234,22 @@ static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe, } else { wc->pkey_index = 0; } + + if (ll != IB_LINK_LAYER_ETHERNET) + return; + + switch (wc->sl & 0x3) { + case MLX5_CQE_ROCE_L3_HEADER_TYPE_GRH: + wc->network_hdr_type = RDMA_NETWORK_IB; + break; + case MLX5_CQE_ROCE_L3_HEADER_TYPE_IPV6: + wc->network_hdr_type = RDMA_NETWORK_IPV6; + break; + case MLX5_CQE_ROCE_L3_HEADER_TYPE_IPV4: + wc->network_hdr_type = RDMA_NETWORK_IPV4; + break; + } + wc->wc_flags |= IB_WC_WITH_NETWORK_HDR_TYPE; } static void dump_cqe(struct mlx5_ib_dev *dev, struct mlx5_err_cqe *cqe) @@ -760,12 +774,12 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, int eqn; int err; - if (attr->flags) - return ERR_PTR(-EINVAL); - if (entries < 0) return ERR_PTR(-EINVAL); + 
if (check_cq_create_flags(attr->flags)) + return ERR_PTR(-EOPNOTSUPP); + entries = roundup_pow_of_two(entries + 1); if (entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz))) return ERR_PTR(-EINVAL); @@ -779,6 +793,7 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, spin_lock_init(&cq->lock); cq->resize_buf = NULL; cq->resize_umem = NULL; + cq->create_flags = attr->flags; if (context) { err = create_cq_user(dev, udata, context, cq, entries, @@ -796,6 +811,10 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, cq->cqe_size = cqe_size; cqb->ctx.cqe_sz_flags = cqe_sz_to_mlx_sz(cqe_size) << 5; + + if (cq->create_flags & IB_CQ_FLAGS_IGNORE_OVERRUN) + cqb->ctx.cqe_sz_flags |= (1 << 1); + cqb->ctx.log_sz_usr_page = cpu_to_be32((ilog2(entries) << 24) | index); err = mlx5_vector2eqn(dev->mdev, vector, &eqn, &irqn); if (err) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index c4e091528..03c418ccb 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -40,9 +40,14 @@ #include <linux/io-mapping.h> #include <linux/sched.h> #include <rdma/ib_user_verbs.h> +#include <rdma/ib_addr.h> +#include <rdma/ib_cache.h> #include <linux/mlx5/vport.h> #include <rdma/ib_smi.h> #include <rdma/ib_umem.h> +#include <linux/in.h> +#include <linux/etherdevice.h> +#include <linux/mlx5/fs.h> #include "user.h" #include "mlx5_ib.h" @@ -63,12 +68,14 @@ static char mlx5_version[] = DRIVER_NAME ": Mellanox Connect-IB Infiniband driver v" DRIVER_VERSION " (" DRIVER_RELDATE ")\n"; +enum { + MLX5_ATOMIC_SIZE_QP_8BYTES = 1 << 3, +}; + static enum rdma_link_layer -mlx5_ib_port_link_layer(struct ib_device *device) +mlx5_port_type_cap_to_rdma_ll(int port_type_cap) { - struct mlx5_ib_dev *dev = to_mdev(device); - - switch (MLX5_CAP_GEN(dev->mdev, port_type)) { + switch (port_type_cap) { case MLX5_CAP_PORT_TYPE_IB: return IB_LINK_LAYER_INFINIBAND; case MLX5_CAP_PORT_TYPE_ETH: @@ -78,6 +85,202 @@ mlx5_ib_port_link_layer(struct ib_device *device) } } +static enum rdma_link_layer +mlx5_ib_port_link_layer(struct ib_device *device, u8 port_num) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + int port_type_cap = MLX5_CAP_GEN(dev->mdev, port_type); + + return mlx5_port_type_cap_to_rdma_ll(port_type_cap); +} + +static int mlx5_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *ndev = netdev_notifier_info_to_dev(ptr); + struct mlx5_ib_dev *ibdev = container_of(this, struct mlx5_ib_dev, + roce.nb); + + if ((event != NETDEV_UNREGISTER) && (event != NETDEV_REGISTER)) + return NOTIFY_DONE; + + write_lock(&ibdev->roce.netdev_lock); + if (ndev->dev.parent == &ibdev->mdev->pdev->dev) + ibdev->roce.netdev = (event == NETDEV_UNREGISTER) ? 
NULL : ndev; + write_unlock(&ibdev->roce.netdev_lock); + + return NOTIFY_DONE; +} + +static struct net_device *mlx5_ib_get_netdev(struct ib_device *device, + u8 port_num) +{ + struct mlx5_ib_dev *ibdev = to_mdev(device); + struct net_device *ndev; + + /* Ensure ndev does not disappear before we invoke dev_hold() + */ + read_lock(&ibdev->roce.netdev_lock); + ndev = ibdev->roce.netdev; + if (ndev) + dev_hold(ndev); + read_unlock(&ibdev->roce.netdev_lock); + + return ndev; +} + +static int mlx5_query_port_roce(struct ib_device *device, u8 port_num, + struct ib_port_attr *props) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + struct net_device *ndev; + enum ib_mtu ndev_ib_mtu; + u16 qkey_viol_cntr; + + memset(props, 0, sizeof(*props)); + + props->port_cap_flags |= IB_PORT_CM_SUP; + props->port_cap_flags |= IB_PORT_IP_BASED_GIDS; + + props->gid_tbl_len = MLX5_CAP_ROCE(dev->mdev, + roce_address_table_size); + props->max_mtu = IB_MTU_4096; + props->max_msg_sz = 1 << MLX5_CAP_GEN(dev->mdev, log_max_msg); + props->pkey_tbl_len = 1; + props->state = IB_PORT_DOWN; + props->phys_state = 3; + + mlx5_query_nic_vport_qkey_viol_cntr(dev->mdev, &qkey_viol_cntr); + props->qkey_viol_cntr = qkey_viol_cntr; + + ndev = mlx5_ib_get_netdev(device, port_num); + if (!ndev) + return 0; + + if (netif_running(ndev) && netif_carrier_ok(ndev)) { + props->state = IB_PORT_ACTIVE; + props->phys_state = 5; + } + + ndev_ib_mtu = iboe_get_mtu(ndev->mtu); + + dev_put(ndev); + + props->active_mtu = min(props->max_mtu, ndev_ib_mtu); + + props->active_width = IB_WIDTH_4X; /* TODO */ + props->active_speed = IB_SPEED_QDR; /* TODO */ + + return 0; +} + +static void ib_gid_to_mlx5_roce_addr(const union ib_gid *gid, + const struct ib_gid_attr *attr, + void *mlx5_addr) +{ +#define MLX5_SET_RA(p, f, v) MLX5_SET(roce_addr_layout, p, f, v) + char *mlx5_addr_l3_addr = MLX5_ADDR_OF(roce_addr_layout, mlx5_addr, + source_l3_address); + void *mlx5_addr_mac = MLX5_ADDR_OF(roce_addr_layout, mlx5_addr, + source_mac_47_32); + + if (!gid) + return; + + ether_addr_copy(mlx5_addr_mac, attr->ndev->dev_addr); + + if (is_vlan_dev(attr->ndev)) { + MLX5_SET_RA(mlx5_addr, vlan_valid, 1); + MLX5_SET_RA(mlx5_addr, vlan_id, vlan_dev_vlan_id(attr->ndev)); + } + + switch (attr->gid_type) { + case IB_GID_TYPE_IB: + MLX5_SET_RA(mlx5_addr, roce_version, MLX5_ROCE_VERSION_1); + break; + case IB_GID_TYPE_ROCE_UDP_ENCAP: + MLX5_SET_RA(mlx5_addr, roce_version, MLX5_ROCE_VERSION_2); + break; + + default: + WARN_ON(true); + } + + if (attr->gid_type != IB_GID_TYPE_IB) { + if (ipv6_addr_v4mapped((void *)gid)) + MLX5_SET_RA(mlx5_addr, roce_l3_type, + MLX5_ROCE_L3_TYPE_IPV4); + else + MLX5_SET_RA(mlx5_addr, roce_l3_type, + MLX5_ROCE_L3_TYPE_IPV6); + } + + if ((attr->gid_type == IB_GID_TYPE_IB) || + !ipv6_addr_v4mapped((void *)gid)) + memcpy(mlx5_addr_l3_addr, gid, sizeof(*gid)); + else + memcpy(&mlx5_addr_l3_addr[12], &gid->raw[12], 4); +} + +static int set_roce_addr(struct ib_device *device, u8 port_num, + unsigned int index, + const union ib_gid *gid, + const struct ib_gid_attr *attr) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + u32 in[MLX5_ST_SZ_DW(set_roce_address_in)]; + u32 out[MLX5_ST_SZ_DW(set_roce_address_out)]; + void *in_addr = MLX5_ADDR_OF(set_roce_address_in, in, roce_address); + enum rdma_link_layer ll = mlx5_ib_port_link_layer(device, port_num); + + if (ll != IB_LINK_LAYER_ETHERNET) + return -EINVAL; + + memset(in, 0, sizeof(in)); + + ib_gid_to_mlx5_roce_addr(gid, attr, in_addr); + + MLX5_SET(set_roce_address_in, in, roce_address_index, index); + 
MLX5_SET(set_roce_address_in, in, opcode, MLX5_CMD_OP_SET_ROCE_ADDRESS); + + memset(out, 0, sizeof(out)); + return mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out)); +} + +static int mlx5_ib_add_gid(struct ib_device *device, u8 port_num, + unsigned int index, const union ib_gid *gid, + const struct ib_gid_attr *attr, + __always_unused void **context) +{ + return set_roce_addr(device, port_num, index, gid, attr); +} + +static int mlx5_ib_del_gid(struct ib_device *device, u8 port_num, + unsigned int index, __always_unused void **context) +{ + return set_roce_addr(device, port_num, index, NULL, NULL); +} + +__be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port_num, + int index) +{ + struct ib_gid_attr attr; + union ib_gid gid; + + if (ib_get_cached_gid(&dev->ib_dev, port_num, index, &gid, &attr)) + return 0; + + if (!attr.ndev) + return 0; + + dev_put(attr.ndev); + + if (attr.gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP) + return 0; + + return cpu_to_be16(MLX5_CAP_ROCE(dev->mdev, r_roce_min_src_udp_port)); +} + static int mlx5_use_mad_ifc(struct mlx5_ib_dev *dev) { return !dev->mdev->issi; @@ -94,13 +297,35 @@ static int mlx5_get_vport_access_method(struct ib_device *ibdev) if (mlx5_use_mad_ifc(to_mdev(ibdev))) return MLX5_VPORT_ACCESS_METHOD_MAD; - if (mlx5_ib_port_link_layer(ibdev) == + if (mlx5_ib_port_link_layer(ibdev, 1) == IB_LINK_LAYER_ETHERNET) return MLX5_VPORT_ACCESS_METHOD_NIC; return MLX5_VPORT_ACCESS_METHOD_HCA; } +static void get_atomic_caps(struct mlx5_ib_dev *dev, + struct ib_device_attr *props) +{ + u8 tmp; + u8 atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations); + u8 atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp); + u8 atomic_req_8B_endianness_mode = + MLX5_CAP_ATOMIC(dev->mdev, atomic_req_8B_endianess_mode); + + /* Check if HW supports 8 bytes standard atomic operations and capable + * of host endianness respond + */ + tmp = MLX5_ATOMIC_OPS_CMP_SWAP | MLX5_ATOMIC_OPS_FETCH_ADD; + if (((atomic_operations & tmp) == tmp) && + (atomic_size_qp & MLX5_ATOMIC_SIZE_QP_8BYTES) && + (atomic_req_8B_endianness_mode)) { + props->atomic_cap = IB_ATOMIC_HCA; + } else { + props->atomic_cap = IB_ATOMIC_NONE; + } +} + static int mlx5_query_system_image_guid(struct ib_device *ibdev, __be64 *sys_image_guid) { @@ -116,13 +341,21 @@ static int mlx5_query_system_image_guid(struct ib_device *ibdev, case MLX5_VPORT_ACCESS_METHOD_HCA: err = mlx5_query_hca_vport_system_image_guid(mdev, &tmp); - if (!err) - *sys_image_guid = cpu_to_be64(tmp); - return err; + break; + + case MLX5_VPORT_ACCESS_METHOD_NIC: + err = mlx5_query_nic_vport_system_image_guid(mdev, &tmp); + break; default: return -EINVAL; } + + if (!err) + *sys_image_guid = cpu_to_be64(tmp); + + return err; + } static int mlx5_query_max_pkeys(struct ib_device *ibdev, @@ -176,13 +409,20 @@ static int mlx5_query_node_guid(struct mlx5_ib_dev *dev, case MLX5_VPORT_ACCESS_METHOD_HCA: err = mlx5_query_hca_vport_node_guid(dev->mdev, &tmp); - if (!err) - *node_guid = cpu_to_be64(tmp); - return err; + break; + + case MLX5_VPORT_ACCESS_METHOD_NIC: + err = mlx5_query_nic_vport_node_guid(dev->mdev, &tmp); + break; default: return -EINVAL; } + + if (!err) + *node_guid = cpu_to_be64(tmp); + + return err; } struct mlx5_reg_node_desc { @@ -260,6 +500,10 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, if (MLX5_CAP_GEN(mdev, block_lb_mc)) props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK; + if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && + (MLX5_CAP_ETH(dev->mdev, csum_cap))) + 
props->device_cap_flags |= IB_DEVICE_RAW_IP_CSUM; + props->vendor_part_id = mdev->pdev->device; props->hw_ver = mdev->pdev->revision; @@ -286,13 +530,15 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; props->max_srq_sge = max_rq_sg - 1; props->max_fast_reg_page_list_len = (unsigned int)-1; - props->atomic_cap = IB_ATOMIC_NONE; + get_atomic_caps(dev, props); props->masked_atomic_cap = IB_ATOMIC_NONE; props->max_mcast_grp = 1 << MLX5_CAP_GEN(mdev, log_max_mcg); props->max_mcast_qp_attach = MLX5_CAP_GEN(mdev, max_qp_mcg); props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * props->max_mcast_grp; props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */ + props->hca_core_clock = MLX5_CAP_GEN(mdev, device_frequency_khz); + props->timestamp_mask = 0x7FFFFFFFFFFFFFFFULL; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING if (MLX5_CAP_GEN(mdev, pg)) @@ -300,6 +546,9 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->odp_caps = dev->odp_caps; #endif + if (MLX5_CAP_GEN(mdev, cd)) + props->device_cap_flags |= IB_DEVICE_CROSS_CHANNEL; + return 0; } @@ -480,6 +729,9 @@ int mlx5_ib_query_port(struct ib_device *ibdev, u8 port, case MLX5_VPORT_ACCESS_METHOD_HCA: return mlx5_query_hca_port(ibdev, port, props); + case MLX5_VPORT_ACCESS_METHOD_NIC: + return mlx5_query_port_roce(ibdev, port, props); + default: return -EINVAL; } @@ -580,8 +832,8 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, struct ib_udata *udata) { struct mlx5_ib_dev *dev = to_mdev(ibdev); - struct mlx5_ib_alloc_ucontext_req_v2 req; - struct mlx5_ib_alloc_ucontext_resp resp; + struct mlx5_ib_alloc_ucontext_req_v2 req = {}; + struct mlx5_ib_alloc_ucontext_resp resp = {}; struct mlx5_ib_ucontext *context; struct mlx5_uuar_info *uuari; struct mlx5_uar *uars; @@ -592,24 +844,28 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, int err; int i; size_t reqlen; + size_t min_req_v2 = offsetof(struct mlx5_ib_alloc_ucontext_req_v2, + max_cqe_version); if (!dev->ib_active) return ERR_PTR(-EAGAIN); - memset(&req, 0, sizeof(req)); + if (udata->inlen < sizeof(struct ib_uverbs_cmd_hdr)) + return ERR_PTR(-EINVAL); + reqlen = udata->inlen - sizeof(struct ib_uverbs_cmd_hdr); if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req)) ver = 0; - else if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req_v2)) + else if (reqlen >= min_req_v2) ver = 2; else return ERR_PTR(-EINVAL); - err = ib_copy_from_udata(&req, udata, reqlen); + err = ib_copy_from_udata(&req, udata, min(reqlen, sizeof(req))); if (err) return ERR_PTR(err); - if (req.flags || req.reserved) + if (req.flags) return ERR_PTR(-EINVAL); if (req.total_num_uuars > MLX5_MAX_UUARS) @@ -618,6 +874,14 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, if (req.total_num_uuars == 0) return ERR_PTR(-EINVAL); + if (req.comp_mask || req.reserved0 || req.reserved1 || req.reserved2) + return ERR_PTR(-EOPNOTSUPP); + + if (reqlen > sizeof(req) && + !ib_is_udata_cleared(udata, sizeof(req), + reqlen - sizeof(req))) + return ERR_PTR(-EOPNOTSUPP); + req.total_num_uuars = ALIGN(req.total_num_uuars, MLX5_NON_FP_BF_REGS_PER_PAGE); if (req.num_low_latency_uuars > req.total_num_uuars - 1) @@ -633,6 +897,11 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, resp.max_send_wqebb = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz); resp.max_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz); resp.max_srq_recv_wr = 1 << 
MLX5_CAP_GEN(dev->mdev, log_max_srq_sz); + resp.cqe_version = min_t(__u8, + (__u8)MLX5_CAP_GEN(dev->mdev, cqe_version), + req.max_cqe_version); + resp.response_length = min(offsetof(typeof(resp), response_length) + + sizeof(resp.response_length), udata->outlen); context = kzalloc(sizeof(*context), GFP_KERNEL); if (!context) @@ -678,22 +947,49 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range; #endif + if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) { + err = mlx5_core_alloc_transport_domain(dev->mdev, + &context->tdn); + if (err) + goto out_uars; + } + INIT_LIST_HEAD(&context->db_page_list); mutex_init(&context->db_page_mutex); resp.tot_uuars = req.total_num_uuars; resp.num_ports = MLX5_CAP_GEN(dev->mdev, num_ports); - err = ib_copy_to_udata(udata, &resp, - sizeof(resp) - sizeof(resp.reserved)); + + if (field_avail(typeof(resp), cqe_version, udata->outlen)) + resp.response_length += sizeof(resp.cqe_version); + + if (field_avail(typeof(resp), hca_core_clock_offset, udata->outlen)) { + resp.comp_mask |= + MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET; + resp.hca_core_clock_offset = + offsetof(struct mlx5_init_seg, internal_timer_h) % + PAGE_SIZE; + resp.response_length += sizeof(resp.hca_core_clock_offset) + + sizeof(resp.reserved2) + + sizeof(resp.reserved3); + } + + err = ib_copy_to_udata(udata, &resp, resp.response_length); if (err) - goto out_uars; + goto out_td; uuari->ver = ver; uuari->num_low_latency_uuars = req.num_low_latency_uuars; uuari->uars = uars; uuari->num_uars = num_uars; + context->cqe_version = resp.cqe_version; + return &context->ibucontext; +out_td: + if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) + mlx5_core_dealloc_transport_domain(dev->mdev, context->tdn); + out_uars: for (i--; i >= 0; i--) mlx5_cmd_free_uar(dev->mdev, uars[i].index); @@ -718,6 +1014,9 @@ static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) struct mlx5_uuar_info *uuari = &context->uuari; int i; + if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) + mlx5_core_dealloc_transport_domain(dev->mdev, context->tdn); + for (i = 0; i < uuari->num_uars; i++) { if (mlx5_cmd_free_uar(dev->mdev, uuari->uars[i].index)) mlx5_ib_warn(dev, "failed to free UAR 0x%x\n", uuari->uars[i].index); @@ -787,6 +1086,30 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES: return -ENOSYS; + case MLX5_IB_MMAP_CORE_CLOCK: + if (vma->vm_end - vma->vm_start != PAGE_SIZE) + return -EINVAL; + + if (vma->vm_flags & (VM_WRITE | VM_EXEC)) + return -EPERM; + + /* Don't expose to user-space information it shouldn't have */ + if (PAGE_SIZE > 4096) + return -EOPNOTSUPP; + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + pfn = (dev->mdev->iseg_base + + offsetof(struct mlx5_init_seg, internal_timer_h)) >> + PAGE_SHIFT; + if (io_remap_pfn_range(vma, vma->vm_start, pfn, + PAGE_SIZE, vma->vm_page_prot)) + return -EAGAIN; + + mlx5_ib_dbg(dev, "mapped internal timer at 0x%lx, PA 0x%llx\n", + vma->vm_start, + (unsigned long long)pfn << PAGE_SHIFT); + break; + default: return -EINVAL; } @@ -835,6 +1158,457 @@ static int mlx5_ib_dealloc_pd(struct ib_pd *pd) return 0; } +static bool outer_header_zero(u32 *match_criteria) +{ + int size = MLX5_ST_SZ_BYTES(fte_match_param); + char *outer_headers_c = MLX5_ADDR_OF(fte_match_param, match_criteria, + outer_headers); + + return outer_headers_c[0] == 0 && !memcmp(outer_headers_c, + outer_headers_c + 1, 
+ size - 1); +} + +static int parse_flow_attr(u32 *match_c, u32 *match_v, + union ib_flow_spec *ib_spec) +{ + void *outer_headers_c = MLX5_ADDR_OF(fte_match_param, match_c, + outer_headers); + void *outer_headers_v = MLX5_ADDR_OF(fte_match_param, match_v, + outer_headers); + switch (ib_spec->type) { + case IB_FLOW_SPEC_ETH: + if (ib_spec->size != sizeof(ib_spec->eth)) + return -EINVAL; + + ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c, + dmac_47_16), + ib_spec->eth.mask.dst_mac); + ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v, + dmac_47_16), + ib_spec->eth.val.dst_mac); + + if (ib_spec->eth.mask.vlan_tag) { + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, + vlan_tag, 1); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, + vlan_tag, 1); + + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, + first_vid, ntohs(ib_spec->eth.mask.vlan_tag)); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, + first_vid, ntohs(ib_spec->eth.val.vlan_tag)); + + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, + first_cfi, + ntohs(ib_spec->eth.mask.vlan_tag) >> 12); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, + first_cfi, + ntohs(ib_spec->eth.val.vlan_tag) >> 12); + + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, + first_prio, + ntohs(ib_spec->eth.mask.vlan_tag) >> 13); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, + first_prio, + ntohs(ib_spec->eth.val.vlan_tag) >> 13); + } + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, + ethertype, ntohs(ib_spec->eth.mask.ether_type)); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, + ethertype, ntohs(ib_spec->eth.val.ether_type)); + break; + case IB_FLOW_SPEC_IPV4: + if (ib_spec->size != sizeof(ib_spec->ipv4)) + return -EINVAL; + + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, + ethertype, 0xffff); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, + ethertype, ETH_P_IP); + + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c, + src_ipv4_src_ipv6.ipv4_layout.ipv4), + &ib_spec->ipv4.mask.src_ip, + sizeof(ib_spec->ipv4.mask.src_ip)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v, + src_ipv4_src_ipv6.ipv4_layout.ipv4), + &ib_spec->ipv4.val.src_ip, + sizeof(ib_spec->ipv4.val.src_ip)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4), + &ib_spec->ipv4.mask.dst_ip, + sizeof(ib_spec->ipv4.mask.dst_ip)); + memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v, + dst_ipv4_dst_ipv6.ipv4_layout.ipv4), + &ib_spec->ipv4.val.dst_ip, + sizeof(ib_spec->ipv4.val.dst_ip)); + break; + case IB_FLOW_SPEC_TCP: + if (ib_spec->size != sizeof(ib_spec->tcp_udp)) + return -EINVAL; + + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, ip_protocol, + 0xff); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, ip_protocol, + IPPROTO_TCP); + + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, tcp_sport, + ntohs(ib_spec->tcp_udp.mask.src_port)); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, tcp_sport, + ntohs(ib_spec->tcp_udp.val.src_port)); + + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, tcp_dport, + ntohs(ib_spec->tcp_udp.mask.dst_port)); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, tcp_dport, + ntohs(ib_spec->tcp_udp.val.dst_port)); + break; + case IB_FLOW_SPEC_UDP: + if (ib_spec->size != sizeof(ib_spec->tcp_udp)) + return -EINVAL; + + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, ip_protocol, + 0xff); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, ip_protocol, + IPPROTO_UDP); + + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, udp_sport, 
+ ntohs(ib_spec->tcp_udp.mask.src_port)); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, udp_sport, + ntohs(ib_spec->tcp_udp.val.src_port)); + + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, udp_dport, + ntohs(ib_spec->tcp_udp.mask.dst_port)); + MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, udp_dport, + ntohs(ib_spec->tcp_udp.val.dst_port)); + break; + default: + return -EINVAL; + } + + return 0; +} + +/* If a flow could catch both multicast and unicast packets, + * it won't fall into the multicast flow steering table and this rule + * could steal other multicast packets. + */ +static bool flow_is_multicast_only(struct ib_flow_attr *ib_attr) +{ + struct ib_flow_spec_eth *eth_spec; + + if (ib_attr->type != IB_FLOW_ATTR_NORMAL || + ib_attr->size < sizeof(struct ib_flow_attr) + + sizeof(struct ib_flow_spec_eth) || + ib_attr->num_of_specs < 1) + return false; + + eth_spec = (struct ib_flow_spec_eth *)(ib_attr + 1); + if (eth_spec->type != IB_FLOW_SPEC_ETH || + eth_spec->size != sizeof(*eth_spec)) + return false; + + return is_multicast_ether_addr(eth_spec->mask.dst_mac) && + is_multicast_ether_addr(eth_spec->val.dst_mac); +} + +static bool is_valid_attr(struct ib_flow_attr *flow_attr) +{ + union ib_flow_spec *ib_spec = (union ib_flow_spec *)(flow_attr + 1); + bool has_ipv4_spec = false; + bool eth_type_ipv4 = true; + unsigned int spec_index; + + /* Validate that ethertype is correct */ + for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) { + if (ib_spec->type == IB_FLOW_SPEC_ETH && + ib_spec->eth.mask.ether_type) { + if (!((ib_spec->eth.mask.ether_type == htons(0xffff)) && + ib_spec->eth.val.ether_type == htons(ETH_P_IP))) + eth_type_ipv4 = false; + } else if (ib_spec->type == IB_FLOW_SPEC_IPV4) { + has_ipv4_spec = true; + } + ib_spec = (void *)ib_spec + ib_spec->size; + } + return !has_ipv4_spec || eth_type_ipv4; +} + +static void put_flow_table(struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_prio *prio, bool ft_added) +{ + prio->refcount -= !!ft_added; + if (!prio->refcount) { + mlx5_destroy_flow_table(prio->flow_table); + prio->flow_table = NULL; + } +} + +static int mlx5_ib_destroy_flow(struct ib_flow *flow_id) +{ + struct mlx5_ib_dev *dev = to_mdev(flow_id->qp->device); + struct mlx5_ib_flow_handler *handler = container_of(flow_id, + struct mlx5_ib_flow_handler, + ibflow); + struct mlx5_ib_flow_handler *iter, *tmp; + + mutex_lock(&dev->flow_db.lock); + + list_for_each_entry_safe(iter, tmp, &handler->list, list) { + mlx5_del_flow_rule(iter->rule); + list_del(&iter->list); + kfree(iter); + } + + mlx5_del_flow_rule(handler->rule); + put_flow_table(dev, &dev->flow_db.prios[handler->prio], true); + mutex_unlock(&dev->flow_db.lock); + + kfree(handler); + + return 0; +} + +#define MLX5_FS_MAX_TYPES 10 +#define MLX5_FS_MAX_ENTRIES 32000UL +static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, + struct ib_flow_attr *flow_attr) +{ + struct mlx5_flow_namespace *ns = NULL; + struct mlx5_ib_flow_prio *prio; + struct mlx5_flow_table *ft; + int num_entries; + int num_groups; + int priority; + int err = 0; + + if (flow_attr->type == IB_FLOW_ATTR_NORMAL) { + if (flow_is_multicast_only(flow_attr)) + priority = MLX5_IB_FLOW_MCAST_PRIO; + else + priority = flow_attr->priority; + ns = mlx5_get_flow_namespace(dev->mdev, + MLX5_FLOW_NAMESPACE_BYPASS); + num_entries = MLX5_FS_MAX_ENTRIES; + num_groups = MLX5_FS_MAX_TYPES; + prio = &dev->flow_db.prios[priority]; + } else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT || + flow_attr->type == 
IB_FLOW_ATTR_MC_DEFAULT) { + ns = mlx5_get_flow_namespace(dev->mdev, + MLX5_FLOW_NAMESPACE_LEFTOVERS); + build_leftovers_ft_param(&priority, + &num_entries, + &num_groups); + prio = &dev->flow_db.prios[MLX5_IB_FLOW_LEFTOVERS_PRIO]; + } + + if (!ns) + return ERR_PTR(-ENOTSUPP); + + ft = prio->flow_table; + if (!ft) { + ft = mlx5_create_auto_grouped_flow_table(ns, priority, + num_entries, + num_groups); + + if (!IS_ERR(ft)) { + prio->refcount = 0; + prio->flow_table = ft; + } else { + err = PTR_ERR(ft); + } + } + + return err ? ERR_PTR(err) : prio; +} + +static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_prio *ft_prio, + struct ib_flow_attr *flow_attr, + struct mlx5_flow_destination *dst) +{ + struct mlx5_flow_table *ft = ft_prio->flow_table; + struct mlx5_ib_flow_handler *handler; + void *ib_flow = flow_attr + 1; + u8 match_criteria_enable = 0; + unsigned int spec_index; + u32 *match_c; + u32 *match_v; + int err = 0; + + if (!is_valid_attr(flow_attr)) + return ERR_PTR(-EINVAL); + + match_c = kzalloc(MLX5_ST_SZ_BYTES(fte_match_param), GFP_KERNEL); + match_v = kzalloc(MLX5_ST_SZ_BYTES(fte_match_param), GFP_KERNEL); + handler = kzalloc(sizeof(*handler), GFP_KERNEL); + if (!handler || !match_c || !match_v) { + err = -ENOMEM; + goto free; + } + + INIT_LIST_HEAD(&handler->list); + + for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) { + err = parse_flow_attr(match_c, match_v, ib_flow); + if (err < 0) + goto free; + + ib_flow += ((union ib_flow_spec *)ib_flow)->size; + } + + /* Outer header support only */ + match_criteria_enable = (!outer_header_zero(match_c)) << 0; + handler->rule = mlx5_add_flow_rule(ft, match_criteria_enable, + match_c, match_v, + MLX5_FLOW_CONTEXT_ACTION_FWD_DEST, + MLX5_FS_DEFAULT_FLOW_TAG, + dst); + + if (IS_ERR(handler->rule)) { + err = PTR_ERR(handler->rule); + goto free; + } + + handler->prio = ft_prio - dev->flow_db.prios; + + ft_prio->flow_table = ft; +free: + if (err) + kfree(handler); + kfree(match_c); + kfree(match_v); + return err ? 
ERR_PTR(err) : handler; +} + +enum { + LEFTOVERS_MC, + LEFTOVERS_UC, +}; + +static struct mlx5_ib_flow_handler *create_leftovers_rule(struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_prio *ft_prio, + struct ib_flow_attr *flow_attr, + struct mlx5_flow_destination *dst) +{ + struct mlx5_ib_flow_handler *handler_ucast = NULL; + struct mlx5_ib_flow_handler *handler = NULL; + + static struct { + struct ib_flow_attr flow_attr; + struct ib_flow_spec_eth eth_flow; + } leftovers_specs[] = { + [LEFTOVERS_MC] = { + .flow_attr = { + .num_of_specs = 1, + .size = sizeof(leftovers_specs[0]) + }, + .eth_flow = { + .type = IB_FLOW_SPEC_ETH, + .size = sizeof(struct ib_flow_spec_eth), + .mask = {.dst_mac = {0x1} }, + .val = {.dst_mac = {0x1} } + } + }, + [LEFTOVERS_UC] = { + .flow_attr = { + .num_of_specs = 1, + .size = sizeof(leftovers_specs[0]) + }, + .eth_flow = { + .type = IB_FLOW_SPEC_ETH, + .size = sizeof(struct ib_flow_spec_eth), + .mask = {.dst_mac = {0x1} }, + .val = {.dst_mac = {} } + } + } + }; + + handler = create_flow_rule(dev, ft_prio, + &leftovers_specs[LEFTOVERS_MC].flow_attr, + dst); + if (!IS_ERR(handler) && + flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT) { + handler_ucast = create_flow_rule(dev, ft_prio, + &leftovers_specs[LEFTOVERS_UC].flow_attr, + dst); + if (IS_ERR(handler_ucast)) { + kfree(handler); + handler = handler_ucast; + } else { + list_add(&handler_ucast->list, &handler->list); + } + } + + return handler; +} + +static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp, + struct ib_flow_attr *flow_attr, + int domain) +{ + struct mlx5_ib_dev *dev = to_mdev(qp->device); + struct mlx5_ib_flow_handler *handler = NULL; + struct mlx5_flow_destination *dst = NULL; + struct mlx5_ib_flow_prio *ft_prio; + int err; + + if (flow_attr->priority > MLX5_IB_FLOW_LAST_PRIO) + return ERR_PTR(-ENOSPC); + + if (domain != IB_FLOW_DOMAIN_USER || + flow_attr->port > MLX5_CAP_GEN(dev->mdev, num_ports) || + flow_attr->flags) + return ERR_PTR(-EINVAL); + + dst = kzalloc(sizeof(*dst), GFP_KERNEL); + if (!dst) + return ERR_PTR(-ENOMEM); + + mutex_lock(&dev->flow_db.lock); + + ft_prio = get_flow_table(dev, flow_attr); + if (IS_ERR(ft_prio)) { + err = PTR_ERR(ft_prio); + goto unlock; + } + + dst->type = MLX5_FLOW_DESTINATION_TYPE_TIR; + dst->tir_num = to_mqp(qp)->raw_packet_qp.rq.tirn; + + if (flow_attr->type == IB_FLOW_ATTR_NORMAL) { + handler = create_flow_rule(dev, ft_prio, flow_attr, + dst); + } else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT || + flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) { + handler = create_leftovers_rule(dev, ft_prio, flow_attr, + dst); + } else { + err = -EINVAL; + goto destroy_ft; + } + + if (IS_ERR(handler)) { + err = PTR_ERR(handler); + handler = NULL; + goto destroy_ft; + } + + ft_prio->refcount++; + mutex_unlock(&dev->flow_db.lock); + kfree(dst); + + return &handler->ibflow; + +destroy_ft: + put_flow_table(dev, ft_prio, false); +unlock: + mutex_unlock(&dev->flow_db.lock); + kfree(dst); + kfree(handler); + return ERR_PTR(err); +} + static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { struct mlx5_ib_dev *dev = to_mdev(ibqp->device); @@ -1304,6 +2078,32 @@ static void destroy_dev_resources(struct mlx5_ib_resources *devr) mlx5_ib_dealloc_pd(devr->p0); } +static u32 get_core_cap_flags(struct ib_device *ibdev) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, 1); + u8 l3_type_cap = MLX5_CAP_ROCE(dev->mdev, l3_type); + u8 roce_version_cap = MLX5_CAP_ROCE(dev->mdev, roce_version); + u32 ret = 0; + + 
if (ll == IB_LINK_LAYER_INFINIBAND) + return RDMA_CORE_PORT_IBA_IB; + + if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV4_CAP)) + return 0; + + if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV6_CAP)) + return 0; + + if (roce_version_cap & MLX5_ROCE_VERSION_1_CAP) + ret |= RDMA_CORE_PORT_IBA_ROCE; + + if (roce_version_cap & MLX5_ROCE_VERSION_2_CAP) + ret |= RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; + + return ret; +} + static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num, struct ib_port_immutable *immutable) { @@ -1316,20 +2116,50 @@ static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num, immutable->pkey_tbl_len = attr.pkey_tbl_len; immutable->gid_tbl_len = attr.gid_tbl_len; - immutable->core_cap_flags = RDMA_CORE_PORT_IBA_IB; + immutable->core_cap_flags = get_core_cap_flags(ibdev); immutable->max_mad_size = IB_MGMT_MAD_SIZE; return 0; } +static int mlx5_enable_roce(struct mlx5_ib_dev *dev) +{ + int err; + + dev->roce.nb.notifier_call = mlx5_netdev_event; + err = register_netdevice_notifier(&dev->roce.nb); + if (err) + return err; + + err = mlx5_nic_vport_enable_roce(dev->mdev); + if (err) + goto err_unregister_netdevice_notifier; + + return 0; + +err_unregister_netdevice_notifier: + unregister_netdevice_notifier(&dev->roce.nb); + return err; +} + +static void mlx5_disable_roce(struct mlx5_ib_dev *dev) +{ + mlx5_nic_vport_disable_roce(dev->mdev); + unregister_netdevice_notifier(&dev->roce.nb); +} + static void *mlx5_ib_add(struct mlx5_core_dev *mdev) { struct mlx5_ib_dev *dev; + enum rdma_link_layer ll; + int port_type_cap; int err; int i; - /* don't create IB instance over Eth ports, no RoCE yet! */ - if (MLX5_CAP_GEN(mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) + port_type_cap = MLX5_CAP_GEN(mdev, port_type); + ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap); + + if ((ll == IB_LINK_LAYER_ETHERNET) && !MLX5_CAP_GEN(mdev, roce)) return NULL; printk_once(KERN_INFO "%s", mlx5_version); @@ -1340,6 +2170,7 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) dev->mdev = mdev; + rwlock_init(&dev->roce.netdev_lock); err = get_port_caps(dev); if (err) goto err_dealloc; @@ -1385,11 +2216,18 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ) | (1ull << IB_USER_VERBS_CMD_OPEN_QP); dev->ib_dev.uverbs_ex_cmd_mask = - (1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE); + (1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_EX_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_EX_CMD_CREATE_QP); dev->ib_dev.query_device = mlx5_ib_query_device; dev->ib_dev.query_port = mlx5_ib_query_port; + dev->ib_dev.get_link_layer = mlx5_ib_port_link_layer; + if (ll == IB_LINK_LAYER_ETHERNET) + dev->ib_dev.get_netdev = mlx5_ib_get_netdev; dev->ib_dev.query_gid = mlx5_ib_query_gid; + dev->ib_dev.add_gid = mlx5_ib_add_gid; + dev->ib_dev.del_gid = mlx5_ib_del_gid; dev->ib_dev.query_pkey = mlx5_ib_query_pkey; dev->ib_dev.modify_device = mlx5_ib_modify_device; dev->ib_dev.modify_port = mlx5_ib_modify_port; @@ -1439,15 +2277,30 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD); } + if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) == + IB_LINK_LAYER_ETHERNET) { + dev->ib_dev.create_flow = mlx5_ib_create_flow; + dev->ib_dev.destroy_flow = mlx5_ib_destroy_flow; + dev->ib_dev.uverbs_ex_cmd_mask |= + (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) | + (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW); + } err = init_node_data(dev); if (err) goto err_dealloc; + mutex_init(&dev->flow_db.lock); mutex_init(&dev->cap_mask_mutex); + if (ll == 
IB_LINK_LAYER_ETHERNET) { + err = mlx5_enable_roce(dev); + if (err) + goto err_dealloc; + } + err = create_dev_resources(&dev->devr); if (err) - goto err_dealloc; + goto err_disable_roce; err = mlx5_ib_odp_init_one(dev); if (err) @@ -1484,6 +2337,10 @@ err_odp: err_rsrc: destroy_dev_resources(&dev->devr); +err_disable_roce: + if (ll == IB_LINK_LAYER_ETHERNET) + mlx5_disable_roce(dev); + err_dealloc: ib_dealloc_device((struct ib_device *)dev); @@ -1493,11 +2350,14 @@ err_dealloc: static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context) { struct mlx5_ib_dev *dev = context; + enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev, 1); ib_unregister_device(&dev->ib_dev); destroy_umrc_res(dev); mlx5_ib_odp_remove_one(dev); destroy_dev_resources(&dev->devr); + if (ll == IB_LINK_LAYER_ETHERNET) + mlx5_disable_roce(dev); ib_dealloc_device(&dev->ib_dev); } diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 633347260..d2b9737ba 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -42,6 +42,7 @@ #include <linux/mlx5/qp.h> #include <linux/mlx5/srq.h> #include <linux/types.h> +#include <linux/mlx5/transobj.h> #define mlx5_ib_dbg(dev, format, arg...) \ pr_debug("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ @@ -55,6 +56,11 @@ pr_err("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ pr_warn("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ __LINE__, current->pid, ##arg) +#define field_avail(type, fld, sz) (offsetof(type, fld) + \ + sizeof(((type *)0)->fld) <= (sz)) +#define MLX5_IB_DEFAULT_UIDX 0xffffff +#define MLX5_USER_ASSIGNED_UIDX_MASK __mlx5_mask(qpc, user_index) + enum { MLX5_IB_MMAP_CMD_SHIFT = 8, MLX5_IB_MMAP_CMD_MASK = 0xff, @@ -62,7 +68,9 @@ enum { enum mlx5_ib_mmap_cmd { MLX5_IB_MMAP_REGULAR_PAGE = 0, - MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES = 1, /* always last */ + MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES = 1, + /* 5 is chosen in order to be compatible with old versions of libmlx5 */ + MLX5_IB_MMAP_CORE_CLOCK = 5, }; enum { @@ -85,6 +93,15 @@ enum mlx5_ib_mad_ifc_flags { MLX5_MAD_IFC_NET_VIEW = 4, }; +enum { + MLX5_CROSS_CHANNEL_UUAR = 0, +}; + +enum { + MLX5_CQE_VERSION_V0, + MLX5_CQE_VERSION_V1, +}; + struct mlx5_ib_ucontext { struct ib_ucontext ibucontext; struct list_head db_page_list; @@ -93,6 +110,9 @@ struct mlx5_ib_ucontext { */ struct mutex db_page_mutex; struct mlx5_uuar_info uuari; + u8 cqe_version; + /* Transport Domain number */ + u32 tdn; }; static inline struct mlx5_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext) @@ -105,6 +125,36 @@ struct mlx5_ib_pd { u32 pdn; }; +#define MLX5_IB_FLOW_MCAST_PRIO (MLX5_BY_PASS_NUM_PRIOS - 1) +#define MLX5_IB_FLOW_LAST_PRIO (MLX5_IB_FLOW_MCAST_PRIO - 1) +#if (MLX5_IB_FLOW_LAST_PRIO <= 0) +#error "Invalid number of bypass priorities" +#endif +#define MLX5_IB_FLOW_LEFTOVERS_PRIO (MLX5_IB_FLOW_MCAST_PRIO + 1) + +#define MLX5_IB_NUM_FLOW_FT (MLX5_IB_FLOW_LEFTOVERS_PRIO + 1) +struct mlx5_ib_flow_prio { + struct mlx5_flow_table *flow_table; + unsigned int refcount; +}; + +struct mlx5_ib_flow_handler { + struct list_head list; + struct ib_flow ibflow; + unsigned int prio; + struct mlx5_flow_rule *rule; +}; + +struct mlx5_ib_flow_db { + struct mlx5_ib_flow_prio prios[MLX5_IB_NUM_FLOW_FT]; + /* Protect flow steering bypass flow tables + * when add/del flow rules. + * only single add/removal of flow steering rule could be done + * simultaneously. 
+ */ + struct mutex lock; +}; + /* Use macros here so that don't have to duplicate * enum ib_send_flags and enum ib_qp_type for low-level driver */ @@ -171,35 +221,70 @@ struct mlx5_ib_pfault { struct mlx5_pagefault mpfault; }; +struct mlx5_ib_ubuffer { + struct ib_umem *umem; + int buf_size; + u64 buf_addr; +}; + +struct mlx5_ib_qp_base { + struct mlx5_ib_qp *container_mibqp; + struct mlx5_core_qp mqp; + struct mlx5_ib_ubuffer ubuffer; +}; + +struct mlx5_ib_qp_trans { + struct mlx5_ib_qp_base base; + u16 xrcdn; + u8 alt_port; + u8 atomic_rd_en; + u8 resp_depth; +}; + +struct mlx5_ib_rq { + struct mlx5_ib_qp_base base; + struct mlx5_ib_wq *rq; + struct mlx5_ib_ubuffer ubuffer; + struct mlx5_db *doorbell; + u32 tirn; + u8 state; +}; + +struct mlx5_ib_sq { + struct mlx5_ib_qp_base base; + struct mlx5_ib_wq *sq; + struct mlx5_ib_ubuffer ubuffer; + struct mlx5_db *doorbell; + u32 tisn; + u8 state; +}; + +struct mlx5_ib_raw_packet_qp { + struct mlx5_ib_sq sq; + struct mlx5_ib_rq rq; +}; + struct mlx5_ib_qp { struct ib_qp ibqp; - struct mlx5_core_qp mqp; + union { + struct mlx5_ib_qp_trans trans_qp; + struct mlx5_ib_raw_packet_qp raw_packet_qp; + }; struct mlx5_buf buf; struct mlx5_db db; struct mlx5_ib_wq rq; - u32 doorbell_qpn; u8 sq_signal_bits; u8 fm_cache; - int sq_max_wqes_per_wr; - int sq_spare_wqes; struct mlx5_ib_wq sq; - struct ib_umem *umem; - int buf_size; - /* serialize qp state modifications */ struct mutex mutex; - u16 xrcdn; u32 flags; u8 port; - u8 alt_port; - u8 atomic_rd_en; - u8 resp_depth; u8 state; - int mlx_type; int wq_sig; int scat_cqe; int max_inline_data; @@ -242,6 +327,9 @@ struct mlx5_ib_cq_buf { enum mlx5_ib_qp_flags { MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK = 1 << 0, MLX5_IB_QP_SIGNATURE_HANDLING = 1 << 1, + MLX5_IB_QP_CROSS_CHANNEL = 1 << 2, + MLX5_IB_QP_MANAGED_SEND = 1 << 3, + MLX5_IB_QP_MANAGED_RECV = 1 << 4, }; struct mlx5_umr_wr { @@ -284,6 +372,7 @@ struct mlx5_ib_cq { struct mlx5_ib_cq_buf *resize_buf; struct ib_umem *resize_umem; int cqe_size; + u32 create_flags; }; struct mlx5_ib_srq { @@ -407,9 +496,19 @@ struct mlx5_ib_resources { struct ib_srq *s1; }; +struct mlx5_roce { + /* Protect mlx5_ib_get_netdev from invoking dev_hold() with a NULL + * netdev pointer + */ + rwlock_t netdev_lock; + struct net_device *netdev; + struct notifier_block nb; +}; + struct mlx5_ib_dev { struct ib_device ib_dev; struct mlx5_core_dev *mdev; + struct mlx5_roce roce; MLX5_DECLARE_DOORBELL_LOCK(uar_lock); int num_ports; /* serialize update of capability mask @@ -431,6 +530,7 @@ struct mlx5_ib_dev { */ struct srcu_struct mr_srcu; #endif + struct mlx5_ib_flow_db flow_db; }; static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq) @@ -455,7 +555,7 @@ static inline struct mlx5_ib_cq *to_mcq(struct ib_cq *ibcq) static inline struct mlx5_ib_qp *to_mibqp(struct mlx5_core_qp *mqp) { - return container_of(mqp, struct mlx5_ib_qp, mqp); + return container_of(mqp, struct mlx5_ib_qp_base, mqp)->container_mibqp; } static inline struct mlx5_ib_mr *to_mibmr(struct mlx5_core_mr *mmr) @@ -507,8 +607,6 @@ void mlx5_ib_free_srq_wqe(struct mlx5_ib_srq *srq, int wqe_index); int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, int ignore_bkey, u8 port, const struct ib_wc *in_wc, const struct ib_grh *in_grh, const void *in_mad, void *response_mad); -struct ib_ah *create_ib_ah(struct ib_ah_attr *ah_attr, - struct mlx5_ib_ah *ah); struct ib_ah *mlx5_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr); int mlx5_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr); int 
mlx5_ib_destroy_ah(struct ib_ah *ah); @@ -535,7 +633,8 @@ int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, struct ib_recv_wr **bad_wr); void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n); int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index, - void *buffer, u32 length); + void *buffer, u32 length, + struct mlx5_ib_qp_base *base); struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, const struct ib_cq_init_attr *attr, struct ib_ucontext *context, @@ -637,6 +736,9 @@ static inline void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp) {} #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ +__be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port_num, + int index); + static inline void init_query_mad(struct ib_smp *mad) { mad->base_version = 1; @@ -662,4 +764,28 @@ static inline int is_qp1(enum ib_qp_type qp_type) #define MLX5_MAX_UMR_SHIFT 16 #define MLX5_MAX_UMR_PAGES (1 << MLX5_MAX_UMR_SHIFT) +static inline u32 check_cq_create_flags(u32 flags) +{ + /* + * It returns non-zero value for unsupported CQ + * create flags, otherwise it returns zero. + */ + return (flags & ~(IB_CQ_FLAGS_IGNORE_OVERRUN | + IB_CQ_FLAGS_TIMESTAMP_COMPLETION)); +} + +static inline int verify_assign_uidx(u8 cqe_version, u32 cmd_uidx, + u32 *user_index) +{ + if (cqe_version) { + if ((cmd_uidx == MLX5_IB_DEFAULT_UIDX) || + (cmd_uidx & ~MLX5_USER_ASSIGNED_UIDX_MASK)) + return -EINVAL; + *user_index = cmd_uidx; + } else { + *user_index = MLX5_IB_DEFAULT_UIDX; + } + + return 0; +} #endif /* MLX5_IB_H */ diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index aa8391e75..b8d76361a 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -153,14 +153,16 @@ static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev, static void mlx5_ib_page_fault_resume(struct mlx5_ib_qp *qp, struct mlx5_ib_pfault *pfault, - int error) { + int error) +{ struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device); - int ret = mlx5_core_page_fault_resume(dev->mdev, qp->mqp.qpn, + u32 qpn = qp->trans_qp.base.mqp.qpn; + int ret = mlx5_core_page_fault_resume(dev->mdev, + qpn, pfault->mpfault.flags, error); if (ret) - pr_err("Failed to resolve the page fault on QP 0x%x\n", - qp->mqp.qpn); + pr_err("Failed to resolve the page fault on QP 0x%x\n", qpn); } /* @@ -391,6 +393,7 @@ static int mlx5_ib_mr_initiator_pfault_handler( #if defined(DEBUG) u32 ctrl_wqe_index, ctrl_qpn; #endif + u32 qpn = qp->trans_qp.base.mqp.qpn; ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK; if (ds * MLX5_WQE_DS_UNITS > wqe_length) { @@ -401,7 +404,7 @@ static int mlx5_ib_mr_initiator_pfault_handler( if (ds == 0) { mlx5_ib_err(dev, "Got WQE with zero DS. wqe_index=%x, qpn=%x\n", - wqe_index, qp->mqp.qpn); + wqe_index, qpn); return -EFAULT; } @@ -411,16 +414,16 @@ static int mlx5_ib_mr_initiator_pfault_handler( MLX5_WQE_CTRL_WQE_INDEX_SHIFT; if (wqe_index != ctrl_wqe_index) { mlx5_ib_err(dev, "Got WQE with invalid wqe_index. wqe_index=0x%x, qpn=0x%x ctrl->wqe_index=0x%x\n", - wqe_index, qp->mqp.qpn, + wqe_index, qpn, ctrl_wqe_index); return -EFAULT; } ctrl_qpn = (be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_QPN_MASK) >> MLX5_WQE_CTRL_QPN_SHIFT; - if (qp->mqp.qpn != ctrl_qpn) { + if (qpn != ctrl_qpn) { mlx5_ib_err(dev, "Got WQE with incorrect QP number. 
wqe_index=0x%x, qpn=0x%x ctrl->qpn=0x%x\n", - wqe_index, qp->mqp.qpn, + wqe_index, qpn, ctrl_qpn); return -EFAULT; } @@ -537,6 +540,7 @@ static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_qp *qp, int resume_with_error = 0; u16 wqe_index = pfault->mpfault.wqe.wqe_index; int requestor = pfault->mpfault.flags & MLX5_PFAULT_REQUESTOR; + u32 qpn = qp->trans_qp.base.mqp.qpn; buffer = (char *)__get_free_page(GFP_KERNEL); if (!buffer) { @@ -546,10 +550,10 @@ static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_qp *qp, } ret = mlx5_ib_read_user_wqe(qp, requestor, wqe_index, buffer, - PAGE_SIZE); + PAGE_SIZE, &qp->trans_qp.base); if (ret < 0) { mlx5_ib_err(dev, "Failed reading a WQE following page fault, error=%x, wqe_index=%x, qpn=%x\n", - -ret, wqe_index, qp->mqp.qpn); + -ret, wqe_index, qpn); resume_with_error = 1; goto resolve_page_fault; } @@ -586,7 +590,8 @@ static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_qp *qp, resolve_page_fault: mlx5_ib_page_fault_resume(qp, pfault, resume_with_error); mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, flags: 0x%x\n", - qp->mqp.qpn, resume_with_error, pfault->mpfault.flags); + qpn, resume_with_error, + pfault->mpfault.flags); free_page((unsigned long)buffer); } @@ -753,7 +758,7 @@ void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp) qp->disable_page_faults = 1; spin_lock_init(&qp->disable_page_faults_lock); - qp->mqp.pfault_handler = mlx5_ib_pfault_handler; + qp->trans_qp.base.mqp.pfault_handler = mlx5_ib_pfault_handler; for (i = 0; i < MLX5_IB_PAGEFAULT_CONTEXTS; ++i) INIT_WORK(&qp->pagefaults[i].work, mlx5_ib_qp_pfault_action); diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 307bdbca8..34cb8e87c 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -32,6 +32,8 @@ #include <linux/module.h> #include <rdma/ib_umem.h> +#include <rdma/ib_cache.h> +#include <rdma/ib_user_verbs.h> #include "mlx5_ib.h" #include "user.h" @@ -114,14 +116,15 @@ void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n) * Return: the number of bytes copied, or an error code. */ int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index, - void *buffer, u32 length) + void *buffer, u32 length, + struct mlx5_ib_qp_base *base) { struct ib_device *ibdev = qp->ibqp.device; struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_ib_wq *wq = send ? 
&qp->sq : &qp->rq; size_t offset; size_t wq_end; - struct ib_umem *umem = qp->umem; + struct ib_umem *umem = base->ubuffer.umem; u32 first_copy_length; int wqe_length; int ret; @@ -172,8 +175,10 @@ static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type) struct ib_qp *ibqp = &to_mibqp(qp)->ibqp; struct ib_event event; - if (type == MLX5_EVENT_TYPE_PATH_MIG) - to_mibqp(qp)->port = to_mibqp(qp)->alt_port; + if (type == MLX5_EVENT_TYPE_PATH_MIG) { + /* This event is only valid for trans_qps */ + to_mibqp(qp)->port = to_mibqp(qp)->trans_qp.alt_port; + } if (ibqp->event_handler) { event.device = ibqp->device; @@ -265,8 +270,10 @@ static int sq_overhead(enum ib_qp_type qp_type) /* fall through */ case IB_QPT_RC: size += sizeof(struct mlx5_wqe_ctrl_seg) + - sizeof(struct mlx5_wqe_atomic_seg) + - sizeof(struct mlx5_wqe_raddr_seg); + max(sizeof(struct mlx5_wqe_atomic_seg) + + sizeof(struct mlx5_wqe_raddr_seg), + sizeof(struct mlx5_wqe_umr_ctrl_seg) + + sizeof(struct mlx5_mkey_seg)); break; case IB_QPT_XRC_TGT: @@ -274,9 +281,9 @@ static int sq_overhead(enum ib_qp_type qp_type) case IB_QPT_UC: size += sizeof(struct mlx5_wqe_ctrl_seg) + - sizeof(struct mlx5_wqe_raddr_seg) + - sizeof(struct mlx5_wqe_umr_ctrl_seg) + - sizeof(struct mlx5_mkey_seg); + max(sizeof(struct mlx5_wqe_raddr_seg), + sizeof(struct mlx5_wqe_umr_ctrl_seg) + + sizeof(struct mlx5_mkey_seg)); break; case IB_QPT_UD: @@ -366,7 +373,9 @@ static int calc_sq_size(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr, static int set_user_buf_size(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, - struct mlx5_ib_create_qp *ucmd) + struct mlx5_ib_create_qp *ucmd, + struct mlx5_ib_qp_base *base, + struct ib_qp_init_attr *attr) { int desc_sz = 1 << qp->sq.wqe_shift; @@ -391,8 +400,13 @@ static int set_user_buf_size(struct mlx5_ib_dev *dev, return -EINVAL; } - qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + - (qp->sq.wqe_cnt << 6); + if (attr->qp_type == IB_QPT_RAW_PACKET) { + base->ubuffer.buf_size = qp->rq.wqe_cnt << qp->rq.wqe_shift; + qp->raw_packet_qp.sq.ubuffer.buf_size = qp->sq.wqe_cnt << 6; + } else { + base->ubuffer.buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + + (qp->sq.wqe_cnt << 6); + } return 0; } @@ -578,8 +592,8 @@ static int to_mlx5_st(enum ib_qp_type type) case IB_QPT_SMI: return MLX5_QP_ST_QP0; case IB_QPT_GSI: return MLX5_QP_ST_QP1; case IB_QPT_RAW_IPV6: return MLX5_QP_ST_RAW_IPV6; - case IB_QPT_RAW_ETHERTYPE: return MLX5_QP_ST_RAW_ETHERTYPE; case IB_QPT_RAW_PACKET: + case IB_QPT_RAW_ETHERTYPE: return MLX5_QP_ST_RAW_ETHERTYPE; case IB_QPT_MAX: default: return -EINVAL; } @@ -590,13 +604,51 @@ static int uuarn_to_uar_index(struct mlx5_uuar_info *uuari, int uuarn) return uuari->uars[uuarn / MLX5_BF_REGS_PER_PAGE].index; } +static int mlx5_ib_umem_get(struct mlx5_ib_dev *dev, + struct ib_pd *pd, + unsigned long addr, size_t size, + struct ib_umem **umem, + int *npages, int *page_shift, int *ncont, + u32 *offset) +{ + int err; + + *umem = ib_umem_get(pd->uobject->context, addr, size, 0, 0); + if (IS_ERR(*umem)) { + mlx5_ib_dbg(dev, "umem_get failed\n"); + return PTR_ERR(*umem); + } + + mlx5_ib_cont_pages(*umem, addr, npages, page_shift, ncont, NULL); + + err = mlx5_ib_get_buf_offset(addr, *page_shift, offset); + if (err) { + mlx5_ib_warn(dev, "bad offset\n"); + goto err_umem; + } + + mlx5_ib_dbg(dev, "addr 0x%lx, size %zu, npages %d, page_shift %d, ncont %d, offset %d\n", + addr, size, *npages, *page_shift, *ncont, *offset); + + return 0; + +err_umem: + ib_umem_release(*umem); + *umem = NULL; + + return err; +} + 
static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, struct mlx5_ib_qp *qp, struct ib_udata *udata, + struct ib_qp_init_attr *attr, struct mlx5_create_qp_mbox_in **in, - struct mlx5_ib_create_qp_resp *resp, int *inlen) + struct mlx5_ib_create_qp_resp *resp, int *inlen, + struct mlx5_ib_qp_base *base) { struct mlx5_ib_ucontext *context; struct mlx5_ib_create_qp ucmd; + struct mlx5_ib_ubuffer *ubuffer = &base->ubuffer; int page_shift = 0; int uar_index; int npages; @@ -615,18 +667,23 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, /* * TBD: should come from the verbs when we have the API */ - uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_HIGH); - if (uuarn < 0) { - mlx5_ib_dbg(dev, "failed to allocate low latency UUAR\n"); - mlx5_ib_dbg(dev, "reverting to medium latency\n"); - uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_MEDIUM); + if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL) + /* In CROSS_CHANNEL CQ and QP must use the same UAR */ + uuarn = MLX5_CROSS_CHANNEL_UUAR; + else { + uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_HIGH); if (uuarn < 0) { - mlx5_ib_dbg(dev, "failed to allocate medium latency UUAR\n"); - mlx5_ib_dbg(dev, "reverting to high latency\n"); - uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_LOW); + mlx5_ib_dbg(dev, "failed to allocate low latency UUAR\n"); + mlx5_ib_dbg(dev, "reverting to medium latency\n"); + uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_MEDIUM); if (uuarn < 0) { - mlx5_ib_warn(dev, "uuar allocation failed\n"); - return uuarn; + mlx5_ib_dbg(dev, "failed to allocate medium latency UUAR\n"); + mlx5_ib_dbg(dev, "reverting to high latency\n"); + uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_LOW); + if (uuarn < 0) { + mlx5_ib_warn(dev, "uuar allocation failed\n"); + return uuarn; + } } } } @@ -638,32 +695,20 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB); qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; - err = set_user_buf_size(dev, qp, &ucmd); + err = set_user_buf_size(dev, qp, &ucmd, base, attr); if (err) goto err_uuar; - if (ucmd.buf_addr && qp->buf_size) { - qp->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, - qp->buf_size, 0, 0); - if (IS_ERR(qp->umem)) { - mlx5_ib_dbg(dev, "umem_get failed\n"); - err = PTR_ERR(qp->umem); + if (ucmd.buf_addr && ubuffer->buf_size) { + ubuffer->buf_addr = ucmd.buf_addr; + err = mlx5_ib_umem_get(dev, pd, ubuffer->buf_addr, + ubuffer->buf_size, + &ubuffer->umem, &npages, &page_shift, + &ncont, &offset); + if (err) goto err_uuar; - } } else { - qp->umem = NULL; - } - - if (qp->umem) { - mlx5_ib_cont_pages(qp->umem, ucmd.buf_addr, &npages, &page_shift, - &ncont, NULL); - err = mlx5_ib_get_buf_offset(ucmd.buf_addr, page_shift, &offset); - if (err) { - mlx5_ib_warn(dev, "bad offset\n"); - goto err_umem; - } - mlx5_ib_dbg(dev, "addr 0x%llx, size %d, npages %d, page_shift %d, ncont %d, offset %d\n", - ucmd.buf_addr, qp->buf_size, npages, page_shift, ncont, offset); + ubuffer->umem = NULL; } *inlen = sizeof(**in) + sizeof(*(*in)->pas) * ncont; @@ -672,8 +717,9 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, err = -ENOMEM; goto err_umem; } - if (qp->umem) - mlx5_ib_populate_pas(dev, qp->umem, page_shift, (*in)->pas, 0); + if (ubuffer->umem) + mlx5_ib_populate_pas(dev, ubuffer->umem, page_shift, + (*in)->pas, 0); (*in)->ctx.log_pg_sz_remote_qpn = cpu_to_be32((page_shift - MLX5_ADAPTER_PAGE_SHIFT) << 24); (*in)->ctx.params2 = 
cpu_to_be32(offset << 6); @@ -704,29 +750,31 @@ err_free: kvfree(*in); err_umem: - if (qp->umem) - ib_umem_release(qp->umem); + if (ubuffer->umem) + ib_umem_release(ubuffer->umem); err_uuar: free_uuar(&context->uuari, uuarn); return err; } -static void destroy_qp_user(struct ib_pd *pd, struct mlx5_ib_qp *qp) +static void destroy_qp_user(struct ib_pd *pd, struct mlx5_ib_qp *qp, + struct mlx5_ib_qp_base *base) { struct mlx5_ib_ucontext *context; context = to_mucontext(pd->uobject->context); mlx5_ib_db_unmap_user(context, &qp->db); - if (qp->umem) - ib_umem_release(qp->umem); + if (base->ubuffer.umem) + ib_umem_release(base->ubuffer.umem); free_uuar(&context->uuari, qp->uuarn); } static int create_kernel_qp(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *init_attr, struct mlx5_ib_qp *qp, - struct mlx5_create_qp_mbox_in **in, int *inlen) + struct mlx5_create_qp_mbox_in **in, int *inlen, + struct mlx5_ib_qp_base *base) { enum mlx5_ib_latency_class lc = MLX5_IB_LATENCY_CLASS_LOW; struct mlx5_uuar_info *uuari; @@ -758,9 +806,9 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev, qp->rq.offset = 0; qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; - qp->buf_size = err + (qp->rq.wqe_cnt << qp->rq.wqe_shift); + base->ubuffer.buf_size = err + (qp->rq.wqe_cnt << qp->rq.wqe_shift); - err = mlx5_buf_alloc(dev->mdev, qp->buf_size, &qp->buf); + err = mlx5_buf_alloc(dev->mdev, base->ubuffer.buf_size, &qp->buf); if (err) { mlx5_ib_dbg(dev, "err %d\n", err); goto err_uuar; @@ -853,19 +901,304 @@ static int is_connected(enum ib_qp_type qp_type) return 0; } +static int create_raw_packet_qp_tis(struct mlx5_ib_dev *dev, + struct mlx5_ib_sq *sq, u32 tdn) +{ + u32 in[MLX5_ST_SZ_DW(create_tis_in)]; + void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx); + + memset(in, 0, sizeof(in)); + + MLX5_SET(tisc, tisc, transport_domain, tdn); + + return mlx5_core_create_tis(dev->mdev, in, sizeof(in), &sq->tisn); +} + +static void destroy_raw_packet_qp_tis(struct mlx5_ib_dev *dev, + struct mlx5_ib_sq *sq) +{ + mlx5_core_destroy_tis(dev->mdev, sq->tisn); +} + +static int create_raw_packet_qp_sq(struct mlx5_ib_dev *dev, + struct mlx5_ib_sq *sq, void *qpin, + struct ib_pd *pd) +{ + struct mlx5_ib_ubuffer *ubuffer = &sq->ubuffer; + __be64 *pas; + void *in; + void *sqc; + void *qpc = MLX5_ADDR_OF(create_qp_in, qpin, qpc); + void *wq; + int inlen; + int err; + int page_shift = 0; + int npages; + int ncont = 0; + u32 offset = 0; + + err = mlx5_ib_umem_get(dev, pd, ubuffer->buf_addr, ubuffer->buf_size, + &sq->ubuffer.umem, &npages, &page_shift, + &ncont, &offset); + if (err) + return err; + + inlen = MLX5_ST_SZ_BYTES(create_sq_in) + sizeof(u64) * ncont; + in = mlx5_vzalloc(inlen); + if (!in) { + err = -ENOMEM; + goto err_umem; + } + + sqc = MLX5_ADDR_OF(create_sq_in, in, ctx); + MLX5_SET(sqc, sqc, flush_in_error_en, 1); + MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RST); + MLX5_SET(sqc, sqc, user_index, MLX5_GET(qpc, qpc, user_index)); + MLX5_SET(sqc, sqc, cqn, MLX5_GET(qpc, qpc, cqn_snd)); + MLX5_SET(sqc, sqc, tis_lst_sz, 1); + MLX5_SET(sqc, sqc, tis_num_0, sq->tisn); + + wq = MLX5_ADDR_OF(sqc, sqc, wq); + MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); + MLX5_SET(wq, wq, pd, MLX5_GET(qpc, qpc, pd)); + MLX5_SET(wq, wq, uar_page, MLX5_GET(qpc, qpc, uar_page)); + MLX5_SET64(wq, wq, dbr_addr, MLX5_GET64(qpc, qpc, dbr_addr)); + MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB)); + MLX5_SET(wq, wq, log_wq_sz, MLX5_GET(qpc, qpc, log_sq_size)); + MLX5_SET(wq, wq, log_wq_pg_sz, page_shift - MLX5_ADAPTER_PAGE_SHIFT); + MLX5_SET(wq, wq, 
page_offset, offset); + + pas = (__be64 *)MLX5_ADDR_OF(wq, wq, pas); + mlx5_ib_populate_pas(dev, sq->ubuffer.umem, page_shift, pas, 0); + + err = mlx5_core_create_sq_tracked(dev->mdev, in, inlen, &sq->base.mqp); + + kvfree(in); + + if (err) + goto err_umem; + + return 0; + +err_umem: + ib_umem_release(sq->ubuffer.umem); + sq->ubuffer.umem = NULL; + + return err; +} + +static void destroy_raw_packet_qp_sq(struct mlx5_ib_dev *dev, + struct mlx5_ib_sq *sq) +{ + mlx5_core_destroy_sq_tracked(dev->mdev, &sq->base.mqp); + ib_umem_release(sq->ubuffer.umem); +} + +static int get_rq_pas_size(void *qpc) +{ + u32 log_page_size = MLX5_GET(qpc, qpc, log_page_size) + 12; + u32 log_rq_stride = MLX5_GET(qpc, qpc, log_rq_stride); + u32 log_rq_size = MLX5_GET(qpc, qpc, log_rq_size); + u32 page_offset = MLX5_GET(qpc, qpc, page_offset); + u32 po_quanta = 1 << (log_page_size - 6); + u32 rq_sz = 1 << (log_rq_size + 4 + log_rq_stride); + u32 page_size = 1 << log_page_size; + u32 rq_sz_po = rq_sz + (page_offset * po_quanta); + u32 rq_num_pas = (rq_sz_po + page_size - 1) / page_size; + + return rq_num_pas * sizeof(u64); +} + +static int create_raw_packet_qp_rq(struct mlx5_ib_dev *dev, + struct mlx5_ib_rq *rq, void *qpin) +{ + __be64 *pas; + __be64 *qp_pas; + void *in; + void *rqc; + void *wq; + void *qpc = MLX5_ADDR_OF(create_qp_in, qpin, qpc); + int inlen; + int err; + u32 rq_pas_size = get_rq_pas_size(qpc); + + inlen = MLX5_ST_SZ_BYTES(create_rq_in) + rq_pas_size; + in = mlx5_vzalloc(inlen); + if (!in) + return -ENOMEM; + + rqc = MLX5_ADDR_OF(create_rq_in, in, ctx); + MLX5_SET(rqc, rqc, vsd, 1); + MLX5_SET(rqc, rqc, mem_rq_type, MLX5_RQC_MEM_RQ_TYPE_MEMORY_RQ_INLINE); + MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RST); + MLX5_SET(rqc, rqc, flush_in_error_en, 1); + MLX5_SET(rqc, rqc, user_index, MLX5_GET(qpc, qpc, user_index)); + MLX5_SET(rqc, rqc, cqn, MLX5_GET(qpc, qpc, cqn_rcv)); + + wq = MLX5_ADDR_OF(rqc, rqc, wq); + MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); + MLX5_SET(wq, wq, end_padding_mode, + MLX5_GET(qpc, qpc, end_padding_mode)); + MLX5_SET(wq, wq, page_offset, MLX5_GET(qpc, qpc, page_offset)); + MLX5_SET(wq, wq, pd, MLX5_GET(qpc, qpc, pd)); + MLX5_SET64(wq, wq, dbr_addr, MLX5_GET64(qpc, qpc, dbr_addr)); + MLX5_SET(wq, wq, log_wq_stride, MLX5_GET(qpc, qpc, log_rq_stride) + 4); + MLX5_SET(wq, wq, log_wq_pg_sz, MLX5_GET(qpc, qpc, log_page_size)); + MLX5_SET(wq, wq, log_wq_sz, MLX5_GET(qpc, qpc, log_rq_size)); + + pas = (__be64 *)MLX5_ADDR_OF(wq, wq, pas); + qp_pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, qpin, pas); + memcpy(pas, qp_pas, rq_pas_size); + + err = mlx5_core_create_rq_tracked(dev->mdev, in, inlen, &rq->base.mqp); + + kvfree(in); + + return err; +} + +static void destroy_raw_packet_qp_rq(struct mlx5_ib_dev *dev, + struct mlx5_ib_rq *rq) +{ + mlx5_core_destroy_rq_tracked(dev->mdev, &rq->base.mqp); +} + +static int create_raw_packet_qp_tir(struct mlx5_ib_dev *dev, + struct mlx5_ib_rq *rq, u32 tdn) +{ + u32 *in; + void *tirc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(create_tir_in); + in = mlx5_vzalloc(inlen); + if (!in) + return -ENOMEM; + + tirc = MLX5_ADDR_OF(create_tir_in, in, ctx); + MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_DIRECT); + MLX5_SET(tirc, tirc, inline_rqn, rq->base.mqp.qpn); + MLX5_SET(tirc, tirc, transport_domain, tdn); + + err = mlx5_core_create_tir(dev->mdev, in, inlen, &rq->tirn); + + kvfree(in); + + return err; +} + +static void destroy_raw_packet_qp_tir(struct mlx5_ib_dev *dev, + struct mlx5_ib_rq *rq) +{ + mlx5_core_destroy_tir(dev->mdev, rq->tirn); +} 
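Editor's note: get_rq_pas_size() above derives the number of 64-bit physical-address entries the RQ needs purely from fields already encoded in the QP context (page size, RQ stride, RQ depth, page offset). A standalone restatement of that arithmetic, with arbitrary example field values standing in for the MLX5_GET() reads:

#include <stdio.h>
#include <stdint.h>

static uint32_t rq_pas_bytes(uint32_t log_page_size_field,
			     uint32_t log_rq_stride,
			     uint32_t log_rq_size,
			     uint32_t page_offset)
{
	uint32_t log_page_size = log_page_size_field + 12; /* QPC field is log2(page) - 12 */
	uint32_t po_quanta = 1u << (log_page_size - 6);    /* page_offset counts 1/64-page units */
	uint32_t rq_sz = 1u << (log_rq_size + 4 + log_rq_stride); /* stride field is log2(stride) - 4 */
	uint32_t page_size = 1u << log_page_size;
	uint32_t rq_sz_po = rq_sz + page_offset * po_quanta;
	uint32_t rq_num_pas = (rq_sz_po + page_size - 1) / page_size; /* round up to whole pages */

	return rq_num_pas * (uint32_t)sizeof(uint64_t);
}

int main(void)
{
	/* e.g. 4K pages (field 0), 64B stride (field 2), 256 WQEs (field 8), no offset:
	 * 16KB of RQ -> 4 pages -> 32 bytes of PAS entries. */
	printf("PAS array size: %u bytes\n", rq_pas_bytes(0, 2, 8, 0));
	return 0;
}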
+ +static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + struct mlx5_create_qp_mbox_in *in, + struct ib_pd *pd) +{ + struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp; + struct mlx5_ib_sq *sq = &raw_packet_qp->sq; + struct mlx5_ib_rq *rq = &raw_packet_qp->rq; + struct ib_uobject *uobj = pd->uobject; + struct ib_ucontext *ucontext = uobj->context; + struct mlx5_ib_ucontext *mucontext = to_mucontext(ucontext); + int err; + u32 tdn = mucontext->tdn; + + if (qp->sq.wqe_cnt) { + err = create_raw_packet_qp_tis(dev, sq, tdn); + if (err) + return err; + + err = create_raw_packet_qp_sq(dev, sq, in, pd); + if (err) + goto err_destroy_tis; + + sq->base.container_mibqp = qp; + } + + if (qp->rq.wqe_cnt) { + err = create_raw_packet_qp_rq(dev, rq, in); + if (err) + goto err_destroy_sq; + + rq->base.container_mibqp = qp; + + err = create_raw_packet_qp_tir(dev, rq, tdn); + if (err) + goto err_destroy_rq; + } + + qp->trans_qp.base.mqp.qpn = qp->sq.wqe_cnt ? sq->base.mqp.qpn : + rq->base.mqp.qpn; + + return 0; + +err_destroy_rq: + destroy_raw_packet_qp_rq(dev, rq); +err_destroy_sq: + if (!qp->sq.wqe_cnt) + return err; + destroy_raw_packet_qp_sq(dev, sq); +err_destroy_tis: + destroy_raw_packet_qp_tis(dev, sq); + + return err; +} + +static void destroy_raw_packet_qp(struct mlx5_ib_dev *dev, + struct mlx5_ib_qp *qp) +{ + struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp; + struct mlx5_ib_sq *sq = &raw_packet_qp->sq; + struct mlx5_ib_rq *rq = &raw_packet_qp->rq; + + if (qp->rq.wqe_cnt) { + destroy_raw_packet_qp_tir(dev, rq); + destroy_raw_packet_qp_rq(dev, rq); + } + + if (qp->sq.wqe_cnt) { + destroy_raw_packet_qp_sq(dev, sq); + destroy_raw_packet_qp_tis(dev, sq); + } +} + +static void raw_packet_qp_copy_info(struct mlx5_ib_qp *qp, + struct mlx5_ib_raw_packet_qp *raw_packet_qp) +{ + struct mlx5_ib_sq *sq = &raw_packet_qp->sq; + struct mlx5_ib_rq *rq = &raw_packet_qp->rq; + + sq->sq = &qp->sq; + rq->rq = &qp->rq; + sq->doorbell = &qp->db; + rq->doorbell = &qp->db; +} + static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, struct ib_qp_init_attr *init_attr, struct ib_udata *udata, struct mlx5_ib_qp *qp) { struct mlx5_ib_resources *devr = &dev->devr; struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_ib_qp_base *base; struct mlx5_ib_create_qp_resp resp; struct mlx5_create_qp_mbox_in *in; struct mlx5_ib_create_qp ucmd; int inlen = sizeof(*in); int err; + u32 uidx = MLX5_IB_DEFAULT_UIDX; + void *qpc; + + base = init_attr->qp_type == IB_QPT_RAW_PACKET ? 
+ &qp->raw_packet_qp.rq.base : + &qp->trans_qp.base; - mlx5_ib_odp_create_qp(qp); + if (init_attr->qp_type != IB_QPT_RAW_PACKET) + mlx5_ib_odp_create_qp(qp); mutex_init(&qp->mutex); spin_lock_init(&qp->sq.lock); @@ -880,6 +1213,21 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, } } + if (init_attr->create_flags & + (IB_QP_CREATE_CROSS_CHANNEL | + IB_QP_CREATE_MANAGED_SEND | + IB_QP_CREATE_MANAGED_RECV)) { + if (!MLX5_CAP_GEN(mdev, cd)) { + mlx5_ib_dbg(dev, "cross-channel isn't supported\n"); + return -EINVAL; + } + if (init_attr->create_flags & IB_QP_CREATE_CROSS_CHANNEL) + qp->flags |= MLX5_IB_QP_CROSS_CHANNEL; + if (init_attr->create_flags & IB_QP_CREATE_MANAGED_SEND) + qp->flags |= MLX5_IB_QP_MANAGED_SEND; + if (init_attr->create_flags & IB_QP_CREATE_MANAGED_RECV) + qp->flags |= MLX5_IB_QP_MANAGED_RECV; + } if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE; @@ -889,6 +1237,11 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, return -EFAULT; } + err = get_qp_user_index(to_mucontext(pd->uobject->context), + &ucmd, udata->inlen, &uidx); + if (err) + return err; + qp->wq_sig = !!(ucmd.flags & MLX5_QP_FLAG_SIGNATURE); qp->scat_cqe = !!(ucmd.flags & MLX5_QP_FLAG_SCATTER_CQE); } else { @@ -918,11 +1271,13 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, ucmd.sq_wqe_count, max_wqes); return -EINVAL; } - err = create_user_qp(dev, pd, qp, udata, &in, &resp, &inlen); + err = create_user_qp(dev, pd, qp, udata, init_attr, &in, + &resp, &inlen, base); if (err) mlx5_ib_dbg(dev, "err %d\n", err); } else { - err = create_kernel_qp(dev, init_attr, qp, &in, &inlen); + err = create_kernel_qp(dev, init_attr, qp, &in, &inlen, + base); if (err) mlx5_ib_dbg(dev, "err %d\n", err); } @@ -954,6 +1309,13 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK) in->ctx.flags_pd |= cpu_to_be32(MLX5_QP_BLOCK_MCAST); + if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL) + in->ctx.params2 |= cpu_to_be32(MLX5_QP_BIT_CC_MASTER); + if (qp->flags & MLX5_IB_QP_MANAGED_SEND) + in->ctx.params2 |= cpu_to_be32(MLX5_QP_BIT_CC_SLAVE_SEND); + if (qp->flags & MLX5_IB_QP_MANAGED_RECV) + in->ctx.params2 |= cpu_to_be32(MLX5_QP_BIT_CC_SLAVE_RECV); + if (qp->scat_cqe && is_connected(init_attr->qp_type)) { int rcqe_sz; int scqe_sz; @@ -1018,26 +1380,35 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, in->ctx.db_rec_addr = cpu_to_be64(qp->db.dma); - err = mlx5_core_create_qp(dev->mdev, &qp->mqp, in, inlen); + if (MLX5_CAP_GEN(mdev, cqe_version) == MLX5_CQE_VERSION_V1) { + qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); + /* 0xffffff means we ask to work with cqe version 0 */ + MLX5_SET(qpc, qpc, user_index, uidx); + } + + if (init_attr->qp_type == IB_QPT_RAW_PACKET) { + qp->raw_packet_qp.sq.ubuffer.buf_addr = ucmd.sq_buf_addr; + raw_packet_qp_copy_info(qp, &qp->raw_packet_qp); + err = create_raw_packet_qp(dev, qp, in, pd); + } else { + err = mlx5_core_create_qp(dev->mdev, &base->mqp, in, inlen); + } + if (err) { mlx5_ib_dbg(dev, "create qp failed\n"); goto err_create; } kvfree(in); - /* Hardware wants QPN written in big-endian order (after - * shifting) for send doorbell. Precompute this value to save - * a little bit when posting sends. 
- */ - qp->doorbell_qpn = swab32(qp->mqp.qpn << 8); - qp->mqp.event = mlx5_ib_qp_event; + base->container_mibqp = qp; + base->mqp.event = mlx5_ib_qp_event; return 0; err_create: if (qp->create_type == MLX5_QP_USER) - destroy_qp_user(pd, qp); + destroy_qp_user(pd, qp, base); else if (qp->create_type == MLX5_QP_KERNEL) destroy_qp_kernel(dev, qp); @@ -1129,11 +1500,11 @@ static void get_cqs(struct mlx5_ib_qp *qp, case IB_QPT_UD: case IB_QPT_RAW_IPV6: case IB_QPT_RAW_ETHERTYPE: + case IB_QPT_RAW_PACKET: *send_cq = to_mcq(qp->ibqp.send_cq); *recv_cq = to_mcq(qp->ibqp.recv_cq); break; - case IB_QPT_RAW_PACKET: case IB_QPT_MAX: default: *send_cq = NULL; @@ -1142,45 +1513,66 @@ static void get_cqs(struct mlx5_ib_qp *qp, } } +static int modify_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + u16 operation); + static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) { struct mlx5_ib_cq *send_cq, *recv_cq; + struct mlx5_ib_qp_base *base = &qp->trans_qp.base; struct mlx5_modify_qp_mbox_in *in; int err; + base = qp->ibqp.qp_type == IB_QPT_RAW_PACKET ? + &qp->raw_packet_qp.rq.base : + &qp->trans_qp.base; + in = kzalloc(sizeof(*in), GFP_KERNEL); if (!in) return; if (qp->state != IB_QPS_RESET) { - mlx5_ib_qp_disable_pagefaults(qp); - if (mlx5_core_qp_modify(dev->mdev, to_mlx5_state(qp->state), - MLX5_QP_STATE_RST, in, 0, &qp->mqp)) - mlx5_ib_warn(dev, "mlx5_ib: modify QP %06x to RESET failed\n", - qp->mqp.qpn); + if (qp->ibqp.qp_type != IB_QPT_RAW_PACKET) { + mlx5_ib_qp_disable_pagefaults(qp); + err = mlx5_core_qp_modify(dev->mdev, + MLX5_CMD_OP_2RST_QP, in, 0, + &base->mqp); + } else { + err = modify_raw_packet_qp(dev, qp, + MLX5_CMD_OP_2RST_QP); + } + if (err) + mlx5_ib_warn(dev, "mlx5_ib: modify QP 0x%06x to RESET failed\n", + base->mqp.qpn); } get_cqs(qp, &send_cq, &recv_cq); if (qp->create_type == MLX5_QP_KERNEL) { mlx5_ib_lock_cqs(send_cq, recv_cq); - __mlx5_ib_cq_clean(recv_cq, qp->mqp.qpn, + __mlx5_ib_cq_clean(recv_cq, base->mqp.qpn, qp->ibqp.srq ? 
to_msrq(qp->ibqp.srq) : NULL); if (send_cq != recv_cq) - __mlx5_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); + __mlx5_ib_cq_clean(send_cq, base->mqp.qpn, + NULL); mlx5_ib_unlock_cqs(send_cq, recv_cq); } - err = mlx5_core_destroy_qp(dev->mdev, &qp->mqp); - if (err) - mlx5_ib_warn(dev, "failed to destroy QP 0x%x\n", qp->mqp.qpn); - kfree(in); + if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) { + destroy_raw_packet_qp(dev, qp); + } else { + err = mlx5_core_destroy_qp(dev->mdev, &base->mqp); + if (err) + mlx5_ib_warn(dev, "failed to destroy QP 0x%x\n", + base->mqp.qpn); + } + kfree(in); if (qp->create_type == MLX5_QP_KERNEL) destroy_qp_kernel(dev, qp); else if (qp->create_type == MLX5_QP_USER) - destroy_qp_user(&get_pd(qp)->ibpd, qp); + destroy_qp_user(&get_pd(qp)->ibpd, qp, base); } static const char *ib_qp_type_str(enum ib_qp_type type) @@ -1225,6 +1617,16 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, if (pd) { dev = to_mdev(pd->device); + + if (init_attr->qp_type == IB_QPT_RAW_PACKET) { + if (!pd->uobject) { + mlx5_ib_dbg(dev, "Raw Packet QP is not supported for kernel consumers\n"); + return ERR_PTR(-EINVAL); + } else if (!to_mucontext(pd->uobject->context)->cqe_version) { + mlx5_ib_dbg(dev, "Raw Packet QP is only supported for CQE version > 0\n"); + return ERR_PTR(-EINVAL); + } + } } else { /* being cautious here */ if (init_attr->qp_type != IB_QPT_XRC_TGT && @@ -1250,6 +1652,7 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, } /* fall through */ + case IB_QPT_RAW_PACKET: case IB_QPT_RC: case IB_QPT_UC: case IB_QPT_UD: @@ -1272,19 +1675,19 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, else if (is_qp1(init_attr->qp_type)) qp->ibqp.qp_num = 1; else - qp->ibqp.qp_num = qp->mqp.qpn; + qp->ibqp.qp_num = qp->trans_qp.base.mqp.qpn; mlx5_ib_dbg(dev, "ib qpnum 0x%x, mlx qpn 0x%x, rcqn 0x%x, scqn 0x%x\n", - qp->ibqp.qp_num, qp->mqp.qpn, to_mcq(init_attr->recv_cq)->mcq.cqn, + qp->ibqp.qp_num, qp->trans_qp.base.mqp.qpn, + to_mcq(init_attr->recv_cq)->mcq.cqn, to_mcq(init_attr->send_cq)->mcq.cqn); - qp->xrcdn = xrcdn; + qp->trans_qp.xrcdn = xrcdn; break; case IB_QPT_RAW_IPV6: case IB_QPT_RAW_ETHERTYPE: - case IB_QPT_RAW_PACKET: case IB_QPT_MAX: default: mlx5_ib_dbg(dev, "unsupported qp type %d\n", @@ -1318,12 +1721,12 @@ static __be32 to_mlx5_access_flags(struct mlx5_ib_qp *qp, const struct ib_qp_att if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) dest_rd_atomic = attr->max_dest_rd_atomic; else - dest_rd_atomic = qp->resp_depth; + dest_rd_atomic = qp->trans_qp.resp_depth; if (attr_mask & IB_QP_ACCESS_FLAGS) access_flags = attr->qp_access_flags; else - access_flags = qp->atomic_rd_en; + access_flags = qp->trans_qp.atomic_rd_en; if (!dest_rd_atomic) access_flags &= IB_ACCESS_REMOTE_WRITE; @@ -1360,21 +1763,42 @@ static int ib_rate_to_mlx5(struct mlx5_ib_dev *dev, u8 rate) return rate + MLX5_STAT_RATE_OFFSET; } -static int mlx5_set_path(struct mlx5_ib_dev *dev, const struct ib_ah_attr *ah, +static int modify_raw_packet_eth_prio(struct mlx5_core_dev *dev, + struct mlx5_ib_sq *sq, u8 sl) +{ + void *in; + void *tisc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(modify_tis_in); + in = mlx5_vzalloc(inlen); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_tis_in, in, bitmask.prio, 1); + + tisc = MLX5_ADDR_OF(modify_tis_in, in, ctx); + MLX5_SET(tisc, tisc, prio, ((sl & 0x7) << 1)); + + err = mlx5_core_modify_tis(dev, sq->tisn, in, inlen); + + kvfree(in); + + return err; +} + +static int mlx5_set_path(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + const struct ib_ah_attr *ah, struct mlx5_qp_path 
*path, u8 port, int attr_mask, u32 path_flags, const struct ib_qp_attr *attr) { + enum rdma_link_layer ll = rdma_port_get_link_layer(&dev->ib_dev, port); int err; - path->fl = (path_flags & MLX5_PATH_FLAG_FL) ? 0x80 : 0; - path->free_ar = (path_flags & MLX5_PATH_FLAG_FREE_AR) ? 0x80 : 0; - if (attr_mask & IB_QP_PKEY_INDEX) path->pkey_index = attr->pkey_index; - path->grh_mlid = ah->src_path_bits & 0x7f; - path->rlid = cpu_to_be16(ah->dlid); - if (ah->ah_flags & IB_AH_GRH) { if (ah->grh.sgid_index >= dev->mdev->port_caps[port - 1].gid_table_len) { @@ -1383,7 +1807,27 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, const struct ib_ah_attr *ah, dev->mdev->port_caps[port - 1].gid_table_len); return -EINVAL; } - path->grh_mlid |= 1 << 7; + } + + if (ll == IB_LINK_LAYER_ETHERNET) { + if (!(ah->ah_flags & IB_AH_GRH)) + return -EINVAL; + memcpy(path->rmac, ah->dmac, sizeof(ah->dmac)); + path->udp_sport = mlx5_get_roce_udp_sport(dev, port, + ah->grh.sgid_index); + path->dci_cfi_prio_sl = (ah->sl & 0x7) << 4; + } else { + path->fl = (path_flags & MLX5_PATH_FLAG_FL) ? 0x80 : 0; + path->free_ar = (path_flags & MLX5_PATH_FLAG_FREE_AR) ? 0x80 : + 0; + path->rlid = cpu_to_be16(ah->dlid); + path->grh_mlid = ah->src_path_bits & 0x7f; + if (ah->ah_flags & IB_AH_GRH) + path->grh_mlid |= 1 << 7; + path->dci_cfi_prio_sl = ah->sl & 0xf; + } + + if (ah->ah_flags & IB_AH_GRH) { path->mgid_index = ah->grh.sgid_index; path->hop_limit = ah->grh.hop_limit; path->tclass_flowlabel = @@ -1401,7 +1845,10 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, const struct ib_ah_attr *ah, if (attr_mask & IB_QP_TIMEOUT) path->ackto_lt = attr->timeout << 3; - path->sl = ah->sl & 0xf; + if ((qp->ibqp.qp_type == IB_QPT_RAW_PACKET) && qp->sq.wqe_cnt) + return modify_raw_packet_eth_prio(dev->mdev, + &qp->raw_packet_qp.sq, + ah->sl & 0xf); return 0; } @@ -1549,12 +1996,154 @@ static int ib_mask_to_mlx5_opt(int ib_mask) return result; } +static int modify_raw_packet_qp_rq(struct mlx5_core_dev *dev, + struct mlx5_ib_rq *rq, int new_state) +{ + void *in; + void *rqc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(modify_rq_in); + in = mlx5_vzalloc(inlen); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_rq_in, in, rq_state, rq->state); + + rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx); + MLX5_SET(rqc, rqc, state, new_state); + + err = mlx5_core_modify_rq(dev, rq->base.mqp.qpn, in, inlen); + if (err) + goto out; + + rq->state = new_state; + +out: + kvfree(in); + return err; +} + +static int modify_raw_packet_qp_sq(struct mlx5_core_dev *dev, + struct mlx5_ib_sq *sq, int new_state) +{ + void *in; + void *sqc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(modify_sq_in); + in = mlx5_vzalloc(inlen); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_sq_in, in, sq_state, sq->state); + + sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx); + MLX5_SET(sqc, sqc, state, new_state); + + err = mlx5_core_modify_sq(dev, sq->base.mqp.qpn, in, inlen); + if (err) + goto out; + + sq->state = new_state; + +out: + kvfree(in); + return err; +} + +static int modify_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + u16 operation) +{ + struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp; + struct mlx5_ib_rq *rq = &raw_packet_qp->rq; + struct mlx5_ib_sq *sq = &raw_packet_qp->sq; + int rq_state; + int sq_state; + int err; + + switch (operation) { + case MLX5_CMD_OP_RST2INIT_QP: + rq_state = MLX5_RQC_STATE_RDY; + sq_state = MLX5_SQC_STATE_RDY; + break; + case MLX5_CMD_OP_2ERR_QP: + rq_state = MLX5_RQC_STATE_ERR; + sq_state = 
MLX5_SQC_STATE_ERR; + break; + case MLX5_CMD_OP_2RST_QP: + rq_state = MLX5_RQC_STATE_RST; + sq_state = MLX5_SQC_STATE_RST; + break; + case MLX5_CMD_OP_INIT2INIT_QP: + case MLX5_CMD_OP_INIT2RTR_QP: + case MLX5_CMD_OP_RTR2RTS_QP: + case MLX5_CMD_OP_RTS2RTS_QP: + /* Nothing to do here... */ + return 0; + default: + WARN_ON(1); + return -EINVAL; + } + + if (qp->rq.wqe_cnt) { + err = modify_raw_packet_qp_rq(dev->mdev, rq, rq_state); + if (err) + return err; + } + + if (qp->sq.wqe_cnt) + return modify_raw_packet_qp_sq(dev->mdev, sq, sq_state); + + return 0; +} + static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, int attr_mask, enum ib_qp_state cur_state, enum ib_qp_state new_state) { + static const u16 optab[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE] = { + [MLX5_QP_STATE_RST] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_INIT] = MLX5_CMD_OP_RST2INIT_QP, + }, + [MLX5_QP_STATE_INIT] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_INIT] = MLX5_CMD_OP_INIT2INIT_QP, + [MLX5_QP_STATE_RTR] = MLX5_CMD_OP_INIT2RTR_QP, + }, + [MLX5_QP_STATE_RTR] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_RTR2RTS_QP, + }, + [MLX5_QP_STATE_RTS] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_RTS2RTS_QP, + }, + [MLX5_QP_STATE_SQD] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + }, + [MLX5_QP_STATE_SQER] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_SQERR2RTS_QP, + }, + [MLX5_QP_STATE_ERR] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + } + }; + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_ib_qp_base *base = &qp->trans_qp.base; struct mlx5_ib_cq *send_cq, *recv_cq; struct mlx5_qp_context *context; struct mlx5_modify_qp_mbox_in *in; @@ -1564,6 +2153,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, int sqd_event; int mlx5_st; int err; + u16 op; in = kzalloc(sizeof(*in), GFP_KERNEL); if (!in) @@ -1623,7 +2213,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, context->pri_path.port = attr->port_num; if (attr_mask & IB_QP_AV) { - err = mlx5_set_path(dev, &attr->ah_attr, &context->pri_path, + err = mlx5_set_path(dev, qp, &attr->ah_attr, &context->pri_path, attr_mask & IB_QP_PORT ? attr->port_num : qp->port, attr_mask, 0, attr); if (err) @@ -1634,7 +2224,8 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, context->pri_path.ackto_lt |= attr->timeout << 3; if (attr_mask & IB_QP_ALT_PATH) { - err = mlx5_set_path(dev, &attr->alt_ah_attr, &context->alt_path, + err = mlx5_set_path(dev, qp, &attr->alt_ah_attr, + &context->alt_path, attr->alt_port_num, attr_mask, 0, attr); if (err) goto out; @@ -1706,41 +2297,51 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, * again to RTS, and may cause the driver and the device to get out of * sync. 
*/ if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR && - (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR)) + (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR) && + (qp->ibqp.qp_type != IB_QPT_RAW_PACKET)) mlx5_ib_qp_disable_pagefaults(qp); + if (mlx5_cur >= MLX5_QP_NUM_STATE || mlx5_new >= MLX5_QP_NUM_STATE || + !optab[mlx5_cur][mlx5_new]) + goto out; + + op = optab[mlx5_cur][mlx5_new]; optpar = ib_mask_to_mlx5_opt(attr_mask); optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st]; in->optparam = cpu_to_be32(optpar); - err = mlx5_core_qp_modify(dev->mdev, to_mlx5_state(cur_state), - to_mlx5_state(new_state), in, sqd_event, - &qp->mqp); + + if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) + err = modify_raw_packet_qp(dev, qp, op); + else + err = mlx5_core_qp_modify(dev->mdev, op, in, sqd_event, + &base->mqp); if (err) goto out; - if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) + if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT && + (qp->ibqp.qp_type != IB_QPT_RAW_PACKET)) mlx5_ib_qp_enable_pagefaults(qp); qp->state = new_state; if (attr_mask & IB_QP_ACCESS_FLAGS) - qp->atomic_rd_en = attr->qp_access_flags; + qp->trans_qp.atomic_rd_en = attr->qp_access_flags; if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) - qp->resp_depth = attr->max_dest_rd_atomic; + qp->trans_qp.resp_depth = attr->max_dest_rd_atomic; if (attr_mask & IB_QP_PORT) qp->port = attr->port_num; if (attr_mask & IB_QP_ALT_PATH) - qp->alt_port = attr->alt_port_num; + qp->trans_qp.alt_port = attr->alt_port_num; /* * If we moved a kernel QP to RESET, clean up all old CQ * entries and reinitialize the QP. */ if (new_state == IB_QPS_RESET && !ibqp->uobject) { - mlx5_ib_cq_clean(recv_cq, qp->mqp.qpn, + mlx5_ib_cq_clean(recv_cq, base->mqp.qpn, ibqp->srq ? to_msrq(ibqp->srq) : NULL); if (send_cq != recv_cq) - mlx5_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); + mlx5_ib_cq_clean(send_cq, base->mqp.qpn, NULL); qp->rq.head = 0; qp->rq.tail = 0; @@ -1765,15 +2366,21 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, enum ib_qp_state cur_state, new_state; int err = -EINVAL; int port; + enum rdma_link_layer ll = IB_LINK_LAYER_UNSPECIFIED; mutex_lock(&qp->mutex); cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state; new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; + if (!(cur_state == new_state && cur_state == IB_QPS_RESET)) { + port = attr_mask & IB_QP_PORT ? 
attr->port_num : qp->port; + ll = dev->ib_dev.get_link_layer(&dev->ib_dev, port); + } + if (ibqp->qp_type != MLX5_IB_QPT_REG_UMR && !ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask, - IB_LINK_LAYER_UNSPECIFIED)) + ll)) goto out; if ((attr_mask & IB_QP_PORT) && @@ -2570,7 +3177,7 @@ static void finish_wqe(struct mlx5_ib_qp *qp, ctrl->opmod_idx_opcode = cpu_to_be32(((u32)(qp->sq.cur_post) << 8) | mlx5_opcode | ((u32)opmod << 24)); - ctrl->qpn_ds = cpu_to_be32(size | (qp->mqp.qpn << 8)); + ctrl->qpn_ds = cpu_to_be32(size | (qp->trans_qp.base.mqp.qpn << 8)); ctrl->fm_ce_se |= fence; qp->fm_cache = next_fence; if (unlikely(qp->wq_sig)) @@ -3003,7 +3610,7 @@ static void to_ib_ah_attr(struct mlx5_ib_dev *ibdev, struct ib_ah_attr *ib_ah_at ib_ah_attr->port_num > MLX5_CAP_GEN(dev, num_ports)) return; - ib_ah_attr->sl = path->sl & 0xf; + ib_ah_attr->sl = path->dci_cfi_prio_sl & 0xf; ib_ah_attr->dlid = be16_to_cpu(path->rlid); ib_ah_attr->src_path_bits = path->grh_mlid & 0x7f; @@ -3021,39 +3628,153 @@ static void to_ib_ah_attr(struct mlx5_ib_dev *ibdev, struct ib_ah_attr *ib_ah_at } } -int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, - struct ib_qp_init_attr *qp_init_attr) +static int query_raw_packet_qp_sq_state(struct mlx5_ib_dev *dev, + struct mlx5_ib_sq *sq, + u8 *sq_state) +{ + void *out; + void *sqc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(query_sq_out); + out = mlx5_vzalloc(inlen); + if (!out) + return -ENOMEM; + + err = mlx5_core_query_sq(dev->mdev, sq->base.mqp.qpn, out); + if (err) + goto out; + + sqc = MLX5_ADDR_OF(query_sq_out, out, sq_context); + *sq_state = MLX5_GET(sqc, sqc, state); + sq->state = *sq_state; + +out: + kvfree(out); + return err; +} + +static int query_raw_packet_qp_rq_state(struct mlx5_ib_dev *dev, + struct mlx5_ib_rq *rq, + u8 *rq_state) +{ + void *out; + void *rqc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(query_rq_out); + out = mlx5_vzalloc(inlen); + if (!out) + return -ENOMEM; + + err = mlx5_core_query_rq(dev->mdev, rq->base.mqp.qpn, out); + if (err) + goto out; + + rqc = MLX5_ADDR_OF(query_rq_out, out, rq_context); + *rq_state = MLX5_GET(rqc, rqc, state); + rq->state = *rq_state; + +out: + kvfree(out); + return err; +} + +static int sqrq_state_to_qp_state(u8 sq_state, u8 rq_state, + struct mlx5_ib_qp *qp, u8 *qp_state) +{ + static const u8 sqrq_trans[MLX5_RQ_NUM_STATE][MLX5_SQ_NUM_STATE] = { + [MLX5_RQC_STATE_RST] = { + [MLX5_SQC_STATE_RST] = IB_QPS_RESET, + [MLX5_SQC_STATE_RDY] = MLX5_QP_STATE_BAD, + [MLX5_SQC_STATE_ERR] = MLX5_QP_STATE_BAD, + [MLX5_SQ_STATE_NA] = IB_QPS_RESET, + }, + [MLX5_RQC_STATE_RDY] = { + [MLX5_SQC_STATE_RST] = MLX5_QP_STATE_BAD, + [MLX5_SQC_STATE_RDY] = MLX5_QP_STATE, + [MLX5_SQC_STATE_ERR] = IB_QPS_SQE, + [MLX5_SQ_STATE_NA] = MLX5_QP_STATE, + }, + [MLX5_RQC_STATE_ERR] = { + [MLX5_SQC_STATE_RST] = MLX5_QP_STATE_BAD, + [MLX5_SQC_STATE_RDY] = MLX5_QP_STATE_BAD, + [MLX5_SQC_STATE_ERR] = IB_QPS_ERR, + [MLX5_SQ_STATE_NA] = IB_QPS_ERR, + }, + [MLX5_RQ_STATE_NA] = { + [MLX5_SQC_STATE_RST] = IB_QPS_RESET, + [MLX5_SQC_STATE_RDY] = MLX5_QP_STATE, + [MLX5_SQC_STATE_ERR] = MLX5_QP_STATE, + [MLX5_SQ_STATE_NA] = MLX5_QP_STATE_BAD, + }, + }; + + *qp_state = sqrq_trans[rq_state][sq_state]; + + if (*qp_state == MLX5_QP_STATE_BAD) { + WARN(1, "Buggy Raw Packet QP state, SQ 0x%x state: 0x%x, RQ 0x%x state: 0x%x", + qp->raw_packet_qp.sq.base.mqp.qpn, sq_state, + qp->raw_packet_qp.rq.base.mqp.qpn, rq_state); + return -EINVAL; + } + + if (*qp_state == MLX5_QP_STATE) + *qp_state = 
qp->state; + + return 0; +} + +static int query_raw_packet_qp_state(struct mlx5_ib_dev *dev, + struct mlx5_ib_qp *qp, + u8 *raw_packet_qp_state) +{ + struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp; + struct mlx5_ib_sq *sq = &raw_packet_qp->sq; + struct mlx5_ib_rq *rq = &raw_packet_qp->rq; + int err; + u8 sq_state = MLX5_SQ_STATE_NA; + u8 rq_state = MLX5_RQ_STATE_NA; + + if (qp->sq.wqe_cnt) { + err = query_raw_packet_qp_sq_state(dev, sq, &sq_state); + if (err) + return err; + } + + if (qp->rq.wqe_cnt) { + err = query_raw_packet_qp_rq_state(dev, rq, &rq_state); + if (err) + return err; + } + + return sqrq_state_to_qp_state(sq_state, rq_state, qp, + raw_packet_qp_state); +} + +static int query_qp_attr(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + struct ib_qp_attr *qp_attr) { - struct mlx5_ib_dev *dev = to_mdev(ibqp->device); - struct mlx5_ib_qp *qp = to_mqp(ibqp); struct mlx5_query_qp_mbox_out *outb; struct mlx5_qp_context *context; int mlx5_state; int err = 0; -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - /* - * Wait for any outstanding page faults, in case the user frees memory - * based upon this query's result. - */ - flush_workqueue(mlx5_ib_page_fault_wq); -#endif - - mutex_lock(&qp->mutex); outb = kzalloc(sizeof(*outb), GFP_KERNEL); - if (!outb) { - err = -ENOMEM; - goto out; - } + if (!outb) + return -ENOMEM; + context = &outb->ctx; - err = mlx5_core_qp_query(dev->mdev, &qp->mqp, outb, sizeof(*outb)); + err = mlx5_core_qp_query(dev->mdev, &qp->trans_qp.base.mqp, outb, + sizeof(*outb)); if (err) - goto out_free; + goto out; mlx5_state = be32_to_cpu(context->flags) >> 28; qp->state = to_ib_qp_state(mlx5_state); - qp_attr->qp_state = qp->state; qp_attr->path_mtu = context->mtu_msgmax >> 5; qp_attr->path_mig_state = to_ib_mig_state((be32_to_cpu(context->flags) >> 11) & 0x3); @@ -3087,6 +3808,43 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr qp_attr->retry_cnt = (be32_to_cpu(context->params1) >> 16) & 0x7; qp_attr->rnr_retry = (be32_to_cpu(context->params1) >> 13) & 0x7; qp_attr->alt_timeout = context->alt_path.ackto_lt >> 3; + +out: + kfree(outb); + return err; +} + +int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) +{ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_ib_qp *qp = to_mqp(ibqp); + int err = 0; + u8 raw_packet_qp_state; + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + /* + * Wait for any outstanding page faults, in case the user frees memory + * based upon this query's result. 
+ */ + flush_workqueue(mlx5_ib_page_fault_wq); +#endif + + mutex_lock(&qp->mutex); + + if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) { + err = query_raw_packet_qp_state(dev, qp, &raw_packet_qp_state); + if (err) + goto out; + qp->state = raw_packet_qp_state; + qp_attr->port_num = 1; + } else { + err = query_qp_attr(dev, qp, qp_attr); + if (err) + goto out; + } + + qp_attr->qp_state = qp->state; qp_attr->cur_qp_state = qp_attr->qp_state; qp_attr->cap.max_recv_wr = qp->rq.wqe_cnt; qp_attr->cap.max_recv_sge = qp->rq.max_gs; @@ -3110,12 +3868,16 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK) qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK; + if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL) + qp_init_attr->create_flags |= IB_QP_CREATE_CROSS_CHANNEL; + if (qp->flags & MLX5_IB_QP_MANAGED_SEND) + qp_init_attr->create_flags |= IB_QP_CREATE_MANAGED_SEND; + if (qp->flags & MLX5_IB_QP_MANAGED_RECV) + qp_init_attr->create_flags |= IB_QP_CREATE_MANAGED_RECV; + qp_init_attr->sq_sig_type = qp->sq_signal_bits & MLX5_WQE_CTRL_CQ_UPDATE ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; -out_free: - kfree(outb); - out: mutex_unlock(&qp->mutex); return err; diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c index e008505e9..3b2ddd64a 100644 --- a/drivers/infiniband/hw/mlx5/srq.c +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -75,31 +75,42 @@ static void mlx5_ib_srq_event(struct mlx5_core_srq *srq, enum mlx5_event type) static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, struct mlx5_create_srq_mbox_in **in, - struct ib_udata *udata, int buf_size, int *inlen) + struct ib_udata *udata, int buf_size, int *inlen, + int is_xrc) { struct mlx5_ib_dev *dev = to_mdev(pd->device); - struct mlx5_ib_create_srq ucmd; + struct mlx5_ib_create_srq ucmd = {}; size_t ucmdlen; + void *xsrqc; int err; int npages; int page_shift; int ncont; u32 offset; + u32 uidx = MLX5_IB_DEFAULT_UIDX; - ucmdlen = - (udata->inlen - sizeof(struct ib_uverbs_cmd_hdr) < - sizeof(ucmd)) ? 
(sizeof(ucmd) - - sizeof(ucmd.reserved)) : sizeof(ucmd); + ucmdlen = min(udata->inlen, sizeof(ucmd)); if (ib_copy_from_udata(&ucmd, udata, ucmdlen)) { mlx5_ib_dbg(dev, "failed copy udata\n"); return -EFAULT; } - if (ucmdlen == sizeof(ucmd) && - ucmd.reserved != 0) + if (ucmd.reserved0 || ucmd.reserved1) return -EINVAL; + if (udata->inlen > sizeof(ucmd) && + !ib_is_udata_cleared(udata, sizeof(ucmd), + udata->inlen - sizeof(ucmd))) + return -EINVAL; + + if (is_xrc) { + err = get_srq_user_index(to_mucontext(pd->uobject->context), + &ucmd, udata->inlen, &uidx); + if (err) + return err; + } + srq->wq_sig = !!(ucmd.flags & MLX5_SRQ_FLAG_SIGNATURE); srq->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, buf_size, @@ -138,6 +149,13 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; (*in)->ctx.pgoff_cqn = cpu_to_be32(offset << 26); + if ((MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1) && + is_xrc){ + xsrqc = MLX5_ADDR_OF(create_xrc_srq_in, *in, + xrc_srq_context_entry); + MLX5_SET(xrc_srqc, xsrqc, user_index, uidx); + } + return 0; err_in: @@ -151,13 +169,14 @@ err_umem: static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq, struct mlx5_create_srq_mbox_in **in, int buf_size, - int *inlen) + int *inlen, int is_xrc) { int err; int i; struct mlx5_wqe_srq_next_seg *next; int page_shift; int npages; + void *xsrqc; err = mlx5_db_alloc(dev->mdev, &srq->db); if (err) { @@ -204,6 +223,14 @@ static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq, (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; + if ((MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1) && + is_xrc){ + xsrqc = MLX5_ADDR_OF(create_xrc_srq_in, *in, + xrc_srq_context_entry); + /* 0xffffff means we ask to work with cqe version 0 */ + MLX5_SET(xrc_srqc, xsrqc, user_index, MLX5_IB_DEFAULT_UIDX); + } + return 0; err_in: @@ -275,10 +302,14 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd, desc_size, init_attr->attr.max_wr, srq->msrq.max, srq->msrq.max_gs, srq->msrq.max_avail_gather); + is_xrc = (init_attr->srq_type == IB_SRQT_XRC); + if (pd->uobject) - err = create_srq_user(pd, srq, &in, udata, buf_size, &inlen); + err = create_srq_user(pd, srq, &in, udata, buf_size, &inlen, + is_xrc); else - err = create_srq_kernel(dev, srq, &in, buf_size, &inlen); + err = create_srq_kernel(dev, srq, &in, buf_size, &inlen, + is_xrc); if (err) { mlx5_ib_warn(dev, "create srq %s failed, err %d\n", @@ -286,7 +317,6 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd, goto err_srq; } - is_xrc = (init_attr->srq_type == IB_SRQT_XRC); in->ctx.state_log_sz = ilog2(srq->msrq.max); flgs = ((srq->msrq.wqe_shift - 4) | (is_xrc << 5) | (srq->wq_sig << 7)) << 24; xrcdn = 0; diff --git a/drivers/infiniband/hw/mlx5/user.h b/drivers/infiniband/hw/mlx5/user.h index 76fb7b927..b94a55404 100644 --- a/drivers/infiniband/hw/mlx5/user.h +++ b/drivers/infiniband/hw/mlx5/user.h @@ -35,6 +35,8 @@ #include <linux/types.h> +#include "mlx5_ib.h" + enum { MLX5_QP_FLAG_SIGNATURE = 1 << 0, MLX5_QP_FLAG_SCATTER_CQE = 1 << 1, @@ -66,7 +68,15 @@ struct mlx5_ib_alloc_ucontext_req_v2 { __u32 total_num_uuars; __u32 num_low_latency_uuars; __u32 flags; - __u32 reserved; + __u32 comp_mask; + __u8 max_cqe_version; + __u8 reserved0; + __u16 reserved1; + __u32 reserved2; +}; + +enum mlx5_ib_alloc_ucontext_resp_mask { + MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET = 1UL << 0, }; struct mlx5_ib_alloc_ucontext_resp { @@ -80,7 
+90,13 @@ struct mlx5_ib_alloc_ucontext_resp { __u32 max_recv_wr; __u32 max_srq_recv_wr; __u16 num_ports; - __u16 reserved; + __u16 reserved1; + __u32 comp_mask; + __u32 response_length; + __u8 cqe_version; + __u8 reserved2; + __u16 reserved3; + __u64 hca_core_clock_offset; }; struct mlx5_ib_alloc_pd_resp { @@ -110,7 +126,9 @@ struct mlx5_ib_create_srq { __u64 buf_addr; __u64 db_addr; __u32 flags; - __u32 reserved; /* explicit padding (optional on i386) */ + __u32 reserved0; /* explicit padding (optional on i386) */ + __u32 uidx; + __u32 reserved1; }; struct mlx5_ib_create_srq_resp { @@ -125,9 +143,48 @@ struct mlx5_ib_create_qp { __u32 rq_wqe_count; __u32 rq_wqe_shift; __u32 flags; + __u32 uidx; + __u32 reserved0; + __u64 sq_buf_addr; }; struct mlx5_ib_create_qp_resp { __u32 uuar_index; }; + +static inline int get_qp_user_index(struct mlx5_ib_ucontext *ucontext, + struct mlx5_ib_create_qp *ucmd, + int inlen, + u32 *user_index) +{ + u8 cqe_version = ucontext->cqe_version; + + if (field_avail(struct mlx5_ib_create_qp, uidx, inlen) && + !cqe_version && (ucmd->uidx == MLX5_IB_DEFAULT_UIDX)) + return 0; + + if (!!(field_avail(struct mlx5_ib_create_qp, uidx, inlen) != + !!cqe_version)) + return -EINVAL; + + return verify_assign_uidx(cqe_version, ucmd->uidx, user_index); +} + +static inline int get_srq_user_index(struct mlx5_ib_ucontext *ucontext, + struct mlx5_ib_create_srq *ucmd, + int inlen, + u32 *user_index) +{ + u8 cqe_version = ucontext->cqe_version; + + if (field_avail(struct mlx5_ib_create_srq, uidx, inlen) && + !cqe_version && (ucmd->uidx == MLX5_IB_DEFAULT_UIDX)) + return 0; + + if (!!(field_avail(struct mlx5_ib_create_srq, uidx, inlen) != + !!cqe_version)) + return -EINVAL; + + return verify_assign_uidx(cqe_version, ucmd->uidx, user_index); +} #endif /* MLX5_IB_USER_H */ |
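Editor's note: the two inline helpers added above gate the new uidx field on the context's CQE version: the field may be absent (older userspace), present but left at the default while CQE version 0 is in use, or required when CQE version 1 is active. A self-contained sketch of that validation follows; the range check inside check_uidx() is only an assumption about what verify_assign_uidx() does, since its body lives in mlx5_ib.h and is not part of this diff.

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

#define DEFAULT_UIDX 0xffffff	/* mirrors MLX5_IB_DEFAULT_UIDX */
#define UIDX_MAX     0xffffff	/* assumed upper bound for this sketch */

struct create_cmd {		/* trimmed stand-in for mlx5_ib_create_qp */
	uint32_t flags;
	uint32_t uidx;
};

/* Same idea as the driver's field_avail(): does the command the user
 * actually passed in extend far enough to cover this field? */
#define field_avail(type, fld, sz) \
	(offsetof(type, fld) + sizeof(((type *)0)->fld) <= (size_t)(sz))

static int check_uidx(uint8_t cqe_version, const struct create_cmd *cmd,
		      size_t inlen, uint32_t *user_index)
{
	int have_uidx = field_avail(struct create_cmd, uidx, inlen);

	/* Newer library on a CQE-version-0 context: uidx is present but
	 * still the default value -- nothing to validate or assign. */
	if (have_uidx && !cqe_version && cmd->uidx == DEFAULT_UIDX)
		return 0;

	/* Otherwise the field must be present exactly when CQE v1 is in use. */
	if (!!have_uidx != !!cqe_version)
		return -1;

	if (!cqe_version) {
		*user_index = DEFAULT_UIDX;	/* assumed fallback */
		return 0;
	}
	if (cmd->uidx > UIDX_MAX)		/* assumed range check */
		return -1;
	*user_index = cmd->uidx;
	return 0;
}

int main(void)
{
	struct create_cmd cmd = { .uidx = 42 };
	uint32_t idx = DEFAULT_UIDX;

	printf("cqe v1, full cmd : %d (uidx %u)\n",
	       check_uidx(1, &cmd, sizeof(cmd), &idx), idx);
	printf("cqe v0, short cmd: %d\n",
	       check_uidx(0, &cmd, sizeof(cmd.flags), &idx));
	return 0;
}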