From 863981e96738983919de841ec669e157e6bdaeb0 Mon Sep 17 00:00:00 2001
From: André Fabian Silva Delgado
Date: Sun, 11 Sep 2016 04:34:46 -0300
Subject: Linux-libre 4.7.1-gnu

---
 .../staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c    | 405 ++++++++++++++++-----
 .../staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h    | 134 ++++---
 .../staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c | 101 ++---
 .../lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c  | 139 +++++--
 .../staging/lustre/lnet/klnds/socklnd/socklnd.c    |   1 -
 .../lustre/lnet/klnds/socklnd/socklnd_lib.c        |   3 -
 6 files changed, 559 insertions(+), 224 deletions(-)

diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c
index 0d32e6541..6c59f2ff2 100644
--- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c
+++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c
@@ -335,8 +335,8 @@ int kiblnd_create_peer(lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid)
 	peer->ibp_nid = nid;
 	peer->ibp_error = 0;
 	peer->ibp_last_alive = 0;
-	peer->ibp_max_frags = IBLND_CFG_RDMA_FRAGS;
-	peer->ibp_queue_depth = *kiblnd_tunables.kib_peertxcredits;
+	peer->ibp_max_frags = kiblnd_cfg_rdma_frags(peer->ibp_ni);
+	peer->ibp_queue_depth = ni->ni_peertxcredits;
 	atomic_set(&peer->ibp_refcount, 1);	/* 1 ref for caller */
 
 	INIT_LIST_HEAD(&peer->ibp_list);	/* not in the peer table yet */
@@ -1283,65 +1283,86 @@ static void kiblnd_map_tx_pool(kib_tx_pool_t *tpo)
 	}
 }
 
-struct ib_mr *kiblnd_find_rd_dma_mr(kib_hca_dev_t *hdev, kib_rdma_desc_t *rd,
+struct ib_mr *kiblnd_find_rd_dma_mr(struct lnet_ni *ni, kib_rdma_desc_t *rd,
 				    int negotiated_nfrags)
 {
-	__u16 nfrags = (negotiated_nfrags != -1) ?
-		negotiated_nfrags : *kiblnd_tunables.kib_map_on_demand;
+	kib_net_t *net = ni->ni_data;
+	kib_hca_dev_t *hdev = net->ibn_dev->ibd_hdev;
+	struct lnet_ioctl_config_o2iblnd_tunables *tunables;
+	__u16 nfrags;
+	int mod;
+
+	tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib;
+	mod = tunables->lnd_map_on_demand;
+	nfrags = (negotiated_nfrags != -1) ?
+		negotiated_nfrags : mod;
 
 	LASSERT(hdev->ibh_mrs);
 
-	if (*kiblnd_tunables.kib_map_on_demand > 0 &&
-	    nfrags <= rd->rd_nfrags)
+	if (mod > 0 && nfrags <= rd->rd_nfrags)
 		return NULL;
 
 	return hdev->ibh_mrs;
 }
 
-static void kiblnd_destroy_fmr_pool(kib_fmr_pool_t *pool)
+static void kiblnd_destroy_fmr_pool(kib_fmr_pool_t *fpo)
 {
-	LASSERT(!pool->fpo_map_count);
+	LASSERT(!fpo->fpo_map_count);
 
-	if (pool->fpo_fmr_pool)
-		ib_destroy_fmr_pool(pool->fpo_fmr_pool);
+	if (fpo->fpo_is_fmr) {
+		if (fpo->fmr.fpo_fmr_pool)
+			ib_destroy_fmr_pool(fpo->fmr.fpo_fmr_pool);
+	} else {
+		struct kib_fast_reg_descriptor *frd, *tmp;
+		int i = 0;
+
+		list_for_each_entry_safe(frd, tmp, &fpo->fast_reg.fpo_pool_list,
+					 frd_list) {
+			list_del(&frd->frd_list);
+			ib_dereg_mr(frd->frd_mr);
+			LIBCFS_FREE(frd, sizeof(*frd));
+			i++;
+		}
+		if (i < fpo->fast_reg.fpo_pool_size)
+			CERROR("FastReg pool still has %d regions registered\n",
+			       fpo->fast_reg.fpo_pool_size - i);
+	}
 
-	if (pool->fpo_hdev)
-		kiblnd_hdev_decref(pool->fpo_hdev);
+	if (fpo->fpo_hdev)
+		kiblnd_hdev_decref(fpo->fpo_hdev);
 
-	LIBCFS_FREE(pool, sizeof(*pool));
+	LIBCFS_FREE(fpo, sizeof(*fpo));
 }
 
 static void kiblnd_destroy_fmr_pool_list(struct list_head *head)
 {
-	kib_fmr_pool_t *pool;
+	kib_fmr_pool_t *fpo, *tmp;
 
-	while (!list_empty(head)) {
-		pool = list_entry(head->next, kib_fmr_pool_t, fpo_list);
-		list_del(&pool->fpo_list);
-		kiblnd_destroy_fmr_pool(pool);
+	list_for_each_entry_safe(fpo, tmp, head, fpo_list) {
+		list_del(&fpo->fpo_list);
+		kiblnd_destroy_fmr_pool(fpo);
 	}
 }
 
-static int kiblnd_fmr_pool_size(int ncpts)
+static int
+kiblnd_fmr_pool_size(struct lnet_ioctl_config_o2iblnd_tunables *tunables,
+		     int ncpts)
 {
-	int size = *kiblnd_tunables.kib_fmr_pool_size / ncpts;
+	int size = tunables->lnd_fmr_pool_size / ncpts;
 
 	return max(IBLND_FMR_POOL, size);
 }
 
-static int kiblnd_fmr_flush_trigger(int ncpts)
+static int
+kiblnd_fmr_flush_trigger(struct lnet_ioctl_config_o2iblnd_tunables *tunables,
+			 int ncpts)
 {
-	int size = *kiblnd_tunables.kib_fmr_flush_trigger / ncpts;
+	int size = tunables->lnd_fmr_flush_trigger / ncpts;
 
 	return max(IBLND_FMR_POOL_FLUSH, size);
 }
 
-static int kiblnd_create_fmr_pool(kib_fmr_poolset_t *fps,
-				  kib_fmr_pool_t **pp_fpo)
+static int kiblnd_alloc_fmr_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t *fpo)
 {
-	/* FMR pool for RDMA */
-	kib_dev_t *dev = fps->fps_net->ibn_dev;
-	kib_fmr_pool_t *fpo;
 	struct ib_fmr_pool_param param = {
 		.max_pages_per_fmr = LNET_MAX_PAYLOAD / PAGE_SIZE,
 		.page_shift        = PAGE_SHIFT,
@@ -1351,7 +1372,78 @@ static int kiblnd_create_fmr_pool(kib_fmr_poolset_t *fps,
 		.dirty_watermark   = fps->fps_flush_trigger,
 		.flush_function    = NULL,
 		.flush_arg         = NULL,
-		.cache             = !!*kiblnd_tunables.kib_fmr_cache};
+		.cache             = !!fps->fps_cache };
+	int rc = 0;
+
+	fpo->fmr.fpo_fmr_pool = ib_create_fmr_pool(fpo->fpo_hdev->ibh_pd,
+						   &param);
+	if (IS_ERR(fpo->fmr.fpo_fmr_pool)) {
+		rc = PTR_ERR(fpo->fmr.fpo_fmr_pool);
+		if (rc != -ENOSYS)
+			CERROR("Failed to create FMR pool: %d\n", rc);
+		else
+			CERROR("FMRs are not supported\n");
+	}
+
+	return rc;
+}
+
+static int kiblnd_alloc_freg_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t *fpo)
+{
+	struct kib_fast_reg_descriptor *frd, *tmp;
+	int i, rc;
+
+	INIT_LIST_HEAD(&fpo->fast_reg.fpo_pool_list);
+	fpo->fast_reg.fpo_pool_size = 0;
+	for (i = 0; i < fps->fps_pool_size; i++) {
+		LIBCFS_CPT_ALLOC(frd, lnet_cpt_table(), fps->fps_cpt,
+				 sizeof(*frd));
+		if (!frd) {
+			CERROR("Failed to allocate a new fast_reg descriptor\n");
+			rc = -ENOMEM;
+			goto out;
+		}
+
+		frd->frd_mr = ib_alloc_mr(fpo->fpo_hdev->ibh_pd,
+					  IB_MR_TYPE_MEM_REG,
+					  LNET_MAX_PAYLOAD / PAGE_SIZE);
+		if (IS_ERR(frd->frd_mr)) {
+			rc = PTR_ERR(frd->frd_mr);
+			CERROR("Failed to allocate ib_alloc_mr: %d\n", rc);
+			frd->frd_mr = NULL;
+			goto out_middle;
+		}
+
+		frd->frd_valid = true;
+
+		list_add_tail(&frd->frd_list, &fpo->fast_reg.fpo_pool_list);
+		fpo->fast_reg.fpo_pool_size++;
+	}
+
+	return 0;
+
+out_middle:
+	if (frd->frd_mr)
+		ib_dereg_mr(frd->frd_mr);
+	LIBCFS_FREE(frd, sizeof(*frd));
+
+out:
+	list_for_each_entry_safe(frd, tmp, &fpo->fast_reg.fpo_pool_list,
+				 frd_list) {
+		list_del(&frd->frd_list);
+		ib_dereg_mr(frd->frd_mr);
+		LIBCFS_FREE(frd, sizeof(*frd));
+	}
+
+	return rc;
+}
+
+static int kiblnd_create_fmr_pool(kib_fmr_poolset_t *fps,
+				  kib_fmr_pool_t **pp_fpo)
+{
+	kib_dev_t *dev = fps->fps_net->ibn_dev;
+	struct ib_device_attr *dev_attr;
+	kib_fmr_pool_t *fpo;
 	int rc;
 
 	LIBCFS_CPT_ALLOC(fpo, lnet_cpt_table(), fps->fps_cpt, sizeof(*fpo));
@@ -1359,22 +1451,41 @@ static int kiblnd_create_fmr_pool(kib_fmr_poolset_t *fps,
 		return -ENOMEM;
 
 	fpo->fpo_hdev = kiblnd_current_hdev(dev);
-
-	fpo->fpo_fmr_pool = ib_create_fmr_pool(fpo->fpo_hdev->ibh_pd, &param);
-	if (IS_ERR(fpo->fpo_fmr_pool)) {
-		rc = PTR_ERR(fpo->fpo_fmr_pool);
-		CERROR("Failed to create FMR pool: %d\n", rc);
-
-		kiblnd_hdev_decref(fpo->fpo_hdev);
-		LIBCFS_FREE(fpo, sizeof(*fpo));
-		return rc;
+	dev_attr = &fpo->fpo_hdev->ibh_ibdev->attrs;
+
+	/* Check for FMR or FastReg support */
+	fpo->fpo_is_fmr = 0;
+	if (fpo->fpo_hdev->ibh_ibdev->alloc_fmr &&
+	    fpo->fpo_hdev->ibh_ibdev->dealloc_fmr &&
+	    fpo->fpo_hdev->ibh_ibdev->map_phys_fmr &&
+	    fpo->fpo_hdev->ibh_ibdev->unmap_fmr) {
+		LCONSOLE_INFO("Using FMR for registration\n");
+		fpo->fpo_is_fmr = 1;
+	} else if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
+		LCONSOLE_INFO("Using FastReg for registration\n");
+	} else {
+		rc = -ENOSYS;
+		LCONSOLE_ERROR_MSG(rc, "IB device does not support FMRs nor FastRegs, can't register memory\n");
+		goto out_fpo;
 	}
 
+	if (fpo->fpo_is_fmr)
+		rc = kiblnd_alloc_fmr_pool(fps, fpo);
+	else
+		rc = kiblnd_alloc_freg_pool(fps, fpo);
+	if (rc)
+		goto out_fpo;
+
 	fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
-	fpo->fpo_owner    = fps;
+	fpo->fpo_owner = fps;
 	*pp_fpo = fpo;
 
 	return 0;
+
+out_fpo:
+	kiblnd_hdev_decref(fpo->fpo_hdev);
+	LIBCFS_FREE(fpo, sizeof(*fpo));
+	return rc;
 }
 
 static void kiblnd_fail_fmr_poolset(kib_fmr_poolset_t *fps,
@@ -1407,9 +1518,10 @@ static void kiblnd_fini_fmr_poolset(kib_fmr_poolset_t *fps)
 	}
 }
 
-static int kiblnd_init_fmr_poolset(kib_fmr_poolset_t *fps, int cpt,
-				   kib_net_t *net, int pool_size,
-				   int flush_trigger)
+static int
+kiblnd_init_fmr_poolset(kib_fmr_poolset_t *fps, int cpt, int ncpts,
+			kib_net_t *net,
+			struct lnet_ioctl_config_o2iblnd_tunables *tunables)
 {
 	kib_fmr_pool_t *fpo;
 	int rc;
@@ -1418,8 +1530,11 @@ static int kiblnd_init_fmr_poolset(kib_fmr_poolset_t *fps, int cpt,
 
 	fps->fps_net = net;
 	fps->fps_cpt = cpt;
-	fps->fps_pool_size = pool_size;
-	fps->fps_flush_trigger = flush_trigger;
+
+	fps->fps_pool_size = kiblnd_fmr_pool_size(tunables, ncpts);
+	fps->fps_flush_trigger = kiblnd_fmr_flush_trigger(tunables, ncpts);
+	fps->fps_cache = tunables->lnd_fmr_cache;
+
 	spin_lock_init(&fps->fps_lock);
 	INIT_LIST_HEAD(&fps->fps_pool_list);
 	INIT_LIST_HEAD(&fps->fps_failed_pool_list);
@@ -1440,25 +1555,64 @@ static int kiblnd_fmr_pool_is_idle(kib_fmr_pool_t *fpo, unsigned long now)
 	return cfs_time_aftereq(now, fpo->fpo_deadline);
 }
 
+static int
+kiblnd_map_tx_pages(kib_tx_t *tx, kib_rdma_desc_t *rd)
+{
+	__u64 *pages = tx->tx_pages;
+	kib_hca_dev_t *hdev;
+	int npages;
+	int size;
+	int i;
+
+	hdev = tx->tx_pool->tpo_hdev;
+
+	for (i = 0, npages = 0; i < rd->rd_nfrags; i++) {
+		for (size = 0; size < rd->rd_frags[i].rf_nob;
+		     size += hdev->ibh_page_size) {
+			pages[npages++] = (rd->rd_frags[i].rf_addr &
+					   hdev->ibh_page_mask) + size;
+		}
+	}
+
+	return npages;
+}
+
 void kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status)
 {
 	LIST_HEAD(zombies);
 	kib_fmr_pool_t *fpo = fmr->fmr_pool;
-	kib_fmr_poolset_t *fps = fpo->fpo_owner;
+	kib_fmr_poolset_t *fps;
 	unsigned long now = cfs_time_current();
 	kib_fmr_pool_t *tmp;
 	int rc;
 
-	rc = ib_fmr_pool_unmap(fmr->fmr_pfmr);
-	LASSERT(!rc);
+	if (!fpo)
+		return;
 
-	if (status) {
-		rc = ib_flush_fmr_pool(fpo->fpo_fmr_pool);
-		LASSERT(!rc);
-	}
+	fps = fpo->fpo_owner;
+	if (fpo->fpo_is_fmr) {
+		if (fmr->fmr_pfmr) {
+			rc = ib_fmr_pool_unmap(fmr->fmr_pfmr);
+			LASSERT(!rc);
+			fmr->fmr_pfmr = NULL;
+		}
+
+		if (status) {
+			rc = ib_flush_fmr_pool(fpo->fmr.fpo_fmr_pool);
+			LASSERT(!rc);
+		}
+	} else {
+		struct kib_fast_reg_descriptor *frd = fmr->fmr_frd;
+		if (frd) {
+			frd->frd_valid = false;
+			spin_lock(&fps->fps_lock);
+			list_add_tail(&frd->frd_list, &fpo->fast_reg.fpo_pool_list);
+			spin_unlock(&fps->fps_lock);
+			fmr->fmr_frd = NULL;
+		}
+	}
 	fmr->fmr_pool = NULL;
-	fmr->fmr_pfmr = NULL;
 
 	spin_lock(&fps->fps_lock);
 	fpo->fpo_map_count--;	/* decref the pool */
@@ -1479,11 +1633,15 @@ void kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status)
 	kiblnd_destroy_fmr_pool_list(&zombies);
 }
 
-int kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, __u64 *pages, int npages,
-			__u64 iov, kib_fmr_t *fmr)
+int kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, kib_tx_t *tx,
+			kib_rdma_desc_t *rd, __u32 nob, __u64 iov,
+			kib_fmr_t *fmr)
 {
-	struct ib_pool_fmr *pfmr;
+	__u64 *pages = tx->tx_pages;
+	bool is_rx = (rd != tx->tx_rd);
+	bool tx_pages_mapped = 0;
 	kib_fmr_pool_t *fpo;
+	int npages = 0;
 	__u64 version;
 	int rc;
 
@@ -1493,21 +1651,95 @@ int kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, __u64 *pages, int npages,
 	list_for_each_entry(fpo, &fps->fps_pool_list, fpo_list) {
 		fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
 		fpo->fpo_map_count++;
-		spin_unlock(&fps->fps_lock);
 
-		pfmr = ib_fmr_pool_map_phys(fpo->fpo_fmr_pool,
-					    pages, npages, iov);
-		if (likely(!IS_ERR(pfmr))) {
-			fmr->fmr_pool = fpo;
-			fmr->fmr_pfmr = pfmr;
-			return 0;
+		if (fpo->fpo_is_fmr) {
+			struct ib_pool_fmr *pfmr;
+
+			spin_unlock(&fps->fps_lock);
+
+			if (!tx_pages_mapped) {
+				npages = kiblnd_map_tx_pages(tx, rd);
+				tx_pages_mapped = 1;
+			}
+
+			pfmr = ib_fmr_pool_map_phys(fpo->fmr.fpo_fmr_pool,
+						    pages, npages, iov);
+			if (likely(!IS_ERR(pfmr))) {
+				fmr->fmr_key = is_rx ? pfmr->fmr->rkey :
+						       pfmr->fmr->lkey;
+				fmr->fmr_frd = NULL;
+				fmr->fmr_pfmr = pfmr;
+				fmr->fmr_pool = fpo;
+				return 0;
+			}
+			rc = PTR_ERR(pfmr);
+		} else {
+			if (!list_empty(&fpo->fast_reg.fpo_pool_list)) {
+				struct kib_fast_reg_descriptor *frd;
+				struct ib_reg_wr *wr;
+				struct ib_mr *mr;
+				int n;
+
+				frd = list_first_entry(&fpo->fast_reg.fpo_pool_list,
+						       struct kib_fast_reg_descriptor,
+						       frd_list);
+				list_del(&frd->frd_list);
+				spin_unlock(&fps->fps_lock);
+
+				mr = frd->frd_mr;
+
+				if (!frd->frd_valid) {
+					__u32 key = is_rx ? mr->rkey : mr->lkey;
+					struct ib_send_wr *inv_wr;
+
+					inv_wr = &frd->frd_inv_wr;
+					memset(inv_wr, 0, sizeof(*inv_wr));
+					inv_wr->opcode = IB_WR_LOCAL_INV;
+					inv_wr->wr_id = IBLND_WID_MR;
+					inv_wr->ex.invalidate_rkey = key;
+
+					/* Bump the key */
+					key = ib_inc_rkey(key);
+					ib_update_fast_reg_key(mr, key);
+				}
+
+				n = ib_map_mr_sg(mr, tx->tx_frags,
+						 tx->tx_nfrags, NULL, PAGE_SIZE);
+				if (unlikely(n != tx->tx_nfrags)) {
+					CERROR("Failed to map mr %d/%d elements\n",
+					       n, tx->tx_nfrags);
+					return n < 0 ? n : -EINVAL;
+				}
+
+				mr->iova = iov;
+
+				/* Prepare FastReg WR */
+				wr = &frd->frd_fastreg_wr;
+				memset(wr, 0, sizeof(*wr));
+				wr->wr.opcode = IB_WR_REG_MR;
+				wr->wr.wr_id = IBLND_WID_MR;
+				wr->wr.num_sge = 0;
+				wr->wr.send_flags = 0;
+				wr->mr = mr;
+				wr->key = is_rx ? mr->rkey : mr->lkey;
+				wr->access = (IB_ACCESS_LOCAL_WRITE |
+					      IB_ACCESS_REMOTE_WRITE);
+
+				fmr->fmr_key = is_rx ? mr->rkey : mr->lkey;
+				fmr->fmr_frd = frd;
+				fmr->fmr_pfmr = NULL;
+				fmr->fmr_pool = fpo;
+				return 0;
+			}
+			spin_unlock(&fps->fps_lock);
+			rc = -EBUSY;
 		}
 
 		spin_lock(&fps->fps_lock);
 		fpo->fpo_map_count--;
-		if (PTR_ERR(pfmr) != -EAGAIN) {
+		if (rc != -EAGAIN) {
 			spin_unlock(&fps->fps_lock);
-			return PTR_ERR(pfmr);
+			return rc;
 		}
 
 		/* EAGAIN and ... */
@@ -1932,25 +2164,28 @@ static void kiblnd_net_fini_pools(kib_net_t *net)
 	}
 }
 
-static int kiblnd_net_init_pools(kib_net_t *net, __u32 *cpts, int ncpts)
+static int kiblnd_net_init_pools(kib_net_t *net, lnet_ni_t *ni, __u32 *cpts,
+				 int ncpts)
 {
+	struct lnet_ioctl_config_o2iblnd_tunables *tunables;
 	unsigned long flags;
 	int cpt;
-	int rc = 0;
+	int rc;
 	int i;
 
+	tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib;
+
 	read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
-	if (!*kiblnd_tunables.kib_map_on_demand) {
+	if (!tunables->lnd_map_on_demand) {
 		read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
 		goto create_tx_pool;
 	}
 
 	read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
 
-	if (*kiblnd_tunables.kib_fmr_pool_size <
-	    *kiblnd_tunables.kib_ntx / 4) {
+	if (tunables->lnd_fmr_pool_size < *kiblnd_tunables.kib_ntx / 4) {
 		CERROR("Can't set fmr pool size (%d) < ntx / 4(%d)\n",
-		       *kiblnd_tunables.kib_fmr_pool_size,
+		       tunables->lnd_fmr_pool_size,
 		       *kiblnd_tunables.kib_ntx / 4);
 		rc = -EINVAL;
 		goto failed;
@@ -1965,8 +2200,11 @@ static int kiblnd_net_init_pools(kib_net_t *net, __u32 *cpts, int ncpts)
 	/*
 	 * premapping can fail if ibd_nmr > 1, so we always create
 	 * FMR pool and map-on-demand if premapping failed
+	 *
+	 * cfs_precpt_alloc is creating an array of struct kib_fmr_poolset
+	 * The number of struct kib_fmr_poolsets create is equal to the
+	 * number of CPTs that exist, i.e net->ibn_fmr_ps[cpt].
 	 */
-
 	net->ibn_fmr_ps = cfs_percpt_alloc(lnet_cpt_table(),
 					   sizeof(kib_fmr_poolset_t));
 	if (!net->ibn_fmr_ps) {
@@ -1977,9 +2215,8 @@ static int kiblnd_net_init_pools(kib_net_t *net, __u32 *cpts, int ncpts)
 	for (i = 0; i < ncpts; i++) {
 		cpt = !cpts ? i : cpts[i];
-		rc = kiblnd_init_fmr_poolset(net->ibn_fmr_ps[cpt], cpt, net,
-					     kiblnd_fmr_pool_size(ncpts),
-					     kiblnd_fmr_flush_trigger(ncpts));
+		rc = kiblnd_init_fmr_poolset(net->ibn_fmr_ps[cpt], cpt, ncpts,
+					     net, tunables);
 		if (rc) {
 			CERROR("Can't initialize FMR pool for CPT %d: %d\n",
 			       cpt, rc);
@@ -1991,6 +2228,11 @@ static int kiblnd_net_init_pools(kib_net_t *net, __u32 *cpts, int ncpts)
 	LASSERT(i == ncpts);
 
 create_tx_pool:
+	/*
+	 * cfs_precpt_alloc is creating an array of struct kib_tx_poolset
+	 * The number of struct kib_tx_poolsets create is equal to the
+	 * number of CPTs that exist, i.e net->ibn_tx_ps[cpt].
+	 */
 	net->ibn_tx_ps = cfs_percpt_alloc(lnet_cpt_table(),
 					  sizeof(kib_tx_poolset_t));
 	if (!net->ibn_tx_ps) {
@@ -2694,10 +2936,9 @@ static int kiblnd_startup(lnet_ni_t *ni)
 	net->ibn_incarnation = tv.tv_sec * USEC_PER_SEC +
 			       tv.tv_nsec / NSEC_PER_USEC;
 
-	ni->ni_peertimeout = *kiblnd_tunables.kib_peertimeout;
-	ni->ni_maxtxcredits = *kiblnd_tunables.kib_credits;
-	ni->ni_peertxcredits = *kiblnd_tunables.kib_peertxcredits;
-	ni->ni_peerrtrcredits = *kiblnd_tunables.kib_peerrtrcredits;
+	rc = kiblnd_tunables_setup(ni);
+	if (rc)
+		goto net_failed;
 
 	if (ni->ni_interfaces[0]) {
 		/* Use the IPoIB interface specified in 'networks=' */
@@ -2736,7 +2977,7 @@ static int kiblnd_startup(lnet_ni_t *ni)
 	if (rc)
 		goto failed;
 
-	rc = kiblnd_net_init_pools(net, ni->ni_cpts, ni->ni_ncpts);
+	rc = kiblnd_net_init_pools(net, ni, ni->ni_cpts, ni->ni_ncpts);
 	if (rc) {
 		CERROR("Failed to initialize NI pools: %d\n", rc);
 		goto failed;
@@ -2779,8 +3020,6 @@ static void __exit ko2iblnd_exit(void)
 
 static int __init ko2iblnd_init(void)
 {
-	int rc;
-
 	CLASSERT(sizeof(kib_msg_t) <= IBLND_MSG_SIZE);
 	CLASSERT(offsetof(kib_msg_t,
 			  ibm_u.get.ibgm_rd.rd_frags[IBLND_MAX_RDMA_FRAGS])
@@ -2789,9 +3028,7 @@ static int __init ko2iblnd_init(void)
 			  ibm_u.putack.ibpam_rd.rd_frags[IBLND_MAX_RDMA_FRAGS])
 	       <= IBLND_MSG_SIZE);
 
-	rc = kiblnd_tunables_init();
-	if (rc)
-		return rc;
+	kiblnd_tunables_init();
 
 	lnet_register_lnd(&the_o2iblnd);
 
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h
index bfcbdd167..b22984fd9 100644
--- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h
+++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h
@@ -87,22 +87,10 @@ typedef struct {
 	int *kib_timeout;		/* comms timeout (seconds) */
 	int *kib_keepalive;		/* keepalive timeout (seconds) */
 	int *kib_ntx;			/* # tx descs */
-	int *kib_credits;		/* # concurrent sends */
-	int *kib_peertxcredits;		/* # concurrent sends to 1 peer */
-	int *kib_peerrtrcredits;	/* # per-peer router buffer credits */
-	int *kib_peercredits_hiw;	/* # when eagerly to return credits */
-	int *kib_peertimeout;		/* seconds to consider peer dead */
 	char **kib_default_ipif;	/* default IPoIB interface */
 	int *kib_retry_count;
 	int *kib_rnr_retry_count;
-	int *kib_concurrent_sends;	/* send work queue sizing */
 	int *kib_ib_mtu;		/* IB MTU */
-	int *kib_map_on_demand;		/* map-on-demand if RD has more */
-					/* fragments than this value, 0 */
-					/* disable map-on-demand */
-	int *kib_fmr_pool_size;		/* # FMRs in pool */
-	int *kib_fmr_flush_trigger;	/* When to trigger FMR flush */
-	int *kib_fmr_cache;		/* enable FMR pool cache? */
 	int *kib_require_priv_port;	/* accept only privileged ports */
 	int *kib_use_priv_port;		/* use privileged port for active connect */
 	int *kib_nscheds;		/* # threads on each CPT */
@@ -116,43 +104,21 @@ extern kib_tunables_t kiblnd_tunables;
 
 #define IBLND_CREDITS_DEFAULT	8	/* default # of peer credits */
 #define IBLND_CREDITS_MAX	((typeof(((kib_msg_t *) 0)->ibm_credits)) - 1)	/* Max # of peer credits */
 
-#define IBLND_MSG_QUEUE_SIZE(v)	   ((v) == IBLND_MSG_VERSION_1 ? \
-				    IBLND_MSG_QUEUE_SIZE_V1 : \
-				    *kiblnd_tunables.kib_peertxcredits) /* # messages/RDMAs in-flight */
-#define IBLND_CREDITS_HIGHWATER(v) ((v) == IBLND_MSG_VERSION_1 ? \
-				    IBLND_CREDIT_HIGHWATER_V1 : \
-				    *kiblnd_tunables.kib_peercredits_hiw) /* when eagerly to return credits */
+/* when eagerly to return credits */
+#define IBLND_CREDITS_HIGHWATER(t, v)	((v) == IBLND_MSG_VERSION_1 ? \
+					 IBLND_CREDIT_HIGHWATER_V1 : \
+					 t->lnd_peercredits_hiw)
 
 #define kiblnd_rdma_create_id(cb, dev, ps, qpt) rdma_create_id(&init_net, \
 							       cb, dev, \
 							       ps, qpt)
 
-static inline int
-kiblnd_concurrent_sends_v1(void)
-{
-	if (*kiblnd_tunables.kib_concurrent_sends > IBLND_MSG_QUEUE_SIZE_V1 * 2)
-		return IBLND_MSG_QUEUE_SIZE_V1 * 2;
-
-	if (*kiblnd_tunables.kib_concurrent_sends < IBLND_MSG_QUEUE_SIZE_V1 / 2)
-		return IBLND_MSG_QUEUE_SIZE_V1 / 2;
-
-	return *kiblnd_tunables.kib_concurrent_sends;
-}
-
-#define IBLND_CONCURRENT_SENDS(v)  ((v) == IBLND_MSG_VERSION_1 ? \
-				    kiblnd_concurrent_sends_v1() : \
-				    *kiblnd_tunables.kib_concurrent_sends)
 /* 2 OOB shall suffice for 1 keepalive and 1 returning credits */
 #define IBLND_OOB_CAPABLE(v)	((v) != IBLND_MSG_VERSION_1)
 #define IBLND_OOB_MSGS(v)	(IBLND_OOB_CAPABLE(v) ? 2 : 0)
 
 #define IBLND_MSG_SIZE		(4 << 10)	/* max size of queued messages (inc hdr) */
 #define IBLND_MAX_RDMA_FRAGS	LNET_MAX_IOV	/* max # of fragments supported */
-#define IBLND_CFG_RDMA_FRAGS	(*kiblnd_tunables.kib_map_on_demand ? \
-				 *kiblnd_tunables.kib_map_on_demand : \
-				 IBLND_MAX_RDMA_FRAGS) /* max # of fragments configured by user */
-#define IBLND_RDMA_FRAGS(v)	((v) == IBLND_MSG_VERSION_1 ? \
-				 IBLND_MAX_RDMA_FRAGS : IBLND_CFG_RDMA_FRAGS)
 
 /************************/
 /* derived constants... */
@@ -171,7 +137,8 @@ kiblnd_concurrent_sends_v1(void)
 /* WRs and CQEs (per connection) */
 #define IBLND_RECV_WRS(c)	IBLND_RX_MSGS(c)
 #define IBLND_SEND_WRS(c) \
-	((c->ibc_max_frags + 1) * IBLND_CONCURRENT_SENDS(c->ibc_version))
+	((c->ibc_max_frags + 1) * kiblnd_concurrent_sends(c->ibc_version, \
+							  c->ibc_peer->ibp_ni))
 #define IBLND_CQ_ENTRIES(c)	(IBLND_RECV_WRS(c) + IBLND_SEND_WRS(c))
 
 struct kib_hca_dev;
@@ -286,24 +253,44 @@ typedef struct {
 	int fps_cpt;			/* CPT id */
 	int fps_pool_size;
 	int fps_flush_trigger;
+	int fps_cache;
 	int fps_increasing;		/* is allocating new pool */
 	unsigned long fps_next_retry;	/* time stamp for retry if */
 					/* failed to allocate */
 } kib_fmr_poolset_t;
 
+struct kib_fast_reg_descriptor { /* For fast registration */
+	struct list_head frd_list;
+	struct ib_send_wr frd_inv_wr;
+	struct ib_reg_wr frd_fastreg_wr;
+	struct ib_mr *frd_mr;
+	bool frd_valid;
+};
+
 typedef struct {
 	struct list_head fpo_list;	/* chain on pool list */
 	struct kib_hca_dev *fpo_hdev;	/* device for this pool */
 	kib_fmr_poolset_t *fpo_owner;	/* owner of this pool */
-	struct ib_fmr_pool *fpo_fmr_pool; /* IB FMR pool */
+	union {
+		struct {
+			struct ib_fmr_pool *fpo_fmr_pool; /* IB FMR pool */
+		} fmr;
+		struct { /* For fast registration */
+			struct list_head fpo_pool_list;
+			int fpo_pool_size;
+		} fast_reg;
+	};
 	unsigned long fpo_deadline;	/* deadline of this pool */
 	int fpo_failed;			/* fmr pool is failed */
 	int fpo_map_count;		/* # of mapped FMR */
+	int fpo_is_fmr;
 } kib_fmr_pool_t;
 
 typedef struct {
-	struct ib_pool_fmr *fmr_pfmr;	/* IB pool fmr */
-	kib_fmr_pool_t *fmr_pool;	/* pool of FMR */
+	kib_fmr_pool_t *fmr_pool;	/* pool of FMR */
+	struct ib_pool_fmr *fmr_pfmr;	/* IB pool fmr */
+	struct kib_fast_reg_descriptor *fmr_frd;
+	u32 fmr_key;
 } kib_fmr_t;
 
 typedef struct kib_net {
@@ -615,6 +602,48 @@ extern kib_data_t kiblnd_data;
 
 void kiblnd_hdev_destroy(kib_hca_dev_t *hdev);
 
+int kiblnd_msg_queue_size(int version, struct lnet_ni *ni);
+
+/* max # of fragments configured by user */
+static inline int
+kiblnd_cfg_rdma_frags(struct lnet_ni *ni)
+{
+	struct lnet_ioctl_config_o2iblnd_tunables *tunables;
+	int mod;
+
+	tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib;
+	mod = tunables->lnd_map_on_demand;
+	return mod ? mod : IBLND_MAX_RDMA_FRAGS;
+}
+
+static inline int
+kiblnd_rdma_frags(int version, struct lnet_ni *ni)
+{
+	return version == IBLND_MSG_VERSION_1 ?
+	       IBLND_MAX_RDMA_FRAGS :
+	       kiblnd_cfg_rdma_frags(ni);
+}
+
+static inline int
+kiblnd_concurrent_sends(int version, struct lnet_ni *ni)
+{
+	struct lnet_ioctl_config_o2iblnd_tunables *tunables;
+	int concurrent_sends;
+
+	tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib;
+	concurrent_sends = tunables->lnd_concurrent_sends;
+
+	if (version == IBLND_MSG_VERSION_1) {
+		if (concurrent_sends > IBLND_MSG_QUEUE_SIZE_V1 * 2)
+			return IBLND_MSG_QUEUE_SIZE_V1 * 2;
+
+		if (concurrent_sends < IBLND_MSG_QUEUE_SIZE_V1 / 2)
+			return IBLND_MSG_QUEUE_SIZE_V1 / 2;
+	}
+
+	return concurrent_sends;
+}
+
 static inline void
 kiblnd_hdev_addref_locked(kib_hca_dev_t *hdev)
 {
@@ -737,10 +766,14 @@ kiblnd_send_keepalive(kib_conn_t *conn)
 static inline int
 kiblnd_need_noop(kib_conn_t *conn)
 {
+	struct lnet_ioctl_config_o2iblnd_tunables *tunables;
+	lnet_ni_t *ni = conn->ibc_peer->ibp_ni;
+
 	LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED);
 
+	tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib;
 	if (conn->ibc_outstanding_credits <
-	    IBLND_CREDITS_HIGHWATER(conn->ibc_version) &&
+	    IBLND_CREDITS_HIGHWATER(tunables, conn->ibc_version) &&
 	    !kiblnd_send_keepalive(conn))
 		return 0; /* No need to send NOOP */
 
@@ -799,7 +832,8 @@ kiblnd_queue2str(kib_conn_t *conn, struct list_head *q)
 #define IBLND_WID_TX	1
 #define IBLND_WID_RX	2
 #define IBLND_WID_RDMA	3
-#define IBLND_WID_MASK	3UL
+#define IBLND_WID_MR	4
+#define IBLND_WID_MASK	7UL
 
 static inline __u64
 kiblnd_ptr2wreqid(void *ptr, int type)
@@ -947,20 +981,20 @@ static inline unsigned int kiblnd_sg_dma_len(struct ib_device *dev,
 #define KIBLND_CONN_PARAM(e)	 ((e)->param.conn.private_data)
 #define KIBLND_CONN_PARAM_LEN(e) ((e)->param.conn.private_data_len)
 
-struct ib_mr *kiblnd_find_rd_dma_mr(kib_hca_dev_t *hdev,
-				    kib_rdma_desc_t *rd,
+struct ib_mr *kiblnd_find_rd_dma_mr(struct lnet_ni *ni, kib_rdma_desc_t *rd,
 				    int negotiated_nfrags);
 void kiblnd_map_rx_descs(kib_conn_t *conn);
 void kiblnd_unmap_rx_descs(kib_conn_t *conn);
 void kiblnd_pool_free_node(kib_pool_t *pool, struct list_head *node);
 struct list_head *kiblnd_pool_alloc_node(kib_poolset_t *ps);
 
-int kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, __u64 *pages,
-			int npages, __u64 iov, kib_fmr_t *fmr);
+int kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, kib_tx_t *tx,
+			kib_rdma_desc_t *rd, __u32 nob, __u64 iov,
+			kib_fmr_t *fmr);
 void kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status);
 
-int kiblnd_tunables_init(void);
-void kiblnd_tunables_fini(void);
+int kiblnd_tunables_setup(struct lnet_ni *ni);
+void kiblnd_tunables_init(void);
 
 int kiblnd_connd(void *arg);
 int kiblnd_scheduler(void *arg);
 
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c
index 2323e8d3a..845e49a52 100644
--- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c
+++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c
@@ -561,36 +561,23 @@ kiblnd_kvaddr_to_page(unsigned long vaddr)
 }
 
 static int
-kiblnd_fmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, int nob)
+kiblnd_fmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, __u32 nob)
 {
 	kib_hca_dev_t *hdev;
-	__u64 *pages = tx->tx_pages;
 	kib_fmr_poolset_t *fps;
-	int npages;
-	int size;
 	int cpt;
 	int rc;
-	int i;
 
 	LASSERT(tx->tx_pool);
 	LASSERT(tx->tx_pool->tpo_pool.po_owner);
 
 	hdev = tx->tx_pool->tpo_hdev;
-
-	for (i = 0, npages = 0; i < rd->rd_nfrags; i++) {
-		for (size = 0; size < rd->rd_frags[i].rf_nob;
-		     size += hdev->ibh_page_size) {
-			pages[npages++] = (rd->rd_frags[i].rf_addr &
-					   hdev->ibh_page_mask) + size;
-		}
-	}
-
 	cpt = tx->tx_pool->tpo_pool.po_owner->ps_cpt;
 
 	fps = net->ibn_fmr_ps[cpt];
-	rc = kiblnd_fmr_pool_map(fps, pages, npages, 0, &tx->fmr);
+	rc = kiblnd_fmr_pool_map(fps, tx, rd, nob, 0, &tx->fmr);
 	if (rc) {
-		CERROR("Can't map %d pages: %d\n", npages, rc);
+		CERROR("Can't map %u bytes: %d\n", nob, rc);
 		return rc;
 	}
 
@@ -598,8 +585,7 @@ kiblnd_fmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, int nob)
 	 * If rd is not tx_rd, it's going to get sent to a peer, who will need
 	 * the rkey
 	 */
-	rd->rd_key = (rd != tx->tx_rd) ? tx->fmr.fmr_pfmr->fmr->rkey :
-					 tx->fmr.fmr_pfmr->fmr->lkey;
+	rd->rd_key = tx->fmr.fmr_key;
 	rd->rd_frags[0].rf_addr &= ~hdev->ibh_page_mask;
 	rd->rd_frags[0].rf_nob = nob;
 	rd->rd_nfrags = 1;
@@ -613,10 +599,8 @@ static void kiblnd_unmap_tx(lnet_ni_t *ni, kib_tx_t *tx)
 
 	LASSERT(net);
 
-	if (net->ibn_fmr_ps && tx->fmr.fmr_pfmr) {
+	if (net->ibn_fmr_ps)
 		kiblnd_fmr_pool_unmap(&tx->fmr, tx->tx_status);
-		tx->fmr.fmr_pfmr = NULL;
-	}
 
 	if (tx->tx_nfrags) {
 		kiblnd_dma_unmap_sg(tx->tx_pool->tpo_hdev->ibh_ibdev,
@@ -628,8 +612,8 @@ static int kiblnd_map_tx(lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd,
 		       int nfrags)
 {
-	kib_hca_dev_t *hdev = tx->tx_pool->tpo_hdev;
 	kib_net_t *net = ni->ni_data;
+	kib_hca_dev_t *hdev = net->ibn_dev->ibd_hdev;
 	struct ib_mr *mr = NULL;
 	__u32 nob;
 	int i;
@@ -652,7 +636,7 @@ static int kiblnd_map_tx(lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd,
 		nob += rd->rd_frags[i].rf_nob;
 	}
 
-	mr = kiblnd_find_rd_dma_mr(hdev, rd, tx->tx_conn ?
+	mr = kiblnd_find_rd_dma_mr(ni, rd, tx->tx_conn ?
 				   tx->tx_conn->ibc_max_frags : -1);
 	if (mr) {
 		/* found pre-mapping MR */
@@ -704,7 +688,7 @@ kiblnd_setup_rd_iov(lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd,
 
 		fragnob = min(fragnob, (int)PAGE_SIZE - page_offset);
 		sg_set_page(sg, page, fragnob, page_offset);
-		sg++;
+		sg = sg_next(sg);
 
 		if (offset + fragnob < iov->iov_len) {
 			offset += fragnob;
@@ -748,7 +732,7 @@ kiblnd_setup_rd_kiov(lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd,
 
 		sg_set_page(sg, kiov->kiov_page, fragnob,
 			    kiov->kiov_offset + offset);
-		sg++;
+		sg = sg_next(sg);
 
 		offset = 0;
 		kiov++;
@@ -765,6 +749,7 @@ kiblnd_post_tx_locked(kib_conn_t *conn, kib_tx_t *tx, int credit)
 {
 	kib_msg_t *msg = tx->tx_msg;
 	kib_peer_t *peer = conn->ibc_peer;
+	struct lnet_ni *ni = peer->ibp_ni;
 	int ver = conn->ibc_version;
 	int rc;
 	int done;
@@ -780,7 +765,7 @@ kiblnd_post_tx_locked(kib_conn_t *conn, kib_tx_t *tx, int credit)
 	LASSERT(conn->ibc_credits >= 0);
 	LASSERT(conn->ibc_credits <= conn->ibc_queue_depth);
 
-	if (conn->ibc_nsends_posted == IBLND_CONCURRENT_SENDS(ver)) {
+	if (conn->ibc_nsends_posted == kiblnd_concurrent_sends(ver, ni)) {
 		/* tx completions outstanding... */
 		CDEBUG(D_NET, "%s: posted enough\n",
 		       libcfs_nid2str(peer->ibp_nid));
@@ -851,14 +836,26 @@ kiblnd_post_tx_locked(kib_conn_t *conn, kib_tx_t *tx, int credit)
 		/* close_conn will launch failover */
 		rc = -ENETDOWN;
 	} else {
-		struct ib_send_wr *wrq = &tx->tx_wrq[tx->tx_nwrq - 1].wr;
+		struct kib_fast_reg_descriptor *frd = tx->fmr.fmr_frd;
+		struct ib_send_wr *bad = &tx->tx_wrq[tx->tx_nwrq - 1].wr;
+		struct ib_send_wr *wrq = &tx->tx_wrq[0].wr;
+
+		if (frd) {
+			if (!frd->frd_valid) {
+				wrq = &frd->frd_inv_wr;
+				wrq->next = &frd->frd_fastreg_wr.wr;
+			} else {
+				wrq = &frd->frd_fastreg_wr.wr;
+			}
+			frd->frd_fastreg_wr.wr.next = &tx->tx_wrq[0].wr;
+		}
 
-		LASSERTF(wrq->wr_id == kiblnd_ptr2wreqid(tx, IBLND_WID_TX),
+		LASSERTF(bad->wr_id == kiblnd_ptr2wreqid(tx, IBLND_WID_TX),
 			 "bad wr_id %llx, opc %d, flags %d, peer: %s\n",
-			 wrq->wr_id, wrq->opcode, wrq->send_flags,
-			 libcfs_nid2str(conn->ibc_peer->ibp_nid));
-		wrq = NULL;
-		rc = ib_post_send(conn->ibc_cmid->qp, &tx->tx_wrq->wr, &wrq);
+			 bad->wr_id, bad->opcode, bad->send_flags,
+			 libcfs_nid2str(conn->ibc_peer->ibp_nid));
+		bad = NULL;
+		rc = ib_post_send(conn->ibc_cmid->qp, wrq, &bad);
 	}
 
 	conn->ibc_last_send = jiffies;
@@ -919,7 +916,7 @@ kiblnd_check_sends(kib_conn_t *conn)
 
 	spin_lock(&conn->ibc_lock);
 
-	LASSERT(conn->ibc_nsends_posted <= IBLND_CONCURRENT_SENDS(ver));
+	LASSERT(conn->ibc_nsends_posted <= kiblnd_concurrent_sends(ver, ni));
 	LASSERT(!IBLND_OOB_CAPABLE(ver) ||
 		conn->ibc_noops_posted <= IBLND_OOB_MSGS(ver));
 	LASSERT(conn->ibc_reserved_credits >= 0);
@@ -1066,7 +1063,7 @@ kiblnd_init_rdma(kib_conn_t *conn, kib_tx_t *tx, int type,
 	kib_msg_t *ibmsg = tx->tx_msg;
 	kib_rdma_desc_t *srcrd = tx->tx_rd;
 	struct ib_sge *sge = &tx->tx_sge[0];
-	struct ib_rdma_wr *wrq = &tx->tx_wrq[0], *next;
+	struct ib_rdma_wr *wrq, *next;
 	int rc = resid;
 	int srcidx = 0;
 	int dstidx = 0;
@@ -2333,11 +2330,11 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob)
 	}
 
 	if (reqmsg->ibm_u.connparams.ibcp_queue_depth >
-	    IBLND_MSG_QUEUE_SIZE(version)) {
+	    kiblnd_msg_queue_size(version, ni)) {
 		CERROR("Can't accept conn from %s, queue depth too large: %d (<=%d wanted)\n",
 		       libcfs_nid2str(nid),
 		       reqmsg->ibm_u.connparams.ibcp_queue_depth,
-		       IBLND_MSG_QUEUE_SIZE(version));
+		       kiblnd_msg_queue_size(version, ni));
 
 		if (version == IBLND_MSG_VERSION)
 			rej.ibr_why = IBLND_REJECT_MSG_QUEUE_SIZE;
@@ -2346,24 +2343,24 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob)
 	}
 
 	if (reqmsg->ibm_u.connparams.ibcp_max_frags >
-	    IBLND_RDMA_FRAGS(version)) {
+	    kiblnd_rdma_frags(version, ni)) {
 		CWARN("Can't accept conn from %s (version %x): max_frags %d too large (%d wanted)\n",
 		      libcfs_nid2str(nid), version,
 		      reqmsg->ibm_u.connparams.ibcp_max_frags,
-		      IBLND_RDMA_FRAGS(version));
+		      kiblnd_rdma_frags(version, ni));
 
 		if (version >= IBLND_MSG_VERSION)
 			rej.ibr_why = IBLND_REJECT_RDMA_FRAGS;
 
 		goto failed;
 	} else if (reqmsg->ibm_u.connparams.ibcp_max_frags <
-		   IBLND_RDMA_FRAGS(version) && !net->ibn_fmr_ps) {
+		   kiblnd_rdma_frags(version, ni) && !net->ibn_fmr_ps) {
 		CWARN("Can't accept conn from %s (version %x): max_frags %d incompatible without FMR pool (%d wanted)\n",
 		      libcfs_nid2str(nid), version,
 		      reqmsg->ibm_u.connparams.ibcp_max_frags,
-		      IBLND_RDMA_FRAGS(version));
+		      kiblnd_rdma_frags(version, ni));
 
-		if (version >= IBLND_MSG_VERSION)
+		if (version == IBLND_MSG_VERSION)
 			rej.ibr_why = IBLND_REJECT_RDMA_FRAGS;
 
 		goto failed;
@@ -2524,12 +2521,13 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob)
 	return 0;
 
  failed:
-	if (ni)
+	if (ni) {
 		lnet_ni_decref(ni);
+		rej.ibr_cp.ibcp_queue_depth = kiblnd_msg_queue_size(version, ni);
+		rej.ibr_cp.ibcp_max_frags = kiblnd_rdma_frags(version, ni);
+	}
 
 	rej.ibr_version = version;
-	rej.ibr_cp.ibcp_queue_depth = IBLND_MSG_QUEUE_SIZE(version);
-	rej.ibr_cp.ibcp_max_frags = IBLND_RDMA_FRAGS(version);
 	kiblnd_reject(cmid, &rej);
 
 	return -ECONNREFUSED;
@@ -2580,12 +2578,15 @@ kiblnd_check_reconnect(kib_conn_t *conn, int version,
 		reason = "Unknown";
 		break;
 
-	case IBLND_REJECT_RDMA_FRAGS:
+	case IBLND_REJECT_RDMA_FRAGS: {
+		struct lnet_ioctl_config_lnd_tunables *tunables;
+
 		if (!cp) {
 			reason = "can't negotiate max frags";
 			goto out;
 		}
-		if (!*kiblnd_tunables.kib_map_on_demand) {
+		tunables = peer->ibp_ni->ni_lnd_tunables;
+		if (!tunables->lt_tun_u.lt_o2ib.lnd_map_on_demand) {
 			reason = "map_on_demand must be enabled";
 			goto out;
 		}
@@ -2597,7 +2598,7 @@ kiblnd_check_reconnect(kib_conn_t *conn, int version,
 		peer->ibp_max_frags = frag_num;
 		reason = "rdma fragments";
 		break;
-
+	}
 	case IBLND_REJECT_MSG_QUEUE_SIZE:
 		if (!cp) {
 			reason = "can't negotiate queue depth";
@@ -3430,6 +3431,12 @@ kiblnd_complete(struct ib_wc *wc)
 	default:
 		LBUG();
 
+	case IBLND_WID_MR:
+		if (wc->status != IB_WC_SUCCESS &&
+		    wc->status != IB_WC_WR_FLUSH_ERR)
+			CNETERR("FastReg failed: %d\n", wc->status);
+		break;
+
 	case IBLND_WID_RDMA:
 		/*
 		 * We only get RDMA completion notification if it fails. All
 
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c
index b4607dad3..f8fdd4ae3 100644
--- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c
+++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c
@@ -152,74 +152,135 @@ kib_tunables_t kiblnd_tunables = {
 	.kib_timeout		= &timeout,
 	.kib_keepalive		= &keepalive,
 	.kib_ntx		= &ntx,
-	.kib_credits		= &credits,
-	.kib_peertxcredits	= &peer_credits,
-	.kib_peercredits_hiw	= &peer_credits_hiw,
-	.kib_peerrtrcredits	= &peer_buffer_credits,
-	.kib_peertimeout	= &peer_timeout,
 	.kib_default_ipif	= &ipif_name,
 	.kib_retry_count	= &retry_count,
 	.kib_rnr_retry_count	= &rnr_retry_count,
-	.kib_concurrent_sends	= &concurrent_sends,
 	.kib_ib_mtu		= &ib_mtu,
-	.kib_map_on_demand	= &map_on_demand,
-	.kib_fmr_pool_size	= &fmr_pool_size,
-	.kib_fmr_flush_trigger	= &fmr_flush_trigger,
-	.kib_fmr_cache		= &fmr_cache,
 	.kib_require_priv_port	= &require_privileged_port,
 	.kib_use_priv_port	= &use_privileged_port,
 	.kib_nscheds		= &nscheds
 };
 
-int
-kiblnd_tunables_init(void)
+static struct lnet_ioctl_config_o2iblnd_tunables default_tunables;
+
+/* # messages/RDMAs in-flight */
+int kiblnd_msg_queue_size(int version, lnet_ni_t *ni)
 {
+	if (version == IBLND_MSG_VERSION_1)
+		return IBLND_MSG_QUEUE_SIZE_V1;
+	else if (ni)
+		return ni->ni_peertxcredits;
+	else
+		return peer_credits;
+}
+
+int kiblnd_tunables_setup(struct lnet_ni *ni)
+{
+	struct lnet_ioctl_config_o2iblnd_tunables *tunables;
+
+	/*
+	 * if there was no tunables specified, setup the tunables to be
+	 * defaulted
+	 */
+	if (!ni->ni_lnd_tunables) {
+		LIBCFS_ALLOC(ni->ni_lnd_tunables,
+			     sizeof(*ni->ni_lnd_tunables));
+		if (!ni->ni_lnd_tunables)
+			return -ENOMEM;
+
+		memcpy(&ni->ni_lnd_tunables->lt_tun_u.lt_o2ib,
+		       &default_tunables, sizeof(*tunables));
+	}
+	tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib;
+
+	/* Current API version */
+	tunables->lnd_version = 0;
+
 	if (kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu) < 0) {
 		CERROR("Invalid ib_mtu %d, expected 256/512/1024/2048/4096\n",
 		       *kiblnd_tunables.kib_ib_mtu);
 		return -EINVAL;
 	}
 
-	if (*kiblnd_tunables.kib_peertxcredits < IBLND_CREDITS_DEFAULT)
-		*kiblnd_tunables.kib_peertxcredits = IBLND_CREDITS_DEFAULT;
+	if (!ni->ni_peertimeout)
+		ni->ni_peertimeout = peer_timeout;
+
+	if (!ni->ni_maxtxcredits)
+		ni->ni_maxtxcredits = credits;
+
+	if (!ni->ni_peertxcredits)
+		ni->ni_peertxcredits = peer_credits;
 
-	if (*kiblnd_tunables.kib_peertxcredits > IBLND_CREDITS_MAX)
-		*kiblnd_tunables.kib_peertxcredits = IBLND_CREDITS_MAX;
+	if (!ni->ni_peerrtrcredits)
+		ni->ni_peerrtrcredits = peer_buffer_credits;
 
-	if (*kiblnd_tunables.kib_peertxcredits > *kiblnd_tunables.kib_credits)
-		*kiblnd_tunables.kib_peertxcredits = *kiblnd_tunables.kib_credits;
+	if (ni->ni_peertxcredits < IBLND_CREDITS_DEFAULT)
+		ni->ni_peertxcredits = IBLND_CREDITS_DEFAULT;
 
-	if (*kiblnd_tunables.kib_peercredits_hiw < *kiblnd_tunables.kib_peertxcredits / 2)
-		*kiblnd_tunables.kib_peercredits_hiw = *kiblnd_tunables.kib_peertxcredits / 2;
+	if (ni->ni_peertxcredits > IBLND_CREDITS_MAX)
+		ni->ni_peertxcredits = IBLND_CREDITS_MAX;
 
-	if (*kiblnd_tunables.kib_peercredits_hiw >= *kiblnd_tunables.kib_peertxcredits)
-		*kiblnd_tunables.kib_peercredits_hiw = *kiblnd_tunables.kib_peertxcredits - 1;
+	if (ni->ni_peertxcredits > credits)
+		ni->ni_peertxcredits = credits;
 
-	if (*kiblnd_tunables.kib_map_on_demand < 0 ||
-	    *kiblnd_tunables.kib_map_on_demand > IBLND_MAX_RDMA_FRAGS)
-		*kiblnd_tunables.kib_map_on_demand = 0; /* disable map-on-demand */
+	if (!tunables->lnd_peercredits_hiw)
+		tunables->lnd_peercredits_hiw = peer_credits_hiw;
 
-	if (*kiblnd_tunables.kib_map_on_demand == 1)
-		*kiblnd_tunables.kib_map_on_demand = 2; /* don't make sense to create map if only one fragment */
+	if (tunables->lnd_peercredits_hiw < ni->ni_peertxcredits / 2)
+		tunables->lnd_peercredits_hiw = ni->ni_peertxcredits / 2;
 
-	if (!*kiblnd_tunables.kib_concurrent_sends) {
-		if (*kiblnd_tunables.kib_map_on_demand > 0 &&
-		    *kiblnd_tunables.kib_map_on_demand <= IBLND_MAX_RDMA_FRAGS / 8)
-			*kiblnd_tunables.kib_concurrent_sends = (*kiblnd_tunables.kib_peertxcredits) * 2;
-		else
-			*kiblnd_tunables.kib_concurrent_sends = (*kiblnd_tunables.kib_peertxcredits);
+	if (tunables->lnd_peercredits_hiw >= ni->ni_peertxcredits)
+		tunables->lnd_peercredits_hiw = ni->ni_peertxcredits - 1;
+
+	if (tunables->lnd_map_on_demand < 0 ||
+	    tunables->lnd_map_on_demand > IBLND_MAX_RDMA_FRAGS) {
+		/* disable map-on-demand */
+		tunables->lnd_map_on_demand = 0;
+	}
+
+	if (tunables->lnd_map_on_demand == 1) {
+		/* don't make sense to create map if only one fragment */
+		tunables->lnd_map_on_demand = 2;
+	}
+
+	if (!tunables->lnd_concurrent_sends) {
+		if (tunables->lnd_map_on_demand > 0 &&
+		    tunables->lnd_map_on_demand <= IBLND_MAX_RDMA_FRAGS / 8) {
+			tunables->lnd_concurrent_sends =
+				ni->ni_peertxcredits * 2;
+		} else {
+			tunables->lnd_concurrent_sends = ni->ni_peertxcredits;
+		}
 	}
 
-	if (*kiblnd_tunables.kib_concurrent_sends > *kiblnd_tunables.kib_peertxcredits * 2)
-		*kiblnd_tunables.kib_concurrent_sends = *kiblnd_tunables.kib_peertxcredits * 2;
+	if (tunables->lnd_concurrent_sends > ni->ni_peertxcredits * 2)
+		tunables->lnd_concurrent_sends = ni->ni_peertxcredits * 2;
 
-	if (*kiblnd_tunables.kib_concurrent_sends < *kiblnd_tunables.kib_peertxcredits / 2)
-		*kiblnd_tunables.kib_concurrent_sends = *kiblnd_tunables.kib_peertxcredits / 2;
+	if (tunables->lnd_concurrent_sends < ni->ni_peertxcredits / 2)
+		tunables->lnd_concurrent_sends = ni->ni_peertxcredits / 2;
 
-	if (*kiblnd_tunables.kib_concurrent_sends < *kiblnd_tunables.kib_peertxcredits) {
+	if (tunables->lnd_concurrent_sends < ni->ni_peertxcredits) {
 		CWARN("Concurrent sends %d is lower than message queue size: %d, performance may drop slightly.\n",
-		      *kiblnd_tunables.kib_concurrent_sends, *kiblnd_tunables.kib_peertxcredits);
+		      tunables->lnd_concurrent_sends, ni->ni_peertxcredits);
 	}
 
+	if (!tunables->lnd_fmr_pool_size)
+		tunables->lnd_fmr_pool_size = fmr_pool_size;
+	if (!tunables->lnd_fmr_flush_trigger)
+		tunables->lnd_fmr_flush_trigger = fmr_flush_trigger;
+	if (!tunables->lnd_fmr_cache)
+		tunables->lnd_fmr_cache = fmr_cache;
+
 	return 0;
 }
+
+void kiblnd_tunables_init(void)
+{
+	default_tunables.lnd_version = 0;
+	default_tunables.lnd_peercredits_hiw = peer_credits_hiw,
+	default_tunables.lnd_map_on_demand = map_on_demand;
+	default_tunables.lnd_concurrent_sends = concurrent_sends;
+	default_tunables.lnd_fmr_pool_size = fmr_pool_size;
+	default_tunables.lnd_fmr_flush_trigger = fmr_flush_trigger;
+	default_tunables.lnd_fmr_cache = fmr_cache;
+}
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c
index cca7b2f7f..406c0e7a5 100644
--- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c
@@ -2582,7 +2582,6 @@ ksocknal_debug_peerhash(lnet_ni_t *ni)
 	}
 
 	read_unlock(&ksocknal_data.ksnd_global_lock);
-	return;
 }
 
 void
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib.c
index d4ce06d0a..964b4e338 100644
--- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib.c
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib.c
@@ -675,7 +675,6 @@ ksocknal_lib_set_callback(struct socket *sock, ksock_conn_t *conn)
 	sock->sk->sk_user_data = conn;
 	sock->sk->sk_data_ready = ksocknal_data_ready;
 	sock->sk->sk_write_space = ksocknal_write_space;
-	return;
 }
 
 void
@@ -695,8 +694,6 @@ ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn)
 	 * sk_user_data is NULL.
 	 */
 	sock->sk->sk_user_data = NULL;
-
-	return ;
 }
 
 int
-- 
cgit v1.2.3-54-g00ecf
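
Appendix: the tunables rework in o2iblnd_modparams.c above is the densest part of this patch; per-NI values are defaulted from module parameters and then clamped against one another. The standalone sketch below (plain userspace C, not kernel code) mirrors those rules so they can be read and run outside diff context. The module-parameter values, the assumption that IBLND_CREDITS_MAX is 255 (an 8-bit ibm_credits field), and the simplified concurrent-sends default are illustrative assumptions, not the driver's exact configuration.

#include <stdio.h>

#define IBLND_CREDITS_DEFAULT	8
#define IBLND_CREDITS_MAX	255	/* assumption: ibm_credits is an 8-bit field */

/* stand-ins for the module parameters consulted by kiblnd_tunables_setup() */
static int credits = 256;		/* hypothetical "credits" default */
static int peer_credits = 8;		/* hypothetical "peer_credits" default */
static int peer_credits_hiw;		/* 0 means: derive from peer_credits */

struct o2ib_tunables {
	int lnd_peercredits_hiw;
	int lnd_concurrent_sends;
};

static void tunables_setup(struct o2ib_tunables *t, int *peertxcredits)
{
	/* default the per-NI value from the module parameter, then clamp it */
	if (!*peertxcredits)
		*peertxcredits = peer_credits;
	if (*peertxcredits < IBLND_CREDITS_DEFAULT)
		*peertxcredits = IBLND_CREDITS_DEFAULT;
	if (*peertxcredits > IBLND_CREDITS_MAX)
		*peertxcredits = IBLND_CREDITS_MAX;
	if (*peertxcredits > credits)
		*peertxcredits = credits;

	/* high-water mark must end up in [peertxcredits / 2, peertxcredits) */
	if (!t->lnd_peercredits_hiw)
		t->lnd_peercredits_hiw = peer_credits_hiw;
	if (t->lnd_peercredits_hiw < *peertxcredits / 2)
		t->lnd_peercredits_hiw = *peertxcredits / 2;
	if (t->lnd_peercredits_hiw >= *peertxcredits)
		t->lnd_peercredits_hiw = *peertxcredits - 1;

	/*
	 * concurrent sends default to peertxcredits (simplified: the real
	 * code doubles this for small map_on_demand values) and are then
	 * clamped to [peertxcredits / 2, 2 * peertxcredits]
	 */
	if (!t->lnd_concurrent_sends)
		t->lnd_concurrent_sends = *peertxcredits;
	if (t->lnd_concurrent_sends > *peertxcredits * 2)
		t->lnd_concurrent_sends = *peertxcredits * 2;
	if (t->lnd_concurrent_sends < *peertxcredits / 2)
		t->lnd_concurrent_sends = *peertxcredits / 2;
}

int main(void)
{
	struct o2ib_tunables t = { 0, 0 };
	int peertxcredits = 0;	/* nothing configured on the NI yet */

	tunables_setup(&t, &peertxcredits);
	printf("peertxcredits=%d hiw=%d concurrent_sends=%d\n",
	       peertxcredits, t.lnd_peercredits_hiw, t.lnd_concurrent_sends);
	return 0;
}

With the defaults shown it prints peertxcredits=8 hiw=4 concurrent_sends=8, illustrating the invariants the setup path enforces: the credits high-water mark always lands in [peertxcredits / 2, peertxcredits), and concurrent sends always land in [peertxcredits / 2, 2 * peertxcredits].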