summaryrefslogtreecommitdiff
path: root/drivers/staging/lustre/lnet/klnds/o2iblnd
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/staging/lustre/lnet/klnds/o2iblnd')
-rw-r--r--drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c405
-rw-r--r--drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h134
-rw-r--r--drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c101
-rw-r--r--drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c139
4 files changed, 559 insertions, 220 deletions
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c
index 0d32e6541..6c59f2ff2 100644
--- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c
+++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c
@@ -335,8 +335,8 @@ int kiblnd_create_peer(lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid)
peer->ibp_nid = nid;
peer->ibp_error = 0;
peer->ibp_last_alive = 0;
- peer->ibp_max_frags = IBLND_CFG_RDMA_FRAGS;
- peer->ibp_queue_depth = *kiblnd_tunables.kib_peertxcredits;
+ peer->ibp_max_frags = kiblnd_cfg_rdma_frags(peer->ibp_ni);
+ peer->ibp_queue_depth = ni->ni_peertxcredits;
atomic_set(&peer->ibp_refcount, 1); /* 1 ref for caller */
INIT_LIST_HEAD(&peer->ibp_list); /* not in the peer table yet */
@@ -1283,65 +1283,86 @@ static void kiblnd_map_tx_pool(kib_tx_pool_t *tpo)
}
}
-struct ib_mr *kiblnd_find_rd_dma_mr(kib_hca_dev_t *hdev, kib_rdma_desc_t *rd,
+struct ib_mr *kiblnd_find_rd_dma_mr(struct lnet_ni *ni, kib_rdma_desc_t *rd,
int negotiated_nfrags)
{
- __u16 nfrags = (negotiated_nfrags != -1) ?
- negotiated_nfrags : *kiblnd_tunables.kib_map_on_demand;
+ kib_net_t *net = ni->ni_data;
+ kib_hca_dev_t *hdev = net->ibn_dev->ibd_hdev;
+ struct lnet_ioctl_config_o2iblnd_tunables *tunables;
+ __u16 nfrags;
+ int mod;
+
+ tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib;
+ mod = tunables->lnd_map_on_demand;
+ nfrags = (negotiated_nfrags != -1) ? negotiated_nfrags : mod;
LASSERT(hdev->ibh_mrs);
- if (*kiblnd_tunables.kib_map_on_demand > 0 &&
- nfrags <= rd->rd_nfrags)
+ if (mod > 0 && nfrags <= rd->rd_nfrags)
return NULL;
return hdev->ibh_mrs;
}
-static void kiblnd_destroy_fmr_pool(kib_fmr_pool_t *pool)
+static void kiblnd_destroy_fmr_pool(kib_fmr_pool_t *fpo)
{
- LASSERT(!pool->fpo_map_count);
+ LASSERT(!fpo->fpo_map_count);
- if (pool->fpo_fmr_pool)
- ib_destroy_fmr_pool(pool->fpo_fmr_pool);
+ if (fpo->fpo_is_fmr) {
+ if (fpo->fmr.fpo_fmr_pool)
+ ib_destroy_fmr_pool(fpo->fmr.fpo_fmr_pool);
+ } else {
+ struct kib_fast_reg_descriptor *frd, *tmp;
+ int i = 0;
+
+ list_for_each_entry_safe(frd, tmp, &fpo->fast_reg.fpo_pool_list,
+ frd_list) {
+ list_del(&frd->frd_list);
+ ib_dereg_mr(frd->frd_mr);
+ LIBCFS_FREE(frd, sizeof(*frd));
+ i++;
+ }
+ if (i < fpo->fast_reg.fpo_pool_size)
+ CERROR("FastReg pool still has %d regions registered\n",
+ fpo->fast_reg.fpo_pool_size - i);
+ }
- if (pool->fpo_hdev)
- kiblnd_hdev_decref(pool->fpo_hdev);
+ if (fpo->fpo_hdev)
+ kiblnd_hdev_decref(fpo->fpo_hdev);
- LIBCFS_FREE(pool, sizeof(*pool));
+ LIBCFS_FREE(fpo, sizeof(*fpo));
}
static void kiblnd_destroy_fmr_pool_list(struct list_head *head)
{
- kib_fmr_pool_t *pool;
+ kib_fmr_pool_t *fpo, *tmp;
- while (!list_empty(head)) {
- pool = list_entry(head->next, kib_fmr_pool_t, fpo_list);
- list_del(&pool->fpo_list);
- kiblnd_destroy_fmr_pool(pool);
+ list_for_each_entry_safe(fpo, tmp, head, fpo_list) {
+ list_del(&fpo->fpo_list);
+ kiblnd_destroy_fmr_pool(fpo);
}
}
-static int kiblnd_fmr_pool_size(int ncpts)
+static int
+kiblnd_fmr_pool_size(struct lnet_ioctl_config_o2iblnd_tunables *tunables,
+ int ncpts)
{
- int size = *kiblnd_tunables.kib_fmr_pool_size / ncpts;
+ int size = tunables->lnd_fmr_pool_size / ncpts;
return max(IBLND_FMR_POOL, size);
}
-static int kiblnd_fmr_flush_trigger(int ncpts)
+static int
+kiblnd_fmr_flush_trigger(struct lnet_ioctl_config_o2iblnd_tunables *tunables,
+ int ncpts)
{
- int size = *kiblnd_tunables.kib_fmr_flush_trigger / ncpts;
+ int size = tunables->lnd_fmr_flush_trigger / ncpts;
return max(IBLND_FMR_POOL_FLUSH, size);
}
-static int kiblnd_create_fmr_pool(kib_fmr_poolset_t *fps,
- kib_fmr_pool_t **pp_fpo)
+static int kiblnd_alloc_fmr_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t *fpo)
{
- /* FMR pool for RDMA */
- kib_dev_t *dev = fps->fps_net->ibn_dev;
- kib_fmr_pool_t *fpo;
struct ib_fmr_pool_param param = {
.max_pages_per_fmr = LNET_MAX_PAYLOAD / PAGE_SIZE,
.page_shift = PAGE_SHIFT,
@@ -1351,7 +1372,78 @@ static int kiblnd_create_fmr_pool(kib_fmr_poolset_t *fps,
.dirty_watermark = fps->fps_flush_trigger,
.flush_function = NULL,
.flush_arg = NULL,
- .cache = !!*kiblnd_tunables.kib_fmr_cache};
+ .cache = !!fps->fps_cache };
+ int rc = 0;
+
+ fpo->fmr.fpo_fmr_pool = ib_create_fmr_pool(fpo->fpo_hdev->ibh_pd,
+ &param);
+ if (IS_ERR(fpo->fmr.fpo_fmr_pool)) {
+ rc = PTR_ERR(fpo->fmr.fpo_fmr_pool);
+ if (rc != -ENOSYS)
+ CERROR("Failed to create FMR pool: %d\n", rc);
+ else
+ CERROR("FMRs are not supported\n");
+ }
+
+ return rc;
+}
+
+static int kiblnd_alloc_freg_pool(kib_fmr_poolset_t *fps, kib_fmr_pool_t *fpo)
+{
+ struct kib_fast_reg_descriptor *frd, *tmp;
+ int i, rc;
+
+ INIT_LIST_HEAD(&fpo->fast_reg.fpo_pool_list);
+ fpo->fast_reg.fpo_pool_size = 0;
+ for (i = 0; i < fps->fps_pool_size; i++) {
+ LIBCFS_CPT_ALLOC(frd, lnet_cpt_table(), fps->fps_cpt,
+ sizeof(*frd));
+ if (!frd) {
+ CERROR("Failed to allocate a new fast_reg descriptor\n");
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ frd->frd_mr = ib_alloc_mr(fpo->fpo_hdev->ibh_pd,
+ IB_MR_TYPE_MEM_REG,
+ LNET_MAX_PAYLOAD / PAGE_SIZE);
+ if (IS_ERR(frd->frd_mr)) {
+ rc = PTR_ERR(frd->frd_mr);
+ CERROR("Failed to allocate ib_alloc_mr: %d\n", rc);
+ frd->frd_mr = NULL;
+ goto out_middle;
+ }
+
+ frd->frd_valid = true;
+
+ list_add_tail(&frd->frd_list, &fpo->fast_reg.fpo_pool_list);
+ fpo->fast_reg.fpo_pool_size++;
+ }
+
+ return 0;
+
+out_middle:
+ if (frd->frd_mr)
+ ib_dereg_mr(frd->frd_mr);
+ LIBCFS_FREE(frd, sizeof(*frd));
+
+out:
+ list_for_each_entry_safe(frd, tmp, &fpo->fast_reg.fpo_pool_list,
+ frd_list) {
+ list_del(&frd->frd_list);
+ ib_dereg_mr(frd->frd_mr);
+ LIBCFS_FREE(frd, sizeof(*frd));
+ }
+
+ return rc;
+}
+
+static int kiblnd_create_fmr_pool(kib_fmr_poolset_t *fps,
+ kib_fmr_pool_t **pp_fpo)
+{
+ kib_dev_t *dev = fps->fps_net->ibn_dev;
+ struct ib_device_attr *dev_attr;
+ kib_fmr_pool_t *fpo;
int rc;
LIBCFS_CPT_ALLOC(fpo, lnet_cpt_table(), fps->fps_cpt, sizeof(*fpo));
@@ -1359,22 +1451,41 @@ static int kiblnd_create_fmr_pool(kib_fmr_poolset_t *fps,
return -ENOMEM;
fpo->fpo_hdev = kiblnd_current_hdev(dev);
-
- fpo->fpo_fmr_pool = ib_create_fmr_pool(fpo->fpo_hdev->ibh_pd, &param);
- if (IS_ERR(fpo->fpo_fmr_pool)) {
- rc = PTR_ERR(fpo->fpo_fmr_pool);
- CERROR("Failed to create FMR pool: %d\n", rc);
-
- kiblnd_hdev_decref(fpo->fpo_hdev);
- LIBCFS_FREE(fpo, sizeof(*fpo));
- return rc;
+ dev_attr = &fpo->fpo_hdev->ibh_ibdev->attrs;
+
+ /* Check for FMR or FastReg support */
+ fpo->fpo_is_fmr = 0;
+ if (fpo->fpo_hdev->ibh_ibdev->alloc_fmr &&
+ fpo->fpo_hdev->ibh_ibdev->dealloc_fmr &&
+ fpo->fpo_hdev->ibh_ibdev->map_phys_fmr &&
+ fpo->fpo_hdev->ibh_ibdev->unmap_fmr) {
+ LCONSOLE_INFO("Using FMR for registration\n");
+ fpo->fpo_is_fmr = 1;
+ } else if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
+ LCONSOLE_INFO("Using FastReg for registration\n");
+ } else {
+ rc = -ENOSYS;
+ LCONSOLE_ERROR_MSG(rc, "IB device does not support FMRs nor FastRegs, can't register memory\n");
+ goto out_fpo;
}
+ if (fpo->fpo_is_fmr)
+ rc = kiblnd_alloc_fmr_pool(fps, fpo);
+ else
+ rc = kiblnd_alloc_freg_pool(fps, fpo);
+ if (rc)
+ goto out_fpo;
+
fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
- fpo->fpo_owner = fps;
+ fpo->fpo_owner = fps;
*pp_fpo = fpo;
return 0;
+
+out_fpo:
+ kiblnd_hdev_decref(fpo->fpo_hdev);
+ LIBCFS_FREE(fpo, sizeof(*fpo));
+ return rc;
}
static void kiblnd_fail_fmr_poolset(kib_fmr_poolset_t *fps,
@@ -1407,9 +1518,10 @@ static void kiblnd_fini_fmr_poolset(kib_fmr_poolset_t *fps)
}
}
-static int kiblnd_init_fmr_poolset(kib_fmr_poolset_t *fps, int cpt,
- kib_net_t *net, int pool_size,
- int flush_trigger)
+static int
+kiblnd_init_fmr_poolset(kib_fmr_poolset_t *fps, int cpt, int ncpts,
+ kib_net_t *net,
+ struct lnet_ioctl_config_o2iblnd_tunables *tunables)
{
kib_fmr_pool_t *fpo;
int rc;
@@ -1418,8 +1530,11 @@ static int kiblnd_init_fmr_poolset(kib_fmr_poolset_t *fps, int cpt,
fps->fps_net = net;
fps->fps_cpt = cpt;
- fps->fps_pool_size = pool_size;
- fps->fps_flush_trigger = flush_trigger;
+
+ fps->fps_pool_size = kiblnd_fmr_pool_size(tunables, ncpts);
+ fps->fps_flush_trigger = kiblnd_fmr_flush_trigger(tunables, ncpts);
+ fps->fps_cache = tunables->lnd_fmr_cache;
+
spin_lock_init(&fps->fps_lock);
INIT_LIST_HEAD(&fps->fps_pool_list);
INIT_LIST_HEAD(&fps->fps_failed_pool_list);
@@ -1440,25 +1555,64 @@ static int kiblnd_fmr_pool_is_idle(kib_fmr_pool_t *fpo, unsigned long now)
return cfs_time_aftereq(now, fpo->fpo_deadline);
}
+static int
+kiblnd_map_tx_pages(kib_tx_t *tx, kib_rdma_desc_t *rd)
+{
+ __u64 *pages = tx->tx_pages;
+ kib_hca_dev_t *hdev;
+ int npages;
+ int size;
+ int i;
+
+ hdev = tx->tx_pool->tpo_hdev;
+
+ for (i = 0, npages = 0; i < rd->rd_nfrags; i++) {
+ for (size = 0; size < rd->rd_frags[i].rf_nob;
+ size += hdev->ibh_page_size) {
+ pages[npages++] = (rd->rd_frags[i].rf_addr &
+ hdev->ibh_page_mask) + size;
+ }
+ }
+
+ return npages;
+}
+
void kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status)
{
LIST_HEAD(zombies);
kib_fmr_pool_t *fpo = fmr->fmr_pool;
- kib_fmr_poolset_t *fps = fpo->fpo_owner;
+ kib_fmr_poolset_t *fps;
unsigned long now = cfs_time_current();
kib_fmr_pool_t *tmp;
int rc;
- rc = ib_fmr_pool_unmap(fmr->fmr_pfmr);
- LASSERT(!rc);
+ if (!fpo)
+ return;
- if (status) {
- rc = ib_flush_fmr_pool(fpo->fpo_fmr_pool);
- LASSERT(!rc);
- }
+ fps = fpo->fpo_owner;
+ if (fpo->fpo_is_fmr) {
+ if (fmr->fmr_pfmr) {
+ rc = ib_fmr_pool_unmap(fmr->fmr_pfmr);
+ LASSERT(!rc);
+ fmr->fmr_pfmr = NULL;
+ }
+
+ if (status) {
+ rc = ib_flush_fmr_pool(fpo->fmr.fpo_fmr_pool);
+ LASSERT(!rc);
+ }
+ } else {
+ struct kib_fast_reg_descriptor *frd = fmr->fmr_frd;
+ if (frd) {
+ frd->frd_valid = false;
+ spin_lock(&fps->fps_lock);
+ list_add_tail(&frd->frd_list, &fpo->fast_reg.fpo_pool_list);
+ spin_unlock(&fps->fps_lock);
+ fmr->fmr_frd = NULL;
+ }
+ }
fmr->fmr_pool = NULL;
- fmr->fmr_pfmr = NULL;
spin_lock(&fps->fps_lock);
fpo->fpo_map_count--; /* decref the pool */
@@ -1479,11 +1633,15 @@ void kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status)
kiblnd_destroy_fmr_pool_list(&zombies);
}
-int kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, __u64 *pages, int npages,
- __u64 iov, kib_fmr_t *fmr)
+int kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, kib_tx_t *tx,
+ kib_rdma_desc_t *rd, __u32 nob, __u64 iov,
+ kib_fmr_t *fmr)
{
- struct ib_pool_fmr *pfmr;
+ __u64 *pages = tx->tx_pages;
+ bool is_rx = (rd != tx->tx_rd);
+ bool tx_pages_mapped = 0;
kib_fmr_pool_t *fpo;
+ int npages = 0;
__u64 version;
int rc;
@@ -1493,21 +1651,95 @@ int kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, __u64 *pages, int npages,
list_for_each_entry(fpo, &fps->fps_pool_list, fpo_list) {
fpo->fpo_deadline = cfs_time_shift(IBLND_POOL_DEADLINE);
fpo->fpo_map_count++;
- spin_unlock(&fps->fps_lock);
- pfmr = ib_fmr_pool_map_phys(fpo->fpo_fmr_pool,
- pages, npages, iov);
- if (likely(!IS_ERR(pfmr))) {
- fmr->fmr_pool = fpo;
- fmr->fmr_pfmr = pfmr;
- return 0;
+ if (fpo->fpo_is_fmr) {
+ struct ib_pool_fmr *pfmr;
+
+ spin_unlock(&fps->fps_lock);
+
+ if (!tx_pages_mapped) {
+ npages = kiblnd_map_tx_pages(tx, rd);
+ tx_pages_mapped = 1;
+ }
+
+ pfmr = ib_fmr_pool_map_phys(fpo->fmr.fpo_fmr_pool,
+ pages, npages, iov);
+ if (likely(!IS_ERR(pfmr))) {
+ fmr->fmr_key = is_rx ? pfmr->fmr->rkey :
+ pfmr->fmr->lkey;
+ fmr->fmr_frd = NULL;
+ fmr->fmr_pfmr = pfmr;
+ fmr->fmr_pool = fpo;
+ return 0;
+ }
+ rc = PTR_ERR(pfmr);
+ } else {
+ if (!list_empty(&fpo->fast_reg.fpo_pool_list)) {
+ struct kib_fast_reg_descriptor *frd;
+ struct ib_reg_wr *wr;
+ struct ib_mr *mr;
+ int n;
+
+ frd = list_first_entry(&fpo->fast_reg.fpo_pool_list,
+ struct kib_fast_reg_descriptor,
+ frd_list);
+ list_del(&frd->frd_list);
+ spin_unlock(&fps->fps_lock);
+
+ mr = frd->frd_mr;
+
+ if (!frd->frd_valid) {
+ __u32 key = is_rx ? mr->rkey : mr->lkey;
+ struct ib_send_wr *inv_wr;
+
+ inv_wr = &frd->frd_inv_wr;
+ memset(inv_wr, 0, sizeof(*inv_wr));
+ inv_wr->opcode = IB_WR_LOCAL_INV;
+ inv_wr->wr_id = IBLND_WID_MR;
+ inv_wr->ex.invalidate_rkey = key;
+
+ /* Bump the key */
+ key = ib_inc_rkey(key);
+ ib_update_fast_reg_key(mr, key);
+ }
+
+ n = ib_map_mr_sg(mr, tx->tx_frags,
+ tx->tx_nfrags, NULL, PAGE_SIZE);
+ if (unlikely(n != tx->tx_nfrags)) {
+ CERROR("Failed to map mr %d/%d elements\n",
+ n, tx->tx_nfrags);
+ return n < 0 ? n : -EINVAL;
+ }
+
+ mr->iova = iov;
+
+ /* Prepare FastReg WR */
+ wr = &frd->frd_fastreg_wr;
+ memset(wr, 0, sizeof(*wr));
+ wr->wr.opcode = IB_WR_REG_MR;
+ wr->wr.wr_id = IBLND_WID_MR;
+ wr->wr.num_sge = 0;
+ wr->wr.send_flags = 0;
+ wr->mr = mr;
+ wr->key = is_rx ? mr->rkey : mr->lkey;
+ wr->access = (IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_WRITE);
+
+ fmr->fmr_key = is_rx ? mr->rkey : mr->lkey;
+ fmr->fmr_frd = frd;
+ fmr->fmr_pfmr = NULL;
+ fmr->fmr_pool = fpo;
+ return 0;
+ }
+ spin_unlock(&fps->fps_lock);
+ rc = -EBUSY;
}
spin_lock(&fps->fps_lock);
fpo->fpo_map_count--;
- if (PTR_ERR(pfmr) != -EAGAIN) {
+ if (rc != -EAGAIN) {
spin_unlock(&fps->fps_lock);
- return PTR_ERR(pfmr);
+ return rc;
}
/* EAGAIN and ... */
@@ -1932,25 +2164,28 @@ static void kiblnd_net_fini_pools(kib_net_t *net)
}
}
-static int kiblnd_net_init_pools(kib_net_t *net, __u32 *cpts, int ncpts)
+static int kiblnd_net_init_pools(kib_net_t *net, lnet_ni_t *ni, __u32 *cpts,
+ int ncpts)
{
+ struct lnet_ioctl_config_o2iblnd_tunables *tunables;
unsigned long flags;
int cpt;
- int rc = 0;
+ int rc;
int i;
+ tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib;
+
read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
- if (!*kiblnd_tunables.kib_map_on_demand) {
+ if (!tunables->lnd_map_on_demand) {
read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
goto create_tx_pool;
}
read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
- if (*kiblnd_tunables.kib_fmr_pool_size <
- *kiblnd_tunables.kib_ntx / 4) {
+ if (tunables->lnd_fmr_pool_size < *kiblnd_tunables.kib_ntx / 4) {
CERROR("Can't set fmr pool size (%d) < ntx / 4(%d)\n",
- *kiblnd_tunables.kib_fmr_pool_size,
+ tunables->lnd_fmr_pool_size,
*kiblnd_tunables.kib_ntx / 4);
rc = -EINVAL;
goto failed;
@@ -1965,8 +2200,11 @@ static int kiblnd_net_init_pools(kib_net_t *net, __u32 *cpts, int ncpts)
/*
* premapping can fail if ibd_nmr > 1, so we always create
* FMR pool and map-on-demand if premapping failed
+ *
+ * cfs_precpt_alloc is creating an array of struct kib_fmr_poolset
+ * The number of struct kib_fmr_poolsets create is equal to the
+ * number of CPTs that exist, i.e net->ibn_fmr_ps[cpt].
*/
-
net->ibn_fmr_ps = cfs_percpt_alloc(lnet_cpt_table(),
sizeof(kib_fmr_poolset_t));
if (!net->ibn_fmr_ps) {
@@ -1977,9 +2215,8 @@ static int kiblnd_net_init_pools(kib_net_t *net, __u32 *cpts, int ncpts)
for (i = 0; i < ncpts; i++) {
cpt = !cpts ? i : cpts[i];
- rc = kiblnd_init_fmr_poolset(net->ibn_fmr_ps[cpt], cpt, net,
- kiblnd_fmr_pool_size(ncpts),
- kiblnd_fmr_flush_trigger(ncpts));
+ rc = kiblnd_init_fmr_poolset(net->ibn_fmr_ps[cpt], cpt, ncpts,
+ net, tunables);
if (rc) {
CERROR("Can't initialize FMR pool for CPT %d: %d\n",
cpt, rc);
@@ -1991,6 +2228,11 @@ static int kiblnd_net_init_pools(kib_net_t *net, __u32 *cpts, int ncpts)
LASSERT(i == ncpts);
create_tx_pool:
+ /*
+ * cfs_precpt_alloc is creating an array of struct kib_tx_poolset
+ * The number of struct kib_tx_poolsets create is equal to the
+ * number of CPTs that exist, i.e net->ibn_tx_ps[cpt].
+ */
net->ibn_tx_ps = cfs_percpt_alloc(lnet_cpt_table(),
sizeof(kib_tx_poolset_t));
if (!net->ibn_tx_ps) {
@@ -2694,10 +2936,9 @@ static int kiblnd_startup(lnet_ni_t *ni)
net->ibn_incarnation = tv.tv_sec * USEC_PER_SEC +
tv.tv_nsec / NSEC_PER_USEC;
- ni->ni_peertimeout = *kiblnd_tunables.kib_peertimeout;
- ni->ni_maxtxcredits = *kiblnd_tunables.kib_credits;
- ni->ni_peertxcredits = *kiblnd_tunables.kib_peertxcredits;
- ni->ni_peerrtrcredits = *kiblnd_tunables.kib_peerrtrcredits;
+ rc = kiblnd_tunables_setup(ni);
+ if (rc)
+ goto net_failed;
if (ni->ni_interfaces[0]) {
/* Use the IPoIB interface specified in 'networks=' */
@@ -2736,7 +2977,7 @@ static int kiblnd_startup(lnet_ni_t *ni)
if (rc)
goto failed;
- rc = kiblnd_net_init_pools(net, ni->ni_cpts, ni->ni_ncpts);
+ rc = kiblnd_net_init_pools(net, ni, ni->ni_cpts, ni->ni_ncpts);
if (rc) {
CERROR("Failed to initialize NI pools: %d\n", rc);
goto failed;
@@ -2779,8 +3020,6 @@ static void __exit ko2iblnd_exit(void)
static int __init ko2iblnd_init(void)
{
- int rc;
-
CLASSERT(sizeof(kib_msg_t) <= IBLND_MSG_SIZE);
CLASSERT(offsetof(kib_msg_t,
ibm_u.get.ibgm_rd.rd_frags[IBLND_MAX_RDMA_FRAGS])
@@ -2789,9 +3028,7 @@ static int __init ko2iblnd_init(void)
ibm_u.putack.ibpam_rd.rd_frags[IBLND_MAX_RDMA_FRAGS])
<= IBLND_MSG_SIZE);
- rc = kiblnd_tunables_init();
- if (rc)
- return rc;
+ kiblnd_tunables_init();
lnet_register_lnd(&the_o2iblnd);
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h
index bfcbdd167..b22984fd9 100644
--- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h
+++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.h
@@ -87,22 +87,10 @@ typedef struct {
int *kib_timeout; /* comms timeout (seconds) */
int *kib_keepalive; /* keepalive timeout (seconds) */
int *kib_ntx; /* # tx descs */
- int *kib_credits; /* # concurrent sends */
- int *kib_peertxcredits; /* # concurrent sends to 1 peer */
- int *kib_peerrtrcredits; /* # per-peer router buffer credits */
- int *kib_peercredits_hiw; /* # when eagerly to return credits */
- int *kib_peertimeout; /* seconds to consider peer dead */
char **kib_default_ipif; /* default IPoIB interface */
int *kib_retry_count;
int *kib_rnr_retry_count;
- int *kib_concurrent_sends; /* send work queue sizing */
int *kib_ib_mtu; /* IB MTU */
- int *kib_map_on_demand; /* map-on-demand if RD has more */
- /* fragments than this value, 0 */
- /* disable map-on-demand */
- int *kib_fmr_pool_size; /* # FMRs in pool */
- int *kib_fmr_flush_trigger; /* When to trigger FMR flush */
- int *kib_fmr_cache; /* enable FMR pool cache? */
int *kib_require_priv_port; /* accept only privileged ports */
int *kib_use_priv_port; /* use privileged port for active connect */
int *kib_nscheds; /* # threads on each CPT */
@@ -116,43 +104,21 @@ extern kib_tunables_t kiblnd_tunables;
#define IBLND_CREDITS_DEFAULT 8 /* default # of peer credits */
#define IBLND_CREDITS_MAX ((typeof(((kib_msg_t *) 0)->ibm_credits)) - 1) /* Max # of peer credits */
-#define IBLND_MSG_QUEUE_SIZE(v) ((v) == IBLND_MSG_VERSION_1 ? \
- IBLND_MSG_QUEUE_SIZE_V1 : \
- *kiblnd_tunables.kib_peertxcredits) /* # messages/RDMAs in-flight */
-#define IBLND_CREDITS_HIGHWATER(v) ((v) == IBLND_MSG_VERSION_1 ? \
- IBLND_CREDIT_HIGHWATER_V1 : \
- *kiblnd_tunables.kib_peercredits_hiw) /* when eagerly to return credits */
+/* when eagerly to return credits */
+#define IBLND_CREDITS_HIGHWATER(t, v) ((v) == IBLND_MSG_VERSION_1 ? \
+ IBLND_CREDIT_HIGHWATER_V1 : \
+ t->lnd_peercredits_hiw)
#define kiblnd_rdma_create_id(cb, dev, ps, qpt) rdma_create_id(&init_net, \
cb, dev, \
ps, qpt)
-static inline int
-kiblnd_concurrent_sends_v1(void)
-{
- if (*kiblnd_tunables.kib_concurrent_sends > IBLND_MSG_QUEUE_SIZE_V1 * 2)
- return IBLND_MSG_QUEUE_SIZE_V1 * 2;
-
- if (*kiblnd_tunables.kib_concurrent_sends < IBLND_MSG_QUEUE_SIZE_V1 / 2)
- return IBLND_MSG_QUEUE_SIZE_V1 / 2;
-
- return *kiblnd_tunables.kib_concurrent_sends;
-}
-
-#define IBLND_CONCURRENT_SENDS(v) ((v) == IBLND_MSG_VERSION_1 ? \
- kiblnd_concurrent_sends_v1() : \
- *kiblnd_tunables.kib_concurrent_sends)
/* 2 OOB shall suffice for 1 keepalive and 1 returning credits */
#define IBLND_OOB_CAPABLE(v) ((v) != IBLND_MSG_VERSION_1)
#define IBLND_OOB_MSGS(v) (IBLND_OOB_CAPABLE(v) ? 2 : 0)
#define IBLND_MSG_SIZE (4 << 10) /* max size of queued messages (inc hdr) */
#define IBLND_MAX_RDMA_FRAGS LNET_MAX_IOV /* max # of fragments supported */
-#define IBLND_CFG_RDMA_FRAGS (*kiblnd_tunables.kib_map_on_demand ? \
- *kiblnd_tunables.kib_map_on_demand : \
- IBLND_MAX_RDMA_FRAGS) /* max # of fragments configured by user */
-#define IBLND_RDMA_FRAGS(v) ((v) == IBLND_MSG_VERSION_1 ? \
- IBLND_MAX_RDMA_FRAGS : IBLND_CFG_RDMA_FRAGS)
/************************/
/* derived constants... */
@@ -171,7 +137,8 @@ kiblnd_concurrent_sends_v1(void)
/* WRs and CQEs (per connection) */
#define IBLND_RECV_WRS(c) IBLND_RX_MSGS(c)
#define IBLND_SEND_WRS(c) \
- ((c->ibc_max_frags + 1) * IBLND_CONCURRENT_SENDS(c->ibc_version))
+ ((c->ibc_max_frags + 1) * kiblnd_concurrent_sends(c->ibc_version, \
+ c->ibc_peer->ibp_ni))
#define IBLND_CQ_ENTRIES(c) (IBLND_RECV_WRS(c) + IBLND_SEND_WRS(c))
struct kib_hca_dev;
@@ -286,24 +253,44 @@ typedef struct {
int fps_cpt; /* CPT id */
int fps_pool_size;
int fps_flush_trigger;
+ int fps_cache;
int fps_increasing; /* is allocating new pool */
unsigned long fps_next_retry; /* time stamp for retry if*/
/* failed to allocate */
} kib_fmr_poolset_t;
+struct kib_fast_reg_descriptor { /* For fast registration */
+ struct list_head frd_list;
+ struct ib_send_wr frd_inv_wr;
+ struct ib_reg_wr frd_fastreg_wr;
+ struct ib_mr *frd_mr;
+ bool frd_valid;
+};
+
typedef struct {
struct list_head fpo_list; /* chain on pool list */
struct kib_hca_dev *fpo_hdev; /* device for this pool */
kib_fmr_poolset_t *fpo_owner; /* owner of this pool */
- struct ib_fmr_pool *fpo_fmr_pool; /* IB FMR pool */
+ union {
+ struct {
+ struct ib_fmr_pool *fpo_fmr_pool; /* IB FMR pool */
+ } fmr;
+ struct { /* For fast registration */
+ struct list_head fpo_pool_list;
+ int fpo_pool_size;
+ } fast_reg;
+ };
unsigned long fpo_deadline; /* deadline of this pool */
int fpo_failed; /* fmr pool is failed */
int fpo_map_count; /* # of mapped FMR */
+ int fpo_is_fmr;
} kib_fmr_pool_t;
typedef struct {
- struct ib_pool_fmr *fmr_pfmr; /* IB pool fmr */
- kib_fmr_pool_t *fmr_pool; /* pool of FMR */
+ kib_fmr_pool_t *fmr_pool; /* pool of FMR */
+ struct ib_pool_fmr *fmr_pfmr; /* IB pool fmr */
+ struct kib_fast_reg_descriptor *fmr_frd;
+ u32 fmr_key;
} kib_fmr_t;
typedef struct kib_net {
@@ -615,6 +602,48 @@ extern kib_data_t kiblnd_data;
void kiblnd_hdev_destroy(kib_hca_dev_t *hdev);
+int kiblnd_msg_queue_size(int version, struct lnet_ni *ni);
+
+/* max # of fragments configured by user */
+static inline int
+kiblnd_cfg_rdma_frags(struct lnet_ni *ni)
+{
+ struct lnet_ioctl_config_o2iblnd_tunables *tunables;
+ int mod;
+
+ tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib;
+ mod = tunables->lnd_map_on_demand;
+ return mod ? mod : IBLND_MAX_RDMA_FRAGS;
+}
+
+static inline int
+kiblnd_rdma_frags(int version, struct lnet_ni *ni)
+{
+ return version == IBLND_MSG_VERSION_1 ?
+ IBLND_MAX_RDMA_FRAGS :
+ kiblnd_cfg_rdma_frags(ni);
+}
+
+static inline int
+kiblnd_concurrent_sends(int version, struct lnet_ni *ni)
+{
+ struct lnet_ioctl_config_o2iblnd_tunables *tunables;
+ int concurrent_sends;
+
+ tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib;
+ concurrent_sends = tunables->lnd_concurrent_sends;
+
+ if (version == IBLND_MSG_VERSION_1) {
+ if (concurrent_sends > IBLND_MSG_QUEUE_SIZE_V1 * 2)
+ return IBLND_MSG_QUEUE_SIZE_V1 * 2;
+
+ if (concurrent_sends < IBLND_MSG_QUEUE_SIZE_V1 / 2)
+ return IBLND_MSG_QUEUE_SIZE_V1 / 2;
+ }
+
+ return concurrent_sends;
+}
+
static inline void
kiblnd_hdev_addref_locked(kib_hca_dev_t *hdev)
{
@@ -737,10 +766,14 @@ kiblnd_send_keepalive(kib_conn_t *conn)
static inline int
kiblnd_need_noop(kib_conn_t *conn)
{
+ struct lnet_ioctl_config_o2iblnd_tunables *tunables;
+ lnet_ni_t *ni = conn->ibc_peer->ibp_ni;
+
LASSERT(conn->ibc_state >= IBLND_CONN_ESTABLISHED);
+ tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib;
if (conn->ibc_outstanding_credits <
- IBLND_CREDITS_HIGHWATER(conn->ibc_version) &&
+ IBLND_CREDITS_HIGHWATER(tunables, conn->ibc_version) &&
!kiblnd_send_keepalive(conn))
return 0; /* No need to send NOOP */
@@ -799,7 +832,8 @@ kiblnd_queue2str(kib_conn_t *conn, struct list_head *q)
#define IBLND_WID_TX 1
#define IBLND_WID_RX 2
#define IBLND_WID_RDMA 3
-#define IBLND_WID_MASK 3UL
+#define IBLND_WID_MR 4
+#define IBLND_WID_MASK 7UL
static inline __u64
kiblnd_ptr2wreqid(void *ptr, int type)
@@ -947,20 +981,20 @@ static inline unsigned int kiblnd_sg_dma_len(struct ib_device *dev,
#define KIBLND_CONN_PARAM(e) ((e)->param.conn.private_data)
#define KIBLND_CONN_PARAM_LEN(e) ((e)->param.conn.private_data_len)
-struct ib_mr *kiblnd_find_rd_dma_mr(kib_hca_dev_t *hdev,
- kib_rdma_desc_t *rd,
+struct ib_mr *kiblnd_find_rd_dma_mr(struct lnet_ni *ni, kib_rdma_desc_t *rd,
int negotiated_nfrags);
void kiblnd_map_rx_descs(kib_conn_t *conn);
void kiblnd_unmap_rx_descs(kib_conn_t *conn);
void kiblnd_pool_free_node(kib_pool_t *pool, struct list_head *node);
struct list_head *kiblnd_pool_alloc_node(kib_poolset_t *ps);
-int kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, __u64 *pages,
- int npages, __u64 iov, kib_fmr_t *fmr);
+int kiblnd_fmr_pool_map(kib_fmr_poolset_t *fps, kib_tx_t *tx,
+ kib_rdma_desc_t *rd, __u32 nob, __u64 iov,
+ kib_fmr_t *fmr);
void kiblnd_fmr_pool_unmap(kib_fmr_t *fmr, int status);
-int kiblnd_tunables_init(void);
-void kiblnd_tunables_fini(void);
+int kiblnd_tunables_setup(struct lnet_ni *ni);
+void kiblnd_tunables_init(void);
int kiblnd_connd(void *arg);
int kiblnd_scheduler(void *arg);
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c
index 2323e8d3a..845e49a52 100644
--- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c
+++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_cb.c
@@ -561,36 +561,23 @@ kiblnd_kvaddr_to_page(unsigned long vaddr)
}
static int
-kiblnd_fmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, int nob)
+kiblnd_fmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, __u32 nob)
{
kib_hca_dev_t *hdev;
- __u64 *pages = tx->tx_pages;
kib_fmr_poolset_t *fps;
- int npages;
- int size;
int cpt;
int rc;
- int i;
LASSERT(tx->tx_pool);
LASSERT(tx->tx_pool->tpo_pool.po_owner);
hdev = tx->tx_pool->tpo_hdev;
-
- for (i = 0, npages = 0; i < rd->rd_nfrags; i++) {
- for (size = 0; size < rd->rd_frags[i].rf_nob;
- size += hdev->ibh_page_size) {
- pages[npages++] = (rd->rd_frags[i].rf_addr &
- hdev->ibh_page_mask) + size;
- }
- }
-
cpt = tx->tx_pool->tpo_pool.po_owner->ps_cpt;
fps = net->ibn_fmr_ps[cpt];
- rc = kiblnd_fmr_pool_map(fps, pages, npages, 0, &tx->fmr);
+ rc = kiblnd_fmr_pool_map(fps, tx, rd, nob, 0, &tx->fmr);
if (rc) {
- CERROR("Can't map %d pages: %d\n", npages, rc);
+ CERROR("Can't map %u bytes: %d\n", nob, rc);
return rc;
}
@@ -598,8 +585,7 @@ kiblnd_fmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, int nob)
* If rd is not tx_rd, it's going to get sent to a peer, who will need
* the rkey
*/
- rd->rd_key = (rd != tx->tx_rd) ? tx->fmr.fmr_pfmr->fmr->rkey :
- tx->fmr.fmr_pfmr->fmr->lkey;
+ rd->rd_key = tx->fmr.fmr_key;
rd->rd_frags[0].rf_addr &= ~hdev->ibh_page_mask;
rd->rd_frags[0].rf_nob = nob;
rd->rd_nfrags = 1;
@@ -613,10 +599,8 @@ static void kiblnd_unmap_tx(lnet_ni_t *ni, kib_tx_t *tx)
LASSERT(net);
- if (net->ibn_fmr_ps && tx->fmr.fmr_pfmr) {
+ if (net->ibn_fmr_ps)
kiblnd_fmr_pool_unmap(&tx->fmr, tx->tx_status);
- tx->fmr.fmr_pfmr = NULL;
- }
if (tx->tx_nfrags) {
kiblnd_dma_unmap_sg(tx->tx_pool->tpo_hdev->ibh_ibdev,
@@ -628,8 +612,8 @@ static void kiblnd_unmap_tx(lnet_ni_t *ni, kib_tx_t *tx)
static int kiblnd_map_tx(lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd,
int nfrags)
{
- kib_hca_dev_t *hdev = tx->tx_pool->tpo_hdev;
kib_net_t *net = ni->ni_data;
+ kib_hca_dev_t *hdev = net->ibn_dev->ibd_hdev;
struct ib_mr *mr = NULL;
__u32 nob;
int i;
@@ -652,7 +636,7 @@ static int kiblnd_map_tx(lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd,
nob += rd->rd_frags[i].rf_nob;
}
- mr = kiblnd_find_rd_dma_mr(hdev, rd, tx->tx_conn ?
+ mr = kiblnd_find_rd_dma_mr(ni, rd, tx->tx_conn ?
tx->tx_conn->ibc_max_frags : -1);
if (mr) {
/* found pre-mapping MR */
@@ -704,7 +688,7 @@ kiblnd_setup_rd_iov(lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd,
fragnob = min(fragnob, (int)PAGE_SIZE - page_offset);
sg_set_page(sg, page, fragnob, page_offset);
- sg++;
+ sg = sg_next(sg);
if (offset + fragnob < iov->iov_len) {
offset += fragnob;
@@ -748,7 +732,7 @@ kiblnd_setup_rd_kiov(lnet_ni_t *ni, kib_tx_t *tx, kib_rdma_desc_t *rd,
sg_set_page(sg, kiov->kiov_page, fragnob,
kiov->kiov_offset + offset);
- sg++;
+ sg = sg_next(sg);
offset = 0;
kiov++;
@@ -765,6 +749,7 @@ kiblnd_post_tx_locked(kib_conn_t *conn, kib_tx_t *tx, int credit)
{
kib_msg_t *msg = tx->tx_msg;
kib_peer_t *peer = conn->ibc_peer;
+ struct lnet_ni *ni = peer->ibp_ni;
int ver = conn->ibc_version;
int rc;
int done;
@@ -780,7 +765,7 @@ kiblnd_post_tx_locked(kib_conn_t *conn, kib_tx_t *tx, int credit)
LASSERT(conn->ibc_credits >= 0);
LASSERT(conn->ibc_credits <= conn->ibc_queue_depth);
- if (conn->ibc_nsends_posted == IBLND_CONCURRENT_SENDS(ver)) {
+ if (conn->ibc_nsends_posted == kiblnd_concurrent_sends(ver, ni)) {
/* tx completions outstanding... */
CDEBUG(D_NET, "%s: posted enough\n",
libcfs_nid2str(peer->ibp_nid));
@@ -851,14 +836,26 @@ kiblnd_post_tx_locked(kib_conn_t *conn, kib_tx_t *tx, int credit)
/* close_conn will launch failover */
rc = -ENETDOWN;
} else {
- struct ib_send_wr *wrq = &tx->tx_wrq[tx->tx_nwrq - 1].wr;
+ struct kib_fast_reg_descriptor *frd = tx->fmr.fmr_frd;
+ struct ib_send_wr *bad = &tx->tx_wrq[tx->tx_nwrq - 1].wr;
+ struct ib_send_wr *wrq = &tx->tx_wrq[0].wr;
+
+ if (frd) {
+ if (!frd->frd_valid) {
+ wrq = &frd->frd_inv_wr;
+ wrq->next = &frd->frd_fastreg_wr.wr;
+ } else {
+ wrq = &frd->frd_fastreg_wr.wr;
+ }
+ frd->frd_fastreg_wr.wr.next = &tx->tx_wrq[0].wr;
+ }
- LASSERTF(wrq->wr_id == kiblnd_ptr2wreqid(tx, IBLND_WID_TX),
+ LASSERTF(bad->wr_id == kiblnd_ptr2wreqid(tx, IBLND_WID_TX),
"bad wr_id %llx, opc %d, flags %d, peer: %s\n",
- wrq->wr_id, wrq->opcode, wrq->send_flags,
- libcfs_nid2str(conn->ibc_peer->ibp_nid));
- wrq = NULL;
- rc = ib_post_send(conn->ibc_cmid->qp, &tx->tx_wrq->wr, &wrq);
+ bad->wr_id, bad->opcode, bad->send_flags,
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ bad = NULL;
+ rc = ib_post_send(conn->ibc_cmid->qp, wrq, &bad);
}
conn->ibc_last_send = jiffies;
@@ -919,7 +916,7 @@ kiblnd_check_sends(kib_conn_t *conn)
spin_lock(&conn->ibc_lock);
- LASSERT(conn->ibc_nsends_posted <= IBLND_CONCURRENT_SENDS(ver));
+ LASSERT(conn->ibc_nsends_posted <= kiblnd_concurrent_sends(ver, ni));
LASSERT(!IBLND_OOB_CAPABLE(ver) ||
conn->ibc_noops_posted <= IBLND_OOB_MSGS(ver));
LASSERT(conn->ibc_reserved_credits >= 0);
@@ -1066,7 +1063,7 @@ kiblnd_init_rdma(kib_conn_t *conn, kib_tx_t *tx, int type,
kib_msg_t *ibmsg = tx->tx_msg;
kib_rdma_desc_t *srcrd = tx->tx_rd;
struct ib_sge *sge = &tx->tx_sge[0];
- struct ib_rdma_wr *wrq = &tx->tx_wrq[0], *next;
+ struct ib_rdma_wr *wrq, *next;
int rc = resid;
int srcidx = 0;
int dstidx = 0;
@@ -2333,11 +2330,11 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob)
}
if (reqmsg->ibm_u.connparams.ibcp_queue_depth >
- IBLND_MSG_QUEUE_SIZE(version)) {
+ kiblnd_msg_queue_size(version, ni)) {
CERROR("Can't accept conn from %s, queue depth too large: %d (<=%d wanted)\n",
libcfs_nid2str(nid),
reqmsg->ibm_u.connparams.ibcp_queue_depth,
- IBLND_MSG_QUEUE_SIZE(version));
+ kiblnd_msg_queue_size(version, ni));
if (version == IBLND_MSG_VERSION)
rej.ibr_why = IBLND_REJECT_MSG_QUEUE_SIZE;
@@ -2346,24 +2343,24 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob)
}
if (reqmsg->ibm_u.connparams.ibcp_max_frags >
- IBLND_RDMA_FRAGS(version)) {
+ kiblnd_rdma_frags(version, ni)) {
CWARN("Can't accept conn from %s (version %x): max_frags %d too large (%d wanted)\n",
libcfs_nid2str(nid), version,
reqmsg->ibm_u.connparams.ibcp_max_frags,
- IBLND_RDMA_FRAGS(version));
+ kiblnd_rdma_frags(version, ni));
if (version >= IBLND_MSG_VERSION)
rej.ibr_why = IBLND_REJECT_RDMA_FRAGS;
goto failed;
} else if (reqmsg->ibm_u.connparams.ibcp_max_frags <
- IBLND_RDMA_FRAGS(version) && !net->ibn_fmr_ps) {
+ kiblnd_rdma_frags(version, ni) && !net->ibn_fmr_ps) {
CWARN("Can't accept conn from %s (version %x): max_frags %d incompatible without FMR pool (%d wanted)\n",
libcfs_nid2str(nid), version,
reqmsg->ibm_u.connparams.ibcp_max_frags,
- IBLND_RDMA_FRAGS(version));
+ kiblnd_rdma_frags(version, ni));
- if (version >= IBLND_MSG_VERSION)
+ if (version == IBLND_MSG_VERSION)
rej.ibr_why = IBLND_REJECT_RDMA_FRAGS;
goto failed;
@@ -2524,12 +2521,13 @@ kiblnd_passive_connect(struct rdma_cm_id *cmid, void *priv, int priv_nob)
return 0;
failed:
- if (ni)
+ if (ni) {
lnet_ni_decref(ni);
+ rej.ibr_cp.ibcp_queue_depth = kiblnd_msg_queue_size(version, ni);
+ rej.ibr_cp.ibcp_max_frags = kiblnd_rdma_frags(version, ni);
+ }
rej.ibr_version = version;
- rej.ibr_cp.ibcp_queue_depth = IBLND_MSG_QUEUE_SIZE(version);
- rej.ibr_cp.ibcp_max_frags = IBLND_RDMA_FRAGS(version);
kiblnd_reject(cmid, &rej);
return -ECONNREFUSED;
@@ -2580,12 +2578,15 @@ kiblnd_check_reconnect(kib_conn_t *conn, int version,
reason = "Unknown";
break;
- case IBLND_REJECT_RDMA_FRAGS:
+ case IBLND_REJECT_RDMA_FRAGS: {
+ struct lnet_ioctl_config_lnd_tunables *tunables;
+
if (!cp) {
reason = "can't negotiate max frags";
goto out;
}
- if (!*kiblnd_tunables.kib_map_on_demand) {
+ tunables = peer->ibp_ni->ni_lnd_tunables;
+ if (!tunables->lt_tun_u.lt_o2ib.lnd_map_on_demand) {
reason = "map_on_demand must be enabled";
goto out;
}
@@ -2597,7 +2598,7 @@ kiblnd_check_reconnect(kib_conn_t *conn, int version,
peer->ibp_max_frags = frag_num;
reason = "rdma fragments";
break;
-
+ }
case IBLND_REJECT_MSG_QUEUE_SIZE:
if (!cp) {
reason = "can't negotiate queue depth";
@@ -3430,6 +3431,12 @@ kiblnd_complete(struct ib_wc *wc)
default:
LBUG();
+ case IBLND_WID_MR:
+ if (wc->status != IB_WC_SUCCESS &&
+ wc->status != IB_WC_WR_FLUSH_ERR)
+ CNETERR("FastReg failed: %d\n", wc->status);
+ break;
+
case IBLND_WID_RDMA:
/*
* We only get RDMA completion notification if it fails. All
diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c
index b4607dad3..f8fdd4ae3 100644
--- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c
+++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd_modparams.c
@@ -152,74 +152,135 @@ kib_tunables_t kiblnd_tunables = {
.kib_timeout = &timeout,
.kib_keepalive = &keepalive,
.kib_ntx = &ntx,
- .kib_credits = &credits,
- .kib_peertxcredits = &peer_credits,
- .kib_peercredits_hiw = &peer_credits_hiw,
- .kib_peerrtrcredits = &peer_buffer_credits,
- .kib_peertimeout = &peer_timeout,
.kib_default_ipif = &ipif_name,
.kib_retry_count = &retry_count,
.kib_rnr_retry_count = &rnr_retry_count,
- .kib_concurrent_sends = &concurrent_sends,
.kib_ib_mtu = &ib_mtu,
- .kib_map_on_demand = &map_on_demand,
- .kib_fmr_pool_size = &fmr_pool_size,
- .kib_fmr_flush_trigger = &fmr_flush_trigger,
- .kib_fmr_cache = &fmr_cache,
.kib_require_priv_port = &require_privileged_port,
.kib_use_priv_port = &use_privileged_port,
.kib_nscheds = &nscheds
};
-int
-kiblnd_tunables_init(void)
+static struct lnet_ioctl_config_o2iblnd_tunables default_tunables;
+
+/* # messages/RDMAs in-flight */
+int kiblnd_msg_queue_size(int version, lnet_ni_t *ni)
{
+ if (version == IBLND_MSG_VERSION_1)
+ return IBLND_MSG_QUEUE_SIZE_V1;
+ else if (ni)
+ return ni->ni_peertxcredits;
+ else
+ return peer_credits;
+}
+
+int kiblnd_tunables_setup(struct lnet_ni *ni)
+{
+ struct lnet_ioctl_config_o2iblnd_tunables *tunables;
+
+ /*
+ * if there was no tunables specified, setup the tunables to be
+ * defaulted
+ */
+ if (!ni->ni_lnd_tunables) {
+ LIBCFS_ALLOC(ni->ni_lnd_tunables,
+ sizeof(*ni->ni_lnd_tunables));
+ if (!ni->ni_lnd_tunables)
+ return -ENOMEM;
+
+ memcpy(&ni->ni_lnd_tunables->lt_tun_u.lt_o2ib,
+ &default_tunables, sizeof(*tunables));
+ }
+ tunables = &ni->ni_lnd_tunables->lt_tun_u.lt_o2ib;
+
+ /* Current API version */
+ tunables->lnd_version = 0;
+
if (kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu) < 0) {
CERROR("Invalid ib_mtu %d, expected 256/512/1024/2048/4096\n",
*kiblnd_tunables.kib_ib_mtu);
return -EINVAL;
}
- if (*kiblnd_tunables.kib_peertxcredits < IBLND_CREDITS_DEFAULT)
- *kiblnd_tunables.kib_peertxcredits = IBLND_CREDITS_DEFAULT;
+ if (!ni->ni_peertimeout)
+ ni->ni_peertimeout = peer_timeout;
+
+ if (!ni->ni_maxtxcredits)
+ ni->ni_maxtxcredits = credits;
+
+ if (!ni->ni_peertxcredits)
+ ni->ni_peertxcredits = peer_credits;
- if (*kiblnd_tunables.kib_peertxcredits > IBLND_CREDITS_MAX)
- *kiblnd_tunables.kib_peertxcredits = IBLND_CREDITS_MAX;
+ if (!ni->ni_peerrtrcredits)
+ ni->ni_peerrtrcredits = peer_buffer_credits;
- if (*kiblnd_tunables.kib_peertxcredits > *kiblnd_tunables.kib_credits)
- *kiblnd_tunables.kib_peertxcredits = *kiblnd_tunables.kib_credits;
+ if (ni->ni_peertxcredits < IBLND_CREDITS_DEFAULT)
+ ni->ni_peertxcredits = IBLND_CREDITS_DEFAULT;
- if (*kiblnd_tunables.kib_peercredits_hiw < *kiblnd_tunables.kib_peertxcredits / 2)
- *kiblnd_tunables.kib_peercredits_hiw = *kiblnd_tunables.kib_peertxcredits / 2;
+ if (ni->ni_peertxcredits > IBLND_CREDITS_MAX)
+ ni->ni_peertxcredits = IBLND_CREDITS_MAX;
- if (*kiblnd_tunables.kib_peercredits_hiw >= *kiblnd_tunables.kib_peertxcredits)
- *kiblnd_tunables.kib_peercredits_hiw = *kiblnd_tunables.kib_peertxcredits - 1;
+ if (ni->ni_peertxcredits > credits)
+ ni->ni_peertxcredits = credits;
- if (*kiblnd_tunables.kib_map_on_demand < 0 ||
- *kiblnd_tunables.kib_map_on_demand > IBLND_MAX_RDMA_FRAGS)
- *kiblnd_tunables.kib_map_on_demand = 0; /* disable map-on-demand */
+ if (!tunables->lnd_peercredits_hiw)
+ tunables->lnd_peercredits_hiw = peer_credits_hiw;
- if (*kiblnd_tunables.kib_map_on_demand == 1)
- *kiblnd_tunables.kib_map_on_demand = 2; /* don't make sense to create map if only one fragment */
+ if (tunables->lnd_peercredits_hiw < ni->ni_peertxcredits / 2)
+ tunables->lnd_peercredits_hiw = ni->ni_peertxcredits / 2;
- if (!*kiblnd_tunables.kib_concurrent_sends) {
- if (*kiblnd_tunables.kib_map_on_demand > 0 &&
- *kiblnd_tunables.kib_map_on_demand <= IBLND_MAX_RDMA_FRAGS / 8)
- *kiblnd_tunables.kib_concurrent_sends = (*kiblnd_tunables.kib_peertxcredits) * 2;
- else
- *kiblnd_tunables.kib_concurrent_sends = (*kiblnd_tunables.kib_peertxcredits);
+ if (tunables->lnd_peercredits_hiw >= ni->ni_peertxcredits)
+ tunables->lnd_peercredits_hiw = ni->ni_peertxcredits - 1;
+
+ if (tunables->lnd_map_on_demand < 0 ||
+ tunables->lnd_map_on_demand > IBLND_MAX_RDMA_FRAGS) {
+ /* disable map-on-demand */
+ tunables->lnd_map_on_demand = 0;
+ }
+
+ if (tunables->lnd_map_on_demand == 1) {
+ /* don't make sense to create map if only one fragment */
+ tunables->lnd_map_on_demand = 2;
+ }
+
+ if (!tunables->lnd_concurrent_sends) {
+ if (tunables->lnd_map_on_demand > 0 &&
+ tunables->lnd_map_on_demand <= IBLND_MAX_RDMA_FRAGS / 8) {
+ tunables->lnd_concurrent_sends =
+ ni->ni_peertxcredits * 2;
+ } else {
+ tunables->lnd_concurrent_sends = ni->ni_peertxcredits;
+ }
}
- if (*kiblnd_tunables.kib_concurrent_sends > *kiblnd_tunables.kib_peertxcredits * 2)
- *kiblnd_tunables.kib_concurrent_sends = *kiblnd_tunables.kib_peertxcredits * 2;
+ if (tunables->lnd_concurrent_sends > ni->ni_peertxcredits * 2)
+ tunables->lnd_concurrent_sends = ni->ni_peertxcredits * 2;
- if (*kiblnd_tunables.kib_concurrent_sends < *kiblnd_tunables.kib_peertxcredits / 2)
- *kiblnd_tunables.kib_concurrent_sends = *kiblnd_tunables.kib_peertxcredits / 2;
+ if (tunables->lnd_concurrent_sends < ni->ni_peertxcredits / 2)
+ tunables->lnd_concurrent_sends = ni->ni_peertxcredits / 2;
- if (*kiblnd_tunables.kib_concurrent_sends < *kiblnd_tunables.kib_peertxcredits) {
+ if (tunables->lnd_concurrent_sends < ni->ni_peertxcredits) {
CWARN("Concurrent sends %d is lower than message queue size: %d, performance may drop slightly.\n",
- *kiblnd_tunables.kib_concurrent_sends, *kiblnd_tunables.kib_peertxcredits);
+ tunables->lnd_concurrent_sends, ni->ni_peertxcredits);
}
+ if (!tunables->lnd_fmr_pool_size)
+ tunables->lnd_fmr_pool_size = fmr_pool_size;
+ if (!tunables->lnd_fmr_flush_trigger)
+ tunables->lnd_fmr_flush_trigger = fmr_flush_trigger;
+ if (!tunables->lnd_fmr_cache)
+ tunables->lnd_fmr_cache = fmr_cache;
+
return 0;
}
+
+void kiblnd_tunables_init(void)
+{
+ default_tunables.lnd_version = 0;
+ default_tunables.lnd_peercredits_hiw = peer_credits_hiw,
+ default_tunables.lnd_map_on_demand = map_on_demand;
+ default_tunables.lnd_concurrent_sends = concurrent_sends;
+ default_tunables.lnd_fmr_pool_size = fmr_pool_size;
+ default_tunables.lnd_fmr_flush_trigger = fmr_flush_trigger;
+ default_tunables.lnd_fmr_cache = fmr_cache;
+}