summaryrefslogtreecommitdiff
path: root/net/rds
diff options
context:
space:
mode:
Diffstat (limited to 'net/rds')
-rw-r--r--net/rds/Kconfig7
-rw-r--r--net/rds/Makefile4
-rw-r--r--net/rds/af_rds.c26
-rw-r--r--net/rds/cong.c4
-rw-r--r--net/rds/ib.c47
-rw-r--r--net/rds/ib.h37
-rw-r--r--net/rds/ib_cm.c61
-rw-r--r--net/rds/ib_fmr.c248
-rw-r--r--net/rds/ib_frmr.c376
-rw-r--r--net/rds/ib_mr.h148
-rw-r--r--net/rds/ib_rdma.c495
-rw-r--r--net/rds/ib_recv.c2
-rw-r--r--net/rds/ib_send.c6
-rw-r--r--net/rds/ib_stats.c2
-rw-r--r--net/rds/iw.c312
-rw-r--r--net/rds/iw.h398
-rw-r--r--net/rds/iw_cm.c769
-rw-r--r--net/rds/iw_rdma.c837
-rw-r--r--net/rds/iw_recv.c904
-rw-r--r--net/rds/iw_ring.c169
-rw-r--r--net/rds/iw_send.c981
-rw-r--r--net/rds/iw_stats.c95
-rw-r--r--net/rds/iw_sysctl.c123
-rw-r--r--net/rds/page.c8
-rw-r--r--net/rds/rdma_transport.c21
-rw-r--r--net/rds/rdma_transport.h5
-rw-r--r--net/rds/rds.h1
-rw-r--r--net/rds/recv.c20
-rw-r--r--net/rds/tcp.c149
-rw-r--r--net/rds/tcp.h4
-rw-r--r--net/rds/tcp_connect.c8
-rw-r--r--net/rds/tcp_listen.c54
32 files changed, 1251 insertions, 5070 deletions
diff --git a/net/rds/Kconfig b/net/rds/Kconfig
index f2c670ba7..bffde4b46 100644
--- a/net/rds/Kconfig
+++ b/net/rds/Kconfig
@@ -4,14 +4,13 @@ config RDS
depends on INET
---help---
The RDS (Reliable Datagram Sockets) protocol provides reliable,
- sequenced delivery of datagrams over Infiniband, iWARP,
- or TCP.
+ sequenced delivery of datagrams over Infiniband or TCP.
config RDS_RDMA
- tristate "RDS over Infiniband and iWARP"
+ tristate "RDS over Infiniband"
depends on RDS && INFINIBAND && INFINIBAND_ADDR_TRANS
---help---
- Allow RDS to use Infiniband and iWARP as a transport.
+ Allow RDS to use Infiniband as a transport.
This transport supports RDMA operations.
config RDS_TCP
diff --git a/net/rds/Makefile b/net/rds/Makefile
index 56d3f6023..0e72bec15 100644
--- a/net/rds/Makefile
+++ b/net/rds/Makefile
@@ -6,9 +6,7 @@ rds-y := af_rds.o bind.o cong.o connection.o info.o message.o \
obj-$(CONFIG_RDS_RDMA) += rds_rdma.o
rds_rdma-y := rdma_transport.o \
ib.o ib_cm.o ib_recv.o ib_ring.o ib_send.o ib_stats.o \
- ib_sysctl.o ib_rdma.o \
- iw.o iw_cm.o iw_recv.o iw_ring.o iw_send.o iw_stats.o \
- iw_sysctl.o iw_rdma.o
+ ib_sysctl.o ib_rdma.o ib_fmr.o ib_frmr.o
obj-$(CONFIG_RDS_TCP) += rds_tcp.o
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index b5476aebd..6beaeb113 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -277,6 +277,27 @@ static int rds_set_transport(struct rds_sock *rs, char __user *optval,
return rs->rs_transport ? 0 : -ENOPROTOOPT;
}
+static int rds_enable_recvtstamp(struct sock *sk, char __user *optval,
+ int optlen)
+{
+ int val, valbool;
+
+ if (optlen != sizeof(int))
+ return -EFAULT;
+
+ if (get_user(val, (int __user *)optval))
+ return -EFAULT;
+
+ valbool = val ? 1 : 0;
+
+ if (valbool)
+ sock_set_flag(sk, SOCK_RCVTSTAMP);
+ else
+ sock_reset_flag(sk, SOCK_RCVTSTAMP);
+
+ return 0;
+}
+
static int rds_setsockopt(struct socket *sock, int level, int optname,
char __user *optval, unsigned int optlen)
{
@@ -312,6 +333,11 @@ static int rds_setsockopt(struct socket *sock, int level, int optname,
ret = rds_set_transport(rs, optval, optlen);
release_sock(sock->sk);
break;
+ case SO_TIMESTAMP:
+ lock_sock(sock->sk);
+ ret = rds_enable_recvtstamp(sock->sk, optval, optlen);
+ release_sock(sock->sk);
+ break;
default:
ret = -ENOPROTOOPT;
}
diff --git a/net/rds/cong.c b/net/rds/cong.c
index e6144b824..6641bcf7c 100644
--- a/net/rds/cong.c
+++ b/net/rds/cong.c
@@ -299,7 +299,7 @@ void rds_cong_set_bit(struct rds_cong_map *map, __be16 port)
i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
- __set_bit_le(off, (void *)map->m_page_addrs[i]);
+ set_bit_le(off, (void *)map->m_page_addrs[i]);
}
void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port)
@@ -313,7 +313,7 @@ void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port)
i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
- __clear_bit_le(off, (void *)map->m_page_addrs[i]);
+ clear_bit_le(off, (void *)map->m_page_addrs[i]);
}
static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port)
diff --git a/net/rds/ib.c b/net/rds/ib.c
index 9481d55ff..b5342fdda 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -42,15 +42,16 @@
#include "rds.h"
#include "ib.h"
+#include "ib_mr.h"
-unsigned int rds_ib_fmr_1m_pool_size = RDS_FMR_1M_POOL_SIZE;
-unsigned int rds_ib_fmr_8k_pool_size = RDS_FMR_8K_POOL_SIZE;
+unsigned int rds_ib_mr_1m_pool_size = RDS_MR_1M_POOL_SIZE;
+unsigned int rds_ib_mr_8k_pool_size = RDS_MR_8K_POOL_SIZE;
unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT;
-module_param(rds_ib_fmr_1m_pool_size, int, 0444);
-MODULE_PARM_DESC(rds_ib_fmr_1m_pool_size, " Max number of 1M fmr per HCA");
-module_param(rds_ib_fmr_8k_pool_size, int, 0444);
-MODULE_PARM_DESC(rds_ib_fmr_8k_pool_size, " Max number of 8K fmr per HCA");
+module_param(rds_ib_mr_1m_pool_size, int, 0444);
+MODULE_PARM_DESC(rds_ib_mr_1m_pool_size, " Max number of 1M mr per HCA");
+module_param(rds_ib_mr_8k_pool_size, int, 0444);
+MODULE_PARM_DESC(rds_ib_mr_8k_pool_size, " Max number of 8K mr per HCA");
module_param(rds_ib_retry_count, int, 0444);
MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error");
@@ -139,14 +140,20 @@ static void rds_ib_add_one(struct ib_device *device)
rds_ibdev->max_wrs = device->attrs.max_qp_wr;
rds_ibdev->max_sge = min(device->attrs.max_sge, RDS_IB_MAX_SGE);
+ rds_ibdev->has_fr = (device->attrs.device_cap_flags &
+ IB_DEVICE_MEM_MGT_EXTENSIONS);
+ rds_ibdev->has_fmr = (device->alloc_fmr && device->dealloc_fmr &&
+ device->map_phys_fmr && device->unmap_fmr);
+ rds_ibdev->use_fastreg = (rds_ibdev->has_fr && !rds_ibdev->has_fmr);
+
rds_ibdev->fmr_max_remaps = device->attrs.max_map_per_fmr?: 32;
- rds_ibdev->max_1m_fmrs = device->attrs.max_mr ?
+ rds_ibdev->max_1m_mrs = device->attrs.max_mr ?
min_t(unsigned int, (device->attrs.max_mr / 2),
- rds_ib_fmr_1m_pool_size) : rds_ib_fmr_1m_pool_size;
+ rds_ib_mr_1m_pool_size) : rds_ib_mr_1m_pool_size;
- rds_ibdev->max_8k_fmrs = device->attrs.max_mr ?
+ rds_ibdev->max_8k_mrs = device->attrs.max_mr ?
min_t(unsigned int, ((device->attrs.max_mr / 2) * RDS_MR_8K_SCALE),
- rds_ib_fmr_8k_pool_size) : rds_ib_fmr_8k_pool_size;
+ rds_ib_mr_8k_pool_size) : rds_ib_mr_8k_pool_size;
rds_ibdev->max_initiator_depth = device->attrs.max_qp_init_rd_atom;
rds_ibdev->max_responder_resources = device->attrs.max_qp_rd_atom;
@@ -172,10 +179,14 @@ static void rds_ib_add_one(struct ib_device *device)
goto put_dev;
}
- rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, fmr_max_remaps = %d, max_1m_fmrs = %d, max_8k_fmrs = %d\n",
+ rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, fmr_max_remaps = %d, max_1m_mrs = %d, max_8k_mrs = %d\n",
device->attrs.max_fmr, rds_ibdev->max_wrs, rds_ibdev->max_sge,
- rds_ibdev->fmr_max_remaps, rds_ibdev->max_1m_fmrs,
- rds_ibdev->max_8k_fmrs);
+ rds_ibdev->fmr_max_remaps, rds_ibdev->max_1m_mrs,
+ rds_ibdev->max_8k_mrs);
+
+ pr_info("RDS/IB: %s: %s supported and preferred\n",
+ device->name,
+ rds_ibdev->use_fastreg ? "FRMR" : "FMR");
INIT_LIST_HEAD(&rds_ibdev->ipaddr_list);
INIT_LIST_HEAD(&rds_ibdev->conn_list);
@@ -364,7 +375,7 @@ void rds_ib_exit(void)
rds_ib_sysctl_exit();
rds_ib_recv_exit();
rds_trans_unregister(&rds_ib_transport);
- rds_ib_fmr_exit();
+ rds_ib_mr_exit();
}
struct rds_transport rds_ib_transport = {
@@ -400,13 +411,13 @@ int rds_ib_init(void)
INIT_LIST_HEAD(&rds_ib_devices);
- ret = rds_ib_fmr_init();
+ ret = rds_ib_mr_init();
if (ret)
goto out;
ret = ib_register_client(&rds_ib_client);
if (ret)
- goto out_fmr_exit;
+ goto out_mr_exit;
ret = rds_ib_sysctl_init();
if (ret)
@@ -430,8 +441,8 @@ out_sysctl:
rds_ib_sysctl_exit();
out_ibreg:
rds_ib_unregister_client();
-out_fmr_exit:
- rds_ib_fmr_exit();
+out_mr_exit:
+ rds_ib_mr_exit();
out:
return ret;
}
diff --git a/net/rds/ib.h b/net/rds/ib.h
index b3fdebb57..627fb79ae 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -9,17 +9,12 @@
#include "rds.h"
#include "rdma_transport.h"
-#define RDS_FMR_1M_POOL_SIZE (8192 / 2)
-#define RDS_FMR_1M_MSG_SIZE 256
-#define RDS_FMR_8K_MSG_SIZE 2
-#define RDS_MR_8K_SCALE (256 / (RDS_FMR_8K_MSG_SIZE + 1))
-#define RDS_FMR_8K_POOL_SIZE (RDS_MR_8K_SCALE * (8192 / 2))
-
#define RDS_IB_MAX_SGE 8
#define RDS_IB_RECV_SGE 2
#define RDS_IB_DEFAULT_RECV_WR 1024
#define RDS_IB_DEFAULT_SEND_WR 256
+#define RDS_IB_DEFAULT_FR_WR 512
#define RDS_IB_DEFAULT_RETRY_COUNT 2
@@ -28,7 +23,6 @@
#define RDS_IB_RECYCLE_BATCH_COUNT 32
#define RDS_IB_WC_MAX 32
-#define RDS_IB_SEND_OP BIT_ULL(63)
extern struct rw_semaphore rds_ib_devices_lock;
extern struct list_head rds_ib_devices;
@@ -129,6 +123,9 @@ struct rds_ib_connection {
struct ib_wc i_send_wc[RDS_IB_WC_MAX];
struct ib_wc i_recv_wc[RDS_IB_WC_MAX];
+ /* To control the number of wrs from fastreg */
+ atomic_t i_fastreg_wrs;
+
/* interrupt handling */
struct tasklet_struct i_send_tasklet;
struct tasklet_struct i_recv_tasklet;
@@ -207,12 +204,16 @@ struct rds_ib_device {
struct list_head conn_list;
struct ib_device *dev;
struct ib_pd *pd;
- unsigned int max_fmrs;
+ bool has_fmr;
+ bool has_fr;
+ bool use_fastreg;
+
+ unsigned int max_mrs;
struct rds_ib_mr_pool *mr_1m_pool;
struct rds_ib_mr_pool *mr_8k_pool;
unsigned int fmr_max_remaps;
- unsigned int max_8k_fmrs;
- unsigned int max_1m_fmrs;
+ unsigned int max_8k_mrs;
+ unsigned int max_1m_mrs;
int max_sge;
unsigned int max_wrs;
unsigned int max_initiator_depth;
@@ -266,6 +267,8 @@ struct rds_ib_statistics {
uint64_t s_ib_rdma_mr_1m_pool_flush;
uint64_t s_ib_rdma_mr_1m_pool_wait;
uint64_t s_ib_rdma_mr_1m_pool_depleted;
+ uint64_t s_ib_rdma_mr_8k_reused;
+ uint64_t s_ib_rdma_mr_1m_reused;
uint64_t s_ib_atomic_cswp;
uint64_t s_ib_atomic_fadd;
};
@@ -317,8 +320,6 @@ struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device);
void rds_ib_dev_put(struct rds_ib_device *rds_ibdev);
extern struct ib_client rds_ib_client;
-extern unsigned int rds_ib_fmr_1m_pool_size;
-extern unsigned int rds_ib_fmr_8k_pool_size;
extern unsigned int rds_ib_retry_count;
extern spinlock_t ib_nodev_conns_lock;
@@ -348,17 +349,7 @@ int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr);
void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
void rds_ib_destroy_nodev_conns(void);
-struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev,
- int npages);
-void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo);
-void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
-void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
- struct rds_sock *rs, u32 *key_ret);
-void rds_ib_sync_mr(void *trans_private, int dir);
-void rds_ib_free_mr(void *trans_private, int invalidate);
-void rds_ib_flush_mrs(void);
-int rds_ib_fmr_init(void);
-void rds_ib_fmr_exit(void);
+void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc);
/* ib_recv.c */
int rds_ib_recv_init(void);
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index da5a7fb98..310cabce2 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -194,7 +194,7 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version);
dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version);
dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
- dp->dp_ack_seq = rds_ib_piggyb_ack(ic);
+ dp->dp_ack_seq = cpu_to_be64(rds_ib_piggyb_ack(ic));
/* Advertise flow control */
if (ic->i_flowctl) {
@@ -236,12 +236,10 @@ static void rds_ib_cq_comp_handler_recv(struct ib_cq *cq, void *context)
tasklet_schedule(&ic->i_recv_tasklet);
}
-static void poll_cq(struct rds_ib_connection *ic, struct ib_cq *cq,
- struct ib_wc *wcs,
- struct rds_ib_ack_state *ack_state)
+static void poll_scq(struct rds_ib_connection *ic, struct ib_cq *cq,
+ struct ib_wc *wcs)
{
- int nr;
- int i;
+ int nr, i;
struct ib_wc *wc;
while ((nr = ib_poll_cq(cq, RDS_IB_WC_MAX, wcs)) > 0) {
@@ -251,10 +249,12 @@ static void poll_cq(struct rds_ib_connection *ic, struct ib_cq *cq,
(unsigned long long)wc->wr_id, wc->status,
wc->byte_len, be32_to_cpu(wc->ex.imm_data));
- if (wc->wr_id & RDS_IB_SEND_OP)
+ if (wc->wr_id <= ic->i_send_ring.w_nr ||
+ wc->wr_id == RDS_IB_ACK_WR_ID)
rds_ib_send_cqe_handler(ic, wc);
else
- rds_ib_recv_cqe_handler(ic, wc, ack_state);
+ rds_ib_mr_cqe_handler(ic, wc);
+
}
}
}
@@ -263,14 +263,12 @@ static void rds_ib_tasklet_fn_send(unsigned long data)
{
struct rds_ib_connection *ic = (struct rds_ib_connection *)data;
struct rds_connection *conn = ic->conn;
- struct rds_ib_ack_state state;
rds_ib_stats_inc(s_ib_tasklet_call);
- memset(&state, 0, sizeof(state));
- poll_cq(ic, ic->i_send_cq, ic->i_send_wc, &state);
+ poll_scq(ic, ic->i_send_cq, ic->i_send_wc);
ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP);
- poll_cq(ic, ic->i_send_cq, ic->i_send_wc, &state);
+ poll_scq(ic, ic->i_send_cq, ic->i_send_wc);
if (rds_conn_up(conn) &&
(!test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
@@ -278,6 +276,25 @@ static void rds_ib_tasklet_fn_send(unsigned long data)
rds_send_xmit(ic->conn);
}
+static void poll_rcq(struct rds_ib_connection *ic, struct ib_cq *cq,
+ struct ib_wc *wcs,
+ struct rds_ib_ack_state *ack_state)
+{
+ int nr, i;
+ struct ib_wc *wc;
+
+ while ((nr = ib_poll_cq(cq, RDS_IB_WC_MAX, wcs)) > 0) {
+ for (i = 0; i < nr; i++) {
+ wc = wcs + i;
+ rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
+ (unsigned long long)wc->wr_id, wc->status,
+ wc->byte_len, be32_to_cpu(wc->ex.imm_data));
+
+ rds_ib_recv_cqe_handler(ic, wc, ack_state);
+ }
+ }
+}
+
static void rds_ib_tasklet_fn_recv(unsigned long data)
{
struct rds_ib_connection *ic = (struct rds_ib_connection *)data;
@@ -291,9 +308,9 @@ static void rds_ib_tasklet_fn_recv(unsigned long data)
rds_ib_stats_inc(s_ib_tasklet_call);
memset(&state, 0, sizeof(state));
- poll_cq(ic, ic->i_recv_cq, ic->i_recv_wc, &state);
+ poll_rcq(ic, ic->i_recv_cq, ic->i_recv_wc, &state);
ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
- poll_cq(ic, ic->i_recv_cq, ic->i_recv_wc, &state);
+ poll_rcq(ic, ic->i_recv_cq, ic->i_recv_wc, &state);
if (state.ack_next_valid)
rds_ib_set_ack(ic, state.ack_next, state.ack_required);
@@ -351,7 +368,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
struct ib_qp_init_attr attr;
struct ib_cq_init_attr cq_attr = {};
struct rds_ib_device *rds_ibdev;
- int ret;
+ int ret, fr_queue_space;
/*
* It's normal to see a null device if an incoming connection races
@@ -361,6 +378,12 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
if (!rds_ibdev)
return -EOPNOTSUPP;
+ /* The fr_queue_space is currently set to 512, to add extra space on
+ * completion queue and send queue. This extra space is used for FRMR
+ * registration and invalidation work requests
+ */
+ fr_queue_space = (rds_ibdev->use_fastreg ? RDS_IB_DEFAULT_FR_WR : 0);
+
/* add the conn now so that connection establishment has the dev */
rds_ib_add_conn(rds_ibdev, conn);
@@ -372,7 +395,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
/* Protection domain and memory range */
ic->i_pd = rds_ibdev->pd;
- cq_attr.cqe = ic->i_send_ring.w_nr + 1;
+ cq_attr.cqe = ic->i_send_ring.w_nr + fr_queue_space + 1;
ic->i_send_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_send,
rds_ib_cq_event_handler, conn,
@@ -412,7 +435,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
attr.event_handler = rds_ib_qp_event_handler;
attr.qp_context = conn;
/* + 1 to allow for the single ack message */
- attr.cap.max_send_wr = ic->i_send_ring.w_nr + 1;
+ attr.cap.max_send_wr = ic->i_send_ring.w_nr + fr_queue_space + 1;
attr.cap.max_recv_wr = ic->i_recv_ring.w_nr + 1;
attr.cap.max_send_sge = rds_ibdev->max_sge;
attr.cap.max_recv_sge = RDS_IB_RECV_SGE;
@@ -420,6 +443,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
attr.qp_type = IB_QPT_RC;
attr.send_cq = ic->i_send_cq;
attr.recv_cq = ic->i_recv_cq;
+ atomic_set(&ic->i_fastreg_wrs, RDS_IB_DEFAULT_FR_WR);
/*
* XXX this can fail if max_*_wr is too large? Are we supposed
@@ -739,7 +763,8 @@ void rds_ib_conn_shutdown(struct rds_connection *conn)
*/
wait_event(rds_ib_ring_empty_wait,
rds_ib_ring_empty(&ic->i_recv_ring) &&
- (atomic_read(&ic->i_signaled_sends) == 0));
+ (atomic_read(&ic->i_signaled_sends) == 0) &&
+ (atomic_read(&ic->i_fastreg_wrs) == RDS_IB_DEFAULT_FR_WR));
tasklet_kill(&ic->i_send_tasklet);
tasklet_kill(&ic->i_recv_tasklet);
diff --git a/net/rds/ib_fmr.c b/net/rds/ib_fmr.c
new file mode 100644
index 000000000..4fe8f4fec
--- /dev/null
+++ b/net/rds/ib_fmr.c
@@ -0,0 +1,248 @@
+/*
+ * Copyright (c) 2016 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ib_mr.h"
+
+struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev, int npages)
+{
+ struct rds_ib_mr_pool *pool;
+ struct rds_ib_mr *ibmr = NULL;
+ struct rds_ib_fmr *fmr;
+ int err = 0;
+
+ if (npages <= RDS_MR_8K_MSG_SIZE)
+ pool = rds_ibdev->mr_8k_pool;
+ else
+ pool = rds_ibdev->mr_1m_pool;
+
+ ibmr = rds_ib_try_reuse_ibmr(pool);
+ if (ibmr)
+ return ibmr;
+
+ ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL,
+ rdsibdev_to_node(rds_ibdev));
+ if (!ibmr) {
+ err = -ENOMEM;
+ goto out_no_cigar;
+ }
+
+ fmr = &ibmr->u.fmr;
+ fmr->fmr = ib_alloc_fmr(rds_ibdev->pd,
+ (IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_READ |
+ IB_ACCESS_REMOTE_WRITE |
+ IB_ACCESS_REMOTE_ATOMIC),
+ &pool->fmr_attr);
+ if (IS_ERR(fmr->fmr)) {
+ err = PTR_ERR(fmr->fmr);
+ fmr->fmr = NULL;
+ pr_warn("RDS/IB: %s failed (err=%d)\n", __func__, err);
+ goto out_no_cigar;
+ }
+
+ ibmr->pool = pool;
+ if (pool->pool_type == RDS_IB_MR_8K_POOL)
+ rds_ib_stats_inc(s_ib_rdma_mr_8k_alloc);
+ else
+ rds_ib_stats_inc(s_ib_rdma_mr_1m_alloc);
+
+ return ibmr;
+
+out_no_cigar:
+ if (ibmr) {
+ if (fmr->fmr)
+ ib_dealloc_fmr(fmr->fmr);
+ kfree(ibmr);
+ }
+ atomic_dec(&pool->item_count);
+ return ERR_PTR(err);
+}
+
+int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibmr,
+ struct scatterlist *sg, unsigned int nents)
+{
+ struct ib_device *dev = rds_ibdev->dev;
+ struct rds_ib_fmr *fmr = &ibmr->u.fmr;
+ struct scatterlist *scat = sg;
+ u64 io_addr = 0;
+ u64 *dma_pages;
+ u32 len;
+ int page_cnt, sg_dma_len;
+ int i, j;
+ int ret;
+
+ sg_dma_len = ib_dma_map_sg(dev, sg, nents, DMA_BIDIRECTIONAL);
+ if (unlikely(!sg_dma_len)) {
+ pr_warn("RDS/IB: %s failed!\n", __func__);
+ return -EBUSY;
+ }
+
+ len = 0;
+ page_cnt = 0;
+
+ for (i = 0; i < sg_dma_len; ++i) {
+ unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
+ u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);
+
+ if (dma_addr & ~PAGE_MASK) {
+ if (i > 0)
+ return -EINVAL;
+ else
+ ++page_cnt;
+ }
+ if ((dma_addr + dma_len) & ~PAGE_MASK) {
+ if (i < sg_dma_len - 1)
+ return -EINVAL;
+ else
+ ++page_cnt;
+ }
+
+ len += dma_len;
+ }
+
+ page_cnt += len >> PAGE_SHIFT;
+ if (page_cnt > ibmr->pool->fmr_attr.max_pages)
+ return -EINVAL;
+
+ dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC,
+ rdsibdev_to_node(rds_ibdev));
+ if (!dma_pages)
+ return -ENOMEM;
+
+ page_cnt = 0;
+ for (i = 0; i < sg_dma_len; ++i) {
+ unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
+ u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);
+
+ for (j = 0; j < dma_len; j += PAGE_SIZE)
+ dma_pages[page_cnt++] =
+ (dma_addr & PAGE_MASK) + j;
+ }
+
+ ret = ib_map_phys_fmr(fmr->fmr, dma_pages, page_cnt, io_addr);
+ if (ret)
+ goto out;
+
+ /* Success - we successfully remapped the MR, so we can
+ * safely tear down the old mapping.
+ */
+ rds_ib_teardown_mr(ibmr);
+
+ ibmr->sg = scat;
+ ibmr->sg_len = nents;
+ ibmr->sg_dma_len = sg_dma_len;
+ ibmr->remap_count++;
+
+ if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL)
+ rds_ib_stats_inc(s_ib_rdma_mr_8k_used);
+ else
+ rds_ib_stats_inc(s_ib_rdma_mr_1m_used);
+ ret = 0;
+
+out:
+ kfree(dma_pages);
+
+ return ret;
+}
+
+struct rds_ib_mr *rds_ib_reg_fmr(struct rds_ib_device *rds_ibdev,
+ struct scatterlist *sg,
+ unsigned long nents,
+ u32 *key)
+{
+ struct rds_ib_mr *ibmr = NULL;
+ struct rds_ib_fmr *fmr;
+ int ret;
+
+ ibmr = rds_ib_alloc_fmr(rds_ibdev, nents);
+ if (IS_ERR(ibmr))
+ return ibmr;
+
+ ibmr->device = rds_ibdev;
+ fmr = &ibmr->u.fmr;
+ ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents);
+ if (ret == 0)
+ *key = fmr->fmr->rkey;
+ else
+ rds_ib_free_mr(ibmr, 0);
+
+ return ibmr;
+}
+
+void rds_ib_unreg_fmr(struct list_head *list, unsigned int *nfreed,
+ unsigned long *unpinned, unsigned int goal)
+{
+ struct rds_ib_mr *ibmr, *next;
+ struct rds_ib_fmr *fmr;
+ LIST_HEAD(fmr_list);
+ int ret = 0;
+ unsigned int freed = *nfreed;
+
+ /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */
+ list_for_each_entry(ibmr, list, unmap_list) {
+ fmr = &ibmr->u.fmr;
+ list_add(&fmr->fmr->list, &fmr_list);
+ }
+
+ ret = ib_unmap_fmr(&fmr_list);
+ if (ret)
+ pr_warn("RDS/IB: FMR invalidation failed (err=%d)\n", ret);
+
+ /* Now we can destroy the DMA mapping and unpin any pages */
+ list_for_each_entry_safe(ibmr, next, list, unmap_list) {
+ fmr = &ibmr->u.fmr;
+ *unpinned += ibmr->sg_len;
+ __rds_ib_teardown_mr(ibmr);
+ if (freed < goal ||
+ ibmr->remap_count >= ibmr->pool->fmr_attr.max_maps) {
+ if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL)
+ rds_ib_stats_inc(s_ib_rdma_mr_8k_free);
+ else
+ rds_ib_stats_inc(s_ib_rdma_mr_1m_free);
+ list_del(&ibmr->unmap_list);
+ ib_dealloc_fmr(fmr->fmr);
+ kfree(ibmr);
+ freed++;
+ }
+ }
+ *nfreed = freed;
+}
+
+void rds_ib_free_fmr_list(struct rds_ib_mr *ibmr)
+{
+ struct rds_ib_mr_pool *pool = ibmr->pool;
+
+ if (ibmr->remap_count >= pool->fmr_attr.max_maps)
+ llist_add(&ibmr->llnode, &pool->drop_list);
+ else
+ llist_add(&ibmr->llnode, &pool->free_list);
+}
diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c
new file mode 100644
index 000000000..93ff038ea
--- /dev/null
+++ b/net/rds/ib_frmr.c
@@ -0,0 +1,376 @@
+/*
+ * Copyright (c) 2016 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ib_mr.h"
+
+static struct rds_ib_mr *rds_ib_alloc_frmr(struct rds_ib_device *rds_ibdev,
+ int npages)
+{
+ struct rds_ib_mr_pool *pool;
+ struct rds_ib_mr *ibmr = NULL;
+ struct rds_ib_frmr *frmr;
+ int err = 0;
+
+ if (npages <= RDS_MR_8K_MSG_SIZE)
+ pool = rds_ibdev->mr_8k_pool;
+ else
+ pool = rds_ibdev->mr_1m_pool;
+
+ ibmr = rds_ib_try_reuse_ibmr(pool);
+ if (ibmr)
+ return ibmr;
+
+ ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL,
+ rdsibdev_to_node(rds_ibdev));
+ if (!ibmr) {
+ err = -ENOMEM;
+ goto out_no_cigar;
+ }
+
+ frmr = &ibmr->u.frmr;
+ frmr->mr = ib_alloc_mr(rds_ibdev->pd, IB_MR_TYPE_MEM_REG,
+ pool->fmr_attr.max_pages);
+ if (IS_ERR(frmr->mr)) {
+ pr_warn("RDS/IB: %s failed to allocate MR", __func__);
+ goto out_no_cigar;
+ }
+
+ ibmr->pool = pool;
+ if (pool->pool_type == RDS_IB_MR_8K_POOL)
+ rds_ib_stats_inc(s_ib_rdma_mr_8k_alloc);
+ else
+ rds_ib_stats_inc(s_ib_rdma_mr_1m_alloc);
+
+ if (atomic_read(&pool->item_count) > pool->max_items_soft)
+ pool->max_items_soft = pool->max_items;
+
+ frmr->fr_state = FRMR_IS_FREE;
+ return ibmr;
+
+out_no_cigar:
+ kfree(ibmr);
+ atomic_dec(&pool->item_count);
+ return ERR_PTR(err);
+}
+
+static void rds_ib_free_frmr(struct rds_ib_mr *ibmr, bool drop)
+{
+ struct rds_ib_mr_pool *pool = ibmr->pool;
+
+ if (drop)
+ llist_add(&ibmr->llnode, &pool->drop_list);
+ else
+ llist_add(&ibmr->llnode, &pool->free_list);
+ atomic_add(ibmr->sg_len, &pool->free_pinned);
+ atomic_inc(&pool->dirty_count);
+
+ /* If we've pinned too many pages, request a flush */
+ if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned ||
+ atomic_read(&pool->dirty_count) >= pool->max_items / 5)
+ queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10);
+}
+
+static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr)
+{
+ struct rds_ib_frmr *frmr = &ibmr->u.frmr;
+ struct ib_send_wr *failed_wr;
+ struct ib_reg_wr reg_wr;
+ int ret;
+
+ while (atomic_dec_return(&ibmr->ic->i_fastreg_wrs) <= 0) {
+ atomic_inc(&ibmr->ic->i_fastreg_wrs);
+ cpu_relax();
+ }
+
+ ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_len, PAGE_SIZE);
+ if (unlikely(ret != ibmr->sg_len))
+ return ret < 0 ? ret : -EINVAL;
+
+ /* Perform a WR for the fast_reg_mr. Each individual page
+ * in the sg list is added to the fast reg page list and placed
+ * inside the fast_reg_mr WR. The key used is a rolling 8bit
+ * counter, which should guarantee uniqueness.
+ */
+ ib_update_fast_reg_key(frmr->mr, ibmr->remap_count++);
+ frmr->fr_state = FRMR_IS_INUSE;
+
+ memset(&reg_wr, 0, sizeof(reg_wr));
+ reg_wr.wr.wr_id = (unsigned long)(void *)ibmr;
+ reg_wr.wr.opcode = IB_WR_REG_MR;
+ reg_wr.wr.num_sge = 0;
+ reg_wr.mr = frmr->mr;
+ reg_wr.key = frmr->mr->rkey;
+ reg_wr.access = IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_READ |
+ IB_ACCESS_REMOTE_WRITE;
+ reg_wr.wr.send_flags = IB_SEND_SIGNALED;
+
+ failed_wr = &reg_wr.wr;
+ ret = ib_post_send(ibmr->ic->i_cm_id->qp, &reg_wr.wr, &failed_wr);
+ WARN_ON(failed_wr != &reg_wr.wr);
+ if (unlikely(ret)) {
+ /* Failure here can be because of -ENOMEM as well */
+ frmr->fr_state = FRMR_IS_STALE;
+ atomic_inc(&ibmr->ic->i_fastreg_wrs);
+ if (printk_ratelimit())
+ pr_warn("RDS/IB: %s returned error(%d)\n",
+ __func__, ret);
+ }
+ return ret;
+}
+
+static int rds_ib_map_frmr(struct rds_ib_device *rds_ibdev,
+ struct rds_ib_mr_pool *pool,
+ struct rds_ib_mr *ibmr,
+ struct scatterlist *sg, unsigned int sg_len)
+{
+ struct ib_device *dev = rds_ibdev->dev;
+ struct rds_ib_frmr *frmr = &ibmr->u.frmr;
+ int i;
+ u32 len;
+ int ret = 0;
+
+ /* We want to teardown old ibmr values here and fill it up with
+ * new sg values
+ */
+ rds_ib_teardown_mr(ibmr);
+
+ ibmr->sg = sg;
+ ibmr->sg_len = sg_len;
+ ibmr->sg_dma_len = 0;
+ frmr->sg_byte_len = 0;
+ WARN_ON(ibmr->sg_dma_len);
+ ibmr->sg_dma_len = ib_dma_map_sg(dev, ibmr->sg, ibmr->sg_len,
+ DMA_BIDIRECTIONAL);
+ if (unlikely(!ibmr->sg_dma_len)) {
+ pr_warn("RDS/IB: %s failed!\n", __func__);
+ return -EBUSY;
+ }
+
+ frmr->sg_byte_len = 0;
+ frmr->dma_npages = 0;
+ len = 0;
+
+ ret = -EINVAL;
+ for (i = 0; i < ibmr->sg_dma_len; ++i) {
+ unsigned int dma_len = ib_sg_dma_len(dev, &ibmr->sg[i]);
+ u64 dma_addr = ib_sg_dma_address(dev, &ibmr->sg[i]);
+
+ frmr->sg_byte_len += dma_len;
+ if (dma_addr & ~PAGE_MASK) {
+ if (i > 0)
+ goto out_unmap;
+ else
+ ++frmr->dma_npages;
+ }
+
+ if ((dma_addr + dma_len) & ~PAGE_MASK) {
+ if (i < ibmr->sg_dma_len - 1)
+ goto out_unmap;
+ else
+ ++frmr->dma_npages;
+ }
+
+ len += dma_len;
+ }
+ frmr->dma_npages += len >> PAGE_SHIFT;
+
+ if (frmr->dma_npages > ibmr->pool->fmr_attr.max_pages) {
+ ret = -EMSGSIZE;
+ goto out_unmap;
+ }
+
+ ret = rds_ib_post_reg_frmr(ibmr);
+ if (ret)
+ goto out_unmap;
+
+ if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL)
+ rds_ib_stats_inc(s_ib_rdma_mr_8k_used);
+ else
+ rds_ib_stats_inc(s_ib_rdma_mr_1m_used);
+
+ return ret;
+
+out_unmap:
+ ib_dma_unmap_sg(rds_ibdev->dev, ibmr->sg, ibmr->sg_len,
+ DMA_BIDIRECTIONAL);
+ ibmr->sg_dma_len = 0;
+ return ret;
+}
+
+static int rds_ib_post_inv(struct rds_ib_mr *ibmr)
+{
+ struct ib_send_wr *s_wr, *failed_wr;
+ struct rds_ib_frmr *frmr = &ibmr->u.frmr;
+ struct rdma_cm_id *i_cm_id = ibmr->ic->i_cm_id;
+ int ret = -EINVAL;
+
+ if (!i_cm_id || !i_cm_id->qp || !frmr->mr)
+ goto out;
+
+ if (frmr->fr_state != FRMR_IS_INUSE)
+ goto out;
+
+ while (atomic_dec_return(&ibmr->ic->i_fastreg_wrs) <= 0) {
+ atomic_inc(&ibmr->ic->i_fastreg_wrs);
+ cpu_relax();
+ }
+
+ frmr->fr_inv = true;
+ s_wr = &frmr->fr_wr;
+
+ memset(s_wr, 0, sizeof(*s_wr));
+ s_wr->wr_id = (unsigned long)(void *)ibmr;
+ s_wr->opcode = IB_WR_LOCAL_INV;
+ s_wr->ex.invalidate_rkey = frmr->mr->rkey;
+ s_wr->send_flags = IB_SEND_SIGNALED;
+
+ failed_wr = s_wr;
+ ret = ib_post_send(i_cm_id->qp, s_wr, &failed_wr);
+ WARN_ON(failed_wr != s_wr);
+ if (unlikely(ret)) {
+ frmr->fr_state = FRMR_IS_STALE;
+ frmr->fr_inv = false;
+ atomic_inc(&ibmr->ic->i_fastreg_wrs);
+ pr_err("RDS/IB: %s returned error(%d)\n", __func__, ret);
+ goto out;
+ }
+out:
+ return ret;
+}
+
+void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
+{
+ struct rds_ib_mr *ibmr = (void *)(unsigned long)wc->wr_id;
+ struct rds_ib_frmr *frmr = &ibmr->u.frmr;
+
+ if (wc->status != IB_WC_SUCCESS) {
+ frmr->fr_state = FRMR_IS_STALE;
+ if (rds_conn_up(ic->conn))
+ rds_ib_conn_error(ic->conn,
+ "frmr completion <%pI4,%pI4> status %u(%s), vendor_err 0x%x, disconnecting and reconnecting\n",
+ &ic->conn->c_laddr,
+ &ic->conn->c_faddr,
+ wc->status,
+ ib_wc_status_msg(wc->status),
+ wc->vendor_err);
+ }
+
+ if (frmr->fr_inv) {
+ frmr->fr_state = FRMR_IS_FREE;
+ frmr->fr_inv = false;
+ }
+
+ atomic_inc(&ic->i_fastreg_wrs);
+}
+
+void rds_ib_unreg_frmr(struct list_head *list, unsigned int *nfreed,
+ unsigned long *unpinned, unsigned int goal)
+{
+ struct rds_ib_mr *ibmr, *next;
+ struct rds_ib_frmr *frmr;
+ int ret = 0;
+ unsigned int freed = *nfreed;
+
+ /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */
+ list_for_each_entry(ibmr, list, unmap_list) {
+ if (ibmr->sg_dma_len)
+ ret |= rds_ib_post_inv(ibmr);
+ }
+ if (ret)
+ pr_warn("RDS/IB: %s failed (err=%d)\n", __func__, ret);
+
+ /* Now we can destroy the DMA mapping and unpin any pages */
+ list_for_each_entry_safe(ibmr, next, list, unmap_list) {
+ *unpinned += ibmr->sg_len;
+ frmr = &ibmr->u.frmr;
+ __rds_ib_teardown_mr(ibmr);
+ if (freed < goal || frmr->fr_state == FRMR_IS_STALE) {
+ /* Don't de-allocate if the MR is not free yet */
+ if (frmr->fr_state == FRMR_IS_INUSE)
+ continue;
+
+ if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL)
+ rds_ib_stats_inc(s_ib_rdma_mr_8k_free);
+ else
+ rds_ib_stats_inc(s_ib_rdma_mr_1m_free);
+ list_del(&ibmr->unmap_list);
+ if (frmr->mr)
+ ib_dereg_mr(frmr->mr);
+ kfree(ibmr);
+ freed++;
+ }
+ }
+ *nfreed = freed;
+}
+
+struct rds_ib_mr *rds_ib_reg_frmr(struct rds_ib_device *rds_ibdev,
+ struct rds_ib_connection *ic,
+ struct scatterlist *sg,
+ unsigned long nents, u32 *key)
+{
+ struct rds_ib_mr *ibmr = NULL;
+ struct rds_ib_frmr *frmr;
+ int ret;
+
+ do {
+ if (ibmr)
+ rds_ib_free_frmr(ibmr, true);
+ ibmr = rds_ib_alloc_frmr(rds_ibdev, nents);
+ if (IS_ERR(ibmr))
+ return ibmr;
+ frmr = &ibmr->u.frmr;
+ } while (frmr->fr_state != FRMR_IS_FREE);
+
+ ibmr->ic = ic;
+ ibmr->device = rds_ibdev;
+ ret = rds_ib_map_frmr(rds_ibdev, ibmr->pool, ibmr, sg, nents);
+ if (ret == 0) {
+ *key = frmr->mr->rkey;
+ } else {
+ rds_ib_free_frmr(ibmr, false);
+ ibmr = ERR_PTR(ret);
+ }
+
+ return ibmr;
+}
+
+void rds_ib_free_frmr_list(struct rds_ib_mr *ibmr)
+{
+ struct rds_ib_mr_pool *pool = ibmr->pool;
+ struct rds_ib_frmr *frmr = &ibmr->u.frmr;
+
+ if (frmr->fr_state == FRMR_IS_STALE)
+ llist_add(&ibmr->llnode, &pool->drop_list);
+ else
+ llist_add(&ibmr->llnode, &pool->free_list);
+}
diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h
new file mode 100644
index 000000000..1c754f4ac
--- /dev/null
+++ b/net/rds/ib_mr.h
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2016 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _RDS_IB_MR_H
+#define _RDS_IB_MR_H
+
+#include <linux/kernel.h>
+
+#include "rds.h"
+#include "ib.h"
+
+#define RDS_MR_1M_POOL_SIZE (8192 / 2)
+#define RDS_MR_1M_MSG_SIZE 256
+#define RDS_MR_8K_MSG_SIZE 2
+#define RDS_MR_8K_SCALE (256 / (RDS_MR_8K_MSG_SIZE + 1))
+#define RDS_MR_8K_POOL_SIZE (RDS_MR_8K_SCALE * (8192 / 2))
+
+struct rds_ib_fmr {
+ struct ib_fmr *fmr;
+ u64 *dma;
+};
+
+enum rds_ib_fr_state {
+ FRMR_IS_FREE, /* mr invalidated & ready for use */
+ FRMR_IS_INUSE, /* mr is in use or used & can be invalidated */
+ FRMR_IS_STALE, /* Stale MR and needs to be dropped */
+};
+
+struct rds_ib_frmr {
+ struct ib_mr *mr;
+ enum rds_ib_fr_state fr_state;
+ bool fr_inv;
+ struct ib_send_wr fr_wr;
+ unsigned int dma_npages;
+ unsigned int sg_byte_len;
+};
+
+/* This is stored as mr->r_trans_private. */
+struct rds_ib_mr {
+ struct rds_ib_device *device;
+ struct rds_ib_mr_pool *pool;
+ struct rds_ib_connection *ic;
+
+ struct llist_node llnode;
+
+ /* unmap_list is for freeing */
+ struct list_head unmap_list;
+ unsigned int remap_count;
+
+ struct scatterlist *sg;
+ unsigned int sg_len;
+ int sg_dma_len;
+
+ union {
+ struct rds_ib_fmr fmr;
+ struct rds_ib_frmr frmr;
+ } u;
+};
+
+/* Our own little MR pool */
+struct rds_ib_mr_pool {
+ unsigned int pool_type;
+ struct mutex flush_lock; /* serialize fmr invalidate */
+ struct delayed_work flush_worker; /* flush worker */
+
+ atomic_t item_count; /* total # of MRs */
+ atomic_t dirty_count; /* # dirty of MRs */
+
+ struct llist_head drop_list; /* MRs not reached max_maps */
+ struct llist_head free_list; /* unused MRs */
+ struct llist_head clean_list; /* unused & unmapped MRs */
+ wait_queue_head_t flush_wait;
+
+ atomic_t free_pinned; /* memory pinned by free MRs */
+ unsigned long max_items;
+ unsigned long max_items_soft;
+ unsigned long max_free_pinned;
+ struct ib_fmr_attr fmr_attr;
+ bool use_fastreg;
+};
+
+extern struct workqueue_struct *rds_ib_mr_wq;
+extern unsigned int rds_ib_mr_1m_pool_size;
+extern unsigned int rds_ib_mr_8k_pool_size;
+extern bool prefer_frmr;
+
+struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev,
+ int npages);
+void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev,
+ struct rds_info_rdma_connection *iinfo);
+void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
+void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
+ struct rds_sock *rs, u32 *key_ret);
+void rds_ib_sync_mr(void *trans_private, int dir);
+void rds_ib_free_mr(void *trans_private, int invalidate);
+void rds_ib_flush_mrs(void);
+int rds_ib_mr_init(void);
+void rds_ib_mr_exit(void);
+
+void __rds_ib_teardown_mr(struct rds_ib_mr *);
+void rds_ib_teardown_mr(struct rds_ib_mr *);
+struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *, int);
+int rds_ib_map_fmr(struct rds_ib_device *, struct rds_ib_mr *,
+ struct scatterlist *, unsigned int);
+struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *);
+int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *, int, struct rds_ib_mr **);
+struct rds_ib_mr *rds_ib_reg_fmr(struct rds_ib_device *, struct scatterlist *,
+ unsigned long, u32 *);
+struct rds_ib_mr *rds_ib_try_reuse_ibmr(struct rds_ib_mr_pool *);
+void rds_ib_unreg_fmr(struct list_head *, unsigned int *,
+ unsigned long *, unsigned int);
+void rds_ib_free_fmr_list(struct rds_ib_mr *);
+struct rds_ib_mr *rds_ib_reg_frmr(struct rds_ib_device *rds_ibdev,
+ struct rds_ib_connection *ic,
+ struct scatterlist *sg,
+ unsigned long nents, u32 *key);
+void rds_ib_unreg_frmr(struct list_head *list, unsigned int *nfreed,
+ unsigned long *unpinned, unsigned int goal);
+void rds_ib_free_frmr_list(struct rds_ib_mr *);
+#endif
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index a2340748e..f7164ac1f 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -35,78 +35,13 @@
#include <linux/rculist.h>
#include <linux/llist.h>
-#include "rds.h"
-#include "ib.h"
+#include "ib_mr.h"
+
+struct workqueue_struct *rds_ib_mr_wq;
static DEFINE_PER_CPU(unsigned long, clean_list_grace);
#define CLEAN_LIST_BUSY_BIT 0
-/*
- * This is stored as mr->r_trans_private.
- */
-struct rds_ib_mr {
- struct rds_ib_device *device;
- struct rds_ib_mr_pool *pool;
- struct ib_fmr *fmr;
-
- struct llist_node llnode;
-
- /* unmap_list is for freeing */
- struct list_head unmap_list;
- unsigned int remap_count;
-
- struct scatterlist *sg;
- unsigned int sg_len;
- u64 *dma;
- int sg_dma_len;
-};
-
-/*
- * Our own little FMR pool
- */
-struct rds_ib_mr_pool {
- unsigned int pool_type;
- struct mutex flush_lock; /* serialize fmr invalidate */
- struct delayed_work flush_worker; /* flush worker */
-
- atomic_t item_count; /* total # of MRs */
- atomic_t dirty_count; /* # dirty of MRs */
-
- struct llist_head drop_list; /* MRs that have reached their max_maps limit */
- struct llist_head free_list; /* unused MRs */
- struct llist_head clean_list; /* global unused & unamapped MRs */
- wait_queue_head_t flush_wait;
-
- atomic_t free_pinned; /* memory pinned by free MRs */
- unsigned long max_items;
- unsigned long max_items_soft;
- unsigned long max_free_pinned;
- struct ib_fmr_attr fmr_attr;
-};
-
-static struct workqueue_struct *rds_ib_fmr_wq;
-
-int rds_ib_fmr_init(void)
-{
- rds_ib_fmr_wq = create_workqueue("rds_fmr_flushd");
- if (!rds_ib_fmr_wq)
- return -ENOMEM;
- return 0;
-}
-
-/* By the time this is called all the IB devices should have been torn down and
- * had their pools freed. As each pool is freed its work struct is waited on,
- * so the pool flushing work queue should be idle by the time we get here.
- */
-void rds_ib_fmr_exit(void)
-{
- destroy_workqueue(rds_ib_fmr_wq);
-}
-
-static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all, struct rds_ib_mr **);
-static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr);
-static void rds_ib_mr_pool_flush_worker(struct work_struct *work);
-
static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr)
{
struct rds_ib_device *rds_ibdev;
@@ -235,41 +170,6 @@ void rds_ib_destroy_nodev_conns(void)
rds_conn_destroy(ic->conn);
}
-struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev,
- int pool_type)
-{
- struct rds_ib_mr_pool *pool;
-
- pool = kzalloc(sizeof(*pool), GFP_KERNEL);
- if (!pool)
- return ERR_PTR(-ENOMEM);
-
- pool->pool_type = pool_type;
- init_llist_head(&pool->free_list);
- init_llist_head(&pool->drop_list);
- init_llist_head(&pool->clean_list);
- mutex_init(&pool->flush_lock);
- init_waitqueue_head(&pool->flush_wait);
- INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
-
- if (pool_type == RDS_IB_MR_1M_POOL) {
- /* +1 allows for unaligned MRs */
- pool->fmr_attr.max_pages = RDS_FMR_1M_MSG_SIZE + 1;
- pool->max_items = RDS_FMR_1M_POOL_SIZE;
- } else {
- /* pool_type == RDS_IB_MR_8K_POOL */
- pool->fmr_attr.max_pages = RDS_FMR_8K_MSG_SIZE + 1;
- pool->max_items = RDS_FMR_8K_POOL_SIZE;
- }
-
- pool->max_free_pinned = pool->max_items * pool->fmr_attr.max_pages / 4;
- pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps;
- pool->fmr_attr.page_shift = PAGE_SHIFT;
- pool->max_items_soft = rds_ibdev->max_fmrs * 3 / 4;
-
- return pool;
-}
-
void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo)
{
struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool;
@@ -278,16 +178,7 @@ void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_co
iinfo->rdma_mr_size = pool_1m->fmr_attr.max_pages;
}
-void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool)
-{
- cancel_delayed_work_sync(&pool->flush_worker);
- rds_ib_flush_mr_pool(pool, 1, NULL);
- WARN_ON(atomic_read(&pool->item_count));
- WARN_ON(atomic_read(&pool->free_pinned));
- kfree(pool);
-}
-
-static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool)
+struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool)
{
struct rds_ib_mr *ibmr = NULL;
struct llist_node *ret;
@@ -297,8 +188,13 @@ static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool)
flag = this_cpu_ptr(&clean_list_grace);
set_bit(CLEAN_LIST_BUSY_BIT, flag);
ret = llist_del_first(&pool->clean_list);
- if (ret)
+ if (ret) {
ibmr = llist_entry(ret, struct rds_ib_mr, llnode);
+ if (pool->pool_type == RDS_IB_MR_8K_POOL)
+ rds_ib_stats_inc(s_ib_rdma_mr_8k_reused);
+ else
+ rds_ib_stats_inc(s_ib_rdma_mr_1m_reused);
+ }
clear_bit(CLEAN_LIST_BUSY_BIT, flag);
preempt_enable();
@@ -317,190 +213,6 @@ static inline void wait_clean_list_grace(void)
}
}
-static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev,
- int npages)
-{
- struct rds_ib_mr_pool *pool;
- struct rds_ib_mr *ibmr = NULL;
- int err = 0, iter = 0;
-
- if (npages <= RDS_FMR_8K_MSG_SIZE)
- pool = rds_ibdev->mr_8k_pool;
- else
- pool = rds_ibdev->mr_1m_pool;
-
- if (atomic_read(&pool->dirty_count) >= pool->max_items / 10)
- queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10);
-
- /* Switch pools if one of the pool is reaching upper limit */
- if (atomic_read(&pool->dirty_count) >= pool->max_items * 9 / 10) {
- if (pool->pool_type == RDS_IB_MR_8K_POOL)
- pool = rds_ibdev->mr_1m_pool;
- else
- pool = rds_ibdev->mr_8k_pool;
- }
-
- while (1) {
- ibmr = rds_ib_reuse_fmr(pool);
- if (ibmr)
- return ibmr;
-
- /* No clean MRs - now we have the choice of either
- * allocating a fresh MR up to the limit imposed by the
- * driver, or flush any dirty unused MRs.
- * We try to avoid stalling in the send path if possible,
- * so we allocate as long as we're allowed to.
- *
- * We're fussy with enforcing the FMR limit, though. If the driver
- * tells us we can't use more than N fmrs, we shouldn't start
- * arguing with it */
- if (atomic_inc_return(&pool->item_count) <= pool->max_items)
- break;
-
- atomic_dec(&pool->item_count);
-
- if (++iter > 2) {
- if (pool->pool_type == RDS_IB_MR_8K_POOL)
- rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_depleted);
- else
- rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_depleted);
- return ERR_PTR(-EAGAIN);
- }
-
- /* We do have some empty MRs. Flush them out. */
- if (pool->pool_type == RDS_IB_MR_8K_POOL)
- rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_wait);
- else
- rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_wait);
- rds_ib_flush_mr_pool(pool, 0, &ibmr);
- if (ibmr)
- return ibmr;
- }
-
- ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL, rdsibdev_to_node(rds_ibdev));
- if (!ibmr) {
- err = -ENOMEM;
- goto out_no_cigar;
- }
-
- ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd,
- (IB_ACCESS_LOCAL_WRITE |
- IB_ACCESS_REMOTE_READ |
- IB_ACCESS_REMOTE_WRITE|
- IB_ACCESS_REMOTE_ATOMIC),
- &pool->fmr_attr);
- if (IS_ERR(ibmr->fmr)) {
- err = PTR_ERR(ibmr->fmr);
- ibmr->fmr = NULL;
- printk(KERN_WARNING "RDS/IB: ib_alloc_fmr failed (err=%d)\n", err);
- goto out_no_cigar;
- }
-
- ibmr->pool = pool;
- if (pool->pool_type == RDS_IB_MR_8K_POOL)
- rds_ib_stats_inc(s_ib_rdma_mr_8k_alloc);
- else
- rds_ib_stats_inc(s_ib_rdma_mr_1m_alloc);
-
- return ibmr;
-
-out_no_cigar:
- if (ibmr) {
- if (ibmr->fmr)
- ib_dealloc_fmr(ibmr->fmr);
- kfree(ibmr);
- }
- atomic_dec(&pool->item_count);
- return ERR_PTR(err);
-}
-
-static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibmr,
- struct scatterlist *sg, unsigned int nents)
-{
- struct ib_device *dev = rds_ibdev->dev;
- struct scatterlist *scat = sg;
- u64 io_addr = 0;
- u64 *dma_pages;
- u32 len;
- int page_cnt, sg_dma_len;
- int i, j;
- int ret;
-
- sg_dma_len = ib_dma_map_sg(dev, sg, nents,
- DMA_BIDIRECTIONAL);
- if (unlikely(!sg_dma_len)) {
- printk(KERN_WARNING "RDS/IB: dma_map_sg failed!\n");
- return -EBUSY;
- }
-
- len = 0;
- page_cnt = 0;
-
- for (i = 0; i < sg_dma_len; ++i) {
- unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
- u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);
-
- if (dma_addr & ~PAGE_MASK) {
- if (i > 0)
- return -EINVAL;
- else
- ++page_cnt;
- }
- if ((dma_addr + dma_len) & ~PAGE_MASK) {
- if (i < sg_dma_len - 1)
- return -EINVAL;
- else
- ++page_cnt;
- }
-
- len += dma_len;
- }
-
- page_cnt += len >> PAGE_SHIFT;
- if (page_cnt > ibmr->pool->fmr_attr.max_pages)
- return -EINVAL;
-
- dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC,
- rdsibdev_to_node(rds_ibdev));
- if (!dma_pages)
- return -ENOMEM;
-
- page_cnt = 0;
- for (i = 0; i < sg_dma_len; ++i) {
- unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
- u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);
-
- for (j = 0; j < dma_len; j += PAGE_SIZE)
- dma_pages[page_cnt++] =
- (dma_addr & PAGE_MASK) + j;
- }
-
- ret = ib_map_phys_fmr(ibmr->fmr,
- dma_pages, page_cnt, io_addr);
- if (ret)
- goto out;
-
- /* Success - we successfully remapped the MR, so we can
- * safely tear down the old mapping. */
- rds_ib_teardown_mr(ibmr);
-
- ibmr->sg = scat;
- ibmr->sg_len = nents;
- ibmr->sg_dma_len = sg_dma_len;
- ibmr->remap_count++;
-
- if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL)
- rds_ib_stats_inc(s_ib_rdma_mr_8k_used);
- else
- rds_ib_stats_inc(s_ib_rdma_mr_1m_used);
- ret = 0;
-
-out:
- kfree(dma_pages);
-
- return ret;
-}
-
void rds_ib_sync_mr(void *trans_private, int direction)
{
struct rds_ib_mr *ibmr = trans_private;
@@ -518,7 +230,7 @@ void rds_ib_sync_mr(void *trans_private, int direction)
}
}
-static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
+void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
{
struct rds_ib_device *rds_ibdev = ibmr->device;
@@ -549,7 +261,7 @@ static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
}
}
-static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
+void rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
{
unsigned int pinned = ibmr->sg_len;
@@ -623,17 +335,15 @@ static void list_to_llist_nodes(struct rds_ib_mr_pool *pool,
* If the number of MRs allocated exceeds the limit, we also try
* to free as many MRs as needed to get back to this limit.
*/
-static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
- int free_all, struct rds_ib_mr **ibmr_ret)
+int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
+ int free_all, struct rds_ib_mr **ibmr_ret)
{
- struct rds_ib_mr *ibmr, *next;
+ struct rds_ib_mr *ibmr;
struct llist_node *clean_nodes;
struct llist_node *clean_tail;
LIST_HEAD(unmap_list);
- LIST_HEAD(fmr_list);
unsigned long unpinned = 0;
unsigned int nfreed = 0, dirty_to_clean = 0, free_goal;
- int ret = 0;
if (pool->pool_type == RDS_IB_MR_8K_POOL)
rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_flush);
@@ -643,7 +353,7 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
if (ibmr_ret) {
DEFINE_WAIT(wait);
while (!mutex_trylock(&pool->flush_lock)) {
- ibmr = rds_ib_reuse_fmr(pool);
+ ibmr = rds_ib_reuse_mr(pool);
if (ibmr) {
*ibmr_ret = ibmr;
finish_wait(&pool->flush_wait, &wait);
@@ -655,7 +365,7 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
if (llist_empty(&pool->clean_list))
schedule();
- ibmr = rds_ib_reuse_fmr(pool);
+ ibmr = rds_ib_reuse_mr(pool);
if (ibmr) {
*ibmr_ret = ibmr;
finish_wait(&pool->flush_wait, &wait);
@@ -667,7 +377,7 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
mutex_lock(&pool->flush_lock);
if (ibmr_ret) {
- ibmr = rds_ib_reuse_fmr(pool);
+ ibmr = rds_ib_reuse_mr(pool);
if (ibmr) {
*ibmr_ret = ibmr;
goto out;
@@ -687,30 +397,10 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
if (list_empty(&unmap_list))
goto out;
- /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */
- list_for_each_entry(ibmr, &unmap_list, unmap_list)
- list_add(&ibmr->fmr->list, &fmr_list);
-
- ret = ib_unmap_fmr(&fmr_list);
- if (ret)
- printk(KERN_WARNING "RDS/IB: ib_unmap_fmr failed (err=%d)\n", ret);
-
- /* Now we can destroy the DMA mapping and unpin any pages */
- list_for_each_entry_safe(ibmr, next, &unmap_list, unmap_list) {
- unpinned += ibmr->sg_len;
- __rds_ib_teardown_mr(ibmr);
- if (nfreed < free_goal ||
- ibmr->remap_count >= pool->fmr_attr.max_maps) {
- if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL)
- rds_ib_stats_inc(s_ib_rdma_mr_8k_free);
- else
- rds_ib_stats_inc(s_ib_rdma_mr_1m_free);
- list_del(&ibmr->unmap_list);
- ib_dealloc_fmr(ibmr->fmr);
- kfree(ibmr);
- nfreed++;
- }
- }
+ if (pool->use_fastreg)
+ rds_ib_unreg_frmr(&unmap_list, &nfreed, &unpinned, free_goal);
+ else
+ rds_ib_unreg_fmr(&unmap_list, &nfreed, &unpinned, free_goal);
if (!list_empty(&unmap_list)) {
/* we have to make sure that none of the things we're about
@@ -743,7 +433,47 @@ out:
if (waitqueue_active(&pool->flush_wait))
wake_up(&pool->flush_wait);
out_nolock:
- return ret;
+ return 0;
+}
+
+struct rds_ib_mr *rds_ib_try_reuse_ibmr(struct rds_ib_mr_pool *pool)
+{
+ struct rds_ib_mr *ibmr = NULL;
+ int iter = 0;
+
+ if (atomic_read(&pool->dirty_count) >= pool->max_items_soft / 10)
+ queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10);
+
+ while (1) {
+ ibmr = rds_ib_reuse_mr(pool);
+ if (ibmr)
+ return ibmr;
+
+ if (atomic_inc_return(&pool->item_count) <= pool->max_items)
+ break;
+
+ atomic_dec(&pool->item_count);
+
+ if (++iter > 2) {
+ if (pool->pool_type == RDS_IB_MR_8K_POOL)
+ rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_depleted);
+ else
+ rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_depleted);
+ return ERR_PTR(-EAGAIN);
+ }
+
+ /* We do have some empty MRs. Flush them out. */
+ if (pool->pool_type == RDS_IB_MR_8K_POOL)
+ rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_wait);
+ else
+ rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_wait);
+
+ rds_ib_flush_mr_pool(pool, 0, &ibmr);
+ if (ibmr)
+ return ibmr;
+ }
+
+ return ibmr;
}
static void rds_ib_mr_pool_flush_worker(struct work_struct *work)
@@ -762,10 +492,10 @@ void rds_ib_free_mr(void *trans_private, int invalidate)
rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len);
/* Return it to the pool's free list */
- if (ibmr->remap_count >= pool->fmr_attr.max_maps)
- llist_add(&ibmr->llnode, &pool->drop_list);
+ if (rds_ibdev->use_fastreg)
+ rds_ib_free_frmr_list(ibmr);
else
- llist_add(&ibmr->llnode, &pool->free_list);
+ rds_ib_free_fmr_list(ibmr);
atomic_add(ibmr->sg_len, &pool->free_pinned);
atomic_inc(&pool->dirty_count);
@@ -773,7 +503,7 @@ void rds_ib_free_mr(void *trans_private, int invalidate)
/* If we've pinned too many pages, request a flush */
if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned ||
atomic_read(&pool->dirty_count) >= pool->max_items / 5)
- queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10);
+ queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10);
if (invalidate) {
if (likely(!in_interrupt())) {
@@ -782,7 +512,7 @@ void rds_ib_free_mr(void *trans_private, int invalidate)
/* We get here if the user created a MR marked
* as use_once and invalidate at the same time.
*/
- queue_delayed_work(rds_ib_fmr_wq,
+ queue_delayed_work(rds_ib_mr_wq,
&pool->flush_worker, 10);
}
}
@@ -810,6 +540,7 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
{
struct rds_ib_device *rds_ibdev;
struct rds_ib_mr *ibmr = NULL;
+ struct rds_ib_connection *ic = rs->rs_conn->c_transport_data;
int ret;
rds_ibdev = rds_ib_get_device(rs->rs_bound_addr);
@@ -823,29 +554,81 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
goto out;
}
- ibmr = rds_ib_alloc_fmr(rds_ibdev, nents);
- if (IS_ERR(ibmr)) {
- rds_ib_dev_put(rds_ibdev);
- return ibmr;
- }
-
- ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents);
- if (ret == 0)
- *key_ret = ibmr->fmr->rkey;
+ if (rds_ibdev->use_fastreg)
+ ibmr = rds_ib_reg_frmr(rds_ibdev, ic, sg, nents, key_ret);
else
- printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret);
-
- ibmr->device = rds_ibdev;
- rds_ibdev = NULL;
+ ibmr = rds_ib_reg_fmr(rds_ibdev, sg, nents, key_ret);
+ if (ibmr)
+ rds_ibdev = NULL;
out:
- if (ret) {
- if (ibmr)
- rds_ib_free_mr(ibmr, 0);
- ibmr = ERR_PTR(ret);
- }
+ if (!ibmr)
+ pr_warn("RDS/IB: rds_ib_get_mr failed (errno=%d)\n", ret);
+
if (rds_ibdev)
rds_ib_dev_put(rds_ibdev);
+
return ibmr;
}
+void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool)
+{
+ cancel_delayed_work_sync(&pool->flush_worker);
+ rds_ib_flush_mr_pool(pool, 1, NULL);
+ WARN_ON(atomic_read(&pool->item_count));
+ WARN_ON(atomic_read(&pool->free_pinned));
+ kfree(pool);
+}
+
+struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev,
+ int pool_type)
+{
+ struct rds_ib_mr_pool *pool;
+
+ pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+ if (!pool)
+ return ERR_PTR(-ENOMEM);
+
+ pool->pool_type = pool_type;
+ init_llist_head(&pool->free_list);
+ init_llist_head(&pool->drop_list);
+ init_llist_head(&pool->clean_list);
+ mutex_init(&pool->flush_lock);
+ init_waitqueue_head(&pool->flush_wait);
+ INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
+
+ if (pool_type == RDS_IB_MR_1M_POOL) {
+ /* +1 allows for unaligned MRs */
+ pool->fmr_attr.max_pages = RDS_MR_1M_MSG_SIZE + 1;
+ pool->max_items = RDS_MR_1M_POOL_SIZE;
+ } else {
+ /* pool_type == RDS_IB_MR_8K_POOL */
+ pool->fmr_attr.max_pages = RDS_MR_8K_MSG_SIZE + 1;
+ pool->max_items = RDS_MR_8K_POOL_SIZE;
+ }
+
+ pool->max_free_pinned = pool->max_items * pool->fmr_attr.max_pages / 4;
+ pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps;
+ pool->fmr_attr.page_shift = PAGE_SHIFT;
+ pool->max_items_soft = rds_ibdev->max_mrs * 3 / 4;
+ pool->use_fastreg = rds_ibdev->use_fastreg;
+
+ return pool;
+}
+
+int rds_ib_mr_init(void)
+{
+ rds_ib_mr_wq = create_workqueue("rds_mr_flushd");
+ if (!rds_ib_mr_wq)
+ return -ENOMEM;
+ return 0;
+}
+
+/* By the time this is called all the IB devices should have been torn down and
+ * had their pools freed. As each pool is freed its work struct is waited on,
+ * so the pool flushing work queue should be idle by the time we get here.
+ */
+void rds_ib_mr_exit(void)
+{
+ destroy_workqueue(rds_ib_mr_wq);
+}
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index 977fb8606..abc8cc805 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -796,7 +796,7 @@ static void rds_ib_cong_recv(struct rds_connection *conn,
addr = kmap_atomic(sg_page(&frag->f_sg));
- src = addr + frag_off;
+ src = addr + frag->f_sg.offset + frag_off;
dst = (void *)map->m_page_addrs[map_page] + map_off;
for (k = 0; k < to_copy; k += 8) {
/* Record ports that became uncongested, ie
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index eac30bf48..f27d2c82b 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -195,7 +195,7 @@ void rds_ib_send_init_ring(struct rds_ib_connection *ic)
send->s_op = NULL;
- send->s_wr.wr_id = i | RDS_IB_SEND_OP;
+ send->s_wr.wr_id = i;
send->s_wr.sg_list = send->s_sge;
send->s_wr.ex.imm_data = 0;
@@ -263,9 +263,7 @@ void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
oldest = rds_ib_ring_oldest(&ic->i_send_ring);
- completed = rds_ib_ring_completed(&ic->i_send_ring,
- (wc->wr_id & ~RDS_IB_SEND_OP),
- oldest);
+ completed = rds_ib_ring_completed(&ic->i_send_ring, wc->wr_id, oldest);
for (i = 0; i < completed; i++) {
send = &ic->i_sends[oldest];
diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c
index d77e04473..7e78dca1f 100644
--- a/net/rds/ib_stats.c
+++ b/net/rds/ib_stats.c
@@ -73,6 +73,8 @@ static const char *const rds_ib_stat_names[] = {
"ib_rdma_mr_1m_pool_flush",
"ib_rdma_mr_1m_pool_wait",
"ib_rdma_mr_1m_pool_depleted",
+ "ib_rdma_mr_8k_reused",
+ "ib_rdma_mr_1m_reused",
"ib_atomic_cswp",
"ib_atomic_fadd",
};
diff --git a/net/rds/iw.c b/net/rds/iw.c
deleted file mode 100644
index f4a9fff82..000000000
--- a/net/rds/iw.c
+++ /dev/null
@@ -1,312 +0,0 @@
-/*
- * Copyright (c) 2006 Oracle. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/kernel.h>
-#include <linux/in.h>
-#include <linux/if.h>
-#include <linux/netdevice.h>
-#include <linux/inetdevice.h>
-#include <linux/if_arp.h>
-#include <linux/delay.h>
-#include <linux/slab.h>
-#include <linux/module.h>
-
-#include "rds.h"
-#include "iw.h"
-
-unsigned int fastreg_pool_size = RDS_FASTREG_POOL_SIZE;
-unsigned int fastreg_message_size = RDS_FASTREG_SIZE + 1; /* +1 allows for unaligned MRs */
-
-module_param(fastreg_pool_size, int, 0444);
-MODULE_PARM_DESC(fastreg_pool_size, " Max number of fastreg MRs per device");
-module_param(fastreg_message_size, int, 0444);
-MODULE_PARM_DESC(fastreg_message_size, " Max size of a RDMA transfer (fastreg MRs)");
-
-struct list_head rds_iw_devices;
-
-/* NOTE: if also grabbing iwdev lock, grab this first */
-DEFINE_SPINLOCK(iw_nodev_conns_lock);
-LIST_HEAD(iw_nodev_conns);
-
-static void rds_iw_add_one(struct ib_device *device)
-{
- struct rds_iw_device *rds_iwdev;
-
- /* Only handle iwarp devices */
- if (device->node_type != RDMA_NODE_RNIC)
- return;
-
- rds_iwdev = kmalloc(sizeof *rds_iwdev, GFP_KERNEL);
- if (!rds_iwdev)
- return;
-
- spin_lock_init(&rds_iwdev->spinlock);
-
- rds_iwdev->dma_local_lkey = !!(device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY);
- rds_iwdev->max_wrs = device->attrs.max_qp_wr;
- rds_iwdev->max_sge = min(device->attrs.max_sge, RDS_IW_MAX_SGE);
-
- rds_iwdev->dev = device;
- rds_iwdev->pd = ib_alloc_pd(device);
- if (IS_ERR(rds_iwdev->pd))
- goto free_dev;
-
- if (!rds_iwdev->dma_local_lkey) {
- rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd,
- IB_ACCESS_REMOTE_READ |
- IB_ACCESS_REMOTE_WRITE |
- IB_ACCESS_LOCAL_WRITE);
- if (IS_ERR(rds_iwdev->mr))
- goto err_pd;
- } else
- rds_iwdev->mr = NULL;
-
- rds_iwdev->mr_pool = rds_iw_create_mr_pool(rds_iwdev);
- if (IS_ERR(rds_iwdev->mr_pool)) {
- rds_iwdev->mr_pool = NULL;
- goto err_mr;
- }
-
- INIT_LIST_HEAD(&rds_iwdev->cm_id_list);
- INIT_LIST_HEAD(&rds_iwdev->conn_list);
- list_add_tail(&rds_iwdev->list, &rds_iw_devices);
-
- ib_set_client_data(device, &rds_iw_client, rds_iwdev);
- return;
-
-err_mr:
- if (rds_iwdev->mr)
- ib_dereg_mr(rds_iwdev->mr);
-err_pd:
- ib_dealloc_pd(rds_iwdev->pd);
-free_dev:
- kfree(rds_iwdev);
-}
-
-static void rds_iw_remove_one(struct ib_device *device, void *client_data)
-{
- struct rds_iw_device *rds_iwdev = client_data;
- struct rds_iw_cm_id *i_cm_id, *next;
-
- if (!rds_iwdev)
- return;
-
- spin_lock_irq(&rds_iwdev->spinlock);
- list_for_each_entry_safe(i_cm_id, next, &rds_iwdev->cm_id_list, list) {
- list_del(&i_cm_id->list);
- kfree(i_cm_id);
- }
- spin_unlock_irq(&rds_iwdev->spinlock);
-
- rds_iw_destroy_conns(rds_iwdev);
-
- if (rds_iwdev->mr_pool)
- rds_iw_destroy_mr_pool(rds_iwdev->mr_pool);
-
- if (rds_iwdev->mr)
- ib_dereg_mr(rds_iwdev->mr);
-
- ib_dealloc_pd(rds_iwdev->pd);
-
- list_del(&rds_iwdev->list);
- kfree(rds_iwdev);
-}
-
-struct ib_client rds_iw_client = {
- .name = "rds_iw",
- .add = rds_iw_add_one,
- .remove = rds_iw_remove_one
-};
-
-static int rds_iw_conn_info_visitor(struct rds_connection *conn,
- void *buffer)
-{
- struct rds_info_rdma_connection *iinfo = buffer;
- struct rds_iw_connection *ic;
-
- /* We will only ever look at IB transports */
- if (conn->c_trans != &rds_iw_transport)
- return 0;
-
- iinfo->src_addr = conn->c_laddr;
- iinfo->dst_addr = conn->c_faddr;
-
- memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid));
- memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid));
- if (rds_conn_state(conn) == RDS_CONN_UP) {
- struct rds_iw_device *rds_iwdev;
- struct rdma_dev_addr *dev_addr;
-
- ic = conn->c_transport_data;
- dev_addr = &ic->i_cm_id->route.addr.dev_addr;
-
- rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
- rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
-
- rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
- iinfo->max_send_wr = ic->i_send_ring.w_nr;
- iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
- iinfo->max_send_sge = rds_iwdev->max_sge;
- rds_iw_get_mr_info(rds_iwdev, iinfo);
- }
- return 1;
-}
-
-static void rds_iw_ic_info(struct socket *sock, unsigned int len,
- struct rds_info_iterator *iter,
- struct rds_info_lengths *lens)
-{
- rds_for_each_conn_info(sock, len, iter, lens,
- rds_iw_conn_info_visitor,
- sizeof(struct rds_info_rdma_connection));
-}
-
-
-/*
- * Early RDS/IB was built to only bind to an address if there is an IPoIB
- * device with that address set.
- *
- * If it were me, I'd advocate for something more flexible. Sending and
- * receiving should be device-agnostic. Transports would try and maintain
- * connections between peers who have messages queued. Userspace would be
- * allowed to influence which paths have priority. We could call userspace
- * asserting this policy "routing".
- */
-static int rds_iw_laddr_check(struct net *net, __be32 addr)
-{
- int ret;
- struct rdma_cm_id *cm_id;
- struct sockaddr_in sin;
-
- /* Create a CMA ID and try to bind it. This catches both
- * IB and iWARP capable NICs.
- */
- cm_id = rdma_create_id(&init_net, NULL, NULL, RDMA_PS_TCP, IB_QPT_RC);
- if (IS_ERR(cm_id))
- return PTR_ERR(cm_id);
-
- memset(&sin, 0, sizeof(sin));
- sin.sin_family = AF_INET;
- sin.sin_addr.s_addr = addr;
-
- /* rdma_bind_addr will only succeed for IB & iWARP devices */
- ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
- /* due to this, we will claim to support IB devices unless we
- check node_type. */
- if (ret || !cm_id->device ||
- cm_id->device->node_type != RDMA_NODE_RNIC)
- ret = -EADDRNOTAVAIL;
-
- rdsdebug("addr %pI4 ret %d node type %d\n",
- &addr, ret,
- cm_id->device ? cm_id->device->node_type : -1);
-
- rdma_destroy_id(cm_id);
-
- return ret;
-}
-
-void rds_iw_exit(void)
-{
- rds_info_deregister_func(RDS_INFO_IWARP_CONNECTIONS, rds_iw_ic_info);
- rds_iw_destroy_nodev_conns();
- ib_unregister_client(&rds_iw_client);
- rds_iw_sysctl_exit();
- rds_iw_recv_exit();
- rds_trans_unregister(&rds_iw_transport);
-}
-
-struct rds_transport rds_iw_transport = {
- .laddr_check = rds_iw_laddr_check,
- .xmit_complete = rds_iw_xmit_complete,
- .xmit = rds_iw_xmit,
- .xmit_rdma = rds_iw_xmit_rdma,
- .recv = rds_iw_recv,
- .conn_alloc = rds_iw_conn_alloc,
- .conn_free = rds_iw_conn_free,
- .conn_connect = rds_iw_conn_connect,
- .conn_shutdown = rds_iw_conn_shutdown,
- .inc_copy_to_user = rds_iw_inc_copy_to_user,
- .inc_free = rds_iw_inc_free,
- .cm_initiate_connect = rds_iw_cm_initiate_connect,
- .cm_handle_connect = rds_iw_cm_handle_connect,
- .cm_connect_complete = rds_iw_cm_connect_complete,
- .stats_info_copy = rds_iw_stats_info_copy,
- .exit = rds_iw_exit,
- .get_mr = rds_iw_get_mr,
- .sync_mr = rds_iw_sync_mr,
- .free_mr = rds_iw_free_mr,
- .flush_mrs = rds_iw_flush_mrs,
- .t_owner = THIS_MODULE,
- .t_name = "iwarp",
- .t_type = RDS_TRANS_IWARP,
- .t_prefer_loopback = 1,
-};
-
-int rds_iw_init(void)
-{
- int ret;
-
- INIT_LIST_HEAD(&rds_iw_devices);
-
- ret = ib_register_client(&rds_iw_client);
- if (ret)
- goto out;
-
- ret = rds_iw_sysctl_init();
- if (ret)
- goto out_ibreg;
-
- ret = rds_iw_recv_init();
- if (ret)
- goto out_sysctl;
-
- ret = rds_trans_register(&rds_iw_transport);
- if (ret)
- goto out_recv;
-
- rds_info_register_func(RDS_INFO_IWARP_CONNECTIONS, rds_iw_ic_info);
-
- goto out;
-
-out_recv:
- rds_iw_recv_exit();
-out_sysctl:
- rds_iw_sysctl_exit();
-out_ibreg:
- ib_unregister_client(&rds_iw_client);
-out:
- return ret;
-}
-
-MODULE_LICENSE("GPL");
-
diff --git a/net/rds/iw.h b/net/rds/iw.h
deleted file mode 100644
index 5af01d175..000000000
--- a/net/rds/iw.h
+++ /dev/null
@@ -1,398 +0,0 @@
-#ifndef _RDS_IW_H
-#define _RDS_IW_H
-
-#include <linux/interrupt.h>
-#include <rdma/ib_verbs.h>
-#include <rdma/rdma_cm.h>
-#include "rds.h"
-#include "rdma_transport.h"
-
-#define RDS_FASTREG_SIZE 20
-#define RDS_FASTREG_POOL_SIZE 2048
-
-#define RDS_IW_MAX_SGE 8
-#define RDS_IW_RECV_SGE 2
-
-#define RDS_IW_DEFAULT_RECV_WR 1024
-#define RDS_IW_DEFAULT_SEND_WR 256
-
-#define RDS_IW_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */
-
-extern struct list_head rds_iw_devices;
-
-/*
- * IB posts RDS_FRAG_SIZE fragments of pages to the receive queues to
- * try and minimize the amount of memory tied up both the device and
- * socket receive queues.
- */
-/* page offset of the final full frag that fits in the page */
-#define RDS_PAGE_LAST_OFF (((PAGE_SIZE / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE)
-struct rds_page_frag {
- struct list_head f_item;
- struct page *f_page;
- unsigned long f_offset;
- dma_addr_t f_mapped;
-};
-
-struct rds_iw_incoming {
- struct list_head ii_frags;
- struct rds_incoming ii_inc;
-};
-
-struct rds_iw_connect_private {
- /* Add new fields at the end, and don't permute existing fields. */
- __be32 dp_saddr;
- __be32 dp_daddr;
- u8 dp_protocol_major;
- u8 dp_protocol_minor;
- __be16 dp_protocol_minor_mask; /* bitmask */
- __be32 dp_reserved1;
- __be64 dp_ack_seq;
- __be32 dp_credit; /* non-zero enables flow ctl */
-};
-
-struct rds_iw_scatterlist {
- struct scatterlist *list;
- unsigned int len;
- int dma_len;
- unsigned int dma_npages;
- unsigned int bytes;
-};
-
-struct rds_iw_mapping {
- spinlock_t m_lock; /* protect the mapping struct */
- struct list_head m_list;
- struct rds_iw_mr *m_mr;
- uint32_t m_rkey;
- struct rds_iw_scatterlist m_sg;
-};
-
-struct rds_iw_send_work {
- struct rds_message *s_rm;
-
- /* We should really put these into a union: */
- struct rm_rdma_op *s_op;
- struct rds_iw_mapping *s_mapping;
- struct ib_mr *s_mr;
- unsigned char s_remap_count;
-
- union {
- struct ib_send_wr s_send_wr;
- struct ib_rdma_wr s_rdma_wr;
- struct ib_reg_wr s_reg_wr;
- };
- struct ib_sge s_sge[RDS_IW_MAX_SGE];
- unsigned long s_queued;
-};
-
-struct rds_iw_recv_work {
- struct rds_iw_incoming *r_iwinc;
- struct rds_page_frag *r_frag;
- struct ib_recv_wr r_wr;
- struct ib_sge r_sge[2];
-};
-
-struct rds_iw_work_ring {
- u32 w_nr;
- u32 w_alloc_ptr;
- u32 w_alloc_ctr;
- u32 w_free_ptr;
- atomic_t w_free_ctr;
-};
-
-struct rds_iw_device;
-
-struct rds_iw_connection {
-
- struct list_head iw_node;
- struct rds_iw_device *rds_iwdev;
- struct rds_connection *conn;
-
- /* alphabet soup, IBTA style */
- struct rdma_cm_id *i_cm_id;
- struct ib_pd *i_pd;
- struct ib_mr *i_mr;
- struct ib_cq *i_send_cq;
- struct ib_cq *i_recv_cq;
-
- /* tx */
- struct rds_iw_work_ring i_send_ring;
- struct rds_message *i_rm;
- struct rds_header *i_send_hdrs;
- u64 i_send_hdrs_dma;
- struct rds_iw_send_work *i_sends;
-
- /* rx */
- struct tasklet_struct i_recv_tasklet;
- struct mutex i_recv_mutex;
- struct rds_iw_work_ring i_recv_ring;
- struct rds_iw_incoming *i_iwinc;
- u32 i_recv_data_rem;
- struct rds_header *i_recv_hdrs;
- u64 i_recv_hdrs_dma;
- struct rds_iw_recv_work *i_recvs;
- struct rds_page_frag i_frag;
- u64 i_ack_recv; /* last ACK received */
-
- /* sending acks */
- unsigned long i_ack_flags;
-#ifdef KERNEL_HAS_ATOMIC64
- atomic64_t i_ack_next; /* next ACK to send */
-#else
- spinlock_t i_ack_lock; /* protect i_ack_next */
- u64 i_ack_next; /* next ACK to send */
-#endif
- struct rds_header *i_ack;
- struct ib_send_wr i_ack_wr;
- struct ib_sge i_ack_sge;
- u64 i_ack_dma;
- unsigned long i_ack_queued;
-
- /* Flow control related information
- *
- * Our algorithm uses a pair variables that we need to access
- * atomically - one for the send credits, and one posted
- * recv credits we need to transfer to remote.
- * Rather than protect them using a slow spinlock, we put both into
- * a single atomic_t and update it using cmpxchg
- */
- atomic_t i_credits;
-
- /* Protocol version specific information */
- unsigned int i_flowctl:1; /* enable/disable flow ctl */
- unsigned int i_dma_local_lkey:1;
- unsigned int i_fastreg_posted:1; /* fastreg posted on this connection */
- /* Batched completions */
- unsigned int i_unsignaled_wrs;
- long i_unsignaled_bytes;
-};
-
-/* This assumes that atomic_t is at least 32 bits */
-#define IB_GET_SEND_CREDITS(v) ((v) & 0xffff)
-#define IB_GET_POST_CREDITS(v) ((v) >> 16)
-#define IB_SET_SEND_CREDITS(v) ((v) & 0xffff)
-#define IB_SET_POST_CREDITS(v) ((v) << 16)
-
-struct rds_iw_cm_id {
- struct list_head list;
- struct rdma_cm_id *cm_id;
-};
-
-struct rds_iw_device {
- struct list_head list;
- struct list_head cm_id_list;
- struct list_head conn_list;
- struct ib_device *dev;
- struct ib_pd *pd;
- struct ib_mr *mr;
- struct rds_iw_mr_pool *mr_pool;
- int max_sge;
- unsigned int max_wrs;
- unsigned int dma_local_lkey:1;
- spinlock_t spinlock; /* protect the above */
-};
-
-/* bits for i_ack_flags */
-#define IB_ACK_IN_FLIGHT 0
-#define IB_ACK_REQUESTED 1
-
-/* Magic WR_ID for ACKs */
-#define RDS_IW_ACK_WR_ID ((u64)0xffffffffffffffffULL)
-#define RDS_IW_REG_WR_ID ((u64)0xefefefefefefefefULL)
-#define RDS_IW_LOCAL_INV_WR_ID ((u64)0xdfdfdfdfdfdfdfdfULL)
-
-struct rds_iw_statistics {
- uint64_t s_iw_connect_raced;
- uint64_t s_iw_listen_closed_stale;
- uint64_t s_iw_tx_cq_call;
- uint64_t s_iw_tx_cq_event;
- uint64_t s_iw_tx_ring_full;
- uint64_t s_iw_tx_throttle;
- uint64_t s_iw_tx_sg_mapping_failure;
- uint64_t s_iw_tx_stalled;
- uint64_t s_iw_tx_credit_updates;
- uint64_t s_iw_rx_cq_call;
- uint64_t s_iw_rx_cq_event;
- uint64_t s_iw_rx_ring_empty;
- uint64_t s_iw_rx_refill_from_cq;
- uint64_t s_iw_rx_refill_from_thread;
- uint64_t s_iw_rx_alloc_limit;
- uint64_t s_iw_rx_credit_updates;
- uint64_t s_iw_ack_sent;
- uint64_t s_iw_ack_send_failure;
- uint64_t s_iw_ack_send_delayed;
- uint64_t s_iw_ack_send_piggybacked;
- uint64_t s_iw_ack_received;
- uint64_t s_iw_rdma_mr_alloc;
- uint64_t s_iw_rdma_mr_free;
- uint64_t s_iw_rdma_mr_used;
- uint64_t s_iw_rdma_mr_pool_flush;
- uint64_t s_iw_rdma_mr_pool_wait;
- uint64_t s_iw_rdma_mr_pool_depleted;
-};
-
-extern struct workqueue_struct *rds_iw_wq;
-
-/*
- * Fake ib_dma_sync_sg_for_{cpu,device} as long as ib_verbs.h
- * doesn't define it.
- */
-static inline void rds_iw_dma_sync_sg_for_cpu(struct ib_device *dev,
- struct scatterlist *sg, unsigned int sg_dma_len, int direction)
-{
- unsigned int i;
-
- for (i = 0; i < sg_dma_len; ++i) {
- ib_dma_sync_single_for_cpu(dev,
- ib_sg_dma_address(dev, &sg[i]),
- ib_sg_dma_len(dev, &sg[i]),
- direction);
- }
-}
-#define ib_dma_sync_sg_for_cpu rds_iw_dma_sync_sg_for_cpu
-
-static inline void rds_iw_dma_sync_sg_for_device(struct ib_device *dev,
- struct scatterlist *sg, unsigned int sg_dma_len, int direction)
-{
- unsigned int i;
-
- for (i = 0; i < sg_dma_len; ++i) {
- ib_dma_sync_single_for_device(dev,
- ib_sg_dma_address(dev, &sg[i]),
- ib_sg_dma_len(dev, &sg[i]),
- direction);
- }
-}
-#define ib_dma_sync_sg_for_device rds_iw_dma_sync_sg_for_device
-
-static inline u32 rds_iw_local_dma_lkey(struct rds_iw_connection *ic)
-{
- return ic->i_dma_local_lkey ? ic->i_cm_id->device->local_dma_lkey : ic->i_mr->lkey;
-}
-
-/* ib.c */
-extern struct rds_transport rds_iw_transport;
-extern struct ib_client rds_iw_client;
-
-extern unsigned int fastreg_pool_size;
-extern unsigned int fastreg_message_size;
-
-extern spinlock_t iw_nodev_conns_lock;
-extern struct list_head iw_nodev_conns;
-
-/* ib_cm.c */
-int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp);
-void rds_iw_conn_free(void *arg);
-int rds_iw_conn_connect(struct rds_connection *conn);
-void rds_iw_conn_shutdown(struct rds_connection *conn);
-void rds_iw_state_change(struct sock *sk);
-int rds_iw_listen_init(void);
-void rds_iw_listen_stop(void);
-void __rds_iw_conn_error(struct rds_connection *conn, const char *, ...);
-int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id,
- struct rdma_cm_event *event);
-int rds_iw_cm_initiate_connect(struct rdma_cm_id *cm_id);
-void rds_iw_cm_connect_complete(struct rds_connection *conn,
- struct rdma_cm_event *event);
-
-
-#define rds_iw_conn_error(conn, fmt...) \
- __rds_iw_conn_error(conn, KERN_WARNING "RDS/IW: " fmt)
-
-/* ib_rdma.c */
-int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id);
-void rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn);
-void rds_iw_remove_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn);
-void __rds_iw_destroy_conns(struct list_head *list, spinlock_t *list_lock);
-static inline void rds_iw_destroy_nodev_conns(void)
-{
- __rds_iw_destroy_conns(&iw_nodev_conns, &iw_nodev_conns_lock);
-}
-static inline void rds_iw_destroy_conns(struct rds_iw_device *rds_iwdev)
-{
- __rds_iw_destroy_conns(&rds_iwdev->conn_list, &rds_iwdev->spinlock);
-}
-struct rds_iw_mr_pool *rds_iw_create_mr_pool(struct rds_iw_device *);
-void rds_iw_get_mr_info(struct rds_iw_device *rds_iwdev, struct rds_info_rdma_connection *iinfo);
-void rds_iw_destroy_mr_pool(struct rds_iw_mr_pool *);
-void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents,
- struct rds_sock *rs, u32 *key_ret);
-void rds_iw_sync_mr(void *trans_private, int dir);
-void rds_iw_free_mr(void *trans_private, int invalidate);
-void rds_iw_flush_mrs(void);
-
-/* ib_recv.c */
-int rds_iw_recv_init(void);
-void rds_iw_recv_exit(void);
-int rds_iw_recv(struct rds_connection *conn);
-int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
- gfp_t page_gfp, int prefill);
-void rds_iw_inc_free(struct rds_incoming *inc);
-int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to);
-void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context);
-void rds_iw_recv_tasklet_fn(unsigned long data);
-void rds_iw_recv_init_ring(struct rds_iw_connection *ic);
-void rds_iw_recv_clear_ring(struct rds_iw_connection *ic);
-void rds_iw_recv_init_ack(struct rds_iw_connection *ic);
-void rds_iw_attempt_ack(struct rds_iw_connection *ic);
-void rds_iw_ack_send_complete(struct rds_iw_connection *ic);
-u64 rds_iw_piggyb_ack(struct rds_iw_connection *ic);
-
-/* ib_ring.c */
-void rds_iw_ring_init(struct rds_iw_work_ring *ring, u32 nr);
-void rds_iw_ring_resize(struct rds_iw_work_ring *ring, u32 nr);
-u32 rds_iw_ring_alloc(struct rds_iw_work_ring *ring, u32 val, u32 *pos);
-void rds_iw_ring_free(struct rds_iw_work_ring *ring, u32 val);
-void rds_iw_ring_unalloc(struct rds_iw_work_ring *ring, u32 val);
-int rds_iw_ring_empty(struct rds_iw_work_ring *ring);
-int rds_iw_ring_low(struct rds_iw_work_ring *ring);
-u32 rds_iw_ring_oldest(struct rds_iw_work_ring *ring);
-u32 rds_iw_ring_completed(struct rds_iw_work_ring *ring, u32 wr_id, u32 oldest);
-extern wait_queue_head_t rds_iw_ring_empty_wait;
-
-/* ib_send.c */
-void rds_iw_xmit_complete(struct rds_connection *conn);
-int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
- unsigned int hdr_off, unsigned int sg, unsigned int off);
-void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context);
-void rds_iw_send_init_ring(struct rds_iw_connection *ic);
-void rds_iw_send_clear_ring(struct rds_iw_connection *ic);
-int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op);
-void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits);
-void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted);
-int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted,
- u32 *adv_credits, int need_posted, int max_posted);
-
-/* ib_stats.c */
-DECLARE_PER_CPU(struct rds_iw_statistics, rds_iw_stats);
-#define rds_iw_stats_inc(member) rds_stats_inc_which(rds_iw_stats, member)
-unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter,
- unsigned int avail);
-
-/* ib_sysctl.c */
-int rds_iw_sysctl_init(void);
-void rds_iw_sysctl_exit(void);
-extern unsigned long rds_iw_sysctl_max_send_wr;
-extern unsigned long rds_iw_sysctl_max_recv_wr;
-extern unsigned long rds_iw_sysctl_max_unsig_wrs;
-extern unsigned long rds_iw_sysctl_max_unsig_bytes;
-extern unsigned long rds_iw_sysctl_max_recv_allocation;
-extern unsigned int rds_iw_sysctl_flow_control;
-
-/*
- * Helper functions for getting/setting the header and data SGEs in
- * RDS packets (not RDMA)
- */
-static inline struct ib_sge *
-rds_iw_header_sge(struct rds_iw_connection *ic, struct ib_sge *sge)
-{
- return &sge[0];
-}
-
-static inline struct ib_sge *
-rds_iw_data_sge(struct rds_iw_connection *ic, struct ib_sge *sge)
-{
- return &sge[1];
-}
-
-#endif
diff --git a/net/rds/iw_cm.c b/net/rds/iw_cm.c
deleted file mode 100644
index aea4c911b..000000000
--- a/net/rds/iw_cm.c
+++ /dev/null
@@ -1,769 +0,0 @@
-/*
- * Copyright (c) 2006 Oracle. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/kernel.h>
-#include <linux/in.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include <linux/ratelimit.h>
-
-#include "rds.h"
-#include "iw.h"
-
-/*
- * Set the selected protocol version
- */
-static void rds_iw_set_protocol(struct rds_connection *conn, unsigned int version)
-{
- conn->c_version = version;
-}
-
-/*
- * Set up flow control
- */
-static void rds_iw_set_flow_control(struct rds_connection *conn, u32 credits)
-{
- struct rds_iw_connection *ic = conn->c_transport_data;
-
- if (rds_iw_sysctl_flow_control && credits != 0) {
- /* We're doing flow control */
- ic->i_flowctl = 1;
- rds_iw_send_add_credits(conn, credits);
- } else {
- ic->i_flowctl = 0;
- }
-}
-
-/*
- * Connection established.
- * We get here for both outgoing and incoming connection.
- */
-void rds_iw_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event)
-{
- const struct rds_iw_connect_private *dp = NULL;
- struct rds_iw_connection *ic = conn->c_transport_data;
- struct rds_iw_device *rds_iwdev;
- int err;
-
- if (event->param.conn.private_data_len) {
- dp = event->param.conn.private_data;
-
- rds_iw_set_protocol(conn,
- RDS_PROTOCOL(dp->dp_protocol_major,
- dp->dp_protocol_minor));
- rds_iw_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
- }
-
- /* update ib_device with this local ipaddr & conn */
- rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
- err = rds_iw_update_cm_id(rds_iwdev, ic->i_cm_id);
- if (err)
- printk(KERN_ERR "rds_iw_update_ipaddr failed (%d)\n", err);
- rds_iw_add_conn(rds_iwdev, conn);
-
- /* If the peer gave us the last packet it saw, process this as if
- * we had received a regular ACK. */
- if (dp && dp->dp_ack_seq)
- rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
-
- printk(KERN_NOTICE "RDS/IW: connected to %pI4<->%pI4 version %u.%u%s\n",
- &conn->c_laddr, &conn->c_faddr,
- RDS_PROTOCOL_MAJOR(conn->c_version),
- RDS_PROTOCOL_MINOR(conn->c_version),
- ic->i_flowctl ? ", flow control" : "");
-
- rds_connect_complete(conn);
-}
-
-static void rds_iw_cm_fill_conn_param(struct rds_connection *conn,
- struct rdma_conn_param *conn_param,
- struct rds_iw_connect_private *dp,
- u32 protocol_version)
-{
- struct rds_iw_connection *ic = conn->c_transport_data;
-
- memset(conn_param, 0, sizeof(struct rdma_conn_param));
- /* XXX tune these? */
- conn_param->responder_resources = 1;
- conn_param->initiator_depth = 1;
-
- if (dp) {
- memset(dp, 0, sizeof(*dp));
- dp->dp_saddr = conn->c_laddr;
- dp->dp_daddr = conn->c_faddr;
- dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version);
- dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version);
- dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IW_SUPPORTED_PROTOCOLS);
- dp->dp_ack_seq = rds_iw_piggyb_ack(ic);
-
- /* Advertise flow control */
- if (ic->i_flowctl) {
- unsigned int credits;
-
- credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits));
- dp->dp_credit = cpu_to_be32(credits);
- atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits);
- }
-
- conn_param->private_data = dp;
- conn_param->private_data_len = sizeof(*dp);
- }
-}
-
-static void rds_iw_cq_event_handler(struct ib_event *event, void *data)
-{
- rdsdebug("event %u data %p\n", event->event, data);
-}
-
-static void rds_iw_qp_event_handler(struct ib_event *event, void *data)
-{
- struct rds_connection *conn = data;
- struct rds_iw_connection *ic = conn->c_transport_data;
-
- rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event);
-
- switch (event->event) {
- case IB_EVENT_COMM_EST:
- rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
- break;
- case IB_EVENT_QP_REQ_ERR:
- case IB_EVENT_QP_FATAL:
- default:
- rdsdebug("Fatal QP Event %u "
- "- connection %pI4->%pI4, reconnecting\n",
- event->event, &conn->c_laddr,
- &conn->c_faddr);
- rds_conn_drop(conn);
- break;
- }
-}
-
-/*
- * Create a QP
- */
-static int rds_iw_init_qp_attrs(struct ib_qp_init_attr *attr,
- struct rds_iw_device *rds_iwdev,
- struct rds_iw_work_ring *send_ring,
- void (*send_cq_handler)(struct ib_cq *, void *),
- struct rds_iw_work_ring *recv_ring,
- void (*recv_cq_handler)(struct ib_cq *, void *),
- void *context)
-{
- struct ib_device *dev = rds_iwdev->dev;
- struct ib_cq_init_attr cq_attr = {};
- unsigned int send_size, recv_size;
- int ret;
-
- /* The offset of 1 is to accommodate the additional ACK WR. */
- send_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_send_wr + 1);
- recv_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_recv_wr + 1);
- rds_iw_ring_resize(send_ring, send_size - 1);
- rds_iw_ring_resize(recv_ring, recv_size - 1);
-
- memset(attr, 0, sizeof(*attr));
- attr->event_handler = rds_iw_qp_event_handler;
- attr->qp_context = context;
- attr->cap.max_send_wr = send_size;
- attr->cap.max_recv_wr = recv_size;
- attr->cap.max_send_sge = rds_iwdev->max_sge;
- attr->cap.max_recv_sge = RDS_IW_RECV_SGE;
- attr->sq_sig_type = IB_SIGNAL_REQ_WR;
- attr->qp_type = IB_QPT_RC;
-
- cq_attr.cqe = send_size;
- attr->send_cq = ib_create_cq(dev, send_cq_handler,
- rds_iw_cq_event_handler,
- context, &cq_attr);
- if (IS_ERR(attr->send_cq)) {
- ret = PTR_ERR(attr->send_cq);
- attr->send_cq = NULL;
- rdsdebug("ib_create_cq send failed: %d\n", ret);
- goto out;
- }
-
- cq_attr.cqe = recv_size;
- attr->recv_cq = ib_create_cq(dev, recv_cq_handler,
- rds_iw_cq_event_handler,
- context, &cq_attr);
- if (IS_ERR(attr->recv_cq)) {
- ret = PTR_ERR(attr->recv_cq);
- attr->recv_cq = NULL;
- rdsdebug("ib_create_cq send failed: %d\n", ret);
- goto out;
- }
-
- ret = ib_req_notify_cq(attr->send_cq, IB_CQ_NEXT_COMP);
- if (ret) {
- rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
- goto out;
- }
-
- ret = ib_req_notify_cq(attr->recv_cq, IB_CQ_SOLICITED);
- if (ret) {
- rdsdebug("ib_req_notify_cq recv failed: %d\n", ret);
- goto out;
- }
-
-out:
- if (ret) {
- if (attr->send_cq)
- ib_destroy_cq(attr->send_cq);
- if (attr->recv_cq)
- ib_destroy_cq(attr->recv_cq);
- }
- return ret;
-}
-
-/*
- * This needs to be very careful to not leave IS_ERR pointers around for
- * cleanup to trip over.
- */
-static int rds_iw_setup_qp(struct rds_connection *conn)
-{
- struct rds_iw_connection *ic = conn->c_transport_data;
- struct ib_device *dev = ic->i_cm_id->device;
- struct ib_qp_init_attr attr;
- struct rds_iw_device *rds_iwdev;
- int ret;
-
- /* rds_iw_add_one creates a rds_iw_device object per IB device,
- * and allocates a protection domain, memory range and MR pool
- * for each. If that fails for any reason, it will not register
- * the rds_iwdev at all.
- */
- rds_iwdev = ib_get_client_data(dev, &rds_iw_client);
- if (!rds_iwdev) {
- printk_ratelimited(KERN_NOTICE "RDS/IW: No client_data for device %s\n",
- dev->name);
- return -EOPNOTSUPP;
- }
-
- /* Protection domain and memory range */
- ic->i_pd = rds_iwdev->pd;
- ic->i_mr = rds_iwdev->mr;
-
- ret = rds_iw_init_qp_attrs(&attr, rds_iwdev,
- &ic->i_send_ring, rds_iw_send_cq_comp_handler,
- &ic->i_recv_ring, rds_iw_recv_cq_comp_handler,
- conn);
- if (ret < 0)
- goto out;
-
- ic->i_send_cq = attr.send_cq;
- ic->i_recv_cq = attr.recv_cq;
-
- /*
- * XXX this can fail if max_*_wr is too large? Are we supposed
- * to back off until we get a value that the hardware can support?
- */
- ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr);
- if (ret) {
- rdsdebug("rdma_create_qp failed: %d\n", ret);
- goto out;
- }
-
- ic->i_send_hdrs = ib_dma_alloc_coherent(dev,
- ic->i_send_ring.w_nr *
- sizeof(struct rds_header),
- &ic->i_send_hdrs_dma, GFP_KERNEL);
- if (!ic->i_send_hdrs) {
- ret = -ENOMEM;
- rdsdebug("ib_dma_alloc_coherent send failed\n");
- goto out;
- }
-
- ic->i_recv_hdrs = ib_dma_alloc_coherent(dev,
- ic->i_recv_ring.w_nr *
- sizeof(struct rds_header),
- &ic->i_recv_hdrs_dma, GFP_KERNEL);
- if (!ic->i_recv_hdrs) {
- ret = -ENOMEM;
- rdsdebug("ib_dma_alloc_coherent recv failed\n");
- goto out;
- }
-
- ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
- &ic->i_ack_dma, GFP_KERNEL);
- if (!ic->i_ack) {
- ret = -ENOMEM;
- rdsdebug("ib_dma_alloc_coherent ack failed\n");
- goto out;
- }
-
- ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_iw_send_work));
- if (!ic->i_sends) {
- ret = -ENOMEM;
- rdsdebug("send allocation failed\n");
- goto out;
- }
- rds_iw_send_init_ring(ic);
-
- ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_iw_recv_work));
- if (!ic->i_recvs) {
- ret = -ENOMEM;
- rdsdebug("recv allocation failed\n");
- goto out;
- }
-
- rds_iw_recv_init_ring(ic);
- rds_iw_recv_init_ack(ic);
-
- /* Post receive buffers - as a side effect, this will update
- * the posted credit count. */
- rds_iw_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1);
-
- rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr,
- ic->i_send_cq, ic->i_recv_cq);
-
-out:
- return ret;
-}
-
-static u32 rds_iw_protocol_compatible(const struct rds_iw_connect_private *dp)
-{
- u16 common;
- u32 version = 0;
-
- /* rdma_cm private data is odd - when there is any private data in the
- * request, we will be given a pretty large buffer without telling us the
- * original size. The only way to tell the difference is by looking at
- * the contents, which are initialized to zero.
- * If the protocol version fields aren't set, this is a connection attempt
- * from an older version. This could could be 3.0 or 2.0 - we can't tell.
- * We really should have changed this for OFED 1.3 :-( */
- if (dp->dp_protocol_major == 0)
- return RDS_PROTOCOL_3_0;
-
- common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IW_SUPPORTED_PROTOCOLS;
- if (dp->dp_protocol_major == 3 && common) {
- version = RDS_PROTOCOL_3_0;
- while ((common >>= 1) != 0)
- version++;
- }
- printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using "
- "incompatible protocol version %u.%u\n",
- &dp->dp_saddr,
- dp->dp_protocol_major,
- dp->dp_protocol_minor);
- return version;
-}
-
-int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id,
- struct rdma_cm_event *event)
-{
- const struct rds_iw_connect_private *dp = event->param.conn.private_data;
- struct rds_iw_connect_private dp_rep;
- struct rds_connection *conn = NULL;
- struct rds_iw_connection *ic = NULL;
- struct rdma_conn_param conn_param;
- struct rds_iw_device *rds_iwdev;
- u32 version;
- int err, destroy = 1;
-
- /* Check whether the remote protocol version matches ours. */
- version = rds_iw_protocol_compatible(dp);
- if (!version)
- goto out;
-
- rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u\n",
- &dp->dp_saddr, &dp->dp_daddr,
- RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version));
-
- /* RDS/IW is not currently netns aware, thus init_net */
- conn = rds_conn_create(&init_net, dp->dp_daddr, dp->dp_saddr,
- &rds_iw_transport, GFP_KERNEL);
- if (IS_ERR(conn)) {
- rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
- conn = NULL;
- goto out;
- }
-
- /*
- * The connection request may occur while the
- * previous connection exist, e.g. in case of failover.
- * But as connections may be initiated simultaneously
- * by both hosts, we have a random backoff mechanism -
- * see the comment above rds_queue_reconnect()
- */
- mutex_lock(&conn->c_cm_lock);
- if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
- if (rds_conn_state(conn) == RDS_CONN_UP) {
- rdsdebug("incoming connect while connecting\n");
- rds_conn_drop(conn);
- rds_iw_stats_inc(s_iw_listen_closed_stale);
- } else
- if (rds_conn_state(conn) == RDS_CONN_CONNECTING) {
- /* Wait and see - our connect may still be succeeding */
- rds_iw_stats_inc(s_iw_connect_raced);
- }
- mutex_unlock(&conn->c_cm_lock);
- goto out;
- }
-
- ic = conn->c_transport_data;
-
- rds_iw_set_protocol(conn, version);
- rds_iw_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
-
- /* If the peer gave us the last packet it saw, process this as if
- * we had received a regular ACK. */
- if (dp->dp_ack_seq)
- rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
-
- BUG_ON(cm_id->context);
- BUG_ON(ic->i_cm_id);
-
- ic->i_cm_id = cm_id;
- cm_id->context = conn;
-
- rds_iwdev = ib_get_client_data(cm_id->device, &rds_iw_client);
- ic->i_dma_local_lkey = rds_iwdev->dma_local_lkey;
-
- /* We got halfway through setting up the ib_connection, if we
- * fail now, we have to take the long route out of this mess. */
- destroy = 0;
-
- err = rds_iw_setup_qp(conn);
- if (err) {
- rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", err);
- mutex_unlock(&conn->c_cm_lock);
- goto out;
- }
-
- rds_iw_cm_fill_conn_param(conn, &conn_param, &dp_rep, version);
-
- /* rdma_accept() calls rdma_reject() internally if it fails */
- err = rdma_accept(cm_id, &conn_param);
- mutex_unlock(&conn->c_cm_lock);
- if (err) {
- rds_iw_conn_error(conn, "rdma_accept failed (%d)\n", err);
- goto out;
- }
-
- return 0;
-
-out:
- rdma_reject(cm_id, NULL, 0);
- return destroy;
-}
-
-
-int rds_iw_cm_initiate_connect(struct rdma_cm_id *cm_id)
-{
- struct rds_connection *conn = cm_id->context;
- struct rds_iw_connection *ic = conn->c_transport_data;
- struct rdma_conn_param conn_param;
- struct rds_iw_connect_private dp;
- int ret;
-
- /* If the peer doesn't do protocol negotiation, we must
- * default to RDSv3.0 */
- rds_iw_set_protocol(conn, RDS_PROTOCOL_3_0);
- ic->i_flowctl = rds_iw_sysctl_flow_control; /* advertise flow control */
-
- ret = rds_iw_setup_qp(conn);
- if (ret) {
- rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", ret);
- goto out;
- }
-
- rds_iw_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION);
-
- ret = rdma_connect(cm_id, &conn_param);
- if (ret)
- rds_iw_conn_error(conn, "rdma_connect failed (%d)\n", ret);
-
-out:
- /* Beware - returning non-zero tells the rdma_cm to destroy
- * the cm_id. We should certainly not do it as long as we still
- * "own" the cm_id. */
- if (ret) {
- struct rds_iw_connection *ic = conn->c_transport_data;
-
- if (ic->i_cm_id == cm_id)
- ret = 0;
- }
- return ret;
-}
-
-int rds_iw_conn_connect(struct rds_connection *conn)
-{
- struct rds_iw_connection *ic = conn->c_transport_data;
- struct rds_iw_device *rds_iwdev;
- struct sockaddr_in src, dest;
- int ret;
-
- /* XXX I wonder what affect the port space has */
- /* delegate cm event handler to rdma_transport */
- ic->i_cm_id = rdma_create_id(&init_net, rds_rdma_cm_event_handler, conn,
- RDMA_PS_TCP, IB_QPT_RC);
- if (IS_ERR(ic->i_cm_id)) {
- ret = PTR_ERR(ic->i_cm_id);
- ic->i_cm_id = NULL;
- rdsdebug("rdma_create_id() failed: %d\n", ret);
- goto out;
- }
-
- rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn);
-
- src.sin_family = AF_INET;
- src.sin_addr.s_addr = (__force u32)conn->c_laddr;
- src.sin_port = (__force u16)htons(0);
-
- /* First, bind to the local address and device. */
- ret = rdma_bind_addr(ic->i_cm_id, (struct sockaddr *) &src);
- if (ret) {
- rdsdebug("rdma_bind_addr(%pI4) failed: %d\n",
- &conn->c_laddr, ret);
- rdma_destroy_id(ic->i_cm_id);
- ic->i_cm_id = NULL;
- goto out;
- }
-
- rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
- ic->i_dma_local_lkey = rds_iwdev->dma_local_lkey;
-
- dest.sin_family = AF_INET;
- dest.sin_addr.s_addr = (__force u32)conn->c_faddr;
- dest.sin_port = (__force u16)htons(RDS_PORT);
-
- ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src,
- (struct sockaddr *)&dest,
- RDS_RDMA_RESOLVE_TIMEOUT_MS);
- if (ret) {
- rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id,
- ret);
- rdma_destroy_id(ic->i_cm_id);
- ic->i_cm_id = NULL;
- }
-
-out:
- return ret;
-}
-
-/*
- * This is so careful about only cleaning up resources that were built up
- * so that it can be called at any point during startup. In fact it
- * can be called multiple times for a given connection.
- */
-void rds_iw_conn_shutdown(struct rds_connection *conn)
-{
- struct rds_iw_connection *ic = conn->c_transport_data;
- int err = 0;
- struct ib_qp_attr qp_attr;
-
- rdsdebug("cm %p pd %p cq %p %p qp %p\n", ic->i_cm_id,
- ic->i_pd, ic->i_send_cq, ic->i_recv_cq,
- ic->i_cm_id ? ic->i_cm_id->qp : NULL);
-
- if (ic->i_cm_id) {
- struct ib_device *dev = ic->i_cm_id->device;
-
- rdsdebug("disconnecting cm %p\n", ic->i_cm_id);
- err = rdma_disconnect(ic->i_cm_id);
- if (err) {
- /* Actually this may happen quite frequently, when
- * an outgoing connect raced with an incoming connect.
- */
- rdsdebug("failed to disconnect, cm: %p err %d\n",
- ic->i_cm_id, err);
- }
-
- if (ic->i_cm_id->qp) {
- qp_attr.qp_state = IB_QPS_ERR;
- ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE);
- }
-
- wait_event(rds_iw_ring_empty_wait,
- rds_iw_ring_empty(&ic->i_send_ring) &&
- rds_iw_ring_empty(&ic->i_recv_ring));
-
- if (ic->i_send_hdrs)
- ib_dma_free_coherent(dev,
- ic->i_send_ring.w_nr *
- sizeof(struct rds_header),
- ic->i_send_hdrs,
- ic->i_send_hdrs_dma);
-
- if (ic->i_recv_hdrs)
- ib_dma_free_coherent(dev,
- ic->i_recv_ring.w_nr *
- sizeof(struct rds_header),
- ic->i_recv_hdrs,
- ic->i_recv_hdrs_dma);
-
- if (ic->i_ack)
- ib_dma_free_coherent(dev, sizeof(struct rds_header),
- ic->i_ack, ic->i_ack_dma);
-
- if (ic->i_sends)
- rds_iw_send_clear_ring(ic);
- if (ic->i_recvs)
- rds_iw_recv_clear_ring(ic);
-
- if (ic->i_cm_id->qp)
- rdma_destroy_qp(ic->i_cm_id);
- if (ic->i_send_cq)
- ib_destroy_cq(ic->i_send_cq);
- if (ic->i_recv_cq)
- ib_destroy_cq(ic->i_recv_cq);
-
- /*
- * If associated with an rds_iw_device:
- * Move connection back to the nodev list.
- * Remove cm_id from the device cm_id list.
- */
- if (ic->rds_iwdev)
- rds_iw_remove_conn(ic->rds_iwdev, conn);
-
- rdma_destroy_id(ic->i_cm_id);
-
- ic->i_cm_id = NULL;
- ic->i_pd = NULL;
- ic->i_mr = NULL;
- ic->i_send_cq = NULL;
- ic->i_recv_cq = NULL;
- ic->i_send_hdrs = NULL;
- ic->i_recv_hdrs = NULL;
- ic->i_ack = NULL;
- }
- BUG_ON(ic->rds_iwdev);
-
- /* Clear pending transmit */
- if (ic->i_rm) {
- rds_message_put(ic->i_rm);
- ic->i_rm = NULL;
- }
-
- /* Clear the ACK state */
- clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
-#ifdef KERNEL_HAS_ATOMIC64
- atomic64_set(&ic->i_ack_next, 0);
-#else
- ic->i_ack_next = 0;
-#endif
- ic->i_ack_recv = 0;
-
- /* Clear flow control state */
- ic->i_flowctl = 0;
- atomic_set(&ic->i_credits, 0);
-
- rds_iw_ring_init(&ic->i_send_ring, rds_iw_sysctl_max_send_wr);
- rds_iw_ring_init(&ic->i_recv_ring, rds_iw_sysctl_max_recv_wr);
-
- if (ic->i_iwinc) {
- rds_inc_put(&ic->i_iwinc->ii_inc);
- ic->i_iwinc = NULL;
- }
-
- vfree(ic->i_sends);
- ic->i_sends = NULL;
- vfree(ic->i_recvs);
- ic->i_recvs = NULL;
- rdsdebug("shutdown complete\n");
-}
-
-int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp)
-{
- struct rds_iw_connection *ic;
- unsigned long flags;
-
- /* XXX too lazy? */
- ic = kzalloc(sizeof(struct rds_iw_connection), gfp);
- if (!ic)
- return -ENOMEM;
-
- INIT_LIST_HEAD(&ic->iw_node);
- tasklet_init(&ic->i_recv_tasklet, rds_iw_recv_tasklet_fn,
- (unsigned long) ic);
- mutex_init(&ic->i_recv_mutex);
-#ifndef KERNEL_HAS_ATOMIC64
- spin_lock_init(&ic->i_ack_lock);
-#endif
-
- /*
- * rds_iw_conn_shutdown() waits for these to be emptied so they
- * must be initialized before it can be called.
- */
- rds_iw_ring_init(&ic->i_send_ring, rds_iw_sysctl_max_send_wr);
- rds_iw_ring_init(&ic->i_recv_ring, rds_iw_sysctl_max_recv_wr);
-
- ic->conn = conn;
- conn->c_transport_data = ic;
-
- spin_lock_irqsave(&iw_nodev_conns_lock, flags);
- list_add_tail(&ic->iw_node, &iw_nodev_conns);
- spin_unlock_irqrestore(&iw_nodev_conns_lock, flags);
-
-
- rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data);
- return 0;
-}
-
-/*
- * Free a connection. Connection must be shut down and not set for reconnect.
- */
-void rds_iw_conn_free(void *arg)
-{
- struct rds_iw_connection *ic = arg;
- spinlock_t *lock_ptr;
-
- rdsdebug("ic %p\n", ic);
-
- /*
- * Conn is either on a dev's list or on the nodev list.
- * A race with shutdown() or connect() would cause problems
- * (since rds_iwdev would change) but that should never happen.
- */
- lock_ptr = ic->rds_iwdev ? &ic->rds_iwdev->spinlock : &iw_nodev_conns_lock;
-
- spin_lock_irq(lock_ptr);
- list_del(&ic->iw_node);
- spin_unlock_irq(lock_ptr);
-
- kfree(ic);
-}
-
-/*
- * An error occurred on the connection
- */
-void
-__rds_iw_conn_error(struct rds_connection *conn, const char *fmt, ...)
-{
- va_list ap;
-
- rds_conn_drop(conn);
-
- va_start(ap, fmt);
- vprintk(fmt, ap);
- va_end(ap);
-}
diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c
deleted file mode 100644
index b09a40c1a..000000000
--- a/net/rds/iw_rdma.c
+++ /dev/null
@@ -1,837 +0,0 @@
-/*
- * Copyright (c) 2006 Oracle. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/ratelimit.h>
-
-#include "rds.h"
-#include "iw.h"
-
-
-/*
- * This is stored as mr->r_trans_private.
- */
-struct rds_iw_mr {
- struct rds_iw_device *device;
- struct rds_iw_mr_pool *pool;
- struct rdma_cm_id *cm_id;
-
- struct ib_mr *mr;
-
- struct rds_iw_mapping mapping;
- unsigned char remap_count;
-};
-
-/*
- * Our own little MR pool
- */
-struct rds_iw_mr_pool {
- struct rds_iw_device *device; /* back ptr to the device that owns us */
-
- struct mutex flush_lock; /* serialize fmr invalidate */
- struct work_struct flush_worker; /* flush worker */
-
- spinlock_t list_lock; /* protect variables below */
- atomic_t item_count; /* total # of MRs */
- atomic_t dirty_count; /* # dirty of MRs */
- struct list_head dirty_list; /* dirty mappings */
- struct list_head clean_list; /* unused & unamapped MRs */
- atomic_t free_pinned; /* memory pinned by free MRs */
- unsigned long max_message_size; /* in pages */
- unsigned long max_items;
- unsigned long max_items_soft;
- unsigned long max_free_pinned;
- int max_pages;
-};
-
-static void rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all);
-static void rds_iw_mr_pool_flush_worker(struct work_struct *work);
-static int rds_iw_init_reg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
-static int rds_iw_map_reg(struct rds_iw_mr_pool *pool,
- struct rds_iw_mr *ibmr,
- struct scatterlist *sg, unsigned int nents);
-static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
-static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
- struct list_head *unmap_list,
- struct list_head *kill_list,
- int *unpinned);
-static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
-
-static int rds_iw_get_device(struct sockaddr_in *src, struct sockaddr_in *dst,
- struct rds_iw_device **rds_iwdev,
- struct rdma_cm_id **cm_id)
-{
- struct rds_iw_device *iwdev;
- struct rds_iw_cm_id *i_cm_id;
-
- *rds_iwdev = NULL;
- *cm_id = NULL;
-
- list_for_each_entry(iwdev, &rds_iw_devices, list) {
- spin_lock_irq(&iwdev->spinlock);
- list_for_each_entry(i_cm_id, &iwdev->cm_id_list, list) {
- struct sockaddr_in *src_addr, *dst_addr;
-
- src_addr = (struct sockaddr_in *)&i_cm_id->cm_id->route.addr.src_addr;
- dst_addr = (struct sockaddr_in *)&i_cm_id->cm_id->route.addr.dst_addr;
-
- rdsdebug("local ipaddr = %x port %d, "
- "remote ipaddr = %x port %d"
- "..looking for %x port %d, "
- "remote ipaddr = %x port %d\n",
- src_addr->sin_addr.s_addr,
- src_addr->sin_port,
- dst_addr->sin_addr.s_addr,
- dst_addr->sin_port,
- src->sin_addr.s_addr,
- src->sin_port,
- dst->sin_addr.s_addr,
- dst->sin_port);
-#ifdef WORKING_TUPLE_DETECTION
- if (src_addr->sin_addr.s_addr == src->sin_addr.s_addr &&
- src_addr->sin_port == src->sin_port &&
- dst_addr->sin_addr.s_addr == dst->sin_addr.s_addr &&
- dst_addr->sin_port == dst->sin_port) {
-#else
- /* FIXME - needs to compare the local and remote
- * ipaddr/port tuple, but the ipaddr is the only
- * available information in the rds_sock (as the rest are
- * zero'ed. It doesn't appear to be properly populated
- * during connection setup...
- */
- if (src_addr->sin_addr.s_addr == src->sin_addr.s_addr) {
-#endif
- spin_unlock_irq(&iwdev->spinlock);
- *rds_iwdev = iwdev;
- *cm_id = i_cm_id->cm_id;
- return 0;
- }
- }
- spin_unlock_irq(&iwdev->spinlock);
- }
-
- return 1;
-}
-
-static int rds_iw_add_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id)
-{
- struct rds_iw_cm_id *i_cm_id;
-
- i_cm_id = kmalloc(sizeof *i_cm_id, GFP_KERNEL);
- if (!i_cm_id)
- return -ENOMEM;
-
- i_cm_id->cm_id = cm_id;
-
- spin_lock_irq(&rds_iwdev->spinlock);
- list_add_tail(&i_cm_id->list, &rds_iwdev->cm_id_list);
- spin_unlock_irq(&rds_iwdev->spinlock);
-
- return 0;
-}
-
-static void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev,
- struct rdma_cm_id *cm_id)
-{
- struct rds_iw_cm_id *i_cm_id;
-
- spin_lock_irq(&rds_iwdev->spinlock);
- list_for_each_entry(i_cm_id, &rds_iwdev->cm_id_list, list) {
- if (i_cm_id->cm_id == cm_id) {
- list_del(&i_cm_id->list);
- kfree(i_cm_id);
- break;
- }
- }
- spin_unlock_irq(&rds_iwdev->spinlock);
-}
-
-
-int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id)
-{
- struct sockaddr_in *src_addr, *dst_addr;
- struct rds_iw_device *rds_iwdev_old;
- struct rdma_cm_id *pcm_id;
- int rc;
-
- src_addr = (struct sockaddr_in *)&cm_id->route.addr.src_addr;
- dst_addr = (struct sockaddr_in *)&cm_id->route.addr.dst_addr;
-
- rc = rds_iw_get_device(src_addr, dst_addr, &rds_iwdev_old, &pcm_id);
- if (rc)
- rds_iw_remove_cm_id(rds_iwdev, cm_id);
-
- return rds_iw_add_cm_id(rds_iwdev, cm_id);
-}
-
-void rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn)
-{
- struct rds_iw_connection *ic = conn->c_transport_data;
-
- /* conn was previously on the nodev_conns_list */
- spin_lock_irq(&iw_nodev_conns_lock);
- BUG_ON(list_empty(&iw_nodev_conns));
- BUG_ON(list_empty(&ic->iw_node));
- list_del(&ic->iw_node);
-
- spin_lock(&rds_iwdev->spinlock);
- list_add_tail(&ic->iw_node, &rds_iwdev->conn_list);
- spin_unlock(&rds_iwdev->spinlock);
- spin_unlock_irq(&iw_nodev_conns_lock);
-
- ic->rds_iwdev = rds_iwdev;
-}
-
-void rds_iw_remove_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn)
-{
- struct rds_iw_connection *ic = conn->c_transport_data;
-
- /* place conn on nodev_conns_list */
- spin_lock(&iw_nodev_conns_lock);
-
- spin_lock_irq(&rds_iwdev->spinlock);
- BUG_ON(list_empty(&ic->iw_node));
- list_del(&ic->iw_node);
- spin_unlock_irq(&rds_iwdev->spinlock);
-
- list_add_tail(&ic->iw_node, &iw_nodev_conns);
-
- spin_unlock(&iw_nodev_conns_lock);
-
- rds_iw_remove_cm_id(ic->rds_iwdev, ic->i_cm_id);
- ic->rds_iwdev = NULL;
-}
-
-void __rds_iw_destroy_conns(struct list_head *list, spinlock_t *list_lock)
-{
- struct rds_iw_connection *ic, *_ic;
- LIST_HEAD(tmp_list);
-
- /* avoid calling conn_destroy with irqs off */
- spin_lock_irq(list_lock);
- list_splice(list, &tmp_list);
- INIT_LIST_HEAD(list);
- spin_unlock_irq(list_lock);
-
- list_for_each_entry_safe(ic, _ic, &tmp_list, iw_node)
- rds_conn_destroy(ic->conn);
-}
-
-static void rds_iw_set_scatterlist(struct rds_iw_scatterlist *sg,
- struct scatterlist *list, unsigned int sg_len)
-{
- sg->list = list;
- sg->len = sg_len;
- sg->dma_len = 0;
- sg->dma_npages = 0;
- sg->bytes = 0;
-}
-
-static int rds_iw_map_scatterlist(struct rds_iw_device *rds_iwdev,
- struct rds_iw_scatterlist *sg)
-{
- struct ib_device *dev = rds_iwdev->dev;
- int i, ret;
-
- WARN_ON(sg->dma_len);
-
- sg->dma_len = ib_dma_map_sg(dev, sg->list, sg->len, DMA_BIDIRECTIONAL);
- if (unlikely(!sg->dma_len)) {
- printk(KERN_WARNING "RDS/IW: dma_map_sg failed!\n");
- return -EBUSY;
- }
-
- sg->bytes = 0;
- sg->dma_npages = 0;
-
- ret = -EINVAL;
- for (i = 0; i < sg->dma_len; ++i) {
- unsigned int dma_len = ib_sg_dma_len(dev, &sg->list[i]);
- u64 dma_addr = ib_sg_dma_address(dev, &sg->list[i]);
- u64 end_addr;
-
- sg->bytes += dma_len;
-
- end_addr = dma_addr + dma_len;
- if (dma_addr & PAGE_MASK) {
- if (i > 0)
- goto out_unmap;
- dma_addr &= ~PAGE_MASK;
- }
- if (end_addr & PAGE_MASK) {
- if (i < sg->dma_len - 1)
- goto out_unmap;
- end_addr = (end_addr + PAGE_MASK) & ~PAGE_MASK;
- }
-
- sg->dma_npages += (end_addr - dma_addr) >> PAGE_SHIFT;
- }
-
- /* Now gather the dma addrs into one list */
- if (sg->dma_npages > fastreg_message_size)
- goto out_unmap;
-
-
-
- return 0;
-
-out_unmap:
- ib_dma_unmap_sg(rds_iwdev->dev, sg->list, sg->len, DMA_BIDIRECTIONAL);
- sg->dma_len = 0;
- return ret;
-}
-
-
-struct rds_iw_mr_pool *rds_iw_create_mr_pool(struct rds_iw_device *rds_iwdev)
-{
- struct rds_iw_mr_pool *pool;
-
- pool = kzalloc(sizeof(*pool), GFP_KERNEL);
- if (!pool) {
- printk(KERN_WARNING "RDS/IW: rds_iw_create_mr_pool alloc error\n");
- return ERR_PTR(-ENOMEM);
- }
-
- pool->device = rds_iwdev;
- INIT_LIST_HEAD(&pool->dirty_list);
- INIT_LIST_HEAD(&pool->clean_list);
- mutex_init(&pool->flush_lock);
- spin_lock_init(&pool->list_lock);
- INIT_WORK(&pool->flush_worker, rds_iw_mr_pool_flush_worker);
-
- pool->max_message_size = fastreg_message_size;
- pool->max_items = fastreg_pool_size;
- pool->max_free_pinned = pool->max_items * pool->max_message_size / 4;
- pool->max_pages = fastreg_message_size;
-
- /* We never allow more than max_items MRs to be allocated.
- * When we exceed more than max_items_soft, we start freeing
- * items more aggressively.
- * Make sure that max_items > max_items_soft > max_items / 2
- */
- pool->max_items_soft = pool->max_items * 3 / 4;
-
- return pool;
-}
-
-void rds_iw_get_mr_info(struct rds_iw_device *rds_iwdev, struct rds_info_rdma_connection *iinfo)
-{
- struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool;
-
- iinfo->rdma_mr_max = pool->max_items;
- iinfo->rdma_mr_size = pool->max_pages;
-}
-
-void rds_iw_destroy_mr_pool(struct rds_iw_mr_pool *pool)
-{
- flush_workqueue(rds_wq);
- rds_iw_flush_mr_pool(pool, 1);
- BUG_ON(atomic_read(&pool->item_count));
- BUG_ON(atomic_read(&pool->free_pinned));
- kfree(pool);
-}
-
-static inline struct rds_iw_mr *rds_iw_reuse_fmr(struct rds_iw_mr_pool *pool)
-{
- struct rds_iw_mr *ibmr = NULL;
- unsigned long flags;
-
- spin_lock_irqsave(&pool->list_lock, flags);
- if (!list_empty(&pool->clean_list)) {
- ibmr = list_entry(pool->clean_list.next, struct rds_iw_mr, mapping.m_list);
- list_del_init(&ibmr->mapping.m_list);
- }
- spin_unlock_irqrestore(&pool->list_lock, flags);
-
- return ibmr;
-}
-
-static struct rds_iw_mr *rds_iw_alloc_mr(struct rds_iw_device *rds_iwdev)
-{
- struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool;
- struct rds_iw_mr *ibmr = NULL;
- int err = 0, iter = 0;
-
- while (1) {
- ibmr = rds_iw_reuse_fmr(pool);
- if (ibmr)
- return ibmr;
-
- /* No clean MRs - now we have the choice of either
- * allocating a fresh MR up to the limit imposed by the
- * driver, or flush any dirty unused MRs.
- * We try to avoid stalling in the send path if possible,
- * so we allocate as long as we're allowed to.
- *
- * We're fussy with enforcing the FMR limit, though. If the driver
- * tells us we can't use more than N fmrs, we shouldn't start
- * arguing with it */
- if (atomic_inc_return(&pool->item_count) <= pool->max_items)
- break;
-
- atomic_dec(&pool->item_count);
-
- if (++iter > 2) {
- rds_iw_stats_inc(s_iw_rdma_mr_pool_depleted);
- return ERR_PTR(-EAGAIN);
- }
-
- /* We do have some empty MRs. Flush them out. */
- rds_iw_stats_inc(s_iw_rdma_mr_pool_wait);
- rds_iw_flush_mr_pool(pool, 0);
- }
-
- ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL);
- if (!ibmr) {
- err = -ENOMEM;
- goto out_no_cigar;
- }
-
- spin_lock_init(&ibmr->mapping.m_lock);
- INIT_LIST_HEAD(&ibmr->mapping.m_list);
- ibmr->mapping.m_mr = ibmr;
-
- err = rds_iw_init_reg(pool, ibmr);
- if (err)
- goto out_no_cigar;
-
- rds_iw_stats_inc(s_iw_rdma_mr_alloc);
- return ibmr;
-
-out_no_cigar:
- if (ibmr) {
- rds_iw_destroy_fastreg(pool, ibmr);
- kfree(ibmr);
- }
- atomic_dec(&pool->item_count);
- return ERR_PTR(err);
-}
-
-void rds_iw_sync_mr(void *trans_private, int direction)
-{
- struct rds_iw_mr *ibmr = trans_private;
- struct rds_iw_device *rds_iwdev = ibmr->device;
-
- switch (direction) {
- case DMA_FROM_DEVICE:
- ib_dma_sync_sg_for_cpu(rds_iwdev->dev, ibmr->mapping.m_sg.list,
- ibmr->mapping.m_sg.dma_len, DMA_BIDIRECTIONAL);
- break;
- case DMA_TO_DEVICE:
- ib_dma_sync_sg_for_device(rds_iwdev->dev, ibmr->mapping.m_sg.list,
- ibmr->mapping.m_sg.dma_len, DMA_BIDIRECTIONAL);
- break;
- }
-}
-
-/*
- * Flush our pool of MRs.
- * At a minimum, all currently unused MRs are unmapped.
- * If the number of MRs allocated exceeds the limit, we also try
- * to free as many MRs as needed to get back to this limit.
- */
-static void rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all)
-{
- struct rds_iw_mr *ibmr, *next;
- LIST_HEAD(unmap_list);
- LIST_HEAD(kill_list);
- unsigned long flags;
- unsigned int nfreed = 0, ncleaned = 0, unpinned = 0;
-
- rds_iw_stats_inc(s_iw_rdma_mr_pool_flush);
-
- mutex_lock(&pool->flush_lock);
-
- spin_lock_irqsave(&pool->list_lock, flags);
- /* Get the list of all mappings to be destroyed */
- list_splice_init(&pool->dirty_list, &unmap_list);
- if (free_all)
- list_splice_init(&pool->clean_list, &kill_list);
- spin_unlock_irqrestore(&pool->list_lock, flags);
-
- /* Batched invalidate of dirty MRs.
- * For FMR based MRs, the mappings on the unmap list are
- * actually members of an ibmr (ibmr->mapping). They either
- * migrate to the kill_list, or have been cleaned and should be
- * moved to the clean_list.
- * For fastregs, they will be dynamically allocated, and
- * will be destroyed by the unmap function.
- */
- if (!list_empty(&unmap_list)) {
- ncleaned = rds_iw_unmap_fastreg_list(pool, &unmap_list,
- &kill_list, &unpinned);
- /* If we've been asked to destroy all MRs, move those
- * that were simply cleaned to the kill list */
- if (free_all)
- list_splice_init(&unmap_list, &kill_list);
- }
-
- /* Destroy any MRs that are past their best before date */
- list_for_each_entry_safe(ibmr, next, &kill_list, mapping.m_list) {
- rds_iw_stats_inc(s_iw_rdma_mr_free);
- list_del(&ibmr->mapping.m_list);
- rds_iw_destroy_fastreg(pool, ibmr);
- kfree(ibmr);
- nfreed++;
- }
-
- /* Anything that remains are laundered ibmrs, which we can add
- * back to the clean list. */
- if (!list_empty(&unmap_list)) {
- spin_lock_irqsave(&pool->list_lock, flags);
- list_splice(&unmap_list, &pool->clean_list);
- spin_unlock_irqrestore(&pool->list_lock, flags);
- }
-
- atomic_sub(unpinned, &pool->free_pinned);
- atomic_sub(ncleaned, &pool->dirty_count);
- atomic_sub(nfreed, &pool->item_count);
-
- mutex_unlock(&pool->flush_lock);
-}
-
-static void rds_iw_mr_pool_flush_worker(struct work_struct *work)
-{
- struct rds_iw_mr_pool *pool = container_of(work, struct rds_iw_mr_pool, flush_worker);
-
- rds_iw_flush_mr_pool(pool, 0);
-}
-
-void rds_iw_free_mr(void *trans_private, int invalidate)
-{
- struct rds_iw_mr *ibmr = trans_private;
- struct rds_iw_mr_pool *pool = ibmr->device->mr_pool;
-
- rdsdebug("RDS/IW: free_mr nents %u\n", ibmr->mapping.m_sg.len);
- if (!pool)
- return;
-
- /* Return it to the pool's free list */
- rds_iw_free_fastreg(pool, ibmr);
-
- /* If we've pinned too many pages, request a flush */
- if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned ||
- atomic_read(&pool->dirty_count) >= pool->max_items / 10)
- queue_work(rds_wq, &pool->flush_worker);
-
- if (invalidate) {
- if (likely(!in_interrupt())) {
- rds_iw_flush_mr_pool(pool, 0);
- } else {
- /* We get here if the user created a MR marked
- * as use_once and invalidate at the same time. */
- queue_work(rds_wq, &pool->flush_worker);
- }
- }
-}
-
-void rds_iw_flush_mrs(void)
-{
- struct rds_iw_device *rds_iwdev;
-
- list_for_each_entry(rds_iwdev, &rds_iw_devices, list) {
- struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool;
-
- if (pool)
- rds_iw_flush_mr_pool(pool, 0);
- }
-}
-
-void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents,
- struct rds_sock *rs, u32 *key_ret)
-{
- struct rds_iw_device *rds_iwdev;
- struct rds_iw_mr *ibmr = NULL;
- struct rdma_cm_id *cm_id;
- struct sockaddr_in src = {
- .sin_addr.s_addr = rs->rs_bound_addr,
- .sin_port = rs->rs_bound_port,
- };
- struct sockaddr_in dst = {
- .sin_addr.s_addr = rs->rs_conn_addr,
- .sin_port = rs->rs_conn_port,
- };
- int ret;
-
- ret = rds_iw_get_device(&src, &dst, &rds_iwdev, &cm_id);
- if (ret || !cm_id) {
- ret = -ENODEV;
- goto out;
- }
-
- if (!rds_iwdev->mr_pool) {
- ret = -ENODEV;
- goto out;
- }
-
- ibmr = rds_iw_alloc_mr(rds_iwdev);
- if (IS_ERR(ibmr))
- return ibmr;
-
- ibmr->cm_id = cm_id;
- ibmr->device = rds_iwdev;
-
- ret = rds_iw_map_reg(rds_iwdev->mr_pool, ibmr, sg, nents);
- if (ret == 0)
- *key_ret = ibmr->mr->rkey;
- else
- printk(KERN_WARNING "RDS/IW: failed to map mr (errno=%d)\n", ret);
-
-out:
- if (ret) {
- if (ibmr)
- rds_iw_free_mr(ibmr, 0);
- ibmr = ERR_PTR(ret);
- }
- return ibmr;
-}
-
-/*
- * iWARP reg handling
- *
- * The life cycle of a fastreg registration is a bit different from
- * FMRs.
- * The idea behind fastreg is to have one MR, to which we bind different
- * mappings over time. To avoid stalling on the expensive map and invalidate
- * operations, these operations are pipelined on the same send queue on
- * which we want to send the message containing the r_key.
- *
- * This creates a bit of a problem for us, as we do not have the destination
- * IP in GET_MR, so the connection must be setup prior to the GET_MR call for
- * RDMA to be correctly setup. If a fastreg request is present, rds_iw_xmit
- * will try to queue a LOCAL_INV (if needed) and a REG_MR work request
- * before queuing the SEND. When completions for these arrive, they are
- * dispatched to the MR has a bit set showing that RDMa can be performed.
- *
- * There is another interesting aspect that's related to invalidation.
- * The application can request that a mapping is invalidated in FREE_MR.
- * The expectation there is that this invalidation step includes ALL
- * PREVIOUSLY FREED MRs.
- */
-static int rds_iw_init_reg(struct rds_iw_mr_pool *pool,
- struct rds_iw_mr *ibmr)
-{
- struct rds_iw_device *rds_iwdev = pool->device;
- struct ib_mr *mr;
- int err;
-
- mr = ib_alloc_mr(rds_iwdev->pd, IB_MR_TYPE_MEM_REG,
- pool->max_message_size);
- if (IS_ERR(mr)) {
- err = PTR_ERR(mr);
-
- printk(KERN_WARNING "RDS/IW: ib_alloc_mr failed (err=%d)\n", err);
- return err;
- }
-
- ibmr->mr = mr;
- return 0;
-}
-
-static int rds_iw_rdma_reg_mr(struct rds_iw_mapping *mapping)
-{
- struct rds_iw_mr *ibmr = mapping->m_mr;
- struct rds_iw_scatterlist *m_sg = &mapping->m_sg;
- struct ib_reg_wr reg_wr;
- struct ib_send_wr *failed_wr;
- int ret, n;
-
- n = ib_map_mr_sg_zbva(ibmr->mr, m_sg->list, m_sg->len, PAGE_SIZE);
- if (unlikely(n != m_sg->len))
- return n < 0 ? n : -EINVAL;
-
- reg_wr.wr.next = NULL;
- reg_wr.wr.opcode = IB_WR_REG_MR;
- reg_wr.wr.wr_id = RDS_IW_REG_WR_ID;
- reg_wr.wr.num_sge = 0;
- reg_wr.mr = ibmr->mr;
- reg_wr.key = mapping->m_rkey;
- reg_wr.access = IB_ACCESS_LOCAL_WRITE |
- IB_ACCESS_REMOTE_READ |
- IB_ACCESS_REMOTE_WRITE;
-
- /*
- * Perform a WR for the reg_mr. Each individual page
- * in the sg list is added to the fast reg page list and placed
- * inside the reg_mr WR. The key used is a rolling 8bit
- * counter, which should guarantee uniqueness.
- */
- ib_update_fast_reg_key(ibmr->mr, ibmr->remap_count++);
- mapping->m_rkey = ibmr->mr->rkey;
-
- failed_wr = &reg_wr.wr;
- ret = ib_post_send(ibmr->cm_id->qp, &reg_wr.wr, &failed_wr);
- BUG_ON(failed_wr != &reg_wr.wr);
- if (ret)
- printk_ratelimited(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n",
- __func__, __LINE__, ret);
- return ret;
-}
-
-static int rds_iw_rdma_fastreg_inv(struct rds_iw_mr *ibmr)
-{
- struct ib_send_wr s_wr, *failed_wr;
- int ret = 0;
-
- if (!ibmr->cm_id->qp || !ibmr->mr)
- goto out;
-
- memset(&s_wr, 0, sizeof(s_wr));
- s_wr.wr_id = RDS_IW_LOCAL_INV_WR_ID;
- s_wr.opcode = IB_WR_LOCAL_INV;
- s_wr.ex.invalidate_rkey = ibmr->mr->rkey;
- s_wr.send_flags = IB_SEND_SIGNALED;
-
- failed_wr = &s_wr;
- ret = ib_post_send(ibmr->cm_id->qp, &s_wr, &failed_wr);
- if (ret) {
- printk_ratelimited(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n",
- __func__, __LINE__, ret);
- goto out;
- }
-out:
- return ret;
-}
-
-static int rds_iw_map_reg(struct rds_iw_mr_pool *pool,
- struct rds_iw_mr *ibmr,
- struct scatterlist *sg,
- unsigned int sg_len)
-{
- struct rds_iw_device *rds_iwdev = pool->device;
- struct rds_iw_mapping *mapping = &ibmr->mapping;
- u64 *dma_pages;
- int ret = 0;
-
- rds_iw_set_scatterlist(&mapping->m_sg, sg, sg_len);
-
- ret = rds_iw_map_scatterlist(rds_iwdev, &mapping->m_sg);
- if (ret) {
- dma_pages = NULL;
- goto out;
- }
-
- if (mapping->m_sg.dma_len > pool->max_message_size) {
- ret = -EMSGSIZE;
- goto out;
- }
-
- ret = rds_iw_rdma_reg_mr(mapping);
- if (ret)
- goto out;
-
- rds_iw_stats_inc(s_iw_rdma_mr_used);
-
-out:
- kfree(dma_pages);
-
- return ret;
-}
-
-/*
- * "Free" a fastreg MR.
- */
-static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool,
- struct rds_iw_mr *ibmr)
-{
- unsigned long flags;
- int ret;
-
- if (!ibmr->mapping.m_sg.dma_len)
- return;
-
- ret = rds_iw_rdma_fastreg_inv(ibmr);
- if (ret)
- return;
-
- /* Try to post the LOCAL_INV WR to the queue. */
- spin_lock_irqsave(&pool->list_lock, flags);
-
- list_add_tail(&ibmr->mapping.m_list, &pool->dirty_list);
- atomic_add(ibmr->mapping.m_sg.len, &pool->free_pinned);
- atomic_inc(&pool->dirty_count);
-
- spin_unlock_irqrestore(&pool->list_lock, flags);
-}
-
-static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
- struct list_head *unmap_list,
- struct list_head *kill_list,
- int *unpinned)
-{
- struct rds_iw_mapping *mapping, *next;
- unsigned int ncleaned = 0;
- LIST_HEAD(laundered);
-
- /* Batched invalidation of fastreg MRs.
- * Why do we do it this way, even though we could pipeline unmap
- * and remap? The reason is the application semantics - when the
- * application requests an invalidation of MRs, it expects all
- * previously released R_Keys to become invalid.
- *
- * If we implement MR reuse naively, we risk memory corruption
- * (this has actually been observed). So the default behavior
- * requires that a MR goes through an explicit unmap operation before
- * we can reuse it again.
- *
- * We could probably improve on this a little, by allowing immediate
- * reuse of a MR on the same socket (eg you could add small
- * cache of unused MRs to strct rds_socket - GET_MR could grab one
- * of these without requiring an explicit invalidate).
- */
- while (!list_empty(unmap_list)) {
- unsigned long flags;
-
- spin_lock_irqsave(&pool->list_lock, flags);
- list_for_each_entry_safe(mapping, next, unmap_list, m_list) {
- *unpinned += mapping->m_sg.len;
- list_move(&mapping->m_list, &laundered);
- ncleaned++;
- }
- spin_unlock_irqrestore(&pool->list_lock, flags);
- }
-
- /* Move all laundered mappings back to the unmap list.
- * We do not kill any WRs right now - it doesn't seem the
- * fastreg API has a max_remap limit. */
- list_splice_init(&laundered, unmap_list);
-
- return ncleaned;
-}
-
-static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool,
- struct rds_iw_mr *ibmr)
-{
- if (ibmr->mr)
- ib_dereg_mr(ibmr->mr);
-}
diff --git a/net/rds/iw_recv.c b/net/rds/iw_recv.c
deleted file mode 100644
index a66d1794b..000000000
--- a/net/rds/iw_recv.c
+++ /dev/null
@@ -1,904 +0,0 @@
-/*
- * Copyright (c) 2006 Oracle. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/pci.h>
-#include <linux/dma-mapping.h>
-#include <rdma/rdma_cm.h>
-
-#include "rds.h"
-#include "iw.h"
-
-static struct kmem_cache *rds_iw_incoming_slab;
-static struct kmem_cache *rds_iw_frag_slab;
-static atomic_t rds_iw_allocation = ATOMIC_INIT(0);
-
-static void rds_iw_frag_drop_page(struct rds_page_frag *frag)
-{
- rdsdebug("frag %p page %p\n", frag, frag->f_page);
- __free_page(frag->f_page);
- frag->f_page = NULL;
-}
-
-static void rds_iw_frag_free(struct rds_page_frag *frag)
-{
- rdsdebug("frag %p page %p\n", frag, frag->f_page);
- BUG_ON(frag->f_page);
- kmem_cache_free(rds_iw_frag_slab, frag);
-}
-
-/*
- * We map a page at a time. Its fragments are posted in order. This
- * is called in fragment order as the fragments get send completion events.
- * Only the last frag in the page performs the unmapping.
- *
- * It's OK for ring cleanup to call this in whatever order it likes because
- * DMA is not in flight and so we can unmap while other ring entries still
- * hold page references in their frags.
- */
-static void rds_iw_recv_unmap_page(struct rds_iw_connection *ic,
- struct rds_iw_recv_work *recv)
-{
- struct rds_page_frag *frag = recv->r_frag;
-
- rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page);
- if (frag->f_mapped)
- ib_dma_unmap_page(ic->i_cm_id->device,
- frag->f_mapped,
- RDS_FRAG_SIZE, DMA_FROM_DEVICE);
- frag->f_mapped = 0;
-}
-
-void rds_iw_recv_init_ring(struct rds_iw_connection *ic)
-{
- struct rds_iw_recv_work *recv;
- u32 i;
-
- for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) {
- struct ib_sge *sge;
-
- recv->r_iwinc = NULL;
- recv->r_frag = NULL;
-
- recv->r_wr.next = NULL;
- recv->r_wr.wr_id = i;
- recv->r_wr.sg_list = recv->r_sge;
- recv->r_wr.num_sge = RDS_IW_RECV_SGE;
-
- sge = rds_iw_data_sge(ic, recv->r_sge);
- sge->addr = 0;
- sge->length = RDS_FRAG_SIZE;
- sge->lkey = 0;
-
- sge = rds_iw_header_sge(ic, recv->r_sge);
- sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
- sge->length = sizeof(struct rds_header);
- sge->lkey = 0;
- }
-}
-
-static void rds_iw_recv_clear_one(struct rds_iw_connection *ic,
- struct rds_iw_recv_work *recv)
-{
- if (recv->r_iwinc) {
- rds_inc_put(&recv->r_iwinc->ii_inc);
- recv->r_iwinc = NULL;
- }
- if (recv->r_frag) {
- rds_iw_recv_unmap_page(ic, recv);
- if (recv->r_frag->f_page)
- rds_iw_frag_drop_page(recv->r_frag);
- rds_iw_frag_free(recv->r_frag);
- recv->r_frag = NULL;
- }
-}
-
-void rds_iw_recv_clear_ring(struct rds_iw_connection *ic)
-{
- u32 i;
-
- for (i = 0; i < ic->i_recv_ring.w_nr; i++)
- rds_iw_recv_clear_one(ic, &ic->i_recvs[i]);
-
- if (ic->i_frag.f_page)
- rds_iw_frag_drop_page(&ic->i_frag);
-}
-
-static int rds_iw_recv_refill_one(struct rds_connection *conn,
- struct rds_iw_recv_work *recv,
- gfp_t kptr_gfp, gfp_t page_gfp)
-{
- struct rds_iw_connection *ic = conn->c_transport_data;
- dma_addr_t dma_addr;
- struct ib_sge *sge;
- int ret = -ENOMEM;
-
- if (!recv->r_iwinc) {
- if (!atomic_add_unless(&rds_iw_allocation, 1, rds_iw_sysctl_max_recv_allocation)) {
- rds_iw_stats_inc(s_iw_rx_alloc_limit);
- goto out;
- }
- recv->r_iwinc = kmem_cache_alloc(rds_iw_incoming_slab,
- kptr_gfp);
- if (!recv->r_iwinc) {
- atomic_dec(&rds_iw_allocation);
- goto out;
- }
- INIT_LIST_HEAD(&recv->r_iwinc->ii_frags);
- rds_inc_init(&recv->r_iwinc->ii_inc, conn, conn->c_faddr);
- }
-
- if (!recv->r_frag) {
- recv->r_frag = kmem_cache_alloc(rds_iw_frag_slab, kptr_gfp);
- if (!recv->r_frag)
- goto out;
- INIT_LIST_HEAD(&recv->r_frag->f_item);
- recv->r_frag->f_page = NULL;
- }
-
- if (!ic->i_frag.f_page) {
- ic->i_frag.f_page = alloc_page(page_gfp);
- if (!ic->i_frag.f_page)
- goto out;
- ic->i_frag.f_offset = 0;
- }
-
- dma_addr = ib_dma_map_page(ic->i_cm_id->device,
- ic->i_frag.f_page,
- ic->i_frag.f_offset,
- RDS_FRAG_SIZE,
- DMA_FROM_DEVICE);
- if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr))
- goto out;
-
- /*
- * Once we get the RDS_PAGE_LAST_OFF frag then rds_iw_frag_unmap()
- * must be called on this recv. This happens as completions hit
- * in order or on connection shutdown.
- */
- recv->r_frag->f_page = ic->i_frag.f_page;
- recv->r_frag->f_offset = ic->i_frag.f_offset;
- recv->r_frag->f_mapped = dma_addr;
-
- sge = rds_iw_data_sge(ic, recv->r_sge);
- sge->addr = dma_addr;
- sge->length = RDS_FRAG_SIZE;
-
- sge = rds_iw_header_sge(ic, recv->r_sge);
- sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header);
- sge->length = sizeof(struct rds_header);
-
- get_page(recv->r_frag->f_page);
-
- if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) {
- ic->i_frag.f_offset += RDS_FRAG_SIZE;
- } else {
- put_page(ic->i_frag.f_page);
- ic->i_frag.f_page = NULL;
- ic->i_frag.f_offset = 0;
- }
-
- ret = 0;
-out:
- return ret;
-}
-
-/*
- * This tries to allocate and post unused work requests after making sure that
- * they have all the allocations they need to queue received fragments into
- * sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc
- * pairs don't go unmatched.
- *
- * -1 is returned if posting fails due to temporary resource exhaustion.
- */
-int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
- gfp_t page_gfp, int prefill)
-{
- struct rds_iw_connection *ic = conn->c_transport_data;
- struct rds_iw_recv_work *recv;
- struct ib_recv_wr *failed_wr;
- unsigned int posted = 0;
- int ret = 0;
- u32 pos;
-
- while ((prefill || rds_conn_up(conn)) &&
- rds_iw_ring_alloc(&ic->i_recv_ring, 1, &pos)) {
- if (pos >= ic->i_recv_ring.w_nr) {
- printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n",
- pos);
- ret = -EINVAL;
- break;
- }
-
- recv = &ic->i_recvs[pos];
- ret = rds_iw_recv_refill_one(conn, recv, kptr_gfp, page_gfp);
- if (ret) {
- ret = -1;
- break;
- }
-
- /* XXX when can this fail? */
- ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr);
- rdsdebug("recv %p iwinc %p page %p addr %lu ret %d\n", recv,
- recv->r_iwinc, recv->r_frag->f_page,
- (long) recv->r_frag->f_mapped, ret);
- if (ret) {
- rds_iw_conn_error(conn, "recv post on "
- "%pI4 returned %d, disconnecting and "
- "reconnecting\n", &conn->c_faddr,
- ret);
- ret = -1;
- break;
- }
-
- posted++;
- }
-
- /* We're doing flow control - update the window. */
- if (ic->i_flowctl && posted)
- rds_iw_advertise_credits(conn, posted);
-
- if (ret)
- rds_iw_ring_unalloc(&ic->i_recv_ring, 1);
- return ret;
-}
-
-static void rds_iw_inc_purge(struct rds_incoming *inc)
-{
- struct rds_iw_incoming *iwinc;
- struct rds_page_frag *frag;
- struct rds_page_frag *pos;
-
- iwinc = container_of(inc, struct rds_iw_incoming, ii_inc);
- rdsdebug("purging iwinc %p inc %p\n", iwinc, inc);
-
- list_for_each_entry_safe(frag, pos, &iwinc->ii_frags, f_item) {
- list_del_init(&frag->f_item);
- rds_iw_frag_drop_page(frag);
- rds_iw_frag_free(frag);
- }
-}
-
-void rds_iw_inc_free(struct rds_incoming *inc)
-{
- struct rds_iw_incoming *iwinc;
-
- iwinc = container_of(inc, struct rds_iw_incoming, ii_inc);
-
- rds_iw_inc_purge(inc);
- rdsdebug("freeing iwinc %p inc %p\n", iwinc, inc);
- BUG_ON(!list_empty(&iwinc->ii_frags));
- kmem_cache_free(rds_iw_incoming_slab, iwinc);
- atomic_dec(&rds_iw_allocation);
- BUG_ON(atomic_read(&rds_iw_allocation) < 0);
-}
-
-int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to)
-{
- struct rds_iw_incoming *iwinc;
- struct rds_page_frag *frag;
- unsigned long to_copy;
- unsigned long frag_off = 0;
- int copied = 0;
- int ret;
- u32 len;
-
- iwinc = container_of(inc, struct rds_iw_incoming, ii_inc);
- frag = list_entry(iwinc->ii_frags.next, struct rds_page_frag, f_item);
- len = be32_to_cpu(inc->i_hdr.h_len);
-
- while (iov_iter_count(to) && copied < len) {
- if (frag_off == RDS_FRAG_SIZE) {
- frag = list_entry(frag->f_item.next,
- struct rds_page_frag, f_item);
- frag_off = 0;
- }
- to_copy = min_t(unsigned long, iov_iter_count(to),
- RDS_FRAG_SIZE - frag_off);
- to_copy = min_t(unsigned long, to_copy, len - copied);
-
- /* XXX needs + offset for multiple recvs per page */
- rds_stats_add(s_copy_to_user, to_copy);
- ret = copy_page_to_iter(frag->f_page,
- frag->f_offset + frag_off,
- to_copy,
- to);
- if (ret != to_copy)
- return -EFAULT;
-
- frag_off += to_copy;
- copied += to_copy;
- }
-
- return copied;
-}
-
-/* ic starts out kzalloc()ed */
-void rds_iw_recv_init_ack(struct rds_iw_connection *ic)
-{
- struct ib_send_wr *wr = &ic->i_ack_wr;
- struct ib_sge *sge = &ic->i_ack_sge;
-
- sge->addr = ic->i_ack_dma;
- sge->length = sizeof(struct rds_header);
- sge->lkey = rds_iw_local_dma_lkey(ic);
-
- wr->sg_list = sge;
- wr->num_sge = 1;
- wr->opcode = IB_WR_SEND;
- wr->wr_id = RDS_IW_ACK_WR_ID;
- wr->send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED;
-}
-
-/*
- * You'd think that with reliable IB connections you wouldn't need to ack
- * messages that have been received. The problem is that IB hardware generates
- * an ack message before it has DMAed the message into memory. This creates a
- * potential message loss if the HCA is disabled for any reason between when it
- * sends the ack and before the message is DMAed and processed. This is only a
- * potential issue if another HCA is available for fail-over.
- *
- * When the remote host receives our ack they'll free the sent message from
- * their send queue. To decrease the latency of this we always send an ack
- * immediately after we've received messages.
- *
- * For simplicity, we only have one ack in flight at a time. This puts
- * pressure on senders to have deep enough send queues to absorb the latency of
- * a single ack frame being in flight. This might not be good enough.
- *
- * This is implemented by have a long-lived send_wr and sge which point to a
- * statically allocated ack frame. This ack wr does not fall under the ring
- * accounting that the tx and rx wrs do. The QP attribute specifically makes
- * room for it beyond the ring size. Send completion notices its special
- * wr_id and avoids working with the ring in that case.
- */
-#ifndef KERNEL_HAS_ATOMIC64
-static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq,
- int ack_required)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&ic->i_ack_lock, flags);
- ic->i_ack_next = seq;
- if (ack_required)
- set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
- spin_unlock_irqrestore(&ic->i_ack_lock, flags);
-}
-
-static u64 rds_iw_get_ack(struct rds_iw_connection *ic)
-{
- unsigned long flags;
- u64 seq;
-
- clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
-
- spin_lock_irqsave(&ic->i_ack_lock, flags);
- seq = ic->i_ack_next;
- spin_unlock_irqrestore(&ic->i_ack_lock, flags);
-
- return seq;
-}
-#else
-static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq,
- int ack_required)
-{
- atomic64_set(&ic->i_ack_next, seq);
- if (ack_required) {
- smp_mb__before_atomic();
- set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
- }
-}
-
-static u64 rds_iw_get_ack(struct rds_iw_connection *ic)
-{
- clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
- smp_mb__after_atomic();
-
- return atomic64_read(&ic->i_ack_next);
-}
-#endif
-
-
-static void rds_iw_send_ack(struct rds_iw_connection *ic, unsigned int adv_credits)
-{
- struct rds_header *hdr = ic->i_ack;
- struct ib_send_wr *failed_wr;
- u64 seq;
- int ret;
-
- seq = rds_iw_get_ack(ic);
-
- rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq);
- rds_message_populate_header(hdr, 0, 0, 0);
- hdr->h_ack = cpu_to_be64(seq);
- hdr->h_credit = adv_credits;
- rds_message_make_checksum(hdr);
- ic->i_ack_queued = jiffies;
-
- ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, &failed_wr);
- if (unlikely(ret)) {
- /* Failed to send. Release the WR, and
- * force another ACK.
- */
- clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
- set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
-
- rds_iw_stats_inc(s_iw_ack_send_failure);
-
- rds_iw_conn_error(ic->conn, "sending ack failed\n");
- } else
- rds_iw_stats_inc(s_iw_ack_sent);
-}
-
-/*
- * There are 3 ways of getting acknowledgements to the peer:
- * 1. We call rds_iw_attempt_ack from the recv completion handler
- * to send an ACK-only frame.
- * However, there can be only one such frame in the send queue
- * at any time, so we may have to postpone it.
- * 2. When another (data) packet is transmitted while there's
- * an ACK in the queue, we piggyback the ACK sequence number
- * on the data packet.
- * 3. If the ACK WR is done sending, we get called from the
- * send queue completion handler, and check whether there's
- * another ACK pending (postponed because the WR was on the
- * queue). If so, we transmit it.
- *
- * We maintain 2 variables:
- * - i_ack_flags, which keeps track of whether the ACK WR
- * is currently in the send queue or not (IB_ACK_IN_FLIGHT)
- * - i_ack_next, which is the last sequence number we received
- *
- * Potentially, send queue and receive queue handlers can run concurrently.
- * It would be nice to not have to use a spinlock to synchronize things,
- * but the one problem that rules this out is that 64bit updates are
- * not atomic on all platforms. Things would be a lot simpler if
- * we had atomic64 or maybe cmpxchg64 everywhere.
- *
- * Reconnecting complicates this picture just slightly. When we
- * reconnect, we may be seeing duplicate packets. The peer
- * is retransmitting them, because it hasn't seen an ACK for
- * them. It is important that we ACK these.
- *
- * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with
- * this flag set *MUST* be acknowledged immediately.
- */
-
-/*
- * When we get here, we're called from the recv queue handler.
- * Check whether we ought to transmit an ACK.
- */
-void rds_iw_attempt_ack(struct rds_iw_connection *ic)
-{
- unsigned int adv_credits;
-
- if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
- return;
-
- if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) {
- rds_iw_stats_inc(s_iw_ack_send_delayed);
- return;
- }
-
- /* Can we get a send credit? */
- if (!rds_iw_send_grab_credits(ic, 1, &adv_credits, 0, RDS_MAX_ADV_CREDIT)) {
- rds_iw_stats_inc(s_iw_tx_throttle);
- clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
- return;
- }
-
- clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
- rds_iw_send_ack(ic, adv_credits);
-}
-
-/*
- * We get here from the send completion handler, when the
- * adapter tells us the ACK frame was sent.
- */
-void rds_iw_ack_send_complete(struct rds_iw_connection *ic)
-{
- clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
- rds_iw_attempt_ack(ic);
-}
-
-/*
- * This is called by the regular xmit code when it wants to piggyback
- * an ACK on an outgoing frame.
- */
-u64 rds_iw_piggyb_ack(struct rds_iw_connection *ic)
-{
- if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
- rds_iw_stats_inc(s_iw_ack_send_piggybacked);
- return rds_iw_get_ack(ic);
-}
-
-/*
- * It's kind of lame that we're copying from the posted receive pages into
- * long-lived bitmaps. We could have posted the bitmaps and rdma written into
- * them. But receiving new congestion bitmaps should be a *rare* event, so
- * hopefully we won't need to invest that complexity in making it more
- * efficient. By copying we can share a simpler core with TCP which has to
- * copy.
- */
-static void rds_iw_cong_recv(struct rds_connection *conn,
- struct rds_iw_incoming *iwinc)
-{
- struct rds_cong_map *map;
- unsigned int map_off;
- unsigned int map_page;
- struct rds_page_frag *frag;
- unsigned long frag_off;
- unsigned long to_copy;
- unsigned long copied;
- uint64_t uncongested = 0;
- void *addr;
-
- /* catch completely corrupt packets */
- if (be32_to_cpu(iwinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
- return;
-
- map = conn->c_fcong;
- map_page = 0;
- map_off = 0;
-
- frag = list_entry(iwinc->ii_frags.next, struct rds_page_frag, f_item);
- frag_off = 0;
-
- copied = 0;
-
- while (copied < RDS_CONG_MAP_BYTES) {
- uint64_t *src, *dst;
- unsigned int k;
-
- to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
- BUG_ON(to_copy & 7); /* Must be 64bit aligned. */
-
- addr = kmap_atomic(frag->f_page);
-
- src = addr + frag_off;
- dst = (void *)map->m_page_addrs[map_page] + map_off;
- for (k = 0; k < to_copy; k += 8) {
- /* Record ports that became uncongested, ie
- * bits that changed from 0 to 1. */
- uncongested |= ~(*src) & *dst;
- *dst++ = *src++;
- }
- kunmap_atomic(addr);
-
- copied += to_copy;
-
- map_off += to_copy;
- if (map_off == PAGE_SIZE) {
- map_off = 0;
- map_page++;
- }
-
- frag_off += to_copy;
- if (frag_off == RDS_FRAG_SIZE) {
- frag = list_entry(frag->f_item.next,
- struct rds_page_frag, f_item);
- frag_off = 0;
- }
- }
-
- /* the congestion map is in little endian order */
- uncongested = le64_to_cpu(uncongested);
-
- rds_cong_map_updated(map, uncongested);
-}
-
-/*
- * Rings are posted with all the allocations they'll need to queue the
- * incoming message to the receiving socket so this can't fail.
- * All fragments start with a header, so we can make sure we're not receiving
- * garbage, and we can tell a small 8 byte fragment from an ACK frame.
- */
-struct rds_iw_ack_state {
- u64 ack_next;
- u64 ack_recv;
- unsigned int ack_required:1;
- unsigned int ack_next_valid:1;
- unsigned int ack_recv_valid:1;
-};
-
-static void rds_iw_process_recv(struct rds_connection *conn,
- struct rds_iw_recv_work *recv, u32 byte_len,
- struct rds_iw_ack_state *state)
-{
- struct rds_iw_connection *ic = conn->c_transport_data;
- struct rds_iw_incoming *iwinc = ic->i_iwinc;
- struct rds_header *ihdr, *hdr;
-
- /* XXX shut down the connection if port 0,0 are seen? */
-
- rdsdebug("ic %p iwinc %p recv %p byte len %u\n", ic, iwinc, recv,
- byte_len);
-
- if (byte_len < sizeof(struct rds_header)) {
- rds_iw_conn_error(conn, "incoming message "
- "from %pI4 didn't include a "
- "header, disconnecting and "
- "reconnecting\n",
- &conn->c_faddr);
- return;
- }
- byte_len -= sizeof(struct rds_header);
-
- ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];
-
- /* Validate the checksum. */
- if (!rds_message_verify_checksum(ihdr)) {
- rds_iw_conn_error(conn, "incoming message "
- "from %pI4 has corrupted header - "
- "forcing a reconnect\n",
- &conn->c_faddr);
- rds_stats_inc(s_recv_drop_bad_checksum);
- return;
- }
-
- /* Process the ACK sequence which comes with every packet */
- state->ack_recv = be64_to_cpu(ihdr->h_ack);
- state->ack_recv_valid = 1;
-
- /* Process the credits update if there was one */
- if (ihdr->h_credit)
- rds_iw_send_add_credits(conn, ihdr->h_credit);
-
- if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && byte_len == 0) {
- /* This is an ACK-only packet. The fact that it gets
- * special treatment here is that historically, ACKs
- * were rather special beasts.
- */
- rds_iw_stats_inc(s_iw_ack_received);
-
- /*
- * Usually the frags make their way on to incs and are then freed as
- * the inc is freed. We don't go that route, so we have to drop the
- * page ref ourselves. We can't just leave the page on the recv
- * because that confuses the dma mapping of pages and each recv's use
- * of a partial page. We can leave the frag, though, it will be
- * reused.
- *
- * FIXME: Fold this into the code path below.
- */
- rds_iw_frag_drop_page(recv->r_frag);
- return;
- }
-
- /*
- * If we don't already have an inc on the connection then this
- * fragment has a header and starts a message.. copy its header
- * into the inc and save the inc so we can hang upcoming fragments
- * off its list.
- */
- if (!iwinc) {
- iwinc = recv->r_iwinc;
- recv->r_iwinc = NULL;
- ic->i_iwinc = iwinc;
-
- hdr = &iwinc->ii_inc.i_hdr;
- memcpy(hdr, ihdr, sizeof(*hdr));
- ic->i_recv_data_rem = be32_to_cpu(hdr->h_len);
-
- rdsdebug("ic %p iwinc %p rem %u flag 0x%x\n", ic, iwinc,
- ic->i_recv_data_rem, hdr->h_flags);
- } else {
- hdr = &iwinc->ii_inc.i_hdr;
- /* We can't just use memcmp here; fragments of a
- * single message may carry different ACKs */
- if (hdr->h_sequence != ihdr->h_sequence ||
- hdr->h_len != ihdr->h_len ||
- hdr->h_sport != ihdr->h_sport ||
- hdr->h_dport != ihdr->h_dport) {
- rds_iw_conn_error(conn,
- "fragment header mismatch; forcing reconnect\n");
- return;
- }
- }
-
- list_add_tail(&recv->r_frag->f_item, &iwinc->ii_frags);
- recv->r_frag = NULL;
-
- if (ic->i_recv_data_rem > RDS_FRAG_SIZE)
- ic->i_recv_data_rem -= RDS_FRAG_SIZE;
- else {
- ic->i_recv_data_rem = 0;
- ic->i_iwinc = NULL;
-
- if (iwinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
- rds_iw_cong_recv(conn, iwinc);
- else {
- rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr,
- &iwinc->ii_inc, GFP_ATOMIC);
- state->ack_next = be64_to_cpu(hdr->h_sequence);
- state->ack_next_valid = 1;
- }
-
- /* Evaluate the ACK_REQUIRED flag *after* we received
- * the complete frame, and after bumping the next_rx
- * sequence. */
- if (hdr->h_flags & RDS_FLAG_ACK_REQUIRED) {
- rds_stats_inc(s_recv_ack_required);
- state->ack_required = 1;
- }
-
- rds_inc_put(&iwinc->ii_inc);
- }
-}
-
-/*
- * Plucking the oldest entry from the ring can be done concurrently with
- * the thread refilling the ring. Each ring operation is protected by
- * spinlocks and the transient state of refilling doesn't change the
- * recording of which entry is oldest.
- *
- * This relies on IB only calling one cq comp_handler for each cq so that
- * there will only be one caller of rds_recv_incoming() per RDS connection.
- */
-void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context)
-{
- struct rds_connection *conn = context;
- struct rds_iw_connection *ic = conn->c_transport_data;
-
- rdsdebug("conn %p cq %p\n", conn, cq);
-
- rds_iw_stats_inc(s_iw_rx_cq_call);
-
- tasklet_schedule(&ic->i_recv_tasklet);
-}
-
-static inline void rds_poll_cq(struct rds_iw_connection *ic,
- struct rds_iw_ack_state *state)
-{
- struct rds_connection *conn = ic->conn;
- struct ib_wc wc;
- struct rds_iw_recv_work *recv;
-
- while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) {
- rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
- (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
- be32_to_cpu(wc.ex.imm_data));
- rds_iw_stats_inc(s_iw_rx_cq_event);
-
- recv = &ic->i_recvs[rds_iw_ring_oldest(&ic->i_recv_ring)];
-
- rds_iw_recv_unmap_page(ic, recv);
-
- /*
- * Also process recvs in connecting state because it is possible
- * to get a recv completion _before_ the rdmacm ESTABLISHED
- * event is processed.
- */
- if (rds_conn_up(conn) || rds_conn_connecting(conn)) {
- /* We expect errors as the qp is drained during shutdown */
- if (wc.status == IB_WC_SUCCESS) {
- rds_iw_process_recv(conn, recv, wc.byte_len, state);
- } else {
- rds_iw_conn_error(conn, "recv completion on "
- "%pI4 had status %u, disconnecting and "
- "reconnecting\n", &conn->c_faddr,
- wc.status);
- }
- }
-
- rds_iw_ring_free(&ic->i_recv_ring, 1);
- }
-}
-
-void rds_iw_recv_tasklet_fn(unsigned long data)
-{
- struct rds_iw_connection *ic = (struct rds_iw_connection *) data;
- struct rds_connection *conn = ic->conn;
- struct rds_iw_ack_state state = { 0, };
-
- rds_poll_cq(ic, &state);
- ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
- rds_poll_cq(ic, &state);
-
- if (state.ack_next_valid)
- rds_iw_set_ack(ic, state.ack_next, state.ack_required);
- if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) {
- rds_send_drop_acked(conn, state.ack_recv, NULL);
- ic->i_ack_recv = state.ack_recv;
- }
- if (rds_conn_up(conn))
- rds_iw_attempt_ack(ic);
-
- /* If we ever end up with a really empty receive ring, we're
- * in deep trouble, as the sender will definitely see RNR
- * timeouts. */
- if (rds_iw_ring_empty(&ic->i_recv_ring))
- rds_iw_stats_inc(s_iw_rx_ring_empty);
-
- /*
- * If the ring is running low, then schedule the thread to refill.
- */
- if (rds_iw_ring_low(&ic->i_recv_ring))
- queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
-}
-
-int rds_iw_recv(struct rds_connection *conn)
-{
- struct rds_iw_connection *ic = conn->c_transport_data;
- int ret = 0;
-
- rdsdebug("conn %p\n", conn);
-
- /*
- * If we get a temporary posting failure in this context then
- * we're really low and we want the caller to back off for a bit.
- */
- mutex_lock(&ic->i_recv_mutex);
- if (rds_iw_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0))
- ret = -ENOMEM;
- else
- rds_iw_stats_inc(s_iw_rx_refill_from_thread);
- mutex_unlock(&ic->i_recv_mutex);
-
- if (rds_conn_up(conn))
- rds_iw_attempt_ack(ic);
-
- return ret;
-}
-
-int rds_iw_recv_init(void)
-{
- struct sysinfo si;
- int ret = -ENOMEM;
-
- /* Default to 30% of all available RAM for recv memory */
- si_meminfo(&si);
- rds_iw_sysctl_max_recv_allocation = si.totalram / 3 * PAGE_SIZE / RDS_FRAG_SIZE;
-
- rds_iw_incoming_slab = kmem_cache_create("rds_iw_incoming",
- sizeof(struct rds_iw_incoming),
- 0, 0, NULL);
- if (!rds_iw_incoming_slab)
- goto out;
-
- rds_iw_frag_slab = kmem_cache_create("rds_iw_frag",
- sizeof(struct rds_page_frag),
- 0, 0, NULL);
- if (!rds_iw_frag_slab)
- kmem_cache_destroy(rds_iw_incoming_slab);
- else
- ret = 0;
-out:
- return ret;
-}
-
-void rds_iw_recv_exit(void)
-{
- kmem_cache_destroy(rds_iw_incoming_slab);
- kmem_cache_destroy(rds_iw_frag_slab);
-}
diff --git a/net/rds/iw_ring.c b/net/rds/iw_ring.c
deleted file mode 100644
index da8e3b63f..000000000
--- a/net/rds/iw_ring.c
+++ /dev/null
@@ -1,169 +0,0 @@
-/*
- * Copyright (c) 2006 Oracle. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/kernel.h>
-
-#include "rds.h"
-#include "iw.h"
-
-/*
- * Locking for IB rings.
- * We assume that allocation is always protected by a mutex
- * in the caller (this is a valid assumption for the current
- * implementation).
- *
- * Freeing always happens in an interrupt, and hence only
- * races with allocations, but not with other free()s.
- *
- * The interaction between allocation and freeing is that
- * the alloc code has to determine the number of free entries.
- * To this end, we maintain two counters; an allocation counter
- * and a free counter. Both are allowed to run freely, and wrap
- * around.
- * The number of used entries is always (alloc_ctr - free_ctr) % NR.
- *
- * The current implementation makes free_ctr atomic. When the
- * caller finds an allocation fails, it should set an "alloc fail"
- * bit and retry the allocation. The "alloc fail" bit essentially tells
- * the CQ completion handlers to wake it up after freeing some
- * more entries.
- */
-
-/*
- * This only happens on shutdown.
- */
-DECLARE_WAIT_QUEUE_HEAD(rds_iw_ring_empty_wait);
-
-void rds_iw_ring_init(struct rds_iw_work_ring *ring, u32 nr)
-{
- memset(ring, 0, sizeof(*ring));
- ring->w_nr = nr;
- rdsdebug("ring %p nr %u\n", ring, ring->w_nr);
-}
-
-static inline u32 __rds_iw_ring_used(struct rds_iw_work_ring *ring)
-{
- u32 diff;
-
- /* This assumes that atomic_t has at least as many bits as u32 */
- diff = ring->w_alloc_ctr - (u32) atomic_read(&ring->w_free_ctr);
- BUG_ON(diff > ring->w_nr);
-
- return diff;
-}
-
-void rds_iw_ring_resize(struct rds_iw_work_ring *ring, u32 nr)
-{
- /* We only ever get called from the connection setup code,
- * prior to creating the QP. */
- BUG_ON(__rds_iw_ring_used(ring));
- ring->w_nr = nr;
-}
-
-static int __rds_iw_ring_empty(struct rds_iw_work_ring *ring)
-{
- return __rds_iw_ring_used(ring) == 0;
-}
-
-u32 rds_iw_ring_alloc(struct rds_iw_work_ring *ring, u32 val, u32 *pos)
-{
- u32 ret = 0, avail;
-
- avail = ring->w_nr - __rds_iw_ring_used(ring);
-
- rdsdebug("ring %p val %u next %u free %u\n", ring, val,
- ring->w_alloc_ptr, avail);
-
- if (val && avail) {
- ret = min(val, avail);
- *pos = ring->w_alloc_ptr;
-
- ring->w_alloc_ptr = (ring->w_alloc_ptr + ret) % ring->w_nr;
- ring->w_alloc_ctr += ret;
- }
-
- return ret;
-}
-
-void rds_iw_ring_free(struct rds_iw_work_ring *ring, u32 val)
-{
- ring->w_free_ptr = (ring->w_free_ptr + val) % ring->w_nr;
- atomic_add(val, &ring->w_free_ctr);
-
- if (__rds_iw_ring_empty(ring) &&
- waitqueue_active(&rds_iw_ring_empty_wait))
- wake_up(&rds_iw_ring_empty_wait);
-}
-
-void rds_iw_ring_unalloc(struct rds_iw_work_ring *ring, u32 val)
-{
- ring->w_alloc_ptr = (ring->w_alloc_ptr - val) % ring->w_nr;
- ring->w_alloc_ctr -= val;
-}
-
-int rds_iw_ring_empty(struct rds_iw_work_ring *ring)
-{
- return __rds_iw_ring_empty(ring);
-}
-
-int rds_iw_ring_low(struct rds_iw_work_ring *ring)
-{
- return __rds_iw_ring_used(ring) <= (ring->w_nr >> 1);
-}
-
-
-/*
- * returns the oldest alloced ring entry. This will be the next one
- * freed. This can't be called if there are none allocated.
- */
-u32 rds_iw_ring_oldest(struct rds_iw_work_ring *ring)
-{
- return ring->w_free_ptr;
-}
-
-/*
- * returns the number of completed work requests.
- */
-
-u32 rds_iw_ring_completed(struct rds_iw_work_ring *ring, u32 wr_id, u32 oldest)
-{
- u32 ret;
-
- if (oldest <= (unsigned long long)wr_id)
- ret = (unsigned long long)wr_id - oldest + 1;
- else
- ret = ring->w_nr - oldest + (unsigned long long)wr_id + 1;
-
- rdsdebug("ring %p ret %u wr_id %u oldest %u\n", ring, ret,
- wr_id, oldest);
- return ret;
-}
diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c
deleted file mode 100644
index e20bd503f..000000000
--- a/net/rds/iw_send.c
+++ /dev/null
@@ -1,981 +0,0 @@
-/*
- * Copyright (c) 2006 Oracle. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/kernel.h>
-#include <linux/in.h>
-#include <linux/device.h>
-#include <linux/dmapool.h>
-#include <linux/ratelimit.h>
-
-#include "rds.h"
-#include "iw.h"
-
-static void rds_iw_send_rdma_complete(struct rds_message *rm,
- int wc_status)
-{
- int notify_status;
-
- switch (wc_status) {
- case IB_WC_WR_FLUSH_ERR:
- return;
-
- case IB_WC_SUCCESS:
- notify_status = RDS_RDMA_SUCCESS;
- break;
-
- case IB_WC_REM_ACCESS_ERR:
- notify_status = RDS_RDMA_REMOTE_ERROR;
- break;
-
- default:
- notify_status = RDS_RDMA_OTHER_ERROR;
- break;
- }
- rds_rdma_send_complete(rm, notify_status);
-}
-
-static void rds_iw_send_unmap_rdma(struct rds_iw_connection *ic,
- struct rm_rdma_op *op)
-{
- if (op->op_mapped) {
- ib_dma_unmap_sg(ic->i_cm_id->device,
- op->op_sg, op->op_nents,
- op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
- op->op_mapped = 0;
- }
-}
-
-static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic,
- struct rds_iw_send_work *send,
- int wc_status)
-{
- struct rds_message *rm = send->s_rm;
-
- rdsdebug("ic %p send %p rm %p\n", ic, send, rm);
-
- ib_dma_unmap_sg(ic->i_cm_id->device,
- rm->data.op_sg, rm->data.op_nents,
- DMA_TO_DEVICE);
-
- if (rm->rdma.op_active) {
- rds_iw_send_unmap_rdma(ic, &rm->rdma);
-
- /* If the user asked for a completion notification on this
- * message, we can implement three different semantics:
- * 1. Notify when we received the ACK on the RDS message
- * that was queued with the RDMA. This provides reliable
- * notification of RDMA status at the expense of a one-way
- * packet delay.
- * 2. Notify when the IB stack gives us the completion event for
- * the RDMA operation.
- * 3. Notify when the IB stack gives us the completion event for
- * the accompanying RDS messages.
- * Here, we implement approach #3. To implement approach #2,
- * call rds_rdma_send_complete from the cq_handler. To implement #1,
- * don't call rds_rdma_send_complete at all, and fall back to the notify
- * handling in the ACK processing code.
- *
- * Note: There's no need to explicitly sync any RDMA buffers using
- * ib_dma_sync_sg_for_cpu - the completion for the RDMA
- * operation itself unmapped the RDMA buffers, which takes care
- * of synching.
- */
- rds_iw_send_rdma_complete(rm, wc_status);
-
- if (rm->rdma.op_write)
- rds_stats_add(s_send_rdma_bytes, rm->rdma.op_bytes);
- else
- rds_stats_add(s_recv_rdma_bytes, rm->rdma.op_bytes);
- }
-
- /* If anyone waited for this message to get flushed out, wake
- * them up now */
- rds_message_unmapped(rm);
-
- rds_message_put(rm);
- send->s_rm = NULL;
-}
-
-void rds_iw_send_init_ring(struct rds_iw_connection *ic)
-{
- struct rds_iw_send_work *send;
- u32 i;
-
- for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
- struct ib_sge *sge;
-
- send->s_rm = NULL;
- send->s_op = NULL;
- send->s_mapping = NULL;
-
- send->s_send_wr.next = NULL;
- send->s_send_wr.wr_id = i;
- send->s_send_wr.sg_list = send->s_sge;
- send->s_send_wr.num_sge = 1;
- send->s_send_wr.opcode = IB_WR_SEND;
- send->s_send_wr.send_flags = 0;
- send->s_send_wr.ex.imm_data = 0;
-
- sge = rds_iw_data_sge(ic, send->s_sge);
- sge->lkey = 0;
-
- sge = rds_iw_header_sge(ic, send->s_sge);
- sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header));
- sge->length = sizeof(struct rds_header);
- sge->lkey = 0;
-
- send->s_mr = ib_alloc_mr(ic->i_pd, IB_MR_TYPE_MEM_REG,
- fastreg_message_size);
- if (IS_ERR(send->s_mr)) {
- printk(KERN_WARNING "RDS/IW: ib_alloc_mr failed\n");
- break;
- }
- }
-}
-
-void rds_iw_send_clear_ring(struct rds_iw_connection *ic)
-{
- struct rds_iw_send_work *send;
- u32 i;
-
- for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
- BUG_ON(!send->s_mr);
- ib_dereg_mr(send->s_mr);
- if (send->s_send_wr.opcode == 0xdead)
- continue;
- if (send->s_rm)
- rds_iw_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR);
- if (send->s_op)
- rds_iw_send_unmap_rdma(ic, send->s_op);
- }
-}
-
-/*
- * The _oldest/_free ring operations here race cleanly with the alloc/unalloc
- * operations performed in the send path. As the sender allocs and potentially
- * unallocs the next free entry in the ring it doesn't alter which is
- * the next to be freed, which is what this is concerned with.
- */
-void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context)
-{
- struct rds_connection *conn = context;
- struct rds_iw_connection *ic = conn->c_transport_data;
- struct ib_wc wc;
- struct rds_iw_send_work *send;
- u32 completed;
- u32 oldest;
- u32 i;
- int ret;
-
- rdsdebug("cq %p conn %p\n", cq, conn);
- rds_iw_stats_inc(s_iw_tx_cq_call);
- ret = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
- if (ret)
- rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
-
- while (ib_poll_cq(cq, 1, &wc) > 0) {
- rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
- (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
- be32_to_cpu(wc.ex.imm_data));
- rds_iw_stats_inc(s_iw_tx_cq_event);
-
- if (wc.status != IB_WC_SUCCESS) {
- printk(KERN_ERR "WC Error: status = %d opcode = %d\n", wc.status, wc.opcode);
- break;
- }
-
- if (wc.opcode == IB_WC_LOCAL_INV && wc.wr_id == RDS_IW_LOCAL_INV_WR_ID) {
- ic->i_fastreg_posted = 0;
- continue;
- }
-
- if (wc.opcode == IB_WC_REG_MR && wc.wr_id == RDS_IW_REG_WR_ID) {
- ic->i_fastreg_posted = 1;
- continue;
- }
-
- if (wc.wr_id == RDS_IW_ACK_WR_ID) {
- if (time_after(jiffies, ic->i_ack_queued + HZ/2))
- rds_iw_stats_inc(s_iw_tx_stalled);
- rds_iw_ack_send_complete(ic);
- continue;
- }
-
- oldest = rds_iw_ring_oldest(&ic->i_send_ring);
-
- completed = rds_iw_ring_completed(&ic->i_send_ring, wc.wr_id, oldest);
-
- for (i = 0; i < completed; i++) {
- send = &ic->i_sends[oldest];
-
- /* In the error case, wc.opcode sometimes contains garbage */
- switch (send->s_send_wr.opcode) {
- case IB_WR_SEND:
- if (send->s_rm)
- rds_iw_send_unmap_rm(ic, send, wc.status);
- break;
- case IB_WR_REG_MR:
- case IB_WR_RDMA_WRITE:
- case IB_WR_RDMA_READ:
- case IB_WR_RDMA_READ_WITH_INV:
- /* Nothing to be done - the SG list will be unmapped
- * when the SEND completes. */
- break;
- default:
- printk_ratelimited(KERN_NOTICE
- "RDS/IW: %s: unexpected opcode 0x%x in WR!\n",
- __func__, send->s_send_wr.opcode);
- break;
- }
-
- send->s_send_wr.opcode = 0xdead;
- send->s_send_wr.num_sge = 1;
- if (time_after(jiffies, send->s_queued + HZ/2))
- rds_iw_stats_inc(s_iw_tx_stalled);
-
- /* If a RDMA operation produced an error, signal this right
- * away. If we don't, the subsequent SEND that goes with this
- * RDMA will be canceled with ERR_WFLUSH, and the application
- * never learn that the RDMA failed. */
- if (unlikely(wc.status == IB_WC_REM_ACCESS_ERR && send->s_op)) {
- struct rds_message *rm;
-
- rm = rds_send_get_message(conn, send->s_op);
- if (rm)
- rds_iw_send_rdma_complete(rm, wc.status);
- }
-
- oldest = (oldest + 1) % ic->i_send_ring.w_nr;
- }
-
- rds_iw_ring_free(&ic->i_send_ring, completed);
-
- if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
- test_bit(0, &conn->c_map_queued))
- queue_delayed_work(rds_wq, &conn->c_send_w, 0);
-
- /* We expect errors as the qp is drained during shutdown */
- if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) {
- rds_iw_conn_error(conn,
- "send completion on %pI4 "
- "had status %u, disconnecting and reconnecting\n",
- &conn->c_faddr, wc.status);
- }
- }
-}
-
-/*
- * This is the main function for allocating credits when sending
- * messages.
- *
- * Conceptually, we have two counters:
- * - send credits: this tells us how many WRs we're allowed
- * to submit without overruning the receiver's queue. For
- * each SEND WR we post, we decrement this by one.
- *
- * - posted credits: this tells us how many WRs we recently
- * posted to the receive queue. This value is transferred
- * to the peer as a "credit update" in a RDS header field.
- * Every time we transmit credits to the peer, we subtract
- * the amount of transferred credits from this counter.
- *
- * It is essential that we avoid situations where both sides have
- * exhausted their send credits, and are unable to send new credits
- * to the peer. We achieve this by requiring that we send at least
- * one credit update to the peer before exhausting our credits.
- * When new credits arrive, we subtract one credit that is withheld
- * until we've posted new buffers and are ready to transmit these
- * credits (see rds_iw_send_add_credits below).
- *
- * The RDS send code is essentially single-threaded; rds_send_xmit
- * grabs c_send_lock to ensure exclusive access to the send ring.
- * However, the ACK sending code is independent and can race with
- * message SENDs.
- *
- * In the send path, we need to update the counters for send credits
- * and the counter of posted buffers atomically - when we use the
- * last available credit, we cannot allow another thread to race us
- * and grab the posted credits counter. Hence, we have to use a
- * spinlock to protect the credit counter, or use atomics.
- *
- * Spinlocks shared between the send and the receive path are bad,
- * because they create unnecessary delays. An early implementation
- * using a spinlock showed a 5% degradation in throughput at some
- * loads.
- *
- * This implementation avoids spinlocks completely, putting both
- * counters into a single atomic, and updating that atomic using
- * atomic_add (in the receive path, when receiving fresh credits),
- * and using atomic_cmpxchg when updating the two counters.
- */
-int rds_iw_send_grab_credits(struct rds_iw_connection *ic,
- u32 wanted, u32 *adv_credits, int need_posted, int max_posted)
-{
- unsigned int avail, posted, got = 0, advertise;
- long oldval, newval;
-
- *adv_credits = 0;
- if (!ic->i_flowctl)
- return wanted;
-
-try_again:
- advertise = 0;
- oldval = newval = atomic_read(&ic->i_credits);
- posted = IB_GET_POST_CREDITS(oldval);
- avail = IB_GET_SEND_CREDITS(oldval);
-
- rdsdebug("wanted=%u credits=%u posted=%u\n",
- wanted, avail, posted);
-
- /* The last credit must be used to send a credit update. */
- if (avail && !posted)
- avail--;
-
- if (avail < wanted) {
- struct rds_connection *conn = ic->i_cm_id->context;
-
- /* Oops, there aren't that many credits left! */
- set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
- got = avail;
- } else {
- /* Sometimes you get what you want, lalala. */
- got = wanted;
- }
- newval -= IB_SET_SEND_CREDITS(got);
-
- /*
- * If need_posted is non-zero, then the caller wants
- * the posted regardless of whether any send credits are
- * available.
- */
- if (posted && (got || need_posted)) {
- advertise = min_t(unsigned int, posted, max_posted);
- newval -= IB_SET_POST_CREDITS(advertise);
- }
-
- /* Finally bill everything */
- if (atomic_cmpxchg(&ic->i_credits, oldval, newval) != oldval)
- goto try_again;
-
- *adv_credits = advertise;
- return got;
-}
-
-void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits)
-{
- struct rds_iw_connection *ic = conn->c_transport_data;
-
- if (credits == 0)
- return;
-
- rdsdebug("credits=%u current=%u%s\n",
- credits,
- IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits)),
- test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ? ", ll_send_full" : "");
-
- atomic_add(IB_SET_SEND_CREDITS(credits), &ic->i_credits);
- if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags))
- queue_delayed_work(rds_wq, &conn->c_send_w, 0);
-
- WARN_ON(IB_GET_SEND_CREDITS(credits) >= 16384);
-
- rds_iw_stats_inc(s_iw_rx_credit_updates);
-}
-
-void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted)
-{
- struct rds_iw_connection *ic = conn->c_transport_data;
-
- if (posted == 0)
- return;
-
- atomic_add(IB_SET_POST_CREDITS(posted), &ic->i_credits);
-
- /* Decide whether to send an update to the peer now.
- * If we would send a credit update for every single buffer we
- * post, we would end up with an ACK storm (ACK arrives,
- * consumes buffer, we refill the ring, send ACK to remote
- * advertising the newly posted buffer... ad inf)
- *
- * Performance pretty much depends on how often we send
- * credit updates - too frequent updates mean lots of ACKs.
- * Too infrequent updates, and the peer will run out of
- * credits and has to throttle.
- * For the time being, 16 seems to be a good compromise.
- */
- if (IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)) >= 16)
- set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
-}
-
-static inline void
-rds_iw_xmit_populate_wr(struct rds_iw_connection *ic,
- struct rds_iw_send_work *send, unsigned int pos,
- unsigned long buffer, unsigned int length,
- int send_flags)
-{
- struct ib_sge *sge;
-
- WARN_ON(pos != send - ic->i_sends);
-
- send->s_send_wr.send_flags = send_flags;
- send->s_send_wr.opcode = IB_WR_SEND;
- send->s_send_wr.num_sge = 2;
- send->s_send_wr.next = NULL;
- send->s_queued = jiffies;
- send->s_op = NULL;
-
- if (length != 0) {
- sge = rds_iw_data_sge(ic, send->s_sge);
- sge->addr = buffer;
- sge->length = length;
- sge->lkey = rds_iw_local_dma_lkey(ic);
-
- sge = rds_iw_header_sge(ic, send->s_sge);
- } else {
- /* We're sending a packet with no payload. There is only
- * one SGE */
- send->s_send_wr.num_sge = 1;
- sge = &send->s_sge[0];
- }
-
- sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header));
- sge->length = sizeof(struct rds_header);
- sge->lkey = rds_iw_local_dma_lkey(ic);
-}
-
-/*
- * This can be called multiple times for a given message. The first time
- * we see a message we map its scatterlist into the IB device so that
- * we can provide that mapped address to the IB scatter gather entries
- * in the IB work requests. We translate the scatterlist into a series
- * of work requests that fragment the message. These work requests complete
- * in order so we pass ownership of the message to the completion handler
- * once we send the final fragment.
- *
- * The RDS core uses the c_send_lock to only enter this function once
- * per connection. This makes sure that the tx ring alloc/unalloc pairs
- * don't get out of sync and confuse the ring.
- */
-int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
- unsigned int hdr_off, unsigned int sg, unsigned int off)
-{
- struct rds_iw_connection *ic = conn->c_transport_data;
- struct ib_device *dev = ic->i_cm_id->device;
- struct rds_iw_send_work *send = NULL;
- struct rds_iw_send_work *first;
- struct rds_iw_send_work *prev;
- struct ib_send_wr *failed_wr;
- struct scatterlist *scat;
- u32 pos;
- u32 i;
- u32 work_alloc;
- u32 credit_alloc;
- u32 posted;
- u32 adv_credits = 0;
- int send_flags = 0;
- int sent;
- int ret;
- int flow_controlled = 0;
-
- BUG_ON(off % RDS_FRAG_SIZE);
- BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header));
-
- /* Fastreg support */
- if (rds_rdma_cookie_key(rm->m_rdma_cookie) && !ic->i_fastreg_posted) {
- ret = -EAGAIN;
- goto out;
- }
-
- /* FIXME we may overallocate here */
- if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0)
- i = 1;
- else
- i = ceil(be32_to_cpu(rm->m_inc.i_hdr.h_len), RDS_FRAG_SIZE);
-
- work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos);
- if (work_alloc == 0) {
- set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
- rds_iw_stats_inc(s_iw_tx_ring_full);
- ret = -ENOMEM;
- goto out;
- }
-
- credit_alloc = work_alloc;
- if (ic->i_flowctl) {
- credit_alloc = rds_iw_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT);
- adv_credits += posted;
- if (credit_alloc < work_alloc) {
- rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc);
- work_alloc = credit_alloc;
- flow_controlled++;
- }
- if (work_alloc == 0) {
- set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
- rds_iw_stats_inc(s_iw_tx_throttle);
- ret = -ENOMEM;
- goto out;
- }
- }
-
- /* map the message the first time we see it */
- if (!ic->i_rm) {
- /*
- printk(KERN_NOTICE "rds_iw_xmit prep msg dport=%u flags=0x%x len=%d\n",
- be16_to_cpu(rm->m_inc.i_hdr.h_dport),
- rm->m_inc.i_hdr.h_flags,
- be32_to_cpu(rm->m_inc.i_hdr.h_len));
- */
- if (rm->data.op_nents) {
- rm->data.op_count = ib_dma_map_sg(dev,
- rm->data.op_sg,
- rm->data.op_nents,
- DMA_TO_DEVICE);
- rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count);
- if (rm->data.op_count == 0) {
- rds_iw_stats_inc(s_iw_tx_sg_mapping_failure);
- rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
- ret = -ENOMEM; /* XXX ? */
- goto out;
- }
- } else {
- rm->data.op_count = 0;
- }
-
- ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
- ic->i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes;
- rds_message_addref(rm);
- rm->data.op_dmasg = 0;
- rm->data.op_dmaoff = 0;
- ic->i_rm = rm;
-
- /* Finalize the header */
- if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags))
- rm->m_inc.i_hdr.h_flags |= RDS_FLAG_ACK_REQUIRED;
- if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))
- rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED;
-
- /* If it has a RDMA op, tell the peer we did it. This is
- * used by the peer to release use-once RDMA MRs. */
- if (rm->rdma.op_active) {
- struct rds_ext_header_rdma ext_hdr;
-
- ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey);
- rds_message_add_extension(&rm->m_inc.i_hdr,
- RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
- }
- if (rm->m_rdma_cookie) {
- rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr,
- rds_rdma_cookie_key(rm->m_rdma_cookie),
- rds_rdma_cookie_offset(rm->m_rdma_cookie));
- }
-
- /* Note - rds_iw_piggyb_ack clears the ACK_REQUIRED bit, so
- * we should not do this unless we have a chance of at least
- * sticking the header into the send ring. Which is why we
- * should call rds_iw_ring_alloc first. */
- rm->m_inc.i_hdr.h_ack = cpu_to_be64(rds_iw_piggyb_ack(ic));
- rds_message_make_checksum(&rm->m_inc.i_hdr);
-
- /*
- * Update adv_credits since we reset the ACK_REQUIRED bit.
- */
- rds_iw_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits);
- adv_credits += posted;
- BUG_ON(adv_credits > 255);
- }
-
- send = &ic->i_sends[pos];
- first = send;
- prev = NULL;
- scat = &rm->data.op_sg[rm->data.op_dmasg];
- sent = 0;
- i = 0;
-
- /* Sometimes you want to put a fence between an RDMA
- * READ and the following SEND.
- * We could either do this all the time
- * or when requested by the user. Right now, we let
- * the application choose.
- */
- if (rm->rdma.op_active && rm->rdma.op_fence)
- send_flags = IB_SEND_FENCE;
-
- /*
- * We could be copying the header into the unused tail of the page.
- * That would need to be changed in the future when those pages might
- * be mapped userspace pages or page cache pages. So instead we always
- * use a second sge and our long-lived ring of mapped headers. We send
- * the header after the data so that the data payload can be aligned on
- * the receiver.
- */
-
- /* handle a 0-len message */
- if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) {
- rds_iw_xmit_populate_wr(ic, send, pos, 0, 0, send_flags);
- goto add_header;
- }
-
- /* if there's data reference it with a chain of work reqs */
- for (; i < work_alloc && scat != &rm->data.op_sg[rm->data.op_count]; i++) {
- unsigned int len;
-
- send = &ic->i_sends[pos];
-
- len = min(RDS_FRAG_SIZE,
- ib_sg_dma_len(dev, scat) - rm->data.op_dmaoff);
- rds_iw_xmit_populate_wr(ic, send, pos,
- ib_sg_dma_address(dev, scat) + rm->data.op_dmaoff, len,
- send_flags);
-
- /*
- * We want to delay signaling completions just enough to get
- * the batching benefits but not so much that we create dead time
- * on the wire.
- */
- if (ic->i_unsignaled_wrs-- == 0) {
- ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
- send->s_send_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
- }
-
- ic->i_unsignaled_bytes -= len;
- if (ic->i_unsignaled_bytes <= 0) {
- ic->i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes;
- send->s_send_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
- }
-
- /*
- * Always signal the last one if we're stopping due to flow control.
- */
- if (flow_controlled && i == (work_alloc-1))
- send->s_send_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
-
- rdsdebug("send %p wr %p num_sge %u next %p\n", send,
- &send->s_send_wr, send->s_send_wr.num_sge, send->s_send_wr.next);
-
- sent += len;
- rm->data.op_dmaoff += len;
- if (rm->data.op_dmaoff == ib_sg_dma_len(dev, scat)) {
- scat++;
- rm->data.op_dmaoff = 0;
- rm->data.op_dmasg++;
- }
-
-add_header:
- /* Tack on the header after the data. The header SGE should already
- * have been set up to point to the right header buffer. */
- memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header));
-
- if (0) {
- struct rds_header *hdr = &ic->i_send_hdrs[pos];
-
- printk(KERN_NOTICE "send WR dport=%u flags=0x%x len=%d\n",
- be16_to_cpu(hdr->h_dport),
- hdr->h_flags,
- be32_to_cpu(hdr->h_len));
- }
- if (adv_credits) {
- struct rds_header *hdr = &ic->i_send_hdrs[pos];
-
- /* add credit and redo the header checksum */
- hdr->h_credit = adv_credits;
- rds_message_make_checksum(hdr);
- adv_credits = 0;
- rds_iw_stats_inc(s_iw_tx_credit_updates);
- }
-
- if (prev)
- prev->s_send_wr.next = &send->s_send_wr;
- prev = send;
-
- pos = (pos + 1) % ic->i_send_ring.w_nr;
- }
-
- /* Account the RDS header in the number of bytes we sent, but just once.
- * The caller has no concept of fragmentation. */
- if (hdr_off == 0)
- sent += sizeof(struct rds_header);
-
- /* if we finished the message then send completion owns it */
- if (scat == &rm->data.op_sg[rm->data.op_count]) {
- prev->s_rm = ic->i_rm;
- prev->s_send_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
- ic->i_rm = NULL;
- }
-
- if (i < work_alloc) {
- rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - i);
- work_alloc = i;
- }
- if (ic->i_flowctl && i < credit_alloc)
- rds_iw_send_add_credits(conn, credit_alloc - i);
-
- /* XXX need to worry about failed_wr and partial sends. */
- failed_wr = &first->s_send_wr;
- ret = ib_post_send(ic->i_cm_id->qp, &first->s_send_wr, &failed_wr);
- rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
- first, &first->s_send_wr, ret, failed_wr);
- BUG_ON(failed_wr != &first->s_send_wr);
- if (ret) {
- printk(KERN_WARNING "RDS/IW: ib_post_send to %pI4 "
- "returned %d\n", &conn->c_faddr, ret);
- rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
- if (prev->s_rm) {
- ic->i_rm = prev->s_rm;
- prev->s_rm = NULL;
- }
- goto out;
- }
-
- ret = sent;
-out:
- BUG_ON(adv_credits);
- return ret;
-}
-
-static int rds_iw_build_send_reg(struct rds_iw_send_work *send,
- struct scatterlist *sg,
- int sg_nents)
-{
- int n;
-
- n = ib_map_mr_sg(send->s_mr, sg, sg_nents, PAGE_SIZE);
- if (unlikely(n != sg_nents))
- return n < 0 ? n : -EINVAL;
-
- send->s_reg_wr.wr.opcode = IB_WR_REG_MR;
- send->s_reg_wr.wr.wr_id = 0;
- send->s_reg_wr.wr.num_sge = 0;
- send->s_reg_wr.mr = send->s_mr;
- send->s_reg_wr.key = send->s_mr->rkey;
- send->s_reg_wr.access = IB_ACCESS_REMOTE_WRITE;
-
- ib_update_fast_reg_key(send->s_mr, send->s_remap_count++);
-
- return 0;
-}
-
-int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
-{
- struct rds_iw_connection *ic = conn->c_transport_data;
- struct rds_iw_send_work *send = NULL;
- struct rds_iw_send_work *first;
- struct rds_iw_send_work *prev;
- struct ib_send_wr *failed_wr;
- struct rds_iw_device *rds_iwdev;
- struct scatterlist *scat;
- unsigned long len;
- u64 remote_addr = op->op_remote_addr;
- u32 pos, fr_pos;
- u32 work_alloc;
- u32 i;
- u32 j;
- int sent;
- int ret;
- int num_sge;
- int sg_nents;
-
- rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
-
- /* map the message the first time we see it */
- if (!op->op_mapped) {
- op->op_count = ib_dma_map_sg(ic->i_cm_id->device,
- op->op_sg, op->op_nents, (op->op_write) ?
- DMA_TO_DEVICE : DMA_FROM_DEVICE);
- rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count);
- if (op->op_count == 0) {
- rds_iw_stats_inc(s_iw_tx_sg_mapping_failure);
- ret = -ENOMEM; /* XXX ? */
- goto out;
- }
-
- op->op_mapped = 1;
- }
-
- if (!op->op_write) {
- /* Alloc space on the send queue for the fastreg */
- work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, 1, &fr_pos);
- if (work_alloc != 1) {
- rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
- rds_iw_stats_inc(s_iw_tx_ring_full);
- ret = -ENOMEM;
- goto out;
- }
- }
-
- /*
- * Instead of knowing how to return a partial rdma read/write we insist that there
- * be enough work requests to send the entire message.
- */
- i = ceil(op->op_count, rds_iwdev->max_sge);
-
- work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos);
- if (work_alloc != i) {
- rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
- rds_iw_stats_inc(s_iw_tx_ring_full);
- ret = -ENOMEM;
- goto out;
- }
-
- send = &ic->i_sends[pos];
- if (!op->op_write) {
- first = prev = &ic->i_sends[fr_pos];
- } else {
- first = send;
- prev = NULL;
- }
- scat = &op->op_sg[0];
- sent = 0;
- num_sge = op->op_count;
- sg_nents = 0;
-
- for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) {
- send->s_rdma_wr.wr.send_flags = 0;
- send->s_queued = jiffies;
-
- /*
- * We want to delay signaling completions just enough to get
- * the batching benefits but not so much that we create dead time on the wire.
- */
- if (ic->i_unsignaled_wrs-- == 0) {
- ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
- send->s_rdma_wr.wr.send_flags = IB_SEND_SIGNALED;
- }
-
- /* To avoid the need to have the plumbing to invalidate the fastreg_mr used
- * for local access after RDS is finished with it, using
- * IB_WR_RDMA_READ_WITH_INV will invalidate it after the read has completed.
- */
- if (op->op_write)
- send->s_rdma_wr.wr.opcode = IB_WR_RDMA_WRITE;
- else
- send->s_rdma_wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV;
-
- send->s_rdma_wr.remote_addr = remote_addr;
- send->s_rdma_wr.rkey = op->op_rkey;
- send->s_op = op;
-
- if (num_sge > rds_iwdev->max_sge) {
- send->s_rdma_wr.wr.num_sge = rds_iwdev->max_sge;
- num_sge -= rds_iwdev->max_sge;
- } else
- send->s_rdma_wr.wr.num_sge = num_sge;
-
- send->s_rdma_wr.wr.next = NULL;
-
- if (prev)
- prev->s_send_wr.next = &send->s_rdma_wr.wr;
-
- for (j = 0; j < send->s_rdma_wr.wr.num_sge &&
- scat != &op->op_sg[op->op_count]; j++) {
- len = ib_sg_dma_len(ic->i_cm_id->device, scat);
-
- if (send->s_rdma_wr.wr.opcode == IB_WR_RDMA_READ_WITH_INV)
- sg_nents++;
- else {
- send->s_sge[j].addr = ib_sg_dma_address(ic->i_cm_id->device, scat);
- send->s_sge[j].length = len;
- send->s_sge[j].lkey = rds_iw_local_dma_lkey(ic);
- }
-
- sent += len;
- rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr);
- remote_addr += len;
-
- scat++;
- }
-
- if (send->s_rdma_wr.wr.opcode == IB_WR_RDMA_READ_WITH_INV) {
- send->s_rdma_wr.wr.num_sge = 1;
- send->s_sge[0].addr = conn->c_xmit_rm->m_rs->rs_user_addr;
- send->s_sge[0].length = conn->c_xmit_rm->m_rs->rs_user_bytes;
- send->s_sge[0].lkey = ic->i_sends[fr_pos].s_mr->lkey;
- }
-
- rdsdebug("send %p wr %p num_sge %u next %p\n", send,
- &send->s_rdma_wr,
- send->s_rdma_wr.wr.num_sge,
- send->s_rdma_wr.wr.next);
-
- prev = send;
- if (++send == &ic->i_sends[ic->i_send_ring.w_nr])
- send = ic->i_sends;
- }
-
- /* if we finished the message then send completion owns it */
- if (scat == &op->op_sg[op->op_count])
- first->s_rdma_wr.wr.send_flags = IB_SEND_SIGNALED;
-
- if (i < work_alloc) {
- rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - i);
- work_alloc = i;
- }
-
- /* On iWARP, local memory access by a remote system (ie, RDMA Read) is not
- * recommended. Putting the lkey on the wire is a security hole, as it can
- * allow for memory access to all of memory on the remote system. Some
- * adapters do not allow using the lkey for this at all. To bypass this use a
- * fastreg_mr (or possibly a dma_mr)
- */
- if (!op->op_write) {
- ret = rds_iw_build_send_reg(&ic->i_sends[fr_pos],
- &op->op_sg[0], sg_nents);
- if (ret) {
- printk(KERN_WARNING "RDS/IW: failed to reg send mem\n");
- goto out;
- }
- work_alloc++;
- }
-
- failed_wr = &first->s_rdma_wr.wr;
- ret = ib_post_send(ic->i_cm_id->qp, &first->s_rdma_wr.wr, &failed_wr);
- rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
- first, &first->s_rdma_wr, ret, failed_wr);
- BUG_ON(failed_wr != &first->s_rdma_wr.wr);
- if (ret) {
- printk(KERN_WARNING "RDS/IW: rdma ib_post_send to %pI4 "
- "returned %d\n", &conn->c_faddr, ret);
- rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
- goto out;
- }
-
-out:
- return ret;
-}
-
-void rds_iw_xmit_complete(struct rds_connection *conn)
-{
- struct rds_iw_connection *ic = conn->c_transport_data;
-
- /* We may have a pending ACK or window update we were unable
- * to send previously (due to flow control). Try again. */
- rds_iw_attempt_ack(ic);
-}
diff --git a/net/rds/iw_stats.c b/net/rds/iw_stats.c
deleted file mode 100644
index 5fe67f6a1..000000000
--- a/net/rds/iw_stats.c
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Copyright (c) 2006 Oracle. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/percpu.h>
-#include <linux/seq_file.h>
-#include <linux/proc_fs.h>
-
-#include "rds.h"
-#include "iw.h"
-
-DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_iw_statistics, rds_iw_stats);
-
-static const char *const rds_iw_stat_names[] = {
- "iw_connect_raced",
- "iw_listen_closed_stale",
- "iw_tx_cq_call",
- "iw_tx_cq_event",
- "iw_tx_ring_full",
- "iw_tx_throttle",
- "iw_tx_sg_mapping_failure",
- "iw_tx_stalled",
- "iw_tx_credit_updates",
- "iw_rx_cq_call",
- "iw_rx_cq_event",
- "iw_rx_ring_empty",
- "iw_rx_refill_from_cq",
- "iw_rx_refill_from_thread",
- "iw_rx_alloc_limit",
- "iw_rx_credit_updates",
- "iw_ack_sent",
- "iw_ack_send_failure",
- "iw_ack_send_delayed",
- "iw_ack_send_piggybacked",
- "iw_ack_received",
- "iw_rdma_mr_alloc",
- "iw_rdma_mr_free",
- "iw_rdma_mr_used",
- "iw_rdma_mr_pool_flush",
- "iw_rdma_mr_pool_wait",
- "iw_rdma_mr_pool_depleted",
-};
-
-unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter,
- unsigned int avail)
-{
- struct rds_iw_statistics stats = {0, };
- uint64_t *src;
- uint64_t *sum;
- size_t i;
- int cpu;
-
- if (avail < ARRAY_SIZE(rds_iw_stat_names))
- goto out;
-
- for_each_online_cpu(cpu) {
- src = (uint64_t *)&(per_cpu(rds_iw_stats, cpu));
- sum = (uint64_t *)&stats;
- for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++)
- *(sum++) += *(src++);
- }
-
- rds_stats_info_copy(iter, (uint64_t *)&stats, rds_iw_stat_names,
- ARRAY_SIZE(rds_iw_stat_names));
-out:
- return ARRAY_SIZE(rds_iw_stat_names);
-}
diff --git a/net/rds/iw_sysctl.c b/net/rds/iw_sysctl.c
deleted file mode 100644
index 139239d2c..000000000
--- a/net/rds/iw_sysctl.c
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * Copyright (c) 2006 Oracle. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/kernel.h>
-#include <linux/sysctl.h>
-#include <linux/proc_fs.h>
-
-#include "iw.h"
-
-static struct ctl_table_header *rds_iw_sysctl_hdr;
-
-unsigned long rds_iw_sysctl_max_send_wr = RDS_IW_DEFAULT_SEND_WR;
-unsigned long rds_iw_sysctl_max_recv_wr = RDS_IW_DEFAULT_RECV_WR;
-unsigned long rds_iw_sysctl_max_recv_allocation = (128 * 1024 * 1024) / RDS_FRAG_SIZE;
-static unsigned long rds_iw_sysctl_max_wr_min = 1;
-/* hardware will fail CQ creation long before this */
-static unsigned long rds_iw_sysctl_max_wr_max = (u32)~0;
-
-unsigned long rds_iw_sysctl_max_unsig_wrs = 16;
-static unsigned long rds_iw_sysctl_max_unsig_wr_min = 1;
-static unsigned long rds_iw_sysctl_max_unsig_wr_max = 64;
-
-unsigned long rds_iw_sysctl_max_unsig_bytes = (16 << 20);
-static unsigned long rds_iw_sysctl_max_unsig_bytes_min = 1;
-static unsigned long rds_iw_sysctl_max_unsig_bytes_max = ~0UL;
-
-unsigned int rds_iw_sysctl_flow_control = 1;
-
-static struct ctl_table rds_iw_sysctl_table[] = {
- {
- .procname = "max_send_wr",
- .data = &rds_iw_sysctl_max_send_wr,
- .maxlen = sizeof(unsigned long),
- .mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
- .extra1 = &rds_iw_sysctl_max_wr_min,
- .extra2 = &rds_iw_sysctl_max_wr_max,
- },
- {
- .procname = "max_recv_wr",
- .data = &rds_iw_sysctl_max_recv_wr,
- .maxlen = sizeof(unsigned long),
- .mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
- .extra1 = &rds_iw_sysctl_max_wr_min,
- .extra2 = &rds_iw_sysctl_max_wr_max,
- },
- {
- .procname = "max_unsignaled_wr",
- .data = &rds_iw_sysctl_max_unsig_wrs,
- .maxlen = sizeof(unsigned long),
- .mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
- .extra1 = &rds_iw_sysctl_max_unsig_wr_min,
- .extra2 = &rds_iw_sysctl_max_unsig_wr_max,
- },
- {
- .procname = "max_unsignaled_bytes",
- .data = &rds_iw_sysctl_max_unsig_bytes,
- .maxlen = sizeof(unsigned long),
- .mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
- .extra1 = &rds_iw_sysctl_max_unsig_bytes_min,
- .extra2 = &rds_iw_sysctl_max_unsig_bytes_max,
- },
- {
- .procname = "max_recv_allocation",
- .data = &rds_iw_sysctl_max_recv_allocation,
- .maxlen = sizeof(unsigned long),
- .mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
- },
- {
- .procname = "flow_control",
- .data = &rds_iw_sysctl_flow_control,
- .maxlen = sizeof(rds_iw_sysctl_flow_control),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- { }
-};
-
-void rds_iw_sysctl_exit(void)
-{
- unregister_net_sysctl_table(rds_iw_sysctl_hdr);
-}
-
-int rds_iw_sysctl_init(void)
-{
- rds_iw_sysctl_hdr = register_net_sysctl(&init_net, "net/rds/iw", rds_iw_sysctl_table);
- if (!rds_iw_sysctl_hdr)
- return -ENOMEM;
- return 0;
-}
diff --git a/net/rds/page.c b/net/rds/page.c
index 5a14e6d6a..e2b5a5832 100644
--- a/net/rds/page.c
+++ b/net/rds/page.c
@@ -42,8 +42,8 @@ struct rds_page_remainder {
unsigned long r_offset;
};
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder,
- rds_page_remainders);
+static
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders);
/*
* returns 0 on success or -errno on failure.
@@ -135,8 +135,8 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
if (rem->r_offset != 0)
rds_stats_inc(s_page_remainder_hit);
- rem->r_offset += bytes;
- if (rem->r_offset == PAGE_SIZE) {
+ rem->r_offset += ALIGN(bytes, 8);
+ if (rem->r_offset >= PAGE_SIZE) {
__free_page(rem->r_page);
rem->r_page = NULL;
}
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
index 9c1fed81b..7220bebcf 100644
--- a/net/rds/rdma_transport.c
+++ b/net/rds/rdma_transport.c
@@ -49,9 +49,7 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
rdsdebug("conn %p id %p handling event %u (%s)\n", conn, cm_id,
event->event, rdma_event_msg(event->event));
- if (cm_id->device->node_type == RDMA_NODE_RNIC)
- trans = &rds_iw_transport;
- else
+ if (cm_id->device->node_type == RDMA_NODE_IB_CA)
trans = &rds_ib_transport;
/* Prevent shutdown from tearing down the connection
@@ -119,6 +117,14 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
rds_conn_drop(conn);
break;
+ case RDMA_CM_EVENT_TIMEWAIT_EXIT:
+ if (conn) {
+ pr_info("RDS: RDMA_CM_EVENT_TIMEWAIT_EXIT event: dropping connection %pI4->%pI4\n",
+ &conn->c_laddr, &conn->c_faddr);
+ rds_conn_drop(conn);
+ }
+ break;
+
default:
/* things like device disconnect? */
printk(KERN_ERR "RDS: unknown event %u (%s)!\n",
@@ -200,10 +206,6 @@ static int rds_rdma_init(void)
if (ret)
goto out;
- ret = rds_iw_init();
- if (ret)
- goto err_iw_init;
-
ret = rds_ib_init();
if (ret)
goto err_ib_init;
@@ -211,8 +213,6 @@ static int rds_rdma_init(void)
goto out;
err_ib_init:
- rds_iw_exit();
-err_iw_init:
rds_rdma_listen_stop();
out:
return ret;
@@ -224,11 +224,10 @@ static void rds_rdma_exit(void)
/* stop listening first to ensure no new connections are attempted */
rds_rdma_listen_stop();
rds_ib_exit();
- rds_iw_exit();
}
module_exit(rds_rdma_exit);
MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
-MODULE_DESCRIPTION("RDS: IB/iWARP transport");
+MODULE_DESCRIPTION("RDS: IB transport");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h
index faba4e382..ff2010e9d 100644
--- a/net/rds/rdma_transport.h
+++ b/net/rds/rdma_transport.h
@@ -16,9 +16,4 @@ extern struct rds_transport rds_ib_transport;
int rds_ib_init(void);
void rds_ib_exit(void);
-/* from iw.c */
-extern struct rds_transport rds_iw_transport;
-int rds_iw_init(void);
-void rds_iw_exit(void);
-
#endif
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 0e2797bdc..80256b08e 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -222,6 +222,7 @@ struct rds_incoming {
__be32 i_saddr;
rds_rdma_cookie_t i_rdma_cookie;
+ struct timeval i_rx_tstamp;
};
struct rds_mr {
diff --git a/net/rds/recv.c b/net/rds/recv.c
index a00462b0d..c0be1ecd1 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -35,6 +35,8 @@
#include <net/sock.h>
#include <linux/in.h>
#include <linux/export.h>
+#include <linux/time.h>
+#include <linux/rds.h>
#include "rds.h"
@@ -46,6 +48,8 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
inc->i_conn = conn;
inc->i_saddr = saddr;
inc->i_rdma_cookie = 0;
+ inc->i_rx_tstamp.tv_sec = 0;
+ inc->i_rx_tstamp.tv_usec = 0;
}
EXPORT_SYMBOL_GPL(rds_inc_init);
@@ -228,6 +232,8 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
be32_to_cpu(inc->i_hdr.h_len),
inc->i_hdr.h_dport);
+ if (sock_flag(sk, SOCK_RCVTSTAMP))
+ do_gettimeofday(&inc->i_rx_tstamp);
rds_inc_addref(inc);
list_add_tail(&inc->i_item, &rs->rs_recv_queue);
__rds_wake_sk_sleep(sk);
@@ -381,7 +387,8 @@ static int rds_notify_cong(struct rds_sock *rs, struct msghdr *msghdr)
/*
* Receive any control messages.
*/
-static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg)
+static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg,
+ struct rds_sock *rs)
{
int ret = 0;
@@ -392,6 +399,15 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg)
return ret;
}
+ if ((inc->i_rx_tstamp.tv_sec != 0) &&
+ sock_flag(rds_rs_to_sk(rs), SOCK_RCVTSTAMP)) {
+ ret = put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP,
+ sizeof(struct timeval),
+ &inc->i_rx_tstamp);
+ if (ret)
+ return ret;
+ }
+
return 0;
}
@@ -474,7 +490,7 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
msg->msg_flags |= MSG_TRUNC;
}
- if (rds_cmsg_recv(inc, msg)) {
+ if (rds_cmsg_recv(inc, msg, rs)) {
ret = -EFAULT;
goto out;
}
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 9d6ddbacd..86187dad1 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -37,7 +37,6 @@
#include <net/tcp.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
-#include <net/tcp.h>
#include "rds.h"
#include "tcp.h"
@@ -53,7 +52,34 @@ static LIST_HEAD(rds_tcp_conn_list);
static struct kmem_cache *rds_tcp_conn_slab;
-#define RDS_TCP_DEFAULT_BUFSIZE (128 * 1024)
+static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *fpos);
+
+int rds_tcp_min_sndbuf = SOCK_MIN_SNDBUF;
+int rds_tcp_min_rcvbuf = SOCK_MIN_RCVBUF;
+
+static struct ctl_table rds_tcp_sysctl_table[] = {
+#define RDS_TCP_SNDBUF 0
+ {
+ .procname = "rds_tcp_sndbuf",
+ /* data is per-net pointer */
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = rds_tcp_skbuf_handler,
+ .extra1 = &rds_tcp_min_sndbuf,
+ },
+#define RDS_TCP_RCVBUF 1
+ {
+ .procname = "rds_tcp_rcvbuf",
+ /* data is per-net pointer */
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = rds_tcp_skbuf_handler,
+ .extra1 = &rds_tcp_min_rcvbuf,
+ },
+ { }
+};
/* doing it this way avoids calling tcp_sk() */
void rds_tcp_nonagle(struct socket *sock)
@@ -67,15 +93,6 @@ void rds_tcp_nonagle(struct socket *sock)
set_fs(oldfs);
}
-/* All module specific customizations to the RDS-TCP socket should be done in
- * rds_tcp_tune() and applied after socket creation. In general these
- * customizations should be tunable via module_param()
- */
-void rds_tcp_tune(struct socket *sock)
-{
- rds_tcp_nonagle(sock);
-}
-
u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc)
{
return tcp_sk(tc->t_sock->sk)->snd_nxt;
@@ -110,7 +127,7 @@ void rds_tcp_restore_callbacks(struct socket *sock,
/*
* This is the only path that sets tc->t_sock. Send and receive trust that
- * it is set. The RDS_CONN_CONNECTED bit protects those paths from being
+ * it is set. The RDS_CONN_UP bit protects those paths from being
* called while it isn't set.
*/
void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn)
@@ -199,6 +216,7 @@ static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp)
if (!tc)
return -ENOMEM;
+ mutex_init(&tc->t_conn_lock);
tc->t_sock = NULL;
tc->t_tinc = NULL;
tc->t_tinc_hdr_rem = sizeof(struct rds_header);
@@ -273,8 +291,34 @@ static int rds_tcp_netid;
struct rds_tcp_net {
struct socket *rds_tcp_listen_sock;
struct work_struct rds_tcp_accept_w;
+ struct ctl_table_header *rds_tcp_sysctl;
+ struct ctl_table *ctl_table;
+ int sndbuf_size;
+ int rcvbuf_size;
};
+/* All module specific customizations to the RDS-TCP socket should be done in
+ * rds_tcp_tune() and applied after socket creation.
+ */
+void rds_tcp_tune(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct net *net = sock_net(sk);
+ struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+
+ rds_tcp_nonagle(sock);
+ lock_sock(sk);
+ if (rtn->sndbuf_size > 0) {
+ sk->sk_sndbuf = rtn->sndbuf_size;
+ sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+ }
+ if (rtn->rcvbuf_size > 0) {
+ sk->sk_sndbuf = rtn->rcvbuf_size;
+ sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+ }
+ release_sock(sk);
+}
+
static void rds_tcp_accept_worker(struct work_struct *work)
{
struct rds_tcp_net *rtn = container_of(work,
@@ -296,20 +340,60 @@ void rds_tcp_accept_work(struct sock *sk)
static __net_init int rds_tcp_init_net(struct net *net)
{
struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+ struct ctl_table *tbl;
+ int err = 0;
+ memset(rtn, 0, sizeof(*rtn));
+
+ /* {snd, rcv}buf_size default to 0, which implies we let the
+ * stack pick the value, and permit auto-tuning of buffer size.
+ */
+ if (net == &init_net) {
+ tbl = rds_tcp_sysctl_table;
+ } else {
+ tbl = kmemdup(rds_tcp_sysctl_table,
+ sizeof(rds_tcp_sysctl_table), GFP_KERNEL);
+ if (!tbl) {
+ pr_warn("could not set allocate syctl table\n");
+ return -ENOMEM;
+ }
+ rtn->ctl_table = tbl;
+ }
+ tbl[RDS_TCP_SNDBUF].data = &rtn->sndbuf_size;
+ tbl[RDS_TCP_RCVBUF].data = &rtn->rcvbuf_size;
+ rtn->rds_tcp_sysctl = register_net_sysctl(net, "net/rds/tcp", tbl);
+ if (!rtn->rds_tcp_sysctl) {
+ pr_warn("could not register sysctl\n");
+ err = -ENOMEM;
+ goto fail;
+ }
rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net);
if (!rtn->rds_tcp_listen_sock) {
pr_warn("could not set up listen sock\n");
- return -EAFNOSUPPORT;
+ unregister_net_sysctl_table(rtn->rds_tcp_sysctl);
+ rtn->rds_tcp_sysctl = NULL;
+ err = -EAFNOSUPPORT;
+ goto fail;
}
INIT_WORK(&rtn->rds_tcp_accept_w, rds_tcp_accept_worker);
return 0;
+
+fail:
+ if (net != &init_net)
+ kfree(tbl);
+ return err;
}
static void __net_exit rds_tcp_exit_net(struct net *net)
{
struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+ if (rtn->rds_tcp_sysctl)
+ unregister_net_sysctl_table(rtn->rds_tcp_sysctl);
+
+ if (net != &init_net && rtn->ctl_table)
+ kfree(rtn->ctl_table);
+
/* If rds_tcp_exit_net() is called as a result of netns deletion,
* the rds_tcp_kill_sock() device notifier would already have cleaned
* up the listen socket, thus there is no work to do in this function.
@@ -384,6 +468,45 @@ static struct notifier_block rds_tcp_dev_notifier = {
.priority = -10, /* must be called after other network notifiers */
};
+/* when sysctl is used to modify some kernel socket parameters,this
+ * function resets the RDS connections in that netns so that we can
+ * restart with new parameters. The assumption is that such reset
+ * events are few and far-between.
+ */
+static void rds_tcp_sysctl_reset(struct net *net)
+{
+ struct rds_tcp_connection *tc, *_tc;
+
+ spin_lock_irq(&rds_tcp_conn_lock);
+ list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
+ struct net *c_net = read_pnet(&tc->conn->c_net);
+
+ if (net != c_net || !tc->t_sock)
+ continue;
+
+ rds_conn_drop(tc->conn); /* reconnect with new parameters */
+ }
+ spin_unlock_irq(&rds_tcp_conn_lock);
+}
+
+static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *fpos)
+{
+ struct net *net = current->nsproxy->net_ns;
+ int err;
+
+ err = proc_dointvec_minmax(ctl, write, buffer, lenp, fpos);
+ if (err < 0) {
+ pr_warn("Invalid input. Must be >= %d\n",
+ *(int *)(ctl->extra1));
+ return err;
+ }
+ if (write)
+ rds_tcp_sysctl_reset(net);
+ return 0;
+}
+
static void rds_tcp_exit(void)
{
rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
index 64f873c0c..41c228300 100644
--- a/net/rds/tcp.h
+++ b/net/rds/tcp.h
@@ -12,6 +12,10 @@ struct rds_tcp_connection {
struct list_head t_tcp_node;
struct rds_connection *conn;
+ /* t_conn_lock synchronizes the connection establishment between
+ * rds_tcp_accept_one and rds_tcp_conn_connect
+ */
+ struct mutex t_conn_lock;
struct socket *t_sock;
void *t_orig_write_space;
void *t_orig_data_ready;
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index 5cb16875c..49a3fcfed 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -78,7 +78,14 @@ int rds_tcp_conn_connect(struct rds_connection *conn)
struct socket *sock = NULL;
struct sockaddr_in src, dest;
int ret;
+ struct rds_tcp_connection *tc = conn->c_transport_data;
+
+ mutex_lock(&tc->t_conn_lock);
+ if (rds_conn_up(conn)) {
+ mutex_unlock(&tc->t_conn_lock);
+ return 0;
+ }
ret = sock_create_kern(rds_conn_net(conn), PF_INET,
SOCK_STREAM, IPPROTO_TCP, &sock);
if (ret < 0)
@@ -120,6 +127,7 @@ int rds_tcp_conn_connect(struct rds_connection *conn)
}
out:
+ mutex_unlock(&tc->t_conn_lock);
if (sock)
sock_release(sock);
return ret;
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 0936a4a32..be263cdf2 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -76,7 +76,9 @@ int rds_tcp_accept_one(struct socket *sock)
struct rds_connection *conn;
int ret;
struct inet_sock *inet;
- struct rds_tcp_connection *rs_tcp;
+ struct rds_tcp_connection *rs_tcp = NULL;
+ int conn_state;
+ struct sock *nsk;
ret = sock_create_kern(sock_net(sock->sk), sock->sk->sk_family,
sock->sk->sk_type, sock->sk->sk_protocol,
@@ -115,28 +117,44 @@ int rds_tcp_accept_one(struct socket *sock)
* rds_tcp_state_change() will do that cleanup
*/
rs_tcp = (struct rds_tcp_connection *)conn->c_transport_data;
- if (rs_tcp->t_sock &&
- ntohl(inet->inet_saddr) < ntohl(inet->inet_daddr)) {
- struct sock *nsk = new_sock->sk;
-
- nsk->sk_user_data = NULL;
- nsk->sk_prot->disconnect(nsk, 0);
- tcp_done(nsk);
- new_sock = NULL;
- ret = 0;
- goto out;
- } else if (rs_tcp->t_sock) {
- rds_tcp_restore_callbacks(rs_tcp->t_sock, rs_tcp);
- conn->c_outgoing = 0;
- }
-
rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING);
+ mutex_lock(&rs_tcp->t_conn_lock);
+ conn_state = rds_conn_state(conn);
+ if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_UP)
+ goto rst_nsk;
+ if (rs_tcp->t_sock) {
+ /* Need to resolve a duelling SYN between peers.
+ * We have an outstanding SYN to this peer, which may
+ * potentially have transitioned to the RDS_CONN_UP state,
+ * so we must quiesce any send threads before resetting
+ * c_transport_data.
+ */
+ wait_event(conn->c_waitq,
+ !test_bit(RDS_IN_XMIT, &conn->c_flags));
+ if (ntohl(inet->inet_saddr) < ntohl(inet->inet_daddr)) {
+ goto rst_nsk;
+ } else if (rs_tcp->t_sock) {
+ rds_tcp_restore_callbacks(rs_tcp->t_sock, rs_tcp);
+ conn->c_outgoing = 0;
+ }
+ }
rds_tcp_set_callbacks(new_sock, conn);
- rds_connect_complete(conn);
+ rds_connect_complete(conn); /* marks RDS_CONN_UP */
+ new_sock = NULL;
+ ret = 0;
+ goto out;
+rst_nsk:
+ /* reset the newly returned accept sock and bail */
+ nsk = new_sock->sk;
+ rds_tcp_stats_inc(s_tcp_listen_closed_stale);
+ nsk->sk_user_data = NULL;
+ nsk->sk_prot->disconnect(nsk, 0);
+ tcp_done(nsk);
new_sock = NULL;
ret = 0;
-
out:
+ if (rs_tcp)
+ mutex_unlock(&rs_tcp->t_conn_lock);
if (new_sock)
sock_release(new_sock);
return ret;