From 57f0f512b273f60d52568b8c6b77e17f5636edc0 Mon Sep 17 00:00:00 2001 From: André Fabian Silva Delgado Date: Wed, 5 Aug 2015 17:04:01 -0300 Subject: Initial import --- drivers/staging/lustre/lnet/klnds/socklnd/Makefile | 3 + .../staging/lustre/lnet/klnds/socklnd/socklnd.c | 2886 ++++++++++++++++++++ .../staging/lustre/lnet/klnds/socklnd/socklnd.h | 588 ++++ .../staging/lustre/lnet/klnds/socklnd/socklnd_cb.c | 2634 ++++++++++++++++++ .../lustre/lnet/klnds/socklnd/socklnd_lib-linux.c | 714 +++++ .../lustre/lnet/klnds/socklnd/socklnd_lib-linux.h | 86 + .../lustre/lnet/klnds/socklnd/socklnd_modparams.c | 188 ++ .../lustre/lnet/klnds/socklnd/socklnd_proto.c | 797 ++++++ 8 files changed, 7896 insertions(+) create mode 100644 drivers/staging/lustre/lnet/klnds/socklnd/Makefile create mode 100644 drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c create mode 100644 drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h create mode 100644 drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c create mode 100644 drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c create mode 100644 drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.h create mode 100644 drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c create mode 100644 drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/Makefile b/drivers/staging/lustre/lnet/klnds/socklnd/Makefile new file mode 100644 index 000000000..f3fb8778c --- /dev/null +++ b/drivers/staging/lustre/lnet/klnds/socklnd/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_LNET) += ksocklnd.o + +ksocklnd-y := socklnd.o socklnd_cb.o socklnd_proto.o socklnd_modparams.o socklnd_lib-linux.o diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c new file mode 100644 index 000000000..7586b7e40 --- /dev/null +++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.c @@ -0,0 +1,2886 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lnet/klnds/socklnd/socklnd.c + * + * Author: Zach Brown + * Author: Peter J. 
Braam + * Author: Phil Schwan + * Author: Eric Barton + */ + +#include "socklnd.h" + +static lnd_t the_ksocklnd; +ksock_nal_data_t ksocknal_data; + +static ksock_interface_t * +ksocknal_ip2iface(lnet_ni_t *ni, __u32 ip) +{ + ksock_net_t *net = ni->ni_data; + int i; + ksock_interface_t *iface; + + for (i = 0; i < net->ksnn_ninterfaces; i++) { + LASSERT(i < LNET_MAX_INTERFACES); + iface = &net->ksnn_interfaces[i]; + + if (iface->ksni_ipaddr == ip) + return iface; + } + + return NULL; +} + +static ksock_route_t * +ksocknal_create_route(__u32 ipaddr, int port) +{ + ksock_route_t *route; + + LIBCFS_ALLOC(route, sizeof(*route)); + if (route == NULL) + return NULL; + + atomic_set(&route->ksnr_refcount, 1); + route->ksnr_peer = NULL; + route->ksnr_retry_interval = 0; /* OK to connect at any time */ + route->ksnr_ipaddr = ipaddr; + route->ksnr_port = port; + route->ksnr_scheduled = 0; + route->ksnr_connecting = 0; + route->ksnr_connected = 0; + route->ksnr_deleted = 0; + route->ksnr_conn_count = 0; + route->ksnr_share_count = 0; + + return route; +} + +void +ksocknal_destroy_route(ksock_route_t *route) +{ + LASSERT(atomic_read(&route->ksnr_refcount) == 0); + + if (route->ksnr_peer != NULL) + ksocknal_peer_decref(route->ksnr_peer); + + LIBCFS_FREE(route, sizeof(*route)); +} + +static int +ksocknal_create_peer(ksock_peer_t **peerp, lnet_ni_t *ni, lnet_process_id_t id) +{ + ksock_net_t *net = ni->ni_data; + ksock_peer_t *peer; + + LASSERT(id.nid != LNET_NID_ANY); + LASSERT(id.pid != LNET_PID_ANY); + LASSERT(!in_interrupt()); + + LIBCFS_ALLOC(peer, sizeof(*peer)); + if (peer == NULL) + return -ENOMEM; + + peer->ksnp_ni = ni; + peer->ksnp_id = id; + atomic_set(&peer->ksnp_refcount, 1); /* 1 ref for caller */ + peer->ksnp_closing = 0; + peer->ksnp_accepting = 0; + peer->ksnp_proto = NULL; + peer->ksnp_last_alive = 0; + peer->ksnp_zc_next_cookie = SOCKNAL_KEEPALIVE_PING + 1; + + INIT_LIST_HEAD(&peer->ksnp_conns); + INIT_LIST_HEAD(&peer->ksnp_routes); + INIT_LIST_HEAD(&peer->ksnp_tx_queue); + INIT_LIST_HEAD(&peer->ksnp_zc_req_list); + spin_lock_init(&peer->ksnp_lock); + + spin_lock_bh(&net->ksnn_lock); + + if (net->ksnn_shutdown) { + spin_unlock_bh(&net->ksnn_lock); + + LIBCFS_FREE(peer, sizeof(*peer)); + CERROR("Can't create peer: network shutdown\n"); + return -ESHUTDOWN; + } + + net->ksnn_npeers++; + + spin_unlock_bh(&net->ksnn_lock); + + *peerp = peer; + return 0; +} + +void +ksocknal_destroy_peer(ksock_peer_t *peer) +{ + ksock_net_t *net = peer->ksnp_ni->ni_data; + + CDEBUG(D_NET, "peer %s %p deleted\n", + libcfs_id2str(peer->ksnp_id), peer); + + LASSERT(atomic_read(&peer->ksnp_refcount) == 0); + LASSERT(peer->ksnp_accepting == 0); + LASSERT(list_empty(&peer->ksnp_conns)); + LASSERT(list_empty(&peer->ksnp_routes)); + LASSERT(list_empty(&peer->ksnp_tx_queue)); + LASSERT(list_empty(&peer->ksnp_zc_req_list)); + + LIBCFS_FREE(peer, sizeof(*peer)); + + /* NB a peer's connections and routes keep a reference on their peer + * until they are destroyed, so we can be assured that _all_ state to + * do with this peer has been cleaned up when its refcount drops to + * zero. 
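+ * The ksnn_npeers decrement below is therefore the last action taken on this peer's behalf.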
*/ + spin_lock_bh(&net->ksnn_lock); + net->ksnn_npeers--; + spin_unlock_bh(&net->ksnn_lock); +} + +ksock_peer_t * +ksocknal_find_peer_locked(lnet_ni_t *ni, lnet_process_id_t id) +{ + struct list_head *peer_list = ksocknal_nid2peerlist(id.nid); + struct list_head *tmp; + ksock_peer_t *peer; + + list_for_each(tmp, peer_list) { + + peer = list_entry(tmp, ksock_peer_t, ksnp_list); + + LASSERT(!peer->ksnp_closing); + + if (peer->ksnp_ni != ni) + continue; + + if (peer->ksnp_id.nid != id.nid || + peer->ksnp_id.pid != id.pid) + continue; + + CDEBUG(D_NET, "got peer [%p] -> %s (%d)\n", + peer, libcfs_id2str(id), + atomic_read(&peer->ksnp_refcount)); + return peer; + } + return NULL; +} + +ksock_peer_t * +ksocknal_find_peer(lnet_ni_t *ni, lnet_process_id_t id) +{ + ksock_peer_t *peer; + + read_lock(&ksocknal_data.ksnd_global_lock); + peer = ksocknal_find_peer_locked(ni, id); + if (peer != NULL) /* +1 ref for caller? */ + ksocknal_peer_addref(peer); + read_unlock(&ksocknal_data.ksnd_global_lock); + + return peer; +} + +static void +ksocknal_unlink_peer_locked(ksock_peer_t *peer) +{ + int i; + __u32 ip; + ksock_interface_t *iface; + + for (i = 0; i < peer->ksnp_n_passive_ips; i++) { + LASSERT(i < LNET_MAX_INTERFACES); + ip = peer->ksnp_passive_ips[i]; + + iface = ksocknal_ip2iface(peer->ksnp_ni, ip); + /* All IPs in peer->ksnp_passive_ips[] come from the + * interface list, therefore the call must succeed. */ + LASSERT(iface != NULL); + + CDEBUG(D_NET, "peer=%p iface=%p ksni_nroutes=%d\n", + peer, iface, iface->ksni_nroutes); + iface->ksni_npeers--; + } + + LASSERT(list_empty(&peer->ksnp_conns)); + LASSERT(list_empty(&peer->ksnp_routes)); + LASSERT(!peer->ksnp_closing); + peer->ksnp_closing = 1; + list_del(&peer->ksnp_list); + /* lose peerlist's ref */ + ksocknal_peer_decref(peer); +} + +static int +ksocknal_get_peer_info(lnet_ni_t *ni, int index, + lnet_process_id_t *id, __u32 *myip, __u32 *peer_ip, + int *port, int *conn_count, int *share_count) +{ + ksock_peer_t *peer; + struct list_head *ptmp; + ksock_route_t *route; + struct list_head *rtmp; + int i; + int j; + int rc = -ENOENT; + + read_lock(&ksocknal_data.ksnd_global_lock); + + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { + + list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) { + peer = list_entry(ptmp, ksock_peer_t, ksnp_list); + + if (peer->ksnp_ni != ni) + continue; + + if (peer->ksnp_n_passive_ips == 0 && + list_empty(&peer->ksnp_routes)) { + if (index-- > 0) + continue; + + *id = peer->ksnp_id; + *myip = 0; + *peer_ip = 0; + *port = 0; + *conn_count = 0; + *share_count = 0; + rc = 0; + goto out; + } + + for (j = 0; j < peer->ksnp_n_passive_ips; j++) { + if (index-- > 0) + continue; + + *id = peer->ksnp_id; + *myip = peer->ksnp_passive_ips[j]; + *peer_ip = 0; + *port = 0; + *conn_count = 0; + *share_count = 0; + rc = 0; + goto out; + } + + list_for_each(rtmp, &peer->ksnp_routes) { + if (index-- > 0) + continue; + + route = list_entry(rtmp, ksock_route_t, + ksnr_list); + + *id = peer->ksnp_id; + *myip = route->ksnr_myipaddr; + *peer_ip = route->ksnr_ipaddr; + *port = route->ksnr_port; + *conn_count = route->ksnr_conn_count; + *share_count = route->ksnr_share_count; + rc = 0; + goto out; + } + } + } + out: + read_unlock(&ksocknal_data.ksnd_global_lock); + return rc; +} + +static void +ksocknal_associate_route_conn_locked(ksock_route_t *route, ksock_conn_t *conn) +{ + ksock_peer_t *peer = route->ksnr_peer; + int type = conn->ksnc_type; + ksock_interface_t *iface; + + conn->ksnc_route = route; + ksocknal_route_addref(route); + + if 
(route->ksnr_myipaddr != conn->ksnc_myipaddr) { + if (route->ksnr_myipaddr == 0) { + /* route wasn't bound locally yet (the initial route) */ + CDEBUG(D_NET, "Binding %s %pI4h to %pI4h\n", + libcfs_id2str(peer->ksnp_id), + &route->ksnr_ipaddr, + &conn->ksnc_myipaddr); + } else { + CDEBUG(D_NET, "Rebinding %s %pI4h from %pI4h to %pI4h\n", + libcfs_id2str(peer->ksnp_id), + &route->ksnr_ipaddr, + &route->ksnr_myipaddr, + &conn->ksnc_myipaddr); + + iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni, + route->ksnr_myipaddr); + if (iface != NULL) + iface->ksni_nroutes--; + } + route->ksnr_myipaddr = conn->ksnc_myipaddr; + iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni, + route->ksnr_myipaddr); + if (iface != NULL) + iface->ksni_nroutes++; + } + + route->ksnr_connected |= (1 << type); + route->ksnr_conn_count++; + + /* Successful connection => further attempts can + * proceed immediately */ + route->ksnr_retry_interval = 0; +} + +static void +ksocknal_add_route_locked(ksock_peer_t *peer, ksock_route_t *route) +{ + struct list_head *tmp; + ksock_conn_t *conn; + ksock_route_t *route2; + + LASSERT(!peer->ksnp_closing); + LASSERT(route->ksnr_peer == NULL); + LASSERT(!route->ksnr_scheduled); + LASSERT(!route->ksnr_connecting); + LASSERT(route->ksnr_connected == 0); + + /* LASSERT(unique) */ + list_for_each(tmp, &peer->ksnp_routes) { + route2 = list_entry(tmp, ksock_route_t, ksnr_list); + + if (route2->ksnr_ipaddr == route->ksnr_ipaddr) { + CERROR("Duplicate route %s %pI4h\n", + libcfs_id2str(peer->ksnp_id), + &route->ksnr_ipaddr); + LBUG(); + } + } + + route->ksnr_peer = peer; + ksocknal_peer_addref(peer); + /* peer's routelist takes over my ref on 'route' */ + list_add_tail(&route->ksnr_list, &peer->ksnp_routes); + + list_for_each(tmp, &peer->ksnp_conns) { + conn = list_entry(tmp, ksock_conn_t, ksnc_list); + + if (conn->ksnc_ipaddr != route->ksnr_ipaddr) + continue; + + ksocknal_associate_route_conn_locked(route, conn); + /* keep going (typed routes) */ + } +} + +static void +ksocknal_del_route_locked(ksock_route_t *route) +{ + ksock_peer_t *peer = route->ksnr_peer; + ksock_interface_t *iface; + ksock_conn_t *conn; + struct list_head *ctmp; + struct list_head *cnxt; + + LASSERT(!route->ksnr_deleted); + + /* Close associated conns */ + list_for_each_safe(ctmp, cnxt, &peer->ksnp_conns) { + conn = list_entry(ctmp, ksock_conn_t, ksnc_list); + + if (conn->ksnc_route != route) + continue; + + ksocknal_close_conn_locked(conn, 0); + } + + if (route->ksnr_myipaddr != 0) { + iface = ksocknal_ip2iface(route->ksnr_peer->ksnp_ni, + route->ksnr_myipaddr); + if (iface != NULL) + iface->ksni_nroutes--; + } + + route->ksnr_deleted = 1; + list_del(&route->ksnr_list); + ksocknal_route_decref(route); /* drop peer's ref */ + + if (list_empty(&peer->ksnp_routes) && + list_empty(&peer->ksnp_conns)) { + /* I've just removed the last route to a peer with no active + * connections */ + ksocknal_unlink_peer_locked(peer); + } +} + +int +ksocknal_add_peer(lnet_ni_t *ni, lnet_process_id_t id, __u32 ipaddr, int port) +{ + struct list_head *tmp; + ksock_peer_t *peer; + ksock_peer_t *peer2; + ksock_route_t *route; + ksock_route_t *route2; + int rc; + + if (id.nid == LNET_NID_ANY || + id.pid == LNET_PID_ANY) + return -EINVAL; + + /* Have a brand new peer ready... 
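+ * (released again below if the id is already in the peer table)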
*/ + rc = ksocknal_create_peer(&peer, ni, id); + if (rc != 0) + return rc; + + route = ksocknal_create_route(ipaddr, port); + if (route == NULL) { + ksocknal_peer_decref(peer); + return -ENOMEM; + } + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + /* always called with a ref on ni, so shutdown can't have started */ + LASSERT(((ksock_net_t *) ni->ni_data)->ksnn_shutdown == 0); + + peer2 = ksocknal_find_peer_locked(ni, id); + if (peer2 != NULL) { + ksocknal_peer_decref(peer); + peer = peer2; + } else { + /* peer table takes my ref on peer */ + list_add_tail(&peer->ksnp_list, + ksocknal_nid2peerlist(id.nid)); + } + + route2 = NULL; + list_for_each(tmp, &peer->ksnp_routes) { + route2 = list_entry(tmp, ksock_route_t, ksnr_list); + + if (route2->ksnr_ipaddr == ipaddr) + break; + + route2 = NULL; + } + if (route2 == NULL) { + ksocknal_add_route_locked(peer, route); + route->ksnr_share_count++; + } else { + ksocknal_route_decref(route); + route2->ksnr_share_count++; + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + return 0; +} + +static void +ksocknal_del_peer_locked(ksock_peer_t *peer, __u32 ip) +{ + ksock_conn_t *conn; + ksock_route_t *route; + struct list_head *tmp; + struct list_head *nxt; + int nshared; + + LASSERT(!peer->ksnp_closing); + + /* Extra ref prevents peer disappearing until I'm done with it */ + ksocknal_peer_addref(peer); + + list_for_each_safe(tmp, nxt, &peer->ksnp_routes) { + route = list_entry(tmp, ksock_route_t, ksnr_list); + + /* no match */ + if (!(ip == 0 || route->ksnr_ipaddr == ip)) + continue; + + route->ksnr_share_count = 0; + /* This deletes associated conns too */ + ksocknal_del_route_locked(route); + } + + nshared = 0; + list_for_each_safe(tmp, nxt, &peer->ksnp_routes) { + route = list_entry(tmp, ksock_route_t, ksnr_list); + nshared += route->ksnr_share_count; + } + + if (nshared == 0) { + /* remove everything else if there are no explicit entries + * left */ + + list_for_each_safe(tmp, nxt, &peer->ksnp_routes) { + route = list_entry(tmp, ksock_route_t, ksnr_list); + + /* we should only be removing auto-entries */ + LASSERT(route->ksnr_share_count == 0); + ksocknal_del_route_locked(route); + } + + list_for_each_safe(tmp, nxt, &peer->ksnp_conns) { + conn = list_entry(tmp, ksock_conn_t, ksnc_list); + + ksocknal_close_conn_locked(conn, 0); + } + } + + ksocknal_peer_decref(peer); + /* NB peer unlinks itself when last conn/route is removed */ +} + +static int +ksocknal_del_peer(lnet_ni_t *ni, lnet_process_id_t id, __u32 ip) +{ + LIST_HEAD(zombies); + struct list_head *ptmp; + struct list_head *pnxt; + ksock_peer_t *peer; + int lo; + int hi; + int i; + int rc = -ENOENT; + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + if (id.nid != LNET_NID_ANY) + lo = hi = (int)(ksocknal_nid2peerlist(id.nid) - ksocknal_data.ksnd_peers); + else { + lo = 0; + hi = ksocknal_data.ksnd_peer_hash_size - 1; + } + + for (i = lo; i <= hi; i++) { + list_for_each_safe(ptmp, pnxt, + &ksocknal_data.ksnd_peers[i]) { + peer = list_entry(ptmp, ksock_peer_t, ksnp_list); + + if (peer->ksnp_ni != ni) + continue; + + if (!((id.nid == LNET_NID_ANY || peer->ksnp_id.nid == id.nid) && + (id.pid == LNET_PID_ANY || peer->ksnp_id.pid == id.pid))) + continue; + + ksocknal_peer_addref(peer); /* a ref for me... 
*/ + + ksocknal_del_peer_locked(peer, ip); + + if (peer->ksnp_closing && + !list_empty(&peer->ksnp_tx_queue)) { + LASSERT(list_empty(&peer->ksnp_conns)); + LASSERT(list_empty(&peer->ksnp_routes)); + + list_splice_init(&peer->ksnp_tx_queue, + &zombies); + } + + ksocknal_peer_decref(peer); /* ...till here */ + + rc = 0; /* matched! */ + } + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + ksocknal_txlist_done(ni, &zombies, 1); + + return rc; +} + +static ksock_conn_t * +ksocknal_get_conn_by_idx(lnet_ni_t *ni, int index) +{ + ksock_peer_t *peer; + struct list_head *ptmp; + ksock_conn_t *conn; + struct list_head *ctmp; + int i; + + read_lock(&ksocknal_data.ksnd_global_lock); + + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { + list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) { + peer = list_entry(ptmp, ksock_peer_t, ksnp_list); + + LASSERT(!peer->ksnp_closing); + + if (peer->ksnp_ni != ni) + continue; + + list_for_each(ctmp, &peer->ksnp_conns) { + if (index-- > 0) + continue; + + conn = list_entry(ctmp, ksock_conn_t, + ksnc_list); + ksocknal_conn_addref(conn); + read_unlock(&ksocknal_data.ksnd_global_lock); + return conn; + } + } + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + return NULL; +} + +static ksock_sched_t * +ksocknal_choose_scheduler_locked(unsigned int cpt) +{ + struct ksock_sched_info *info = ksocknal_data.ksnd_sched_info[cpt]; + ksock_sched_t *sched; + int i; + + LASSERT(info->ksi_nthreads > 0); + + sched = &info->ksi_scheds[0]; + /* + * NB: it's safe so far, but info->ksi_nthreads could be changed + * at runtime when we have dynamic LNet configuration, then we + * need to take care of this. + */ + for (i = 1; i < info->ksi_nthreads; i++) { + if (sched->kss_nconns > info->ksi_scheds[i].kss_nconns) + sched = &info->ksi_scheds[i]; + } + + return sched; +} + +static int +ksocknal_local_ipvec(lnet_ni_t *ni, __u32 *ipaddrs) +{ + ksock_net_t *net = ni->ni_data; + int i; + int nip; + + read_lock(&ksocknal_data.ksnd_global_lock); + + nip = net->ksnn_ninterfaces; + LASSERT(nip <= LNET_MAX_INTERFACES); + + /* Only offer interfaces for additional connections if I have + * more than one. */ + if (nip < 2) { + read_unlock(&ksocknal_data.ksnd_global_lock); + return 0; + } + + for (i = 0; i < nip; i++) { + ipaddrs[i] = net->ksnn_interfaces[i].ksni_ipaddr; + LASSERT(ipaddrs[i] != 0); + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + return nip; +} + +static int +ksocknal_match_peerip(ksock_interface_t *iface, __u32 *ips, int nips) +{ + int best_netmatch = 0; + int best_xor = 0; + int best = -1; + int this_xor; + int this_netmatch; + int i; + + for (i = 0; i < nips; i++) { + if (ips[i] == 0) + continue; + + this_xor = ips[i] ^ iface->ksni_ipaddr; + this_netmatch = ((this_xor & iface->ksni_netmask) == 0) ? 1 : 0; + + if (!(best < 0 || + best_netmatch < this_netmatch || + (best_netmatch == this_netmatch && + best_xor > this_xor))) + continue; + + best = i; + best_netmatch = this_netmatch; + best_xor = this_xor; + } + + LASSERT(best >= 0); + return best; +} + +static int +ksocknal_select_ips(ksock_peer_t *peer, __u32 *peerips, int n_peerips) +{ + rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; + ksock_net_t *net = peer->ksnp_ni->ni_data; + ksock_interface_t *iface; + ksock_interface_t *best_iface; + int n_ips; + int i; + int j; + int k; + __u32 ip; + __u32 xor; + int this_netmatch; + int best_netmatch; + int best_npeers; + + /* CAVEAT EMPTOR: We do all our interface matching with an + * exclusive hold of global lock at IRQ priority. 
We're only + * expecting to be dealing with small numbers of interfaces, so the + * O(n**3)-ness shouldn't matter */ + + /* Also note that I'm not going to return more than n_peerips + * interfaces, even if I have more myself */ + + write_lock_bh(global_lock); + + LASSERT(n_peerips <= LNET_MAX_INTERFACES); + LASSERT(net->ksnn_ninterfaces <= LNET_MAX_INTERFACES); + + /* Only match interfaces for additional connections + * if I have > 1 interface */ + n_ips = (net->ksnn_ninterfaces < 2) ? 0 : + min(n_peerips, net->ksnn_ninterfaces); + + for (i = 0; peer->ksnp_n_passive_ips < n_ips; i++) { + /* ^ yes really... */ + + /* If we have any new interfaces, first tick off all the + * peer IPs that match old interfaces, then choose new + * interfaces to match the remaining peer IPs. + * We don't forget interfaces we've stopped using; we might + * start using them again... */ + + if (i < peer->ksnp_n_passive_ips) { + /* Old interface. */ + ip = peer->ksnp_passive_ips[i]; + best_iface = ksocknal_ip2iface(peer->ksnp_ni, ip); + + } else { + /* choose a new interface */ + LASSERT(i == peer->ksnp_n_passive_ips); + + best_iface = NULL; + best_netmatch = 0; + best_npeers = 0; + + for (j = 0; j < net->ksnn_ninterfaces; j++) { + iface = &net->ksnn_interfaces[j]; + ip = iface->ksni_ipaddr; + + for (k = 0; k < peer->ksnp_n_passive_ips; k++) + if (peer->ksnp_passive_ips[k] == ip) + break; + + if (k < peer->ksnp_n_passive_ips) /* using it already */ + continue; + + k = ksocknal_match_peerip(iface, peerips, n_peerips); + xor = ip ^ peerips[k]; + this_netmatch = ((xor & iface->ksni_netmask) == 0) ? 1 : 0; + + if (!(best_iface == NULL || + best_netmatch < this_netmatch || + (best_netmatch == this_netmatch && + best_npeers > iface->ksni_npeers))) + continue; + + best_iface = iface; + best_netmatch = this_netmatch; + best_npeers = iface->ksni_npeers; + } + + best_iface->ksni_npeers++; + ip = best_iface->ksni_ipaddr; + peer->ksnp_passive_ips[i] = ip; + peer->ksnp_n_passive_ips = i+1; + } + + /* mark the best matching peer IP used */ + j = ksocknal_match_peerip(best_iface, peerips, n_peerips); + peerips[j] = 0; + } + + /* Overwrite input peer IP addresses */ + memcpy(peerips, peer->ksnp_passive_ips, n_ips * sizeof(*peerips)); + + write_unlock_bh(global_lock); + + return n_ips; +} + +static void +ksocknal_create_routes(ksock_peer_t *peer, int port, + __u32 *peer_ipaddrs, int npeer_ipaddrs) +{ + ksock_route_t *newroute = NULL; + rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; + lnet_ni_t *ni = peer->ksnp_ni; + ksock_net_t *net = ni->ni_data; + struct list_head *rtmp; + ksock_route_t *route; + ksock_interface_t *iface; + ksock_interface_t *best_iface; + int best_netmatch; + int this_netmatch; + int best_nroutes; + int i; + int j; + + /* CAVEAT EMPTOR: We do all our interface matching with an + * exclusive hold of global lock at IRQ priority. 
We're only + * expecting to be dealing with small numbers of interfaces, so the + * O(n**3)-ness here shouldn't matter */ + + write_lock_bh(global_lock); + + if (net->ksnn_ninterfaces < 2) { + /* Only create additional connections + * if I have > 1 interface */ + write_unlock_bh(global_lock); + return; + } + + LASSERT(npeer_ipaddrs <= LNET_MAX_INTERFACES); + + for (i = 0; i < npeer_ipaddrs; i++) { + if (newroute != NULL) { + newroute->ksnr_ipaddr = peer_ipaddrs[i]; + } else { + write_unlock_bh(global_lock); + + newroute = ksocknal_create_route(peer_ipaddrs[i], port); + if (newroute == NULL) + return; + + write_lock_bh(global_lock); + } + + if (peer->ksnp_closing) { + /* peer got closed under me */ + break; + } + + /* Already got a route? */ + route = NULL; + list_for_each(rtmp, &peer->ksnp_routes) { + route = list_entry(rtmp, ksock_route_t, ksnr_list); + + if (route->ksnr_ipaddr == newroute->ksnr_ipaddr) + break; + + route = NULL; + } + if (route != NULL) + continue; + + best_iface = NULL; + best_nroutes = 0; + best_netmatch = 0; + + LASSERT(net->ksnn_ninterfaces <= LNET_MAX_INTERFACES); + + /* Select interface to connect from */ + for (j = 0; j < net->ksnn_ninterfaces; j++) { + iface = &net->ksnn_interfaces[j]; + + /* Using this interface already? */ + list_for_each(rtmp, &peer->ksnp_routes) { + route = list_entry(rtmp, ksock_route_t, + ksnr_list); + + if (route->ksnr_myipaddr == iface->ksni_ipaddr) + break; + + route = NULL; + } + if (route != NULL) + continue; + + this_netmatch = (((iface->ksni_ipaddr ^ + newroute->ksnr_ipaddr) & + iface->ksni_netmask) == 0) ? 1 : 0; + + if (!(best_iface == NULL || + best_netmatch < this_netmatch || + (best_netmatch == this_netmatch && + best_nroutes > iface->ksni_nroutes))) + continue; + + best_iface = iface; + best_netmatch = this_netmatch; + best_nroutes = iface->ksni_nroutes; + } + + if (best_iface == NULL) + continue; + + newroute->ksnr_myipaddr = best_iface->ksni_ipaddr; + best_iface->ksni_nroutes++; + + ksocknal_add_route_locked(peer, newroute); + newroute = NULL; + } + + write_unlock_bh(global_lock); + if (newroute != NULL) + ksocknal_route_decref(newroute); +} + +int +ksocknal_accept(lnet_ni_t *ni, struct socket *sock) +{ + ksock_connreq_t *cr; + int rc; + __u32 peer_ip; + int peer_port; + + rc = libcfs_sock_getaddr(sock, 1, &peer_ip, &peer_port); + LASSERT(rc == 0); /* we succeeded before */ + + LIBCFS_ALLOC(cr, sizeof(*cr)); + if (cr == NULL) { + LCONSOLE_ERROR_MSG(0x12f, "Dropping connection request from %pI4h: memory exhausted\n", + &peer_ip); + return -ENOMEM; + } + + lnet_ni_addref(ni); + cr->ksncr_ni = ni; + cr->ksncr_sock = sock; + + spin_lock_bh(&ksocknal_data.ksnd_connd_lock); + + list_add_tail(&cr->ksncr_list, &ksocknal_data.ksnd_connd_connreqs); + wake_up(&ksocknal_data.ksnd_connd_waitq); + + spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); + return 0; +} + +static int +ksocknal_connecting(ksock_peer_t *peer, __u32 ipaddr) +{ + ksock_route_t *route; + + list_for_each_entry(route, &peer->ksnp_routes, ksnr_list) { + + if (route->ksnr_ipaddr == ipaddr) + return route->ksnr_connecting; + } + return 0; +} + +int +ksocknal_create_conn(lnet_ni_t *ni, ksock_route_t *route, + struct socket *sock, int type) +{ + rwlock_t *global_lock = &ksocknal_data.ksnd_global_lock; + LIST_HEAD(zombies); + lnet_process_id_t peerid; + struct list_head *tmp; + __u64 incarnation; + ksock_conn_t *conn; + ksock_conn_t *conn2; + ksock_peer_t *peer = NULL; + ksock_peer_t *peer2; + ksock_sched_t *sched; + ksock_hello_msg_t *hello; + int cpt; + ksock_tx_t *tx; + 
ksock_tx_t *txtmp; + int rc; + int active; + char *warn = NULL; + + active = (route != NULL); + + LASSERT(active == (type != SOCKLND_CONN_NONE)); + + LIBCFS_ALLOC(conn, sizeof(*conn)); + if (conn == NULL) { + rc = -ENOMEM; + goto failed_0; + } + + conn->ksnc_peer = NULL; + conn->ksnc_route = NULL; + conn->ksnc_sock = sock; + /* 2 ref, 1 for conn, another extra ref prevents socket + * being closed before establishment of connection */ + atomic_set(&conn->ksnc_sock_refcount, 2); + conn->ksnc_type = type; + ksocknal_lib_save_callback(sock, conn); + atomic_set(&conn->ksnc_conn_refcount, 1); /* 1 ref for me */ + + conn->ksnc_rx_ready = 0; + conn->ksnc_rx_scheduled = 0; + + INIT_LIST_HEAD(&conn->ksnc_tx_queue); + conn->ksnc_tx_ready = 0; + conn->ksnc_tx_scheduled = 0; + conn->ksnc_tx_carrier = NULL; + atomic_set(&conn->ksnc_tx_nob, 0); + + LIBCFS_ALLOC(hello, offsetof(ksock_hello_msg_t, + kshm_ips[LNET_MAX_INTERFACES])); + if (hello == NULL) { + rc = -ENOMEM; + goto failed_1; + } + + /* stash conn's local and remote addrs */ + rc = ksocknal_lib_get_conn_addrs(conn); + if (rc != 0) + goto failed_1; + + /* Find out/confirm peer's NID and connection type and get the + * vector of interfaces she's willing to let me connect to. + * Passive connections use the listener timeout since the peer sends + * eagerly */ + + if (active) { + peer = route->ksnr_peer; + LASSERT(ni == peer->ksnp_ni); + + /* Active connection sends HELLO eagerly */ + hello->kshm_nips = ksocknal_local_ipvec(ni, hello->kshm_ips); + peerid = peer->ksnp_id; + + write_lock_bh(global_lock); + conn->ksnc_proto = peer->ksnp_proto; + write_unlock_bh(global_lock); + + if (conn->ksnc_proto == NULL) { + conn->ksnc_proto = &ksocknal_protocol_v3x; +#if SOCKNAL_VERSION_DEBUG + if (*ksocknal_tunables.ksnd_protocol == 2) + conn->ksnc_proto = &ksocknal_protocol_v2x; + else if (*ksocknal_tunables.ksnd_protocol == 1) + conn->ksnc_proto = &ksocknal_protocol_v1x; +#endif + } + + rc = ksocknal_send_hello(ni, conn, peerid.nid, hello); + if (rc != 0) + goto failed_1; + } else { + peerid.nid = LNET_NID_ANY; + peerid.pid = LNET_PID_ANY; + + /* Passive, get protocol from peer */ + conn->ksnc_proto = NULL; + } + + rc = ksocknal_recv_hello(ni, conn, hello, &peerid, &incarnation); + if (rc < 0) + goto failed_1; + + LASSERT(rc == 0 || active); + LASSERT(conn->ksnc_proto != NULL); + LASSERT(peerid.nid != LNET_NID_ANY); + + cpt = lnet_cpt_of_nid(peerid.nid); + + if (active) { + ksocknal_peer_addref(peer); + write_lock_bh(global_lock); + } else { + rc = ksocknal_create_peer(&peer, ni, peerid); + if (rc != 0) + goto failed_1; + + write_lock_bh(global_lock); + + /* called with a ref on ni, so shutdown can't have started */ + LASSERT(((ksock_net_t *) ni->ni_data)->ksnn_shutdown == 0); + + peer2 = ksocknal_find_peer_locked(ni, peerid); + if (peer2 == NULL) { + /* NB this puts an "empty" peer in the peer + * table (which takes my ref) */ + list_add_tail(&peer->ksnp_list, + ksocknal_nid2peerlist(peerid.nid)); + } else { + ksocknal_peer_decref(peer); + peer = peer2; + } + + /* +1 ref for me */ + ksocknal_peer_addref(peer); + peer->ksnp_accepting++; + + /* Am I already connecting to this guy? Resolve in + * favour of higher NID... 
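+ * (an incoming attempt from the lower NID is dropped with EALREADY while this node's own active connect is in flight)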
*/ + if (peerid.nid < ni->ni_nid && + ksocknal_connecting(peer, conn->ksnc_ipaddr)) { + rc = EALREADY; + warn = "connection race resolution"; + goto failed_2; + } + } + + if (peer->ksnp_closing || + (active && route->ksnr_deleted)) { + /* peer/route got closed under me */ + rc = -ESTALE; + warn = "peer/route removed"; + goto failed_2; + } + + if (peer->ksnp_proto == NULL) { + /* Never connected before. + * NB recv_hello may have returned EPROTO to signal my peer + * wants a different protocol than the one I asked for. + */ + LASSERT(list_empty(&peer->ksnp_conns)); + + peer->ksnp_proto = conn->ksnc_proto; + peer->ksnp_incarnation = incarnation; + } + + if (peer->ksnp_proto != conn->ksnc_proto || + peer->ksnp_incarnation != incarnation) { + /* Peer rebooted or I've got the wrong protocol version */ + ksocknal_close_peer_conns_locked(peer, 0, 0); + + peer->ksnp_proto = NULL; + rc = ESTALE; + warn = peer->ksnp_incarnation != incarnation ? + "peer rebooted" : + "wrong proto version"; + goto failed_2; + } + + switch (rc) { + default: + LBUG(); + case 0: + break; + case EALREADY: + warn = "lost conn race"; + goto failed_2; + case EPROTO: + warn = "retry with different protocol version"; + goto failed_2; + } + + /* Refuse to duplicate an existing connection, unless this is a + * loopback connection */ + if (conn->ksnc_ipaddr != conn->ksnc_myipaddr) { + list_for_each(tmp, &peer->ksnp_conns) { + conn2 = list_entry(tmp, ksock_conn_t, ksnc_list); + + if (conn2->ksnc_ipaddr != conn->ksnc_ipaddr || + conn2->ksnc_myipaddr != conn->ksnc_myipaddr || + conn2->ksnc_type != conn->ksnc_type) + continue; + + /* Reply on a passive connection attempt so the peer + * realises we're connected. */ + LASSERT(rc == 0); + if (!active) + rc = EALREADY; + + warn = "duplicate"; + goto failed_2; + } + } + + /* If the connection created by this route didn't bind to the IP + * address the route connected to, the connection/route matching + * code below probably isn't going to work. */ + if (active && + route->ksnr_ipaddr != conn->ksnc_ipaddr) { + CERROR("Route %s %pI4h connected to %pI4h\n", + libcfs_id2str(peer->ksnp_id), + &route->ksnr_ipaddr, + &conn->ksnc_ipaddr); + } + + /* Search for a route corresponding to the new connection and + * create an association. This allows incoming connections created + * by routes in my peer to match my own route entries so I don't + * continually create duplicate routes. */ + list_for_each(tmp, &peer->ksnp_routes) { + route = list_entry(tmp, ksock_route_t, ksnr_list); + + if (route->ksnr_ipaddr != conn->ksnc_ipaddr) + continue; + + ksocknal_associate_route_conn_locked(route, conn); + break; + } + + conn->ksnc_peer = peer; /* conn takes my ref on peer */ + peer->ksnp_last_alive = cfs_time_current(); + peer->ksnp_send_keepalive = 0; + peer->ksnp_error = 0; + + sched = ksocknal_choose_scheduler_locked(cpt); + sched->kss_nconns++; + conn->ksnc_scheduler = sched; + + conn->ksnc_tx_last_post = cfs_time_current(); + /* Set the deadline for the outgoing HELLO to drain */ + conn->ksnc_tx_bufnob = sock->sk->sk_wmem_queued; + conn->ksnc_tx_deadline = cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + mb(); /* order with adding to peer's conn list */ + + list_add(&conn->ksnc_list, &peer->ksnp_conns); + ksocknal_conn_addref(conn); + + ksocknal_new_packet(conn, 0); + + conn->ksnc_zc_capable = ksocknal_lib_zc_capable(conn); + + /* Take packets blocking for this connection. 
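+ * (only those the conn's protocol matcher accepts; non-matching TXs stay on the peer's queue)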
*/ + list_for_each_entry_safe(tx, txtmp, &peer->ksnp_tx_queue, tx_list) { + if (conn->ksnc_proto->pro_match_tx(conn, tx, tx->tx_nonblk) == SOCKNAL_MATCH_NO) + continue; + + list_del(&tx->tx_list); + ksocknal_queue_tx_locked(tx, conn); + } + + write_unlock_bh(global_lock); + + /* We've now got a new connection. Any errors from here on are just + * like "normal" comms errors and we close the connection normally. + * NB (a) we still have to send the reply HELLO for passive + * connections, + * (b) normal I/O on the conn is blocked until I setup and call the + * socket callbacks. + */ + + CDEBUG(D_NET, "New conn %s p %d.x %pI4h -> %pI4h/%d incarnation:%lld sched[%d:%d]\n", + libcfs_id2str(peerid), conn->ksnc_proto->pro_version, + &conn->ksnc_myipaddr, &conn->ksnc_ipaddr, + conn->ksnc_port, incarnation, cpt, + (int)(sched - &sched->kss_info->ksi_scheds[0])); + + if (active) { + /* additional routes after interface exchange? */ + ksocknal_create_routes(peer, conn->ksnc_port, + hello->kshm_ips, hello->kshm_nips); + } else { + hello->kshm_nips = ksocknal_select_ips(peer, hello->kshm_ips, + hello->kshm_nips); + rc = ksocknal_send_hello(ni, conn, peerid.nid, hello); + } + + LIBCFS_FREE(hello, offsetof(ksock_hello_msg_t, + kshm_ips[LNET_MAX_INTERFACES])); + + /* setup the socket AFTER I've received hello (it disables + * SO_LINGER). I might call back to the acceptor who may want + * to send a protocol version response and then close the + * socket; this ensures the socket only tears down after the + * response has been sent. */ + if (rc == 0) + rc = ksocknal_lib_setup_sock(sock); + + write_lock_bh(global_lock); + + /* NB my callbacks block while I hold ksnd_global_lock */ + ksocknal_lib_set_callback(sock, conn); + + if (!active) + peer->ksnp_accepting--; + + write_unlock_bh(global_lock); + + if (rc != 0) { + write_lock_bh(global_lock); + if (!conn->ksnc_closing) { + /* could be closed by another thread */ + ksocknal_close_conn_locked(conn, rc); + } + write_unlock_bh(global_lock); + } else if (ksocknal_connsock_addref(conn) == 0) { + /* Allow I/O to proceed. 
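+ * (simulate read/write events in case data arrived before the callbacks were installed)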
*/ + ksocknal_read_callback(conn); + ksocknal_write_callback(conn); + ksocknal_connsock_decref(conn); + } + + ksocknal_connsock_decref(conn); + ksocknal_conn_decref(conn); + return rc; + + failed_2: + if (!peer->ksnp_closing && + list_empty(&peer->ksnp_conns) && + list_empty(&peer->ksnp_routes)) { + list_add(&zombies, &peer->ksnp_tx_queue); + list_del_init(&peer->ksnp_tx_queue); + ksocknal_unlink_peer_locked(peer); + } + + write_unlock_bh(global_lock); + + if (warn != NULL) { + if (rc < 0) + CERROR("Not creating conn %s type %d: %s\n", + libcfs_id2str(peerid), conn->ksnc_type, warn); + else + CDEBUG(D_NET, "Not creating conn %s type %d: %s\n", + libcfs_id2str(peerid), conn->ksnc_type, warn); + } + + if (!active) { + if (rc > 0) { + /* Request retry by replying with CONN_NONE + * ksnc_proto has been set already */ + conn->ksnc_type = SOCKLND_CONN_NONE; + hello->kshm_nips = 0; + ksocknal_send_hello(ni, conn, peerid.nid, hello); + } + + write_lock_bh(global_lock); + peer->ksnp_accepting--; + write_unlock_bh(global_lock); + } + + ksocknal_txlist_done(ni, &zombies, 1); + ksocknal_peer_decref(peer); + + failed_1: + if (hello != NULL) + LIBCFS_FREE(hello, offsetof(ksock_hello_msg_t, + kshm_ips[LNET_MAX_INTERFACES])); + + LIBCFS_FREE(conn, sizeof(*conn)); + + failed_0: + libcfs_sock_release(sock); + return rc; +} + +void +ksocknal_close_conn_locked(ksock_conn_t *conn, int error) +{ + /* This just does the immediate housekeeping, and queues the + * connection for the reaper to terminate. + * Caller holds ksnd_global_lock exclusively in irq context */ + ksock_peer_t *peer = conn->ksnc_peer; + ksock_route_t *route; + ksock_conn_t *conn2; + struct list_head *tmp; + + LASSERT(peer->ksnp_error == 0); + LASSERT(!conn->ksnc_closing); + conn->ksnc_closing = 1; + + /* ksnd_deathrow_conns takes over peer's ref */ + list_del(&conn->ksnc_list); + + route = conn->ksnc_route; + if (route != NULL) { + /* dissociate conn from route... 
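+ * (the route keeps its connected bit for this type only while a sibling conn of the same type remains)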
*/ + LASSERT(!route->ksnr_deleted); + LASSERT((route->ksnr_connected & (1 << conn->ksnc_type)) != 0); + + conn2 = NULL; + list_for_each(tmp, &peer->ksnp_conns) { + conn2 = list_entry(tmp, ksock_conn_t, ksnc_list); + + if (conn2->ksnc_route == route && + conn2->ksnc_type == conn->ksnc_type) + break; + + conn2 = NULL; + } + if (conn2 == NULL) + route->ksnr_connected &= ~(1 << conn->ksnc_type); + + conn->ksnc_route = NULL; + +#if 0 /* irrelevant with only eager routes */ + /* make route least favourite */ + list_del(&route->ksnr_list); + list_add_tail(&route->ksnr_list, &peer->ksnp_routes); +#endif + ksocknal_route_decref(route); /* drop conn's ref on route */ + } + + if (list_empty(&peer->ksnp_conns)) { + /* No more connections to this peer */ + + if (!list_empty(&peer->ksnp_tx_queue)) { + ksock_tx_t *tx; + + LASSERT(conn->ksnc_proto == &ksocknal_protocol_v3x); + + /* throw them to the last connection..., + * these TXs will be sent to /dev/null by scheduler */ + list_for_each_entry(tx, &peer->ksnp_tx_queue, + tx_list) + ksocknal_tx_prep(conn, tx); + + spin_lock_bh(&conn->ksnc_scheduler->kss_lock); + list_splice_init(&peer->ksnp_tx_queue, + &conn->ksnc_tx_queue); + spin_unlock_bh(&conn->ksnc_scheduler->kss_lock); + } + + peer->ksnp_proto = NULL; /* renegotiate protocol version */ + peer->ksnp_error = error; /* stash last conn close reason */ + + if (list_empty(&peer->ksnp_routes)) { + /* I've just closed last conn belonging to a + * peer with no routes to it */ + ksocknal_unlink_peer_locked(peer); + } + } + + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + + list_add_tail(&conn->ksnc_list, + &ksocknal_data.ksnd_deathrow_conns); + wake_up(&ksocknal_data.ksnd_reaper_waitq); + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); +} + +void +ksocknal_peer_failed(ksock_peer_t *peer) +{ + int notify = 0; + unsigned long last_alive = 0; + + /* There has been a connection failure or comms error; but I'll only + * tell LNET I think the peer is dead if it's to another kernel and + * there are no connections or connection attempts in existence. */ + + read_lock(&ksocknal_data.ksnd_global_lock); + + if ((peer->ksnp_id.pid & LNET_PID_USERFLAG) == 0 && + list_empty(&peer->ksnp_conns) && + peer->ksnp_accepting == 0 && + ksocknal_find_connecting_route_locked(peer) == NULL) { + notify = 1; + last_alive = peer->ksnp_last_alive; + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + + if (notify) + lnet_notify(peer->ksnp_ni, peer->ksnp_id.nid, 0, + last_alive); +} + +void +ksocknal_finalize_zcreq(ksock_conn_t *conn) +{ + ksock_peer_t *peer = conn->ksnc_peer; + ksock_tx_t *tx; + ksock_tx_t *tmp; + LIST_HEAD(zlist); + + /* NB safe to finalize TXs because closing of socket will + * abort all buffered data */ + LASSERT(conn->ksnc_sock == NULL); + + spin_lock(&peer->ksnp_lock); + + list_for_each_entry_safe(tx, tmp, &peer->ksnp_zc_req_list, tx_zc_list) { + if (tx->tx_conn != conn) + continue; + + LASSERT(tx->tx_msg.ksm_zc_cookies[0] != 0); + + tx->tx_msg.ksm_zc_cookies[0] = 0; + tx->tx_zc_aborted = 1; /* mark it as not-acked */ + list_del(&tx->tx_zc_list); + list_add(&tx->tx_zc_list, &zlist); + } + + spin_unlock(&peer->ksnp_lock); + + while (!list_empty(&zlist)) { + tx = list_entry(zlist.next, ksock_tx_t, tx_zc_list); + + list_del(&tx->tx_zc_list); + ksocknal_tx_decref(tx); + } +} + +void +ksocknal_terminate_conn(ksock_conn_t *conn) +{ + /* This gets called by the reaper (guaranteed thread context) to + * disengage the socket from its callbacks and close it. 
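+ * Queued transmits are first flushed to the scheduler so they drain to /dev/null.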
+ * ksnc_refcount will eventually hit zero, and then the reaper will + * destroy it. */ + ksock_peer_t *peer = conn->ksnc_peer; + ksock_sched_t *sched = conn->ksnc_scheduler; + int failed = 0; + + LASSERT(conn->ksnc_closing); + + /* wake up the scheduler to "send" all remaining packets to /dev/null */ + spin_lock_bh(&sched->kss_lock); + + /* a closing conn is always ready to tx */ + conn->ksnc_tx_ready = 1; + + if (!conn->ksnc_tx_scheduled && + !list_empty(&conn->ksnc_tx_queue)) { + list_add_tail(&conn->ksnc_tx_list, + &sched->kss_tx_conns); + conn->ksnc_tx_scheduled = 1; + /* extra ref for scheduler */ + ksocknal_conn_addref(conn); + + wake_up(&sched->kss_waitq); + } + + spin_unlock_bh(&sched->kss_lock); + + /* serialise with callbacks */ + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + ksocknal_lib_reset_callback(conn->ksnc_sock, conn); + + /* OK, so this conn may not be completely disengaged from its + * scheduler yet, but it _has_ committed to terminate... */ + conn->ksnc_scheduler->kss_nconns--; + + if (peer->ksnp_error != 0) { + /* peer's last conn closed in error */ + LASSERT(list_empty(&peer->ksnp_conns)); + failed = 1; + peer->ksnp_error = 0; /* avoid multiple notifications */ + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + if (failed) + ksocknal_peer_failed(peer); + + /* The socket is closed on the final put; either here, or in + * ksocknal_{send,recv}msg(). Since we set up the linger2 option + * when the connection was established, this will close the socket + * immediately, aborting anything buffered in it. Any hung + * zero-copy transmits will therefore complete in finite time. */ + ksocknal_connsock_decref(conn); +} + +void +ksocknal_queue_zombie_conn(ksock_conn_t *conn) +{ + /* Queue the conn for the reaper to destroy */ + + LASSERT(atomic_read(&conn->ksnc_conn_refcount) == 0); + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + + list_add_tail(&conn->ksnc_list, &ksocknal_data.ksnd_zombie_conns); + wake_up(&ksocknal_data.ksnd_reaper_waitq); + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); +} + +void +ksocknal_destroy_conn(ksock_conn_t *conn) +{ + unsigned long last_rcv; + + /* Final coup-de-grace of the reaper */ + CDEBUG(D_NET, "connection %p\n", conn); + + LASSERT(atomic_read(&conn->ksnc_conn_refcount) == 0); + LASSERT(atomic_read(&conn->ksnc_sock_refcount) == 0); + LASSERT(conn->ksnc_sock == NULL); + LASSERT(conn->ksnc_route == NULL); + LASSERT(!conn->ksnc_tx_scheduled); + LASSERT(!conn->ksnc_rx_scheduled); + LASSERT(list_empty(&conn->ksnc_tx_queue)); + + /* complete current receive if any */ + switch (conn->ksnc_rx_state) { + case SOCKNAL_RX_LNET_PAYLOAD: + last_rcv = conn->ksnc_rx_deadline - + cfs_time_seconds(*ksocknal_tunables.ksnd_timeout); + CERROR("Completing partial receive from %s[%d], ip %pI4h:%d, with error, wanted: %d, left: %d, last alive is %ld secs ago\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), conn->ksnc_type, + &conn->ksnc_ipaddr, conn->ksnc_port, + conn->ksnc_rx_nob_wanted, conn->ksnc_rx_nob_left, + cfs_duration_sec(cfs_time_sub(cfs_time_current(), + last_rcv))); + lnet_finalize(conn->ksnc_peer->ksnp_ni, + conn->ksnc_cookie, -EIO); + break; + case SOCKNAL_RX_LNET_HEADER: + if (conn->ksnc_rx_started) + CERROR("Incomplete receive of lnet header from %s, ip %pI4h:%d, with error, protocol: %d.x.\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + &conn->ksnc_ipaddr, conn->ksnc_port, + conn->ksnc_proto->pro_version); + break; + case SOCKNAL_RX_KSM_HEADER: + if (conn->ksnc_rx_started) + CERROR("Incomplete receive of ksock message from 
%s, ip %pI4h:%d, with error, protocol: %d.x.\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + &conn->ksnc_ipaddr, conn->ksnc_port, + conn->ksnc_proto->pro_version); + break; + case SOCKNAL_RX_SLOP: + if (conn->ksnc_rx_started) + CERROR("Incomplete receive of slops from %s, ip %pI4h:%d, with error\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + &conn->ksnc_ipaddr, conn->ksnc_port); + break; + default: + LBUG(); + break; + } + + ksocknal_peer_decref(conn->ksnc_peer); + + LIBCFS_FREE(conn, sizeof(*conn)); +} + +int +ksocknal_close_peer_conns_locked(ksock_peer_t *peer, __u32 ipaddr, int why) +{ + ksock_conn_t *conn; + struct list_head *ctmp; + struct list_head *cnxt; + int count = 0; + + list_for_each_safe(ctmp, cnxt, &peer->ksnp_conns) { + conn = list_entry(ctmp, ksock_conn_t, ksnc_list); + + if (ipaddr == 0 || + conn->ksnc_ipaddr == ipaddr) { + count++; + ksocknal_close_conn_locked(conn, why); + } + } + + return count; +} + +int +ksocknal_close_conn_and_siblings(ksock_conn_t *conn, int why) +{ + ksock_peer_t *peer = conn->ksnc_peer; + __u32 ipaddr = conn->ksnc_ipaddr; + int count; + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + count = ksocknal_close_peer_conns_locked(peer, ipaddr, why); + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + return count; +} + +int +ksocknal_close_matching_conns(lnet_process_id_t id, __u32 ipaddr) +{ + ksock_peer_t *peer; + struct list_head *ptmp; + struct list_head *pnxt; + int lo; + int hi; + int i; + int count = 0; + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + if (id.nid != LNET_NID_ANY) + lo = hi = (int)(ksocknal_nid2peerlist(id.nid) - ksocknal_data.ksnd_peers); + else { + lo = 0; + hi = ksocknal_data.ksnd_peer_hash_size - 1; + } + + for (i = lo; i <= hi; i++) { + list_for_each_safe(ptmp, pnxt, + &ksocknal_data.ksnd_peers[i]) { + + peer = list_entry(ptmp, ksock_peer_t, ksnp_list); + + if (!((id.nid == LNET_NID_ANY || id.nid == peer->ksnp_id.nid) && + (id.pid == LNET_PID_ANY || id.pid == peer->ksnp_id.pid))) + continue; + + count += ksocknal_close_peer_conns_locked(peer, ipaddr, 0); + } + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + /* wildcards always succeed */ + if (id.nid == LNET_NID_ANY || id.pid == LNET_PID_ANY || ipaddr == 0) + return 0; + + if (count == 0) + return -ENOENT; + else + return 0; +} + +void +ksocknal_notify(lnet_ni_t *ni, lnet_nid_t gw_nid, int alive) +{ + /* The router is telling me she's been notified of a change in + * gateway state.... */ + lnet_process_id_t id = {0}; + + id.nid = gw_nid; + id.pid = LNET_PID_ANY; + + CDEBUG(D_NET, "gw %s %s\n", libcfs_nid2str(gw_nid), + alive ? "up" : "down"); + + if (!alive) { + /* If the gateway crashed, close all open connections... */ + ksocknal_close_matching_conns(id, 0); + return; + } + + /* ...otherwise do nothing. We can only establish new connections + * if we have autoroutes, and these connect on demand. 
*/ +} + +void +ksocknal_query(lnet_ni_t *ni, lnet_nid_t nid, unsigned long *when) +{ + int connect = 1; + unsigned long last_alive = 0; + unsigned long now = cfs_time_current(); + ksock_peer_t *peer = NULL; + rwlock_t *glock = &ksocknal_data.ksnd_global_lock; + lnet_process_id_t id = {.nid = nid, .pid = LUSTRE_SRV_LNET_PID}; + + read_lock(glock); + + peer = ksocknal_find_peer_locked(ni, id); + if (peer != NULL) { + struct list_head *tmp; + ksock_conn_t *conn; + int bufnob; + + list_for_each(tmp, &peer->ksnp_conns) { + conn = list_entry(tmp, ksock_conn_t, ksnc_list); + bufnob = conn->ksnc_sock->sk->sk_wmem_queued; + + if (bufnob < conn->ksnc_tx_bufnob) { + /* something got ACKed */ + conn->ksnc_tx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + peer->ksnp_last_alive = now; + conn->ksnc_tx_bufnob = bufnob; + } + } + + last_alive = peer->ksnp_last_alive; + if (ksocknal_find_connectable_route_locked(peer) == NULL) + connect = 0; + } + + read_unlock(glock); + + if (last_alive != 0) + *when = last_alive; + + CDEBUG(D_NET, "Peer %s %p, alive %ld secs ago, connect %d\n", + libcfs_nid2str(nid), peer, + last_alive ? cfs_duration_sec(now - last_alive) : -1, + connect); + + if (!connect) + return; + + ksocknal_add_peer(ni, id, LNET_NIDADDR(nid), lnet_acceptor_port()); + + write_lock_bh(glock); + + peer = ksocknal_find_peer_locked(ni, id); + if (peer != NULL) + ksocknal_launch_all_connections_locked(peer); + + write_unlock_bh(glock); + return; +} + +static void +ksocknal_push_peer(ksock_peer_t *peer) +{ + int index; + int i; + struct list_head *tmp; + ksock_conn_t *conn; + + for (index = 0; ; index++) { + read_lock(&ksocknal_data.ksnd_global_lock); + + i = 0; + conn = NULL; + + list_for_each(tmp, &peer->ksnp_conns) { + if (i++ == index) { + conn = list_entry(tmp, ksock_conn_t, + ksnc_list); + ksocknal_conn_addref(conn); + break; + } + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + + if (conn == NULL) + break; + + ksocknal_lib_push_conn(conn); + ksocknal_conn_decref(conn); + } +} + +static int +ksocknal_push(lnet_ni_t *ni, lnet_process_id_t id) +{ + ksock_peer_t *peer; + struct list_head *tmp; + int index; + int i; + int j; + int rc = -ENOENT; + + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { + for (j = 0; ; j++) { + read_lock(&ksocknal_data.ksnd_global_lock); + + index = 0; + peer = NULL; + + list_for_each(tmp, &ksocknal_data.ksnd_peers[i]) { + peer = list_entry(tmp, ksock_peer_t, + ksnp_list); + + if (!((id.nid == LNET_NID_ANY || + id.nid == peer->ksnp_id.nid) && + (id.pid == LNET_PID_ANY || + id.pid == peer->ksnp_id.pid))) { + peer = NULL; + continue; + } + + if (index++ == j) { + ksocknal_peer_addref(peer); + break; + } + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + + if (peer != NULL) { + rc = 0; + ksocknal_push_peer(peer); + ksocknal_peer_decref(peer); + } + } + + } + + return rc; +} + +static int +ksocknal_add_interface(lnet_ni_t *ni, __u32 ipaddress, __u32 netmask) +{ + ksock_net_t *net = ni->ni_data; + ksock_interface_t *iface; + int rc; + int i; + int j; + struct list_head *ptmp; + ksock_peer_t *peer; + struct list_head *rtmp; + ksock_route_t *route; + + if (ipaddress == 0 || + netmask == 0) + return -EINVAL; + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + iface = ksocknal_ip2iface(ni, ipaddress); + if (iface != NULL) { + /* silently ignore dups */ + rc = 0; + } else if (net->ksnn_ninterfaces == LNET_MAX_INTERFACES) { + rc = -ENOSPC; + } else { + iface = &net->ksnn_interfaces[net->ksnn_ninterfaces++]; + + iface->ksni_ipaddr = ipaddress; + 
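+ /* fill in the rest of the new slot; its peer/route counts are recomputed below */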
iface->ksni_netmask = netmask; + iface->ksni_nroutes = 0; + iface->ksni_npeers = 0; + + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { + list_for_each(ptmp, &ksocknal_data.ksnd_peers[i]) { + peer = list_entry(ptmp, ksock_peer_t, + ksnp_list); + + for (j = 0; j < peer->ksnp_n_passive_ips; j++) + if (peer->ksnp_passive_ips[j] == ipaddress) + iface->ksni_npeers++; + + list_for_each(rtmp, &peer->ksnp_routes) { + route = list_entry(rtmp, + ksock_route_t, + ksnr_list); + + if (route->ksnr_myipaddr == ipaddress) + iface->ksni_nroutes++; + } + } + } + + rc = 0; + /* NB only new connections will pay attention to the new interface! */ + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + return rc; +} + +static void +ksocknal_peer_del_interface_locked(ksock_peer_t *peer, __u32 ipaddr) +{ + struct list_head *tmp; + struct list_head *nxt; + ksock_route_t *route; + ksock_conn_t *conn; + int i; + int j; + + for (i = 0; i < peer->ksnp_n_passive_ips; i++) + if (peer->ksnp_passive_ips[i] == ipaddr) { + for (j = i+1; j < peer->ksnp_n_passive_ips; j++) + peer->ksnp_passive_ips[j-1] = + peer->ksnp_passive_ips[j]; + peer->ksnp_n_passive_ips--; + break; + } + + list_for_each_safe(tmp, nxt, &peer->ksnp_routes) { + route = list_entry(tmp, ksock_route_t, ksnr_list); + + if (route->ksnr_myipaddr != ipaddr) + continue; + + if (route->ksnr_share_count != 0) { + /* Manually created; keep, but unbind */ + route->ksnr_myipaddr = 0; + } else { + ksocknal_del_route_locked(route); + } + } + + list_for_each_safe(tmp, nxt, &peer->ksnp_conns) { + conn = list_entry(tmp, ksock_conn_t, ksnc_list); + + if (conn->ksnc_myipaddr == ipaddr) + ksocknal_close_conn_locked(conn, 0); + } +} + +static int +ksocknal_del_interface(lnet_ni_t *ni, __u32 ipaddress) +{ + ksock_net_t *net = ni->ni_data; + int rc = -ENOENT; + struct list_head *tmp; + struct list_head *nxt; + ksock_peer_t *peer; + __u32 this_ip; + int i; + int j; + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + for (i = 0; i < net->ksnn_ninterfaces; i++) { + this_ip = net->ksnn_interfaces[i].ksni_ipaddr; + + if (!(ipaddress == 0 || + ipaddress == this_ip)) + continue; + + rc = 0; + + for (j = i+1; j < net->ksnn_ninterfaces; j++) + net->ksnn_interfaces[j-1] = + net->ksnn_interfaces[j]; + + net->ksnn_ninterfaces--; + + for (j = 0; j < ksocknal_data.ksnd_peer_hash_size; j++) { + list_for_each_safe(tmp, nxt, + &ksocknal_data.ksnd_peers[j]) { + peer = list_entry(tmp, ksock_peer_t, + ksnp_list); + + if (peer->ksnp_ni != ni) + continue; + + ksocknal_peer_del_interface_locked(peer, this_ip); + } + } + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + return rc; +} + +int +ksocknal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg) +{ + lnet_process_id_t id = {0}; + struct libcfs_ioctl_data *data = arg; + int rc; + + switch (cmd) { + case IOC_LIBCFS_GET_INTERFACE: { + ksock_net_t *net = ni->ni_data; + ksock_interface_t *iface; + + read_lock(&ksocknal_data.ksnd_global_lock); + + if (data->ioc_count >= (__u32)net->ksnn_ninterfaces) { + rc = -ENOENT; + } else { + rc = 0; + iface = &net->ksnn_interfaces[data->ioc_count]; + + data->ioc_u32[0] = iface->ksni_ipaddr; + data->ioc_u32[1] = iface->ksni_netmask; + data->ioc_u32[2] = iface->ksni_npeers; + data->ioc_u32[3] = iface->ksni_nroutes; + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + return rc; + } + + case IOC_LIBCFS_ADD_INTERFACE: + return ksocknal_add_interface(ni, + data->ioc_u32[0], /* IP address */ + data->ioc_u32[1]); /* net mask */ + + case IOC_LIBCFS_DEL_INTERFACE: + return 
ksocknal_del_interface(ni, + data->ioc_u32[0]); /* IP address */ + + case IOC_LIBCFS_GET_PEER: { + __u32 myip = 0; + __u32 ip = 0; + int port = 0; + int conn_count = 0; + int share_count = 0; + + rc = ksocknal_get_peer_info(ni, data->ioc_count, + &id, &myip, &ip, &port, + &conn_count, &share_count); + if (rc != 0) + return rc; + + data->ioc_nid = id.nid; + data->ioc_count = share_count; + data->ioc_u32[0] = ip; + data->ioc_u32[1] = port; + data->ioc_u32[2] = myip; + data->ioc_u32[3] = conn_count; + data->ioc_u32[4] = id.pid; + return 0; + } + + case IOC_LIBCFS_ADD_PEER: + id.nid = data->ioc_nid; + id.pid = LUSTRE_SRV_LNET_PID; + return ksocknal_add_peer(ni, id, + data->ioc_u32[0], /* IP */ + data->ioc_u32[1]); /* port */ + + case IOC_LIBCFS_DEL_PEER: + id.nid = data->ioc_nid; + id.pid = LNET_PID_ANY; + return ksocknal_del_peer(ni, id, + data->ioc_u32[0]); /* IP */ + + case IOC_LIBCFS_GET_CONN: { + int txmem; + int rxmem; + int nagle; + ksock_conn_t *conn = ksocknal_get_conn_by_idx(ni, data->ioc_count); + + if (conn == NULL) + return -ENOENT; + + ksocknal_lib_get_conn_tunables(conn, &txmem, &rxmem, &nagle); + + data->ioc_count = txmem; + data->ioc_nid = conn->ksnc_peer->ksnp_id.nid; + data->ioc_flags = nagle; + data->ioc_u32[0] = conn->ksnc_ipaddr; + data->ioc_u32[1] = conn->ksnc_port; + data->ioc_u32[2] = conn->ksnc_myipaddr; + data->ioc_u32[3] = conn->ksnc_type; + data->ioc_u32[4] = conn->ksnc_scheduler->kss_info->ksi_cpt; + data->ioc_u32[5] = rxmem; + data->ioc_u32[6] = conn->ksnc_peer->ksnp_id.pid; + ksocknal_conn_decref(conn); + return 0; + } + + case IOC_LIBCFS_CLOSE_CONNECTION: + id.nid = data->ioc_nid; + id.pid = LNET_PID_ANY; + return ksocknal_close_matching_conns(id, + data->ioc_u32[0]); + + case IOC_LIBCFS_REGISTER_MYNID: + /* Ignore if this is a noop */ + if (data->ioc_nid == ni->ni_nid) + return 0; + + CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n", + libcfs_nid2str(data->ioc_nid), + libcfs_nid2str(ni->ni_nid)); + return -EINVAL; + + case IOC_LIBCFS_PUSH_CONNECTION: + id.nid = data->ioc_nid; + id.pid = LNET_PID_ANY; + return ksocknal_push(ni, id); + + default: + return -EINVAL; + } + /* not reached */ +} + +static void +ksocknal_free_buffers(void) +{ + LASSERT(atomic_read(&ksocknal_data.ksnd_nactive_txs) == 0); + + if (ksocknal_data.ksnd_sched_info != NULL) { + struct ksock_sched_info *info; + int i; + + cfs_percpt_for_each(info, i, ksocknal_data.ksnd_sched_info) { + if (info->ksi_scheds != NULL) { + LIBCFS_FREE(info->ksi_scheds, + info->ksi_nthreads_max * + sizeof(info->ksi_scheds[0])); + } + } + cfs_percpt_free(ksocknal_data.ksnd_sched_info); + } + + LIBCFS_FREE(ksocknal_data.ksnd_peers, + sizeof(struct list_head) * + ksocknal_data.ksnd_peer_hash_size); + + spin_lock(&ksocknal_data.ksnd_tx_lock); + + if (!list_empty(&ksocknal_data.ksnd_idle_noop_txs)) { + struct list_head zlist; + ksock_tx_t *tx; + + list_add(&zlist, &ksocknal_data.ksnd_idle_noop_txs); + list_del_init(&ksocknal_data.ksnd_idle_noop_txs); + spin_unlock(&ksocknal_data.ksnd_tx_lock); + + while (!list_empty(&zlist)) { + tx = list_entry(zlist.next, ksock_tx_t, tx_list); + list_del(&tx->tx_list); + LIBCFS_FREE(tx, tx->tx_desc_size); + } + } else { + spin_unlock(&ksocknal_data.ksnd_tx_lock); + } +} + +static void +ksocknal_base_shutdown(void) +{ + struct ksock_sched_info *info; + ksock_sched_t *sched; + int i; + int j; + + CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n", + atomic_read(&libcfs_kmemory)); + LASSERT(ksocknal_data.ksnd_nnets == 0); + + switch (ksocknal_data.ksnd_init) { + default: + LASSERT(0); + 
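+ /* fall through */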
+ case SOCKNAL_INIT_ALL: + case SOCKNAL_INIT_DATA: + LASSERT(ksocknal_data.ksnd_peers != NULL); + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { + LASSERT(list_empty(&ksocknal_data.ksnd_peers[i])); + } + + LASSERT(list_empty(&ksocknal_data.ksnd_nets)); + LASSERT(list_empty(&ksocknal_data.ksnd_enomem_conns)); + LASSERT(list_empty(&ksocknal_data.ksnd_zombie_conns)); + LASSERT(list_empty(&ksocknal_data.ksnd_connd_connreqs)); + LASSERT(list_empty(&ksocknal_data.ksnd_connd_routes)); + + if (ksocknal_data.ksnd_sched_info != NULL) { + cfs_percpt_for_each(info, i, + ksocknal_data.ksnd_sched_info) { + if (info->ksi_scheds == NULL) + continue; + + for (j = 0; j < info->ksi_nthreads_max; j++) { + + sched = &info->ksi_scheds[j]; + LASSERT(list_empty( + &sched->kss_tx_conns)); + LASSERT(list_empty( + &sched->kss_rx_conns)); + LASSERT(list_empty( + &sched->kss_zombie_noop_txs)); + LASSERT(sched->kss_nconns == 0); + } + } + } + + /* flag threads to terminate; wake and wait for them to die */ + ksocknal_data.ksnd_shuttingdown = 1; + wake_up_all(&ksocknal_data.ksnd_connd_waitq); + wake_up_all(&ksocknal_data.ksnd_reaper_waitq); + + if (ksocknal_data.ksnd_sched_info != NULL) { + cfs_percpt_for_each(info, i, + ksocknal_data.ksnd_sched_info) { + if (info->ksi_scheds == NULL) + continue; + + for (j = 0; j < info->ksi_nthreads_max; j++) { + sched = &info->ksi_scheds[j]; + wake_up_all(&sched->kss_waitq); + } + } + } + + i = 4; + read_lock(&ksocknal_data.ksnd_global_lock); + while (ksocknal_data.ksnd_nthreads != 0) { + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ + "waiting for %d threads to terminate\n", + ksocknal_data.ksnd_nthreads); + read_unlock(&ksocknal_data.ksnd_global_lock); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + read_lock(&ksocknal_data.ksnd_global_lock); + } + read_unlock(&ksocknal_data.ksnd_global_lock); + + ksocknal_free_buffers(); + + ksocknal_data.ksnd_init = SOCKNAL_INIT_NOTHING; + break; + } + + CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n", + atomic_read(&libcfs_kmemory)); + + module_put(THIS_MODULE); +} + +static __u64 +ksocknal_new_incarnation(void) +{ + + /* The incarnation number is the time this module loaded and it + * identifies this particular instance of the socknal. 
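+	 * (Editorial gloss, inferred from the surrounding code rather
+	 * than stated here: the incarnation travels in the HELLO
+	 * handshake (see ksocknal_send_hello() and ksocknal_recv_hello())
+	 * and the latest value seen is cached per peer in
+	 * ksnp_incarnation, so a reloaded module presents a different
+	 * stamp and peers can tell connections belonging to a previous
+	 * instance apart from live ones.)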
+ */ + return ktime_get_ns(); +} + +static int +ksocknal_base_startup(void) +{ + struct ksock_sched_info *info; + int rc; + int i; + + LASSERT(ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING); + LASSERT(ksocknal_data.ksnd_nnets == 0); + + memset(&ksocknal_data, 0, sizeof(ksocknal_data)); /* zero pointers */ + + ksocknal_data.ksnd_peer_hash_size = SOCKNAL_PEER_HASH_SIZE; + LIBCFS_ALLOC(ksocknal_data.ksnd_peers, + sizeof(struct list_head) * + ksocknal_data.ksnd_peer_hash_size); + if (ksocknal_data.ksnd_peers == NULL) + return -ENOMEM; + + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) + INIT_LIST_HEAD(&ksocknal_data.ksnd_peers[i]); + + rwlock_init(&ksocknal_data.ksnd_global_lock); + INIT_LIST_HEAD(&ksocknal_data.ksnd_nets); + + spin_lock_init(&ksocknal_data.ksnd_reaper_lock); + INIT_LIST_HEAD(&ksocknal_data.ksnd_enomem_conns); + INIT_LIST_HEAD(&ksocknal_data.ksnd_zombie_conns); + INIT_LIST_HEAD(&ksocknal_data.ksnd_deathrow_conns); + init_waitqueue_head(&ksocknal_data.ksnd_reaper_waitq); + + spin_lock_init(&ksocknal_data.ksnd_connd_lock); + INIT_LIST_HEAD(&ksocknal_data.ksnd_connd_connreqs); + INIT_LIST_HEAD(&ksocknal_data.ksnd_connd_routes); + init_waitqueue_head(&ksocknal_data.ksnd_connd_waitq); + + spin_lock_init(&ksocknal_data.ksnd_tx_lock); + INIT_LIST_HEAD(&ksocknal_data.ksnd_idle_noop_txs); + + /* NB memset above zeros whole of ksocknal_data */ + + /* flag lists/ptrs/locks initialised */ + ksocknal_data.ksnd_init = SOCKNAL_INIT_DATA; + try_module_get(THIS_MODULE); + + ksocknal_data.ksnd_sched_info = cfs_percpt_alloc(lnet_cpt_table(), + sizeof(*info)); + if (ksocknal_data.ksnd_sched_info == NULL) + goto failed; + + cfs_percpt_for_each(info, i, ksocknal_data.ksnd_sched_info) { + ksock_sched_t *sched; + int nthrs; + + nthrs = cfs_cpt_weight(lnet_cpt_table(), i); + if (*ksocknal_tunables.ksnd_nscheds > 0) { + nthrs = min(nthrs, *ksocknal_tunables.ksnd_nscheds); + } else { + /* max to half of CPUs, assume another half should be + * reserved for upper layer modules */ + nthrs = min(max(SOCKNAL_NSCHEDS, nthrs >> 1), nthrs); + } + + info->ksi_nthreads_max = nthrs; + info->ksi_cpt = i; + + LIBCFS_CPT_ALLOC(info->ksi_scheds, lnet_cpt_table(), i, + info->ksi_nthreads_max * sizeof(*sched)); + if (info->ksi_scheds == NULL) + goto failed; + + for (; nthrs > 0; nthrs--) { + sched = &info->ksi_scheds[nthrs - 1]; + + sched->kss_info = info; + spin_lock_init(&sched->kss_lock); + INIT_LIST_HEAD(&sched->kss_rx_conns); + INIT_LIST_HEAD(&sched->kss_tx_conns); + INIT_LIST_HEAD(&sched->kss_zombie_noop_txs); + init_waitqueue_head(&sched->kss_waitq); + } + } + + ksocknal_data.ksnd_connd_starting = 0; + ksocknal_data.ksnd_connd_failed_stamp = 0; + ksocknal_data.ksnd_connd_starting_stamp = get_seconds(); + /* must have at least 2 connds to remain responsive to accepts while + * connecting */ + if (*ksocknal_tunables.ksnd_nconnds < SOCKNAL_CONND_RESV + 1) + *ksocknal_tunables.ksnd_nconnds = SOCKNAL_CONND_RESV + 1; + + if (*ksocknal_tunables.ksnd_nconnds_max < + *ksocknal_tunables.ksnd_nconnds) { + ksocknal_tunables.ksnd_nconnds_max = + ksocknal_tunables.ksnd_nconnds; + } + + for (i = 0; i < *ksocknal_tunables.ksnd_nconnds; i++) { + char name[16]; + spin_lock_bh(&ksocknal_data.ksnd_connd_lock); + ksocknal_data.ksnd_connd_starting++; + spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); + + + snprintf(name, sizeof(name), "socknal_cd%02d", i); + rc = ksocknal_thread_start(ksocknal_connd, + (void *)((ulong_ptr_t)i), name); + if (rc != 0) { + spin_lock_bh(&ksocknal_data.ksnd_connd_lock); + 
ksocknal_data.ksnd_connd_starting--; + spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); + CERROR("Can't spawn socknal connd: %d\n", rc); + goto failed; + } + } + + rc = ksocknal_thread_start(ksocknal_reaper, NULL, "socknal_reaper"); + if (rc != 0) { + CERROR("Can't spawn socknal reaper: %d\n", rc); + goto failed; + } + + /* flag everything initialised */ + ksocknal_data.ksnd_init = SOCKNAL_INIT_ALL; + + return 0; + + failed: + ksocknal_base_shutdown(); + return -ENETDOWN; +} + +static void +ksocknal_debug_peerhash(lnet_ni_t *ni) +{ + ksock_peer_t *peer = NULL; + struct list_head *tmp; + int i; + + read_lock(&ksocknal_data.ksnd_global_lock); + + for (i = 0; i < ksocknal_data.ksnd_peer_hash_size; i++) { + list_for_each(tmp, &ksocknal_data.ksnd_peers[i]) { + peer = list_entry(tmp, ksock_peer_t, ksnp_list); + + if (peer->ksnp_ni == ni) + break; + + peer = NULL; + } + } + + if (peer != NULL) { + ksock_route_t *route; + ksock_conn_t *conn; + + CWARN("Active peer on shutdown: %s, ref %d, scnt %d, closing %d, accepting %d, err %d, zcookie %llu, txq %d, zc_req %d\n", + libcfs_id2str(peer->ksnp_id), + atomic_read(&peer->ksnp_refcount), + peer->ksnp_sharecount, peer->ksnp_closing, + peer->ksnp_accepting, peer->ksnp_error, + peer->ksnp_zc_next_cookie, + !list_empty(&peer->ksnp_tx_queue), + !list_empty(&peer->ksnp_zc_req_list)); + + list_for_each(tmp, &peer->ksnp_routes) { + route = list_entry(tmp, ksock_route_t, ksnr_list); + CWARN("Route: ref %d, schd %d, conn %d, cnted %d, del %d\n", + atomic_read(&route->ksnr_refcount), + route->ksnr_scheduled, route->ksnr_connecting, + route->ksnr_connected, route->ksnr_deleted); + } + + list_for_each(tmp, &peer->ksnp_conns) { + conn = list_entry(tmp, ksock_conn_t, ksnc_list); + CWARN("Conn: ref %d, sref %d, t %d, c %d\n", + atomic_read(&conn->ksnc_conn_refcount), + atomic_read(&conn->ksnc_sock_refcount), + conn->ksnc_type, conn->ksnc_closing); + } + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + return; +} + +void +ksocknal_shutdown(lnet_ni_t *ni) +{ + ksock_net_t *net = ni->ni_data; + int i; + lnet_process_id_t anyid = {0}; + + anyid.nid = LNET_NID_ANY; + anyid.pid = LNET_PID_ANY; + + LASSERT(ksocknal_data.ksnd_init == SOCKNAL_INIT_ALL); + LASSERT(ksocknal_data.ksnd_nnets > 0); + + spin_lock_bh(&net->ksnn_lock); + net->ksnn_shutdown = 1; /* prevent new peers */ + spin_unlock_bh(&net->ksnn_lock); + + /* Delete all peers */ + ksocknal_del_peer(ni, anyid, 0); + + /* Wait for all peer state to clean up */ + i = 2; + spin_lock_bh(&net->ksnn_lock); + while (net->ksnn_npeers != 0) { + spin_unlock_bh(&net->ksnn_lock); + + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? 
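+		 * (editorial aside: i & (-i) isolates the lowest set bit
+		 * of i in two's complement, so the test is true exactly
+		 * when i is a power of 2; the D_WARNING console message
+		 * therefore fires at exponentially growing intervals,
+		 * while other iterations log only at the quiet D_NET
+		 * level.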
*/ + "waiting for %d peers to disconnect\n", + net->ksnn_npeers); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(1)); + + ksocknal_debug_peerhash(ni); + + spin_lock_bh(&net->ksnn_lock); + } + spin_unlock_bh(&net->ksnn_lock); + + for (i = 0; i < net->ksnn_ninterfaces; i++) { + LASSERT(net->ksnn_interfaces[i].ksni_npeers == 0); + LASSERT(net->ksnn_interfaces[i].ksni_nroutes == 0); + } + + list_del(&net->ksnn_list); + LIBCFS_FREE(net, sizeof(*net)); + + ksocknal_data.ksnd_nnets--; + if (ksocknal_data.ksnd_nnets == 0) + ksocknal_base_shutdown(); +} + +static int +ksocknal_enumerate_interfaces(ksock_net_t *net) +{ + char **names; + int i; + int j; + int rc; + int n; + + n = libcfs_ipif_enumerate(&names); + if (n <= 0) { + CERROR("Can't enumerate interfaces: %d\n", n); + return n; + } + + for (i = j = 0; i < n; i++) { + int up; + __u32 ip; + __u32 mask; + + if (!strcmp(names[i], "lo")) /* skip the loopback IF */ + continue; + + rc = libcfs_ipif_query(names[i], &up, &ip, &mask); + if (rc != 0) { + CWARN("Can't get interface %s info: %d\n", + names[i], rc); + continue; + } + + if (!up) { + CWARN("Ignoring interface %s (down)\n", + names[i]); + continue; + } + + if (j == LNET_MAX_INTERFACES) { + CWARN("Ignoring interface %s (too many interfaces)\n", + names[i]); + continue; + } + + net->ksnn_interfaces[j].ksni_ipaddr = ip; + net->ksnn_interfaces[j].ksni_netmask = mask; + strncpy(&net->ksnn_interfaces[j].ksni_name[0], + names[i], IFNAMSIZ); + j++; + } + + libcfs_ipif_free_enumeration(names, n); + + if (j == 0) + CERROR("Can't find any usable interfaces\n"); + + return j; +} + +static int +ksocknal_search_new_ipif(ksock_net_t *net) +{ + int new_ipif = 0; + int i; + + for (i = 0; i < net->ksnn_ninterfaces; i++) { + char *ifnam = &net->ksnn_interfaces[i].ksni_name[0]; + char *colon = strchr(ifnam, ':'); + int found = 0; + ksock_net_t *tmp; + int j; + + if (colon != NULL) /* ignore alias device */ + *colon = 0; + + list_for_each_entry(tmp, &ksocknal_data.ksnd_nets, + ksnn_list) { + for (j = 0; !found && j < tmp->ksnn_ninterfaces; j++) { + char *ifnam2 = + &tmp->ksnn_interfaces[j].ksni_name[0]; + char *colon2 = strchr(ifnam2, ':'); + + if (colon2 != NULL) + *colon2 = 0; + + found = strcmp(ifnam, ifnam2) == 0; + if (colon2 != NULL) + *colon2 = ':'; + } + if (found) + break; + } + + new_ipif += !found; + if (colon != NULL) + *colon = ':'; + } + + return new_ipif; +} + +static int +ksocknal_start_schedulers(struct ksock_sched_info *info) +{ + int nthrs; + int rc = 0; + int i; + + if (info->ksi_nthreads == 0) { + if (*ksocknal_tunables.ksnd_nscheds > 0) { + nthrs = info->ksi_nthreads_max; + } else { + nthrs = cfs_cpt_weight(lnet_cpt_table(), + info->ksi_cpt); + nthrs = min(max(SOCKNAL_NSCHEDS, nthrs >> 1), nthrs); + nthrs = min(SOCKNAL_NSCHEDS_HIGH, nthrs); + } + nthrs = min(nthrs, info->ksi_nthreads_max); + } else { + LASSERT(info->ksi_nthreads <= info->ksi_nthreads_max); + /* increase two threads if there is new interface */ + nthrs = min(2, info->ksi_nthreads_max - info->ksi_nthreads); + } + + for (i = 0; i < nthrs; i++) { + long id; + char name[20]; + ksock_sched_t *sched; + id = KSOCK_THREAD_ID(info->ksi_cpt, info->ksi_nthreads + i); + sched = &info->ksi_scheds[KSOCK_THREAD_SID(id)]; + snprintf(name, sizeof(name), "socknal_sd%02d_%02d", + info->ksi_cpt, (int)(sched - &info->ksi_scheds[0])); + + rc = ksocknal_thread_start(ksocknal_scheduler, + (void *)id, name); + if (rc == 0) + continue; + + CERROR("Can't spawn thread %d for scheduler[%d]: %d\n", + info->ksi_cpt, 
info->ksi_nthreads + i, rc); + break; + } + + info->ksi_nthreads += i; + return rc; +} + +static int +ksocknal_net_start_threads(ksock_net_t *net, __u32 *cpts, int ncpts) +{ + int newif = ksocknal_search_new_ipif(net); + int rc; + int i; + + LASSERT(ncpts > 0 && ncpts <= cfs_cpt_number(lnet_cpt_table())); + + for (i = 0; i < ncpts; i++) { + struct ksock_sched_info *info; + int cpt = (cpts == NULL) ? i : cpts[i]; + + LASSERT(cpt < cfs_cpt_number(lnet_cpt_table())); + info = ksocknal_data.ksnd_sched_info[cpt]; + + if (!newif && info->ksi_nthreads > 0) + continue; + + rc = ksocknal_start_schedulers(info); + if (rc != 0) + return rc; + } + return 0; +} + +int +ksocknal_startup(lnet_ni_t *ni) +{ + ksock_net_t *net; + int rc; + int i; + + LASSERT(ni->ni_lnd == &the_ksocklnd); + + if (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING) { + rc = ksocknal_base_startup(); + if (rc != 0) + return rc; + } + + LIBCFS_ALLOC(net, sizeof(*net)); + if (net == NULL) + goto fail_0; + + spin_lock_init(&net->ksnn_lock); + net->ksnn_incarnation = ksocknal_new_incarnation(); + ni->ni_data = net; + ni->ni_peertimeout = *ksocknal_tunables.ksnd_peertimeout; + ni->ni_maxtxcredits = *ksocknal_tunables.ksnd_credits; + ni->ni_peertxcredits = *ksocknal_tunables.ksnd_peertxcredits; + ni->ni_peerrtrcredits = *ksocknal_tunables.ksnd_peerrtrcredits; + + if (ni->ni_interfaces[0] == NULL) { + rc = ksocknal_enumerate_interfaces(net); + if (rc <= 0) + goto fail_1; + + net->ksnn_ninterfaces = 1; + } else { + for (i = 0; i < LNET_MAX_INTERFACES; i++) { + int up; + + if (ni->ni_interfaces[i] == NULL) + break; + + rc = libcfs_ipif_query( + ni->ni_interfaces[i], &up, + &net->ksnn_interfaces[i].ksni_ipaddr, + &net->ksnn_interfaces[i].ksni_netmask); + + if (rc != 0) { + CERROR("Can't get interface %s info: %d\n", + ni->ni_interfaces[i], rc); + goto fail_1; + } + + if (!up) { + CERROR("Interface %s is down\n", + ni->ni_interfaces[i]); + goto fail_1; + } + + strncpy(&net->ksnn_interfaces[i].ksni_name[0], + ni->ni_interfaces[i], IFNAMSIZ); + } + net->ksnn_ninterfaces = i; + } + + /* call it before add it to ksocknal_data.ksnd_nets */ + rc = ksocknal_net_start_threads(net, ni->ni_cpts, ni->ni_ncpts); + if (rc != 0) + goto fail_1; + + ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), + net->ksnn_interfaces[0].ksni_ipaddr); + list_add(&net->ksnn_list, &ksocknal_data.ksnd_nets); + + ksocknal_data.ksnd_nnets++; + + return 0; + + fail_1: + LIBCFS_FREE(net, sizeof(*net)); + fail_0: + if (ksocknal_data.ksnd_nnets == 0) + ksocknal_base_shutdown(); + + return -ENETDOWN; +} + + +static void __exit +ksocknal_module_fini(void) +{ + lnet_unregister_lnd(&the_ksocklnd); +} + +static int __init +ksocknal_module_init(void) +{ + int rc; + + /* check ksnr_connected/connecting field large enough */ + CLASSERT(SOCKLND_CONN_NTYPES <= 4); + CLASSERT(SOCKLND_CONN_ACK == SOCKLND_CONN_BULK_IN); + + /* initialize the_ksocklnd */ + the_ksocklnd.lnd_type = SOCKLND; + the_ksocklnd.lnd_startup = ksocknal_startup; + the_ksocklnd.lnd_shutdown = ksocknal_shutdown; + the_ksocklnd.lnd_ctl = ksocknal_ctl; + the_ksocklnd.lnd_send = ksocknal_send; + the_ksocklnd.lnd_recv = ksocknal_recv; + the_ksocklnd.lnd_notify = ksocknal_notify; + the_ksocklnd.lnd_query = ksocknal_query; + the_ksocklnd.lnd_accept = ksocknal_accept; + + rc = ksocknal_tunables_init(); + if (rc != 0) + return rc; + + lnet_register_lnd(&the_ksocklnd); + + return 0; +} + +MODULE_AUTHOR("Sun Microsystems, Inc. 
"); +MODULE_DESCRIPTION("Kernel TCP Socket LND v3.0.0"); +MODULE_LICENSE("GPL"); +MODULE_VERSION("3.0.0"); + +module_init(ksocknal_module_init); +module_exit(ksocknal_module_fini); diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h new file mode 100644 index 000000000..c54c99551 --- /dev/null +++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd.h @@ -0,0 +1,588 @@ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2012, Intel Corporation. + * + * Author: Zach Brown + * Author: Peter J. Braam + * Author: Phil Schwan + * Author: Eric Barton + * + * This file is part of Lustre, http://www.lustre.org + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#define DEBUG_PORTAL_ALLOC +#define DEBUG_SUBSYSTEM S_LND + +#include "socklnd_lib-linux.h" + +#include "../../../include/linux/libcfs/libcfs.h" +#include "../../../include/linux/lnet/lnet.h" +#include "../../../include/linux/lnet/lib-lnet.h" +#include "../../../include/linux/lnet/socklnd.h" +#include "../../../include/linux/lnet/lnet-sysctl.h" + +#define SOCKNAL_PEER_HASH_SIZE 101 /* # peer lists */ +#define SOCKNAL_RESCHED 100 /* # scheduler loops before reschedule */ +#define SOCKNAL_INSANITY_RECONN 5000 /* connd is trying on reconn infinitely */ +#define SOCKNAL_ENOMEM_RETRY CFS_TICK /* jiffies between retries */ + +#define SOCKNAL_SINGLE_FRAG_TX 0 /* disable multi-fragment sends */ +#define SOCKNAL_SINGLE_FRAG_RX 0 /* disable multi-fragment receives */ + +#define SOCKNAL_VERSION_DEBUG 0 /* enable protocol version debugging */ + +/* risk kmap deadlock on multi-frag I/O (backs off to single-frag if disabled). + * no risk if we're not running on a CONFIG_HIGHMEM platform. 
*/ +#ifdef CONFIG_HIGHMEM +# define SOCKNAL_RISK_KMAP_DEADLOCK 0 +#else +# define SOCKNAL_RISK_KMAP_DEADLOCK 1 +#endif + +struct ksock_sched_info; + +typedef struct /* per scheduler state */ +{ + spinlock_t kss_lock; /* serialise */ + struct list_head kss_rx_conns; /* conn waiting to be read */ + /* conn waiting to be written */ + struct list_head kss_tx_conns; + /* zombie noop tx list */ + struct list_head kss_zombie_noop_txs; + wait_queue_head_t kss_waitq; /* where scheduler sleeps */ + /* # connections assigned to this scheduler */ + int kss_nconns; + struct ksock_sched_info *kss_info; /* owner of it */ + struct page *kss_rx_scratch_pgs[LNET_MAX_IOV]; + struct kvec kss_scratch_iov[LNET_MAX_IOV]; +} ksock_sched_t; + +struct ksock_sched_info { + int ksi_nthreads_max; /* max allowed threads */ + int ksi_nthreads; /* number of threads */ + int ksi_cpt; /* CPT id */ + ksock_sched_t *ksi_scheds; /* array of schedulers */ +}; + +#define KSOCK_CPT_SHIFT 16 +#define KSOCK_THREAD_ID(cpt, sid) (((cpt) << KSOCK_CPT_SHIFT) | (sid)) +#define KSOCK_THREAD_CPT(id) ((id) >> KSOCK_CPT_SHIFT) +#define KSOCK_THREAD_SID(id) ((id) & ((1UL << KSOCK_CPT_SHIFT) - 1)) + +typedef struct /* in-use interface */ +{ + __u32 ksni_ipaddr; /* interface's IP address */ + __u32 ksni_netmask; /* interface's network mask */ + int ksni_nroutes; /* # routes using (active) */ + int ksni_npeers; /* # peers using (passive) */ + char ksni_name[IFNAMSIZ]; /* interface name */ +} ksock_interface_t; + +typedef struct { + /* "stuck" socket timeout (seconds) */ + int *ksnd_timeout; + /* # scheduler threads in each pool while starting */ + int *ksnd_nscheds; + int *ksnd_nconnds; /* # connection daemons */ + int *ksnd_nconnds_max; /* max # connection daemons */ + int *ksnd_min_reconnectms; /* first connection retry after (ms)... */ + int *ksnd_max_reconnectms; /* ...exponentially increasing to this */ + int *ksnd_eager_ack; /* make TCP ack eagerly? */ + int *ksnd_typed_conns; /* drive sockets by type? */ + int *ksnd_min_bulk; /* smallest "large" message */ + int *ksnd_tx_buffer_size; /* socket tx buffer size */ + int *ksnd_rx_buffer_size; /* socket rx buffer size */ + int *ksnd_nagle; /* enable NAGLE? */ + int *ksnd_round_robin; /* round robin for multiple interfaces */ + int *ksnd_keepalive; /* # secs for sending keepalive NOOP */ + int *ksnd_keepalive_idle; /* # idle secs before 1st probe */ + int *ksnd_keepalive_count; /* # probes */ + int *ksnd_keepalive_intvl; /* time between probes */ + int *ksnd_credits; /* # concurrent sends */ + int *ksnd_peertxcredits; /* # concurrent sends to 1 peer */ + int *ksnd_peerrtrcredits; /* # per-peer router buffer credits */ + int *ksnd_peertimeout; /* seconds to consider peer dead */ + int *ksnd_enable_csum; /* enable check sum */ + int *ksnd_inject_csum_error; /* set non-zero to inject checksum error */ + int *ksnd_nonblk_zcack; /* always send zc-ack on non-blocking connection */ + unsigned int *ksnd_zc_min_payload; /* minimum zero copy payload size */ + int *ksnd_zc_recv; /* enable ZC receive (for Chelsio TOE) */ + int *ksnd_zc_recv_min_nfrags; /* minimum # of fragments to enable ZC receive */ +} ksock_tunables_t; + +typedef struct { + __u64 ksnn_incarnation; /* my epoch */ + spinlock_t ksnn_lock; /* serialise */ + struct list_head ksnn_list; /* chain on global list */ + int ksnn_npeers; /* # peers */ + int ksnn_shutdown; /* shutting down? 
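+	 * (editorial: set to 1 under ksnn_lock at the top of
+	 * ksocknal_shutdown(), "prevent new peers", so the
+	 * peer-creation path can refuse arrivals during teardown)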
*/ + int ksnn_ninterfaces; /* IP interfaces */ + ksock_interface_t ksnn_interfaces[LNET_MAX_INTERFACES]; +} ksock_net_t; + +/** connd timeout */ +#define SOCKNAL_CONND_TIMEOUT 120 +/** reserved thread for accepting & creating new connd */ +#define SOCKNAL_CONND_RESV 1 + +typedef struct { + int ksnd_init; /* initialisation state */ + int ksnd_nnets; /* # networks set up */ + struct list_head ksnd_nets; /* list of nets */ + /* stabilize peer/conn ops */ + rwlock_t ksnd_global_lock; + /* hash table of all my known peers */ + struct list_head *ksnd_peers; + int ksnd_peer_hash_size; /* size of ksnd_peers */ + + int ksnd_nthreads; /* # live threads */ + int ksnd_shuttingdown; /* tell threads to exit */ + /* schedulers information */ + struct ksock_sched_info **ksnd_sched_info; + + atomic_t ksnd_nactive_txs; /* #active txs */ + + struct list_head ksnd_deathrow_conns; /* conns to close: reaper_lock*/ + struct list_head ksnd_zombie_conns; /* conns to free: reaper_lock */ + struct list_head ksnd_enomem_conns; /* conns to retry: reaper_lock*/ + wait_queue_head_t ksnd_reaper_waitq; /* reaper sleeps here */ + unsigned long ksnd_reaper_waketime;/* when reaper will wake */ + spinlock_t ksnd_reaper_lock; /* serialise */ + + int ksnd_enomem_tx; /* test ENOMEM sender */ + int ksnd_stall_tx; /* test sluggish sender */ + int ksnd_stall_rx; /* test sluggish receiver */ + + struct list_head ksnd_connd_connreqs; /* incoming connection requests */ + struct list_head ksnd_connd_routes; /* routes waiting to be connected */ + wait_queue_head_t ksnd_connd_waitq; /* connds sleep here */ + int ksnd_connd_connecting;/* # connds connecting */ + /** time stamp of the last failed connecting attempt */ + long ksnd_connd_failed_stamp; + /** # starting connd */ + unsigned ksnd_connd_starting; + /** time stamp of the last starting connd */ + long ksnd_connd_starting_stamp; + /** # running connd */ + unsigned ksnd_connd_running; + spinlock_t ksnd_connd_lock; /* serialise */ + + struct list_head ksnd_idle_noop_txs; /* list head for freed noop tx */ + spinlock_t ksnd_tx_lock; /* serialise, g_lock unsafe */ + +} ksock_nal_data_t; + +#define SOCKNAL_INIT_NOTHING 0 +#define SOCKNAL_INIT_DATA 1 +#define SOCKNAL_INIT_ALL 2 + +/* A packet just assembled for transmission is represented by 1 or more + * struct iovec fragments (the first frag contains the portals header), + * followed by 0 or more lnet_kiov_t fragments. + * + * On the receive side, initially 1 struct iovec fragment is posted for + * receive (the header). Once the header has been received, the payload is + * received into either struct iovec or lnet_kiov_t fragments, depending on + * what the header matched or whether the message needs forwarding. */ + +struct ksock_conn; /* forward ref */ +struct ksock_peer; /* forward ref */ +struct ksock_route; /* forward ref */ +struct ksock_proto; /* forward ref */ + +typedef struct /* transmit packet */ +{ + struct list_head tx_list; /* queue on conn for transmission etc */ + struct list_head tx_zc_list; /* queue on peer for ZC request */ + atomic_t tx_refcount; /* tx reference count */ + int tx_nob; /* # packet bytes */ + int tx_resid; /* residual bytes */ + int tx_niov; /* # packet iovec frags */ + struct kvec *tx_iov; /* packet iovec frags */ + int tx_nkiov; /* # packet page frags */ + unsigned short tx_zc_aborted; /* aborted ZC request */ + unsigned short tx_zc_capable:1; /* payload is large enough for ZC */ + unsigned short tx_zc_checked:1; /* Have I checked if I should ZC? 
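+	 * (editorial: set by ksocknal_check_zc_req() once the decision
+	 * has been made; ksocknal_uncheck_zc_req() clears it again when
+	 * a failed send must take the tx back off the ZC request list)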
*/ + unsigned short tx_nonblk:1; /* it's a non-blocking ACK */ + lnet_kiov_t *tx_kiov; /* packet page frags */ + struct ksock_conn *tx_conn; /* owning conn */ + lnet_msg_t *tx_lnetmsg; /* lnet message for lnet_finalize() */ + unsigned long tx_deadline; /* when (in jiffies) tx times out */ + ksock_msg_t tx_msg; /* socklnd message buffer */ + int tx_desc_size; /* size of this descriptor */ + union { + struct { + struct kvec iov; /* virt hdr */ + lnet_kiov_t kiov[0]; /* paged payload */ + } paged; + struct { + struct kvec iov[1]; /* virt hdr + payload */ + } virt; + } tx_frags; +} ksock_tx_t; + +#define KSOCK_NOOP_TX_SIZE ((int)offsetof(ksock_tx_t, tx_frags.paged.kiov[0])) + +/* network zero copy callback descriptor embedded in ksock_tx_t */ + +/* space for the rx frag descriptors; we either read a single contiguous + * header, or up to LNET_MAX_IOV frags of payload of either type. */ +typedef union { + struct kvec iov[LNET_MAX_IOV]; + lnet_kiov_t kiov[LNET_MAX_IOV]; +} ksock_rxiovspace_t; + +#define SOCKNAL_RX_KSM_HEADER 1 /* reading ksock message header */ +#define SOCKNAL_RX_LNET_HEADER 2 /* reading lnet message header */ +#define SOCKNAL_RX_PARSE 3 /* Calling lnet_parse() */ +#define SOCKNAL_RX_PARSE_WAIT 4 /* waiting to be told to read the body */ +#define SOCKNAL_RX_LNET_PAYLOAD 5 /* reading lnet payload (to deliver here) */ +#define SOCKNAL_RX_SLOP 6 /* skipping body */ + +typedef struct ksock_conn { + struct ksock_peer *ksnc_peer; /* owning peer */ + struct ksock_route *ksnc_route; /* owning route */ + struct list_head ksnc_list; /* stash on peer's conn list */ + struct socket *ksnc_sock; /* actual socket */ + void *ksnc_saved_data_ready; /* socket's original data_ready() callback */ + void *ksnc_saved_write_space; /* socket's original write_space() callback */ + atomic_t ksnc_conn_refcount; /* conn refcount */ + atomic_t ksnc_sock_refcount; /* sock refcount */ + ksock_sched_t *ksnc_scheduler; /* who schedules this connection */ + __u32 ksnc_myipaddr; /* my IP */ + __u32 ksnc_ipaddr; /* peer's IP */ + int ksnc_port; /* peer's port */ + signed int ksnc_type:3; /* type of connection, + * should be signed value */ + unsigned int ksnc_closing:1; /* being shut down */ + unsigned int ksnc_flip:1; /* flip or not, only for V2.x */ + unsigned int ksnc_zc_capable:1; /* enable to ZC */ + struct ksock_proto *ksnc_proto; /* protocol for the connection */ + + /* reader */ + struct list_head ksnc_rx_list; /* where I enq waiting input or a forwarding descriptor */ + unsigned long ksnc_rx_deadline; /* when (in jiffies) receive times out */ + __u8 ksnc_rx_started; /* started receiving a message */ + __u8 ksnc_rx_ready; /* data ready to read */ + __u8 ksnc_rx_scheduled;/* being progressed */ + __u8 ksnc_rx_state; /* what is being read */ + int ksnc_rx_nob_left; /* # bytes to next hdr/body */ + int ksnc_rx_nob_wanted; /* bytes actually wanted */ + int ksnc_rx_niov; /* # iovec frags */ + struct kvec *ksnc_rx_iov; /* the iovec frags */ + int ksnc_rx_nkiov; /* # page frags */ + lnet_kiov_t *ksnc_rx_kiov; /* the page frags */ + ksock_rxiovspace_t ksnc_rx_iov_space;/* space for frag descriptors */ + __u32 ksnc_rx_csum; /* partial checksum for incoming data */ + void *ksnc_cookie; /* rx lnet_finalize passthru arg */ + ksock_msg_t ksnc_msg; /* incoming message buffer: + * V2.x message takes the + * whole struct + * V1.x message is a bare + * lnet_hdr_t, it's stored in + * ksnc_msg.ksm_u.lnetmsg */ + + /* WRITER */ + struct list_head ksnc_tx_list; /* where I enq waiting for output space */ + struct list_head 
ksnc_tx_queue; /* packets waiting to be sent */ + ksock_tx_t *ksnc_tx_carrier; /* next TX that can carry a LNet message or ZC-ACK */ + unsigned long ksnc_tx_deadline; /* when (in jiffies) tx times out */ + int ksnc_tx_bufnob; /* send buffer marker */ + atomic_t ksnc_tx_nob; /* # bytes queued */ + int ksnc_tx_ready; /* write space */ + int ksnc_tx_scheduled; /* being progressed */ + unsigned long ksnc_tx_last_post; /* time stamp of the last posted TX */ +} ksock_conn_t; + +typedef struct ksock_route { + struct list_head ksnr_list; /* chain on peer route list */ + struct list_head ksnr_connd_list; /* chain on ksnr_connd_routes */ + struct ksock_peer *ksnr_peer; /* owning peer */ + atomic_t ksnr_refcount; /* # users */ + unsigned long ksnr_timeout; /* when (in jiffies) reconnection can happen next */ + long ksnr_retry_interval; /* how long between retries */ + __u32 ksnr_myipaddr; /* my IP */ + __u32 ksnr_ipaddr; /* IP address to connect to */ + int ksnr_port; /* port to connect to */ + unsigned int ksnr_scheduled:1; /* scheduled for attention */ + unsigned int ksnr_connecting:1;/* connection establishment in progress */ + unsigned int ksnr_connected:4; /* connections established by type */ + unsigned int ksnr_deleted:1; /* been removed from peer? */ + unsigned int ksnr_share_count; /* created explicitly? */ + int ksnr_conn_count; /* # conns established by this route */ +} ksock_route_t; + +#define SOCKNAL_KEEPALIVE_PING 1 /* cookie for keepalive ping */ + +typedef struct ksock_peer { + struct list_head ksnp_list; /* stash on global peer list */ + unsigned long ksnp_last_alive; /* when (in jiffies) I was last alive */ + lnet_process_id_t ksnp_id; /* who's on the other end(s) */ + atomic_t ksnp_refcount; /* # users */ + int ksnp_sharecount; /* lconf usage counter */ + int ksnp_closing; /* being closed */ + int ksnp_accepting;/* # passive connections pending */ + int ksnp_error; /* errno on closing last conn */ + __u64 ksnp_zc_next_cookie;/* ZC completion cookie */ + __u64 ksnp_incarnation; /* latest known peer incarnation */ + struct ksock_proto *ksnp_proto; /* latest known peer protocol */ + struct list_head ksnp_conns; /* all active connections */ + struct list_head ksnp_routes; /* routes */ + struct list_head ksnp_tx_queue; /* waiting packets */ + spinlock_t ksnp_lock; /* serialize, g_lock unsafe */ + struct list_head ksnp_zc_req_list; /* zero copy requests wait for ACK */ + unsigned long ksnp_send_keepalive; /* time to send keepalive */ + lnet_ni_t *ksnp_ni; /* which network */ + int ksnp_n_passive_ips; /* # of... 
*/ + __u32 ksnp_passive_ips[LNET_MAX_INTERFACES]; /* preferred local interfaces */ +} ksock_peer_t; + +typedef struct ksock_connreq { + struct list_head ksncr_list; /* stash on ksnd_connd_connreqs */ + lnet_ni_t *ksncr_ni; /* chosen NI */ + struct socket *ksncr_sock; /* accepted socket */ +} ksock_connreq_t; + +extern ksock_nal_data_t ksocknal_data; +extern ksock_tunables_t ksocknal_tunables; + +#define SOCKNAL_MATCH_NO 0 /* TX can't match type of connection */ +#define SOCKNAL_MATCH_YES 1 /* TX matches type of connection */ +#define SOCKNAL_MATCH_MAY 2 /* TX can be sent on the connection, but not preferred */ + +typedef struct ksock_proto { + int pro_version; /* version number of protocol */ + int (*pro_send_hello)(ksock_conn_t *, ksock_hello_msg_t *); /* handshake function */ + int (*pro_recv_hello)(ksock_conn_t *, ksock_hello_msg_t *, int);/* handshake function */ + void (*pro_pack)(ksock_tx_t *); /* message pack */ + void (*pro_unpack)(ksock_msg_t *); /* message unpack */ + ksock_tx_t *(*pro_queue_tx_msg)(ksock_conn_t *, ksock_tx_t *); /* queue tx on the connection */ + int (*pro_queue_tx_zcack)(ksock_conn_t *, ksock_tx_t *, __u64); /* queue ZC ack on the connection */ + int (*pro_handle_zcreq)(ksock_conn_t *, __u64, int); /* handle ZC request */ + int (*pro_handle_zcack)(ksock_conn_t *, __u64, __u64); /* handle ZC ACK */ + int (*pro_match_tx)(ksock_conn_t *, ksock_tx_t *, int); /* msg type matches the connection type: + * return value: + * return MATCH_NO : no + * return MATCH_YES : matching type + * return MATCH_MAY : can be backup */ +} ksock_proto_t; + +extern ksock_proto_t ksocknal_protocol_v1x; +extern ksock_proto_t ksocknal_protocol_v2x; +extern ksock_proto_t ksocknal_protocol_v3x; + +#define KSOCK_PROTO_V1_MAJOR LNET_PROTO_TCP_VERSION_MAJOR +#define KSOCK_PROTO_V1_MINOR LNET_PROTO_TCP_VERSION_MINOR +#define KSOCK_PROTO_V1 KSOCK_PROTO_V1_MAJOR + +#ifndef CPU_MASK_NONE +#define CPU_MASK_NONE 0UL +#endif + +static inline int +ksocknal_route_mask(void) +{ + if (!*ksocknal_tunables.ksnd_typed_conns) + return (1 << SOCKLND_CONN_ANY); + + return ((1 << SOCKLND_CONN_CONTROL) | + (1 << SOCKLND_CONN_BULK_IN) | + (1 << SOCKLND_CONN_BULK_OUT)); +} + +static inline struct list_head * +ksocknal_nid2peerlist(lnet_nid_t nid) +{ + unsigned int hash = ((unsigned int)nid) % ksocknal_data.ksnd_peer_hash_size; + + return &ksocknal_data.ksnd_peers[hash]; +} + +static inline void +ksocknal_conn_addref(ksock_conn_t *conn) +{ + LASSERT(atomic_read(&conn->ksnc_conn_refcount) > 0); + atomic_inc(&conn->ksnc_conn_refcount); +} + +extern void ksocknal_queue_zombie_conn(ksock_conn_t *conn); +extern void ksocknal_finalize_zcreq(ksock_conn_t *conn); + +static inline void +ksocknal_conn_decref(ksock_conn_t *conn) +{ + LASSERT(atomic_read(&conn->ksnc_conn_refcount) > 0); + if (atomic_dec_and_test(&conn->ksnc_conn_refcount)) + ksocknal_queue_zombie_conn(conn); +} + +static inline int +ksocknal_connsock_addref(ksock_conn_t *conn) +{ + int rc = -ESHUTDOWN; + + read_lock(&ksocknal_data.ksnd_global_lock); + if (!conn->ksnc_closing) { + LASSERT(atomic_read(&conn->ksnc_sock_refcount) > 0); + atomic_inc(&conn->ksnc_sock_refcount); + rc = 0; + } + read_unlock(&ksocknal_data.ksnd_global_lock); + + return rc; +} + +static inline void +ksocknal_connsock_decref(ksock_conn_t *conn) +{ + LASSERT(atomic_read(&conn->ksnc_sock_refcount) > 0); + if (atomic_dec_and_test(&conn->ksnc_sock_refcount)) { + LASSERT(conn->ksnc_closing); + libcfs_sock_release(conn->ksnc_sock); + conn->ksnc_sock = NULL; + ksocknal_finalize_zcreq(conn); + } 
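+	/* Editorial note, hedged: two counters guard different objects.
+	 * ksnc_conn_refcount keeps the ksock_conn_t itself alive (the
+	 * last ref queues it as a zombie for the reaper), while
+	 * ksnc_sock_refcount pins the underlying struct socket; only
+	 * when the final socket ref drops, as above, is the socket
+	 * released and any pending zero-copy state finalized. */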
+} + +static inline void +ksocknal_tx_addref(ksock_tx_t *tx) +{ + LASSERT(atomic_read(&tx->tx_refcount) > 0); + atomic_inc(&tx->tx_refcount); +} + +extern void ksocknal_tx_prep(ksock_conn_t *, ksock_tx_t *tx); +extern void ksocknal_tx_done(lnet_ni_t *ni, ksock_tx_t *tx); + +static inline void +ksocknal_tx_decref(ksock_tx_t *tx) +{ + LASSERT(atomic_read(&tx->tx_refcount) > 0); + if (atomic_dec_and_test(&tx->tx_refcount)) + ksocknal_tx_done(NULL, tx); +} + +static inline void +ksocknal_route_addref(ksock_route_t *route) +{ + LASSERT(atomic_read(&route->ksnr_refcount) > 0); + atomic_inc(&route->ksnr_refcount); +} + +extern void ksocknal_destroy_route(ksock_route_t *route); + +static inline void +ksocknal_route_decref(ksock_route_t *route) +{ + LASSERT(atomic_read(&route->ksnr_refcount) > 0); + if (atomic_dec_and_test(&route->ksnr_refcount)) + ksocknal_destroy_route(route); +} + +static inline void +ksocknal_peer_addref(ksock_peer_t *peer) +{ + LASSERT(atomic_read(&peer->ksnp_refcount) > 0); + atomic_inc(&peer->ksnp_refcount); +} + +extern void ksocknal_destroy_peer(ksock_peer_t *peer); + +static inline void +ksocknal_peer_decref(ksock_peer_t *peer) +{ + LASSERT(atomic_read(&peer->ksnp_refcount) > 0); + if (atomic_dec_and_test(&peer->ksnp_refcount)) + ksocknal_destroy_peer(peer); +} + +int ksocknal_startup(lnet_ni_t *ni); +void ksocknal_shutdown(lnet_ni_t *ni); +int ksocknal_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg); +int ksocknal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg); +int ksocknal_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, + int delayed, unsigned int niov, + struct kvec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen); +int ksocknal_accept(lnet_ni_t *ni, struct socket *sock); + +extern int ksocknal_add_peer(lnet_ni_t *ni, lnet_process_id_t id, __u32 ip, int port); +extern ksock_peer_t *ksocknal_find_peer_locked(lnet_ni_t *ni, lnet_process_id_t id); +extern ksock_peer_t *ksocknal_find_peer(lnet_ni_t *ni, lnet_process_id_t id); +extern void ksocknal_peer_failed(ksock_peer_t *peer); +extern int ksocknal_create_conn(lnet_ni_t *ni, ksock_route_t *route, + struct socket *sock, int type); +extern void ksocknal_close_conn_locked(ksock_conn_t *conn, int why); +extern void ksocknal_terminate_conn(ksock_conn_t *conn); +extern void ksocknal_destroy_conn(ksock_conn_t *conn); +extern int ksocknal_close_peer_conns_locked(ksock_peer_t *peer, + __u32 ipaddr, int why); +extern int ksocknal_close_conn_and_siblings(ksock_conn_t *conn, int why); +extern int ksocknal_close_matching_conns(lnet_process_id_t id, __u32 ipaddr); +extern ksock_conn_t *ksocknal_find_conn_locked(ksock_peer_t *peer, + ksock_tx_t *tx, int nonblk); + +extern int ksocknal_launch_packet(lnet_ni_t *ni, ksock_tx_t *tx, + lnet_process_id_t id); +extern ksock_tx_t *ksocknal_alloc_tx(int type, int size); +extern void ksocknal_free_tx(ksock_tx_t *tx); +extern ksock_tx_t *ksocknal_alloc_tx_noop(__u64 cookie, int nonblk); +extern void ksocknal_next_tx_carrier(ksock_conn_t *conn); +extern void ksocknal_queue_tx_locked(ksock_tx_t *tx, ksock_conn_t *conn); +extern void ksocknal_txlist_done(lnet_ni_t *ni, struct list_head *txlist, + int error); +extern void ksocknal_notify(lnet_ni_t *ni, lnet_nid_t gw_nid, int alive); +extern void ksocknal_query(struct lnet_ni *ni, lnet_nid_t nid, unsigned long *when); +extern int ksocknal_thread_start(int (*fn)(void *arg), void *arg, char *name); +extern void ksocknal_thread_fini(void); +extern void 
ksocknal_launch_all_connections_locked(ksock_peer_t *peer); +extern ksock_route_t *ksocknal_find_connectable_route_locked(ksock_peer_t *peer); +extern ksock_route_t *ksocknal_find_connecting_route_locked(ksock_peer_t *peer); +extern int ksocknal_new_packet(ksock_conn_t *conn, int skip); +extern int ksocknal_scheduler(void *arg); +extern int ksocknal_connd(void *arg); +extern int ksocknal_reaper(void *arg); +extern int ksocknal_send_hello(lnet_ni_t *ni, ksock_conn_t *conn, + lnet_nid_t peer_nid, ksock_hello_msg_t *hello); +extern int ksocknal_recv_hello(lnet_ni_t *ni, ksock_conn_t *conn, + ksock_hello_msg_t *hello, lnet_process_id_t *id, + __u64 *incarnation); +extern void ksocknal_read_callback(ksock_conn_t *conn); +extern void ksocknal_write_callback(ksock_conn_t *conn); + +extern int ksocknal_lib_zc_capable(ksock_conn_t *conn); +extern void ksocknal_lib_save_callback(struct socket *sock, ksock_conn_t *conn); +extern void ksocknal_lib_set_callback(struct socket *sock, ksock_conn_t *conn); +extern void ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn); +extern void ksocknal_lib_push_conn(ksock_conn_t *conn); +extern int ksocknal_lib_get_conn_addrs(ksock_conn_t *conn); +extern int ksocknal_lib_setup_sock(struct socket *so); +extern int ksocknal_lib_send_iov(ksock_conn_t *conn, ksock_tx_t *tx); +extern int ksocknal_lib_send_kiov(ksock_conn_t *conn, ksock_tx_t *tx); +extern void ksocknal_lib_eager_ack(ksock_conn_t *conn); +extern int ksocknal_lib_recv_iov(ksock_conn_t *conn); +extern int ksocknal_lib_recv_kiov(ksock_conn_t *conn); +extern int ksocknal_lib_get_conn_tunables(ksock_conn_t *conn, int *txmem, + int *rxmem, int *nagle); + +extern int ksocknal_tunables_init(void); + +extern void ksocknal_lib_csum_tx(ksock_tx_t *tx); + +extern int ksocknal_lib_memory_pressure(ksock_conn_t *conn); +extern int ksocknal_lib_bind_thread_to_cpu(int id); diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c new file mode 100644 index 000000000..fa7ad883b --- /dev/null +++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_cb.c @@ -0,0 +1,2634 @@ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2012, Intel Corporation. + * + * Author: Zach Brown + * Author: Peter J. Braam + * Author: Phil Schwan + * Author: Eric Barton + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include "socklnd.h" + +ksock_tx_t * +ksocknal_alloc_tx(int type, int size) +{ + ksock_tx_t *tx = NULL; + + if (type == KSOCK_MSG_NOOP) { + LASSERT(size == KSOCK_NOOP_TX_SIZE); + + /* searching for a noop tx in free list */ + spin_lock(&ksocknal_data.ksnd_tx_lock); + + if (!list_empty(&ksocknal_data.ksnd_idle_noop_txs)) { + tx = list_entry(ksocknal_data.ksnd_idle_noop_txs. 
\ + next, ksock_tx_t, tx_list); + LASSERT(tx->tx_desc_size == size); + list_del(&tx->tx_list); + } + + spin_unlock(&ksocknal_data.ksnd_tx_lock); + } + + if (tx == NULL) + LIBCFS_ALLOC(tx, size); + + if (tx == NULL) + return NULL; + + atomic_set(&tx->tx_refcount, 1); + tx->tx_zc_aborted = 0; + tx->tx_zc_capable = 0; + tx->tx_zc_checked = 0; + tx->tx_desc_size = size; + + atomic_inc(&ksocknal_data.ksnd_nactive_txs); + + return tx; +} + +ksock_tx_t * +ksocknal_alloc_tx_noop(__u64 cookie, int nonblk) +{ + ksock_tx_t *tx; + + tx = ksocknal_alloc_tx(KSOCK_MSG_NOOP, KSOCK_NOOP_TX_SIZE); + if (tx == NULL) { + CERROR("Can't allocate noop tx desc\n"); + return NULL; + } + + tx->tx_conn = NULL; + tx->tx_lnetmsg = NULL; + tx->tx_kiov = NULL; + tx->tx_nkiov = 0; + tx->tx_iov = tx->tx_frags.virt.iov; + tx->tx_niov = 1; + tx->tx_nonblk = nonblk; + + socklnd_init_msg(&tx->tx_msg, KSOCK_MSG_NOOP); + tx->tx_msg.ksm_zc_cookies[1] = cookie; + + return tx; +} + + +void +ksocknal_free_tx (ksock_tx_t *tx) +{ + atomic_dec(&ksocknal_data.ksnd_nactive_txs); + + if (tx->tx_lnetmsg == NULL && tx->tx_desc_size == KSOCK_NOOP_TX_SIZE) { + /* it's a noop tx */ + spin_lock(&ksocknal_data.ksnd_tx_lock); + + list_add(&tx->tx_list, &ksocknal_data.ksnd_idle_noop_txs); + + spin_unlock(&ksocknal_data.ksnd_tx_lock); + } else { + LIBCFS_FREE(tx, tx->tx_desc_size); + } +} + +static int +ksocknal_send_iov (ksock_conn_t *conn, ksock_tx_t *tx) +{ + struct kvec *iov = tx->tx_iov; + int nob; + int rc; + + LASSERT (tx->tx_niov > 0); + + /* Never touch tx->tx_iov inside ksocknal_lib_send_iov() */ + rc = ksocknal_lib_send_iov(conn, tx); + + if (rc <= 0) /* sent nothing? */ + return rc; + + nob = rc; + LASSERT (nob <= tx->tx_resid); + tx->tx_resid -= nob; + + /* "consume" iov */ + do { + LASSERT (tx->tx_niov > 0); + + if (nob < (int) iov->iov_len) { + iov->iov_base = (void *)((char *)iov->iov_base + nob); + iov->iov_len -= nob; + return rc; + } + + nob -= iov->iov_len; + tx->tx_iov = ++iov; + tx->tx_niov--; + } while (nob != 0); + + return rc; +} + +static int +ksocknal_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) +{ + lnet_kiov_t *kiov = tx->tx_kiov; + int nob; + int rc; + + LASSERT (tx->tx_niov == 0); + LASSERT (tx->tx_nkiov > 0); + + /* Never touch tx->tx_kiov inside ksocknal_lib_send_kiov() */ + rc = ksocknal_lib_send_kiov(conn, tx); + + if (rc <= 0) /* sent nothing? */ + return rc; + + nob = rc; + LASSERT (nob <= tx->tx_resid); + tx->tx_resid -= nob; + + /* "consume" kiov */ + do { + LASSERT(tx->tx_nkiov > 0); + + if (nob < (int)kiov->kiov_len) { + kiov->kiov_offset += nob; + kiov->kiov_len -= nob; + return rc; + } + + nob -= (int)kiov->kiov_len; + tx->tx_kiov = ++kiov; + tx->tx_nkiov--; + } while (nob != 0); + + return rc; +} + +static int +ksocknal_transmit (ksock_conn_t *conn, ksock_tx_t *tx) +{ + int rc; + int bufnob; + + if (ksocknal_data.ksnd_stall_tx != 0) { + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(ksocknal_data.ksnd_stall_tx)); + } + + LASSERT (tx->tx_resid != 0); + + rc = ksocknal_connsock_addref(conn); + if (rc != 0) { + LASSERT (conn->ksnc_closing); + return -ESHUTDOWN; + } + + do { + if (ksocknal_data.ksnd_enomem_tx > 0) { + /* testing... */ + ksocknal_data.ksnd_enomem_tx--; + rc = -EAGAIN; + } else if (tx->tx_niov != 0) { + rc = ksocknal_send_iov (conn, tx); + } else { + rc = ksocknal_send_kiov (conn, tx); + } + + bufnob = conn->ksnc_sock->sk->sk_wmem_queued; + if (rc > 0) /* sent something? 
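+		 * (editorial, a hedged reading: ksnc_tx_bufnob is our own
+		 * running count of bytes handed to the socket, while
+		 * sk_wmem_queued shrinks as TCP frees ACKed data, so an
+		 * actual queue smaller than the count implies the peer
+		 * ACKed something; the block below then refreshes the tx
+		 * deadline and ksnp_last_alive without any explicit
+		 * keepalive traffic.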
*/ + conn->ksnc_tx_bufnob += rc; /* account it */ + + if (bufnob < conn->ksnc_tx_bufnob) { + /* allocated send buffer bytes < computed; infer + * something got ACKed */ + conn->ksnc_tx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + conn->ksnc_peer->ksnp_last_alive = cfs_time_current(); + conn->ksnc_tx_bufnob = bufnob; + mb(); + } + + if (rc <= 0) { /* Didn't write anything? */ + + if (rc == 0) /* some stacks return 0 instead of -EAGAIN */ + rc = -EAGAIN; + + /* Check if EAGAIN is due to memory pressure */ + if (rc == -EAGAIN && ksocknal_lib_memory_pressure(conn)) + rc = -ENOMEM; + + break; + } + + /* socket's wmem_queued now includes 'rc' bytes */ + atomic_sub (rc, &conn->ksnc_tx_nob); + rc = 0; + + } while (tx->tx_resid != 0); + + ksocknal_connsock_decref(conn); + return rc; +} + +static int +ksocknal_recv_iov (ksock_conn_t *conn) +{ + struct kvec *iov = conn->ksnc_rx_iov; + int nob; + int rc; + + LASSERT (conn->ksnc_rx_niov > 0); + + /* Never touch conn->ksnc_rx_iov or change connection + * status inside ksocknal_lib_recv_iov */ + rc = ksocknal_lib_recv_iov(conn); + + if (rc <= 0) + return rc; + + /* received something... */ + nob = rc; + + conn->ksnc_peer->ksnp_last_alive = cfs_time_current(); + conn->ksnc_rx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + mb(); /* order with setting rx_started */ + conn->ksnc_rx_started = 1; + + conn->ksnc_rx_nob_wanted -= nob; + conn->ksnc_rx_nob_left -= nob; + + do { + LASSERT (conn->ksnc_rx_niov > 0); + + if (nob < (int)iov->iov_len) { + iov->iov_len -= nob; + iov->iov_base += nob; + return -EAGAIN; + } + + nob -= iov->iov_len; + conn->ksnc_rx_iov = ++iov; + conn->ksnc_rx_niov--; + } while (nob != 0); + + return rc; +} + +static int +ksocknal_recv_kiov (ksock_conn_t *conn) +{ + lnet_kiov_t *kiov = conn->ksnc_rx_kiov; + int nob; + int rc; + LASSERT (conn->ksnc_rx_nkiov > 0); + + /* Never touch conn->ksnc_rx_kiov or change connection + * status inside ksocknal_lib_recv_iov */ + rc = ksocknal_lib_recv_kiov(conn); + + if (rc <= 0) + return rc; + + /* received something... */ + nob = rc; + + conn->ksnc_peer->ksnp_last_alive = cfs_time_current(); + conn->ksnc_rx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + mb(); /* order with setting rx_started */ + conn->ksnc_rx_started = 1; + + conn->ksnc_rx_nob_wanted -= nob; + conn->ksnc_rx_nob_left -= nob; + + do { + LASSERT (conn->ksnc_rx_nkiov > 0); + + if (nob < (int) kiov->kiov_len) { + kiov->kiov_offset += nob; + kiov->kiov_len -= nob; + return -EAGAIN; + } + + nob -= kiov->kiov_len; + conn->ksnc_rx_kiov = ++kiov; + conn->ksnc_rx_nkiov--; + } while (nob != 0); + + return 1; +} + +static int +ksocknal_receive (ksock_conn_t *conn) +{ + /* Return 1 on success, 0 on EOF, < 0 on error. + * Caller checks ksnc_rx_nob_wanted to determine + * progress/completion. 
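+	 * (editorial sketch of the convention, hedged:
+	 *	rc > 0  : progress was made; if ksnc_rx_nob_wanted is now
+	 *		  0, the whole unit has arrived
+	 *	rc == 0 : clean EOF before the current message started
+	 *	rc < 0  : hard error, e.g. -EPROTO when EOF arrives in
+	 *		  the middle of a message)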
*/ + int rc; + + if (ksocknal_data.ksnd_stall_rx != 0) { + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(cfs_time_seconds(ksocknal_data.ksnd_stall_rx)); + } + + rc = ksocknal_connsock_addref(conn); + if (rc != 0) { + LASSERT (conn->ksnc_closing); + return -ESHUTDOWN; + } + + for (;;) { + if (conn->ksnc_rx_niov != 0) + rc = ksocknal_recv_iov (conn); + else + rc = ksocknal_recv_kiov (conn); + + if (rc <= 0) { + /* error/EOF or partial receive */ + if (rc == -EAGAIN) { + rc = 1; + } else if (rc == 0 && conn->ksnc_rx_started) { + /* EOF in the middle of a message */ + rc = -EPROTO; + } + break; + } + + /* Completed a fragment */ + + if (conn->ksnc_rx_nob_wanted == 0) { + rc = 1; + break; + } + } + + ksocknal_connsock_decref(conn); + return rc; +} + +void +ksocknal_tx_done (lnet_ni_t *ni, ksock_tx_t *tx) +{ + lnet_msg_t *lnetmsg = tx->tx_lnetmsg; + int rc = (tx->tx_resid == 0 && !tx->tx_zc_aborted) ? 0 : -EIO; + + LASSERT(ni != NULL || tx->tx_conn != NULL); + + if (tx->tx_conn != NULL) + ksocknal_conn_decref(tx->tx_conn); + + if (ni == NULL && tx->tx_conn != NULL) + ni = tx->tx_conn->ksnc_peer->ksnp_ni; + + ksocknal_free_tx (tx); + if (lnetmsg != NULL) /* KSOCK_MSG_NOOP go without lnetmsg */ + lnet_finalize (ni, lnetmsg, rc); +} + +void +ksocknal_txlist_done (lnet_ni_t *ni, struct list_head *txlist, int error) +{ + ksock_tx_t *tx; + + while (!list_empty (txlist)) { + tx = list_entry (txlist->next, ksock_tx_t, tx_list); + + if (error && tx->tx_lnetmsg != NULL) { + CNETERR("Deleting packet type %d len %d %s->%s\n", + le32_to_cpu (tx->tx_lnetmsg->msg_hdr.type), + le32_to_cpu (tx->tx_lnetmsg->msg_hdr.payload_length), + libcfs_nid2str(le64_to_cpu(tx->tx_lnetmsg->msg_hdr.src_nid)), + libcfs_nid2str(le64_to_cpu(tx->tx_lnetmsg->msg_hdr.dest_nid))); + } else if (error) { + CNETERR("Deleting noop packet\n"); + } + + list_del (&tx->tx_list); + + LASSERT (atomic_read(&tx->tx_refcount) == 1); + ksocknal_tx_done (ni, tx); + } +} + +static void +ksocknal_check_zc_req(ksock_tx_t *tx) +{ + ksock_conn_t *conn = tx->tx_conn; + ksock_peer_t *peer = conn->ksnc_peer; + + /* Set tx_msg.ksm_zc_cookies[0] to a unique non-zero cookie and add tx + * to ksnp_zc_req_list if some fragment of this message should be sent + * zero-copy. Our peer will send an ACK containing this cookie when + * she has received this message to tell us we can signal completion. + * tx_msg.ksm_zc_cookies[0] remains non-zero while tx is on + * ksnp_zc_req_list. */ + LASSERT (tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); + LASSERT (tx->tx_zc_capable); + + tx->tx_zc_checked = 1; + + if (conn->ksnc_proto == &ksocknal_protocol_v1x || + !conn->ksnc_zc_capable) + return; + + /* assign cookie and queue tx to pending list, it will be released when + * a matching ack is received. 
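+	 * (editorial gloss of the cookie round trip; the send half is
+	 * exactly the code below, the ack half is an assumption from the
+	 * handler's name:
+	 *
+	 *	send:	ksm_zc_cookies[0] = peer->ksnp_zc_next_cookie++;
+	 *		list_add_tail(&tx->tx_zc_list,
+	 *			      &peer->ksnp_zc_req_list);
+	 *	ack:	the peer echoes the cookie once the payload has
+	 *		landed; the tx is found by cookie, unlinked and
+	 *		its extra ref dropped.
+	 *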
See ksocknal_handle_zcack() */ + + ksocknal_tx_addref(tx); + + spin_lock(&peer->ksnp_lock); + + /* ZC_REQ is going to be pinned to the peer */ + tx->tx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + + LASSERT (tx->tx_msg.ksm_zc_cookies[0] == 0); + + tx->tx_msg.ksm_zc_cookies[0] = peer->ksnp_zc_next_cookie++; + + if (peer->ksnp_zc_next_cookie == 0) + peer->ksnp_zc_next_cookie = SOCKNAL_KEEPALIVE_PING + 1; + + list_add_tail(&tx->tx_zc_list, &peer->ksnp_zc_req_list); + + spin_unlock(&peer->ksnp_lock); +} + +static void +ksocknal_uncheck_zc_req(ksock_tx_t *tx) +{ + ksock_peer_t *peer = tx->tx_conn->ksnc_peer; + + LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); + LASSERT(tx->tx_zc_capable); + + tx->tx_zc_checked = 0; + + spin_lock(&peer->ksnp_lock); + + if (tx->tx_msg.ksm_zc_cookies[0] == 0) { + /* Not waiting for an ACK */ + spin_unlock(&peer->ksnp_lock); + return; + } + + tx->tx_msg.ksm_zc_cookies[0] = 0; + list_del(&tx->tx_zc_list); + + spin_unlock(&peer->ksnp_lock); + + ksocknal_tx_decref(tx); +} + +static int +ksocknal_process_transmit (ksock_conn_t *conn, ksock_tx_t *tx) +{ + int rc; + + if (tx->tx_zc_capable && !tx->tx_zc_checked) + ksocknal_check_zc_req(tx); + + rc = ksocknal_transmit (conn, tx); + + CDEBUG (D_NET, "send(%d) %d\n", tx->tx_resid, rc); + + if (tx->tx_resid == 0) { + /* Sent everything OK */ + LASSERT (rc == 0); + + return 0; + } + + if (rc == -EAGAIN) + return rc; + + if (rc == -ENOMEM) { + static int counter; + + counter++; /* exponential backoff warnings */ + if ((counter & (-counter)) == counter) + CWARN("%u ENOMEM tx %p (%u allocated)\n", + counter, conn, atomic_read(&libcfs_kmemory)); + + /* Queue on ksnd_enomem_conns for retry after a timeout */ + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + + /* enomem list takes over scheduler's ref... */ + LASSERT (conn->ksnc_tx_scheduled); + list_add_tail(&conn->ksnc_tx_list, + &ksocknal_data.ksnd_enomem_conns); + if (!cfs_time_aftereq(cfs_time_add(cfs_time_current(), + SOCKNAL_ENOMEM_RETRY), + ksocknal_data.ksnd_reaper_waketime)) + wake_up (&ksocknal_data.ksnd_reaper_waitq); + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); + return rc; + } + + /* Actual error */ + LASSERT (rc < 0); + + if (!conn->ksnc_closing) { + switch (rc) { + case -ECONNRESET: + LCONSOLE_WARN("Host %pI4h reset our connection while we were sending data; it may have rebooted.\n", + &conn->ksnc_ipaddr); + break; + default: + LCONSOLE_WARN("There was an unexpected network error while writing to %pI4h: %d.\n", + &conn->ksnc_ipaddr, rc); + break; + } + CDEBUG(D_NET, "[%p] Error %d on write to %s ip %pI4h:%d\n", + conn, rc, + libcfs_id2str(conn->ksnc_peer->ksnp_id), + &conn->ksnc_ipaddr, + conn->ksnc_port); + } + + if (tx->tx_zc_checked) + ksocknal_uncheck_zc_req(tx); + + /* it's not an error if conn is being closed */ + ksocknal_close_conn_and_siblings (conn, + (conn->ksnc_closing) ? 
0 : rc); + + return rc; +} + +static void +ksocknal_launch_connection_locked (ksock_route_t *route) +{ + + /* called holding write lock on ksnd_global_lock */ + + LASSERT (!route->ksnr_scheduled); + LASSERT (!route->ksnr_connecting); + LASSERT ((ksocknal_route_mask() & ~route->ksnr_connected) != 0); + + route->ksnr_scheduled = 1; /* scheduling conn for connd */ + ksocknal_route_addref(route); /* extra ref for connd */ + + spin_lock_bh(&ksocknal_data.ksnd_connd_lock); + + list_add_tail(&route->ksnr_connd_list, + &ksocknal_data.ksnd_connd_routes); + wake_up(&ksocknal_data.ksnd_connd_waitq); + + spin_unlock_bh(&ksocknal_data.ksnd_connd_lock); +} + +void +ksocknal_launch_all_connections_locked (ksock_peer_t *peer) +{ + ksock_route_t *route; + + /* called holding write lock on ksnd_global_lock */ + for (;;) { + /* launch any/all connections that need it */ + route = ksocknal_find_connectable_route_locked(peer); + if (route == NULL) + return; + + ksocknal_launch_connection_locked(route); + } +} + +ksock_conn_t * +ksocknal_find_conn_locked(ksock_peer_t *peer, ksock_tx_t *tx, int nonblk) +{ + struct list_head *tmp; + ksock_conn_t *conn; + ksock_conn_t *typed = NULL; + ksock_conn_t *fallback = NULL; + int tnob = 0; + int fnob = 0; + + list_for_each (tmp, &peer->ksnp_conns) { + ksock_conn_t *c = list_entry(tmp, ksock_conn_t, ksnc_list); + int nob = atomic_read(&c->ksnc_tx_nob) + + c->ksnc_sock->sk->sk_wmem_queued; + int rc; + + LASSERT (!c->ksnc_closing); + LASSERT (c->ksnc_proto != NULL && + c->ksnc_proto->pro_match_tx != NULL); + + rc = c->ksnc_proto->pro_match_tx(c, tx, nonblk); + + switch (rc) { + default: + LBUG(); + case SOCKNAL_MATCH_NO: /* protocol rejected the tx */ + continue; + + case SOCKNAL_MATCH_YES: /* typed connection */ + if (typed == NULL || tnob > nob || + (tnob == nob && *ksocknal_tunables.ksnd_round_robin && + cfs_time_after(typed->ksnc_tx_last_post, c->ksnc_tx_last_post))) { + typed = c; + tnob = nob; + } + break; + + case SOCKNAL_MATCH_MAY: /* fallback connection */ + if (fallback == NULL || fnob > nob || + (fnob == nob && *ksocknal_tunables.ksnd_round_robin && + cfs_time_after(fallback->ksnc_tx_last_post, c->ksnc_tx_last_post))) { + fallback = c; + fnob = nob; + } + break; + } + } + + /* prefer the typed selection */ + conn = (typed != NULL) ? typed : fallback; + + if (conn != NULL) + conn->ksnc_tx_last_post = cfs_time_current(); + + return conn; +} + +void +ksocknal_tx_prep(ksock_conn_t *conn, ksock_tx_t *tx) +{ + conn->ksnc_proto->pro_pack(tx); + + atomic_add (tx->tx_nob, &conn->ksnc_tx_nob); + ksocknal_conn_addref(conn); /* +1 ref for tx */ + tx->tx_conn = conn; +} + +void +ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn) +{ + ksock_sched_t *sched = conn->ksnc_scheduler; + ksock_msg_t *msg = &tx->tx_msg; + ksock_tx_t *ztx = NULL; + int bufnob = 0; + + /* called holding global lock (read or irq-write) and caller may + * not have dropped this lock between finding conn and calling me, + * so we don't need the {get,put}connsock dance to deref + * ksnc_sock... */ + LASSERT(!conn->ksnc_closing); + + CDEBUG(D_NET, "Sending to %s ip %pI4h:%d\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + &conn->ksnc_ipaddr, + conn->ksnc_port); + + ksocknal_tx_prep(conn, tx); + + /* Ensure the frags we've been given EXACTLY match the number of + * bytes we want to send. Many TCP/IP stacks disregard any total + * size parameters passed to them and just look at the frags. + * + * We always expect at least 1 mapped fragment containing the + * complete ksocknal message header. 
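+	 * (editorial, hedged: were the fragments to cover more than
+	 * tx_nob, a stack that walks only the fragment list would push
+	 * the surplus bytes onto the wire and desynchronise the byte
+	 * stream, which is why the LASSERTs below insist on an exact
+	 * byte-count match.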
*/
+	LASSERT (lnet_iov_nob (tx->tx_niov, tx->tx_iov) +
+		 lnet_kiov_nob(tx->tx_nkiov, tx->tx_kiov) ==
+		 (unsigned int)tx->tx_nob);
+	LASSERT (tx->tx_niov >= 1);
+	LASSERT (tx->tx_resid == tx->tx_nob);
+
+	CDEBUG (D_NET, "Packet %p type %d, nob %d niov %d nkiov %d\n",
+		tx, (tx->tx_lnetmsg != NULL) ? tx->tx_lnetmsg->msg_hdr.type:
+					       KSOCK_MSG_NOOP,
+		tx->tx_nob, tx->tx_niov, tx->tx_nkiov);
+
+	/*
+	 * FIXME: SOCK_WMEM_QUEUED and SOCK_ERROR could block in __DARWIN8__
+	 * but they're used inside spinlocks a lot.
+	 */
+	bufnob = conn->ksnc_sock->sk->sk_wmem_queued;
+	spin_lock_bh(&sched->kss_lock);
+
+	if (list_empty(&conn->ksnc_tx_queue) && bufnob == 0) {
+		/* First packet starts the timeout */
+		conn->ksnc_tx_deadline =
+			cfs_time_shift(*ksocknal_tunables.ksnd_timeout);
+		if (conn->ksnc_tx_bufnob > 0) /* something got ACKed */
+			conn->ksnc_peer->ksnp_last_alive = cfs_time_current();
+		conn->ksnc_tx_bufnob = 0;
+		mb(); /* order with adding to tx_queue */
+	}
+
+	if (msg->ksm_type == KSOCK_MSG_NOOP) {
+		/* The packet is a noop ZC ACK; try to piggyback the ack_cookie
+		 * on a normal packet so I don't need to send it */
+		LASSERT (msg->ksm_zc_cookies[1] != 0);
+		LASSERT (conn->ksnc_proto->pro_queue_tx_zcack != NULL);
+
+		if (conn->ksnc_proto->pro_queue_tx_zcack(conn, tx, 0))
+			ztx = tx; /* ZC ACK piggybacked on ztx; release tx later */
+
+	} else {
+		/* It's a normal packet - can it piggyback a noop zc-ack that
+		 * has been queued already? */
+		LASSERT (msg->ksm_zc_cookies[1] == 0);
+		LASSERT (conn->ksnc_proto->pro_queue_tx_msg != NULL);
+
+		ztx = conn->ksnc_proto->pro_queue_tx_msg(conn, tx);
+		/* ztx will be released later */
+	}
+
+	if (ztx != NULL) {
+		atomic_sub (ztx->tx_nob, &conn->ksnc_tx_nob);
+		list_add_tail(&ztx->tx_list, &sched->kss_zombie_noop_txs);
+	}
+
+	if (conn->ksnc_tx_ready &&      /* able to send */
+	    !conn->ksnc_tx_scheduled) { /* not scheduled to send */
+		/* +1 ref for scheduler */
+		ksocknal_conn_addref(conn);
+		list_add_tail (&conn->ksnc_tx_list,
+			       &sched->kss_tx_conns);
+		conn->ksnc_tx_scheduled = 1;
+		wake_up (&sched->kss_waitq);
+	}
+
+	spin_unlock_bh(&sched->kss_lock);
+}
+
+
+ksock_route_t *
+ksocknal_find_connectable_route_locked (ksock_peer_t *peer)
+{
+	unsigned long now = cfs_time_current();
+	struct list_head *tmp;
+	ksock_route_t *route;
+
+	list_for_each (tmp, &peer->ksnp_routes) {
+		route = list_entry (tmp, ksock_route_t, ksnr_list);
+
+		LASSERT (!route->ksnr_connecting || route->ksnr_scheduled);
+
+		if (route->ksnr_scheduled)      /* connections being established */
+			continue;
+
+		/* all route types connected ?
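+		 * (i.e. every connection type in ksocknal_route_mask() is
+		 * already set in ksnr_connected, so this route has nothing
+		 * left to establish)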
*/ + if ((ksocknal_route_mask() & ~route->ksnr_connected) == 0) + continue; + + if (!(route->ksnr_retry_interval == 0 || /* first attempt */ + cfs_time_aftereq(now, route->ksnr_timeout))) { + CDEBUG(D_NET, + "Too soon to retry route %pI4h (cnted %d, interval %ld, %ld secs later)\n", + &route->ksnr_ipaddr, + route->ksnr_connected, + route->ksnr_retry_interval, + cfs_duration_sec(route->ksnr_timeout - now)); + continue; + } + + return route; + } + + return NULL; +} + +ksock_route_t * +ksocknal_find_connecting_route_locked (ksock_peer_t *peer) +{ + struct list_head *tmp; + ksock_route_t *route; + + list_for_each (tmp, &peer->ksnp_routes) { + route = list_entry (tmp, ksock_route_t, ksnr_list); + + LASSERT (!route->ksnr_connecting || route->ksnr_scheduled); + + if (route->ksnr_scheduled) + return route; + } + + return NULL; +} + +int +ksocknal_launch_packet (lnet_ni_t *ni, ksock_tx_t *tx, lnet_process_id_t id) +{ + ksock_peer_t *peer; + ksock_conn_t *conn; + rwlock_t *g_lock; + int retry; + int rc; + + LASSERT (tx->tx_conn == NULL); + + g_lock = &ksocknal_data.ksnd_global_lock; + + for (retry = 0;; retry = 1) { + read_lock(g_lock); + peer = ksocknal_find_peer_locked(ni, id); + if (peer != NULL) { + if (ksocknal_find_connectable_route_locked(peer) == NULL) { + conn = ksocknal_find_conn_locked(peer, tx, tx->tx_nonblk); + if (conn != NULL) { + /* I've got no routes that need to be + * connecting and I do have an actual + * connection... */ + ksocknal_queue_tx_locked (tx, conn); + read_unlock(g_lock); + return 0; + } + } + } + + /* I'll need a write lock... */ + read_unlock(g_lock); + + write_lock_bh(g_lock); + + peer = ksocknal_find_peer_locked(ni, id); + if (peer != NULL) + break; + + write_unlock_bh(g_lock); + + if ((id.pid & LNET_PID_USERFLAG) != 0) { + CERROR("Refusing to create a connection to userspace process %s\n", + libcfs_id2str(id)); + return -EHOSTUNREACH; + } + + if (retry) { + CERROR("Can't find peer %s\n", libcfs_id2str(id)); + return -EHOSTUNREACH; + } + + rc = ksocknal_add_peer(ni, id, + LNET_NIDADDR(id.nid), + lnet_acceptor_port()); + if (rc != 0) { + CERROR("Can't add peer %s: %d\n", + libcfs_id2str(id), rc); + return rc; + } + } + + ksocknal_launch_all_connections_locked(peer); + + conn = ksocknal_find_conn_locked(peer, tx, tx->tx_nonblk); + if (conn != NULL) { + /* Connection exists; queue message on it */ + ksocknal_queue_tx_locked (tx, conn); + write_unlock_bh(g_lock); + return 0; + } + + if (peer->ksnp_accepting > 0 || + ksocknal_find_connecting_route_locked (peer) != NULL) { + /* the message is going to be pinned to the peer */ + tx->tx_deadline = + cfs_time_shift(*ksocknal_tunables.ksnd_timeout); + + /* Queue the message until a connection is established */ + list_add_tail (&tx->tx_list, &peer->ksnp_tx_queue); + write_unlock_bh(g_lock); + return 0; + } + + write_unlock_bh(g_lock); + + /* NB Routes may be ignored if connections to them failed recently */ + CNETERR("No usable routes to %s\n", libcfs_id2str(id)); + return -EHOSTUNREACH; +} + +int +ksocknal_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) +{ + int mpflag = 1; + int type = lntmsg->msg_type; + lnet_process_id_t target = lntmsg->msg_target; + unsigned int payload_niov = lntmsg->msg_niov; + struct kvec *payload_iov = lntmsg->msg_iov; + lnet_kiov_t *payload_kiov = lntmsg->msg_kiov; + unsigned int payload_offset = lntmsg->msg_offset; + unsigned int payload_nob = lntmsg->msg_len; + ksock_tx_t *tx; + int desc_size; + int rc; + + /* NB 'private' is different depending on what we're sending. + * Just ignore it... 
*/ + + CDEBUG(D_NET, "sending %u bytes in %d frags to %s\n", + payload_nob, payload_niov, libcfs_id2str(target)); + + LASSERT (payload_nob == 0 || payload_niov > 0); + LASSERT (payload_niov <= LNET_MAX_IOV); + /* payload is either all vaddrs or all pages */ + LASSERT (!(payload_kiov != NULL && payload_iov != NULL)); + LASSERT (!in_interrupt ()); + + if (payload_iov != NULL) + desc_size = offsetof(ksock_tx_t, + tx_frags.virt.iov[1 + payload_niov]); + else + desc_size = offsetof(ksock_tx_t, + tx_frags.paged.kiov[payload_niov]); + + if (lntmsg->msg_vmflush) + mpflag = cfs_memory_pressure_get_and_set(); + tx = ksocknal_alloc_tx(KSOCK_MSG_LNET, desc_size); + if (tx == NULL) { + CERROR("Can't allocate tx desc type %d size %d\n", + type, desc_size); + if (lntmsg->msg_vmflush) + cfs_memory_pressure_restore(mpflag); + return -ENOMEM; + } + + tx->tx_conn = NULL; /* set when assigned a conn */ + tx->tx_lnetmsg = lntmsg; + + if (payload_iov != NULL) { + tx->tx_kiov = NULL; + tx->tx_nkiov = 0; + tx->tx_iov = tx->tx_frags.virt.iov; + tx->tx_niov = 1 + + lnet_extract_iov(payload_niov, &tx->tx_iov[1], + payload_niov, payload_iov, + payload_offset, payload_nob); + } else { + tx->tx_niov = 1; + tx->tx_iov = &tx->tx_frags.paged.iov; + tx->tx_kiov = tx->tx_frags.paged.kiov; + tx->tx_nkiov = lnet_extract_kiov(payload_niov, tx->tx_kiov, + payload_niov, payload_kiov, + payload_offset, payload_nob); + + if (payload_nob >= *ksocknal_tunables.ksnd_zc_min_payload) + tx->tx_zc_capable = 1; + } + + socklnd_init_msg(&tx->tx_msg, KSOCK_MSG_LNET); + + /* The first fragment will be set later in pro_pack */ + rc = ksocknal_launch_packet(ni, tx, target); + if (!mpflag) + cfs_memory_pressure_restore(mpflag); + + if (rc == 0) + return 0; + + ksocknal_free_tx(tx); + return -EIO; +} + +int +ksocknal_thread_start(int (*fn)(void *arg), void *arg, char *name) +{ + struct task_struct *task = kthread_run(fn, arg, "%s", name); + + if (IS_ERR(task)) + return PTR_ERR(task); + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + ksocknal_data.ksnd_nthreads++; + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + return 0; +} + +void +ksocknal_thread_fini (void) +{ + write_lock_bh(&ksocknal_data.ksnd_global_lock); + ksocknal_data.ksnd_nthreads--; + write_unlock_bh(&ksocknal_data.ksnd_global_lock); +} + +int +ksocknal_new_packet (ksock_conn_t *conn, int nob_to_skip) +{ + static char ksocknal_slop_buffer[4096]; + + int nob; + unsigned int niov; + int skipped; + + LASSERT(conn->ksnc_proto != NULL); + + if ((*ksocknal_tunables.ksnd_eager_ack & conn->ksnc_type) != 0) { + /* Remind the socket to ack eagerly... 
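+		 * (on Linux, ksocknal_lib_eager_ack() below does this by
+		 * setting the TCP_QUICKACK socket option)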
*/ + ksocknal_lib_eager_ack(conn); + } + + if (nob_to_skip == 0) { /* right at next packet boundary now */ + conn->ksnc_rx_started = 0; + mb(); /* racing with timeout thread */ + + switch (conn->ksnc_proto->pro_version) { + case KSOCK_PROTO_V2: + case KSOCK_PROTO_V3: + conn->ksnc_rx_state = SOCKNAL_RX_KSM_HEADER; + conn->ksnc_rx_iov = (struct kvec *)&conn->ksnc_rx_iov_space; + conn->ksnc_rx_iov[0].iov_base = &conn->ksnc_msg; + + conn->ksnc_rx_nob_wanted = offsetof(ksock_msg_t, ksm_u); + conn->ksnc_rx_nob_left = offsetof(ksock_msg_t, ksm_u); + conn->ksnc_rx_iov[0].iov_len = offsetof(ksock_msg_t, ksm_u); + break; + + case KSOCK_PROTO_V1: + /* Receiving bare lnet_hdr_t */ + conn->ksnc_rx_state = SOCKNAL_RX_LNET_HEADER; + conn->ksnc_rx_nob_wanted = sizeof(lnet_hdr_t); + conn->ksnc_rx_nob_left = sizeof(lnet_hdr_t); + + conn->ksnc_rx_iov = (struct kvec *)&conn->ksnc_rx_iov_space; + conn->ksnc_rx_iov[0].iov_base = &conn->ksnc_msg.ksm_u.lnetmsg; + conn->ksnc_rx_iov[0].iov_len = sizeof (lnet_hdr_t); + break; + + default: + LBUG (); + } + conn->ksnc_rx_niov = 1; + + conn->ksnc_rx_kiov = NULL; + conn->ksnc_rx_nkiov = 0; + conn->ksnc_rx_csum = ~0; + return 1; + } + + /* Set up to skip as much as possible now. If there's more left + * (ran out of iov entries) we'll get called again */ + + conn->ksnc_rx_state = SOCKNAL_RX_SLOP; + conn->ksnc_rx_nob_left = nob_to_skip; + conn->ksnc_rx_iov = (struct kvec *)&conn->ksnc_rx_iov_space; + skipped = 0; + niov = 0; + + do { + nob = min_t(int, nob_to_skip, sizeof(ksocknal_slop_buffer)); + + conn->ksnc_rx_iov[niov].iov_base = ksocknal_slop_buffer; + conn->ksnc_rx_iov[niov].iov_len = nob; + niov++; + skipped += nob; + nob_to_skip -=nob; + + } while (nob_to_skip != 0 && /* mustn't overflow conn's rx iov */ + niov < sizeof(conn->ksnc_rx_iov_space) / sizeof (struct iovec)); + + conn->ksnc_rx_niov = niov; + conn->ksnc_rx_kiov = NULL; + conn->ksnc_rx_nkiov = 0; + conn->ksnc_rx_nob_wanted = skipped; + return 0; +} + +static int +ksocknal_process_receive (ksock_conn_t *conn) +{ + lnet_hdr_t *lhdr; + lnet_process_id_t *id; + int rc; + + LASSERT (atomic_read(&conn->ksnc_conn_refcount) > 0); + + /* NB: sched lock NOT held */ + /* SOCKNAL_RX_LNET_HEADER is here for backward compatibility */ + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_KSM_HEADER || + conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD || + conn->ksnc_rx_state == SOCKNAL_RX_LNET_HEADER || + conn->ksnc_rx_state == SOCKNAL_RX_SLOP); + again: + if (conn->ksnc_rx_nob_wanted != 0) { + rc = ksocknal_receive(conn); + + if (rc <= 0) { + LASSERT (rc != -EAGAIN); + + if (rc == 0) + CDEBUG(D_NET, "[%p] EOF from %s ip %pI4h:%d\n", + conn, + libcfs_id2str(conn->ksnc_peer->ksnp_id), + &conn->ksnc_ipaddr, + conn->ksnc_port); + else if (!conn->ksnc_closing) + CERROR("[%p] Error %d on read from %s ip %pI4h:%d\n", + conn, rc, + libcfs_id2str(conn->ksnc_peer->ksnp_id), + &conn->ksnc_ipaddr, + conn->ksnc_port); + + /* it's not an error if conn is being closed */ + ksocknal_close_conn_and_siblings (conn, + (conn->ksnc_closing) ? 0 : rc); + return (rc == 0 ? 
-ESHUTDOWN : rc); + } + + if (conn->ksnc_rx_nob_wanted != 0) { + /* short read */ + return -EAGAIN; + } + } + switch (conn->ksnc_rx_state) { + case SOCKNAL_RX_KSM_HEADER: + if (conn->ksnc_flip) { + __swab32s(&conn->ksnc_msg.ksm_type); + __swab32s(&conn->ksnc_msg.ksm_csum); + __swab64s(&conn->ksnc_msg.ksm_zc_cookies[0]); + __swab64s(&conn->ksnc_msg.ksm_zc_cookies[1]); + } + + if (conn->ksnc_msg.ksm_type != KSOCK_MSG_NOOP && + conn->ksnc_msg.ksm_type != KSOCK_MSG_LNET) { + CERROR("%s: Unknown message type: %x\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + conn->ksnc_msg.ksm_type); + ksocknal_new_packet(conn, 0); + ksocknal_close_conn_and_siblings(conn, -EPROTO); + return -EPROTO; + } + + if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP && + conn->ksnc_msg.ksm_csum != 0 && /* has checksum */ + conn->ksnc_msg.ksm_csum != conn->ksnc_rx_csum) { + /* NOOP Checksum error */ + CERROR("%s: Checksum error, wire:0x%08X data:0x%08X\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + conn->ksnc_msg.ksm_csum, conn->ksnc_rx_csum); + ksocknal_new_packet(conn, 0); + ksocknal_close_conn_and_siblings(conn, -EPROTO); + return -EIO; + } + + if (conn->ksnc_msg.ksm_zc_cookies[1] != 0) { + __u64 cookie = 0; + + LASSERT (conn->ksnc_proto != &ksocknal_protocol_v1x); + + if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP) + cookie = conn->ksnc_msg.ksm_zc_cookies[0]; + + rc = conn->ksnc_proto->pro_handle_zcack(conn, cookie, + conn->ksnc_msg.ksm_zc_cookies[1]); + + if (rc != 0) { + CERROR("%s: Unknown ZC-ACK cookie: %llu, %llu\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + cookie, conn->ksnc_msg.ksm_zc_cookies[1]); + ksocknal_new_packet(conn, 0); + ksocknal_close_conn_and_siblings(conn, -EPROTO); + return rc; + } + } + + if (conn->ksnc_msg.ksm_type == KSOCK_MSG_NOOP) { + ksocknal_new_packet (conn, 0); + return 0; /* NOOP is done and just return */ + } + + conn->ksnc_rx_state = SOCKNAL_RX_LNET_HEADER; + conn->ksnc_rx_nob_wanted = sizeof(ksock_lnet_msg_t); + conn->ksnc_rx_nob_left = sizeof(ksock_lnet_msg_t); + + conn->ksnc_rx_iov = (struct kvec *)&conn->ksnc_rx_iov_space; + conn->ksnc_rx_iov[0].iov_base = &conn->ksnc_msg.ksm_u.lnetmsg; + conn->ksnc_rx_iov[0].iov_len = sizeof(ksock_lnet_msg_t); + + conn->ksnc_rx_niov = 1; + conn->ksnc_rx_kiov = NULL; + conn->ksnc_rx_nkiov = 0; + + goto again; /* read lnet header now */ + + case SOCKNAL_RX_LNET_HEADER: + /* unpack message header */ + conn->ksnc_proto->pro_unpack(&conn->ksnc_msg); + + if ((conn->ksnc_peer->ksnp_id.pid & LNET_PID_USERFLAG) != 0) { + /* Userspace peer */ + lhdr = &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr; + id = &conn->ksnc_peer->ksnp_id; + + /* Substitute process ID assigned at connection time */ + lhdr->src_pid = cpu_to_le32(id->pid); + lhdr->src_nid = cpu_to_le64(id->nid); + } + + conn->ksnc_rx_state = SOCKNAL_RX_PARSE; + ksocknal_conn_addref(conn); /* ++ref while parsing */ + + rc = lnet_parse(conn->ksnc_peer->ksnp_ni, + &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr, + conn->ksnc_peer->ksnp_id.nid, conn, 0); + if (rc < 0) { + /* I just received garbage: give up on this conn */ + ksocknal_new_packet(conn, 0); + ksocknal_close_conn_and_siblings (conn, rc); + ksocknal_conn_decref(conn); + return -EPROTO; + } + + /* I'm racing with ksocknal_recv() */ + LASSERT (conn->ksnc_rx_state == SOCKNAL_RX_PARSE || + conn->ksnc_rx_state == SOCKNAL_RX_LNET_PAYLOAD); + + if (conn->ksnc_rx_state != SOCKNAL_RX_LNET_PAYLOAD) + return 0; + + /* ksocknal_recv() got called */ + goto again; + + case SOCKNAL_RX_LNET_PAYLOAD: + /* payload all received */ + rc = 0; + + if (conn->ksnc_rx_nob_left == 
0 && /* not truncating */ + conn->ksnc_msg.ksm_csum != 0 && /* has checksum */ + conn->ksnc_msg.ksm_csum != conn->ksnc_rx_csum) { + CERROR("%s: Checksum error, wire:0x%08X data:0x%08X\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), + conn->ksnc_msg.ksm_csum, conn->ksnc_rx_csum); + rc = -EIO; + } + + if (rc == 0 && conn->ksnc_msg.ksm_zc_cookies[0] != 0) { + LASSERT(conn->ksnc_proto != &ksocknal_protocol_v1x); + + lhdr = &conn->ksnc_msg.ksm_u.lnetmsg.ksnm_hdr; + id = &conn->ksnc_peer->ksnp_id; + + rc = conn->ksnc_proto->pro_handle_zcreq(conn, + conn->ksnc_msg.ksm_zc_cookies[0], + *ksocknal_tunables.ksnd_nonblk_zcack || + le64_to_cpu(lhdr->src_nid) != id->nid); + } + + lnet_finalize(conn->ksnc_peer->ksnp_ni, conn->ksnc_cookie, rc); + + if (rc != 0) { + ksocknal_new_packet(conn, 0); + ksocknal_close_conn_and_siblings (conn, rc); + return -EPROTO; + } + /* Fall through */ + + case SOCKNAL_RX_SLOP: + /* starting new packet? */ + if (ksocknal_new_packet (conn, conn->ksnc_rx_nob_left)) + return 0; /* come back later */ + goto again; /* try to finish reading slop now */ + + default: + break; + } + + /* Not Reached */ + LBUG (); + return -EINVAL; /* keep gcc happy */ +} + +int +ksocknal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed, + unsigned int niov, struct kvec *iov, lnet_kiov_t *kiov, + unsigned int offset, unsigned int mlen, unsigned int rlen) +{ + ksock_conn_t *conn = (ksock_conn_t *)private; + ksock_sched_t *sched = conn->ksnc_scheduler; + + LASSERT (mlen <= rlen); + LASSERT (niov <= LNET_MAX_IOV); + + conn->ksnc_cookie = msg; + conn->ksnc_rx_nob_wanted = mlen; + conn->ksnc_rx_nob_left = rlen; + + if (mlen == 0 || iov != NULL) { + conn->ksnc_rx_nkiov = 0; + conn->ksnc_rx_kiov = NULL; + conn->ksnc_rx_iov = conn->ksnc_rx_iov_space.iov; + conn->ksnc_rx_niov = + lnet_extract_iov(LNET_MAX_IOV, conn->ksnc_rx_iov, + niov, iov, offset, mlen); + } else { + conn->ksnc_rx_niov = 0; + conn->ksnc_rx_iov = NULL; + conn->ksnc_rx_kiov = conn->ksnc_rx_iov_space.kiov; + conn->ksnc_rx_nkiov = + lnet_extract_kiov(LNET_MAX_IOV, conn->ksnc_rx_kiov, + niov, kiov, offset, mlen); + } + + LASSERT (mlen == + lnet_iov_nob (conn->ksnc_rx_niov, conn->ksnc_rx_iov) + + lnet_kiov_nob (conn->ksnc_rx_nkiov, conn->ksnc_rx_kiov)); + + LASSERT (conn->ksnc_rx_scheduled); + + spin_lock_bh(&sched->kss_lock); + + switch (conn->ksnc_rx_state) { + case SOCKNAL_RX_PARSE_WAIT: + list_add_tail(&conn->ksnc_rx_list, &sched->kss_rx_conns); + wake_up (&sched->kss_waitq); + LASSERT (conn->ksnc_rx_ready); + break; + + case SOCKNAL_RX_PARSE: + /* scheduler hasn't noticed I'm parsing yet */ + break; + } + + conn->ksnc_rx_state = SOCKNAL_RX_LNET_PAYLOAD; + + spin_unlock_bh(&sched->kss_lock); + ksocknal_conn_decref(conn); + return 0; +} + +static inline int +ksocknal_sched_cansleep(ksock_sched_t *sched) +{ + int rc; + + spin_lock_bh(&sched->kss_lock); + + rc = !ksocknal_data.ksnd_shuttingdown && + list_empty(&sched->kss_rx_conns) && + list_empty(&sched->kss_tx_conns); + + spin_unlock_bh(&sched->kss_lock); + return rc; +} + +int ksocknal_scheduler(void *arg) +{ + struct ksock_sched_info *info; + ksock_sched_t *sched; + ksock_conn_t *conn; + ksock_tx_t *tx; + int rc; + int nloops = 0; + long id = (long)arg; + + info = ksocknal_data.ksnd_sched_info[KSOCK_THREAD_CPT(id)]; + sched = &info->ksi_scheds[KSOCK_THREAD_SID(id)]; + + cfs_block_allsigs(); + + rc = cfs_cpt_bind(lnet_cpt_table(), info->ksi_cpt); + if (rc != 0) { + CERROR("Can't set CPT affinity to %d: %d\n", + info->ksi_cpt, rc); + } + + spin_lock_bh(&sched->kss_lock); + + 
while (!ksocknal_data.ksnd_shuttingdown) { + int did_something = 0; + + /* Ensure I progress everything semi-fairly */ + + if (!list_empty (&sched->kss_rx_conns)) { + conn = list_entry(sched->kss_rx_conns.next, + ksock_conn_t, ksnc_rx_list); + list_del(&conn->ksnc_rx_list); + + LASSERT(conn->ksnc_rx_scheduled); + LASSERT(conn->ksnc_rx_ready); + + /* clear rx_ready in case receive isn't complete. + * Do it BEFORE we call process_recv, since + * data_ready can set it any time after we release + * kss_lock. */ + conn->ksnc_rx_ready = 0; + spin_unlock_bh(&sched->kss_lock); + + rc = ksocknal_process_receive(conn); + + spin_lock_bh(&sched->kss_lock); + + /* I'm the only one that can clear this flag */ + LASSERT(conn->ksnc_rx_scheduled); + + /* Did process_receive get everything it wanted? */ + if (rc == 0) + conn->ksnc_rx_ready = 1; + + if (conn->ksnc_rx_state == SOCKNAL_RX_PARSE) { + /* Conn blocked waiting for ksocknal_recv() + * I change its state (under lock) to signal + * it can be rescheduled */ + conn->ksnc_rx_state = SOCKNAL_RX_PARSE_WAIT; + } else if (conn->ksnc_rx_ready) { + /* reschedule for rx */ + list_add_tail (&conn->ksnc_rx_list, + &sched->kss_rx_conns); + } else { + conn->ksnc_rx_scheduled = 0; + /* drop my ref */ + ksocknal_conn_decref(conn); + } + + did_something = 1; + } + + if (!list_empty (&sched->kss_tx_conns)) { + LIST_HEAD (zlist); + + if (!list_empty(&sched->kss_zombie_noop_txs)) { + list_add(&zlist, + &sched->kss_zombie_noop_txs); + list_del_init(&sched->kss_zombie_noop_txs); + } + + conn = list_entry(sched->kss_tx_conns.next, + ksock_conn_t, ksnc_tx_list); + list_del (&conn->ksnc_tx_list); + + LASSERT(conn->ksnc_tx_scheduled); + LASSERT(conn->ksnc_tx_ready); + LASSERT(!list_empty(&conn->ksnc_tx_queue)); + + tx = list_entry(conn->ksnc_tx_queue.next, + ksock_tx_t, tx_list); + + if (conn->ksnc_tx_carrier == tx) + ksocknal_next_tx_carrier(conn); + + /* dequeue now so empty list => more to send */ + list_del(&tx->tx_list); + + /* Clear tx_ready in case send isn't complete. Do + * it BEFORE we call process_transmit, since + * write_space can set it any time after we release + * kss_lock. */ + conn->ksnc_tx_ready = 0; + spin_unlock_bh(&sched->kss_lock); + + if (!list_empty(&zlist)) { + /* free zombie noop txs, it's fast because + * noop txs are just put in freelist */ + ksocknal_txlist_done(NULL, &zlist, 0); + } + + rc = ksocknal_process_transmit(conn, tx); + + if (rc == -ENOMEM || rc == -EAGAIN) { + /* Incomplete send: replace tx on HEAD of tx_queue */ + spin_lock_bh(&sched->kss_lock); + list_add(&tx->tx_list, + &conn->ksnc_tx_queue); + } else { + /* Complete send; tx -ref */ + ksocknal_tx_decref(tx); + + spin_lock_bh(&sched->kss_lock); + /* assume space for more */ + conn->ksnc_tx_ready = 1; + } + + if (rc == -ENOMEM) { + /* Do nothing; after a short timeout, this + * conn will be reposted on kss_tx_conns. */ + } else if (conn->ksnc_tx_ready && + !list_empty (&conn->ksnc_tx_queue)) { + /* reschedule for tx */ + list_add_tail (&conn->ksnc_tx_list, + &sched->kss_tx_conns); + } else { + conn->ksnc_tx_scheduled = 0; + /* drop my ref */ + ksocknal_conn_decref(conn); + } + + did_something = 1; + } + if (!did_something || /* nothing to do */ + ++nloops == SOCKNAL_RESCHED) { /* hogging CPU? 
*/ + spin_unlock_bh(&sched->kss_lock); + + nloops = 0; + + if (!did_something) { /* wait for something to do */ + rc = wait_event_interruptible_exclusive( + sched->kss_waitq, + !ksocknal_sched_cansleep(sched)); + LASSERT (rc == 0); + } else { + cond_resched(); + } + + spin_lock_bh(&sched->kss_lock); + } + } + + spin_unlock_bh(&sched->kss_lock); + ksocknal_thread_fini(); + return 0; +} + +/* + * Add connection to kss_rx_conns of scheduler + * and wakeup the scheduler. + */ +void ksocknal_read_callback (ksock_conn_t *conn) +{ + ksock_sched_t *sched; + + sched = conn->ksnc_scheduler; + + spin_lock_bh(&sched->kss_lock); + + conn->ksnc_rx_ready = 1; + + if (!conn->ksnc_rx_scheduled) { /* not being progressed */ + list_add_tail(&conn->ksnc_rx_list, + &sched->kss_rx_conns); + conn->ksnc_rx_scheduled = 1; + /* extra ref for scheduler */ + ksocknal_conn_addref(conn); + + wake_up (&sched->kss_waitq); + } + spin_unlock_bh(&sched->kss_lock); +} + +/* + * Add connection to kss_tx_conns of scheduler + * and wakeup the scheduler. + */ +void ksocknal_write_callback (ksock_conn_t *conn) +{ + ksock_sched_t *sched; + + sched = conn->ksnc_scheduler; + + spin_lock_bh(&sched->kss_lock); + + conn->ksnc_tx_ready = 1; + + if (!conn->ksnc_tx_scheduled && /* not being progressed */ + !list_empty(&conn->ksnc_tx_queue)) { /* packets to send */ + list_add_tail (&conn->ksnc_tx_list, + &sched->kss_tx_conns); + conn->ksnc_tx_scheduled = 1; + /* extra ref for scheduler */ + ksocknal_conn_addref(conn); + + wake_up (&sched->kss_waitq); + } + + spin_unlock_bh(&sched->kss_lock); +} + +static ksock_proto_t * +ksocknal_parse_proto_version (ksock_hello_msg_t *hello) +{ + __u32 version = 0; + + if (hello->kshm_magic == LNET_PROTO_MAGIC) + version = hello->kshm_version; + else if (hello->kshm_magic == __swab32(LNET_PROTO_MAGIC)) + version = __swab32(hello->kshm_version); + + if (version != 0) { +#if SOCKNAL_VERSION_DEBUG + if (*ksocknal_tunables.ksnd_protocol == 1) + return NULL; + + if (*ksocknal_tunables.ksnd_protocol == 2 && + version == KSOCK_PROTO_V3) + return NULL; +#endif + if (version == KSOCK_PROTO_V2) + return &ksocknal_protocol_v2x; + + if (version == KSOCK_PROTO_V3) + return &ksocknal_protocol_v3x; + + return NULL; + } + + if (hello->kshm_magic == le32_to_cpu(LNET_PROTO_TCP_MAGIC)) { + lnet_magicversion_t *hmv = (lnet_magicversion_t *)hello; + + CLASSERT (sizeof (lnet_magicversion_t) == + offsetof (ksock_hello_msg_t, kshm_src_nid)); + + if (hmv->version_major == cpu_to_le16 (KSOCK_PROTO_V1_MAJOR) && + hmv->version_minor == cpu_to_le16 (KSOCK_PROTO_V1_MINOR)) + return &ksocknal_protocol_v1x; + } + + return NULL; +} + +int +ksocknal_send_hello (lnet_ni_t *ni, ksock_conn_t *conn, + lnet_nid_t peer_nid, ksock_hello_msg_t *hello) +{ + /* CAVEAT EMPTOR: this byte flips 'ipaddrs' */ + ksock_net_t *net = (ksock_net_t *)ni->ni_data; + + LASSERT (hello->kshm_nips <= LNET_MAX_INTERFACES); + + /* rely on caller to hold a ref on socket so it wouldn't disappear */ + LASSERT (conn->ksnc_proto != NULL); + + hello->kshm_src_nid = ni->ni_nid; + hello->kshm_dst_nid = peer_nid; + hello->kshm_src_pid = the_lnet.ln_pid; + + hello->kshm_src_incarnation = net->ksnn_incarnation; + hello->kshm_ctype = conn->ksnc_type; + + return conn->ksnc_proto->pro_send_hello(conn, hello); +} + +static int +ksocknal_invert_type(int type) +{ + switch (type) { + case SOCKLND_CONN_ANY: + case SOCKLND_CONN_CONTROL: + return type; + case SOCKLND_CONN_BULK_IN: + return SOCKLND_CONN_BULK_OUT; + case SOCKLND_CONN_BULK_OUT: + return SOCKLND_CONN_BULK_IN; + default: + 
return SOCKLND_CONN_NONE;
+	}
+}
+
+int
+ksocknal_recv_hello (lnet_ni_t *ni, ksock_conn_t *conn,
+		     ksock_hello_msg_t *hello, lnet_process_id_t *peerid,
+		     __u64 *incarnation)
+{
+	/* Return < 0	fatal error
+	 *	  0	success
+	 *	  EALREADY	lost connection race
+	 *	  EPROTO	protocol version mismatch
+	 */
+	struct socket *sock = conn->ksnc_sock;
+	int active = (conn->ksnc_proto != NULL);
+	int timeout;
+	int proto_match;
+	int rc;
+	ksock_proto_t *proto;
+	lnet_process_id_t recv_id;
+
+	/* socket type set on active connections - not set on passive */
+	LASSERT (!active == !(conn->ksnc_type != SOCKLND_CONN_NONE));
+
+	timeout = active ? *ksocknal_tunables.ksnd_timeout :
+			    lnet_acceptor_timeout();
+
+	rc = libcfs_sock_read(sock, &hello->kshm_magic, sizeof (hello->kshm_magic), timeout);
+	if (rc != 0) {
+		CERROR("Error %d reading HELLO from %pI4h\n",
+			rc, &conn->ksnc_ipaddr);
+		LASSERT (rc < 0);
+		return rc;
+	}
+
+	if (hello->kshm_magic != LNET_PROTO_MAGIC &&
+	    hello->kshm_magic != __swab32(LNET_PROTO_MAGIC) &&
+	    hello->kshm_magic != le32_to_cpu (LNET_PROTO_TCP_MAGIC)) {
+		/* Unexpected magic! */
+		CERROR("Bad magic(1) %#08x (%#08x expected) from %pI4h\n",
+		       __cpu_to_le32 (hello->kshm_magic),
+		       LNET_PROTO_TCP_MAGIC,
+		       &conn->ksnc_ipaddr);
+		return -EPROTO;
+	}
+
+	rc = libcfs_sock_read(sock, &hello->kshm_version,
+			      sizeof(hello->kshm_version), timeout);
+	if (rc != 0) {
+		CERROR("Error %d reading HELLO from %pI4h\n",
+			rc, &conn->ksnc_ipaddr);
+		LASSERT (rc < 0);
+		return rc;
+	}
+
+	proto = ksocknal_parse_proto_version(hello);
+	if (proto == NULL) {
+		if (!active) {
+			/* unknown protocol from peer, tell peer my protocol */
+			conn->ksnc_proto = &ksocknal_protocol_v3x;
+#if SOCKNAL_VERSION_DEBUG
+			if (*ksocknal_tunables.ksnd_protocol == 2)
+				conn->ksnc_proto = &ksocknal_protocol_v2x;
+			else if (*ksocknal_tunables.ksnd_protocol == 1)
+				conn->ksnc_proto = &ksocknal_protocol_v1x;
+#endif
+			hello->kshm_nips = 0;
+			ksocknal_send_hello(ni, conn, ni->ni_nid, hello);
+		}
+
+		CERROR("Unknown protocol version (%d.x expected) from %pI4h\n",
+		       conn->ksnc_proto->pro_version,
+		       &conn->ksnc_ipaddr);
+
+		return -EPROTO;
+	}
+
+	proto_match = (conn->ksnc_proto == proto);
+	conn->ksnc_proto = proto;
+
+	/* receive the rest of hello message anyway */
+	rc = conn->ksnc_proto->pro_recv_hello(conn, hello, timeout);
+	if (rc != 0) {
+		CERROR("Error %d reading or checking hello from %pI4h\n",
+		       rc, &conn->ksnc_ipaddr);
+		LASSERT (rc < 0);
+		return rc;
+	}
+
+	*incarnation = hello->kshm_src_incarnation;
+
+	if (hello->kshm_src_nid == LNET_NID_ANY) {
+		CERROR("Expecting a HELLO hdr with a NID, but got LNET_NID_ANY from %pI4h\n",
+		       &conn->ksnc_ipaddr);
+		return -EPROTO;
+	}
+
+	if (!active &&
+	    conn->ksnc_port > LNET_ACCEPTOR_MAX_RESERVED_PORT) {
+		/* Userspace NAL assigns peer process ID from socket */
+		recv_id.pid = conn->ksnc_port | LNET_PID_USERFLAG;
+		recv_id.nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), conn->ksnc_ipaddr);
+	} else {
+		recv_id.nid = hello->kshm_src_nid;
+		recv_id.pid = hello->kshm_src_pid;
+	}
+
+	if (!active) {
+		*peerid = recv_id;
+
+		/* peer determines type */
+		conn->ksnc_type = ksocknal_invert_type(hello->kshm_ctype);
+		if (conn->ksnc_type == SOCKLND_CONN_NONE) {
+			CERROR("Unexpected type %d from %s ip %pI4h\n",
+			       hello->kshm_ctype, libcfs_id2str(*peerid),
+			       &conn->ksnc_ipaddr);
+			return -EPROTO;
+		}
+
+		return 0;
+	}
+
+	if (peerid->pid != recv_id.pid ||
+	    peerid->nid != recv_id.nid) {
+		LCONSOLE_ERROR_MSG(0x130, "Connected successfully to %s on host %pI4h, but they claimed they were %s; please 
check your Lustre configuration.\n", + libcfs_id2str(*peerid), + &conn->ksnc_ipaddr, + libcfs_id2str(recv_id)); + return -EPROTO; + } + + if (hello->kshm_ctype == SOCKLND_CONN_NONE) { + /* Possible protocol mismatch or I lost the connection race */ + return proto_match ? EALREADY : EPROTO; + } + + if (ksocknal_invert_type(hello->kshm_ctype) != conn->ksnc_type) { + CERROR("Mismatched types: me %d, %s ip %pI4h %d\n", + conn->ksnc_type, libcfs_id2str(*peerid), + &conn->ksnc_ipaddr, + hello->kshm_ctype); + return -EPROTO; + } + + return 0; +} + +static int +ksocknal_connect (ksock_route_t *route) +{ + LIST_HEAD (zombies); + ksock_peer_t *peer = route->ksnr_peer; + int type; + int wanted; + struct socket *sock; + unsigned long deadline; + int retry_later = 0; + int rc = 0; + + deadline = cfs_time_add(cfs_time_current(), + cfs_time_seconds(*ksocknal_tunables.ksnd_timeout)); + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + + LASSERT (route->ksnr_scheduled); + LASSERT (!route->ksnr_connecting); + + route->ksnr_connecting = 1; + + for (;;) { + wanted = ksocknal_route_mask() & ~route->ksnr_connected; + + /* stop connecting if peer/route got closed under me, or + * route got connected while queued */ + if (peer->ksnp_closing || route->ksnr_deleted || + wanted == 0) { + retry_later = 0; + break; + } + + /* reschedule if peer is connecting to me */ + if (peer->ksnp_accepting > 0) { + CDEBUG(D_NET, + "peer %s(%d) already connecting to me, retry later.\n", + libcfs_nid2str(peer->ksnp_id.nid), peer->ksnp_accepting); + retry_later = 1; + } + + if (retry_later) /* needs reschedule */ + break; + + if ((wanted & (1 << SOCKLND_CONN_ANY)) != 0) { + type = SOCKLND_CONN_ANY; + } else if ((wanted & (1 << SOCKLND_CONN_CONTROL)) != 0) { + type = SOCKLND_CONN_CONTROL; + } else if ((wanted & (1 << SOCKLND_CONN_BULK_IN)) != 0) { + type = SOCKLND_CONN_BULK_IN; + } else { + LASSERT ((wanted & (1 << SOCKLND_CONN_BULK_OUT)) != 0); + type = SOCKLND_CONN_BULK_OUT; + } + + write_unlock_bh(&ksocknal_data.ksnd_global_lock); + + if (cfs_time_aftereq(cfs_time_current(), deadline)) { + rc = -ETIMEDOUT; + lnet_connect_console_error(rc, peer->ksnp_id.nid, + route->ksnr_ipaddr, + route->ksnr_port); + goto failed; + } + + rc = lnet_connect(&sock, peer->ksnp_id.nid, + route->ksnr_myipaddr, + route->ksnr_ipaddr, route->ksnr_port); + if (rc != 0) + goto failed; + + rc = ksocknal_create_conn(peer->ksnp_ni, route, sock, type); + if (rc < 0) { + lnet_connect_console_error(rc, peer->ksnp_id.nid, + route->ksnr_ipaddr, + route->ksnr_port); + goto failed; + } + + /* A +ve RC means I have to retry because I lost the connection + * race or I have to renegotiate protocol version */ + retry_later = (rc != 0); + if (retry_later) + CDEBUG(D_NET, "peer %s: conn race, retry later.\n", + libcfs_nid2str(peer->ksnp_id.nid)); + + write_lock_bh(&ksocknal_data.ksnd_global_lock); + } + + route->ksnr_scheduled = 0; + route->ksnr_connecting = 0; + + if (retry_later) { + /* re-queue for attention; this frees me up to handle + * the peer's incoming connection request */ + + if (rc == EALREADY || + (rc == 0 && peer->ksnp_accepting > 0)) { + /* We want to introduce a delay before next + * attempt to connect if we lost conn race, + * but the race is resolved quickly usually, + * so min_reconnectms should be good heuristic */ + route->ksnr_retry_interval = + cfs_time_seconds(*ksocknal_tunables.ksnd_min_reconnectms)/1000; + route->ksnr_timeout = cfs_time_add(cfs_time_current(), + route->ksnr_retry_interval); + } + + ksocknal_launch_connection_locked(route); + } + 
+
+	write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+	return retry_later;
+
+ failed:
+	write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+	route->ksnr_scheduled = 0;
+	route->ksnr_connecting = 0;
+
+	/* This is a retry rather than a new connection */
+	route->ksnr_retry_interval *= 2;
+	route->ksnr_retry_interval =
+		max(route->ksnr_retry_interval,
+		    cfs_time_seconds(*ksocknal_tunables.ksnd_min_reconnectms)/1000);
+	route->ksnr_retry_interval =
+		min(route->ksnr_retry_interval,
+		    cfs_time_seconds(*ksocknal_tunables.ksnd_max_reconnectms)/1000);
+
+	LASSERT (route->ksnr_retry_interval != 0);
+	route->ksnr_timeout = cfs_time_add(cfs_time_current(),
+					   route->ksnr_retry_interval);
+
+	if (!list_empty(&peer->ksnp_tx_queue) &&
+	    peer->ksnp_accepting == 0 &&
+	    ksocknal_find_connecting_route_locked(peer) == NULL) {
+		ksock_conn_t *conn;
+
+		/* ksnp_tx_queue is queued on a conn on successful
+		 * connection for V1.x and V2.x */
+		if (!list_empty (&peer->ksnp_conns)) {
+			conn = list_entry(peer->ksnp_conns.next,
+					  ksock_conn_t, ksnc_list);
+			LASSERT (conn->ksnc_proto == &ksocknal_protocol_v3x);
+		}
+
+		/* take all the blocked packets while I've got the lock and
+		 * complete below... */
+		list_splice_init(&peer->ksnp_tx_queue, &zombies);
+	}
+
+#if 0	   /* irrelevant with only eager routes */
+	if (!route->ksnr_deleted) {
+		/* make this route least-favourite for re-selection */
+		list_del(&route->ksnr_list);
+		list_add_tail(&route->ksnr_list, &peer->ksnp_routes);
+	}
+#endif
+	write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+	ksocknal_peer_failed(peer);
+	ksocknal_txlist_done(peer->ksnp_ni, &zombies, 1);
+	return 0;
+}
+
+/*
+ * Check whether we need to create more connds.
+ * It will try to create a new thread if necessary; @timeout can be
+ * updated if thread creation fails, so the caller won't keep retrying
+ * while short of resources.
+ */
+static int
+ksocknal_connd_check_start(long sec, long *timeout)
+{
+	char name[16];
+	int rc;
+	int total = ksocknal_data.ksnd_connd_starting +
+		    ksocknal_data.ksnd_connd_running;
+
+	if (unlikely(ksocknal_data.ksnd_init < SOCKNAL_INIT_ALL)) {
+		/* still initializing */
+		return 0;
+	}
+
+	if (total >= *ksocknal_tunables.ksnd_nconnds_max ||
+	    total > ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV) {
+		/* can't create more connds, or still have enough
+		 * threads to handle more connecting */
+		return 0;
+	}
+
+	if (list_empty(&ksocknal_data.ksnd_connd_routes)) {
+		/* no pending connecting request */
+		return 0;
+	}
+
+	if (sec - ksocknal_data.ksnd_connd_failed_stamp <= 1) {
+		/* may run out of resource, retry later */
+		*timeout = cfs_time_seconds(1);
+		return 0;
+	}
+
+	if (ksocknal_data.ksnd_connd_starting > 0) {
+		/* serialize starting to avoid flood */
+		return 0;
+	}
+
+	ksocknal_data.ksnd_connd_starting_stamp = sec;
+	ksocknal_data.ksnd_connd_starting++;
+	spin_unlock_bh(&ksocknal_data.ksnd_connd_lock);
+
+	/* NB: total is the next id */
+	snprintf(name, sizeof(name), "socknal_cd%02d", total);
+	rc = ksocknal_thread_start(ksocknal_connd, NULL, name);
+
+	spin_lock_bh(&ksocknal_data.ksnd_connd_lock);
+	if (rc == 0)
+		return 1;
+
+	/* we tried ... */
+	LASSERT(ksocknal_data.ksnd_connd_starting > 0);
+	ksocknal_data.ksnd_connd_starting--;
+	ksocknal_data.ksnd_connd_failed_stamp = get_seconds();
+
+	return 1;
+}
+
+/*
+ * Check whether the current thread can exit: it will return 1 if there
+ * are too many threads and none has been created in the past 120 seconds.
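+ * ("too many" means ksnd_connd_running still exceeds the ksnd_nconnds
+ * tunable; the 120-second window is SOCKNAL_CONND_TIMEOUT).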
+ * Also, this function may update @timeout to make caller come back + * again to recheck these conditions. + */ +static int +ksocknal_connd_check_stop(long sec, long *timeout) +{ + int val; + + if (unlikely(ksocknal_data.ksnd_init < SOCKNAL_INIT_ALL)) { + /* still in initializing */ + return 0; + } + + if (ksocknal_data.ksnd_connd_starting > 0) { + /* in progress of starting new thread */ + return 0; + } + + if (ksocknal_data.ksnd_connd_running <= + *ksocknal_tunables.ksnd_nconnds) { /* can't shrink */ + return 0; + } + + /* created thread in past 120 seconds? */ + val = (int)(ksocknal_data.ksnd_connd_starting_stamp + + SOCKNAL_CONND_TIMEOUT - sec); + + *timeout = (val > 0) ? cfs_time_seconds(val) : + cfs_time_seconds(SOCKNAL_CONND_TIMEOUT); + if (val > 0) + return 0; + + /* no creating in past 120 seconds */ + + return ksocknal_data.ksnd_connd_running > + ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV; +} + +/* Go through connd_routes queue looking for a route that we can process + * right now, @timeout_p can be updated if we need to come back later */ +static ksock_route_t * +ksocknal_connd_get_route_locked(signed long *timeout_p) +{ + ksock_route_t *route; + unsigned long now; + + now = cfs_time_current(); + + /* connd_routes can contain both pending and ordinary routes */ + list_for_each_entry (route, &ksocknal_data.ksnd_connd_routes, + ksnr_connd_list) { + + if (route->ksnr_retry_interval == 0 || + cfs_time_aftereq(now, route->ksnr_timeout)) + return route; + + if (*timeout_p == MAX_SCHEDULE_TIMEOUT || + (int)*timeout_p > (int)(route->ksnr_timeout - now)) + *timeout_p = (int)(route->ksnr_timeout - now); + } + + return NULL; +} + +int +ksocknal_connd (void *arg) +{ + spinlock_t *connd_lock = &ksocknal_data.ksnd_connd_lock; + ksock_connreq_t *cr; + wait_queue_t wait; + int nloops = 0; + int cons_retry = 0; + + cfs_block_allsigs (); + + init_waitqueue_entry(&wait, current); + + spin_lock_bh(connd_lock); + + LASSERT(ksocknal_data.ksnd_connd_starting > 0); + ksocknal_data.ksnd_connd_starting--; + ksocknal_data.ksnd_connd_running++; + + while (!ksocknal_data.ksnd_shuttingdown) { + ksock_route_t *route = NULL; + long sec = get_seconds(); + long timeout = MAX_SCHEDULE_TIMEOUT; + int dropped_lock = 0; + + if (ksocknal_connd_check_stop(sec, &timeout)) { + /* wakeup another one to check stop */ + wake_up(&ksocknal_data.ksnd_connd_waitq); + break; + } + + if (ksocknal_connd_check_start(sec, &timeout)) { + /* created new thread */ + dropped_lock = 1; + } + + if (!list_empty(&ksocknal_data.ksnd_connd_connreqs)) { + /* Connection accepted by the listener */ + cr = list_entry(ksocknal_data.ksnd_connd_connreqs. 
\
+					  next, ksock_connreq_t, ksncr_list);
+
+			list_del(&cr->ksncr_list);
+			spin_unlock_bh(connd_lock);
+			dropped_lock = 1;
+
+			ksocknal_create_conn(cr->ksncr_ni, NULL,
+					     cr->ksncr_sock, SOCKLND_CONN_NONE);
+			lnet_ni_decref(cr->ksncr_ni);
+			LIBCFS_FREE(cr, sizeof(*cr));
+
+			spin_lock_bh(connd_lock);
+		}
+
+		/* Only handle an outgoing connection request if there
+		 * is a thread left to handle incoming connections and
+		 * to create a new connd */
+		if (ksocknal_data.ksnd_connd_connecting + SOCKNAL_CONND_RESV <
+		    ksocknal_data.ksnd_connd_running) {
+			route = ksocknal_connd_get_route_locked(&timeout);
+		}
+		if (route != NULL) {
+			list_del (&route->ksnr_connd_list);
+			ksocknal_data.ksnd_connd_connecting++;
+			spin_unlock_bh(connd_lock);
+			dropped_lock = 1;
+
+			if (ksocknal_connect(route)) {
+				/* consecutive retry */
+				if (cons_retry++ > SOCKNAL_INSANITY_RECONN) {
+					CWARN("too many consecutive reconnection attempts to %pI4h\n",
+					      &route->ksnr_ipaddr);
+					cons_retry = 0;
+				}
+			} else {
+				cons_retry = 0;
+			}
+
+			ksocknal_route_decref(route);
+
+			spin_lock_bh(connd_lock);
+			ksocknal_data.ksnd_connd_connecting--;
+		}
+
+		if (dropped_lock) {
+			if (++nloops < SOCKNAL_RESCHED)
+				continue;
+			spin_unlock_bh(connd_lock);
+			nloops = 0;
+			cond_resched();
+			spin_lock_bh(connd_lock);
+			continue;
+		}
+
+		/* Nothing to do for 'timeout' */
+		set_current_state(TASK_INTERRUPTIBLE);
+		add_wait_queue_exclusive(&ksocknal_data.ksnd_connd_waitq, &wait);
+		spin_unlock_bh(connd_lock);
+
+		nloops = 0;
+		schedule_timeout(timeout);
+
+		remove_wait_queue(&ksocknal_data.ksnd_connd_waitq, &wait);
+		spin_lock_bh(connd_lock);
+	}
+	ksocknal_data.ksnd_connd_running--;
+	spin_unlock_bh(connd_lock);
+
+	ksocknal_thread_fini();
+	return 0;
+}
+
+static ksock_conn_t *
+ksocknal_find_timed_out_conn (ksock_peer_t *peer)
+{
+	/* We're called with a shared lock on ksnd_global_lock */
+	ksock_conn_t *conn;
+	struct list_head *ctmp;
+
+	list_for_each (ctmp, &peer->ksnp_conns) {
+		int error;
+		conn = list_entry (ctmp, ksock_conn_t, ksnc_list);
+
+		/* Don't need the {get,put}connsock dance to deref ksnc_sock */
+		LASSERT (!conn->ksnc_closing);
+
+		/* SOCK_ERROR will reset error code of socket on
+		 * some platforms (like Darwin8.x) */
+		error = conn->ksnc_sock->sk->sk_err;
+		if (error != 0) {
+			ksocknal_conn_addref(conn);
+
+			switch (error) {
+			case ECONNRESET:
+				CNETERR("A connection with %s (%pI4h:%d) was reset; it may have rebooted.\n",
+					libcfs_id2str(peer->ksnp_id),
+					&conn->ksnc_ipaddr,
+					conn->ksnc_port);
+				break;
+			case ETIMEDOUT:
+				CNETERR("A connection with %s (%pI4h:%d) timed out; the network or node may be down.\n",
+					libcfs_id2str(peer->ksnp_id),
+					&conn->ksnc_ipaddr,
+					conn->ksnc_port);
+				break;
+			default:
+				CNETERR("An unexpected network error %d occurred with %s (%pI4h:%d)\n",
+					error,
+					libcfs_id2str(peer->ksnp_id),
+					&conn->ksnc_ipaddr,
+					conn->ksnc_port);
+				break;
+			}
+
+			return conn;
+		}
+
+		if (conn->ksnc_rx_started &&
+		    cfs_time_aftereq(cfs_time_current(),
+				     conn->ksnc_rx_deadline)) {
+			/* Timed out incomplete incoming message */
+			ksocknal_conn_addref(conn);
+			CNETERR("Timeout receiving from %s (%pI4h:%d), state %d wanted %d left %d\n",
+				libcfs_id2str(peer->ksnp_id),
+				&conn->ksnc_ipaddr,
+				conn->ksnc_port,
+				conn->ksnc_rx_state,
+				conn->ksnc_rx_nob_wanted,
+				conn->ksnc_rx_nob_left);
+			return conn;
+		}
+
+		if ((!list_empty(&conn->ksnc_tx_queue) ||
+		     conn->ksnc_sock->sk->sk_wmem_queued != 0) &&
+		    cfs_time_aftereq(cfs_time_current(),
+				     conn->ksnc_tx_deadline)) {
+			/* Timed out messages queued for sending or
+			 * buffered in the socket's send 
buffer */
+			ksocknal_conn_addref(conn);
+			CNETERR("Timeout sending data to %s (%pI4h:%d); the network or that node may be down.\n",
+				libcfs_id2str(peer->ksnp_id),
+				&conn->ksnc_ipaddr,
+				conn->ksnc_port);
+			return conn;
+		}
+	}
+
+	return NULL;
+}
+
+static inline void
+ksocknal_flush_stale_txs(ksock_peer_t *peer)
+{
+	ksock_tx_t *tx;
+	LIST_HEAD (stale_txs);
+
+	write_lock_bh(&ksocknal_data.ksnd_global_lock);
+
+	while (!list_empty (&peer->ksnp_tx_queue)) {
+		tx = list_entry (peer->ksnp_tx_queue.next,
+				 ksock_tx_t, tx_list);
+
+		if (!cfs_time_aftereq(cfs_time_current(),
+				      tx->tx_deadline))
+			break;
+
+		list_del (&tx->tx_list);
+		list_add_tail (&tx->tx_list, &stale_txs);
+	}
+
+	write_unlock_bh(&ksocknal_data.ksnd_global_lock);
+
+	ksocknal_txlist_done(peer->ksnp_ni, &stale_txs, 1);
+}
+
+static int
+ksocknal_send_keepalive_locked(ksock_peer_t *peer)
+{
+	ksock_sched_t *sched;
+	ksock_conn_t *conn;
+	ksock_tx_t *tx;
+
+	if (list_empty(&peer->ksnp_conns)) /* last_alive will be updated by create_conn */
+		return 0;
+
+	if (peer->ksnp_proto != &ksocknal_protocol_v3x)
+		return 0;
+
+	if (*ksocknal_tunables.ksnd_keepalive <= 0 ||
+	    time_before(cfs_time_current(),
+			cfs_time_add(peer->ksnp_last_alive,
+				     cfs_time_seconds(*ksocknal_tunables.ksnd_keepalive))))
+		return 0;
+
+	if (time_before(cfs_time_current(), peer->ksnp_send_keepalive))
+		return 0;
+
+	/* retry 10 secs later, so we don't put pressure
+	 * on this peer if this keepalive send fails */
+	peer->ksnp_send_keepalive = cfs_time_shift(10);
+
+	conn = ksocknal_find_conn_locked(peer, NULL, 1);
+	if (conn != NULL) {
+		sched = conn->ksnc_scheduler;
+
+		spin_lock_bh(&sched->kss_lock);
+		if (!list_empty(&conn->ksnc_tx_queue)) {
+			spin_unlock_bh(&sched->kss_lock);
+			/* there is a queued ACK; no keepalive is needed */
+			return 0;
+		}
+
+		spin_unlock_bh(&sched->kss_lock);
+	}
+
+	read_unlock(&ksocknal_data.ksnd_global_lock);
+
+	/* cookie = 1 is reserved for keepalive PING */
+	tx = ksocknal_alloc_tx_noop(1, 1);
+	if (tx == NULL) {
+		read_lock(&ksocknal_data.ksnd_global_lock);
+		return -ENOMEM;
+	}
+
+	if (ksocknal_launch_packet(peer->ksnp_ni, tx, peer->ksnp_id) == 0) {
+		read_lock(&ksocknal_data.ksnd_global_lock);
+		return 1;
+	}
+
+	ksocknal_free_tx(tx);
+	read_lock(&ksocknal_data.ksnd_global_lock);
+
+	return -EIO;
+}
+
+
+static void
+ksocknal_check_peer_timeouts (int idx)
+{
+	struct list_head *peers = &ksocknal_data.ksnd_peers[idx];
+	ksock_peer_t *peer;
+	ksock_conn_t *conn;
+	ksock_tx_t *tx;
+
+ again:
+	/* NB. We expect to have a look at all the peers and not find any
+	 * connections to time out, so we just use a shared lock while we
+	 * take a look... */
+	read_lock(&ksocknal_data.ksnd_global_lock);
+
+	list_for_each_entry(peer, peers, ksnp_list) {
+		unsigned long deadline = 0;
+		int resid = 0;
+		int n = 0;
+
+		if (ksocknal_send_keepalive_locked(peer) != 0) {
+			read_unlock(&ksocknal_data.ksnd_global_lock);
+			goto again;
+		}
+
+		conn = ksocknal_find_timed_out_conn (peer);
+
+		if (conn != NULL) {
+			read_unlock(&ksocknal_data.ksnd_global_lock);
+
+			ksocknal_close_conn_and_siblings (conn, -ETIMEDOUT);
+
+			/* NB we won't find this one again, but we can't
+			 * just proceed with the next peer, since we dropped
+			 * ksnd_global_lock and it might be dead already! 
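+			 * (the conn itself remains safe to touch here because
+			 * ksocknal_find_timed_out_conn() took a reference on
+			 * it, dropped just below)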
*/ + ksocknal_conn_decref(conn); + goto again; + } + + /* we can't process stale txs right here because we're + * holding only shared lock */ + if (!list_empty (&peer->ksnp_tx_queue)) { + ksock_tx_t *tx = + list_entry (peer->ksnp_tx_queue.next, + ksock_tx_t, tx_list); + + if (cfs_time_aftereq(cfs_time_current(), + tx->tx_deadline)) { + + ksocknal_peer_addref(peer); + read_unlock(&ksocknal_data.ksnd_global_lock); + + ksocknal_flush_stale_txs(peer); + + ksocknal_peer_decref(peer); + goto again; + } + } + + if (list_empty(&peer->ksnp_zc_req_list)) + continue; + + spin_lock(&peer->ksnp_lock); + list_for_each_entry(tx, &peer->ksnp_zc_req_list, tx_zc_list) { + if (!cfs_time_aftereq(cfs_time_current(), + tx->tx_deadline)) + break; + /* ignore the TX if connection is being closed */ + if (tx->tx_conn->ksnc_closing) + continue; + n++; + } + + if (n == 0) { + spin_unlock(&peer->ksnp_lock); + continue; + } + + tx = list_entry(peer->ksnp_zc_req_list.next, + ksock_tx_t, tx_zc_list); + deadline = tx->tx_deadline; + resid = tx->tx_resid; + conn = tx->tx_conn; + ksocknal_conn_addref(conn); + + spin_unlock(&peer->ksnp_lock); + read_unlock(&ksocknal_data.ksnd_global_lock); + + CERROR("Total %d stale ZC_REQs for peer %s detected; the oldest(%p) timed out %ld secs ago, resid: %d, wmem: %d\n", + n, libcfs_nid2str(peer->ksnp_id.nid), tx, + cfs_duration_sec(cfs_time_current() - deadline), + resid, conn->ksnc_sock->sk->sk_wmem_queued); + + ksocknal_close_conn_and_siblings (conn, -ETIMEDOUT); + ksocknal_conn_decref(conn); + goto again; + } + + read_unlock(&ksocknal_data.ksnd_global_lock); +} + +int +ksocknal_reaper (void *arg) +{ + wait_queue_t wait; + ksock_conn_t *conn; + ksock_sched_t *sched; + struct list_head enomem_conns; + int nenomem_conns; + long timeout; + int i; + int peer_index = 0; + unsigned long deadline = cfs_time_current(); + + cfs_block_allsigs (); + + INIT_LIST_HEAD(&enomem_conns); + init_waitqueue_entry(&wait, current); + + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + + while (!ksocknal_data.ksnd_shuttingdown) { + + if (!list_empty (&ksocknal_data.ksnd_deathrow_conns)) { + conn = list_entry (ksocknal_data. \ + ksnd_deathrow_conns.next, + ksock_conn_t, ksnc_list); + list_del (&conn->ksnc_list); + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); + + ksocknal_terminate_conn(conn); + ksocknal_conn_decref(conn); + + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + continue; + } + + if (!list_empty (&ksocknal_data.ksnd_zombie_conns)) { + conn = list_entry (ksocknal_data.ksnd_zombie_conns.\ + next, ksock_conn_t, ksnc_list); + list_del (&conn->ksnc_list); + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); + + ksocknal_destroy_conn(conn); + + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + continue; + } + + if (!list_empty (&ksocknal_data.ksnd_enomem_conns)) { + list_add(&enomem_conns, + &ksocknal_data.ksnd_enomem_conns); + list_del_init(&ksocknal_data.ksnd_enomem_conns); + } + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); + + /* reschedule all the connections that stalled with ENOMEM... */ + nenomem_conns = 0; + while (!list_empty (&enomem_conns)) { + conn = list_entry (enomem_conns.next, + ksock_conn_t, ksnc_tx_list); + list_del (&conn->ksnc_tx_list); + + sched = conn->ksnc_scheduler; + + spin_lock_bh(&sched->kss_lock); + + LASSERT(conn->ksnc_tx_scheduled); + conn->ksnc_tx_ready = 1; + list_add_tail(&conn->ksnc_tx_list, + &sched->kss_tx_conns); + wake_up(&sched->kss_waitq); + + spin_unlock_bh(&sched->kss_lock); + nenomem_conns++; + } + + /* careful with the jiffy wrap... 
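+		 * (cfs_time_sub() yields a signed duration, so the "<= 0"
+		 * test below stays correct even if the jiffy counter wraps)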
*/ + while ((timeout = cfs_time_sub(deadline, + cfs_time_current())) <= 0) { + const int n = 4; + const int p = 1; + int chunk = ksocknal_data.ksnd_peer_hash_size; + + /* Time to check for timeouts on a few more peers: I do + * checks every 'p' seconds on a proportion of the peer + * table and I need to check every connection 'n' times + * within a timeout interval, to ensure I detect a + * timeout on any connection within (n+1)/n times the + * timeout interval. */ + + if (*ksocknal_tunables.ksnd_timeout > n * p) + chunk = (chunk * n * p) / + *ksocknal_tunables.ksnd_timeout; + if (chunk == 0) + chunk = 1; + + for (i = 0; i < chunk; i++) { + ksocknal_check_peer_timeouts (peer_index); + peer_index = (peer_index + 1) % + ksocknal_data.ksnd_peer_hash_size; + } + + deadline = cfs_time_add(deadline, cfs_time_seconds(p)); + } + + if (nenomem_conns != 0) { + /* Reduce my timeout if I rescheduled ENOMEM conns. + * This also prevents me getting woken immediately + * if any go back on my enomem list. */ + timeout = SOCKNAL_ENOMEM_RETRY; + } + ksocknal_data.ksnd_reaper_waketime = + cfs_time_add(cfs_time_current(), timeout); + + set_current_state (TASK_INTERRUPTIBLE); + add_wait_queue (&ksocknal_data.ksnd_reaper_waitq, &wait); + + if (!ksocknal_data.ksnd_shuttingdown && + list_empty (&ksocknal_data.ksnd_deathrow_conns) && + list_empty (&ksocknal_data.ksnd_zombie_conns)) + schedule_timeout(timeout); + + set_current_state (TASK_RUNNING); + remove_wait_queue (&ksocknal_data.ksnd_reaper_waitq, &wait); + + spin_lock_bh(&ksocknal_data.ksnd_reaper_lock); + } + + spin_unlock_bh(&ksocknal_data.ksnd_reaper_lock); + + ksocknal_thread_fini(); + return 0; +} diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c new file mode 100644 index 000000000..f5e8ab060 --- /dev/null +++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c @@ -0,0 +1,714 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#include "socklnd.h" + +int +ksocknal_lib_get_conn_addrs(ksock_conn_t *conn) +{ + int rc = libcfs_sock_getaddr(conn->ksnc_sock, 1, + &conn->ksnc_ipaddr, + &conn->ksnc_port); + + /* Didn't need the {get,put}connsock dance to deref ksnc_sock... 
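+	 * (safe because the conn can't be closing yet, as the LASSERT
+	 * below checks)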
*/
+	LASSERT(!conn->ksnc_closing);
+
+	if (rc != 0) {
+		CERROR("Error %d getting sock peer IP\n", rc);
+		return rc;
+	}
+
+	rc = libcfs_sock_getaddr(conn->ksnc_sock, 0,
+				 &conn->ksnc_myipaddr, NULL);
+	if (rc != 0) {
+		CERROR("Error %d getting sock local IP\n", rc);
+		return rc;
+	}
+
+	return 0;
+}
+
+int
+ksocknal_lib_zc_capable(ksock_conn_t *conn)
+{
+	int caps = conn->ksnc_sock->sk->sk_route_caps;
+
+	if (conn->ksnc_proto == &ksocknal_protocol_v1x)
+		return 0;
+
+	/* ZC if the socket supports scatter/gather and doesn't need software
+	 * checksums */
+	return ((caps & NETIF_F_SG) != 0 && (caps & NETIF_F_ALL_CSUM) != 0);
+}
+
+int
+ksocknal_lib_send_iov(ksock_conn_t *conn, ksock_tx_t *tx)
+{
+	struct socket *sock = conn->ksnc_sock;
+	int nob;
+	int rc;
+
+	if (*ksocknal_tunables.ksnd_enable_csum	&& /* checksum enabled */
+	    conn->ksnc_proto == &ksocknal_protocol_v2x && /* V2.x connection */
+	    tx->tx_nob == tx->tx_resid		 && /* first sending */
+	    tx->tx_msg.ksm_csum == 0)		     /* not checksummed */
+		ksocknal_lib_csum_tx(tx);
+
+	/* NB we can't trust socket ops to either consume our iovs
+	 * or leave them alone. */
+
+	{
+#if SOCKNAL_SINGLE_FRAG_TX
+		struct kvec scratch;
+		struct kvec *scratchiov = &scratch;
+		unsigned int niov = 1;
+#else
+		struct kvec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
+		unsigned int niov = tx->tx_niov;
+#endif
+		struct msghdr msg = {.msg_flags = MSG_DONTWAIT};
+		int i;
+
+		for (nob = i = 0; i < niov; i++) {
+			scratchiov[i] = tx->tx_iov[i];
+			nob += scratchiov[i].iov_len;
+		}
+
+		if (!list_empty(&conn->ksnc_tx_queue) ||
+		    nob < tx->tx_resid)
+			msg.msg_flags |= MSG_MORE;
+
+		rc = kernel_sendmsg(sock, &msg, scratchiov, niov, nob);
+	}
+	return rc;
+}
+
+int
+ksocknal_lib_send_kiov(ksock_conn_t *conn, ksock_tx_t *tx)
+{
+	struct socket *sock = conn->ksnc_sock;
+	lnet_kiov_t *kiov = tx->tx_kiov;
+	int rc;
+	int nob;
+
+	/* Not NOOP message */
+	LASSERT(tx->tx_lnetmsg != NULL);
+
+	/* NB we can't trust socket ops to either consume our iovs
+	 * or leave them alone. */
+	if (tx->tx_msg.ksm_zc_cookies[0] != 0) {
+		/* Zero copy is enabled */
+		struct sock *sk = sock->sk;
+		struct page *page = kiov->kiov_page;
+		int offset = kiov->kiov_offset;
+		int fragsize = kiov->kiov_len;
+		int msgflg = MSG_DONTWAIT;
+
+		CDEBUG(D_NET, "page %p + offset %x for %d\n",
+			       page, offset, kiov->kiov_len);
+
+		if (!list_empty(&conn->ksnc_tx_queue) ||
+		    fragsize < tx->tx_resid)
+			msgflg |= MSG_MORE;
+
+		if (sk->sk_prot->sendpage != NULL) {
+			rc = sk->sk_prot->sendpage(sk, page,
+						   offset, fragsize, msgflg);
+		} else {
+			rc = cfs_tcp_sendpage(sk, page, offset, fragsize,
+					      msgflg);
+		}
+	} else {
+#if SOCKNAL_SINGLE_FRAG_TX || !SOCKNAL_RISK_KMAP_DEADLOCK
+		struct kvec scratch;
+		struct kvec *scratchiov = &scratch;
+		unsigned int niov = 1;
+#else
+#ifdef CONFIG_HIGHMEM
+#warning "XXX risk of kmap deadlock on multiple frags..." 
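+/* kmap() draws from a small, fixed pool of highmem mapping slots, so
+ * mapping many fragments at once from many threads can exhaust the
+ * pool and deadlock; hence the SOCKNAL_RISK_KMAP_DEADLOCK guard above. */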
+#endif + struct kvec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; + unsigned int niov = tx->tx_nkiov; +#endif + struct msghdr msg = {.msg_flags = MSG_DONTWAIT}; + int i; + + for (nob = i = 0; i < niov; i++) { + scratchiov[i].iov_base = kmap(kiov[i].kiov_page) + + kiov[i].kiov_offset; + nob += scratchiov[i].iov_len = kiov[i].kiov_len; + } + + if (!list_empty(&conn->ksnc_tx_queue) || + nob < tx->tx_resid) + msg.msg_flags |= MSG_MORE; + + rc = kernel_sendmsg(sock, &msg, (struct kvec *)scratchiov, niov, nob); + + for (i = 0; i < niov; i++) + kunmap(kiov[i].kiov_page); + } + return rc; +} + +void +ksocknal_lib_eager_ack(ksock_conn_t *conn) +{ + int opt = 1; + struct socket *sock = conn->ksnc_sock; + + /* Remind the socket to ACK eagerly. If I don't, the socket might + * think I'm about to send something it could piggy-back the ACK + * on, introducing delay in completing zero-copy sends in my + * peer. */ + + kernel_setsockopt(sock, SOL_TCP, TCP_QUICKACK, + (char *)&opt, sizeof(opt)); +} + +int +ksocknal_lib_recv_iov(ksock_conn_t *conn) +{ +#if SOCKNAL_SINGLE_FRAG_RX + struct kvec scratch; + struct kvec *scratchiov = &scratch; + unsigned int niov = 1; +#else + struct kvec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; + unsigned int niov = conn->ksnc_rx_niov; +#endif + struct kvec *iov = conn->ksnc_rx_iov; + struct msghdr msg = { + .msg_flags = 0 + }; + int nob; + int i; + int rc; + int fragnob; + int sum; + __u32 saved_csum; + + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone. */ + LASSERT(niov > 0); + + for (nob = i = 0; i < niov; i++) { + scratchiov[i] = iov[i]; + nob += scratchiov[i].iov_len; + } + LASSERT(nob <= conn->ksnc_rx_nob_wanted); + + rc = kernel_recvmsg(conn->ksnc_sock, &msg, + scratchiov, niov, nob, MSG_DONTWAIT); + + saved_csum = 0; + if (conn->ksnc_proto == &ksocknal_protocol_v2x) { + saved_csum = conn->ksnc_msg.ksm_csum; + conn->ksnc_msg.ksm_csum = 0; + } + + if (saved_csum != 0) { + /* accumulate checksum */ + for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) { + LASSERT(i < niov); + + fragnob = iov[i].iov_len; + if (fragnob > sum) + fragnob = sum; + + conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum, + iov[i].iov_base, fragnob); + } + conn->ksnc_msg.ksm_csum = saved_csum; + } + + return rc; +} + +static void +ksocknal_lib_kiov_vunmap(void *addr) +{ + if (addr == NULL) + return; + + vunmap(addr); +} + +static void * +ksocknal_lib_kiov_vmap(lnet_kiov_t *kiov, int niov, + struct kvec *iov, struct page **pages) +{ + void *addr; + int nob; + int i; + + if (!*ksocknal_tunables.ksnd_zc_recv || pages == NULL) + return NULL; + + LASSERT(niov <= LNET_MAX_IOV); + + if (niov < 2 || + niov < *ksocknal_tunables.ksnd_zc_recv_min_nfrags) + return NULL; + + for (nob = i = 0; i < niov; i++) { + if ((kiov[i].kiov_offset != 0 && i > 0) || + (kiov[i].kiov_offset + kiov[i].kiov_len != PAGE_CACHE_SIZE && i < niov - 1)) + return NULL; + + pages[i] = kiov[i].kiov_page; + nob += kiov[i].kiov_len; + } + + addr = vmap(pages, niov, VM_MAP, PAGE_KERNEL); + if (addr == NULL) + return NULL; + + iov->iov_base = addr + kiov[0].kiov_offset; + iov->iov_len = nob; + + return addr; +} + +int +ksocknal_lib_recv_kiov(ksock_conn_t *conn) +{ +#if SOCKNAL_SINGLE_FRAG_RX || !SOCKNAL_RISK_KMAP_DEADLOCK + struct kvec scratch; + struct kvec *scratchiov = &scratch; + struct page **pages = NULL; + unsigned int niov = 1; +#else +#ifdef CONFIG_HIGHMEM +#warning "XXX risk of kmap deadlock on multiple frags..." 
+#endif + struct kvec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; + struct page **pages = conn->ksnc_scheduler->kss_rx_scratch_pgs; + unsigned int niov = conn->ksnc_rx_nkiov; +#endif + lnet_kiov_t *kiov = conn->ksnc_rx_kiov; + struct msghdr msg = { + .msg_flags = 0 + }; + int nob; + int i; + int rc; + void *base; + void *addr; + int sum; + int fragnob; + int n; + + /* NB we can't trust socket ops to either consume our iovs + * or leave them alone. */ + addr = ksocknal_lib_kiov_vmap(kiov, niov, scratchiov, pages); + if (addr != NULL) { + nob = scratchiov[0].iov_len; + n = 1; + + } else { + for (nob = i = 0; i < niov; i++) { + nob += scratchiov[i].iov_len = kiov[i].kiov_len; + scratchiov[i].iov_base = kmap(kiov[i].kiov_page) + + kiov[i].kiov_offset; + } + n = niov; + } + + LASSERT(nob <= conn->ksnc_rx_nob_wanted); + + rc = kernel_recvmsg(conn->ksnc_sock, &msg, + (struct kvec *)scratchiov, n, nob, MSG_DONTWAIT); + + if (conn->ksnc_msg.ksm_csum != 0) { + for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) { + LASSERT(i < niov); + + /* Dang! have to kmap again because I have nowhere to stash the + * mapped address. But by doing it while the page is still + * mapped, the kernel just bumps the map count and returns me + * the address it stashed. */ + base = kmap(kiov[i].kiov_page) + kiov[i].kiov_offset; + fragnob = kiov[i].kiov_len; + if (fragnob > sum) + fragnob = sum; + + conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum, + base, fragnob); + + kunmap(kiov[i].kiov_page); + } + } + + if (addr != NULL) { + ksocknal_lib_kiov_vunmap(addr); + } else { + for (i = 0; i < niov; i++) + kunmap(kiov[i].kiov_page); + } + + return rc; +} + +void +ksocknal_lib_csum_tx(ksock_tx_t *tx) +{ + int i; + __u32 csum; + void *base; + + LASSERT(tx->tx_iov[0].iov_base == &tx->tx_msg); + LASSERT(tx->tx_conn != NULL); + LASSERT(tx->tx_conn->ksnc_proto == &ksocknal_protocol_v2x); + + tx->tx_msg.ksm_csum = 0; + + csum = ksocknal_csum(~0, tx->tx_iov[0].iov_base, + tx->tx_iov[0].iov_len); + + if (tx->tx_kiov != NULL) { + for (i = 0; i < tx->tx_nkiov; i++) { + base = kmap(tx->tx_kiov[i].kiov_page) + + tx->tx_kiov[i].kiov_offset; + + csum = ksocknal_csum(csum, base, tx->tx_kiov[i].kiov_len); + + kunmap(tx->tx_kiov[i].kiov_page); + } + } else { + for (i = 1; i < tx->tx_niov; i++) + csum = ksocknal_csum(csum, tx->tx_iov[i].iov_base, + tx->tx_iov[i].iov_len); + } + + if (*ksocknal_tunables.ksnd_inject_csum_error) { + csum++; + *ksocknal_tunables.ksnd_inject_csum_error = 0; + } + + tx->tx_msg.ksm_csum = csum; +} + +int +ksocknal_lib_get_conn_tunables(ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle) +{ + struct socket *sock = conn->ksnc_sock; + int len; + int rc; + + rc = ksocknal_connsock_addref(conn); + if (rc != 0) { + LASSERT(conn->ksnc_closing); + *txmem = *rxmem = *nagle = 0; + return -ESHUTDOWN; + } + + rc = libcfs_sock_getbuf(sock, txmem, rxmem); + if (rc == 0) { + len = sizeof(*nagle); + rc = kernel_getsockopt(sock, SOL_TCP, TCP_NODELAY, + (char *)nagle, &len); + } + + ksocknal_connsock_decref(conn); + + if (rc == 0) + *nagle = !*nagle; + else + *txmem = *rxmem = *nagle = 0; + + return rc; +} + +int +ksocknal_lib_setup_sock(struct socket *sock) +{ + int rc; + int option; + int keep_idle; + int keep_intvl; + int keep_count; + int do_keepalive; + struct linger linger; + + sock->sk->sk_allocation = GFP_NOFS; + + /* Ensure this socket aborts active sends immediately when we close + * it. 
+ */
+
+	linger.l_onoff = 0;
+	linger.l_linger = 0;
+
+	rc = kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER,
+			       (char *)&linger, sizeof(linger));
+	if (rc != 0) {
+		CERROR("Can't set SO_LINGER: %d\n", rc);
+		return rc;
+	}
+
+	option = -1;
+	rc = kernel_setsockopt(sock, SOL_TCP, TCP_LINGER2,
+			       (char *)&option, sizeof(option));
+	if (rc != 0) {
+		CERROR("Can't set TCP_LINGER2: %d\n", rc);
+		return rc;
+	}
+
+	if (!*ksocknal_tunables.ksnd_nagle) {
+		option = 1;
+
+		rc = kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY,
+				       (char *)&option, sizeof(option));
+		if (rc != 0) {
+			CERROR("Can't disable Nagle: %d\n", rc);
+			return rc;
+		}
+	}
+
+	rc = libcfs_sock_setbuf(sock,
+				*ksocknal_tunables.ksnd_tx_buffer_size,
+				*ksocknal_tunables.ksnd_rx_buffer_size);
+	if (rc != 0) {
+		CERROR("Can't set buffer tx %d, rx %d buffers: %d\n",
+		       *ksocknal_tunables.ksnd_tx_buffer_size,
+		       *ksocknal_tunables.ksnd_rx_buffer_size, rc);
+		return rc;
+	}
+
+/* TCP_BACKOFF_* sockopt tunables unsupported in stock kernels */
+
+	/* snapshot tunables */
+	keep_idle = *ksocknal_tunables.ksnd_keepalive_idle;
+	keep_count = *ksocknal_tunables.ksnd_keepalive_count;
+	keep_intvl = *ksocknal_tunables.ksnd_keepalive_intvl;
+
+	do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0);
+
+	option = (do_keepalive ? 1 : 0);
+	rc = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
+			       (char *)&option, sizeof(option));
+	if (rc != 0) {
+		CERROR("Can't set SO_KEEPALIVE: %d\n", rc);
+		return rc;
+	}
+
+	if (!do_keepalive)
+		return 0;
+
+	rc = kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE,
+			       (char *)&keep_idle, sizeof(keep_idle));
+	if (rc != 0) {
+		CERROR("Can't set TCP_KEEPIDLE: %d\n", rc);
+		return rc;
+	}
+
+	rc = kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL,
+			       (char *)&keep_intvl, sizeof(keep_intvl));
+	if (rc != 0) {
+		CERROR("Can't set TCP_KEEPINTVL: %d\n", rc);
+		return rc;
+	}
+
+	rc = kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT,
+			       (char *)&keep_count, sizeof(keep_count));
+	if (rc != 0) {
+		CERROR("Can't set TCP_KEEPCNT: %d\n", rc);
+		return rc;
+	}
+
+	return 0;
+}
+
+void
+ksocknal_lib_push_conn(ksock_conn_t *conn)
+{
+	struct sock *sk;
+	struct tcp_sock *tp;
+	int nonagle;
+	int val = 1;
+	int rc;
+
+	rc = ksocknal_connsock_addref(conn);
+	if (rc != 0) /* being shut down */
+		return;
+
+	sk = conn->ksnc_sock->sk;
+	tp = tcp_sk(sk);
+
+	lock_sock(sk);
+	nonagle = tp->nonagle;
+	tp->nonagle = 1;
+	release_sock(sk);
+
+	rc = kernel_setsockopt(conn->ksnc_sock, SOL_TCP, TCP_NODELAY,
+			       (char *)&val, sizeof(val));
+	LASSERT(rc == 0);
+
+	lock_sock(sk);
+	tp->nonagle = nonagle;
+	release_sock(sk);
+
+	ksocknal_connsock_decref(conn);
+}
+
+extern void ksocknal_read_callback(ksock_conn_t *conn);
+extern void ksocknal_write_callback(ksock_conn_t *conn);
+/*
+ * socket callbacks in Linux
+ */
+static void
+ksocknal_data_ready(struct sock *sk)
+{
+	ksock_conn_t *conn;
+
+	/* interleave correctly with closing sockets... */
+	LASSERT(!in_irq());
+	read_lock(&ksocknal_data.ksnd_global_lock);
+
+	conn = sk->sk_user_data;
+	if (conn == NULL) { /* raced with ksocknal_terminate_conn */
+		LASSERT(sk->sk_data_ready != &ksocknal_data_ready);
+		sk->sk_data_ready(sk);
+	} else
+		ksocknal_read_callback(conn);
+
+	read_unlock(&ksocknal_data.ksnd_global_lock);
+}
+
+static void
+ksocknal_write_space(struct sock *sk)
+{
+	ksock_conn_t *conn;
+	int wspace;
+	int min_wspace;
+
+	/* interleave correctly with closing sockets...
+	 */
+	LASSERT(!in_irq());
+	read_lock(&ksocknal_data.ksnd_global_lock);
+
+	conn = sk->sk_user_data;
+	wspace = SOCKNAL_WSPACE(sk);
+	min_wspace = SOCKNAL_MIN_WSPACE(sk);
+
+	CDEBUG(D_NET, "sk %p wspace %d low water %d conn %p%s%s%s\n",
+	       sk, wspace, min_wspace, conn,
+	       (conn == NULL) ? "" : (conn->ksnc_tx_ready ?
+				      " ready" : " blocked"),
+	       (conn == NULL) ? "" : (conn->ksnc_tx_scheduled ?
+				      " scheduled" : " idle"),
+	       (conn == NULL) ? "" : (list_empty(&conn->ksnc_tx_queue) ?
+				      " empty" : " queued"));
+
+	if (conn == NULL) { /* raced with ksocknal_terminate_conn */
+		LASSERT(sk->sk_write_space != &ksocknal_write_space);
+		sk->sk_write_space(sk);
+
+		read_unlock(&ksocknal_data.ksnd_global_lock);
+		return;
+	}
+
+	if (wspace >= min_wspace) { /* got enough space */
+		ksocknal_write_callback(conn);
+
+		/* Clear SOCK_NOSPACE _after_ ksocknal_write_callback so the
+		 * ENOMEM check in ksocknal_transmit is race-free (think about
+		 * it). */
+
+		clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+	}
+
+	read_unlock(&ksocknal_data.ksnd_global_lock);
+}
+
+void
+ksocknal_lib_save_callback(struct socket *sock, ksock_conn_t *conn)
+{
+	conn->ksnc_saved_data_ready = sock->sk->sk_data_ready;
+	conn->ksnc_saved_write_space = sock->sk->sk_write_space;
+}
+
+void
+ksocknal_lib_set_callback(struct socket *sock, ksock_conn_t *conn)
+{
+	sock->sk->sk_user_data = conn;
+	sock->sk->sk_data_ready = ksocknal_data_ready;
+	sock->sk->sk_write_space = ksocknal_write_space;
+	return;
+}
+
+void
+ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn)
+{
+	/* Remove conn's network callbacks.
+	 * NB I _have_ to restore the callback, rather than storing a noop,
+	 * since the socket could survive past this module being unloaded!! */
+	sock->sk->sk_data_ready = conn->ksnc_saved_data_ready;
+	sock->sk->sk_write_space = conn->ksnc_saved_write_space;
+
+	/* A callback could be in progress already; they hold a read lock
+	 * on ksnd_global_lock (to serialise with me) and NOOP if
+	 * sk_user_data is NULL. */
+	sock->sk->sk_user_data = NULL;
+
+	return;
+}
+
+int
+ksocknal_lib_memory_pressure(ksock_conn_t *conn)
+{
+	int rc = 0;
+	ksock_sched_t *sched;
+
+	sched = conn->ksnc_scheduler;
+	spin_lock_bh(&sched->kss_lock);
+
+	if (!test_bit(SOCK_NOSPACE, &conn->ksnc_sock->flags) &&
+	    !conn->ksnc_tx_ready) {
+		/* SOCK_NOSPACE is set when the socket fills
+		 * and cleared in the write_space callback
+		 * (which also sets ksnc_tx_ready). If
+		 * SOCK_NOSPACE and ksnc_tx_ready are BOTH
+		 * zero, I didn't fill the socket and
+		 * write_space won't reschedule me, so I
+		 * return -ENOMEM to get my caller to retry
+		 * after a timeout */
+		rc = -ENOMEM;
+	}
+
+	spin_unlock_bh(&sched->kss_lock);
+
+	return rc;
+}
diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.h b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.h
new file mode 100644
index 000000000..f5563881b
--- /dev/null
+++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.h
@@ -0,0 +1,86 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_PORTAL_ALLOC + +#ifndef __LINUX_SOCKNAL_LIB_H__ +#define __LINUX_SOCKNAL_LIB_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "../../../include/linux/libcfs/libcfs.h" + +#include +static inline __u32 ksocknal_csum(__u32 crc, unsigned char const *p, size_t len) +{ +#if 1 + return crc32_le(crc, p, len); +#else + while (len-- > 0) + crc = ((crc + 0x100) & ~0xff) | ((crc + *p++) & 0xff) ; + return crc; +#endif +} + +#define SOCKNAL_WSPACE(sk) sk_stream_wspace(sk) +#define SOCKNAL_MIN_WSPACE(sk) sk_stream_min_wspace(sk) + +/* assume one thread for each connection type */ +#define SOCKNAL_NSCHEDS 3 +#define SOCKNAL_NSCHEDS_HIGH (SOCKNAL_NSCHEDS << 1) + +#endif diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c new file mode 100644 index 000000000..86b88db1c --- /dev/null +++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_modparams.c @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2011, 2012, Intel Corporation. + * + * Author: Eric Barton + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Portals is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Portals; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#include "socklnd.h" + +static int sock_timeout = 50; +module_param(sock_timeout, int, 0644); +MODULE_PARM_DESC(sock_timeout, "dead socket timeout (seconds)"); + +static int credits = 256; +module_param(credits, int, 0444); +MODULE_PARM_DESC(credits, "# concurrent sends"); + +static int peer_credits = 8; +module_param(peer_credits, int, 0444); +MODULE_PARM_DESC(peer_credits, "# concurrent sends to 1 peer"); + +static int peer_buffer_credits; +module_param(peer_buffer_credits, int, 0444); +MODULE_PARM_DESC(peer_buffer_credits, "# per-peer router buffer credits"); + +static int peer_timeout = 180; +module_param(peer_timeout, int, 0444); +MODULE_PARM_DESC(peer_timeout, "Seconds without aliveness news to declare peer dead (<=0 to disable)"); + +/* Number of daemons in each thread pool which is percpt, + * we will estimate reasonable value based on CPUs if it's not set. */ +static unsigned int nscheds; +module_param(nscheds, int, 0444); +MODULE_PARM_DESC(nscheds, "# scheduler daemons in each pool while starting"); + +static int nconnds = 4; +module_param(nconnds, int, 0444); +MODULE_PARM_DESC(nconnds, "# connection daemons while starting"); + +static int nconnds_max = 64; +module_param(nconnds_max, int, 0444); +MODULE_PARM_DESC(nconnds_max, "max # connection daemons"); + +static int min_reconnectms = 1000; +module_param(min_reconnectms, int, 0644); +MODULE_PARM_DESC(min_reconnectms, "min connection retry interval (mS)"); + +static int max_reconnectms = 60000; +module_param(max_reconnectms, int, 0644); +MODULE_PARM_DESC(max_reconnectms, "max connection retry interval (mS)"); + +# define DEFAULT_EAGER_ACK 0 +static int eager_ack = DEFAULT_EAGER_ACK; +module_param(eager_ack, int, 0644); +MODULE_PARM_DESC(eager_ack, "send tcp ack packets eagerly"); + +static int typed_conns = 1; +module_param(typed_conns, int, 0444); +MODULE_PARM_DESC(typed_conns, "use different sockets for bulk"); + +static int min_bulk = 1<<10; +module_param(min_bulk, int, 0644); +MODULE_PARM_DESC(min_bulk, "smallest 'large' message"); + +# define DEFAULT_BUFFER_SIZE 0 +static int tx_buffer_size = DEFAULT_BUFFER_SIZE; +module_param(tx_buffer_size, int, 0644); +MODULE_PARM_DESC(tx_buffer_size, "socket tx buffer size (0 for system default)"); + +static int rx_buffer_size = DEFAULT_BUFFER_SIZE; +module_param(rx_buffer_size, int, 0644); +MODULE_PARM_DESC(rx_buffer_size, "socket rx buffer size (0 for system default)"); + +static int nagle; +module_param(nagle, int, 0644); +MODULE_PARM_DESC(nagle, "enable NAGLE?"); + +static int round_robin = 1; +module_param(round_robin, int, 0644); +MODULE_PARM_DESC(round_robin, "Round robin for multiple interfaces"); + +static int keepalive = 30; +module_param(keepalive, int, 0644); +MODULE_PARM_DESC(keepalive, "# seconds before send keepalive"); + +static int keepalive_idle = 30; +module_param(keepalive_idle, int, 0644); +MODULE_PARM_DESC(keepalive_idle, "# idle seconds before probe"); + +#define DEFAULT_KEEPALIVE_COUNT 5 +static int keepalive_count = DEFAULT_KEEPALIVE_COUNT; +module_param(keepalive_count, int, 0644); +MODULE_PARM_DESC(keepalive_count, "# missed probes == dead"); + +static int keepalive_intvl = 5; +module_param(keepalive_intvl, int, 0644); +MODULE_PARM_DESC(keepalive_intvl, "seconds between probes"); + +static int enable_csum; +module_param(enable_csum, int, 0644); +MODULE_PARM_DESC(enable_csum, "enable check sum"); + +static int inject_csum_error; +module_param(inject_csum_error, int, 0644); +MODULE_PARM_DESC(inject_csum_error, "set non-zero to inject a checksum error"); + 
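
/*
 * The ksocknal_tunables block initialized below stores pointers to these
 * module parameters rather than copies of their values, so writes to the
 * 0644 parameters via /sys/module/ksocklnd/parameters/ are seen
 * immediately by running code that dereferences the table. A small
 * user-space model of the pattern (the names timeout and tunables are
 * illustrative, not driver API):
 */
#include <stdio.h>

static int timeout = 50;		/* plays the role of sock_timeout */

struct tunables { int *timeout; };	/* table of pointers, not values */

int main(void)
{
	struct tunables tun = { .timeout = &timeout };

	timeout = 70;	/* models `echo 70 > .../parameters/sock_timeout` */
	printf("current timeout: %d\n", *tun.timeout);	/* prints 70 */
	return 0;
}
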
+static int nonblk_zcack = 1; +module_param(nonblk_zcack, int, 0644); +MODULE_PARM_DESC(nonblk_zcack, "always send ZC-ACK on non-blocking connection"); + +static unsigned int zc_min_payload = 16 << 10; +module_param(zc_min_payload, int, 0644); +MODULE_PARM_DESC(zc_min_payload, "minimum payload size to zero copy"); + +static unsigned int zc_recv; +module_param(zc_recv, int, 0644); +MODULE_PARM_DESC(zc_recv, "enable ZC recv for Chelsio driver"); + +static unsigned int zc_recv_min_nfrags = 16; +module_param(zc_recv_min_nfrags, int, 0644); +MODULE_PARM_DESC(zc_recv_min_nfrags, "minimum # of fragments to enable ZC recv"); + + +#if SOCKNAL_VERSION_DEBUG +static int protocol = 3; +module_param(protocol, int, 0644); +MODULE_PARM_DESC(protocol, "protocol version"); +#endif + +ksock_tunables_t ksocknal_tunables; + +int ksocknal_tunables_init(void) +{ + + /* initialize ksocknal_tunables structure */ + ksocknal_tunables.ksnd_timeout = &sock_timeout; + ksocknal_tunables.ksnd_nscheds = &nscheds; + ksocknal_tunables.ksnd_nconnds = &nconnds; + ksocknal_tunables.ksnd_nconnds_max = &nconnds_max; + ksocknal_tunables.ksnd_min_reconnectms = &min_reconnectms; + ksocknal_tunables.ksnd_max_reconnectms = &max_reconnectms; + ksocknal_tunables.ksnd_eager_ack = &eager_ack; + ksocknal_tunables.ksnd_typed_conns = &typed_conns; + ksocknal_tunables.ksnd_min_bulk = &min_bulk; + ksocknal_tunables.ksnd_tx_buffer_size = &tx_buffer_size; + ksocknal_tunables.ksnd_rx_buffer_size = &rx_buffer_size; + ksocknal_tunables.ksnd_nagle = &nagle; + ksocknal_tunables.ksnd_round_robin = &round_robin; + ksocknal_tunables.ksnd_keepalive = &keepalive; + ksocknal_tunables.ksnd_keepalive_idle = &keepalive_idle; + ksocknal_tunables.ksnd_keepalive_count = &keepalive_count; + ksocknal_tunables.ksnd_keepalive_intvl = &keepalive_intvl; + ksocknal_tunables.ksnd_credits = &credits; + ksocknal_tunables.ksnd_peertxcredits = &peer_credits; + ksocknal_tunables.ksnd_peerrtrcredits = &peer_buffer_credits; + ksocknal_tunables.ksnd_peertimeout = &peer_timeout; + ksocknal_tunables.ksnd_enable_csum = &enable_csum; + ksocknal_tunables.ksnd_inject_csum_error = &inject_csum_error; + ksocknal_tunables.ksnd_nonblk_zcack = &nonblk_zcack; + ksocknal_tunables.ksnd_zc_min_payload = &zc_min_payload; + ksocknal_tunables.ksnd_zc_recv = &zc_recv; + ksocknal_tunables.ksnd_zc_recv_min_nfrags = &zc_recv_min_nfrags; + + + +#if SOCKNAL_VERSION_DEBUG + ksocknal_tunables.ksnd_protocol = &protocol; +#endif + + if (*ksocknal_tunables.ksnd_zc_min_payload < (2 << 10)) + *ksocknal_tunables.ksnd_zc_min_payload = 2 << 10; + + return 0; +}; diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c new file mode 100644 index 000000000..8596581f5 --- /dev/null +++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_proto.c @@ -0,0 +1,797 @@ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2012, Intel Corporation. + * + * Author: Zach Brown + * Author: Peter J. Braam + * Author: Phil Schwan + * Author: Eric Barton + * + * This file is part of Portals, http://www.sf.net/projects/sandiaportals/ + * + * Portals is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ *
+ * Portals is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Portals; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "socklnd.h"
+
+/*
+ * Protocol entries:
+ *   pro_send_hello       : send hello message
+ *   pro_recv_hello       : receive hello message
+ *   pro_pack             : pack message header
+ *   pro_unpack           : unpack message header
+ *   pro_queue_tx_zcack() : Called holding BH lock: kss_lock
+ *                          return 1 if ACK is piggybacked, otherwise return 0
+ *   pro_queue_tx_msg()   : Called holding BH lock: kss_lock
+ *                          return the ACK piggybacked by my message, or NULL
+ *   pro_handle_zcreq()   : handler of incoming ZC-REQ
+ *   pro_handle_zcack()   : handler of incoming ZC-ACK
+ *   pro_match_tx()       : Called holding glock
+ */
+
+static ksock_tx_t *
+ksocknal_queue_tx_msg_v1(ksock_conn_t *conn, ksock_tx_t *tx_msg)
+{
+	/* V1.x, just enqueue it */
+	list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue);
+	return NULL;
+}
+
+void
+ksocknal_next_tx_carrier(ksock_conn_t *conn)
+{
+	ksock_tx_t *tx = conn->ksnc_tx_carrier;
+
+	/* Called holding BH lock: conn->ksnc_scheduler->kss_lock */
+	LASSERT(!list_empty(&conn->ksnc_tx_queue));
+	LASSERT(tx != NULL);
+
+	/* Next TX that can carry ZC-ACK or LNet message */
+	if (tx->tx_list.next == &conn->ksnc_tx_queue) {
+		/* no more packets queued */
+		conn->ksnc_tx_carrier = NULL;
+	} else {
+		conn->ksnc_tx_carrier = list_entry(tx->tx_list.next,
+						   ksock_tx_t, tx_list);
+		LASSERT(conn->ksnc_tx_carrier->tx_msg.ksm_type == tx->tx_msg.ksm_type);
+	}
+}
+
+static int
+ksocknal_queue_tx_zcack_v2(ksock_conn_t *conn,
+			   ksock_tx_t *tx_ack, __u64 cookie)
+{
+	ksock_tx_t *tx = conn->ksnc_tx_carrier;
+
+	LASSERT(tx_ack == NULL ||
+		tx_ack->tx_msg.ksm_type == KSOCK_MSG_NOOP);
+
+	/*
+	 * Enqueue or piggyback tx_ack / cookie:
+	 * . If no tx can piggyback the cookie of tx_ack (or cookie), just
+	 *   enqueue tx_ack (if tx_ack != NULL) and return 0.
+	 * . If a tx can piggyback the cookie of tx_ack (or cookie),
+	 *   piggyback the cookie and return 1.
+	 */
+	if (tx == NULL) {
+		if (tx_ack != NULL) {
+			list_add_tail(&tx_ack->tx_list,
+				      &conn->ksnc_tx_queue);
+			conn->ksnc_tx_carrier = tx_ack;
+		}
+		return 0;
+	}
+
+	if (tx->tx_msg.ksm_type == KSOCK_MSG_NOOP) {
+		/* tx is noop zc-ack, can't piggyback zc-ack cookie */
+		if (tx_ack != NULL)
+			list_add_tail(&tx_ack->tx_list,
+				      &conn->ksnc_tx_queue);
+		return 0;
+	}
+
+	LASSERT(tx->tx_msg.ksm_type == KSOCK_MSG_LNET);
+	LASSERT(tx->tx_msg.ksm_zc_cookies[1] == 0);
+
+	if (tx_ack != NULL)
+		cookie = tx_ack->tx_msg.ksm_zc_cookies[1];
+
+	/* piggyback the zc-ack cookie */
+	tx->tx_msg.ksm_zc_cookies[1] = cookie;
+	/* move on to the next TX which can carry cookie */
+	ksocknal_next_tx_carrier(conn);
+
+	return 1;
+}
+
+static ksock_tx_t *
+ksocknal_queue_tx_msg_v2(ksock_conn_t *conn, ksock_tx_t *tx_msg)
+{
+	ksock_tx_t *tx = conn->ksnc_tx_carrier;
+
+	/*
+	 * Enqueue tx_msg:
+	 * . If there is no NOOP on the connection, just enqueue
+	 *   tx_msg and return NULL.
+	 * . If there is a NOOP on the connection, piggyback its cookie,
+	 *   replace the NOOP tx and return the NOOP tx.
+ */ + if (tx == NULL) { /* nothing on queue */ + list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue); + conn->ksnc_tx_carrier = tx_msg; + return NULL; + } + + if (tx->tx_msg.ksm_type == KSOCK_MSG_LNET) { /* nothing to carry */ + list_add_tail(&tx_msg->tx_list, &conn->ksnc_tx_queue); + return NULL; + } + + LASSERT(tx->tx_msg.ksm_type == KSOCK_MSG_NOOP); + + /* There is a noop zc-ack can be piggybacked */ + tx_msg->tx_msg.ksm_zc_cookies[1] = tx->tx_msg.ksm_zc_cookies[1]; + ksocknal_next_tx_carrier(conn); + + /* use new_tx to replace the noop zc-ack packet */ + list_add(&tx_msg->tx_list, &tx->tx_list); + list_del(&tx->tx_list); + + return tx; +} + +static int +ksocknal_queue_tx_zcack_v3(ksock_conn_t *conn, + ksock_tx_t *tx_ack, __u64 cookie) +{ + ksock_tx_t *tx; + + if (conn->ksnc_type != SOCKLND_CONN_ACK) + return ksocknal_queue_tx_zcack_v2(conn, tx_ack, cookie); + + /* non-blocking ZC-ACK (to router) */ + LASSERT(tx_ack == NULL || + tx_ack->tx_msg.ksm_type == KSOCK_MSG_NOOP); + + tx = conn->ksnc_tx_carrier; + if (tx == NULL) { + if (tx_ack != NULL) { + list_add_tail(&tx_ack->tx_list, + &conn->ksnc_tx_queue); + conn->ksnc_tx_carrier = tx_ack; + } + return 0; + } + + /* conn->ksnc_tx_carrier != NULL */ + + if (tx_ack != NULL) + cookie = tx_ack->tx_msg.ksm_zc_cookies[1]; + + if (cookie == SOCKNAL_KEEPALIVE_PING) /* ignore keepalive PING */ + return 1; + + if (tx->tx_msg.ksm_zc_cookies[1] == SOCKNAL_KEEPALIVE_PING) { + /* replace the keepalive PING with a real ACK */ + LASSERT(tx->tx_msg.ksm_zc_cookies[0] == 0); + tx->tx_msg.ksm_zc_cookies[1] = cookie; + return 1; + } + + if (cookie == tx->tx_msg.ksm_zc_cookies[0] || + cookie == tx->tx_msg.ksm_zc_cookies[1]) { + CWARN("%s: duplicated ZC cookie: %llu\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), cookie); + return 1; /* XXX return error in the future */ + } + + if (tx->tx_msg.ksm_zc_cookies[0] == 0) { + /* NOOP tx has only one ZC-ACK cookie, can carry at least one more */ + if (tx->tx_msg.ksm_zc_cookies[1] > cookie) { + tx->tx_msg.ksm_zc_cookies[0] = tx->tx_msg.ksm_zc_cookies[1]; + tx->tx_msg.ksm_zc_cookies[1] = cookie; + } else { + tx->tx_msg.ksm_zc_cookies[0] = cookie; + } + + if (tx->tx_msg.ksm_zc_cookies[0] - tx->tx_msg.ksm_zc_cookies[1] > 2) { + /* not likely to carry more ACKs, skip it to simplify logic */ + ksocknal_next_tx_carrier(conn); + } + + return 1; + } + + /* takes two or more cookies already */ + + if (tx->tx_msg.ksm_zc_cookies[0] > tx->tx_msg.ksm_zc_cookies[1]) { + __u64 tmp = 0; + + /* two separated cookies: (a+2, a) or (a+1, a) */ + LASSERT(tx->tx_msg.ksm_zc_cookies[0] - + tx->tx_msg.ksm_zc_cookies[1] <= 2); + + if (tx->tx_msg.ksm_zc_cookies[0] - + tx->tx_msg.ksm_zc_cookies[1] == 2) { + if (cookie == tx->tx_msg.ksm_zc_cookies[1] + 1) + tmp = cookie; + } else if (cookie == tx->tx_msg.ksm_zc_cookies[1] - 1) { + tmp = tx->tx_msg.ksm_zc_cookies[1]; + } else if (cookie == tx->tx_msg.ksm_zc_cookies[0] + 1) { + tmp = tx->tx_msg.ksm_zc_cookies[0]; + } + + if (tmp != 0) { + /* range of cookies */ + tx->tx_msg.ksm_zc_cookies[0] = tmp - 1; + tx->tx_msg.ksm_zc_cookies[1] = tmp + 1; + return 1; + } + + } else { + /* ksm_zc_cookies[0] < ksm_zc_cookies[1], it is range of cookies */ + if (cookie >= tx->tx_msg.ksm_zc_cookies[0] && + cookie <= tx->tx_msg.ksm_zc_cookies[1]) { + CWARN("%s: duplicated ZC cookie: %llu\n", + libcfs_id2str(conn->ksnc_peer->ksnp_id), cookie); + return 1; /* XXX: return error in the future */ + } + + if (cookie == tx->tx_msg.ksm_zc_cookies[1] + 1) { + tx->tx_msg.ksm_zc_cookies[1] = cookie; + return 1; + } + + if 
(cookie == tx->tx_msg.ksm_zc_cookies[0] - 1) { + tx->tx_msg.ksm_zc_cookies[0] = cookie; + return 1; + } + } + + /* failed to piggyback ZC-ACK */ + if (tx_ack != NULL) { + list_add_tail(&tx_ack->tx_list, &conn->ksnc_tx_queue); + /* the next tx can piggyback at least 1 ACK */ + ksocknal_next_tx_carrier(conn); + } + + return 0; +} + +static int +ksocknal_match_tx(ksock_conn_t *conn, ksock_tx_t *tx, int nonblk) +{ + int nob; + +#if SOCKNAL_VERSION_DEBUG + if (!*ksocknal_tunables.ksnd_typed_conns) + return SOCKNAL_MATCH_YES; +#endif + + if (tx == NULL || tx->tx_lnetmsg == NULL) { + /* noop packet */ + nob = offsetof(ksock_msg_t, ksm_u); + } else { + nob = tx->tx_lnetmsg->msg_len + + ((conn->ksnc_proto == &ksocknal_protocol_v1x) ? + sizeof(lnet_hdr_t) : sizeof(ksock_msg_t)); + } + + /* default checking for typed connection */ + switch (conn->ksnc_type) { + default: + CERROR("ksnc_type bad: %u\n", conn->ksnc_type); + LBUG(); + case SOCKLND_CONN_ANY: + return SOCKNAL_MATCH_YES; + + case SOCKLND_CONN_BULK_IN: + return SOCKNAL_MATCH_MAY; + + case SOCKLND_CONN_BULK_OUT: + if (nob < *ksocknal_tunables.ksnd_min_bulk) + return SOCKNAL_MATCH_MAY; + else + return SOCKNAL_MATCH_YES; + + case SOCKLND_CONN_CONTROL: + if (nob >= *ksocknal_tunables.ksnd_min_bulk) + return SOCKNAL_MATCH_MAY; + else + return SOCKNAL_MATCH_YES; + } +} + +static int +ksocknal_match_tx_v3(ksock_conn_t *conn, ksock_tx_t *tx, int nonblk) +{ + int nob; + + if (tx == NULL || tx->tx_lnetmsg == NULL) + nob = offsetof(ksock_msg_t, ksm_u); + else + nob = tx->tx_lnetmsg->msg_len + sizeof(ksock_msg_t); + + switch (conn->ksnc_type) { + default: + CERROR("ksnc_type bad: %u\n", conn->ksnc_type); + LBUG(); + case SOCKLND_CONN_ANY: + return SOCKNAL_MATCH_NO; + + case SOCKLND_CONN_ACK: + if (nonblk) + return SOCKNAL_MATCH_YES; + else if (tx == NULL || tx->tx_lnetmsg == NULL) + return SOCKNAL_MATCH_MAY; + else + return SOCKNAL_MATCH_NO; + + case SOCKLND_CONN_BULK_OUT: + if (nonblk) + return SOCKNAL_MATCH_NO; + else if (nob < *ksocknal_tunables.ksnd_min_bulk) + return SOCKNAL_MATCH_MAY; + else + return SOCKNAL_MATCH_YES; + + case SOCKLND_CONN_CONTROL: + if (nonblk) + return SOCKNAL_MATCH_NO; + else if (nob >= *ksocknal_tunables.ksnd_min_bulk) + return SOCKNAL_MATCH_MAY; + else + return SOCKNAL_MATCH_YES; + } +} + +/* (Sink) handle incoming ZC request from sender */ +static int +ksocknal_handle_zcreq(ksock_conn_t *c, __u64 cookie, int remote) +{ + ksock_peer_t *peer = c->ksnc_peer; + ksock_conn_t *conn; + ksock_tx_t *tx; + int rc; + + read_lock(&ksocknal_data.ksnd_global_lock); + + conn = ksocknal_find_conn_locked(peer, NULL, !!remote); + if (conn != NULL) { + ksock_sched_t *sched = conn->ksnc_scheduler; + + LASSERT(conn->ksnc_proto->pro_queue_tx_zcack != NULL); + + spin_lock_bh(&sched->kss_lock); + + rc = conn->ksnc_proto->pro_queue_tx_zcack(conn, NULL, cookie); + + spin_unlock_bh(&sched->kss_lock); + + if (rc) { /* piggybacked */ + read_unlock(&ksocknal_data.ksnd_global_lock); + return 0; + } + } + + read_unlock(&ksocknal_data.ksnd_global_lock); + + /* ACK connection is not ready, or can't piggyback the ACK */ + tx = ksocknal_alloc_tx_noop(cookie, !!remote); + if (tx == NULL) + return -ENOMEM; + + rc = ksocknal_launch_packet(peer->ksnp_ni, tx, peer->ksnp_id); + if (rc == 0) + return 0; + + ksocknal_free_tx(tx); + return rc; +} + +/* (Sender) handle ZC_ACK from sink */ +static int +ksocknal_handle_zcack(ksock_conn_t *conn, __u64 cookie1, __u64 cookie2) +{ + ksock_peer_t *peer = conn->ksnc_peer; + ksock_tx_t *tx; + ksock_tx_t *tmp; + 
LIST_HEAD(zlist); + int count; + + if (cookie1 == 0) + cookie1 = cookie2; + + count = (cookie1 > cookie2) ? 2 : (cookie2 - cookie1 + 1); + + if (cookie2 == SOCKNAL_KEEPALIVE_PING && + conn->ksnc_proto == &ksocknal_protocol_v3x) { + /* keepalive PING for V3.x, just ignore it */ + return count == 1 ? 0 : -EPROTO; + } + + spin_lock(&peer->ksnp_lock); + + list_for_each_entry_safe(tx, tmp, + &peer->ksnp_zc_req_list, tx_zc_list) { + __u64 c = tx->tx_msg.ksm_zc_cookies[0]; + + if (c == cookie1 || c == cookie2 || (cookie1 < c && c < cookie2)) { + tx->tx_msg.ksm_zc_cookies[0] = 0; + list_del(&tx->tx_zc_list); + list_add(&tx->tx_zc_list, &zlist); + + if (--count == 0) + break; + } + } + + spin_unlock(&peer->ksnp_lock); + + while (!list_empty(&zlist)) { + tx = list_entry(zlist.next, ksock_tx_t, tx_zc_list); + list_del(&tx->tx_zc_list); + ksocknal_tx_decref(tx); + } + + return count == 0 ? 0 : -EPROTO; +} + +static int +ksocknal_send_hello_v1(ksock_conn_t *conn, ksock_hello_msg_t *hello) +{ + struct socket *sock = conn->ksnc_sock; + lnet_hdr_t *hdr; + lnet_magicversion_t *hmv; + int rc; + int i; + + CLASSERT(sizeof(lnet_magicversion_t) == offsetof(lnet_hdr_t, src_nid)); + + LIBCFS_ALLOC(hdr, sizeof(*hdr)); + if (hdr == NULL) { + CERROR("Can't allocate lnet_hdr_t\n"); + return -ENOMEM; + } + + hmv = (lnet_magicversion_t *)&hdr->dest_nid; + + /* Re-organize V2.x message header to V1.x (lnet_hdr_t) + * header and send out */ + hmv->magic = cpu_to_le32 (LNET_PROTO_TCP_MAGIC); + hmv->version_major = cpu_to_le16 (KSOCK_PROTO_V1_MAJOR); + hmv->version_minor = cpu_to_le16 (KSOCK_PROTO_V1_MINOR); + + if (the_lnet.ln_testprotocompat != 0) { + /* single-shot proto check */ + LNET_LOCK(); + if ((the_lnet.ln_testprotocompat & 1) != 0) { + hmv->version_major++; /* just different! */ + the_lnet.ln_testprotocompat &= ~1; + } + if ((the_lnet.ln_testprotocompat & 2) != 0) { + hmv->magic = LNET_PROTO_MAGIC; + the_lnet.ln_testprotocompat &= ~2; + } + LNET_UNLOCK(); + } + + hdr->src_nid = cpu_to_le64 (hello->kshm_src_nid); + hdr->src_pid = cpu_to_le32 (hello->kshm_src_pid); + hdr->type = cpu_to_le32 (LNET_MSG_HELLO); + hdr->payload_length = cpu_to_le32 (hello->kshm_nips * sizeof(__u32)); + hdr->msg.hello.type = cpu_to_le32 (hello->kshm_ctype); + hdr->msg.hello.incarnation = cpu_to_le64 (hello->kshm_src_incarnation); + + rc = libcfs_sock_write(sock, hdr, sizeof(*hdr), + lnet_acceptor_timeout()); + + if (rc != 0) { + CNETERR("Error %d sending HELLO hdr to %pI4h/%d\n", + rc, &conn->ksnc_ipaddr, conn->ksnc_port); + goto out; + } + + if (hello->kshm_nips == 0) + goto out; + + for (i = 0; i < (int) hello->kshm_nips; i++) { + hello->kshm_ips[i] = __cpu_to_le32 (hello->kshm_ips[i]); + } + + rc = libcfs_sock_write(sock, hello->kshm_ips, + hello->kshm_nips * sizeof(__u32), + lnet_acceptor_timeout()); + if (rc != 0) { + CNETERR("Error %d sending HELLO payload (%d) to %pI4h/%d\n", + rc, hello->kshm_nips, + &conn->ksnc_ipaddr, conn->ksnc_port); + } +out: + LIBCFS_FREE(hdr, sizeof(*hdr)); + + return rc; +} + +static int +ksocknal_send_hello_v2(ksock_conn_t *conn, ksock_hello_msg_t *hello) +{ + struct socket *sock = conn->ksnc_sock; + int rc; + + hello->kshm_magic = LNET_PROTO_MAGIC; + hello->kshm_version = conn->ksnc_proto->pro_version; + + if (the_lnet.ln_testprotocompat != 0) { + /* single-shot proto check */ + LNET_LOCK(); + if ((the_lnet.ln_testprotocompat & 1) != 0) { + hello->kshm_version++; /* just different! 
*/ + the_lnet.ln_testprotocompat &= ~1; + } + LNET_UNLOCK(); + } + + rc = libcfs_sock_write(sock, hello, offsetof(ksock_hello_msg_t, kshm_ips), + lnet_acceptor_timeout()); + + if (rc != 0) { + CNETERR("Error %d sending HELLO hdr to %pI4h/%d\n", + rc, &conn->ksnc_ipaddr, conn->ksnc_port); + return rc; + } + + if (hello->kshm_nips == 0) + return 0; + + rc = libcfs_sock_write(sock, hello->kshm_ips, + hello->kshm_nips * sizeof(__u32), + lnet_acceptor_timeout()); + if (rc != 0) { + CNETERR("Error %d sending HELLO payload (%d) to %pI4h/%d\n", + rc, hello->kshm_nips, + &conn->ksnc_ipaddr, conn->ksnc_port); + } + + return rc; +} + +static int +ksocknal_recv_hello_v1(ksock_conn_t *conn, ksock_hello_msg_t *hello, + int timeout) +{ + struct socket *sock = conn->ksnc_sock; + lnet_hdr_t *hdr; + int rc; + int i; + + LIBCFS_ALLOC(hdr, sizeof(*hdr)); + if (hdr == NULL) { + CERROR("Can't allocate lnet_hdr_t\n"); + return -ENOMEM; + } + + rc = libcfs_sock_read(sock, &hdr->src_nid, + sizeof(*hdr) - offsetof(lnet_hdr_t, src_nid), + timeout); + if (rc != 0) { + CERROR("Error %d reading rest of HELLO hdr from %pI4h\n", + rc, &conn->ksnc_ipaddr); + LASSERT(rc < 0 && rc != -EALREADY); + goto out; + } + + /* ...and check we got what we expected */ + if (hdr->type != cpu_to_le32 (LNET_MSG_HELLO)) { + CERROR("Expecting a HELLO hdr, but got type %d from %pI4h\n", + le32_to_cpu(hdr->type), + &conn->ksnc_ipaddr); + rc = -EPROTO; + goto out; + } + + hello->kshm_src_nid = le64_to_cpu(hdr->src_nid); + hello->kshm_src_pid = le32_to_cpu(hdr->src_pid); + hello->kshm_src_incarnation = le64_to_cpu(hdr->msg.hello.incarnation); + hello->kshm_ctype = le32_to_cpu(hdr->msg.hello.type); + hello->kshm_nips = le32_to_cpu(hdr->payload_length) / + sizeof(__u32); + + if (hello->kshm_nips > LNET_MAX_INTERFACES) { + CERROR("Bad nips %d from ip %pI4h\n", + hello->kshm_nips, &conn->ksnc_ipaddr); + rc = -EPROTO; + goto out; + } + + if (hello->kshm_nips == 0) + goto out; + + rc = libcfs_sock_read(sock, hello->kshm_ips, + hello->kshm_nips * sizeof(__u32), timeout); + if (rc != 0) { + CERROR("Error %d reading IPs from ip %pI4h\n", + rc, &conn->ksnc_ipaddr); + LASSERT(rc < 0 && rc != -EALREADY); + goto out; + } + + for (i = 0; i < (int) hello->kshm_nips; i++) { + hello->kshm_ips[i] = __le32_to_cpu(hello->kshm_ips[i]); + + if (hello->kshm_ips[i] == 0) { + CERROR("Zero IP[%d] from ip %pI4h\n", + i, &conn->ksnc_ipaddr); + rc = -EPROTO; + break; + } + } +out: + LIBCFS_FREE(hdr, sizeof(*hdr)); + + return rc; +} + +static int +ksocknal_recv_hello_v2(ksock_conn_t *conn, ksock_hello_msg_t *hello, int timeout) +{ + struct socket *sock = conn->ksnc_sock; + int rc; + int i; + + if (hello->kshm_magic == LNET_PROTO_MAGIC) + conn->ksnc_flip = 0; + else + conn->ksnc_flip = 1; + + rc = libcfs_sock_read(sock, &hello->kshm_src_nid, + offsetof(ksock_hello_msg_t, kshm_ips) - + offsetof(ksock_hello_msg_t, kshm_src_nid), + timeout); + if (rc != 0) { + CERROR("Error %d reading HELLO from %pI4h\n", + rc, &conn->ksnc_ipaddr); + LASSERT(rc < 0 && rc != -EALREADY); + return rc; + } + + if (conn->ksnc_flip) { + __swab32s(&hello->kshm_src_pid); + __swab64s(&hello->kshm_src_nid); + __swab32s(&hello->kshm_dst_pid); + __swab64s(&hello->kshm_dst_nid); + __swab64s(&hello->kshm_src_incarnation); + __swab64s(&hello->kshm_dst_incarnation); + __swab32s(&hello->kshm_ctype); + __swab32s(&hello->kshm_nips); + } + + if (hello->kshm_nips > LNET_MAX_INTERFACES) { + CERROR("Bad nips %d from ip %pI4h\n", + hello->kshm_nips, &conn->ksnc_ipaddr); + return -EPROTO; + } + + if (hello->kshm_nips 
== 0) + return 0; + + rc = libcfs_sock_read(sock, hello->kshm_ips, + hello->kshm_nips * sizeof(__u32), timeout); + if (rc != 0) { + CERROR("Error %d reading IPs from ip %pI4h\n", + rc, &conn->ksnc_ipaddr); + LASSERT(rc < 0 && rc != -EALREADY); + return rc; + } + + for (i = 0; i < (int) hello->kshm_nips; i++) { + if (conn->ksnc_flip) + __swab32s(&hello->kshm_ips[i]); + + if (hello->kshm_ips[i] == 0) { + CERROR("Zero IP[%d] from ip %pI4h\n", + i, &conn->ksnc_ipaddr); + return -EPROTO; + } + } + + return 0; +} + +static void +ksocknal_pack_msg_v1(ksock_tx_t *tx) +{ + /* V1.x has no KSOCK_MSG_NOOP */ + LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); + LASSERT(tx->tx_lnetmsg != NULL); + + tx->tx_iov[0].iov_base = &tx->tx_lnetmsg->msg_hdr; + tx->tx_iov[0].iov_len = sizeof(lnet_hdr_t); + + tx->tx_resid = tx->tx_nob = tx->tx_lnetmsg->msg_len + sizeof(lnet_hdr_t); +} + +static void +ksocknal_pack_msg_v2(ksock_tx_t *tx) +{ + tx->tx_iov[0].iov_base = &tx->tx_msg; + + if (tx->tx_lnetmsg != NULL) { + LASSERT(tx->tx_msg.ksm_type != KSOCK_MSG_NOOP); + + tx->tx_msg.ksm_u.lnetmsg.ksnm_hdr = tx->tx_lnetmsg->msg_hdr; + tx->tx_iov[0].iov_len = sizeof(ksock_msg_t); + tx->tx_resid = tx->tx_nob = sizeof(ksock_msg_t) + tx->tx_lnetmsg->msg_len; + } else { + LASSERT(tx->tx_msg.ksm_type == KSOCK_MSG_NOOP); + + tx->tx_iov[0].iov_len = offsetof(ksock_msg_t, ksm_u.lnetmsg.ksnm_hdr); + tx->tx_resid = tx->tx_nob = offsetof(ksock_msg_t, ksm_u.lnetmsg.ksnm_hdr); + } + /* Don't checksum before start sending, because packet can be piggybacked with ACK */ +} + +static void +ksocknal_unpack_msg_v1(ksock_msg_t *msg) +{ + msg->ksm_csum = 0; + msg->ksm_type = KSOCK_MSG_LNET; + msg->ksm_zc_cookies[0] = msg->ksm_zc_cookies[1] = 0; +} + +static void +ksocknal_unpack_msg_v2(ksock_msg_t *msg) +{ + return; /* Do nothing */ +} + +ksock_proto_t ksocknal_protocol_v1x = { + .pro_version = KSOCK_PROTO_V1, + .pro_send_hello = ksocknal_send_hello_v1, + .pro_recv_hello = ksocknal_recv_hello_v1, + .pro_pack = ksocknal_pack_msg_v1, + .pro_unpack = ksocknal_unpack_msg_v1, + .pro_queue_tx_msg = ksocknal_queue_tx_msg_v1, + .pro_handle_zcreq = NULL, + .pro_handle_zcack = NULL, + .pro_queue_tx_zcack = NULL, + .pro_match_tx = ksocknal_match_tx +}; + +ksock_proto_t ksocknal_protocol_v2x = { + .pro_version = KSOCK_PROTO_V2, + .pro_send_hello = ksocknal_send_hello_v2, + .pro_recv_hello = ksocknal_recv_hello_v2, + .pro_pack = ksocknal_pack_msg_v2, + .pro_unpack = ksocknal_unpack_msg_v2, + .pro_queue_tx_msg = ksocknal_queue_tx_msg_v2, + .pro_queue_tx_zcack = ksocknal_queue_tx_zcack_v2, + .pro_handle_zcreq = ksocknal_handle_zcreq, + .pro_handle_zcack = ksocknal_handle_zcack, + .pro_match_tx = ksocknal_match_tx +}; + +ksock_proto_t ksocknal_protocol_v3x = { + .pro_version = KSOCK_PROTO_V3, + .pro_send_hello = ksocknal_send_hello_v2, + .pro_recv_hello = ksocknal_recv_hello_v2, + .pro_pack = ksocknal_pack_msg_v2, + .pro_unpack = ksocknal_unpack_msg_v2, + .pro_queue_tx_msg = ksocknal_queue_tx_msg_v2, + .pro_queue_tx_zcack = ksocknal_queue_tx_zcack_v3, + .pro_handle_zcreq = ksocknal_handle_zcreq, + .pro_handle_zcack = ksocknal_handle_zcack, + .pro_match_tx = ksocknal_match_tx_v3 +}; -- cgit v1.2.3-54-g00ecf
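
/*
 * ksocknal_handle_zcack() above decodes its two cookies as either two
 * discrete values (cookie1 > cookie2) or an inclusive range (cookie1 <=
 * cookie2, with cookie1 == 0 meaning "cookie2 only"), mirroring how
 * ksocknal_queue_tx_zcack_v3() coalesces ACKs on the send side. A small
 * standalone check of that decoding; zcack_count() and zcack_matches()
 * are illustrative names for logic lifted from the handler, not driver
 * API.
 */
#include <assert.h>

typedef unsigned long long u64;

static int zcack_count(u64 cookie1, u64 cookie2)
{
	if (cookie1 == 0)		/* single cookie carried in cookie2 */
		cookie1 = cookie2;
	return (cookie1 > cookie2) ? 2 : (int)(cookie2 - cookie1 + 1);
}

static int zcack_matches(u64 c, u64 cookie1, u64 cookie2)
{
	return c == cookie1 || c == cookie2 ||
	       (cookie1 < c && c < cookie2);
}

int main(void)
{
	assert(zcack_count(0, 7) == 1);		/* lone ACK */
	assert(zcack_count(9, 5) == 2);		/* two discrete cookies */
	assert(zcack_count(5, 9) == 5);		/* range 5..9 inclusive */
	assert(zcack_matches(6, 5, 9));		/* falls inside the range */
	assert(!zcack_matches(4, 5, 9));	/* outside the range */
	return 0;
}
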