author     André Fabian Silva Delgado <emulatorman@parabola.nu>   2015-08-05 17:04:01 -0300
committer  André Fabian Silva Delgado <emulatorman@parabola.nu>   2015-08-05 17:04:01 -0300
commit     57f0f512b273f60d52568b8c6b77e17f5636edc0 (patch)
tree       5e910f0e82173f4ef4f51111366a3f1299037a7b /net/netfilter/ipvs
Initial import
Diffstat (limited to 'net/netfilter/ipvs')
-rw-r--r--   net/netfilter/ipvs/Kconfig                 291
-rw-r--r--   net/netfilter/ipvs/Makefile                 41
-rw-r--r--   net/netfilter/ipvs/ip_vs_app.c             627
-rw-r--r--   net/netfilter/ipvs/ip_vs_conn.c           1418
-rw-r--r--   net/netfilter/ipvs/ip_vs_core.c           2136
-rw-r--r--   net/netfilter/ipvs/ip_vs_ctl.c            3956
-rw-r--r--   net/netfilter/ipvs/ip_vs_dh.c              276
-rw-r--r--   net/netfilter/ipvs/ip_vs_est.c             209
-rw-r--r--   net/netfilter/ipvs/ip_vs_fo.c               79
-rw-r--r--   net/netfilter/ipvs/ip_vs_ftp.c             503
-rw-r--r--   net/netfilter/ipvs/ip_vs_lblc.c            633
-rw-r--r--   net/netfilter/ipvs/ip_vs_lblcr.c           818
-rw-r--r--   net/netfilter/ipvs/ip_vs_lc.c               93
-rw-r--r--   net/netfilter/ipvs/ip_vs_nfct.c            299
-rw-r--r--   net/netfilter/ipvs/ip_vs_nq.c              143
-rw-r--r--   net/netfilter/ipvs/ip_vs_pe.c              110
-rw-r--r--   net/netfilter/ipvs/ip_vs_pe_sip.c          171
-rw-r--r--   net/netfilter/ipvs/ip_vs_proto.c           409
-rw-r--r--   net/netfilter/ipvs/ip_vs_proto_ah_esp.c    165
-rw-r--r--   net/netfilter/ipvs/ip_vs_proto_sctp.c      591
-rw-r--r--   net/netfilter/ipvs/ip_vs_proto_tcp.c       712
-rw-r--r--   net/netfilter/ipvs/ip_vs_proto_udp.c       499
-rw-r--r--   net/netfilter/ipvs/ip_vs_rr.c              130
-rw-r--r--   net/netfilter/ipvs/ip_vs_sched.c           254
-rw-r--r--   net/netfilter/ipvs/ip_vs_sed.c             144
-rw-r--r--   net/netfilter/ipvs/ip_vs_sh.c              385
-rw-r--r--   net/netfilter/ipvs/ip_vs_sync.c           1975
-rw-r--r--   net/netfilter/ipvs/ip_vs_wlc.c             116
-rw-r--r--   net/netfilter/ipvs/ip_vs_wrr.c             270
-rw-r--r--   net/netfilter/ipvs/ip_vs_xmit.c           1380
30 files changed, 18833 insertions, 0 deletions
diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig
new file mode 100644
index 000000000..3b6929dec
--- /dev/null
+++ b/net/netfilter/ipvs/Kconfig
@@ -0,0 +1,291 @@
+#
+# IP Virtual Server configuration
+#
+menuconfig IP_VS
+ tristate "IP virtual server support"
+ depends on NET && INET && NETFILTER
+ depends on (NF_CONNTRACK || NF_CONNTRACK=n)
+ ---help---
+ IP Virtual Server support will let you build a high-performance
+ virtual server based on a cluster of two or more real servers. This
+ option must be enabled for at least one of the clustered computers
+ that will take care of intercepting incoming connections to a
+ single IP address and scheduling them to real servers.
+
+ Three request dispatching techniques are implemented: virtual
+ server via NAT, virtual server via tunneling and virtual server
+ via direct routing. Several scheduling algorithms can
+ be used to choose which server the connection is directed to,
+ thus load balancing can be achieved among the servers. For more
+ information and its administration program, please visit the
+ following URL: <http://www.linuxvirtualserver.org/>.
+
+ If you want to compile it into the kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+if IP_VS
+
+config IP_VS_IPV6
+ bool "IPv6 support for IPVS"
+ depends on IPV6 = y || IP_VS = IPV6
+ select IP6_NF_IPTABLES
+ ---help---
+ Add IPv6 support to IPVS.
+
+ Say Y if unsure.
+
+config IP_VS_DEBUG
+ bool "IP virtual server debugging"
+ ---help---
+ Say Y here if you want to get additional messages useful in
+ debugging the IP virtual server code. You can change the debug
+ level in /proc/sys/net/ipv4/vs/debug_level
+
+config IP_VS_TAB_BITS
+ int "IPVS connection table size (the Nth power of 2)"
+ range 8 20
+ default 12
+ ---help---
+ The IPVS connection hash table uses the chaining scheme to handle
+ hash collisions. Using a big IPVS connection hash table will greatly
+ reduce conflicts when there are hundreds of thousands of connections
+ in the hash table.
+
+ Note that the table size must be a power of 2. The table size will
+ be 2 raised to the power of the number you enter here. The number
+ can be from 8 to 20; the default is 12, which means a table size
+ of 4096. Don't choose a number that is too small, otherwise you
+ will lose performance. You can adapt the table size to your
+ virtual server application. It is good to set the table size not
+ far less than the number of connections per second multiplied by
+ the average time a connection stays in the table. For example, if
+ your virtual server gets 200 connections per second and a
+ connection stays in the table for 200 seconds on average, the
+ table size should be not far less than 200x200, so it is good to
+ set the table size to 32768 (2**15).
+
+ Also note that each connection effectively occupies 128 bytes and
+ each hash entry uses 8 bytes, so you can estimate how much memory
+ is needed for your box.
+
+ You can override this number by setting the conn_tab_bits module
+ parameter or by appending ip_vs.conn_tab_bits=? to the kernel
+ command line if IP_VS was compiled built-in.
+
+comment "IPVS transport protocol load balancing support"
+
+config IP_VS_PROTO_TCP
+ bool "TCP load balancing support"
+ ---help---
+ This option enables support for load balancing TCP transport
+ protocol. Say Y if unsure.
+
+config IP_VS_PROTO_UDP
+ bool "UDP load balancing support"
+ ---help---
+ This option enables support for load balancing UDP transport
+ protocol. Say Y if unsure.
+
+config IP_VS_PROTO_AH_ESP
+ def_bool IP_VS_PROTO_ESP || IP_VS_PROTO_AH
+
+config IP_VS_PROTO_ESP
+ bool "ESP load balancing support"
+ ---help---
+ This option enables support for load balancing ESP (Encapsulation
+ Security Payload) transport protocol. Say Y if unsure.
+
+config IP_VS_PROTO_AH
+ bool "AH load balancing support"
+ ---help---
+ This option enables support for load balancing AH (Authentication
+ Header) transport protocol. Say Y if unsure.
+
+config IP_VS_PROTO_SCTP
+ bool "SCTP load balancing support"
+ select LIBCRC32C
+ ---help---
+ This option enables support for load balancing SCTP transport
+ protocol. Say Y if unsure.
+
+comment "IPVS scheduler"
+
+config IP_VS_RR
+ tristate "round-robin scheduling"
+ ---help---
+ The round-robin scheduling algorithm simply directs network
+ connections to different real servers in a round-robin manner.
+
+ If you want to compile it into the kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+config IP_VS_WRR
+ tristate "weighted round-robin scheduling"
+ ---help---
+ The weighted round-robin scheduling algorithm directs network
+ connections to different real servers based on server weights
+ in a round-robin manner. Servers with higher weights receive
+ new connections before those with lower weights; servers with
+ higher weights get more connections than those with lower weights,
+ and servers with equal weights get an equal share of connections.
+
+ If you want to compile it into the kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+config IP_VS_LC
+ tristate "least-connection scheduling"
+ ---help---
+ The least-connection scheduling algorithm directs network
+ connections to the server with the least number of active
+ connections.
+
+ If you want to compile it into the kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+config IP_VS_WLC
+ tristate "weighted least-connection scheduling"
+ ---help---
+ The weighted least-connection scheduling algorithm directs network
+ connections to the server with the least active connections
+ normalized by the server weight.
+
+ If you want to compile it into the kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+config IP_VS_FO
+ tristate "weighted failover scheduling"
+ ---help---
+ The weighted failover scheduling algorithm directs network
+ connections to the server with the highest weight that is
+ currently available.
+
+ If you want to compile it into the kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+config IP_VS_LBLC
+ tristate "locality-based least-connection scheduling"
+ ---help---
+ The locality-based least-connection scheduling algorithm is for
+ destination IP load balancing. It is usually used in cache clusters.
+ This algorithm usually directs packets destined for an IP address to
+ its assigned server if the server is alive and not overloaded. If
+ the server is overloaded (its number of active connections is larger
+ than its weight) and there is a server at half of its load, then the
+ weighted least-connection server is allocated to this IP address.
+
+ If you want to compile it into the kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+config IP_VS_LBLCR
+ tristate "locality-based least-connection with replication scheduling"
+ ---help---
+ The locality-based least-connection with replication scheduling
+ algorithm is also for destination IP load balancing. It is
+ usually used in cache clusters. It differs from LBLC scheduling
+ as follows: the load balancer maintains mappings from a target
+ to a set of server nodes that can serve the target. Requests for
+ a target are assigned to the least-connection node in the target's
+ server set. If all the nodes in the server set are overloaded,
+ it picks a least-connection node from the cluster and adds it
+ to the server set for the target. If the server set has not been
+ modified for the specified time, the most loaded node is removed
+ from the server set, in order to avoid a high degree of replication.
+
+ If you want to compile it into the kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+config IP_VS_DH
+ tristate "destination hashing scheduling"
+ ---help---
+ The destination hashing scheduling algorithm assigns network
+ connections to the servers by looking up a statically assigned
+ hash table keyed by their destination IP addresses.
+
+ If you want to compile it into the kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+config IP_VS_SH
+ tristate "source hashing scheduling"
+ ---help---
+ The source hashing scheduling algorithm assigns network
+ connections to the servers by looking up a statically assigned
+ hash table keyed by their source IP addresses.
+
+ If you want to compile it into the kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+config IP_VS_SED
+ tristate "shortest expected delay scheduling"
+ ---help---
+ The shortest expected delay scheduling algorithm assigns network
+ connections to the server with the shortest expected delay. The
+ expected delay that the job will experience is (Ci + 1) / Ui if
+ sent to the ith server, in which Ci is the number of connections
+ on the ith server and Ui is the fixed service rate (weight)
+ of the ith server.
+
+ If you want to compile it into the kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+config IP_VS_NQ
+ tristate "never queue scheduling"
+ ---help---
+ The never queue scheduling algorithm adopts a two-speed model.
+ When there is an idle server available, the job will be sent to
+ the idle server, instead of waiting for a fast one. When there
+ is no idle server available, the job will be sent to the server
+ that minimizes its expected delay (the Shortest Expected Delay
+ scheduling algorithm).
+
+ If you want to compile it into the kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+comment 'IPVS SH scheduler'
+
+config IP_VS_SH_TAB_BITS
+ int "IPVS source hashing table size (the Nth power of 2)"
+ range 4 20
+ default 8
+ ---help---
+ The source hashing scheduler maps source IPs to destinations
+ stored in a hash table. This table is tiled by each destination
+ until all slots in the table are filled. When using weights to
+ allow destinations to receive more connections, the table is
+ tiled in an amount proportional to the weights specified. The table
+ needs to be large enough to effectively fit all the destinations
+ multiplied by their respective weights.
+
+comment 'IPVS application helper'
+
+config IP_VS_FTP
+ tristate "FTP protocol helper"
+ depends on IP_VS_PROTO_TCP && NF_CONNTRACK && NF_NAT && \
+ NF_CONNTRACK_FTP
+ select IP_VS_NFCT
+ ---help---
+ FTP is a protocol that transfers IP addresses and/or port numbers in
+ the payload. With virtual server via Network Address Translation,
+ the IP address and port number of real servers cannot be sent to
+ clients in FTP connections directly, so an FTP protocol helper is
+ required to track the connection and mangle it back to that of the
+ virtual service.
+
+ If you want to compile it into the kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
+config IP_VS_NFCT
+ bool "Netfilter connection tracking"
+ depends on NF_CONNTRACK
+ ---help---
+ The Netfilter connection tracking support allows the IPVS
+ connection state to be exported to the Netfilter framework
+ for filtering purposes.
+
+config IP_VS_PE_SIP
+ tristate "SIP persistence engine"
+ depends on IP_VS_PROTO_UDP
+ depends on NF_CONNTRACK_SIP
+ ---help---
+ Allow persistence based on the SIP Call-ID
+
+endif # IP_VS
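
As an aside on the IP_VS_TAB_BITS help text above: the recommended sizing is
connections-per-second multiplied by the average time a connection stays in
the table, taken to a nearby power of two, with memory estimated at roughly
128 bytes per connection plus 8 bytes per hash bucket. The small user-space
program below is only a sketch of that arithmetic and is not part of this
patch; the 200 conn/s and 200 s figures are the example values from the help
text.

/* Sketch only, not part of the IPVS patch: reproduces the table-sizing
 * arithmetic from the IP_VS_TAB_BITS help text. */
#include <stdio.h>

int main(void)
{
	unsigned long conns_per_sec = 200;  /* example rate from the help text */
	unsigned long avg_lifetime  = 200;  /* seconds a connection stays in the table */
	unsigned long want = conns_per_sec * avg_lifetime;  /* ~40000 entries */
	unsigned int bits = 8;              /* Kconfig range is 8..20 */

	/* pick the power of two closest to the wanted table size */
	while (bits < 20 && (1UL << bits) < want)
		bits++;
	if (bits > 8 && want - (1UL << (bits - 1)) < (1UL << bits) - want)
		bits--;

	printf("conn_tab_bits = %u (table size %lu)\n", bits, 1UL << bits);
	printf("approx. memory: %lu KiB for connections + %lu KiB for buckets\n",
	       want * 128 / 1024, (1UL << bits) * 8 / 1024);
	return 0;
}

With the example inputs this prints conn_tab_bits = 15 and a table size of
32768, which matches the 2**15 recommendation in the help text.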
diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile
new file mode 100644
index 000000000..38b2723b2
--- /dev/null
+++ b/net/netfilter/ipvs/Makefile
@@ -0,0 +1,41 @@
+#
+# Makefile for the IPVS modules on top of IPv4.
+#
+
+# IPVS transport protocol load balancing support
+ip_vs_proto-objs-y :=
+ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o
+ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o
+ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah_esp.o
+ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_SCTP) += ip_vs_proto_sctp.o
+
+ip_vs-extra_objs-y :=
+ip_vs-extra_objs-$(CONFIG_IP_VS_NFCT) += ip_vs_nfct.o
+
+ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \
+ ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \
+ ip_vs_est.o ip_vs_proto.o ip_vs_pe.o \
+ $(ip_vs_proto-objs-y) $(ip_vs-extra_objs-y)
+
+
+# IPVS core
+obj-$(CONFIG_IP_VS) += ip_vs.o
+
+# IPVS schedulers
+obj-$(CONFIG_IP_VS_RR) += ip_vs_rr.o
+obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o
+obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o
+obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o
+obj-$(CONFIG_IP_VS_FO) += ip_vs_fo.o
+obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o
+obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o
+obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o
+obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o
+obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o
+obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
+
+# IPVS application helpers
+obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o
+
+# IPVS connection template retrievers
+obj-$(CONFIG_IP_VS_PE_SIP) += ip_vs_pe_sip.o
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
new file mode 100644
index 000000000..dfd7b65b3
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -0,0 +1,627 @@
+/*
+ * ip_vs_app.c: Application module support for IPVS
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Most code here is taken from ip_masq_app.c in kernel 2.2. The difference
+ * is that ip_vs_app module handles the reverse direction (incoming requests
+ * and outgoing responses).
+ *
+ * IP_MASQ_APP application masquerading module
+ *
+ * Author: Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar>
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/slab.h>
+#include <net/net_namespace.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <linux/stat.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/mutex.h>
+
+#include <net/ip_vs.h>
+
+EXPORT_SYMBOL(register_ip_vs_app);
+EXPORT_SYMBOL(unregister_ip_vs_app);
+EXPORT_SYMBOL(register_ip_vs_app_inc);
+
+static DEFINE_MUTEX(__ip_vs_app_mutex);
+
+/*
+ * Get an ip_vs_app object
+ */
+static inline int ip_vs_app_get(struct ip_vs_app *app)
+{
+ return try_module_get(app->module);
+}
+
+
+static inline void ip_vs_app_put(struct ip_vs_app *app)
+{
+ module_put(app->module);
+}
+
+static void ip_vs_app_inc_destroy(struct ip_vs_app *inc)
+{
+ kfree(inc->timeout_table);
+ kfree(inc);
+}
+
+static void ip_vs_app_inc_rcu_free(struct rcu_head *head)
+{
+ struct ip_vs_app *inc = container_of(head, struct ip_vs_app, rcu_head);
+
+ ip_vs_app_inc_destroy(inc);
+}
+
+/*
+ * Allocate/initialize app incarnation and register it in proto apps.
+ */
+static int
+ip_vs_app_inc_new(struct net *net, struct ip_vs_app *app, __u16 proto,
+ __u16 port)
+{
+ struct ip_vs_protocol *pp;
+ struct ip_vs_app *inc;
+ int ret;
+
+ if (!(pp = ip_vs_proto_get(proto)))
+ return -EPROTONOSUPPORT;
+
+ if (!pp->unregister_app)
+ return -EOPNOTSUPP;
+
+ inc = kmemdup(app, sizeof(*inc), GFP_KERNEL);
+ if (!inc)
+ return -ENOMEM;
+ INIT_LIST_HEAD(&inc->p_list);
+ INIT_LIST_HEAD(&inc->incs_list);
+ inc->app = app;
+ inc->port = htons(port);
+ atomic_set(&inc->usecnt, 0);
+
+ if (app->timeouts) {
+ inc->timeout_table =
+ ip_vs_create_timeout_table(app->timeouts,
+ app->timeouts_size);
+ if (!inc->timeout_table) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ ret = pp->register_app(net, inc);
+ if (ret)
+ goto out;
+
+ list_add(&inc->a_list, &app->incs_list);
+ IP_VS_DBG(9, "%s App %s:%u registered\n",
+ pp->name, inc->name, ntohs(inc->port));
+
+ return 0;
+
+ out:
+ ip_vs_app_inc_destroy(inc);
+ return ret;
+}
+
+
+/*
+ * Release app incarnation
+ */
+static void
+ip_vs_app_inc_release(struct net *net, struct ip_vs_app *inc)
+{
+ struct ip_vs_protocol *pp;
+
+ if (!(pp = ip_vs_proto_get(inc->protocol)))
+ return;
+
+ if (pp->unregister_app)
+ pp->unregister_app(net, inc);
+
+ IP_VS_DBG(9, "%s App %s:%u unregistered\n",
+ pp->name, inc->name, ntohs(inc->port));
+
+ list_del(&inc->a_list);
+
+ call_rcu(&inc->rcu_head, ip_vs_app_inc_rcu_free);
+}
+
+
+/*
+ * Get reference to app inc (only called from softirq)
+ *
+ */
+int ip_vs_app_inc_get(struct ip_vs_app *inc)
+{
+ int result;
+
+ result = ip_vs_app_get(inc->app);
+ if (result)
+ atomic_inc(&inc->usecnt);
+ return result;
+}
+
+
+/*
+ * Put the app inc (only called from timer or net softirq)
+ */
+void ip_vs_app_inc_put(struct ip_vs_app *inc)
+{
+ atomic_dec(&inc->usecnt);
+ ip_vs_app_put(inc->app);
+}
+
+
+/*
+ * Register an application incarnation in protocol applications
+ */
+int
+register_ip_vs_app_inc(struct net *net, struct ip_vs_app *app, __u16 proto,
+ __u16 port)
+{
+ int result;
+
+ mutex_lock(&__ip_vs_app_mutex);
+
+ result = ip_vs_app_inc_new(net, app, proto, port);
+
+ mutex_unlock(&__ip_vs_app_mutex);
+
+ return result;
+}
+
+
+/* Register application for netns */
+struct ip_vs_app *register_ip_vs_app(struct net *net, struct ip_vs_app *app)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_app *a;
+ int err = 0;
+
+ if (!ipvs)
+ return ERR_PTR(-ENOENT);
+
+ mutex_lock(&__ip_vs_app_mutex);
+
+ list_for_each_entry(a, &ipvs->app_list, a_list) {
+ if (!strcmp(app->name, a->name)) {
+ err = -EEXIST;
+ goto out_unlock;
+ }
+ }
+ a = kmemdup(app, sizeof(*app), GFP_KERNEL);
+ if (!a) {
+ err = -ENOMEM;
+ goto out_unlock;
+ }
+ INIT_LIST_HEAD(&a->incs_list);
+ list_add(&a->a_list, &ipvs->app_list);
+ /* increase the module use count */
+ ip_vs_use_count_inc();
+
+out_unlock:
+ mutex_unlock(&__ip_vs_app_mutex);
+
+ return err ? ERR_PTR(err) : a;
+}
+
+
+/*
+ * ip_vs_app unregistration routine
+ * We are sure there are no app incarnations attached to services
+ * Caller should use synchronize_rcu() or rcu_barrier()
+ */
+void unregister_ip_vs_app(struct net *net, struct ip_vs_app *app)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_app *a, *anxt, *inc, *nxt;
+
+ if (!ipvs)
+ return;
+
+ mutex_lock(&__ip_vs_app_mutex);
+
+ list_for_each_entry_safe(a, anxt, &ipvs->app_list, a_list) {
+ if (app && strcmp(app->name, a->name))
+ continue;
+ list_for_each_entry_safe(inc, nxt, &a->incs_list, a_list) {
+ ip_vs_app_inc_release(net, inc);
+ }
+
+ list_del(&a->a_list);
+ kfree(a);
+
+ /* decrease the module use count */
+ ip_vs_use_count_dec();
+ }
+
+ mutex_unlock(&__ip_vs_app_mutex);
+}
+
+
+/*
+ * Bind ip_vs_conn to its ip_vs_app (called by cp constructor)
+ */
+int ip_vs_bind_app(struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp)
+{
+ return pp->app_conn_bind(cp);
+}
+
+
+/*
+ * Unbind cp from application incarnation (called by cp destructor)
+ */
+void ip_vs_unbind_app(struct ip_vs_conn *cp)
+{
+ struct ip_vs_app *inc = cp->app;
+
+ if (!inc)
+ return;
+
+ if (inc->unbind_conn)
+ inc->unbind_conn(inc, cp);
+ if (inc->done_conn)
+ inc->done_conn(inc, cp);
+ ip_vs_app_inc_put(inc);
+ cp->app = NULL;
+}
+
+
+/*
+ * Fixes th->seq based on ip_vs_seq info.
+ */
+static inline void vs_fix_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
+{
+ __u32 seq = ntohl(th->seq);
+
+ /*
+ * Adjust seq with delta-offset for all packets after
+ * the most recent resized pkt seq and with previous_delta offset
+ * for all packets before most recent resized pkt seq.
+ */
+ if (vseq->delta || vseq->previous_delta) {
+ if(after(seq, vseq->init_seq)) {
+ th->seq = htonl(seq + vseq->delta);
+ IP_VS_DBG(9, "%s(): added delta (%d) to seq\n",
+ __func__, vseq->delta);
+ } else {
+ th->seq = htonl(seq + vseq->previous_delta);
+ IP_VS_DBG(9, "%s(): added previous_delta (%d) to seq\n",
+ __func__, vseq->previous_delta);
+ }
+ }
+}
+
+
+/*
+ * Fixes th->ack_seq based on ip_vs_seq info.
+ */
+static inline void
+vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
+{
+ __u32 ack_seq = ntohl(th->ack_seq);
+
+ /*
+ * Adjust ack_seq with delta-offset: packets that acknowledge data
+ * after the most recently resized pkt get the delta offset, while
+ * packets that acknowledge earlier data get previous_delta
+ */
+ if (vseq->delta || vseq->previous_delta) {
+ /* ack_seq is the sequence number of the next octet the sender
+ expects to receive, so compare it with init_seq+delta */
+ if(after(ack_seq, vseq->init_seq+vseq->delta)) {
+ th->ack_seq = htonl(ack_seq - vseq->delta);
+ IP_VS_DBG(9, "%s(): subtracted delta "
+ "(%d) from ack_seq\n", __func__, vseq->delta);
+
+ } else {
+ th->ack_seq = htonl(ack_seq - vseq->previous_delta);
+ IP_VS_DBG(9, "%s(): subtracted "
+ "previous_delta (%d) from ack_seq\n",
+ __func__, vseq->previous_delta);
+ }
+ }
+}
+
+
+/*
+ * Updates ip_vs_seq if pkt has been resized
+ * Assumes already checked proto==IPPROTO_TCP and diff!=0.
+ */
+static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq,
+ unsigned int flag, __u32 seq, int diff)
+{
+ /* spinlock is to keep updating cp->flags atomic */
+ spin_lock_bh(&cp->lock);
+ if (!(cp->flags & flag) || after(seq, vseq->init_seq)) {
+ vseq->previous_delta = vseq->delta;
+ vseq->delta += diff;
+ vseq->init_seq = seq;
+ cp->flags |= flag;
+ }
+ spin_unlock_bh(&cp->lock);
+}
+
+static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
+ struct ip_vs_app *app)
+{
+ int diff;
+ const unsigned int tcp_offset = ip_hdrlen(skb);
+ struct tcphdr *th;
+ __u32 seq;
+
+ if (!skb_make_writable(skb, tcp_offset + sizeof(*th)))
+ return 0;
+
+ th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset);
+
+ /*
+ * Remember seq number in case this pkt gets resized
+ */
+ seq = ntohl(th->seq);
+
+ /*
+ * Fix seq stuff if flagged as so.
+ */
+ if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
+ vs_fix_seq(&cp->out_seq, th);
+ if (cp->flags & IP_VS_CONN_F_IN_SEQ)
+ vs_fix_ack_seq(&cp->in_seq, th);
+
+ /*
+ * Call private output hook function
+ */
+ if (app->pkt_out == NULL)
+ return 1;
+
+ if (!app->pkt_out(app, cp, skb, &diff))
+ return 0;
+
+ /*
+ * Update ip_vs seq stuff if len has changed.
+ */
+ if (diff != 0)
+ vs_seq_update(cp, &cp->out_seq,
+ IP_VS_CONN_F_OUT_SEQ, seq, diff);
+
+ return 1;
+}
+
+/*
+ * Output pkt hook. Will call the bound ip_vs_app specific function.
+ * Called by the ipvs packet handler; assumes cp != NULL was checked
+ * by the caller. Returns false if it can't handle the packet (oom).
+ */
+int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb)
+{
+ struct ip_vs_app *app;
+
+ /*
+ * check if application module is bound to
+ * this ip_vs_conn.
+ */
+ if ((app = cp->app) == NULL)
+ return 1;
+
+ /* TCP is complicated */
+ if (cp->protocol == IPPROTO_TCP)
+ return app_tcp_pkt_out(cp, skb, app);
+
+ /*
+ * Call private output hook function
+ */
+ if (app->pkt_out == NULL)
+ return 1;
+
+ return app->pkt_out(app, cp, skb, NULL);
+}
+
+
+static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb,
+ struct ip_vs_app *app)
+{
+ int diff;
+ const unsigned int tcp_offset = ip_hdrlen(skb);
+ struct tcphdr *th;
+ __u32 seq;
+
+ if (!skb_make_writable(skb, tcp_offset + sizeof(*th)))
+ return 0;
+
+ th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset);
+
+ /*
+ * Remember seq number in case this pkt gets resized
+ */
+ seq = ntohl(th->seq);
+
+ /*
+ * Fix seq stuff if flagged as so.
+ */
+ if (cp->flags & IP_VS_CONN_F_IN_SEQ)
+ vs_fix_seq(&cp->in_seq, th);
+ if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
+ vs_fix_ack_seq(&cp->out_seq, th);
+
+ /*
+ * Call private input hook function
+ */
+ if (app->pkt_in == NULL)
+ return 1;
+
+ if (!app->pkt_in(app, cp, skb, &diff))
+ return 0;
+
+ /*
+ * Update ip_vs seq stuff if len has changed.
+ */
+ if (diff != 0)
+ vs_seq_update(cp, &cp->in_seq,
+ IP_VS_CONN_F_IN_SEQ, seq, diff);
+
+ return 1;
+}
+
+/*
+ * Input pkt hook. Will call the bound ip_vs_app specific function.
+ * Called by the ipvs packet handler; assumes cp != NULL was checked
+ * by the caller. Returns false if it can't handle the packet (oom).
+ */
+int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb)
+{
+ struct ip_vs_app *app;
+
+ /*
+ * check if application module is bound to
+ * this ip_vs_conn.
+ */
+ if ((app = cp->app) == NULL)
+ return 1;
+
+ /* TCP is complicated */
+ if (cp->protocol == IPPROTO_TCP)
+ return app_tcp_pkt_in(cp, skb, app);
+
+ /*
+ * Call private input hook function
+ */
+ if (app->pkt_in == NULL)
+ return 1;
+
+ return app->pkt_in(app, cp, skb, NULL);
+}
+
+
+#ifdef CONFIG_PROC_FS
+/*
+ * /proc/net/ip_vs_app entry function
+ */
+
+static struct ip_vs_app *ip_vs_app_idx(struct netns_ipvs *ipvs, loff_t pos)
+{
+ struct ip_vs_app *app, *inc;
+
+ list_for_each_entry(app, &ipvs->app_list, a_list) {
+ list_for_each_entry(inc, &app->incs_list, a_list) {
+ if (pos-- == 0)
+ return inc;
+ }
+ }
+ return NULL;
+
+}
+
+static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct net *net = seq_file_net(seq);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ mutex_lock(&__ip_vs_app_mutex);
+
+ return *pos ? ip_vs_app_idx(ipvs, *pos - 1) : SEQ_START_TOKEN;
+}
+
+static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct ip_vs_app *inc, *app;
+ struct list_head *e;
+ struct net *net = seq_file_net(seq);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ ++*pos;
+ if (v == SEQ_START_TOKEN)
+ return ip_vs_app_idx(ipvs, 0);
+
+ inc = v;
+ app = inc->app;
+
+ if ((e = inc->a_list.next) != &app->incs_list)
+ return list_entry(e, struct ip_vs_app, a_list);
+
+ /* go on to next application */
+ for (e = app->a_list.next; e != &ipvs->app_list; e = e->next) {
+ app = list_entry(e, struct ip_vs_app, a_list);
+ list_for_each_entry(inc, &app->incs_list, a_list) {
+ return inc;
+ }
+ }
+ return NULL;
+}
+
+static void ip_vs_app_seq_stop(struct seq_file *seq, void *v)
+{
+ mutex_unlock(&__ip_vs_app_mutex);
+}
+
+static int ip_vs_app_seq_show(struct seq_file *seq, void *v)
+{
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq, "prot port usecnt name\n");
+ else {
+ const struct ip_vs_app *inc = v;
+
+ seq_printf(seq, "%-3s %-7u %-6d %-17s\n",
+ ip_vs_proto_name(inc->protocol),
+ ntohs(inc->port),
+ atomic_read(&inc->usecnt),
+ inc->name);
+ }
+ return 0;
+}
+
+static const struct seq_operations ip_vs_app_seq_ops = {
+ .start = ip_vs_app_seq_start,
+ .next = ip_vs_app_seq_next,
+ .stop = ip_vs_app_seq_stop,
+ .show = ip_vs_app_seq_show,
+};
+
+static int ip_vs_app_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &ip_vs_app_seq_ops,
+ sizeof(struct seq_net_private));
+}
+
+static const struct file_operations ip_vs_app_fops = {
+ .owner = THIS_MODULE,
+ .open = ip_vs_app_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+#endif
+
+int __net_init ip_vs_app_net_init(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ INIT_LIST_HEAD(&ipvs->app_list);
+ proc_create("ip_vs_app", 0, net->proc_net, &ip_vs_app_fops);
+ return 0;
+}
+
+void __net_exit ip_vs_app_net_cleanup(struct net *net)
+{
+ unregister_ip_vs_app(net, NULL /* all */);
+ remove_proc_entry("ip_vs_app", net->proc_net);
+}
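
For orientation, here is a minimal sketch of how an application helper would
use the registration API defined in ip_vs_app.c above; it follows the pattern
used by ip_vs_ftp.c elsewhere in this commit. It is not part of the patch:
the name "demo", the port 2121 and the omitted pkt_in/pkt_out mangling hooks
are placeholder assumptions.

/* Hypothetical helper module sketch, not part of this patch. */
#include <linux/module.h>
#include <linux/err.h>
#include <linux/in.h>
#include <net/ip_vs.h>

static struct ip_vs_app demo_app = {
	.name		= "demo",		/* placeholder name */
	.protocol	= IPPROTO_TCP,
	.module		= THIS_MODULE,
	/* .pkt_in / .pkt_out would mangle payload addresses and report
	 * any change in length through their *diff argument */
};

static int __net_init demo_app_net_init(struct net *net)
{
	struct ip_vs_app *app;
	int ret;

	/* a per-netns copy is kept on ipvs->app_list */
	app = register_ip_vs_app(net, &demo_app);
	if (IS_ERR(app))
		return PTR_ERR(app);

	/* one incarnation per service port; 2121 is a placeholder */
	ret = register_ip_vs_app_inc(net, app, IPPROTO_TCP, 2121);
	if (ret)
		unregister_ip_vs_app(net, &demo_app);
	return ret;
}

static void __net_exit demo_app_net_exit(struct net *net)
{
	unregister_ip_vs_app(net, &demo_app);	/* drops all incarnations */
}
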
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
new file mode 100644
index 000000000..b0f7b626b
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -0,0 +1,1418 @@
+/*
+ * IPVS An implementation of the IP virtual server support for the
+ * LINUX operating system. IPVS is now implemented as a module
+ * over the Netfilter framework. IPVS can be used to build a
+ * high-performance and highly available server based on a
+ * cluster of servers.
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Peter Kese <peter.kese@ijs.si>
+ * Julian Anastasov <ja@ssi.bg>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
+ * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
+ * and others. Much of the code here is taken from the IP MASQ code of kernel 2.2.
+ *
+ * Changes:
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/interrupt.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/net.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/proc_fs.h> /* for proc_net_* */
+#include <linux/slab.h>
+#include <linux/seq_file.h>
+#include <linux/jhash.h>
+#include <linux/random.h>
+
+#include <net/net_namespace.h>
+#include <net/ip_vs.h>
+
+
+#ifndef CONFIG_IP_VS_TAB_BITS
+#define CONFIG_IP_VS_TAB_BITS 12
+#endif
+
+/*
+ * Connection hash size. Default is what was selected at compile time.
+ */
+static int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;
+module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444);
+MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size");
+
+/* size and mask values */
+int ip_vs_conn_tab_size __read_mostly;
+static int ip_vs_conn_tab_mask __read_mostly;
+
+/*
+ * Connection hash table: for input and output packets lookups of IPVS
+ */
+static struct hlist_head *ip_vs_conn_tab __read_mostly;
+
+/* SLAB cache for IPVS connections */
+static struct kmem_cache *ip_vs_conn_cachep __read_mostly;
+
+/* counter for no client port connections */
+static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);
+
+/* random value for IPVS connection hash */
+static unsigned int ip_vs_conn_rnd __read_mostly;
+
+/*
+ * Fine locking granularity for big connection hash table
+ */
+#define CT_LOCKARRAY_BITS 5
+#define CT_LOCKARRAY_SIZE (1<<CT_LOCKARRAY_BITS)
+#define CT_LOCKARRAY_MASK (CT_LOCKARRAY_SIZE-1)
+
+/* We need an addrstrlen that works with or without v6 */
+#ifdef CONFIG_IP_VS_IPV6
+#define IP_VS_ADDRSTRLEN INET6_ADDRSTRLEN
+#else
+#define IP_VS_ADDRSTRLEN (8+1)
+#endif
+
+struct ip_vs_aligned_lock
+{
+ spinlock_t l;
+} __attribute__((__aligned__(SMP_CACHE_BYTES)));
+
+/* lock array for conn table */
+static struct ip_vs_aligned_lock
+__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;
+
+static inline void ct_write_lock_bh(unsigned int key)
+{
+ spin_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+}
+
+static inline void ct_write_unlock_bh(unsigned int key)
+{
+ spin_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+}
+
+
+/*
+ * Returns hash value for IPVS connection entry
+ */
+static unsigned int ip_vs_conn_hashkey(struct net *net, int af, unsigned int proto,
+ const union nf_inet_addr *addr,
+ __be16 port)
+{
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ return (jhash_3words(jhash(addr, 16, ip_vs_conn_rnd),
+ (__force u32)port, proto, ip_vs_conn_rnd) ^
+ ((size_t)net>>8)) & ip_vs_conn_tab_mask;
+#endif
+ return (jhash_3words((__force u32)addr->ip, (__force u32)port, proto,
+ ip_vs_conn_rnd) ^
+ ((size_t)net>>8)) & ip_vs_conn_tab_mask;
+}
+
+static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p,
+ bool inverse)
+{
+ const union nf_inet_addr *addr;
+ __be16 port;
+
+ if (p->pe_data && p->pe->hashkey_raw)
+ return p->pe->hashkey_raw(p, ip_vs_conn_rnd, inverse) &
+ ip_vs_conn_tab_mask;
+
+ if (likely(!inverse)) {
+ addr = p->caddr;
+ port = p->cport;
+ } else {
+ addr = p->vaddr;
+ port = p->vport;
+ }
+
+ return ip_vs_conn_hashkey(p->net, p->af, p->protocol, addr, port);
+}
+
+static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp)
+{
+ struct ip_vs_conn_param p;
+
+ ip_vs_conn_fill_param(ip_vs_conn_net(cp), cp->af, cp->protocol,
+ &cp->caddr, cp->cport, NULL, 0, &p);
+
+ if (cp->pe) {
+ p.pe = cp->pe;
+ p.pe_data = cp->pe_data;
+ p.pe_data_len = cp->pe_data_len;
+ }
+
+ return ip_vs_conn_hashkey_param(&p, false);
+}
+
+/*
+ * Hashes ip_vs_conn in ip_vs_conn_tab by netns,proto,addr,port.
+ * returns bool success.
+ */
+static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
+{
+ unsigned int hash;
+ int ret;
+
+ if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
+ return 0;
+
+ /* Hash by protocol, client address and port */
+ hash = ip_vs_conn_hashkey_conn(cp);
+
+ ct_write_lock_bh(hash);
+ spin_lock(&cp->lock);
+
+ if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
+ cp->flags |= IP_VS_CONN_F_HASHED;
+ atomic_inc(&cp->refcnt);
+ hlist_add_head_rcu(&cp->c_list, &ip_vs_conn_tab[hash]);
+ ret = 1;
+ } else {
+ pr_err("%s(): request for already hashed, called from %pF\n",
+ __func__, __builtin_return_address(0));
+ ret = 0;
+ }
+
+ spin_unlock(&cp->lock);
+ ct_write_unlock_bh(hash);
+
+ return ret;
+}
+
+
+/*
+ * UNhashes ip_vs_conn from ip_vs_conn_tab.
+ * returns bool success. Caller should hold conn reference.
+ */
+static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
+{
+ unsigned int hash;
+ int ret;
+
+ /* unhash it and decrease its reference counter */
+ hash = ip_vs_conn_hashkey_conn(cp);
+
+ ct_write_lock_bh(hash);
+ spin_lock(&cp->lock);
+
+ if (cp->flags & IP_VS_CONN_F_HASHED) {
+ hlist_del_rcu(&cp->c_list);
+ cp->flags &= ~IP_VS_CONN_F_HASHED;
+ atomic_dec(&cp->refcnt);
+ ret = 1;
+ } else
+ ret = 0;
+
+ spin_unlock(&cp->lock);
+ ct_write_unlock_bh(hash);
+
+ return ret;
+}
+
+/* Try to unlink ip_vs_conn from ip_vs_conn_tab.
+ * returns bool success.
+ */
+static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp)
+{
+ unsigned int hash;
+ bool ret;
+
+ hash = ip_vs_conn_hashkey_conn(cp);
+
+ ct_write_lock_bh(hash);
+ spin_lock(&cp->lock);
+
+ if (cp->flags & IP_VS_CONN_F_HASHED) {
+ ret = false;
+ /* Decrease refcnt and unlink conn only if we are last user */
+ if (atomic_cmpxchg(&cp->refcnt, 1, 0) == 1) {
+ hlist_del_rcu(&cp->c_list);
+ cp->flags &= ~IP_VS_CONN_F_HASHED;
+ ret = true;
+ }
+ } else
+ ret = atomic_read(&cp->refcnt) ? false : true;
+
+ spin_unlock(&cp->lock);
+ ct_write_unlock_bh(hash);
+
+ return ret;
+}
+
+
+/*
+ * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
+ * Called for pkts coming from OUTside-to-INside.
+ * p->caddr, p->cport: pkt source address (foreign host)
+ * p->vaddr, p->vport: pkt dest address (load balancer)
+ */
+static inline struct ip_vs_conn *
+__ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
+{
+ unsigned int hash;
+ struct ip_vs_conn *cp;
+
+ hash = ip_vs_conn_hashkey_param(p, false);
+
+ rcu_read_lock();
+
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
+ if (p->cport == cp->cport && p->vport == cp->vport &&
+ cp->af == p->af &&
+ ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
+ ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) &&
+ ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
+ p->protocol == cp->protocol &&
+ ip_vs_conn_net_eq(cp, p->net)) {
+ if (!__ip_vs_conn_get(cp))
+ continue;
+ /* HIT */
+ rcu_read_unlock();
+ return cp;
+ }
+ }
+
+ rcu_read_unlock();
+
+ return NULL;
+}
+
+struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
+{
+ struct ip_vs_conn *cp;
+
+ cp = __ip_vs_conn_in_get(p);
+ if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) {
+ struct ip_vs_conn_param cport_zero_p = *p;
+ cport_zero_p.cport = 0;
+ cp = __ip_vs_conn_in_get(&cport_zero_p);
+ }
+
+ IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n",
+ ip_vs_proto_name(p->protocol),
+ IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
+ IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
+ cp ? "hit" : "not hit");
+
+ return cp;
+}
+
+static int
+ip_vs_conn_fill_param_proto(int af, const struct sk_buff *skb,
+ const struct ip_vs_iphdr *iph,
+ int inverse, struct ip_vs_conn_param *p)
+{
+ __be16 _ports[2], *pptr;
+ struct net *net = skb_net(skb);
+
+ pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph);
+ if (pptr == NULL)
+ return 1;
+
+ if (likely(!inverse))
+ ip_vs_conn_fill_param(net, af, iph->protocol, &iph->saddr,
+ pptr[0], &iph->daddr, pptr[1], p);
+ else
+ ip_vs_conn_fill_param(net, af, iph->protocol, &iph->daddr,
+ pptr[1], &iph->saddr, pptr[0], p);
+ return 0;
+}
+
+struct ip_vs_conn *
+ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb,
+ const struct ip_vs_iphdr *iph, int inverse)
+{
+ struct ip_vs_conn_param p;
+
+ if (ip_vs_conn_fill_param_proto(af, skb, iph, inverse, &p))
+ return NULL;
+
+ return ip_vs_conn_in_get(&p);
+}
+EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto);
+
+/* Get reference to connection template */
+struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
+{
+ unsigned int hash;
+ struct ip_vs_conn *cp;
+
+ hash = ip_vs_conn_hashkey_param(p, false);
+
+ rcu_read_lock();
+
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
+ if (unlikely(p->pe_data && p->pe->ct_match)) {
+ if (!ip_vs_conn_net_eq(cp, p->net))
+ continue;
+ if (p->pe == cp->pe && p->pe->ct_match(p, cp)) {
+ if (__ip_vs_conn_get(cp))
+ goto out;
+ }
+ continue;
+ }
+
+ if (cp->af == p->af &&
+ ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
+ /* protocol should only be IPPROTO_IP if
+ * p->vaddr is a fwmark */
+ ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC :
+ p->af, p->vaddr, &cp->vaddr) &&
+ p->vport == cp->vport && p->cport == cp->cport &&
+ cp->flags & IP_VS_CONN_F_TEMPLATE &&
+ p->protocol == cp->protocol &&
+ ip_vs_conn_net_eq(cp, p->net)) {
+ if (__ip_vs_conn_get(cp))
+ goto out;
+ }
+ }
+ cp = NULL;
+
+ out:
+ rcu_read_unlock();
+
+ IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n",
+ ip_vs_proto_name(p->protocol),
+ IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
+ IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
+ cp ? "hit" : "not hit");
+
+ return cp;
+}
+
+/* Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
+ * Called for pkts coming from inside-to-OUTside.
+ * p->caddr, p->cport: pkt source address (inside host)
+ * p->vaddr, p->vport: pkt dest address (foreign host) */
+struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
+{
+ unsigned int hash;
+ struct ip_vs_conn *cp, *ret=NULL;
+
+ /*
+ * Check for "full" addressed entries
+ */
+ hash = ip_vs_conn_hashkey_param(p, true);
+
+ rcu_read_lock();
+
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
+ if (p->vport == cp->cport && p->cport == cp->dport &&
+ cp->af == p->af &&
+ ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) &&
+ ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) &&
+ p->protocol == cp->protocol &&
+ ip_vs_conn_net_eq(cp, p->net)) {
+ if (!__ip_vs_conn_get(cp))
+ continue;
+ /* HIT */
+ ret = cp;
+ break;
+ }
+ }
+
+ rcu_read_unlock();
+
+ IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n",
+ ip_vs_proto_name(p->protocol),
+ IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
+ IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
+ ret ? "hit" : "not hit");
+
+ return ret;
+}
+
+struct ip_vs_conn *
+ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb,
+ const struct ip_vs_iphdr *iph, int inverse)
+{
+ struct ip_vs_conn_param p;
+
+ if (ip_vs_conn_fill_param_proto(af, skb, iph, inverse, &p))
+ return NULL;
+
+ return ip_vs_conn_out_get(&p);
+}
+EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto);
+
+/*
+ * Put back the conn and restart its timer with its timeout
+ */
+void ip_vs_conn_put(struct ip_vs_conn *cp)
+{
+ unsigned long t = (cp->flags & IP_VS_CONN_F_ONE_PACKET) ?
+ 0 : cp->timeout;
+ mod_timer(&cp->timer, jiffies+t);
+
+ __ip_vs_conn_put(cp);
+}
+
+
+/*
+ * Fill a no_client_port connection with a client port number
+ */
+void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport)
+{
+ if (ip_vs_conn_unhash(cp)) {
+ spin_lock_bh(&cp->lock);
+ if (cp->flags & IP_VS_CONN_F_NO_CPORT) {
+ atomic_dec(&ip_vs_conn_no_cport_cnt);
+ cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
+ cp->cport = cport;
+ }
+ spin_unlock_bh(&cp->lock);
+
+ /* hash on new dport */
+ ip_vs_conn_hash(cp);
+ }
+}
+
+
+/*
+ * Bind a connection entry with the corresponding packet_xmit.
+ * Called by ip_vs_conn_new.
+ */
+static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
+{
+ switch (IP_VS_FWD_METHOD(cp)) {
+ case IP_VS_CONN_F_MASQ:
+ cp->packet_xmit = ip_vs_nat_xmit;
+ break;
+
+ case IP_VS_CONN_F_TUNNEL:
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->daf == AF_INET6)
+ cp->packet_xmit = ip_vs_tunnel_xmit_v6;
+ else
+#endif
+ cp->packet_xmit = ip_vs_tunnel_xmit;
+ break;
+
+ case IP_VS_CONN_F_DROUTE:
+ cp->packet_xmit = ip_vs_dr_xmit;
+ break;
+
+ case IP_VS_CONN_F_LOCALNODE:
+ cp->packet_xmit = ip_vs_null_xmit;
+ break;
+
+ case IP_VS_CONN_F_BYPASS:
+ cp->packet_xmit = ip_vs_bypass_xmit;
+ break;
+ }
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+static inline void ip_vs_bind_xmit_v6(struct ip_vs_conn *cp)
+{
+ switch (IP_VS_FWD_METHOD(cp)) {
+ case IP_VS_CONN_F_MASQ:
+ cp->packet_xmit = ip_vs_nat_xmit_v6;
+ break;
+
+ case IP_VS_CONN_F_TUNNEL:
+ if (cp->daf == AF_INET6)
+ cp->packet_xmit = ip_vs_tunnel_xmit_v6;
+ else
+ cp->packet_xmit = ip_vs_tunnel_xmit;
+ break;
+
+ case IP_VS_CONN_F_DROUTE:
+ cp->packet_xmit = ip_vs_dr_xmit_v6;
+ break;
+
+ case IP_VS_CONN_F_LOCALNODE:
+ cp->packet_xmit = ip_vs_null_xmit;
+ break;
+
+ case IP_VS_CONN_F_BYPASS:
+ cp->packet_xmit = ip_vs_bypass_xmit_v6;
+ break;
+ }
+}
+#endif
+
+
+static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest)
+{
+ return atomic_read(&dest->activeconns)
+ + atomic_read(&dest->inactconns);
+}
+
+/*
+ * Bind a connection entry with a virtual service destination
+ * Called just after a new connection entry is created.
+ */
+static inline void
+ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
+{
+ unsigned int conn_flags;
+ __u32 flags;
+
+ /* if dest is NULL, then return directly */
+ if (!dest)
+ return;
+
+ /* Increase the refcnt counter of the dest */
+ ip_vs_dest_hold(dest);
+
+ conn_flags = atomic_read(&dest->conn_flags);
+ if (cp->protocol != IPPROTO_UDP)
+ conn_flags &= ~IP_VS_CONN_F_ONE_PACKET;
+ flags = cp->flags;
+ /* Bind with the destination and its corresponding transmitter */
+ if (flags & IP_VS_CONN_F_SYNC) {
+ /* if the connection is not template and is created
+ * by sync, preserve the activity flag.
+ */
+ if (!(flags & IP_VS_CONN_F_TEMPLATE))
+ conn_flags &= ~IP_VS_CONN_F_INACTIVE;
+ /* connections inherit forwarding method from dest */
+ flags &= ~(IP_VS_CONN_F_FWD_MASK | IP_VS_CONN_F_NOOUTPUT);
+ }
+ flags |= conn_flags;
+ cp->flags = flags;
+ cp->dest = dest;
+
+ IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d "
+ "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
+ "dest->refcnt:%d\n",
+ ip_vs_proto_name(cp->protocol),
+ IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
+ IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
+ IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport),
+ ip_vs_fwd_tag(cp), cp->state,
+ cp->flags, atomic_read(&cp->refcnt),
+ atomic_read(&dest->refcnt));
+
+ /* Update the connection counters */
+ if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
+ /* It is a normal connection, so modify the counters
+ * according to the flags, later the protocol can
+ * update them on state change
+ */
+ if (!(flags & IP_VS_CONN_F_INACTIVE))
+ atomic_inc(&dest->activeconns);
+ else
+ atomic_inc(&dest->inactconns);
+ } else {
+ /* It is a persistent connection/template, so increase
+ the persistent connection counter */
+ atomic_inc(&dest->persistconns);
+ }
+
+ if (dest->u_threshold != 0 &&
+ ip_vs_dest_totalconns(dest) >= dest->u_threshold)
+ dest->flags |= IP_VS_DEST_F_OVERLOAD;
+}
+
+
+/*
+ * Check if there is a destination for the connection, if so
+ * bind the connection to the destination.
+ */
+void ip_vs_try_bind_dest(struct ip_vs_conn *cp)
+{
+ struct ip_vs_dest *dest;
+
+ rcu_read_lock();
+
+ /* This function is only invoked by the synchronization code. We do
+ * not currently support heterogeneous pools with synchronization,
+ * so we can make the assumption that the svc_af is the same as the
+ * dest_af
+ */
+ dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, cp->af, &cp->daddr,
+ cp->dport, &cp->vaddr, cp->vport,
+ cp->protocol, cp->fwmark, cp->flags);
+ if (dest) {
+ struct ip_vs_proto_data *pd;
+
+ spin_lock_bh(&cp->lock);
+ if (cp->dest) {
+ spin_unlock_bh(&cp->lock);
+ rcu_read_unlock();
+ return;
+ }
+
+ /* Applications work depending on the forwarding method
+ * but better to reassign them always when binding dest */
+ if (cp->app)
+ ip_vs_unbind_app(cp);
+
+ ip_vs_bind_dest(cp, dest);
+ spin_unlock_bh(&cp->lock);
+
+ /* Update its packet transmitter */
+ cp->packet_xmit = NULL;
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6)
+ ip_vs_bind_xmit_v6(cp);
+ else
+#endif
+ ip_vs_bind_xmit(cp);
+
+ pd = ip_vs_proto_data_get(ip_vs_conn_net(cp), cp->protocol);
+ if (pd && atomic_read(&pd->appcnt))
+ ip_vs_bind_app(cp, pd->pp);
+ }
+ rcu_read_unlock();
+}
+
+
+/*
+ * Unbind a connection entry with its VS destination
+ * Called by the ip_vs_conn_expire function.
+ */
+static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
+{
+ struct ip_vs_dest *dest = cp->dest;
+
+ if (!dest)
+ return;
+
+ IP_VS_DBG_BUF(7, "Unbind-dest %s c:%s:%d v:%s:%d "
+ "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
+ "dest->refcnt:%d\n",
+ ip_vs_proto_name(cp->protocol),
+ IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
+ IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
+ IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport),
+ ip_vs_fwd_tag(cp), cp->state,
+ cp->flags, atomic_read(&cp->refcnt),
+ atomic_read(&dest->refcnt));
+
+ /* Update the connection counters */
+ if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
+ /* It is a normal connection, so decrease the inactconns
+ or activeconns counter */
+ if (cp->flags & IP_VS_CONN_F_INACTIVE) {
+ atomic_dec(&dest->inactconns);
+ } else {
+ atomic_dec(&dest->activeconns);
+ }
+ } else {
+ /* It is a persistent connection/template, so decrease
+ the persistent connection counter */
+ atomic_dec(&dest->persistconns);
+ }
+
+ if (dest->l_threshold != 0) {
+ if (ip_vs_dest_totalconns(dest) < dest->l_threshold)
+ dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
+ } else if (dest->u_threshold != 0) {
+ if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3)
+ dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
+ } else {
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+ dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
+ }
+
+ ip_vs_dest_put(dest);
+}
+
+static int expire_quiescent_template(struct netns_ipvs *ipvs,
+ struct ip_vs_dest *dest)
+{
+#ifdef CONFIG_SYSCTL
+ return ipvs->sysctl_expire_quiescent_template &&
+ (atomic_read(&dest->weight) == 0);
+#else
+ return 0;
+#endif
+}
+
+/*
+ * Checking if the destination of a connection template is available.
+ * If available, return 1, otherwise invalidate this connection
+ * template and return 0.
+ */
+int ip_vs_check_template(struct ip_vs_conn *ct)
+{
+ struct ip_vs_dest *dest = ct->dest;
+ struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(ct));
+
+ /*
+ * Checking the dest server status.
+ */
+ if ((dest == NULL) ||
+ !(dest->flags & IP_VS_DEST_F_AVAILABLE) ||
+ expire_quiescent_template(ipvs, dest)) {
+ IP_VS_DBG_BUF(9, "check_template: dest not available for "
+ "protocol %s s:%s:%d v:%s:%d "
+ "-> d:%s:%d\n",
+ ip_vs_proto_name(ct->protocol),
+ IP_VS_DBG_ADDR(ct->af, &ct->caddr),
+ ntohs(ct->cport),
+ IP_VS_DBG_ADDR(ct->af, &ct->vaddr),
+ ntohs(ct->vport),
+ IP_VS_DBG_ADDR(ct->daf, &ct->daddr),
+ ntohs(ct->dport));
+
+ /*
+ * Invalidate the connection template
+ */
+ if (ct->vport != htons(0xffff)) {
+ if (ip_vs_conn_unhash(ct)) {
+ ct->dport = htons(0xffff);
+ ct->vport = htons(0xffff);
+ ct->cport = 0;
+ ip_vs_conn_hash(ct);
+ }
+ }
+
+ /*
+ * Simply decrease the refcnt of the template,
+ * don't restart its timer.
+ */
+ __ip_vs_conn_put(ct);
+ return 0;
+ }
+ return 1;
+}
+
+static void ip_vs_conn_rcu_free(struct rcu_head *head)
+{
+ struct ip_vs_conn *cp = container_of(head, struct ip_vs_conn,
+ rcu_head);
+
+ ip_vs_pe_put(cp->pe);
+ kfree(cp->pe_data);
+ kmem_cache_free(ip_vs_conn_cachep, cp);
+}
+
+static void ip_vs_conn_expire(unsigned long data)
+{
+ struct ip_vs_conn *cp = (struct ip_vs_conn *)data;
+ struct net *net = ip_vs_conn_net(cp);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ /*
+ * do I control anybody?
+ */
+ if (atomic_read(&cp->n_control))
+ goto expire_later;
+
+ /* Unlink conn if not referenced anymore */
+ if (likely(ip_vs_conn_unlink(cp))) {
+ /* delete the timer if it is activated by other users */
+ del_timer(&cp->timer);
+
+ /* does anybody control me? */
+ if (cp->control)
+ ip_vs_control_del(cp);
+
+ if (cp->flags & IP_VS_CONN_F_NFCT) {
+ /* Do not access conntracks during subsys cleanup
+ * because nf_conntrack_find_get can not be used after
+ * conntrack cleanup for the net.
+ */
+ smp_rmb();
+ if (ipvs->enable)
+ ip_vs_conn_drop_conntrack(cp);
+ }
+
+ if (unlikely(cp->app != NULL))
+ ip_vs_unbind_app(cp);
+ ip_vs_unbind_dest(cp);
+ if (cp->flags & IP_VS_CONN_F_NO_CPORT)
+ atomic_dec(&ip_vs_conn_no_cport_cnt);
+ call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free);
+ atomic_dec(&ipvs->conn_count);
+ return;
+ }
+
+ expire_later:
+ IP_VS_DBG(7, "delayed: conn->refcnt=%d conn->n_control=%d\n",
+ atomic_read(&cp->refcnt),
+ atomic_read(&cp->n_control));
+
+ atomic_inc(&cp->refcnt);
+ cp->timeout = 60*HZ;
+
+ if (ipvs->sync_state & IP_VS_STATE_MASTER)
+ ip_vs_sync_conn(net, cp, sysctl_sync_threshold(ipvs));
+
+ ip_vs_conn_put(cp);
+}
+
+/* Modify timer, so that it expires as soon as possible.
+ * Can be called without reference only if under RCU lock.
+ */
+void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
+{
+ /* Using mod_timer_pending will ensure the timer is not
+ * modified after the final del_timer in ip_vs_conn_expire.
+ */
+ if (timer_pending(&cp->timer) &&
+ time_after(cp->timer.expires, jiffies))
+ mod_timer_pending(&cp->timer, jiffies);
+}
+
+
+/*
+ * Create a new connection entry and hash it into the ip_vs_conn_tab
+ */
+struct ip_vs_conn *
+ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af,
+ const union nf_inet_addr *daddr, __be16 dport, unsigned int flags,
+ struct ip_vs_dest *dest, __u32 fwmark)
+{
+ struct ip_vs_conn *cp;
+ struct netns_ipvs *ipvs = net_ipvs(p->net);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->net,
+ p->protocol);
+
+ cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC);
+ if (cp == NULL) {
+ IP_VS_ERR_RL("%s(): no memory\n", __func__);
+ return NULL;
+ }
+
+ INIT_HLIST_NODE(&cp->c_list);
+ setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp);
+ ip_vs_conn_net_set(cp, p->net);
+ cp->af = p->af;
+ cp->daf = dest_af;
+ cp->protocol = p->protocol;
+ ip_vs_addr_set(p->af, &cp->caddr, p->caddr);
+ cp->cport = p->cport;
+ /* proto should only be IPPROTO_IP if p->vaddr is a fwmark */
+ ip_vs_addr_set(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af,
+ &cp->vaddr, p->vaddr);
+ cp->vport = p->vport;
+ ip_vs_addr_set(cp->daf, &cp->daddr, daddr);
+ cp->dport = dport;
+ cp->flags = flags;
+ cp->fwmark = fwmark;
+ if (flags & IP_VS_CONN_F_TEMPLATE && p->pe) {
+ ip_vs_pe_get(p->pe);
+ cp->pe = p->pe;
+ cp->pe_data = p->pe_data;
+ cp->pe_data_len = p->pe_data_len;
+ } else {
+ cp->pe = NULL;
+ cp->pe_data = NULL;
+ cp->pe_data_len = 0;
+ }
+ spin_lock_init(&cp->lock);
+
+ /*
+ * Mark the entry as referenced by the current thread before hashing
+ * it in the table, so that another thread running
+ * ip_vs_random_dropentry cannot drop this entry.
+ */
+ atomic_set(&cp->refcnt, 1);
+
+ cp->control = NULL;
+ atomic_set(&cp->n_control, 0);
+ atomic_set(&cp->in_pkts, 0);
+
+ cp->packet_xmit = NULL;
+ cp->app = NULL;
+ cp->app_data = NULL;
+ /* reset struct ip_vs_seq */
+ cp->in_seq.delta = 0;
+ cp->out_seq.delta = 0;
+
+ atomic_inc(&ipvs->conn_count);
+ if (flags & IP_VS_CONN_F_NO_CPORT)
+ atomic_inc(&ip_vs_conn_no_cport_cnt);
+
+ /* Bind the connection with a destination server */
+ cp->dest = NULL;
+ ip_vs_bind_dest(cp, dest);
+
+ /* Set its state and timeout */
+ cp->state = 0;
+ cp->old_state = 0;
+ cp->timeout = 3*HZ;
+ cp->sync_endtime = jiffies & ~3UL;
+
+ /* Bind its packet transmitter */
+#ifdef CONFIG_IP_VS_IPV6
+ if (p->af == AF_INET6)
+ ip_vs_bind_xmit_v6(cp);
+ else
+#endif
+ ip_vs_bind_xmit(cp);
+
+ if (unlikely(pd && atomic_read(&pd->appcnt)))
+ ip_vs_bind_app(cp, pd->pp);
+
+ /*
+ * Allow conntrack to be preserved. By default, conntrack
+ * is created and destroyed for every packet.
+ * Sometimes keeping conntrack can be useful for
+ * IP_VS_CONN_F_ONE_PACKET too.
+ */
+
+ if (ip_vs_conntrack_enabled(ipvs))
+ cp->flags |= IP_VS_CONN_F_NFCT;
+
+ /* Hash it in the ip_vs_conn_tab finally */
+ ip_vs_conn_hash(cp);
+
+ return cp;
+}
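
To show how the helpers above fit together, here is a sketch of how a caller
(for instance the scheduling path in ip_vs_core.c) could build an
ip_vs_conn_param and create a NAT connection with ip_vs_conn_new(). It is not
part of the patch; the function name and the AF_INET/TCP assumption are
illustrative only.

/* Illustrative sketch only, not part of this patch. */
static struct ip_vs_conn *
demo_new_nat_conn(struct net *net, struct ip_vs_dest *dest,
		  const union nf_inet_addr *caddr, __be16 cport,
		  const union nf_inet_addr *vaddr, __be16 vport)
{
	struct ip_vs_conn_param p;

	/* client address/port -> virtual address/port of the service */
	ip_vs_conn_fill_param(net, AF_INET, IPPROTO_TCP,
			      caddr, cport, vaddr, vport, &p);

	/* bind to the chosen real server; IP_VS_CONN_F_MASQ makes
	 * ip_vs_bind_xmit() pick ip_vs_nat_xmit() */
	return ip_vs_conn_new(&p, dest->af, &dest->addr, dest->port,
			      IP_VS_CONN_F_MASQ, dest, 0 /* no fwmark */);
}
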
+
+/*
+ * /proc/net/ip_vs_conn entries
+ */
+#ifdef CONFIG_PROC_FS
+struct ip_vs_iter_state {
+ struct seq_net_private p;
+ struct hlist_head *l;
+};
+
+static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
+{
+ int idx;
+ struct ip_vs_conn *cp;
+ struct ip_vs_iter_state *iter = seq->private;
+
+ for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
+ /* __ip_vs_conn_get() is not needed by
+ * ip_vs_conn_seq_show and ip_vs_conn_sync_seq_show
+ */
+ if (pos-- == 0) {
+ iter->l = &ip_vs_conn_tab[idx];
+ return cp;
+ }
+ }
+ cond_resched_rcu();
+ }
+
+ return NULL;
+}
+
+static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
+{
+ struct ip_vs_iter_state *iter = seq->private;
+
+ iter->l = NULL;
+ rcu_read_lock();
+ return *pos ? ip_vs_conn_array(seq, *pos - 1) : SEQ_START_TOKEN;
+}
+
+static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct ip_vs_conn *cp = v;
+ struct ip_vs_iter_state *iter = seq->private;
+ struct hlist_node *e;
+ struct hlist_head *l = iter->l;
+ int idx;
+
+ ++*pos;
+ if (v == SEQ_START_TOKEN)
+ return ip_vs_conn_array(seq, 0);
+
+ /* more on same hash chain? */
+ e = rcu_dereference(hlist_next_rcu(&cp->c_list));
+ if (e)
+ return hlist_entry(e, struct ip_vs_conn, c_list);
+
+ idx = l - ip_vs_conn_tab;
+ while (++idx < ip_vs_conn_tab_size) {
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
+ iter->l = &ip_vs_conn_tab[idx];
+ return cp;
+ }
+ cond_resched_rcu();
+ }
+ iter->l = NULL;
+ return NULL;
+}
+
+static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
+{
+
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq,
+ "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires PEName PEData\n");
+ else {
+ const struct ip_vs_conn *cp = v;
+ struct net *net = seq_file_net(seq);
+ char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3];
+ size_t len = 0;
+ char dbuf[IP_VS_ADDRSTRLEN];
+
+ if (!ip_vs_conn_net_eq(cp, net))
+ return 0;
+ if (cp->pe_data) {
+ pe_data[0] = ' ';
+ len = strlen(cp->pe->name);
+ memcpy(pe_data + 1, cp->pe->name, len);
+ pe_data[len + 1] = ' ';
+ len += 2;
+ len += cp->pe->show_pe_data(cp, pe_data + len);
+ }
+ pe_data[len] = '\0';
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->daf == AF_INET6)
+ snprintf(dbuf, sizeof(dbuf), "%pI6", &cp->daddr.in6);
+ else
+#endif
+ snprintf(dbuf, sizeof(dbuf), "%08X",
+ ntohl(cp->daddr.ip));
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6)
+ seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X "
+ "%s %04X %-11s %7lu%s\n",
+ ip_vs_proto_name(cp->protocol),
+ &cp->caddr.in6, ntohs(cp->cport),
+ &cp->vaddr.in6, ntohs(cp->vport),
+ dbuf, ntohs(cp->dport),
+ ip_vs_state_name(cp->protocol, cp->state),
+ (cp->timer.expires-jiffies)/HZ, pe_data);
+ else
+#endif
+ seq_printf(seq,
+ "%-3s %08X %04X %08X %04X"
+ " %s %04X %-11s %7lu%s\n",
+ ip_vs_proto_name(cp->protocol),
+ ntohl(cp->caddr.ip), ntohs(cp->cport),
+ ntohl(cp->vaddr.ip), ntohs(cp->vport),
+ dbuf, ntohs(cp->dport),
+ ip_vs_state_name(cp->protocol, cp->state),
+ (cp->timer.expires-jiffies)/HZ, pe_data);
+ }
+ return 0;
+}
+
+static const struct seq_operations ip_vs_conn_seq_ops = {
+ .start = ip_vs_conn_seq_start,
+ .next = ip_vs_conn_seq_next,
+ .stop = ip_vs_conn_seq_stop,
+ .show = ip_vs_conn_seq_show,
+};
+
+static int ip_vs_conn_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &ip_vs_conn_seq_ops,
+ sizeof(struct ip_vs_iter_state));
+}
+
+static const struct file_operations ip_vs_conn_fops = {
+ .owner = THIS_MODULE,
+ .open = ip_vs_conn_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+static const char *ip_vs_origin_name(unsigned int flags)
+{
+ if (flags & IP_VS_CONN_F_SYNC)
+ return "SYNC";
+ else
+ return "LOCAL";
+}
+
+static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v)
+{
+ char dbuf[IP_VS_ADDRSTRLEN];
+
+ if (v == SEQ_START_TOKEN)
+ seq_puts(seq,
+ "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Origin Expires\n");
+ else {
+ const struct ip_vs_conn *cp = v;
+ struct net *net = seq_file_net(seq);
+
+ if (!ip_vs_conn_net_eq(cp, net))
+ return 0;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->daf == AF_INET6)
+ snprintf(dbuf, sizeof(dbuf), "%pI6", &cp->daddr.in6);
+ else
+#endif
+ snprintf(dbuf, sizeof(dbuf), "%08X",
+ ntohl(cp->daddr.ip));
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6)
+ seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X "
+ "%s %04X %-11s %-6s %7lu\n",
+ ip_vs_proto_name(cp->protocol),
+ &cp->caddr.in6, ntohs(cp->cport),
+ &cp->vaddr.in6, ntohs(cp->vport),
+ dbuf, ntohs(cp->dport),
+ ip_vs_state_name(cp->protocol, cp->state),
+ ip_vs_origin_name(cp->flags),
+ (cp->timer.expires-jiffies)/HZ);
+ else
+#endif
+ seq_printf(seq,
+ "%-3s %08X %04X %08X %04X "
+ "%s %04X %-11s %-6s %7lu\n",
+ ip_vs_proto_name(cp->protocol),
+ ntohl(cp->caddr.ip), ntohs(cp->cport),
+ ntohl(cp->vaddr.ip), ntohs(cp->vport),
+ dbuf, ntohs(cp->dport),
+ ip_vs_state_name(cp->protocol, cp->state),
+ ip_vs_origin_name(cp->flags),
+ (cp->timer.expires-jiffies)/HZ);
+ }
+ return 0;
+}
+
+static const struct seq_operations ip_vs_conn_sync_seq_ops = {
+ .start = ip_vs_conn_seq_start,
+ .next = ip_vs_conn_seq_next,
+ .stop = ip_vs_conn_seq_stop,
+ .show = ip_vs_conn_sync_seq_show,
+};
+
+static int ip_vs_conn_sync_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &ip_vs_conn_sync_seq_ops,
+ sizeof(struct ip_vs_iter_state));
+}
+
+static const struct file_operations ip_vs_conn_sync_fops = {
+ .owner = THIS_MODULE,
+ .open = ip_vs_conn_sync_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+#endif
+
+
+/*
+ * Randomly drop connection entries before running out of memory
+ */
+static inline int todrop_entry(struct ip_vs_conn *cp)
+{
+ /*
+ * The drop rate array needs tuning for real environments.
+ * Called from timer bh only => no locking
+ */
+ static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
+ static char todrop_counter[9] = {0};
+ int i;
+
+	/* if the conn entry hasn't lasted for 60 seconds, don't drop it.
+	   This will leave enough time for normal connections to get
+	   through. */
+ if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ))
+ return 0;
+
+	/* Don't drop the entry if its number of incoming packets is not
+	   in the range [0, 8] */
+ i = atomic_read(&cp->in_pkts);
+ if (i > 8 || i < 0) return 0;
+
+ if (!todrop_rate[i]) return 0;
+ if (--todrop_counter[i] > 0) return 0;
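+	/* Counter reached zero: drop this entry and re-arm the counter, so
+	 * that roughly one in todrop_rate[i] candidates is dropped.
+	 */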
+
+ todrop_counter[i] = todrop_rate[i];
+ return 1;
+}
+
+/* Called from keventd and must protect itself from softirqs */
+void ip_vs_random_dropentry(struct net *net)
+{
+ int idx;
+ struct ip_vs_conn *cp, *cp_c;
+
+ rcu_read_lock();
+ /*
+ * Randomly scan 1/32 of the whole table every second
+ */
+ for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) {
+ unsigned int hash = prandom_u32() & ip_vs_conn_tab_mask;
+
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
+ if (cp->flags & IP_VS_CONN_F_TEMPLATE)
+ /* connection template */
+ continue;
+ if (!ip_vs_conn_net_eq(cp, net))
+ continue;
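+			/* Connections still in the handshake (TCP SYN_RECV/
+			 * SYNACK, SCTP INIT) are always drop candidates;
+			 * established ones must also pass the todrop_entry()
+			 * rate limiter, all other states are never dropped.
+			 */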
+ if (cp->protocol == IPPROTO_TCP) {
+ switch(cp->state) {
+ case IP_VS_TCP_S_SYN_RECV:
+ case IP_VS_TCP_S_SYNACK:
+ break;
+
+ case IP_VS_TCP_S_ESTABLISHED:
+ if (todrop_entry(cp))
+ break;
+ continue;
+
+ default:
+ continue;
+ }
+ } else if (cp->protocol == IPPROTO_SCTP) {
+ switch (cp->state) {
+ case IP_VS_SCTP_S_INIT1:
+ case IP_VS_SCTP_S_INIT:
+ break;
+ case IP_VS_SCTP_S_ESTABLISHED:
+ if (todrop_entry(cp))
+ break;
+ continue;
+ default:
+ continue;
+ }
+ } else {
+ if (!todrop_entry(cp))
+ continue;
+ }
+
+ IP_VS_DBG(4, "del connection\n");
+ ip_vs_conn_expire_now(cp);
+ cp_c = cp->control;
+ /* cp->control is valid only with reference to cp */
+ if (cp_c && __ip_vs_conn_get(cp)) {
+ IP_VS_DBG(4, "del conn template\n");
+ ip_vs_conn_expire_now(cp_c);
+ __ip_vs_conn_put(cp);
+ }
+ }
+ cond_resched_rcu();
+ }
+ rcu_read_unlock();
+}
+
+
+/*
+ * Flush all the connection entries in the ip_vs_conn_tab
+ */
+static void ip_vs_conn_flush(struct net *net)
+{
+ int idx;
+ struct ip_vs_conn *cp, *cp_c;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+flush_again:
+ rcu_read_lock();
+ for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
+
+ hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
+ if (!ip_vs_conn_net_eq(cp, net))
+ continue;
+ IP_VS_DBG(4, "del connection\n");
+ ip_vs_conn_expire_now(cp);
+ cp_c = cp->control;
+ /* cp->control is valid only with reference to cp */
+ if (cp_c && __ip_vs_conn_get(cp)) {
+ IP_VS_DBG(4, "del conn template\n");
+ ip_vs_conn_expire_now(cp_c);
+ __ip_vs_conn_put(cp);
+ }
+ }
+ cond_resched_rcu();
+ }
+ rcu_read_unlock();
+
+	/* the counter may not be zero, because some conn entries may still
+	   be run by the slow timer handler or be unhashed but still referenced */
+ if (atomic_read(&ipvs->conn_count) != 0) {
+ schedule();
+ goto flush_again;
+ }
+}
+/*
+ * per netns init and exit
+ */
+int __net_init ip_vs_conn_net_init(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ atomic_set(&ipvs->conn_count, 0);
+
+ proc_create("ip_vs_conn", 0, net->proc_net, &ip_vs_conn_fops);
+ proc_create("ip_vs_conn_sync", 0, net->proc_net, &ip_vs_conn_sync_fops);
+ return 0;
+}
+
+void __net_exit ip_vs_conn_net_cleanup(struct net *net)
+{
+ /* flush all the connection entries first */
+ ip_vs_conn_flush(net);
+ remove_proc_entry("ip_vs_conn", net->proc_net);
+ remove_proc_entry("ip_vs_conn_sync", net->proc_net);
+}
+
+int __init ip_vs_conn_init(void)
+{
+ int idx;
+
+ /* Compute size and mask */
+ ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits;
+ ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1;
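+	/* e.g. with the usual default of 12 table bits this gives 4096
+	 * buckets and a mask of 0xfff (assuming the Kconfig default).
+	 */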
+
+ /*
+ * Allocate the connection hash table and initialize its list heads
+ */
+ ip_vs_conn_tab = vmalloc(ip_vs_conn_tab_size * sizeof(*ip_vs_conn_tab));
+ if (!ip_vs_conn_tab)
+ return -ENOMEM;
+
+ /* Allocate ip_vs_conn slab cache */
+ ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
+ sizeof(struct ip_vs_conn), 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ if (!ip_vs_conn_cachep) {
+ vfree(ip_vs_conn_tab);
+ return -ENOMEM;
+ }
+
+	pr_info("Connection hash table configured "
+		"(size=%d, memory=%ldKbytes)\n",
+		ip_vs_conn_tab_size,
+		(long)(ip_vs_conn_tab_size*sizeof(*ip_vs_conn_tab))/1024);
+ IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n",
+ sizeof(struct ip_vs_conn));
+
+ for (idx = 0; idx < ip_vs_conn_tab_size; idx++)
+ INIT_HLIST_HEAD(&ip_vs_conn_tab[idx]);
+
+ for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) {
+ spin_lock_init(&__ip_vs_conntbl_lock_array[idx].l);
+ }
+
+ /* calculate the random value for connection hash */
+ get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
+
+ return 0;
+}
+
+void ip_vs_conn_cleanup(void)
+{
+ /* Wait all ip_vs_conn_rcu_free() callbacks to complete */
+ rcu_barrier();
+ /* Release the empty cache */
+ kmem_cache_destroy(ip_vs_conn_cachep);
+ vfree(ip_vs_conn_tab);
+}
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
new file mode 100644
index 000000000..5d2b806a8
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -0,0 +1,2136 @@
+/*
+ * IPVS An implementation of the IP virtual server support for the
+ * LINUX operating system. IPVS is now implemented as a module
+ * over the Netfilter framework. IPVS can be used to build a
+ * high-performance and highly available server based on a
+ * cluster of servers.
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Peter Kese <peter.kese@ijs.si>
+ * Julian Anastasov <ja@ssi.bg>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
+ * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
+ * and others.
+ *
+ * Changes:
+ * Paul `Rusty' Russell properly handle non-linear skbs
+ * Harald Welte don't use nfcache
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/sctp.h>
+#include <linux/icmp.h>
+#include <linux/slab.h>
+
+#include <net/ip.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/icmp.h> /* for icmp_send */
+#include <net/route.h>
+#include <net/ip6_checksum.h>
+#include <net/netns/generic.h> /* net_generic() */
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#ifdef CONFIG_IP_VS_IPV6
+#include <net/ipv6.h>
+#include <linux/netfilter_ipv6.h>
+#include <net/ip6_route.h>
+#endif
+
+#include <net/ip_vs.h>
+
+
+EXPORT_SYMBOL(register_ip_vs_scheduler);
+EXPORT_SYMBOL(unregister_ip_vs_scheduler);
+EXPORT_SYMBOL(ip_vs_proto_name);
+EXPORT_SYMBOL(ip_vs_conn_new);
+EXPORT_SYMBOL(ip_vs_conn_in_get);
+EXPORT_SYMBOL(ip_vs_conn_out_get);
+#ifdef CONFIG_IP_VS_PROTO_TCP
+EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
+#endif
+EXPORT_SYMBOL(ip_vs_conn_put);
+#ifdef CONFIG_IP_VS_DEBUG
+EXPORT_SYMBOL(ip_vs_get_debug_level);
+#endif
+
+static int ip_vs_net_id __read_mostly;
+/* netns cnt used for uniqueness */
+static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0);
+
+/* ID used in ICMP lookups */
+#define icmp_id(icmph) (((icmph)->un).echo.id)
+#define icmpv6_id(icmph) (icmph->icmp6_dataun.u_echo.identifier)
+
+const char *ip_vs_proto_name(unsigned int proto)
+{
+ static char buf[20];
+
+ switch (proto) {
+ case IPPROTO_IP:
+ return "IP";
+ case IPPROTO_UDP:
+ return "UDP";
+ case IPPROTO_TCP:
+ return "TCP";
+ case IPPROTO_SCTP:
+ return "SCTP";
+ case IPPROTO_ICMP:
+ return "ICMP";
+#ifdef CONFIG_IP_VS_IPV6
+ case IPPROTO_ICMPV6:
+ return "ICMPv6";
+#endif
+ default:
+ sprintf(buf, "IP_%u", proto);
+ return buf;
+ }
+}
+
+void ip_vs_init_hash_table(struct list_head *table, int rows)
+{
+ while (--rows >= 0)
+ INIT_LIST_HEAD(&table[rows]);
+}
+
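+/* Account an incoming packet to the destination server, its virtual
+ * service and the per-netns totals.  The counters are per-CPU and
+ * protected by u64_stats sequence counters so that readers see
+ * consistent 64-bit values.
+ */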
+static inline void
+ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
+{
+ struct ip_vs_dest *dest = cp->dest;
+ struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
+
+ if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
+ struct ip_vs_cpu_stats *s;
+ struct ip_vs_service *svc;
+
+ s = this_cpu_ptr(dest->stats.cpustats);
+ u64_stats_update_begin(&s->syncp);
+ s->cnt.inpkts++;
+ s->cnt.inbytes += skb->len;
+ u64_stats_update_end(&s->syncp);
+
+ rcu_read_lock();
+ svc = rcu_dereference(dest->svc);
+ s = this_cpu_ptr(svc->stats.cpustats);
+ u64_stats_update_begin(&s->syncp);
+ s->cnt.inpkts++;
+ s->cnt.inbytes += skb->len;
+ u64_stats_update_end(&s->syncp);
+ rcu_read_unlock();
+
+ s = this_cpu_ptr(ipvs->tot_stats.cpustats);
+ u64_stats_update_begin(&s->syncp);
+ s->cnt.inpkts++;
+ s->cnt.inbytes += skb->len;
+ u64_stats_update_end(&s->syncp);
+ }
+}
+
+
+static inline void
+ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
+{
+ struct ip_vs_dest *dest = cp->dest;
+ struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
+
+ if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
+ struct ip_vs_cpu_stats *s;
+ struct ip_vs_service *svc;
+
+ s = this_cpu_ptr(dest->stats.cpustats);
+ u64_stats_update_begin(&s->syncp);
+ s->cnt.outpkts++;
+ s->cnt.outbytes += skb->len;
+ u64_stats_update_end(&s->syncp);
+
+ rcu_read_lock();
+ svc = rcu_dereference(dest->svc);
+ s = this_cpu_ptr(svc->stats.cpustats);
+ u64_stats_update_begin(&s->syncp);
+ s->cnt.outpkts++;
+ s->cnt.outbytes += skb->len;
+ u64_stats_update_end(&s->syncp);
+ rcu_read_unlock();
+
+ s = this_cpu_ptr(ipvs->tot_stats.cpustats);
+ u64_stats_update_begin(&s->syncp);
+ s->cnt.outpkts++;
+ s->cnt.outbytes += skb->len;
+ u64_stats_update_end(&s->syncp);
+ }
+}
+
+
+static inline void
+ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
+{
+ struct netns_ipvs *ipvs = net_ipvs(svc->net);
+ struct ip_vs_cpu_stats *s;
+
+ s = this_cpu_ptr(cp->dest->stats.cpustats);
+ u64_stats_update_begin(&s->syncp);
+ s->cnt.conns++;
+ u64_stats_update_end(&s->syncp);
+
+ s = this_cpu_ptr(svc->stats.cpustats);
+ u64_stats_update_begin(&s->syncp);
+ s->cnt.conns++;
+ u64_stats_update_end(&s->syncp);
+
+ s = this_cpu_ptr(ipvs->tot_stats.cpustats);
+ u64_stats_update_begin(&s->syncp);
+ s->cnt.conns++;
+ u64_stats_update_end(&s->syncp);
+}
+
+
+static inline void
+ip_vs_set_state(struct ip_vs_conn *cp, int direction,
+ const struct sk_buff *skb,
+ struct ip_vs_proto_data *pd)
+{
+ if (likely(pd->pp->state_transition))
+ pd->pp->state_transition(cp, direction, skb, pd);
+}
+
+static inline int
+ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
+ struct sk_buff *skb, int protocol,
+ const union nf_inet_addr *caddr, __be16 cport,
+ const union nf_inet_addr *vaddr, __be16 vport,
+ struct ip_vs_conn_param *p)
+{
+ ip_vs_conn_fill_param(svc->net, svc->af, protocol, caddr, cport, vaddr,
+ vport, p);
+ p->pe = rcu_dereference(svc->pe);
+ if (p->pe && p->pe->fill_param)
+ return p->pe->fill_param(p, skb);
+
+ return 0;
+}
+
+/*
+ * IPVS persistent scheduling function
+ * It creates a connection entry according to its template if exists,
+ * or selects a server and creates a connection entry plus a template.
+ * Locking: we are svc user (svc->refcnt), so we hold all dests too
+ * Protocols supported: TCP, UDP
+ */
+static struct ip_vs_conn *
+ip_vs_sched_persist(struct ip_vs_service *svc,
+ struct sk_buff *skb, __be16 src_port, __be16 dst_port,
+ int *ignored, struct ip_vs_iphdr *iph)
+{
+ struct ip_vs_conn *cp = NULL;
+ struct ip_vs_dest *dest;
+ struct ip_vs_conn *ct;
+ __be16 dport = 0; /* destination port to forward */
+ unsigned int flags;
+ struct ip_vs_conn_param param;
+ const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) };
+ union nf_inet_addr snet; /* source network of the client,
+ after masking */
+
+ /* Mask saddr with the netmask to adjust template granularity */
+#ifdef CONFIG_IP_VS_IPV6
+ if (svc->af == AF_INET6)
+ ipv6_addr_prefix(&snet.in6, &iph->saddr.in6,
+ (__force __u32) svc->netmask);
+ else
+#endif
+ snet.ip = iph->saddr.ip & svc->netmask;
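+	/* e.g. with a 255.255.255.0 netmask all clients in the same /24
+	 * share one persistence template.
+	 */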
+
+ IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u "
+ "mnet %s\n",
+ IP_VS_DBG_ADDR(svc->af, &iph->saddr), ntohs(src_port),
+ IP_VS_DBG_ADDR(svc->af, &iph->daddr), ntohs(dst_port),
+ IP_VS_DBG_ADDR(svc->af, &snet));
+
+	/*
+	 * FTP is a complicated network protocol: it uses one control
+	 * connection and separate data connections.  For active FTP the
+	 * server initiates the data connection to the client, usually from
+	 * source port 20.  For passive FTP the server tells the client which
+	 * port it passively listens on, and the client opens the data
+	 * connection.  In tunneling or direct routing mode the load balancer
+	 * only sees the client-to-server half of the connection, so the data
+	 * port is unknown to it.  Therefore a conn template like
+	 * <caddr, 0, vaddr, 0, daddr, 0> is created for a persistent FTP
+	 * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
+	 * is created for other persistent services.
+	 */
+ {
+ int protocol = iph->protocol;
+ const union nf_inet_addr *vaddr = &iph->daddr;
+ __be16 vport = 0;
+
+ if (dst_port == svc->port) {
+ /* non-FTP template:
+ * <protocol, caddr, 0, vaddr, vport, daddr, dport>
+ * FTP template:
+ * <protocol, caddr, 0, vaddr, 0, daddr, 0>
+ */
+ if (svc->port != FTPPORT)
+ vport = dst_port;
+ } else {
+ /* Note: persistent fwmark-based services and
+ * persistent port zero service are handled here.
+ * fwmark template:
+ * <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
+ * port zero template:
+ * <protocol,caddr,0,vaddr,0,daddr,0>
+ */
+ if (svc->fwmark) {
+ protocol = IPPROTO_IP;
+ vaddr = &fwmark;
+ }
+ }
+ /* return *ignored = -1 so NF_DROP can be used */
+ if (ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0,
+ vaddr, vport, &param) < 0) {
+ *ignored = -1;
+ return NULL;
+ }
+ }
+
+ /* Check if a template already exists */
+ ct = ip_vs_ct_in_get(&param);
+ if (!ct || !ip_vs_check_template(ct)) {
+ struct ip_vs_scheduler *sched;
+
+ /*
+ * No template found or the dest of the connection
+ * template is not available.
+ * return *ignored=0 i.e. ICMP and NF_DROP
+ */
+ sched = rcu_dereference(svc->scheduler);
+ dest = sched->schedule(svc, skb, iph);
+ if (!dest) {
+ IP_VS_DBG(1, "p-schedule: no dest found.\n");
+ kfree(param.pe_data);
+ *ignored = 0;
+ return NULL;
+ }
+
+ if (dst_port == svc->port && svc->port != FTPPORT)
+ dport = dest->port;
+
+ /* Create a template
+ * This adds param.pe_data to the template,
+ * and thus param.pe_data will be destroyed
+ * when the template expires */
+ ct = ip_vs_conn_new(&param, dest->af, &dest->addr, dport,
+ IP_VS_CONN_F_TEMPLATE, dest, skb->mark);
+ if (ct == NULL) {
+ kfree(param.pe_data);
+ *ignored = -1;
+ return NULL;
+ }
+
+ ct->timeout = svc->timeout;
+ } else {
+ /* set destination with the found template */
+ dest = ct->dest;
+ kfree(param.pe_data);
+ }
+
+ dport = dst_port;
+ if (dport == svc->port && dest->port)
+ dport = dest->port;
+
+ flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
+ && iph->protocol == IPPROTO_UDP) ?
+ IP_VS_CONN_F_ONE_PACKET : 0;
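+	/* With IP_VS_SVC_F_ONEPACKET, each UDP datagram is scheduled
+	 * independently and its connection entry is used for one packet only.
+	 */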
+
+ /*
+ * Create a new connection according to the template
+ */
+ ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol, &iph->saddr,
+ src_port, &iph->daddr, dst_port, &param);
+
+ cp = ip_vs_conn_new(&param, dest->af, &dest->addr, dport, flags, dest,
+ skb->mark);
+ if (cp == NULL) {
+ ip_vs_conn_put(ct);
+ *ignored = -1;
+ return NULL;
+ }
+
+ /*
+ * Add its control
+ */
+ ip_vs_control_add(cp, ct);
+ ip_vs_conn_put(ct);
+
+ ip_vs_conn_stats(cp, svc);
+ return cp;
+}
+
+
+/*
+ * IPVS main scheduling function
+ * It selects a server according to the virtual service, and
+ * creates a connection entry.
+ * Protocols supported: TCP, UDP
+ *
+ * Usage of *ignored
+ *
+ * 1 : protocol tried to schedule (eg. on SYN), found svc but the
+ * svc/scheduler decides that this packet should be accepted with
+ * NF_ACCEPT because it must not be scheduled.
+ *
+ * 0 : scheduler can not find destination, so try bypass or
+ * return ICMP and then NF_DROP (ip_vs_leave).
+ *
+ * -1 : scheduler tried to schedule but fatal error occurred, eg.
+ * ip_vs_conn_new failure (ENOMEM) or ip_vs_sip_fill_param
+ * failure such as missing Call-ID, ENOMEM on skb_linearize
+ * or pe_data. In this case we should return NF_DROP without
+ * any attempts to send ICMP with ip_vs_leave.
+ */
+struct ip_vs_conn *
+ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
+ struct ip_vs_proto_data *pd, int *ignored,
+ struct ip_vs_iphdr *iph)
+{
+ struct ip_vs_protocol *pp = pd->pp;
+ struct ip_vs_conn *cp = NULL;
+ struct ip_vs_scheduler *sched;
+ struct ip_vs_dest *dest;
+ __be16 _ports[2], *pptr;
+ unsigned int flags;
+
+ *ignored = 1;
+ /*
+ * IPv6 frags, only the first hit here.
+ */
+ pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph);
+ if (pptr == NULL)
+ return NULL;
+
+	/*
+	 * FTPDATA needs this check when using a local real server.
+	 * Never schedule active FTPDATA connections coming from the real
+	 * server.  For LVS-NAT they must already exist; for other methods
+	 * with persistence the connection is created on SYN+ACK.
+	 */
+ if (pptr[0] == FTPDATA) {
+ IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
+ "Not scheduling FTPDATA");
+ return NULL;
+ }
+
+ /*
+	 * Do not schedule replies from a local real server.
+ */
+ if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
+ (cp = pp->conn_in_get(svc->af, skb, iph, 1))) {
+ IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
+ "Not scheduling reply for existing connection");
+ __ip_vs_conn_put(cp);
+ return NULL;
+ }
+
+ /*
+ * Persistent service
+ */
+ if (svc->flags & IP_VS_SVC_F_PERSISTENT)
+ return ip_vs_sched_persist(svc, skb, pptr[0], pptr[1], ignored,
+ iph);
+
+ *ignored = 0;
+
+ /*
+ * Non-persistent service
+ */
+ if (!svc->fwmark && pptr[1] != svc->port) {
+ if (!svc->port)
+ pr_err("Schedule: port zero only supported "
+ "in persistent services, "
+ "check your ipvs configuration\n");
+ return NULL;
+ }
+
+ sched = rcu_dereference(svc->scheduler);
+ dest = sched->schedule(svc, skb, iph);
+ if (dest == NULL) {
+ IP_VS_DBG(1, "Schedule: no dest found.\n");
+ return NULL;
+ }
+
+ flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
+ && iph->protocol == IPPROTO_UDP) ?
+ IP_VS_CONN_F_ONE_PACKET : 0;
+
+ /*
+ * Create a connection entry.
+ */
+ {
+ struct ip_vs_conn_param p;
+
+ ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol,
+ &iph->saddr, pptr[0], &iph->daddr,
+ pptr[1], &p);
+ cp = ip_vs_conn_new(&p, dest->af, &dest->addr,
+ dest->port ? dest->port : pptr[1],
+ flags, dest, skb->mark);
+ if (!cp) {
+ *ignored = -1;
+ return NULL;
+ }
+ }
+
+ IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
+ "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
+ ip_vs_fwd_tag(cp),
+ IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
+ IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
+ IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport),
+ cp->flags, atomic_read(&cp->refcnt));
+
+ ip_vs_conn_stats(cp, svc);
+ return cp;
+}
+
+
+/*
+ * Pass or drop the packet.
+ * Called by ip_vs_in, when the virtual service is available but
+ * no destination is available for a new connection.
+ */
+int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
+ struct ip_vs_proto_data *pd, struct ip_vs_iphdr *iph)
+{
+ __be16 _ports[2], *pptr;
+#ifdef CONFIG_SYSCTL
+ struct net *net;
+ struct netns_ipvs *ipvs;
+ int unicast;
+#endif
+
+ pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph);
+ if (pptr == NULL) {
+ return NF_DROP;
+ }
+
+#ifdef CONFIG_SYSCTL
+ net = skb_net(skb);
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (svc->af == AF_INET6)
+ unicast = ipv6_addr_type(&iph->daddr.in6) & IPV6_ADDR_UNICAST;
+ else
+#endif
+ unicast = (inet_addr_type(net, iph->daddr.ip) == RTN_UNICAST);
+
+	/* If this is a fwmark-based service, the cache_bypass sysctl is
+	   enabled and the destination is a non-local unicast address,
+	   create a cache_bypass connection entry */
+ ipvs = net_ipvs(net);
+ if (ipvs->sysctl_cache_bypass && svc->fwmark && unicast) {
+ int ret;
+ struct ip_vs_conn *cp;
+ unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
+ iph->protocol == IPPROTO_UDP) ?
+ IP_VS_CONN_F_ONE_PACKET : 0;
+ union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } };
+
+ /* create a new connection entry */
+ IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);
+ {
+ struct ip_vs_conn_param p;
+ ip_vs_conn_fill_param(svc->net, svc->af, iph->protocol,
+ &iph->saddr, pptr[0],
+ &iph->daddr, pptr[1], &p);
+ cp = ip_vs_conn_new(&p, svc->af, &daddr, 0,
+ IP_VS_CONN_F_BYPASS | flags,
+ NULL, skb->mark);
+ if (!cp)
+ return NF_DROP;
+ }
+
+ /* statistics */
+ ip_vs_in_stats(cp, skb);
+
+ /* set state */
+ ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
+
+ /* transmit the first SYN packet */
+ ret = cp->packet_xmit(skb, cp, pd->pp, iph);
+ /* do not touch skb anymore */
+
+ atomic_inc(&cp->in_pkts);
+ ip_vs_conn_put(cp);
+ return ret;
+ }
+#endif
+
+	/*
+	 * When a virtual FTP service is configured, packets destined for
+	 * other services on the VIP (except those listed in the ipvs
+	 * table) may get here.  Pass them through: it is not IPVS's job
+	 * to decide to drop them.
+	 */
+ if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT))
+ return NF_ACCEPT;
+
+	/*
+	 * Notify the client that the destination is unreachable, and
+	 * release the socket buffer.
+	 * Since we are at the IP layer no TCP socket has been created,
+	 * so a TCP RST cannot be sent; instead ICMP_PORT_UNREACH is sent
+	 * here regardless of whether the protocol is TCP or UDP. --WZ
+	 */
+#ifdef CONFIG_IP_VS_IPV6
+ if (svc->af == AF_INET6) {
+ if (!skb->dev) {
+ struct net *net_ = dev_net(skb_dst(skb)->dev);
+
+ skb->dev = net_->loopback_dev;
+ }
+ icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
+ } else
+#endif
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+ return NF_DROP;
+}
+
+#ifdef CONFIG_SYSCTL
+
+static int sysctl_snat_reroute(struct sk_buff *skb)
+{
+ struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
+ return ipvs->sysctl_snat_reroute;
+}
+
+static int sysctl_nat_icmp_send(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ return ipvs->sysctl_nat_icmp_send;
+}
+
+static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs)
+{
+ return ipvs->sysctl_expire_nodest_conn;
+}
+
+#else
+
+static int sysctl_snat_reroute(struct sk_buff *skb) { return 0; }
+static int sysctl_nat_icmp_send(struct net *net) { return 0; }
+static int sysctl_expire_nodest_conn(struct netns_ipvs *ipvs) { return 0; }
+
+#endif
+
+__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
+{
+ return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
+}
+
+static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum)
+{
+ if (NF_INET_LOCAL_IN == hooknum)
+ return IP_DEFRAG_VS_IN;
+ if (NF_INET_FORWARD == hooknum)
+ return IP_DEFRAG_VS_FWD;
+ return IP_DEFRAG_VS_OUT;
+}
+
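+/* Reassemble IPv4 fragments with softirqs disabled; on success recompute
+ * the IP header checksum, since defragmentation rewrites the header.
+ */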
+static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
+{
+ int err;
+
+ local_bh_disable();
+ err = ip_defrag(skb, user);
+ local_bh_enable();
+ if (!err)
+ ip_send_check(ip_hdr(skb));
+
+ return err;
+}
+
+static int ip_vs_route_me_harder(int af, struct sk_buff *skb,
+ unsigned int hooknum)
+{
+ if (!sysctl_snat_reroute(skb))
+ return 0;
+ /* Reroute replies only to remote clients (FORWARD and LOCAL_OUT) */
+ if (NF_INET_LOCAL_IN == hooknum)
+ return 0;
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6) {
+ struct dst_entry *dst = skb_dst(skb);
+
+ if (dst->dev && !(dst->dev->flags & IFF_LOOPBACK) &&
+ ip6_route_me_harder(skb) != 0)
+ return 1;
+ } else
+#endif
+ if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
+ ip_route_me_harder(skb, RTN_LOCAL) != 0)
+ return 1;
+
+ return 0;
+}
+
+/*
+ * Packet has been made sufficiently writable in caller
+ * - inout: 1=in->out, 0=out->in
+ */
+void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct ip_vs_conn *cp, int inout)
+{
+ struct iphdr *iph = ip_hdr(skb);
+ unsigned int icmp_offset = iph->ihl*4;
+ struct icmphdr *icmph = (struct icmphdr *)(skb_network_header(skb) +
+ icmp_offset);
+ struct iphdr *ciph = (struct iphdr *)(icmph + 1);
+
+ if (inout) {
+ iph->saddr = cp->vaddr.ip;
+ ip_send_check(iph);
+ ciph->daddr = cp->vaddr.ip;
+ ip_send_check(ciph);
+ } else {
+ iph->daddr = cp->daddr.ip;
+ ip_send_check(iph);
+ ciph->saddr = cp->daddr.ip;
+ ip_send_check(ciph);
+ }
+
+ /* the TCP/UDP/SCTP port */
+ if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol ||
+ IPPROTO_SCTP == ciph->protocol) {
+ __be16 *ports = (void *)ciph + ciph->ihl*4;
+
+ if (inout)
+ ports[1] = cp->vport;
+ else
+ ports[0] = cp->dport;
+ }
+
+ /* And finally the ICMP checksum */
+ icmph->checksum = 0;
+ icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset);
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+ if (inout)
+ IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph,
+ "Forwarding altered outgoing ICMP");
+ else
+ IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph,
+ "Forwarding altered incoming ICMP");
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct ip_vs_conn *cp, int inout)
+{
+ struct ipv6hdr *iph = ipv6_hdr(skb);
+ unsigned int icmp_offset = 0;
+	unsigned int offs = 0;	/* header offset */
+ int protocol;
+ struct icmp6hdr *icmph;
+ struct ipv6hdr *ciph;
+ unsigned short fragoffs;
+
+ ipv6_find_hdr(skb, &icmp_offset, IPPROTO_ICMPV6, &fragoffs, NULL);
+ icmph = (struct icmp6hdr *)(skb_network_header(skb) + icmp_offset);
+ offs = icmp_offset + sizeof(struct icmp6hdr);
+ ciph = (struct ipv6hdr *)(skb_network_header(skb) + offs);
+
+ protocol = ipv6_find_hdr(skb, &offs, -1, &fragoffs, NULL);
+
+ if (inout) {
+ iph->saddr = cp->vaddr.in6;
+ ciph->daddr = cp->vaddr.in6;
+ } else {
+ iph->daddr = cp->daddr.in6;
+ ciph->saddr = cp->daddr.in6;
+ }
+
+ /* the TCP/UDP/SCTP port */
+ if (!fragoffs && (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol ||
+ IPPROTO_SCTP == protocol)) {
+ __be16 *ports = (void *)(skb_network_header(skb) + offs);
+
+ IP_VS_DBG(11, "%s() changed port %d to %d\n", __func__,
+ ntohs(inout ? ports[1] : ports[0]),
+ ntohs(inout ? cp->vport : cp->dport));
+ if (inout)
+ ports[1] = cp->vport;
+ else
+ ports[0] = cp->dport;
+ }
+
+ /* And finally the ICMP checksum */
+ icmph->icmp6_cksum = ~csum_ipv6_magic(&iph->saddr, &iph->daddr,
+ skb->len - icmp_offset,
+ IPPROTO_ICMPV6, 0);
+ skb->csum_start = skb_network_header(skb) - skb->head + icmp_offset;
+ skb->csum_offset = offsetof(struct icmp6hdr, icmp6_cksum);
+ skb->ip_summed = CHECKSUM_PARTIAL;
+
+ if (inout)
+ IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
+ (void *)ciph - (void *)iph,
+ "Forwarding altered outgoing ICMPv6");
+ else
+ IP_VS_DBG_PKT(11, AF_INET6, pp, skb,
+ (void *)ciph - (void *)iph,
+ "Forwarding altered incoming ICMPv6");
+}
+#endif
+
+/* Handle relevant response ICMP messages - forward to the right
+ * destination host.
+ */
+static int handle_response_icmp(int af, struct sk_buff *skb,
+ union nf_inet_addr *snet,
+ __u8 protocol, struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp,
+ unsigned int offset, unsigned int ihl,
+ unsigned int hooknum)
+{
+ unsigned int verdict = NF_DROP;
+
+ if (IP_VS_FWD_METHOD(cp) != 0) {
+ pr_err("shouldn't reach here, because the box is on the "
+ "half connection in the tun/dr module.\n");
+ }
+
+ /* Ensure the checksum is correct */
+ if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
+ /* Failed checksum! */
+ IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n",
+ IP_VS_DBG_ADDR(af, snet));
+ goto out;
+ }
+
+ if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol ||
+ IPPROTO_SCTP == protocol)
+ offset += 2 * sizeof(__u16);
+ if (!skb_make_writable(skb, offset))
+ goto out;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ ip_vs_nat_icmp_v6(skb, pp, cp, 1);
+ else
+#endif
+ ip_vs_nat_icmp(skb, pp, cp, 1);
+
+ if (ip_vs_route_me_harder(af, skb, hooknum))
+ goto out;
+
+ /* do the statistics and put it back */
+ ip_vs_out_stats(cp, skb);
+
+ skb->ipvs_property = 1;
+ if (!(cp->flags & IP_VS_CONN_F_NFCT))
+ ip_vs_notrack(skb);
+ else
+ ip_vs_update_conntrack(skb, cp, 0);
+ verdict = NF_ACCEPT;
+
+out:
+ __ip_vs_conn_put(cp);
+
+ return verdict;
+}
+
+/*
+ * Handle ICMP messages in the inside-to-outside direction (outgoing).
+ * Find any that might be relevant, check against existing connections.
+ * Currently handles error types - unreachable, quench, ttl exceeded.
+ */
+static int ip_vs_out_icmp(struct sk_buff *skb, int *related,
+ unsigned int hooknum)
+{
+ struct iphdr *iph;
+ struct icmphdr _icmph, *ic;
+ struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
+ struct ip_vs_iphdr ciph;
+ struct ip_vs_conn *cp;
+ struct ip_vs_protocol *pp;
+ unsigned int offset, ihl;
+ union nf_inet_addr snet;
+
+ *related = 1;
+
+ /* reassemble IP fragments */
+ if (ip_is_fragment(ip_hdr(skb))) {
+ if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
+ return NF_STOLEN;
+ }
+
+ iph = ip_hdr(skb);
+ offset = ihl = iph->ihl * 4;
+ ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
+ if (ic == NULL)
+ return NF_DROP;
+
+ IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %pI4->%pI4\n",
+ ic->type, ntohs(icmp_id(ic)),
+ &iph->saddr, &iph->daddr);
+
+ /*
+ * Work through seeing if this is for us.
+ * These checks are supposed to be in an order that means easy
+ * things are checked first to speed up processing.... however
+ * this means that some packets will manage to get a long way
+ * down this stack and then be rejected, but that's life.
+ */
+ if ((ic->type != ICMP_DEST_UNREACH) &&
+ (ic->type != ICMP_SOURCE_QUENCH) &&
+ (ic->type != ICMP_TIME_EXCEEDED)) {
+ *related = 0;
+ return NF_ACCEPT;
+ }
+
+ /* Now find the contained IP header */
+ offset += sizeof(_icmph);
+ cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
+ if (cih == NULL)
+ return NF_ACCEPT; /* The packet looks wrong, ignore */
+
+ pp = ip_vs_proto_get(cih->protocol);
+ if (!pp)
+ return NF_ACCEPT;
+
+ /* Is the embedded protocol header present? */
+ if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
+ pp->dont_defrag))
+ return NF_ACCEPT;
+
+ IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
+ "Checking outgoing ICMP for");
+
+ ip_vs_fill_ip4hdr(cih, &ciph);
+ ciph.len += offset;
+ /* The embedded headers contain source and dest in reverse order */
+ cp = pp->conn_out_get(AF_INET, skb, &ciph, 1);
+ if (!cp)
+ return NF_ACCEPT;
+
+ snet.ip = iph->saddr;
+ return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp,
+ pp, ciph.len, ihl, hooknum);
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related,
+ unsigned int hooknum, struct ip_vs_iphdr *ipvsh)
+{
+ struct icmp6hdr _icmph, *ic;
+ struct ipv6hdr _ip6h, *ip6h; /* The ip header contained within ICMP */
+ struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */
+ struct ip_vs_conn *cp;
+ struct ip_vs_protocol *pp;
+ union nf_inet_addr snet;
+ unsigned int writable;
+
+ *related = 1;
+ ic = frag_safe_skb_hp(skb, ipvsh->len, sizeof(_icmph), &_icmph, ipvsh);
+ if (ic == NULL)
+ return NF_DROP;
+
+ /*
+ * Work through seeing if this is for us.
+ * These checks are supposed to be in an order that means easy
+ * things are checked first to speed up processing.... however
+ * this means that some packets will manage to get a long way
+ * down this stack and then be rejected, but that's life.
+ */
+ if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) {
+ *related = 0;
+ return NF_ACCEPT;
+ }
+	/* A fragment header before the ICMP header tells us this is not
+	 * an error message, since error messages cannot be fragmented.
+	 */
+ if (ipvsh->flags & IP6_FH_F_FRAG)
+ return NF_DROP;
+
+ IP_VS_DBG(8, "Outgoing ICMPv6 (%d,%d) %pI6c->%pI6c\n",
+ ic->icmp6_type, ntohs(icmpv6_id(ic)),
+ &ipvsh->saddr, &ipvsh->daddr);
+
+ /* Now find the contained IP header */
+ ciph.len = ipvsh->len + sizeof(_icmph);
+ ip6h = skb_header_pointer(skb, ciph.len, sizeof(_ip6h), &_ip6h);
+ if (ip6h == NULL)
+ return NF_ACCEPT; /* The packet looks wrong, ignore */
+ ciph.saddr.in6 = ip6h->saddr; /* conn_out_get() handles reverse order */
+ ciph.daddr.in6 = ip6h->daddr;
+ /* skip possible IPv6 exthdrs of contained IPv6 packet */
+ ciph.protocol = ipv6_find_hdr(skb, &ciph.len, -1, &ciph.fragoffs, NULL);
+ if (ciph.protocol < 0)
+ return NF_ACCEPT; /* Contained IPv6 hdr looks wrong, ignore */
+
+ pp = ip_vs_proto_get(ciph.protocol);
+ if (!pp)
+ return NF_ACCEPT;
+
+ /* The embedded headers contain source and dest in reverse order */
+ cp = pp->conn_out_get(AF_INET6, skb, &ciph, 1);
+ if (!cp)
+ return NF_ACCEPT;
+
+ snet.in6 = ciph.saddr.in6;
+ writable = ciph.len;
+ return handle_response_icmp(AF_INET6, skb, &snet, ciph.protocol, cp,
+ pp, writable, sizeof(struct ipv6hdr),
+ hooknum);
+}
+#endif
+
+/*
+ * Check if the SCTP chunk is an ABORT chunk
+ */
+static inline int is_sctp_abort(const struct sk_buff *skb, int nh_len)
+{
+ sctp_chunkhdr_t *sch, schunk;
+ sch = skb_header_pointer(skb, nh_len + sizeof(sctp_sctphdr_t),
+ sizeof(schunk), &schunk);
+ if (sch == NULL)
+ return 0;
+ if (sch->type == SCTP_CID_ABORT)
+ return 1;
+ return 0;
+}
+
+static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len)
+{
+ struct tcphdr _tcph, *th;
+
+ th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph);
+ if (th == NULL)
+ return 0;
+ return th->rst;
+}
+
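+/* A packet opens a new connection if it is a TCP SYN or an SCTP INIT
+ * chunk; anything else (including other protocols) is treated as part
+ * of an existing flow.
+ */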
+static inline bool is_new_conn(const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
+{
+ switch (iph->protocol) {
+ case IPPROTO_TCP: {
+ struct tcphdr _tcph, *th;
+
+ th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
+ if (th == NULL)
+ return false;
+ return th->syn;
+ }
+ case IPPROTO_SCTP: {
+ sctp_chunkhdr_t *sch, schunk;
+
+ sch = skb_header_pointer(skb, iph->len + sizeof(sctp_sctphdr_t),
+ sizeof(schunk), &schunk);
+ if (sch == NULL)
+ return false;
+ return sch->type == SCTP_CID_INIT;
+ }
+ default:
+ return false;
+ }
+}
+
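+/* Decide whether an existing connection hit by a new SYN/INIT should be
+ * expired so the packet can be rescheduled: TCP connections in TIME_WAIT
+ * always qualify; with bit 2 of conn_reuse_mode, FIN_WAIT connections that
+ * never produced output (IP_VS_CONN_F_NOOUTPUT) also qualify, as do
+ * closed SCTP connections.  Controlled connections are never expired here.
+ */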
+static inline bool is_new_conn_expected(const struct ip_vs_conn *cp,
+ int conn_reuse_mode)
+{
+ /* Controlled (FTP DATA or persistence)? */
+ if (cp->control)
+ return false;
+
+ switch (cp->protocol) {
+ case IPPROTO_TCP:
+ return (cp->state == IP_VS_TCP_S_TIME_WAIT) ||
+ ((conn_reuse_mode & 2) &&
+ (cp->state == IP_VS_TCP_S_FIN_WAIT) &&
+ (cp->flags & IP_VS_CONN_F_NOOUTPUT));
+ case IPPROTO_SCTP:
+ return cp->state == IP_VS_SCTP_S_CLOSED;
+ default:
+ return false;
+ }
+}
+
+/* Handle response packets: rewrite addresses and send away...
+ */
+static unsigned int
+handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
+ struct ip_vs_conn *cp, struct ip_vs_iphdr *iph,
+ unsigned int hooknum)
+{
+ struct ip_vs_protocol *pp = pd->pp;
+
+ IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet");
+
+ if (!skb_make_writable(skb, iph->len))
+ goto drop;
+
+ /* mangle the packet */
+ if (pp->snat_handler && !pp->snat_handler(skb, pp, cp, iph))
+ goto drop;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ ipv6_hdr(skb)->saddr = cp->vaddr.in6;
+ else
+#endif
+ {
+ ip_hdr(skb)->saddr = cp->vaddr.ip;
+ ip_send_check(ip_hdr(skb));
+ }
+
+	/*
+	 * nf_iterate does not expect skb->dst->dev to change.
+	 * It appears safe to enable this code for hooks where our
+	 * handlers are at the end of the chain and all following
+	 * handlers use skb->dst->dev rather than outdev.
+	 * It will definitely route the in/out NAT traffic properly
+	 * when multiple paths are used.
+	 */
+
+ /* For policy routing, packets originating from this
+ * machine itself may be routed differently to packets
+ * passing through. We want this packet to be routed as
+ * if it came from this machine itself. So re-compute
+ * the routing information.
+ */
+ if (ip_vs_route_me_harder(af, skb, hooknum))
+ goto drop;
+
+ IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT");
+
+ ip_vs_out_stats(cp, skb);
+ ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd);
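+	/* Mark the skb so later IPVS hooks skip it, then either detach it
+	 * from conntrack or update the conntrack entry, depending on
+	 * IP_VS_CONN_F_NFCT.
+	 */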
+ skb->ipvs_property = 1;
+ if (!(cp->flags & IP_VS_CONN_F_NFCT))
+ ip_vs_notrack(skb);
+ else
+ ip_vs_update_conntrack(skb, cp, 0);
+ ip_vs_conn_put(cp);
+
+ LeaveFunction(11);
+ return NF_ACCEPT;
+
+drop:
+ ip_vs_conn_put(cp);
+ kfree_skb(skb);
+ LeaveFunction(11);
+ return NF_STOLEN;
+}
+
+/*
+ * Check if outgoing packet belongs to the established ip_vs_conn.
+ */
+static unsigned int
+ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
+{
+ struct net *net = NULL;
+ struct ip_vs_iphdr iph;
+ struct ip_vs_protocol *pp;
+ struct ip_vs_proto_data *pd;
+ struct ip_vs_conn *cp;
+
+ EnterFunction(11);
+
+ /* Already marked as IPVS request or reply? */
+ if (skb->ipvs_property)
+ return NF_ACCEPT;
+
+ /* Bad... Do not break raw sockets */
+ if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
+ af == AF_INET)) {
+ struct sock *sk = skb->sk;
+ struct inet_sock *inet = inet_sk(skb->sk);
+
+ if (inet && sk->sk_family == PF_INET && inet->nodefrag)
+ return NF_ACCEPT;
+ }
+
+ if (unlikely(!skb_dst(skb)))
+ return NF_ACCEPT;
+
+ net = skb_net(skb);
+ if (!net_ipvs(net)->enable)
+ return NF_ACCEPT;
+
+ ip_vs_fill_iph_skb(af, skb, &iph);
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6) {
+ if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
+ int related;
+ int verdict = ip_vs_out_icmp_v6(skb, &related,
+ hooknum, &iph);
+
+ if (related)
+ return verdict;
+ }
+ } else
+#endif
+ if (unlikely(iph.protocol == IPPROTO_ICMP)) {
+ int related;
+ int verdict = ip_vs_out_icmp(skb, &related, hooknum);
+
+ if (related)
+ return verdict;
+ }
+
+ pd = ip_vs_proto_data_get(net, iph.protocol);
+ if (unlikely(!pd))
+ return NF_ACCEPT;
+ pp = pd->pp;
+
+ /* reassemble IP fragments */
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET)
+#endif
+ if (unlikely(ip_is_fragment(ip_hdr(skb)) && !pp->dont_defrag)) {
+ if (ip_vs_gather_frags(skb,
+ ip_vs_defrag_user(hooknum)))
+ return NF_STOLEN;
+
+ ip_vs_fill_ip4hdr(skb_network_header(skb), &iph);
+ }
+
+ /*
+ * Check if the packet belongs to an existing entry
+ */
+ cp = pp->conn_out_get(af, skb, &iph, 0);
+
+ if (likely(cp))
+ return handle_response(af, skb, pd, cp, &iph, hooknum);
+ if (sysctl_nat_icmp_send(net) &&
+ (pp->protocol == IPPROTO_TCP ||
+ pp->protocol == IPPROTO_UDP ||
+ pp->protocol == IPPROTO_SCTP)) {
+ __be16 _ports[2], *pptr;
+
+ pptr = frag_safe_skb_hp(skb, iph.len,
+ sizeof(_ports), _ports, &iph);
+ if (pptr == NULL)
+ return NF_ACCEPT; /* Not for me */
+ if (ip_vs_has_real_service(net, af, iph.protocol, &iph.saddr,
+ pptr[0])) {
+			/*
+			 * Notify the real server that there is no
+			 * existing entry, unless the packet is a
+			 * TCP RST or an SCTP ABORT.
+			 */
+ if ((iph.protocol != IPPROTO_TCP &&
+ iph.protocol != IPPROTO_SCTP)
+ || ((iph.protocol == IPPROTO_TCP
+ && !is_tcp_reset(skb, iph.len))
+ || (iph.protocol == IPPROTO_SCTP
+ && !is_sctp_abort(skb,
+ iph.len)))) {
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6) {
+ if (!skb->dev)
+ skb->dev = net->loopback_dev;
+ icmpv6_send(skb,
+ ICMPV6_DEST_UNREACH,
+ ICMPV6_PORT_UNREACH,
+ 0);
+ } else
+#endif
+ icmp_send(skb,
+ ICMP_DEST_UNREACH,
+ ICMP_PORT_UNREACH, 0);
+ return NF_DROP;
+ }
+ }
+ }
+ IP_VS_DBG_PKT(12, af, pp, skb, 0,
+ "ip_vs_out: packet continues traversal as normal");
+ return NF_ACCEPT;
+}
+
+/*
+ * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chains,
+ * used only for VS/NAT.
+ * Check if packet is reply for established ip_vs_conn.
+ */
+static unsigned int
+ip_vs_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return ip_vs_out(ops->hooknum, skb, AF_INET);
+}
+
+/*
+ * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
+ * Check if packet is reply for established ip_vs_conn.
+ */
+static unsigned int
+ip_vs_local_reply4(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return ip_vs_out(ops->hooknum, skb, AF_INET);
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+
+/*
+ * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chains,
+ * used only for VS/NAT.
+ * Check if packet is reply for established ip_vs_conn.
+ */
+static unsigned int
+ip_vs_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return ip_vs_out(ops->hooknum, skb, AF_INET6);
+}
+
+/*
+ * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT.
+ * Check if packet is reply for established ip_vs_conn.
+ */
+static unsigned int
+ip_vs_local_reply6(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return ip_vs_out(ops->hooknum, skb, AF_INET6);
+}
+
+#endif
+
+/*
+ * Handle ICMP messages in the outside-to-inside direction (incoming).
+ * Find any that might be relevant, check against existing connections,
+ * forward to the right destination host if relevant.
+ * Currently handles error types - unreachable, quench, ttl exceeded.
+ */
+static int
+ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
+{
+ struct net *net = NULL;
+ struct iphdr *iph;
+ struct icmphdr _icmph, *ic;
+ struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
+ struct ip_vs_iphdr ciph;
+ struct ip_vs_conn *cp;
+ struct ip_vs_protocol *pp;
+ struct ip_vs_proto_data *pd;
+ unsigned int offset, offset2, ihl, verdict;
+ bool ipip;
+
+ *related = 1;
+
+ /* reassemble IP fragments */
+ if (ip_is_fragment(ip_hdr(skb))) {
+ if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum)))
+ return NF_STOLEN;
+ }
+
+ iph = ip_hdr(skb);
+ offset = ihl = iph->ihl * 4;
+ ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
+ if (ic == NULL)
+ return NF_DROP;
+
+ IP_VS_DBG(12, "Incoming ICMP (%d,%d) %pI4->%pI4\n",
+ ic->type, ntohs(icmp_id(ic)),
+ &iph->saddr, &iph->daddr);
+
+ /*
+ * Work through seeing if this is for us.
+ * These checks are supposed to be in an order that means easy
+ * things are checked first to speed up processing.... however
+ * this means that some packets will manage to get a long way
+ * down this stack and then be rejected, but that's life.
+ */
+ if ((ic->type != ICMP_DEST_UNREACH) &&
+ (ic->type != ICMP_SOURCE_QUENCH) &&
+ (ic->type != ICMP_TIME_EXCEEDED)) {
+ *related = 0;
+ return NF_ACCEPT;
+ }
+
+ /* Now find the contained IP header */
+ offset += sizeof(_icmph);
+ cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
+ if (cih == NULL)
+ return NF_ACCEPT; /* The packet looks wrong, ignore */
+
+ net = skb_net(skb);
+
+ /* Special case for errors for IPIP packets */
+ ipip = false;
+ if (cih->protocol == IPPROTO_IPIP) {
+ if (unlikely(cih->frag_off & htons(IP_OFFSET)))
+ return NF_ACCEPT;
+ /* Error for our IPIP must arrive at LOCAL_IN */
+ if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL))
+ return NF_ACCEPT;
+ offset += cih->ihl * 4;
+ cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
+ if (cih == NULL)
+ return NF_ACCEPT; /* The packet looks wrong, ignore */
+ ipip = true;
+ }
+
+ pd = ip_vs_proto_data_get(net, cih->protocol);
+ if (!pd)
+ return NF_ACCEPT;
+ pp = pd->pp;
+
+ /* Is the embedded protocol header present? */
+ if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
+ pp->dont_defrag))
+ return NF_ACCEPT;
+
+ IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset,
+ "Checking incoming ICMP for");
+
+ offset2 = offset;
+ ip_vs_fill_ip4hdr(cih, &ciph);
+ ciph.len += offset;
+ offset = ciph.len;
+ /* The embedded headers contain source and dest in reverse order.
+ * For IPIP this is error for request, not for reply.
+ */
+ cp = pp->conn_in_get(AF_INET, skb, &ciph, ipip ? 0 : 1);
+ if (!cp)
+ return NF_ACCEPT;
+
+ verdict = NF_DROP;
+
+ /* Ensure the checksum is correct */
+ if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
+ /* Failed checksum! */
+ IP_VS_DBG(1, "Incoming ICMP: failed checksum from %pI4!\n",
+ &iph->saddr);
+ goto out;
+ }
+
+ if (ipip) {
+ __be32 info = ic->un.gateway;
+ __u8 type = ic->type;
+ __u8 code = ic->code;
+
+ /* Update the MTU */
+ if (ic->type == ICMP_DEST_UNREACH &&
+ ic->code == ICMP_FRAG_NEEDED) {
+ struct ip_vs_dest *dest = cp->dest;
+ u32 mtu = ntohs(ic->un.frag.mtu);
+ __be16 frag_off = cih->frag_off;
+
+ /* Strip outer IP and ICMP, go to IPIP header */
+ if (pskb_pull(skb, ihl + sizeof(_icmph)) == NULL)
+ goto ignore_ipip;
+ offset2 -= ihl + sizeof(_icmph);
+ skb_reset_network_header(skb);
+ IP_VS_DBG(12, "ICMP for IPIP %pI4->%pI4: mtu=%u\n",
+ &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, mtu);
+ ipv4_update_pmtu(skb, dev_net(skb->dev),
+ mtu, 0, 0, 0, 0);
+ /* Client uses PMTUD? */
+ if (!(frag_off & htons(IP_DF)))
+ goto ignore_ipip;
+ /* Prefer the resulting PMTU */
+ if (dest) {
+ struct ip_vs_dest_dst *dest_dst;
+
+ rcu_read_lock();
+ dest_dst = rcu_dereference(dest->dest_dst);
+ if (dest_dst)
+ mtu = dst_mtu(dest_dst->dst_cache);
+ rcu_read_unlock();
+ }
+ if (mtu > 68 + sizeof(struct iphdr))
+ mtu -= sizeof(struct iphdr);
+ info = htonl(mtu);
+ }
+ /* Strip outer IP, ICMP and IPIP, go to IP header of
+ * original request.
+ */
+ if (pskb_pull(skb, offset2) == NULL)
+ goto ignore_ipip;
+ skb_reset_network_header(skb);
+ IP_VS_DBG(12, "Sending ICMP for %pI4->%pI4: t=%u, c=%u, i=%u\n",
+ &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
+ type, code, ntohl(info));
+ icmp_send(skb, type, code, info);
+ /* ICMP can be shorter but anyways, account it */
+ ip_vs_out_stats(cp, skb);
+
+ignore_ipip:
+ consume_skb(skb);
+ verdict = NF_STOLEN;
+ goto out;
+ }
+
+ /* do the statistics and put it back */
+ ip_vs_in_stats(cp, skb);
+ if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol ||
+ IPPROTO_SCTP == cih->protocol)
+ offset += 2 * sizeof(__u16);
+ verdict = ip_vs_icmp_xmit(skb, cp, pp, offset, hooknum, &ciph);
+
+out:
+ __ip_vs_conn_put(cp);
+
+ return verdict;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+static int ip_vs_in_icmp_v6(struct sk_buff *skb, int *related,
+ unsigned int hooknum, struct ip_vs_iphdr *iph)
+{
+ struct net *net = NULL;
+ struct ipv6hdr _ip6h, *ip6h;
+ struct icmp6hdr _icmph, *ic;
+ struct ip_vs_iphdr ciph = {.flags = 0, .fragoffs = 0};/*Contained IP */
+ struct ip_vs_conn *cp;
+ struct ip_vs_protocol *pp;
+ struct ip_vs_proto_data *pd;
+ unsigned int offs_ciph, writable, verdict;
+
+ *related = 1;
+
+ ic = frag_safe_skb_hp(skb, iph->len, sizeof(_icmph), &_icmph, iph);
+ if (ic == NULL)
+ return NF_DROP;
+
+ /*
+ * Work through seeing if this is for us.
+ * These checks are supposed to be in an order that means easy
+ * things are checked first to speed up processing.... however
+ * this means that some packets will manage to get a long way
+ * down this stack and then be rejected, but that's life.
+ */
+ if (ic->icmp6_type & ICMPV6_INFOMSG_MASK) {
+ *related = 0;
+ return NF_ACCEPT;
+ }
+	/* A fragment header before the ICMP header tells us this is not
+	 * an error message, since error messages cannot be fragmented.
+	 */
+ if (iph->flags & IP6_FH_F_FRAG)
+ return NF_DROP;
+
+ IP_VS_DBG(8, "Incoming ICMPv6 (%d,%d) %pI6c->%pI6c\n",
+ ic->icmp6_type, ntohs(icmpv6_id(ic)),
+ &iph->saddr, &iph->daddr);
+
+ /* Now find the contained IP header */
+ ciph.len = iph->len + sizeof(_icmph);
+ offs_ciph = ciph.len; /* Save ip header offset */
+ ip6h = skb_header_pointer(skb, ciph.len, sizeof(_ip6h), &_ip6h);
+ if (ip6h == NULL)
+ return NF_ACCEPT; /* The packet looks wrong, ignore */
+ ciph.saddr.in6 = ip6h->saddr; /* conn_in_get() handles reverse order */
+ ciph.daddr.in6 = ip6h->daddr;
+ /* skip possible IPv6 exthdrs of contained IPv6 packet */
+ ciph.protocol = ipv6_find_hdr(skb, &ciph.len, -1, &ciph.fragoffs, NULL);
+ if (ciph.protocol < 0)
+ return NF_ACCEPT; /* Contained IPv6 hdr looks wrong, ignore */
+
+ net = skb_net(skb);
+ pd = ip_vs_proto_data_get(net, ciph.protocol);
+ if (!pd)
+ return NF_ACCEPT;
+ pp = pd->pp;
+
+ /* Cannot handle fragmented embedded protocol */
+ if (ciph.fragoffs)
+ return NF_ACCEPT;
+
+ IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offs_ciph,
+ "Checking incoming ICMPv6 for");
+
+ /* The embedded headers contain source and dest in reverse order
+ * if not from localhost
+ */
+ cp = pp->conn_in_get(AF_INET6, skb, &ciph,
+ (hooknum == NF_INET_LOCAL_OUT) ? 0 : 1);
+
+ if (!cp)
+ return NF_ACCEPT;
+ /* VS/TUN, VS/DR and LOCALNODE just let it go */
+ if ((hooknum == NF_INET_LOCAL_OUT) &&
+ (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)) {
+ __ip_vs_conn_put(cp);
+ return NF_ACCEPT;
+ }
+
+ /* do the statistics and put it back */
+ ip_vs_in_stats(cp, skb);
+
+ /* Need to mangle contained IPv6 header in ICMPv6 packet */
+ writable = ciph.len;
+ if (IPPROTO_TCP == ciph.protocol || IPPROTO_UDP == ciph.protocol ||
+ IPPROTO_SCTP == ciph.protocol)
+ writable += 2 * sizeof(__u16); /* Also mangle ports */
+
+ verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, writable, hooknum, &ciph);
+
+ __ip_vs_conn_put(cp);
+
+ return verdict;
+}
+#endif
+
+
+/*
+ * Check if it's for virtual services, look it up,
+ * and send it on its way...
+ */
+static unsigned int
+ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
+{
+ struct net *net;
+ struct ip_vs_iphdr iph;
+ struct ip_vs_protocol *pp;
+ struct ip_vs_proto_data *pd;
+ struct ip_vs_conn *cp;
+ int ret, pkts;
+ struct netns_ipvs *ipvs;
+ int conn_reuse_mode;
+
+ /* Already marked as IPVS request or reply? */
+ if (skb->ipvs_property)
+ return NF_ACCEPT;
+
+ /*
+ * Big tappo:
+ * - remote client: only PACKET_HOST
+ * - route: used for struct net when skb->dev is unset
+ */
+ if (unlikely((skb->pkt_type != PACKET_HOST &&
+ hooknum != NF_INET_LOCAL_OUT) ||
+ !skb_dst(skb))) {
+ ip_vs_fill_iph_skb(af, skb, &iph);
+ IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s"
+ " ignored in hook %u\n",
+ skb->pkt_type, iph.protocol,
+ IP_VS_DBG_ADDR(af, &iph.daddr), hooknum);
+ return NF_ACCEPT;
+ }
+ /* ipvs enabled in this netns ? */
+ net = skb_net(skb);
+ ipvs = net_ipvs(net);
+ if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
+ return NF_ACCEPT;
+
+ ip_vs_fill_iph_skb(af, skb, &iph);
+
+ /* Bad... Do not break raw sockets */
+ if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
+ af == AF_INET)) {
+ struct sock *sk = skb->sk;
+ struct inet_sock *inet = inet_sk(skb->sk);
+
+ if (inet && sk->sk_family == PF_INET && inet->nodefrag)
+ return NF_ACCEPT;
+ }
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6) {
+ if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
+ int related;
+ int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum,
+ &iph);
+
+ if (related)
+ return verdict;
+ }
+ } else
+#endif
+ if (unlikely(iph.protocol == IPPROTO_ICMP)) {
+ int related;
+ int verdict = ip_vs_in_icmp(skb, &related, hooknum);
+
+ if (related)
+ return verdict;
+ }
+
+ /* Protocol supported? */
+ pd = ip_vs_proto_data_get(net, iph.protocol);
+ if (unlikely(!pd))
+ return NF_ACCEPT;
+ pp = pd->pp;
+ /*
+ * Check if the packet belongs to an existing connection entry
+ */
+ cp = pp->conn_in_get(af, skb, &iph, 0);
+
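+	/* Connection reuse: if this packet starts a new connection (SYN/INIT)
+	 * but matched an existing entry whose destination has zero weight
+	 * (with expire_nodest_conn set) or which is in a reusable state,
+	 * expire that entry now, unless it still controls other connections,
+	 * and fall through to schedule a fresh one below.
+	 */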
+ conn_reuse_mode = sysctl_conn_reuse_mode(ipvs);
+ if (conn_reuse_mode && !iph.fragoffs &&
+ is_new_conn(skb, &iph) && cp &&
+ ((unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest &&
+ unlikely(!atomic_read(&cp->dest->weight))) ||
+ unlikely(is_new_conn_expected(cp, conn_reuse_mode)))) {
+ if (!atomic_read(&cp->n_control))
+ ip_vs_conn_expire_now(cp);
+ __ip_vs_conn_put(cp);
+ cp = NULL;
+ }
+
+ if (unlikely(!cp) && !iph.fragoffs) {
+		/* No (second) fragments need to enter here: the fragment
+		 * zero replayed by nf_defrag_ipv6 will already have
+		 * created the cp
+		 */
+ int v;
+
+ /* Schedule and create new connection entry into &cp */
+ if (!pp->conn_schedule(af, skb, pd, &v, &cp, &iph))
+ return v;
+ }
+
+ if (unlikely(!cp)) {
+ /* sorry, all this trouble for a no-hit :) */
+ IP_VS_DBG_PKT(12, af, pp, skb, 0,
+ "ip_vs_in: packet continues traversal as normal");
+ if (iph.fragoffs) {
+ /* Fragment that couldn't be mapped to a conn entry
+ * is missing module nf_defrag_ipv6
+ */
+ IP_VS_DBG_RL("Unhandled frag, load nf_defrag_ipv6\n");
+ IP_VS_DBG_PKT(7, af, pp, skb, 0, "unhandled fragment");
+ }
+ return NF_ACCEPT;
+ }
+
+ IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet");
+ /* Check the server status */
+ if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
+ /* the destination server is not available */
+
+ if (sysctl_expire_nodest_conn(ipvs)) {
+ /* try to expire the connection immediately */
+ ip_vs_conn_expire_now(cp);
+ }
+ /* don't restart its timer, and silently
+ drop the packet. */
+ __ip_vs_conn_put(cp);
+ return NF_DROP;
+ }
+
+ ip_vs_in_stats(cp, skb);
+ ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
+ if (cp->packet_xmit)
+ ret = cp->packet_xmit(skb, cp, pp, &iph);
+ /* do not touch skb anymore */
+ else {
+ IP_VS_DBG_RL("warning: packet_xmit is null");
+ ret = NF_ACCEPT;
+ }
+
+	/* Increase its packet counter and check whether it needs
+	 * to be synchronized.
+	 *
+	 * Sync the connection if it is about to close, to encourage
+	 * the standby servers to update the connection's timeout.
+	 *
+	 * For ONE_PKT let ip_vs_sync_conn() do the filter work.
+	 */
+
+ if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
+ pkts = sysctl_sync_threshold(ipvs);
+ else
+ pkts = atomic_add_return(1, &cp->in_pkts);
+
+ if (ipvs->sync_state & IP_VS_STATE_MASTER)
+ ip_vs_sync_conn(net, cp, pkts);
+
+ ip_vs_conn_put(cp);
+ return ret;
+}
+
+/*
+ * AF_INET handler in NF_INET_LOCAL_IN chain
+ * Schedule and forward packets from remote clients
+ */
+static unsigned int
+ip_vs_remote_request4(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return ip_vs_in(ops->hooknum, skb, AF_INET);
+}
+
+/*
+ * AF_INET handler in NF_INET_LOCAL_OUT chain
+ * Schedule and forward packets from local clients
+ */
+static unsigned int
+ip_vs_local_request4(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return ip_vs_in(ops->hooknum, skb, AF_INET);
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+
+/*
+ * AF_INET6 handler in NF_INET_LOCAL_IN chain
+ * Schedule and forward packets from remote clients
+ */
+static unsigned int
+ip_vs_remote_request6(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return ip_vs_in(ops->hooknum, skb, AF_INET6);
+}
+
+/*
+ * AF_INET6 handler in NF_INET_LOCAL_OUT chain
+ * Schedule and forward packets from local clients
+ */
+static unsigned int
+ip_vs_local_request6(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return ip_vs_in(ops->hooknum, skb, AF_INET6);
+}
+
+#endif
+
+
+/*
+ * It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
+ * related packets destined for 0.0.0.0/0.
+ * When a fwmark-based virtual service is used, such as a transparent
+ * cache cluster, TCP packets can be marked and routed to ip_vs_in,
+ * but ICMP destined for 0.0.0.0/0 cannot be easily marked and
+ * sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain
+ * and send them to ip_vs_in_icmp.
+ */
+static unsigned int
+ip_vs_forward_icmp(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ int r;
+ struct net *net;
+ struct netns_ipvs *ipvs;
+
+ if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
+ return NF_ACCEPT;
+
+ /* ipvs enabled in this netns ? */
+ net = skb_net(skb);
+ ipvs = net_ipvs(net);
+ if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
+ return NF_ACCEPT;
+
+ return ip_vs_in_icmp(skb, &r, ops->hooknum);
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+static unsigned int
+ip_vs_forward_icmp_v6(const struct nf_hook_ops *ops, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ int r;
+ struct net *net;
+ struct netns_ipvs *ipvs;
+ struct ip_vs_iphdr iphdr;
+
+ ip_vs_fill_iph_skb(AF_INET6, skb, &iphdr);
+ if (iphdr.protocol != IPPROTO_ICMPV6)
+ return NF_ACCEPT;
+
+ /* ipvs enabled in this netns ? */
+ net = skb_net(skb);
+ ipvs = net_ipvs(net);
+ if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
+ return NF_ACCEPT;
+
+ return ip_vs_in_icmp_v6(skb, &r, ops->hooknum, &iphdr);
+}
+#endif
+
+
+static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
+ /* After packet filtering, change source only for VS/NAT */
+ {
+ .hook = ip_vs_reply4,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = NF_IP_PRI_NAT_SRC - 2,
+ },
+ /* After packet filtering, forward packet through VS/DR, VS/TUN,
+ * or VS/NAT(change destination), so that filtering rules can be
+ * applied to IPVS. */
+ {
+ .hook = ip_vs_remote_request4,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = NF_IP_PRI_NAT_SRC - 1,
+ },
+ /* Before ip_vs_in, change source only for VS/NAT */
+ {
+ .hook = ip_vs_local_reply4,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_LOCAL_OUT,
+ .priority = NF_IP_PRI_NAT_DST + 1,
+ },
+ /* After mangle, schedule and forward local requests */
+ {
+ .hook = ip_vs_local_request4,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_LOCAL_OUT,
+ .priority = NF_IP_PRI_NAT_DST + 2,
+ },
+ /* After packet filtering (but before ip_vs_out_icmp), catch icmp
+ * destined for 0.0.0.0/0, which is for incoming IPVS connections */
+ {
+ .hook = ip_vs_forward_icmp,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_FORWARD,
+ .priority = 99,
+ },
+ /* After packet filtering, change source only for VS/NAT */
+ {
+ .hook = ip_vs_reply4,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_FORWARD,
+ .priority = 100,
+ },
+#ifdef CONFIG_IP_VS_IPV6
+ /* After packet filtering, change source only for VS/NAT */
+ {
+ .hook = ip_vs_reply6,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = NF_IP6_PRI_NAT_SRC - 2,
+ },
+ /* After packet filtering, forward packet through VS/DR, VS/TUN,
+ * or VS/NAT(change destination), so that filtering rules can be
+ * applied to IPVS. */
+ {
+ .hook = ip_vs_remote_request6,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = NF_IP6_PRI_NAT_SRC - 1,
+ },
+ /* Before ip_vs_in, change source only for VS/NAT */
+ {
+ .hook = ip_vs_local_reply6,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_LOCAL_OUT,
+ .priority = NF_IP6_PRI_NAT_DST + 1,
+ },
+ /* After mangle, schedule and forward local requests */
+ {
+ .hook = ip_vs_local_request6,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_LOCAL_OUT,
+ .priority = NF_IP6_PRI_NAT_DST + 2,
+ },
+ /* After packet filtering (but before ip_vs_out_icmp), catch icmp
+ * destined for 0.0.0.0/0, which is for incoming IPVS connections */
+ {
+ .hook = ip_vs_forward_icmp_v6,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_FORWARD,
+ .priority = 99,
+ },
+ /* After packet filtering, change source only for VS/NAT */
+ {
+ .hook = ip_vs_reply6,
+ .owner = THIS_MODULE,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_FORWARD,
+ .priority = 100,
+ },
+#endif
+};
+/*
+ * Initialize IP Virtual Server netns mem.
+ */
+static int __net_init __ip_vs_init(struct net *net)
+{
+ struct netns_ipvs *ipvs;
+
+ ipvs = net_generic(net, ip_vs_net_id);
+ if (ipvs == NULL)
+ return -ENOMEM;
+
+	/* Hold the beast until a service is registered */
+ ipvs->enable = 0;
+ ipvs->net = net;
+ /* Counters used for creating unique names */
+ ipvs->gen = atomic_read(&ipvs_netns_cnt);
+ atomic_inc(&ipvs_netns_cnt);
+ net->ipvs = ipvs;
+
+ if (ip_vs_estimator_net_init(net) < 0)
+ goto estimator_fail;
+
+ if (ip_vs_control_net_init(net) < 0)
+ goto control_fail;
+
+ if (ip_vs_protocol_net_init(net) < 0)
+ goto protocol_fail;
+
+ if (ip_vs_app_net_init(net) < 0)
+ goto app_fail;
+
+ if (ip_vs_conn_net_init(net) < 0)
+ goto conn_fail;
+
+ if (ip_vs_sync_net_init(net) < 0)
+ goto sync_fail;
+
+ printk(KERN_INFO "IPVS: Creating netns size=%zu id=%d\n",
+ sizeof(struct netns_ipvs), ipvs->gen);
+ return 0;
+/*
+ * Error handling
+ */
+
+sync_fail:
+ ip_vs_conn_net_cleanup(net);
+conn_fail:
+ ip_vs_app_net_cleanup(net);
+app_fail:
+ ip_vs_protocol_net_cleanup(net);
+protocol_fail:
+ ip_vs_control_net_cleanup(net);
+control_fail:
+ ip_vs_estimator_net_cleanup(net);
+estimator_fail:
+ net->ipvs = NULL;
+ return -ENOMEM;
+}
+
+static void __net_exit __ip_vs_cleanup(struct net *net)
+{
+ ip_vs_service_net_cleanup(net); /* ip_vs_flush() with locks */
+ ip_vs_conn_net_cleanup(net);
+ ip_vs_app_net_cleanup(net);
+ ip_vs_protocol_net_cleanup(net);
+ ip_vs_control_net_cleanup(net);
+ ip_vs_estimator_net_cleanup(net);
+ IP_VS_DBG(2, "ipvs netns %d released\n", net_ipvs(net)->gen);
+ net->ipvs = NULL;
+}
+
+static void __net_exit __ip_vs_dev_cleanup(struct net *net)
+{
+ EnterFunction(2);
+ net_ipvs(net)->enable = 0; /* Disable packet reception */
+ smp_wmb();
+ ip_vs_sync_net_cleanup(net);
+ LeaveFunction(2);
+}
+
+static struct pernet_operations ipvs_core_ops = {
+ .init = __ip_vs_init,
+ .exit = __ip_vs_cleanup,
+ .id = &ip_vs_net_id,
+ .size = sizeof(struct netns_ipvs),
+};
+
+static struct pernet_operations ipvs_core_dev_ops = {
+ .exit = __ip_vs_dev_cleanup,
+};
+
+/*
+ * Initialize IP Virtual Server
+ */
+static int __init ip_vs_init(void)
+{
+ int ret;
+
+ ret = ip_vs_control_init();
+ if (ret < 0) {
+ pr_err("can't setup control.\n");
+ goto exit;
+ }
+
+ ip_vs_protocol_init();
+
+ ret = ip_vs_conn_init();
+ if (ret < 0) {
+ pr_err("can't setup connection table.\n");
+ goto cleanup_protocol;
+ }
+
+ ret = register_pernet_subsys(&ipvs_core_ops); /* Alloc ip_vs struct */
+ if (ret < 0)
+ goto cleanup_conn;
+
+ ret = register_pernet_device(&ipvs_core_dev_ops);
+ if (ret < 0)
+ goto cleanup_sub;
+
+ ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
+ if (ret < 0) {
+ pr_err("can't register hooks.\n");
+ goto cleanup_dev;
+ }
+
+ ret = ip_vs_register_nl_ioctl();
+ if (ret < 0) {
+ pr_err("can't register netlink/ioctl.\n");
+ goto cleanup_hooks;
+ }
+
+ pr_info("ipvs loaded.\n");
+
+ return ret;
+
+cleanup_hooks:
+ nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
+cleanup_dev:
+ unregister_pernet_device(&ipvs_core_dev_ops);
+cleanup_sub:
+ unregister_pernet_subsys(&ipvs_core_ops);
+cleanup_conn:
+ ip_vs_conn_cleanup();
+cleanup_protocol:
+ ip_vs_protocol_cleanup();
+ ip_vs_control_cleanup();
+exit:
+ return ret;
+}
+
+static void __exit ip_vs_cleanup(void)
+{
+ ip_vs_unregister_nl_ioctl();
+ nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
+ unregister_pernet_device(&ipvs_core_dev_ops);
+ unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */
+ ip_vs_conn_cleanup();
+ ip_vs_protocol_cleanup();
+ ip_vs_control_cleanup();
+ pr_info("ipvs unloaded.\n");
+}
+
+module_init(ip_vs_init);
+module_exit(ip_vs_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
new file mode 100644
index 000000000..285eae3a1
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -0,0 +1,3956 @@
+/*
+ * IPVS An implementation of the IP virtual server support for the
+ * LINUX operating system. IPVS is now implemented as a module
+ * over the NetFilter framework. IPVS can be used to build a
+ * high-performance and highly available server based on a
+ * cluster of servers.
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Peter Kese <peter.kese@ijs.si>
+ * Julian Anastasov <ja@ssi.bg>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/capability.h>
+#include <linux/fs.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+#include <linux/workqueue.h>
+#include <linux/swap.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/mutex.h>
+
+#include <net/net_namespace.h>
+#include <linux/nsproxy.h>
+#include <net/ip.h>
+#ifdef CONFIG_IP_VS_IPV6
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#endif
+#include <net/route.h>
+#include <net/sock.h>
+#include <net/genetlink.h>
+
+#include <asm/uaccess.h>
+
+#include <net/ip_vs.h>
+
+/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
+static DEFINE_MUTEX(__ip_vs_mutex);
+
+/* sysctl variables */
+
+#ifdef CONFIG_IP_VS_DEBUG
+static int sysctl_ip_vs_debug_level = 0;
+
+int ip_vs_get_debug_level(void)
+{
+ return sysctl_ip_vs_debug_level;
+}
+#endif
+
+
+/* Protos */
+static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup);
+
+
+#ifdef CONFIG_IP_VS_IPV6
+/* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */
+static bool __ip_vs_addr_is_local_v6(struct net *net,
+ const struct in6_addr *addr)
+{
+ struct flowi6 fl6 = {
+ .daddr = *addr,
+ };
+ struct dst_entry *dst = ip6_route_output(net, NULL, &fl6);
+ bool is_local;
+
+ is_local = !dst->error && dst->dev && (dst->dev->flags & IFF_LOOPBACK);
+
+ dst_release(dst);
+ return is_local;
+}
+#endif
+
+#ifdef CONFIG_SYSCTL
+/*
+ * update_defense_level is called from keventd and from sysctl,
+ * so it needs to protect itself from softirqs
+ */
+static void update_defense_level(struct netns_ipvs *ipvs)
+{
+ struct sysinfo i;
+ static int old_secure_tcp = 0;
+ int availmem;
+ int nomem;
+ int to_change = -1;
+
+ /* we only count free and buffered memory (in pages) */
+ si_meminfo(&i);
+ availmem = i.freeram + i.bufferram;
+	/* however, in linux 2.5 the i.bufferram is the total page cache size,
+	   so we need to adjust it */
+ /* si_swapinfo(&i); */
+ /* availmem = availmem - (i.totalswap - i.freeswap); */
+
+ nomem = (availmem < ipvs->sysctl_amemthresh);
+
+ local_bh_disable();
+
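+	/* Each strategy below uses the same mode values: 0 = never active,
+	 * 1 = switch on automatically when memory is low (moves to 2),
+	 * 2 = automatic mode currently active (moves back to 1 when memory
+	 * recovers), 3 = always active.
+	 */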
+ /* drop_entry */
+ spin_lock(&ipvs->dropentry_lock);
+ switch (ipvs->sysctl_drop_entry) {
+ case 0:
+ atomic_set(&ipvs->dropentry, 0);
+ break;
+ case 1:
+ if (nomem) {
+ atomic_set(&ipvs->dropentry, 1);
+ ipvs->sysctl_drop_entry = 2;
+ } else {
+ atomic_set(&ipvs->dropentry, 0);
+ }
+ break;
+ case 2:
+ if (nomem) {
+ atomic_set(&ipvs->dropentry, 1);
+ } else {
+ atomic_set(&ipvs->dropentry, 0);
+ ipvs->sysctl_drop_entry = 1;
+ };
+ break;
+ case 3:
+ atomic_set(&ipvs->dropentry, 1);
+ break;
+ }
+ spin_unlock(&ipvs->dropentry_lock);
+
+ /* drop_packet */
+ spin_lock(&ipvs->droppacket_lock);
+ switch (ipvs->sysctl_drop_packet) {
+ case 0:
+ ipvs->drop_rate = 0;
+ break;
+ case 1:
+ if (nomem) {
+ ipvs->drop_rate = ipvs->drop_counter
+ = ipvs->sysctl_amemthresh /
+ (ipvs->sysctl_amemthresh-availmem);
+ ipvs->sysctl_drop_packet = 2;
+ } else {
+ ipvs->drop_rate = 0;
+ }
+ break;
+ case 2:
+ if (nomem) {
+ ipvs->drop_rate = ipvs->drop_counter
+ = ipvs->sysctl_amemthresh /
+ (ipvs->sysctl_amemthresh-availmem);
+ } else {
+ ipvs->drop_rate = 0;
+ ipvs->sysctl_drop_packet = 1;
+ }
+ break;
+ case 3:
+ ipvs->drop_rate = ipvs->sysctl_am_droprate;
+ break;
+ }
+ spin_unlock(&ipvs->droppacket_lock);
+
+ /* secure_tcp */
+ spin_lock(&ipvs->securetcp_lock);
+ switch (ipvs->sysctl_secure_tcp) {
+ case 0:
+ if (old_secure_tcp >= 2)
+ to_change = 0;
+ break;
+ case 1:
+ if (nomem) {
+ if (old_secure_tcp < 2)
+ to_change = 1;
+ ipvs->sysctl_secure_tcp = 2;
+ } else {
+ if (old_secure_tcp >= 2)
+ to_change = 0;
+ }
+ break;
+ case 2:
+ if (nomem) {
+ if (old_secure_tcp < 2)
+ to_change = 1;
+ } else {
+ if (old_secure_tcp >= 2)
+ to_change = 0;
+ ipvs->sysctl_secure_tcp = 1;
+ }
+ break;
+ case 3:
+ if (old_secure_tcp < 2)
+ to_change = 1;
+ break;
+ }
+ old_secure_tcp = ipvs->sysctl_secure_tcp;
+ if (to_change >= 0)
+ ip_vs_protocol_timeout_change(ipvs,
+ ipvs->sysctl_secure_tcp > 1);
+ spin_unlock(&ipvs->securetcp_lock);
+
+ local_bh_enable();
+}
+
+
+/*
+ * Timer for checking the defense
+ */
+#define DEFENSE_TIMER_PERIOD 1*HZ
+
+static void defense_work_handler(struct work_struct *work)
+{
+ struct netns_ipvs *ipvs =
+ container_of(work, struct netns_ipvs, defense_work.work);
+
+ update_defense_level(ipvs);
+ if (atomic_read(&ipvs->dropentry))
+ ip_vs_random_dropentry(ipvs->net);
+ schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
+}
+#endif
+
+int
+ip_vs_use_count_inc(void)
+{
+ return try_module_get(THIS_MODULE);
+}
+
+void
+ip_vs_use_count_dec(void)
+{
+ module_put(THIS_MODULE);
+}
+
+
+/*
+ * Hash table: for virtual service lookups
+ */
+#define IP_VS_SVC_TAB_BITS 8
+#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
+#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
+
+/* the service table hashed by <protocol, addr, port> */
+static struct hlist_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
+/* the service table hashed by fwmark */
+static struct hlist_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
+
+
+/*
+ * Returns hash value for virtual service
+ */
+static inline unsigned int
+ip_vs_svc_hashkey(struct net *net, int af, unsigned int proto,
+ const union nf_inet_addr *addr, __be16 port)
+{
+ register unsigned int porth = ntohs(port);
+ __be32 addr_fold = addr->ip;
+ __u32 ahash;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ addr_fold = addr->ip6[0]^addr->ip6[1]^
+ addr->ip6[2]^addr->ip6[3];
+#endif
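+	/* mix the folded address with the netns pointer so that identical
+	 * services in different namespaces tend to land in different buckets
+	 */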
+ ahash = ntohl(addr_fold);
+ ahash ^= ((size_t) net >> 8);
+
+ return (proto ^ ahash ^ (porth >> IP_VS_SVC_TAB_BITS) ^ porth) &
+ IP_VS_SVC_TAB_MASK;
+}
+
+/*
+ * Returns hash value of fwmark for virtual service lookup
+ */
+static inline unsigned int ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark)
+{
+ return (((size_t)net>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK;
+}
+
+/*
+ * Hashes a service in the ip_vs_svc_table by <netns,proto,addr,port>
+ * or in the ip_vs_svc_fwm_table by fwmark.
+ * Should be called with locked tables.
+ */
+static int ip_vs_svc_hash(struct ip_vs_service *svc)
+{
+ unsigned int hash;
+
+ if (svc->flags & IP_VS_SVC_F_HASHED) {
+ pr_err("%s(): request for already hashed, called from %pF\n",
+ __func__, __builtin_return_address(0));
+ return 0;
+ }
+
+ if (svc->fwmark == 0) {
+ /*
+ * Hash it by <netns,protocol,addr,port> in ip_vs_svc_table
+ */
+ hash = ip_vs_svc_hashkey(svc->net, svc->af, svc->protocol,
+ &svc->addr, svc->port);
+ hlist_add_head_rcu(&svc->s_list, &ip_vs_svc_table[hash]);
+ } else {
+ /*
+ * Hash it by fwmark in svc_fwm_table
+ */
+ hash = ip_vs_svc_fwm_hashkey(svc->net, svc->fwmark);
+ hlist_add_head_rcu(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
+ }
+
+ svc->flags |= IP_VS_SVC_F_HASHED;
+ /* increase its refcnt because it is referenced by the svc table */
+ atomic_inc(&svc->refcnt);
+ return 1;
+}
+
+
+/*
+ * Unhashes a service from svc_table / svc_fwm_table.
+ * Should be called with locked tables.
+ */
+static int ip_vs_svc_unhash(struct ip_vs_service *svc)
+{
+ if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
+ pr_err("%s(): request for unhash flagged, called from %pF\n",
+ __func__, __builtin_return_address(0));
+ return 0;
+ }
+
+ if (svc->fwmark == 0) {
+ /* Remove it from the svc_table table */
+ hlist_del_rcu(&svc->s_list);
+ } else {
+ /* Remove it from the svc_fwm_table table */
+ hlist_del_rcu(&svc->f_list);
+ }
+
+ svc->flags &= ~IP_VS_SVC_F_HASHED;
+ atomic_dec(&svc->refcnt);
+ return 1;
+}
+
+
+/*
+ * Get service by {netns, proto,addr,port} in the service table.
+ */
+static inline struct ip_vs_service *
+__ip_vs_service_find(struct net *net, int af, __u16 protocol,
+ const union nf_inet_addr *vaddr, __be16 vport)
+{
+ unsigned int hash;
+ struct ip_vs_service *svc;
+
+ /* Check for "full" addressed entries */
+ hash = ip_vs_svc_hashkey(net, af, protocol, vaddr, vport);
+
+ hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[hash], s_list) {
+ if ((svc->af == af)
+ && ip_vs_addr_equal(af, &svc->addr, vaddr)
+ && (svc->port == vport)
+ && (svc->protocol == protocol)
+ && net_eq(svc->net, net)) {
+ /* HIT */
+ return svc;
+ }
+ }
+
+ return NULL;
+}
+
+
+/*
+ * Get service by {fwmark} in the service table.
+ */
+static inline struct ip_vs_service *
+__ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark)
+{
+ unsigned int hash;
+ struct ip_vs_service *svc;
+
+ /* Check for fwmark addressed entries */
+ hash = ip_vs_svc_fwm_hashkey(net, fwmark);
+
+ hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[hash], f_list) {
+ if (svc->fwmark == fwmark && svc->af == af
+ && net_eq(svc->net, net)) {
+ /* HIT */
+ return svc;
+ }
+ }
+
+ return NULL;
+}
+
+/* Find service, called under RCU lock */
+struct ip_vs_service *
+ip_vs_service_find(struct net *net, int af, __u32 fwmark, __u16 protocol,
+ const union nf_inet_addr *vaddr, __be16 vport)
+{
+ struct ip_vs_service *svc;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ /*
+ * Check the table hashed by fwmark first
+ */
+ if (fwmark) {
+ svc = __ip_vs_svc_fwm_find(net, af, fwmark);
+ if (svc)
+ goto out;
+ }
+
+ /*
+ * Check the table hashed by <protocol,addr,port>
+ * for "full" addressed entries
+ */
+ svc = __ip_vs_service_find(net, af, protocol, vaddr, vport);
+
+ if (svc == NULL
+ && protocol == IPPROTO_TCP
+ && atomic_read(&ipvs->ftpsvc_counter)
+ && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
+ /*
+ * Check if ftp service entry exists, the packet
+ * might belong to FTP data connections.
+ */
+ svc = __ip_vs_service_find(net, af, protocol, vaddr, FTPPORT);
+ }
+
+ if (svc == NULL
+ && atomic_read(&ipvs->nullsvc_counter)) {
+ /*
+ * Check if the catch-all port (port zero) exists
+ */
+ svc = __ip_vs_service_find(net, af, protocol, vaddr, 0);
+ }
+
+ out:
+ IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
+ fwmark, ip_vs_proto_name(protocol),
+ IP_VS_DBG_ADDR(af, vaddr), ntohs(vport),
+ svc ? "hit" : "not hit");
+
+ return svc;
+}
+
+
+static inline void
+__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
+{
+ atomic_inc(&svc->refcnt);
+ rcu_assign_pointer(dest->svc, svc);
+}
+
+static void ip_vs_service_free(struct ip_vs_service *svc)
+{
+ free_percpu(svc->stats.cpustats);
+ kfree(svc);
+}
+
+static void ip_vs_service_rcu_free(struct rcu_head *head)
+{
+ struct ip_vs_service *svc;
+
+ svc = container_of(head, struct ip_vs_service, rcu_head);
+ ip_vs_service_free(svc);
+}
+
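+/* Drop one reference to a service; the last put frees it, either after
+ * an RCU grace period (do_delay) or immediately.
+ */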
+static void __ip_vs_svc_put(struct ip_vs_service *svc, bool do_delay)
+{
+ if (atomic_dec_and_test(&svc->refcnt)) {
+ IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n",
+ svc->fwmark,
+ IP_VS_DBG_ADDR(svc->af, &svc->addr),
+ ntohs(svc->port));
+ if (do_delay)
+ call_rcu(&svc->rcu_head, ip_vs_service_rcu_free);
+ else
+ ip_vs_service_free(svc);
+ }
+}
+
+
+/*
+ * Returns hash value for real service
+ */
+static inline unsigned int ip_vs_rs_hashkey(int af,
+ const union nf_inet_addr *addr,
+ __be16 port)
+{
+ register unsigned int porth = ntohs(port);
+ __be32 addr_fold = addr->ip;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ addr_fold = addr->ip6[0]^addr->ip6[1]^
+ addr->ip6[2]^addr->ip6[3];
+#endif
+
+ return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth)
+ & IP_VS_RTAB_MASK;
+}
+
+/* Hash ip_vs_dest in rs_table by <proto,addr,port>. */
+static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
+{
+ unsigned int hash;
+
+ if (dest->in_rs_table)
+ return;
+
+ /*
+ * Hash by proto,addr,port,
+ * which are the parameters of the real service.
+ */
+ hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port);
+
+ hlist_add_head_rcu(&dest->d_list, &ipvs->rs_table[hash]);
+ dest->in_rs_table = 1;
+}
+
+/* Unhash ip_vs_dest from rs_table. */
+static void ip_vs_rs_unhash(struct ip_vs_dest *dest)
+{
+ /*
+ * Remove it from the rs_table table.
+ */
+ if (dest->in_rs_table) {
+ hlist_del_rcu(&dest->d_list);
+ dest->in_rs_table = 0;
+ }
+}
+
+/* Check if real service by <proto,addr,port> is present */
+bool ip_vs_has_real_service(struct net *net, int af, __u16 protocol,
+ const union nf_inet_addr *daddr, __be16 dport)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ unsigned int hash;
+ struct ip_vs_dest *dest;
+
+ /* Check for "full" addressed entries */
+ hash = ip_vs_rs_hashkey(af, daddr, dport);
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) {
+ if (dest->port == dport &&
+ dest->af == af &&
+ ip_vs_addr_equal(af, &dest->addr, daddr) &&
+ (dest->protocol == protocol || dest->vfwmark)) {
+ /* HIT */
+ rcu_read_unlock();
+ return true;
+ }
+ }
+ rcu_read_unlock();
+
+ return false;
+}
+
+/* Lookup destination by {addr,port} in the given service
+ * Called under RCU lock.
+ */
+static struct ip_vs_dest *
+ip_vs_lookup_dest(struct ip_vs_service *svc, int dest_af,
+ const union nf_inet_addr *daddr, __be16 dport)
+{
+ struct ip_vs_dest *dest;
+
+ /*
+ * Find the destination for the given service
+ */
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
+ if ((dest->af == dest_af) &&
+ ip_vs_addr_equal(dest_af, &dest->addr, daddr) &&
+ (dest->port == dport)) {
+ /* HIT */
+ return dest;
+ }
+ }
+
+ return NULL;
+}
+
+/*
+ * Find destination by {daddr,dport,vaddr,protocol}
+ * Created to be used in ip_vs_process_message() in
+ * the backup synchronization daemon. It finds the
+ * destination to be bound to the received connection
+ * on the backup.
+ * Called under RCU lock, no refcnt is returned.
+ */
+struct ip_vs_dest *ip_vs_find_dest(struct net *net, int svc_af, int dest_af,
+ const union nf_inet_addr *daddr,
+ __be16 dport,
+ const union nf_inet_addr *vaddr,
+ __be16 vport, __u16 protocol, __u32 fwmark,
+ __u32 flags)
+{
+ struct ip_vs_dest *dest;
+ struct ip_vs_service *svc;
+ __be16 port = dport;
+
+ svc = ip_vs_service_find(net, svc_af, fwmark, protocol, vaddr, vport);
+ if (!svc)
+ return NULL;
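+	/* For fwmark services not using NAT, try the wildcard port 0 first;
+	 * port ^ dport then flips to the real dport (or back to 0) for the
+	 * second lookup below.
+	 */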
+ if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ)
+ port = 0;
+ dest = ip_vs_lookup_dest(svc, dest_af, daddr, port);
+ if (!dest)
+ dest = ip_vs_lookup_dest(svc, dest_af, daddr, port ^ dport);
+ return dest;
+}
+
+void ip_vs_dest_dst_rcu_free(struct rcu_head *head)
+{
+ struct ip_vs_dest_dst *dest_dst = container_of(head,
+ struct ip_vs_dest_dst,
+ rcu_head);
+
+ dst_release(dest_dst->dst_cache);
+ kfree(dest_dst);
+}
+
+/* Release dest_dst and dst_cache for dest in user context */
+static void __ip_vs_dst_cache_reset(struct ip_vs_dest *dest)
+{
+ struct ip_vs_dest_dst *old;
+
+ old = rcu_dereference_protected(dest->dest_dst, 1);
+ if (old) {
+ RCU_INIT_POINTER(dest->dest_dst, NULL);
+ call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free);
+ }
+}
+
+/*
+ * Lookup dest by {svc,addr,port} in the destination trash.
+ * The destination trash is used to hold the destinations that are removed
+ * from the service table but are still referenced by some conn entries.
+ * The reason for the destination trash is that when the dest is temporarily
+ * down (either by the administrator or by a monitor program), the dest can
+ * be picked back from the trash, the remaining connections to the dest can
+ * continue, and the counting information of the dest is also useful for
+ * scheduling.
+ */
+static struct ip_vs_dest *
+ip_vs_trash_get_dest(struct ip_vs_service *svc, int dest_af,
+ const union nf_inet_addr *daddr, __be16 dport)
+{
+ struct ip_vs_dest *dest;
+ struct netns_ipvs *ipvs = net_ipvs(svc->net);
+
+ /*
+ * Find the destination in trash
+ */
+ spin_lock_bh(&ipvs->dest_trash_lock);
+ list_for_each_entry(dest, &ipvs->dest_trash, t_list) {
+ IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
+ "dest->refcnt=%d\n",
+ dest->vfwmark,
+ IP_VS_DBG_ADDR(dest->af, &dest->addr),
+ ntohs(dest->port),
+ atomic_read(&dest->refcnt));
+ if (dest->af == dest_af &&
+ ip_vs_addr_equal(dest_af, &dest->addr, daddr) &&
+ dest->port == dport &&
+ dest->vfwmark == svc->fwmark &&
+ dest->protocol == svc->protocol &&
+ (svc->fwmark ||
+ (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) &&
+ dest->vport == svc->port))) {
+ /* HIT */
+ list_del(&dest->t_list);
+ ip_vs_dest_hold(dest);
+ goto out;
+ }
+ }
+
+ dest = NULL;
+
+out:
+ spin_unlock_bh(&ipvs->dest_trash_lock);
+
+ return dest;
+}
+
+static void ip_vs_dest_free(struct ip_vs_dest *dest)
+{
+ struct ip_vs_service *svc = rcu_dereference_protected(dest->svc, 1);
+
+ __ip_vs_dst_cache_reset(dest);
+ __ip_vs_svc_put(svc, false);
+ free_percpu(dest->stats.cpustats);
+ ip_vs_dest_put_and_free(dest);
+}
+
+/*
+ * Clean up all the destinations in the trash
+ * Called by the ip_vs_control_cleanup()
+ *
+ * When ip_vs_control_cleanup is activated by ipvs module exit,
+ * the service tables must have been flushed and all the connections
+ * have expired, and the refcnt of each destination in the trash must
+ * be 0, so we simply release them here.
+ */
+static void ip_vs_trash_cleanup(struct net *net)
+{
+ struct ip_vs_dest *dest, *nxt;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ del_timer_sync(&ipvs->dest_trash_timer);
+ /* No need to use dest_trash_lock */
+ list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, t_list) {
+ list_del(&dest->t_list);
+ ip_vs_dest_free(dest);
+ }
+}
+
+static void
+ip_vs_copy_stats(struct ip_vs_kstats *dst, struct ip_vs_stats *src)
+{
+#define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->kstats.c - src->kstats0.c
+
+ spin_lock_bh(&src->lock);
+
+ IP_VS_SHOW_STATS_COUNTER(conns);
+ IP_VS_SHOW_STATS_COUNTER(inpkts);
+ IP_VS_SHOW_STATS_COUNTER(outpkts);
+ IP_VS_SHOW_STATS_COUNTER(inbytes);
+ IP_VS_SHOW_STATS_COUNTER(outbytes);
+
+ ip_vs_read_estimator(dst, src);
+
+ spin_unlock_bh(&src->lock);
+}
+
+static void
+ip_vs_export_stats_user(struct ip_vs_stats_user *dst, struct ip_vs_kstats *src)
+{
+ dst->conns = (u32)src->conns;
+ dst->inpkts = (u32)src->inpkts;
+ dst->outpkts = (u32)src->outpkts;
+ dst->inbytes = src->inbytes;
+ dst->outbytes = src->outbytes;
+ dst->cps = (u32)src->cps;
+ dst->inpps = (u32)src->inpps;
+ dst->outpps = (u32)src->outpps;
+ dst->inbps = (u32)src->inbps;
+ dst->outbps = (u32)src->outbps;
+}
+
+static void
+ip_vs_zero_stats(struct ip_vs_stats *stats)
+{
+ spin_lock_bh(&stats->lock);
+
+ /* get current counters as zero point, rates are zeroed */
+
+#define IP_VS_ZERO_STATS_COUNTER(c) stats->kstats0.c = stats->kstats.c
+
+ IP_VS_ZERO_STATS_COUNTER(conns);
+ IP_VS_ZERO_STATS_COUNTER(inpkts);
+ IP_VS_ZERO_STATS_COUNTER(outpkts);
+ IP_VS_ZERO_STATS_COUNTER(inbytes);
+ IP_VS_ZERO_STATS_COUNTER(outbytes);
+
+ ip_vs_zero_estimator(stats);
+
+ spin_unlock_bh(&stats->lock);
+}
+
+/*
+ * Update a destination in the given service
+ */
+static void
+__ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
+ struct ip_vs_dest_user_kern *udest, int add)
+{
+ struct netns_ipvs *ipvs = net_ipvs(svc->net);
+ struct ip_vs_service *old_svc;
+ struct ip_vs_scheduler *sched;
+ int conn_flags;
+
+ /* We cannot modify an address and change the address family */
+ BUG_ON(!add && udest->af != dest->af);
+
+ if (add && udest->af != svc->af)
+ ipvs->mixed_address_family_dests++;
+
+ /* set the weight and the flags */
+ atomic_set(&dest->weight, udest->weight);
+ conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
+ conn_flags |= IP_VS_CONN_F_INACTIVE;
+
+ /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
+ if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
+ conn_flags |= IP_VS_CONN_F_NOOUTPUT;
+ } else {
+ /*
+ * Put the real service in rs_table if not present.
+ * For now only for NAT!
+ */
+ ip_vs_rs_hash(ipvs, dest);
+ }
+ atomic_set(&dest->conn_flags, conn_flags);
+
+ /* bind the service */
+ old_svc = rcu_dereference_protected(dest->svc, 1);
+ if (!old_svc) {
+ __ip_vs_bind_svc(dest, svc);
+ } else {
+ if (old_svc != svc) {
+ ip_vs_zero_stats(&dest->stats);
+ __ip_vs_bind_svc(dest, svc);
+ __ip_vs_svc_put(old_svc, true);
+ }
+ }
+
+ /* set the dest status flags */
+ dest->flags |= IP_VS_DEST_F_AVAILABLE;
+
+ if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
+ dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
+ dest->u_threshold = udest->u_threshold;
+ dest->l_threshold = udest->l_threshold;
+
+ dest->af = udest->af;
+
+ spin_lock_bh(&dest->dst_lock);
+ __ip_vs_dst_cache_reset(dest);
+ spin_unlock_bh(&dest->dst_lock);
+
+ sched = rcu_dereference_protected(svc->scheduler, 1);
+ if (add) {
+ ip_vs_start_estimator(svc->net, &dest->stats);
+ list_add_rcu(&dest->n_list, &svc->destinations);
+ svc->num_dests++;
+ if (sched->add_dest)
+ sched->add_dest(svc, dest);
+ } else {
+ if (sched->upd_dest)
+ sched->upd_dest(svc, dest);
+ }
+}
+
+
+/*
+ * Create a destination for the given service
+ */
+static int
+ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
+ struct ip_vs_dest **dest_p)
+{
+ struct ip_vs_dest *dest;
+ unsigned int atype, i;
+
+ EnterFunction(2);
+
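+	/* The new destination address must be a usable unicast address or an
+	 * address local to this host; anything else is rejected below.
+	 */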
+#ifdef CONFIG_IP_VS_IPV6
+ if (udest->af == AF_INET6) {
+ atype = ipv6_addr_type(&udest->addr.in6);
+ if ((!(atype & IPV6_ADDR_UNICAST) ||
+ atype & IPV6_ADDR_LINKLOCAL) &&
+ !__ip_vs_addr_is_local_v6(svc->net, &udest->addr.in6))
+ return -EINVAL;
+ } else
+#endif
+ {
+ atype = inet_addr_type(svc->net, udest->addr.ip);
+ if (atype != RTN_LOCAL && atype != RTN_UNICAST)
+ return -EINVAL;
+ }
+
+ dest = kzalloc(sizeof(struct ip_vs_dest), GFP_KERNEL);
+ if (dest == NULL)
+ return -ENOMEM;
+
+ dest->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
+ if (!dest->stats.cpustats)
+ goto err_alloc;
+
+ for_each_possible_cpu(i) {
+ struct ip_vs_cpu_stats *ip_vs_dest_stats;
+ ip_vs_dest_stats = per_cpu_ptr(dest->stats.cpustats, i);
+ u64_stats_init(&ip_vs_dest_stats->syncp);
+ }
+
+ dest->af = udest->af;
+ dest->protocol = svc->protocol;
+ dest->vaddr = svc->addr;
+ dest->vport = svc->port;
+ dest->vfwmark = svc->fwmark;
+ ip_vs_addr_copy(udest->af, &dest->addr, &udest->addr);
+ dest->port = udest->port;
+
+ atomic_set(&dest->activeconns, 0);
+ atomic_set(&dest->inactconns, 0);
+ atomic_set(&dest->persistconns, 0);
+ atomic_set(&dest->refcnt, 1);
+
+ INIT_HLIST_NODE(&dest->d_list);
+ spin_lock_init(&dest->dst_lock);
+ spin_lock_init(&dest->stats.lock);
+ __ip_vs_update_dest(svc, dest, udest, 1);
+
+ *dest_p = dest;
+
+ LeaveFunction(2);
+ return 0;
+
+err_alloc:
+ kfree(dest);
+ return -ENOMEM;
+}
+
+
+/*
+ * Add a destination into an existing service
+ */
+static int
+ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
+{
+ struct ip_vs_dest *dest;
+ union nf_inet_addr daddr;
+ __be16 dport = udest->port;
+ int ret;
+
+ EnterFunction(2);
+
+ if (udest->weight < 0) {
+ pr_err("%s(): server weight less than zero\n", __func__);
+ return -ERANGE;
+ }
+
+ if (udest->l_threshold > udest->u_threshold) {
+ pr_err("%s(): lower threshold is higher than upper threshold\n",
+ __func__);
+ return -ERANGE;
+ }
+
+ ip_vs_addr_copy(udest->af, &daddr, &udest->addr);
+
+ /* We use function that requires RCU lock */
+ rcu_read_lock();
+ dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport);
+ rcu_read_unlock();
+
+ if (dest != NULL) {
+ IP_VS_DBG(1, "%s(): dest already exists\n", __func__);
+ return -EEXIST;
+ }
+
+ /*
+ * Check if the dest already exists in the trash and
+ * is from the same service
+ */
+ dest = ip_vs_trash_get_dest(svc, udest->af, &daddr, dport);
+
+ if (dest != NULL) {
+ IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, "
+ "dest->refcnt=%d, service %u/%s:%u\n",
+ IP_VS_DBG_ADDR(udest->af, &daddr), ntohs(dport),
+ atomic_read(&dest->refcnt),
+ dest->vfwmark,
+ IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
+ ntohs(dest->vport));
+
+ __ip_vs_update_dest(svc, dest, udest, 1);
+ ret = 0;
+ } else {
+ /*
+ * Allocate and initialize the dest structure
+ */
+ ret = ip_vs_new_dest(svc, udest, &dest);
+ }
+ LeaveFunction(2);
+
+ return ret;
+}
+
+
+/*
+ * Edit a destination in the given service
+ */
+static int
+ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
+{
+ struct ip_vs_dest *dest;
+ union nf_inet_addr daddr;
+ __be16 dport = udest->port;
+
+ EnterFunction(2);
+
+ if (udest->weight < 0) {
+ pr_err("%s(): server weight less than zero\n", __func__);
+ return -ERANGE;
+ }
+
+ if (udest->l_threshold > udest->u_threshold) {
+ pr_err("%s(): lower threshold is higher than upper threshold\n",
+ __func__);
+ return -ERANGE;
+ }
+
+ ip_vs_addr_copy(udest->af, &daddr, &udest->addr);
+
+ /* We use function that requires RCU lock */
+ rcu_read_lock();
+ dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport);
+ rcu_read_unlock();
+
+ if (dest == NULL) {
+ IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__);
+ return -ENOENT;
+ }
+
+ __ip_vs_update_dest(svc, dest, udest, 0);
+ LeaveFunction(2);
+
+ return 0;
+}
+
+/*
+ * Delete a destination (must be already unlinked from the service)
+ */
+static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest,
+ bool cleanup)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ ip_vs_stop_estimator(net, &dest->stats);
+
+ /*
+ * Remove it from the d-linked list with the real services.
+ */
+ ip_vs_rs_unhash(dest);
+
+ spin_lock_bh(&ipvs->dest_trash_lock);
+ IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n",
+ IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port),
+ atomic_read(&dest->refcnt));
+ if (list_empty(&ipvs->dest_trash) && !cleanup)
+ mod_timer(&ipvs->dest_trash_timer,
+ jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
+ /* dest lives in trash without reference */
+ list_add(&dest->t_list, &ipvs->dest_trash);
+ dest->idle_start = 0;
+ spin_unlock_bh(&ipvs->dest_trash_lock);
+ ip_vs_dest_put(dest);
+}
+
+
+/*
+ * Unlink a destination from the given service
+ */
+static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
+ struct ip_vs_dest *dest,
+ int svcupd)
+{
+ dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
+
+ /*
+ * Remove it from the d-linked destination list.
+ */
+ list_del_rcu(&dest->n_list);
+ svc->num_dests--;
+
+ if (dest->af != svc->af)
+ net_ipvs(svc->net)->mixed_address_family_dests--;
+
+ if (svcupd) {
+ struct ip_vs_scheduler *sched;
+
+ sched = rcu_dereference_protected(svc->scheduler, 1);
+ if (sched->del_dest)
+ sched->del_dest(svc, dest);
+ }
+}
+
+
+/*
+ * Delete a destination server in the given service
+ */
+static int
+ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
+{
+ struct ip_vs_dest *dest;
+ __be16 dport = udest->port;
+
+ EnterFunction(2);
+
+ /* We use function that requires RCU lock */
+ rcu_read_lock();
+ dest = ip_vs_lookup_dest(svc, udest->af, &udest->addr, dport);
+ rcu_read_unlock();
+
+ if (dest == NULL) {
+ IP_VS_DBG(1, "%s(): destination not found!\n", __func__);
+ return -ENOENT;
+ }
+
+ /*
+ * Unlink dest from the service
+ */
+ __ip_vs_unlink_dest(svc, dest, 1);
+
+ /*
+ * Delete the destination
+ */
+ __ip_vs_del_dest(svc->net, dest, false);
+
+ LeaveFunction(2);
+
+ return 0;
+}
+
+static void ip_vs_dest_trash_expire(unsigned long data)
+{
+ struct net *net = (struct net *) data;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_dest *dest, *next;
+ unsigned long now = jiffies;
+
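+	/* Free only destinations that have stayed unreferenced for a full
+	 * IP_VS_DEST_TRASH_PERIOD; idle_start marks when the refcnt was
+	 * first seen at zero.
+	 */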
+ spin_lock(&ipvs->dest_trash_lock);
+ list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) {
+ if (atomic_read(&dest->refcnt) > 0)
+ continue;
+ if (dest->idle_start) {
+ if (time_before(now, dest->idle_start +
+ IP_VS_DEST_TRASH_PERIOD))
+ continue;
+ } else {
+ dest->idle_start = max(1UL, now);
+ continue;
+ }
+ IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u from trash\n",
+ dest->vfwmark,
+ IP_VS_DBG_ADDR(dest->af, &dest->addr),
+ ntohs(dest->port));
+ list_del(&dest->t_list);
+ ip_vs_dest_free(dest);
+ }
+ if (!list_empty(&ipvs->dest_trash))
+ mod_timer(&ipvs->dest_trash_timer,
+ jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
+ spin_unlock(&ipvs->dest_trash_lock);
+}
+
+/*
+ * Add a service into the service hash table
+ */
+static int
+ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
+ struct ip_vs_service **svc_p)
+{
+ int ret = 0, i;
+ struct ip_vs_scheduler *sched = NULL;
+ struct ip_vs_pe *pe = NULL;
+ struct ip_vs_service *svc = NULL;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ /* increase the module use count */
+ ip_vs_use_count_inc();
+
+ /* Lookup the scheduler by 'u->sched_name' */
+ sched = ip_vs_scheduler_get(u->sched_name);
+ if (sched == NULL) {
+ pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
+ ret = -ENOENT;
+ goto out_err;
+ }
+
+ if (u->pe_name && *u->pe_name) {
+ pe = ip_vs_pe_getbyname(u->pe_name);
+ if (pe == NULL) {
+ pr_info("persistence engine module ip_vs_pe_%s "
+ "not found\n", u->pe_name);
+ ret = -ENOENT;
+ goto out_err;
+ }
+ }
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (u->af == AF_INET6) {
+ __u32 plen = (__force __u32) u->netmask;
+
+ if (plen < 1 || plen > 128) {
+ ret = -EINVAL;
+ goto out_err;
+ }
+ }
+#endif
+
+ svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL);
+ if (svc == NULL) {
+ IP_VS_DBG(1, "%s(): no memory\n", __func__);
+ ret = -ENOMEM;
+ goto out_err;
+ }
+ svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
+ if (!svc->stats.cpustats) {
+ ret = -ENOMEM;
+ goto out_err;
+ }
+
+ for_each_possible_cpu(i) {
+ struct ip_vs_cpu_stats *ip_vs_stats;
+ ip_vs_stats = per_cpu_ptr(svc->stats.cpustats, i);
+ u64_stats_init(&ip_vs_stats->syncp);
+ }
+
+
+ /* I'm the first user of the service */
+ atomic_set(&svc->refcnt, 0);
+
+ svc->af = u->af;
+ svc->protocol = u->protocol;
+ ip_vs_addr_copy(svc->af, &svc->addr, &u->addr);
+ svc->port = u->port;
+ svc->fwmark = u->fwmark;
+ svc->flags = u->flags;
+ svc->timeout = u->timeout * HZ;
+ svc->netmask = u->netmask;
+ svc->net = net;
+
+ INIT_LIST_HEAD(&svc->destinations);
+ spin_lock_init(&svc->sched_lock);
+ spin_lock_init(&svc->stats.lock);
+
+ /* Bind the scheduler */
+ ret = ip_vs_bind_scheduler(svc, sched);
+ if (ret)
+ goto out_err;
+ sched = NULL;
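+	/* the scheduler reference is now owned by svc; clearing the local
+	 * pointer keeps out_err from putting it again
+	 */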
+
+ /* Bind the ct retriever */
+ RCU_INIT_POINTER(svc->pe, pe);
+ pe = NULL;
+
+ /* Update the virtual service counters */
+ if (svc->port == FTPPORT)
+ atomic_inc(&ipvs->ftpsvc_counter);
+ else if (svc->port == 0)
+ atomic_inc(&ipvs->nullsvc_counter);
+
+ ip_vs_start_estimator(net, &svc->stats);
+
+ /* Count only IPv4 services for old get/setsockopt interface */
+ if (svc->af == AF_INET)
+ ipvs->num_services++;
+
+ /* Hash the service into the service table */
+ ip_vs_svc_hash(svc);
+
+ *svc_p = svc;
+ /* Now there is a service - full throttle */
+ ipvs->enable = 1;
+ return 0;
+
+
+ out_err:
+ if (svc != NULL) {
+ ip_vs_unbind_scheduler(svc, sched);
+ ip_vs_service_free(svc);
+ }
+ ip_vs_scheduler_put(sched);
+ ip_vs_pe_put(pe);
+
+ /* decrease the module use count */
+ ip_vs_use_count_dec();
+
+ return ret;
+}
+
+
+/*
+ * Edit a service and bind it with a new scheduler
+ */
+static int
+ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
+{
+ struct ip_vs_scheduler *sched, *old_sched;
+ struct ip_vs_pe *pe = NULL, *old_pe = NULL;
+ int ret = 0;
+
+ /*
+ * Lookup the scheduler, by 'u->sched_name'
+ */
+ sched = ip_vs_scheduler_get(u->sched_name);
+ if (sched == NULL) {
+ pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
+ return -ENOENT;
+ }
+ old_sched = sched;
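+	/* start with old_sched == sched so an early error path puts the
+	 * reference we just took
+	 */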
+
+ if (u->pe_name && *u->pe_name) {
+ pe = ip_vs_pe_getbyname(u->pe_name);
+ if (pe == NULL) {
+ pr_info("persistence engine module ip_vs_pe_%s "
+ "not found\n", u->pe_name);
+ ret = -ENOENT;
+ goto out;
+ }
+ old_pe = pe;
+ }
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (u->af == AF_INET6) {
+ __u32 plen = (__force __u32) u->netmask;
+
+ if (plen < 1 || plen > 128) {
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+#endif
+
+ old_sched = rcu_dereference_protected(svc->scheduler, 1);
+ if (sched != old_sched) {
+ /* Bind the new scheduler */
+ ret = ip_vs_bind_scheduler(svc, sched);
+ if (ret) {
+ old_sched = sched;
+ goto out;
+ }
+ /* Unbind the old scheduler on success */
+ ip_vs_unbind_scheduler(svc, old_sched);
+ }
+
+ /*
+ * Set the flags and timeout value
+ */
+ svc->flags = u->flags | IP_VS_SVC_F_HASHED;
+ svc->timeout = u->timeout * HZ;
+ svc->netmask = u->netmask;
+
+ old_pe = rcu_dereference_protected(svc->pe, 1);
+ if (pe != old_pe)
+ rcu_assign_pointer(svc->pe, pe);
+
+out:
+ ip_vs_scheduler_put(old_sched);
+ ip_vs_pe_put(old_pe);
+ return ret;
+}
+
+/*
+ * Delete a service from the service list
+ * - The service must be unlinked, unlocked and not referenced!
+ * - We are called under _bh lock
+ */
+static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
+{
+ struct ip_vs_dest *dest, *nxt;
+ struct ip_vs_scheduler *old_sched;
+ struct ip_vs_pe *old_pe;
+ struct netns_ipvs *ipvs = net_ipvs(svc->net);
+
+ pr_info("%s: enter\n", __func__);
+
+ /* Count only IPv4 services for old get/setsockopt interface */
+ if (svc->af == AF_INET)
+ ipvs->num_services--;
+
+ ip_vs_stop_estimator(svc->net, &svc->stats);
+
+ /* Unbind scheduler */
+ old_sched = rcu_dereference_protected(svc->scheduler, 1);
+ ip_vs_unbind_scheduler(svc, old_sched);
+ ip_vs_scheduler_put(old_sched);
+
+ /* Unbind persistence engine, keep svc->pe */
+ old_pe = rcu_dereference_protected(svc->pe, 1);
+ ip_vs_pe_put(old_pe);
+
+ /*
+ * Unlink the whole destination list
+ */
+ list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
+ __ip_vs_unlink_dest(svc, dest, 0);
+ __ip_vs_del_dest(svc->net, dest, cleanup);
+ }
+
+ /*
+ * Update the virtual service counters
+ */
+ if (svc->port == FTPPORT)
+ atomic_dec(&ipvs->ftpsvc_counter);
+ else if (svc->port == 0)
+ atomic_dec(&ipvs->nullsvc_counter);
+
+ /*
+ * Free the service if nobody refers to it
+ */
+ __ip_vs_svc_put(svc, true);
+
+ /* decrease the module use count */
+ ip_vs_use_count_dec();
+}
+
+/*
+ * Unlink a service from list and try to delete it if its refcnt reached 0
+ */
+static void ip_vs_unlink_service(struct ip_vs_service *svc, bool cleanup)
+{
+ /* Hold svc to avoid double release from dest_trash */
+ atomic_inc(&svc->refcnt);
+ /*
+ * Unhash it from the service table
+ */
+ ip_vs_svc_unhash(svc);
+
+ __ip_vs_del_service(svc, cleanup);
+}
+
+/*
+ * Delete a service from the service list
+ */
+static int ip_vs_del_service(struct ip_vs_service *svc)
+{
+ if (svc == NULL)
+ return -EEXIST;
+ ip_vs_unlink_service(svc, false);
+
+ return 0;
+}
+
+
+/*
+ * Flush all the virtual services
+ */
+static int ip_vs_flush(struct net *net, bool cleanup)
+{
+ int idx;
+ struct ip_vs_service *svc;
+ struct hlist_node *n;
+
+ /*
+ * Flush the service table hashed by <netns,protocol,addr,port>
+ */
+ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+ hlist_for_each_entry_safe(svc, n, &ip_vs_svc_table[idx],
+ s_list) {
+ if (net_eq(svc->net, net))
+ ip_vs_unlink_service(svc, cleanup);
+ }
+ }
+
+ /*
+ * Flush the service table hashed by fwmark
+ */
+ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+ hlist_for_each_entry_safe(svc, n, &ip_vs_svc_fwm_table[idx],
+ f_list) {
+ if (net_eq(svc->net, net))
+ ip_vs_unlink_service(svc, cleanup);
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Delete service by {netns} in the service table.
+ * Called by __ip_vs_cleanup()
+ */
+void ip_vs_service_net_cleanup(struct net *net)
+{
+ EnterFunction(2);
+ /* Check for "full" addressed entries */
+ mutex_lock(&__ip_vs_mutex);
+ ip_vs_flush(net, true);
+ mutex_unlock(&__ip_vs_mutex);
+ LeaveFunction(2);
+}
+
+/* Put all references for device (dst_cache) */
+static inline void
+ip_vs_forget_dev(struct ip_vs_dest *dest, struct net_device *dev)
+{
+ struct ip_vs_dest_dst *dest_dst;
+
+ spin_lock_bh(&dest->dst_lock);
+ dest_dst = rcu_dereference_protected(dest->dest_dst, 1);
+ if (dest_dst && dest_dst->dst_cache->dev == dev) {
+ IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n",
+ dev->name,
+ IP_VS_DBG_ADDR(dest->af, &dest->addr),
+ ntohs(dest->port),
+ atomic_read(&dest->refcnt));
+ __ip_vs_dst_cache_reset(dest);
+ }
+ spin_unlock_bh(&dest->dst_lock);
+
+}
+/* Netdev event receiver
+ * Currently only NETDEV_DOWN is handled to release refs to cached dsts
+ */
+static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct net *net = dev_net(dev);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_service *svc;
+ struct ip_vs_dest *dest;
+ unsigned int idx;
+
+ if (event != NETDEV_DOWN || !ipvs)
+ return NOTIFY_DONE;
+ IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name);
+ EnterFunction(2);
+ mutex_lock(&__ip_vs_mutex);
+ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+ hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
+ if (net_eq(svc->net, net)) {
+ list_for_each_entry(dest, &svc->destinations,
+ n_list) {
+ ip_vs_forget_dev(dest, dev);
+ }
+ }
+ }
+
+ hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
+ if (net_eq(svc->net, net)) {
+ list_for_each_entry(dest, &svc->destinations,
+ n_list) {
+ ip_vs_forget_dev(dest, dev);
+ }
+ }
+
+ }
+ }
+
+ spin_lock_bh(&ipvs->dest_trash_lock);
+ list_for_each_entry(dest, &ipvs->dest_trash, t_list) {
+ ip_vs_forget_dev(dest, dev);
+ }
+ spin_unlock_bh(&ipvs->dest_trash_lock);
+ mutex_unlock(&__ip_vs_mutex);
+ LeaveFunction(2);
+ return NOTIFY_DONE;
+}
+
+/*
+ * Zero counters in a service or all services
+ */
+static int ip_vs_zero_service(struct ip_vs_service *svc)
+{
+ struct ip_vs_dest *dest;
+
+ list_for_each_entry(dest, &svc->destinations, n_list) {
+ ip_vs_zero_stats(&dest->stats);
+ }
+ ip_vs_zero_stats(&svc->stats);
+ return 0;
+}
+
+static int ip_vs_zero_all(struct net *net)
+{
+ int idx;
+ struct ip_vs_service *svc;
+
+ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+ hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
+ if (net_eq(svc->net, net))
+ ip_vs_zero_service(svc);
+ }
+ }
+
+ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+ hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
+ if (net_eq(svc->net, net))
+ ip_vs_zero_service(svc);
+ }
+ }
+
+ ip_vs_zero_stats(&net_ipvs(net)->tot_stats);
+ return 0;
+}
+
+#ifdef CONFIG_SYSCTL
+
+static int zero;
+static int three = 3;
+
+static int
+proc_do_defense_mode(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct net *net = current->nsproxy->net_ns;
+ int *valp = table->data;
+ int val = *valp;
+ int rc;
+
+ rc = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (write && (*valp != val)) {
+ if ((*valp < 0) || (*valp > 3)) {
+ /* Restore the correct value */
+ *valp = val;
+ } else {
+ update_defense_level(net_ipvs(net));
+ }
+ }
+ return rc;
+}
+
+static int
+proc_do_sync_threshold(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int *valp = table->data;
+ int val[2];
+ int rc;
+
+ /* backup the value first */
+ memcpy(val, valp, sizeof(val));
+
+ rc = proc_dointvec(table, write, buffer, lenp, ppos);
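+	/* reject negative values and a sync threshold that is not strictly
+	 * below the sync period (a period of 0 disables that check)
+	 */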
+ if (write && (valp[0] < 0 || valp[1] < 0 ||
+ (valp[0] >= valp[1] && valp[1]))) {
+ /* Restore the correct value */
+ memcpy(valp, val, sizeof(val));
+ }
+ return rc;
+}
+
+static int
+proc_do_sync_mode(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int *valp = table->data;
+ int val = *valp;
+ int rc;
+
+ rc = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (write && (*valp != val)) {
+ if ((*valp < 0) || (*valp > 1)) {
+ /* Restore the correct value */
+ *valp = val;
+ }
+ }
+ return rc;
+}
+
+static int
+proc_do_sync_ports(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ int *valp = table->data;
+ int val = *valp;
+ int rc;
+
+ rc = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (write && (*valp != val)) {
+ if (*valp < 1 || !is_power_of_2(*valp)) {
+ /* Restore the correct value */
+ *valp = val;
+ }
+ }
+ return rc;
+}
+
+/*
+ * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
+ * Do not change the order or insert new entries without
+ * aligning them with the netns init in ip_vs_control_net_init()
+ */
+
+static struct ctl_table vs_vars[] = {
+ {
+ .procname = "amemthresh",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "am_droprate",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "drop_entry",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_do_defense_mode,
+ },
+ {
+ .procname = "drop_packet",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_do_defense_mode,
+ },
+#ifdef CONFIG_IP_VS_NFCT
+ {
+ .procname = "conntrack",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
+ {
+ .procname = "secure_tcp",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_do_defense_mode,
+ },
+ {
+ .procname = "snat_reroute",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .procname = "sync_version",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_do_sync_mode,
+ },
+ {
+ .procname = "sync_ports",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_do_sync_ports,
+ },
+ {
+ .procname = "sync_persist_mode",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sync_qlen_max",
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {
+ .procname = "sync_sock_size",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "cache_bypass",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "expire_nodest_conn",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sloppy_tcp",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sloppy_sctp",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "expire_quiescent_template",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sync_threshold",
+ .maxlen =
+ sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold),
+ .mode = 0644,
+ .proc_handler = proc_do_sync_threshold,
+ },
+ {
+ .procname = "sync_refresh_period",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "sync_retries",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &three,
+ },
+ {
+ .procname = "nat_icmp_send",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "pmtu_disc",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "backup_only",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "conn_reuse_mode",
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#ifdef CONFIG_IP_VS_DEBUG
+ {
+ .procname = "debug_level",
+ .data = &sysctl_ip_vs_debug_level,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
+ { }
+};
+
+#endif
+
+#ifdef CONFIG_PROC_FS
+
+struct ip_vs_iter {
+	struct seq_net_private p;  /* Do not move this, netns depends upon it */
+ struct hlist_head *table;
+ int bucket;
+};
+
+/*
+ * Write the contents of the VS rule table to a PROCfs file.
+ * (It is kept just for backward compatibility)
+ */
+static inline const char *ip_vs_fwd_name(unsigned int flags)
+{
+ switch (flags & IP_VS_CONN_F_FWD_MASK) {
+ case IP_VS_CONN_F_LOCALNODE:
+ return "Local";
+ case IP_VS_CONN_F_TUNNEL:
+ return "Tunnel";
+ case IP_VS_CONN_F_DROUTE:
+ return "Route";
+ default:
+ return "Masq";
+ }
+}
+
+
+/* Get the Nth entry in the two lists */
+static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
+{
+ struct net *net = seq_file_net(seq);
+ struct ip_vs_iter *iter = seq->private;
+ int idx;
+ struct ip_vs_service *svc;
+
+ /* look in hash by protocol */
+ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+ hlist_for_each_entry_rcu(svc, &ip_vs_svc_table[idx], s_list) {
+ if (net_eq(svc->net, net) && pos-- == 0) {
+ iter->table = ip_vs_svc_table;
+ iter->bucket = idx;
+ return svc;
+ }
+ }
+ }
+
+ /* keep looking in fwmark */
+ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+ hlist_for_each_entry_rcu(svc, &ip_vs_svc_fwm_table[idx],
+ f_list) {
+ if (net_eq(svc->net, net) && pos-- == 0) {
+ iter->table = ip_vs_svc_fwm_table;
+ iter->bucket = idx;
+ return svc;
+ }
+ }
+ }
+
+ return NULL;
+}
+
+static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(RCU)
+{
+ rcu_read_lock();
+ return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
+}
+
+
+static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct hlist_node *e;
+ struct ip_vs_iter *iter;
+ struct ip_vs_service *svc;
+
+ ++*pos;
+ if (v == SEQ_START_TOKEN)
+ return ip_vs_info_array(seq,0);
+
+ svc = v;
+ iter = seq->private;
+
+ if (iter->table == ip_vs_svc_table) {
+ /* next service in table hashed by protocol */
+ e = rcu_dereference(hlist_next_rcu(&svc->s_list));
+ if (e)
+ return hlist_entry(e, struct ip_vs_service, s_list);
+
+ while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
+ hlist_for_each_entry_rcu(svc,
+ &ip_vs_svc_table[iter->bucket],
+ s_list) {
+ return svc;
+ }
+ }
+
+ iter->table = ip_vs_svc_fwm_table;
+ iter->bucket = -1;
+ goto scan_fwmark;
+ }
+
+ /* next service in hashed by fwmark */
+ e = rcu_dereference(hlist_next_rcu(&svc->f_list));
+ if (e)
+ return hlist_entry(e, struct ip_vs_service, f_list);
+
+ scan_fwmark:
+ while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
+ hlist_for_each_entry_rcu(svc,
+ &ip_vs_svc_fwm_table[iter->bucket],
+ f_list)
+ return svc;
+ }
+
+ return NULL;
+}
+
+static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
+{
+ rcu_read_unlock();
+}
+
+
+static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
+{
+ if (v == SEQ_START_TOKEN) {
+ seq_printf(seq,
+ "IP Virtual Server version %d.%d.%d (size=%d)\n",
+ NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
+ seq_puts(seq,
+ "Prot LocalAddress:Port Scheduler Flags\n");
+ seq_puts(seq,
+ " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
+ } else {
+ const struct ip_vs_service *svc = v;
+ const struct ip_vs_iter *iter = seq->private;
+ const struct ip_vs_dest *dest;
+ struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler);
+
+ if (iter->table == ip_vs_svc_table) {
+#ifdef CONFIG_IP_VS_IPV6
+ if (svc->af == AF_INET6)
+ seq_printf(seq, "%s [%pI6]:%04X %s ",
+ ip_vs_proto_name(svc->protocol),
+ &svc->addr.in6,
+ ntohs(svc->port),
+ sched->name);
+ else
+#endif
+ seq_printf(seq, "%s %08X:%04X %s %s ",
+ ip_vs_proto_name(svc->protocol),
+ ntohl(svc->addr.ip),
+ ntohs(svc->port),
+ sched->name,
+ (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
+ } else {
+ seq_printf(seq, "FWM %08X %s %s",
+ svc->fwmark, sched->name,
+ (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
+ }
+
+ if (svc->flags & IP_VS_SVC_F_PERSISTENT)
+ seq_printf(seq, "persistent %d %08X\n",
+ svc->timeout,
+ ntohl(svc->netmask));
+ else
+ seq_putc(seq, '\n');
+
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
+#ifdef CONFIG_IP_VS_IPV6
+ if (dest->af == AF_INET6)
+ seq_printf(seq,
+ " -> [%pI6]:%04X"
+ " %-7s %-6d %-10d %-10d\n",
+ &dest->addr.in6,
+ ntohs(dest->port),
+ ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
+ atomic_read(&dest->weight),
+ atomic_read(&dest->activeconns),
+ atomic_read(&dest->inactconns));
+ else
+#endif
+ seq_printf(seq,
+ " -> %08X:%04X "
+ "%-7s %-6d %-10d %-10d\n",
+ ntohl(dest->addr.ip),
+ ntohs(dest->port),
+ ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
+ atomic_read(&dest->weight),
+ atomic_read(&dest->activeconns),
+ atomic_read(&dest->inactconns));
+
+ }
+ }
+ return 0;
+}
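+
+/* Editor's note, illustrative only (not part of the original source): with
+ * one TCP virtual service and two NAT destinations, the show callback above
+ * renders /proc/net/ip_vs roughly as follows; the addresses, version and
+ * table size are example values that depend on the build and configuration:
+ *
+ *	IP Virtual Server version 1.2.1 (size=4096)
+ *	Prot LocalAddress:Port Scheduler Flags
+ *	  -> RemoteAddress:Port Forward Weight ActiveConn InActConn
+ *	TCP  C0A80001:0050 rr
+ *	  -> 0A000001:0050      Masq    1      0          0
+ *	  -> 0A000002:0050      Masq    1      0          0
+ */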
+
+static const struct seq_operations ip_vs_info_seq_ops = {
+ .start = ip_vs_info_seq_start,
+ .next = ip_vs_info_seq_next,
+ .stop = ip_vs_info_seq_stop,
+ .show = ip_vs_info_seq_show,
+};
+
+static int ip_vs_info_open(struct inode *inode, struct file *file)
+{
+ return seq_open_net(inode, file, &ip_vs_info_seq_ops,
+ sizeof(struct ip_vs_iter));
+}
+
+static const struct file_operations ip_vs_info_fops = {
+ .owner = THIS_MODULE,
+ .open = ip_vs_info_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_net,
+};
+
+static int ip_vs_stats_show(struct seq_file *seq, void *v)
+{
+ struct net *net = seq_file_single_net(seq);
+ struct ip_vs_kstats show;
+
+/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
+ seq_puts(seq,
+ " Total Incoming Outgoing Incoming Outgoing\n");
+ seq_printf(seq,
+ " Conns Packets Packets Bytes Bytes\n");
+
+ ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats);
+ seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n\n",
+ (unsigned long long)show.conns,
+ (unsigned long long)show.inpkts,
+ (unsigned long long)show.outpkts,
+ (unsigned long long)show.inbytes,
+ (unsigned long long)show.outbytes);
+
+/* 01234567 01234567 01234567 0123456701234567 0123456701234567*/
+ seq_puts(seq,
+ " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
+ seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n",
+ (unsigned long long)show.cps,
+ (unsigned long long)show.inpps,
+ (unsigned long long)show.outpps,
+ (unsigned long long)show.inbps,
+ (unsigned long long)show.outbps);
+
+ return 0;
+}
+
+static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
+{
+ return single_open_net(inode, file, ip_vs_stats_show);
+}
+
+static const struct file_operations ip_vs_stats_fops = {
+ .owner = THIS_MODULE,
+ .open = ip_vs_stats_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release_net,
+};
+
+static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
+{
+ struct net *net = seq_file_single_net(seq);
+ struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats;
+ struct ip_vs_cpu_stats __percpu *cpustats = tot_stats->cpustats;
+ struct ip_vs_kstats kstats;
+ int i;
+
+/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
+ seq_puts(seq,
+ " Total Incoming Outgoing Incoming Outgoing\n");
+ seq_printf(seq,
+ "CPU Conns Packets Packets Bytes Bytes\n");
+
+ for_each_possible_cpu(i) {
+ struct ip_vs_cpu_stats *u = per_cpu_ptr(cpustats, i);
+ unsigned int start;
+ u64 conns, inpkts, outpkts, inbytes, outbytes;
+
+ do {
+ start = u64_stats_fetch_begin_irq(&u->syncp);
+ conns = u->cnt.conns;
+ inpkts = u->cnt.inpkts;
+ outpkts = u->cnt.outpkts;
+ inbytes = u->cnt.inbytes;
+ outbytes = u->cnt.outbytes;
+ } while (u64_stats_fetch_retry_irq(&u->syncp, start));
+
+ seq_printf(seq, "%3X %8LX %8LX %8LX %16LX %16LX\n",
+ i, (u64)conns, (u64)inpkts,
+ (u64)outpkts, (u64)inbytes,
+ (u64)outbytes);
+ }
+
+ ip_vs_copy_stats(&kstats, tot_stats);
+
+ seq_printf(seq, " ~ %8LX %8LX %8LX %16LX %16LX\n\n",
+ (unsigned long long)kstats.conns,
+ (unsigned long long)kstats.inpkts,
+ (unsigned long long)kstats.outpkts,
+ (unsigned long long)kstats.inbytes,
+ (unsigned long long)kstats.outbytes);
+
+/* ... 01234567 01234567 01234567 0123456701234567 0123456701234567 */
+ seq_puts(seq,
+ " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
+ seq_printf(seq, " %8LX %8LX %8LX %16LX %16LX\n",
+ kstats.cps,
+ kstats.inpps,
+ kstats.outpps,
+ kstats.inbps,
+ kstats.outbps);
+
+ return 0;
+}
+
+static int ip_vs_stats_percpu_seq_open(struct inode *inode, struct file *file)
+{
+ return single_open_net(inode, file, ip_vs_stats_percpu_show);
+}
+
+static const struct file_operations ip_vs_stats_percpu_fops = {
+ .owner = THIS_MODULE,
+ .open = ip_vs_stats_percpu_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release_net,
+};
+#endif
+
+/*
+ *	Set timeout values for tcp, tcpfin and udp in the timeout_table.
+ */
+static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u)
+{
+#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
+ struct ip_vs_proto_data *pd;
+#endif
+
+ IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
+ u->tcp_timeout,
+ u->tcp_fin_timeout,
+ u->udp_timeout);
+
+#ifdef CONFIG_IP_VS_PROTO_TCP
+ if (u->tcp_timeout) {
+ pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+ pd->timeout_table[IP_VS_TCP_S_ESTABLISHED]
+ = u->tcp_timeout * HZ;
+ }
+
+ if (u->tcp_fin_timeout) {
+ pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+ pd->timeout_table[IP_VS_TCP_S_FIN_WAIT]
+ = u->tcp_fin_timeout * HZ;
+ }
+#endif
+
+#ifdef CONFIG_IP_VS_PROTO_UDP
+ if (u->udp_timeout) {
+ pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
+ pd->timeout_table[IP_VS_UDP_S_NORMAL]
+ = u->udp_timeout * HZ;
+ }
+#endif
+ return 0;
+}
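+
+/* Editor's note, illustrative only (not part of the original source): this
+ * function is reached from userspace via the IP_VS_SO_SET_TIMEOUT sockopt
+ * dispatched by do_ip_vs_set_ctl() below.  A minimal hedged sketch of such
+ * a caller (requires CAP_NET_ADMIN; the timeout values are just examples,
+ * error handling omitted):
+ *
+ *	struct ip_vs_timeout_user to = {
+ *		.tcp_timeout = 900,
+ *		.tcp_fin_timeout = 120,
+ *		.udp_timeout = 300,
+ *	};
+ *	int fd = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
+ *
+ *	setsockopt(fd, IPPROTO_IP, IP_VS_SO_SET_TIMEOUT, &to, sizeof(to));
+ */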
+
+#define CMDID(cmd) (cmd - IP_VS_BASE_CTL)
+
+struct ip_vs_svcdest_user {
+ struct ip_vs_service_user s;
+ struct ip_vs_dest_user d;
+};
+
+static const unsigned char set_arglen[CMDID(IP_VS_SO_SET_MAX) + 1] = {
+ [CMDID(IP_VS_SO_SET_ADD)] = sizeof(struct ip_vs_service_user),
+ [CMDID(IP_VS_SO_SET_EDIT)] = sizeof(struct ip_vs_service_user),
+ [CMDID(IP_VS_SO_SET_DEL)] = sizeof(struct ip_vs_service_user),
+ [CMDID(IP_VS_SO_SET_ADDDEST)] = sizeof(struct ip_vs_svcdest_user),
+ [CMDID(IP_VS_SO_SET_DELDEST)] = sizeof(struct ip_vs_svcdest_user),
+ [CMDID(IP_VS_SO_SET_EDITDEST)] = sizeof(struct ip_vs_svcdest_user),
+ [CMDID(IP_VS_SO_SET_TIMEOUT)] = sizeof(struct ip_vs_timeout_user),
+ [CMDID(IP_VS_SO_SET_STARTDAEMON)] = sizeof(struct ip_vs_daemon_user),
+ [CMDID(IP_VS_SO_SET_STOPDAEMON)] = sizeof(struct ip_vs_daemon_user),
+ [CMDID(IP_VS_SO_SET_ZERO)] = sizeof(struct ip_vs_service_user),
+};
+
+union ip_vs_set_arglen {
+ struct ip_vs_service_user field_IP_VS_SO_SET_ADD;
+ struct ip_vs_service_user field_IP_VS_SO_SET_EDIT;
+ struct ip_vs_service_user field_IP_VS_SO_SET_DEL;
+ struct ip_vs_svcdest_user field_IP_VS_SO_SET_ADDDEST;
+ struct ip_vs_svcdest_user field_IP_VS_SO_SET_DELDEST;
+ struct ip_vs_svcdest_user field_IP_VS_SO_SET_EDITDEST;
+ struct ip_vs_timeout_user field_IP_VS_SO_SET_TIMEOUT;
+ struct ip_vs_daemon_user field_IP_VS_SO_SET_STARTDAEMON;
+ struct ip_vs_daemon_user field_IP_VS_SO_SET_STOPDAEMON;
+ struct ip_vs_service_user field_IP_VS_SO_SET_ZERO;
+};
+
+#define MAX_SET_ARGLEN sizeof(union ip_vs_set_arglen)
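+
+/* Editor's note, illustrative only (not part of the original source): for
+ * the *DEST commands the kernel expects a struct ip_vs_service_user
+ * immediately followed by a struct ip_vs_dest_user, i.e. the layout of
+ * struct ip_vs_svcdest_user above.  That struct is local to this file, so
+ * userspace would define an equivalent layout; a hedged sketch in which
+ * fd, vip and rip are placeholder values:
+ *
+ *	struct ip_vs_svcdest_user req = {
+ *		.s = { .protocol = IPPROTO_TCP, .addr = vip, .port = htons(80) },
+ *		.d = { .addr = rip, .port = htons(80), .weight = 1 },
+ *	};
+ *
+ *	setsockopt(fd, IPPROTO_IP, IP_VS_SO_SET_ADDDEST, &req, sizeof(req));
+ */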
+
+static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
+ struct ip_vs_service_user *usvc_compat)
+{
+ memset(usvc, 0, sizeof(*usvc));
+
+ usvc->af = AF_INET;
+ usvc->protocol = usvc_compat->protocol;
+ usvc->addr.ip = usvc_compat->addr;
+ usvc->port = usvc_compat->port;
+ usvc->fwmark = usvc_compat->fwmark;
+
+ /* Deep copy of sched_name is not needed here */
+ usvc->sched_name = usvc_compat->sched_name;
+
+ usvc->flags = usvc_compat->flags;
+ usvc->timeout = usvc_compat->timeout;
+ usvc->netmask = usvc_compat->netmask;
+}
+
+static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
+ struct ip_vs_dest_user *udest_compat)
+{
+ memset(udest, 0, sizeof(*udest));
+
+ udest->addr.ip = udest_compat->addr;
+ udest->port = udest_compat->port;
+ udest->conn_flags = udest_compat->conn_flags;
+ udest->weight = udest_compat->weight;
+ udest->u_threshold = udest_compat->u_threshold;
+ udest->l_threshold = udest_compat->l_threshold;
+ udest->af = AF_INET;
+}
+
+static int
+do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
+{
+ struct net *net = sock_net(sk);
+ int ret;
+ unsigned char arg[MAX_SET_ARGLEN];
+ struct ip_vs_service_user *usvc_compat;
+ struct ip_vs_service_user_kern usvc;
+ struct ip_vs_service *svc;
+ struct ip_vs_dest_user *udest_compat;
+ struct ip_vs_dest_user_kern udest;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ BUILD_BUG_ON(sizeof(arg) > 255);
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX)
+ return -EINVAL;
+ if (len != set_arglen[CMDID(cmd)]) {
+ IP_VS_DBG(1, "set_ctl: len %u != %u\n",
+ len, set_arglen[CMDID(cmd)]);
+ return -EINVAL;
+ }
+
+ if (copy_from_user(arg, user, len) != 0)
+ return -EFAULT;
+
+ /* increase the module use count */
+ ip_vs_use_count_inc();
+
+ /* Handle daemons since they have another lock */
+ if (cmd == IP_VS_SO_SET_STARTDAEMON ||
+ cmd == IP_VS_SO_SET_STOPDAEMON) {
+ struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
+
+ mutex_lock(&ipvs->sync_mutex);
+ if (cmd == IP_VS_SO_SET_STARTDAEMON)
+ ret = start_sync_thread(net, dm->state, dm->mcast_ifn,
+ dm->syncid);
+ else
+ ret = stop_sync_thread(net, dm->state);
+ mutex_unlock(&ipvs->sync_mutex);
+ goto out_dec;
+ }
+
+ mutex_lock(&__ip_vs_mutex);
+ if (cmd == IP_VS_SO_SET_FLUSH) {
+ /* Flush the virtual service */
+ ret = ip_vs_flush(net, false);
+ goto out_unlock;
+ } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
+		/* Set timeout values for (tcp, tcpfin, udp) */
+ ret = ip_vs_set_timeout(net, (struct ip_vs_timeout_user *)arg);
+ goto out_unlock;
+ }
+
+ usvc_compat = (struct ip_vs_service_user *)arg;
+ udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1);
+
+ /* We only use the new structs internally, so copy userspace compat
+ * structs to extended internal versions */
+ ip_vs_copy_usvc_compat(&usvc, usvc_compat);
+ ip_vs_copy_udest_compat(&udest, udest_compat);
+
+ if (cmd == IP_VS_SO_SET_ZERO) {
+ /* if no service address is set, zero counters in all */
+ if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) {
+ ret = ip_vs_zero_all(net);
+ goto out_unlock;
+ }
+ }
+
+ /* Check for valid protocol: TCP or UDP or SCTP, even for fwmark!=0 */
+ if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP &&
+ usvc.protocol != IPPROTO_SCTP) {
+ pr_err("set_ctl: invalid protocol: %d %pI4:%d %s\n",
+ usvc.protocol, &usvc.addr.ip,
+ ntohs(usvc.port), usvc.sched_name);
+ ret = -EFAULT;
+ goto out_unlock;
+ }
+
+ /* Lookup the exact service by <protocol, addr, port> or fwmark */
+ rcu_read_lock();
+ if (usvc.fwmark == 0)
+ svc = __ip_vs_service_find(net, usvc.af, usvc.protocol,
+ &usvc.addr, usvc.port);
+ else
+ svc = __ip_vs_svc_fwm_find(net, usvc.af, usvc.fwmark);
+ rcu_read_unlock();
+
+ if (cmd != IP_VS_SO_SET_ADD
+ && (svc == NULL || svc->protocol != usvc.protocol)) {
+ ret = -ESRCH;
+ goto out_unlock;
+ }
+
+ switch (cmd) {
+ case IP_VS_SO_SET_ADD:
+ if (svc != NULL)
+ ret = -EEXIST;
+ else
+ ret = ip_vs_add_service(net, &usvc, &svc);
+ break;
+ case IP_VS_SO_SET_EDIT:
+ ret = ip_vs_edit_service(svc, &usvc);
+ break;
+ case IP_VS_SO_SET_DEL:
+ ret = ip_vs_del_service(svc);
+ if (!ret)
+ goto out_unlock;
+ break;
+ case IP_VS_SO_SET_ZERO:
+ ret = ip_vs_zero_service(svc);
+ break;
+ case IP_VS_SO_SET_ADDDEST:
+ ret = ip_vs_add_dest(svc, &udest);
+ break;
+ case IP_VS_SO_SET_EDITDEST:
+ ret = ip_vs_edit_dest(svc, &udest);
+ break;
+ case IP_VS_SO_SET_DELDEST:
+ ret = ip_vs_del_dest(svc, &udest);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+ out_unlock:
+ mutex_unlock(&__ip_vs_mutex);
+ out_dec:
+ /* decrease the module use count */
+ ip_vs_use_count_dec();
+
+ return ret;
+}
+
+
+static void
+ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
+{
+ struct ip_vs_scheduler *sched;
+ struct ip_vs_kstats kstats;
+
+ sched = rcu_dereference_protected(src->scheduler, 1);
+ dst->protocol = src->protocol;
+ dst->addr = src->addr.ip;
+ dst->port = src->port;
+ dst->fwmark = src->fwmark;
+ strlcpy(dst->sched_name, sched->name, sizeof(dst->sched_name));
+ dst->flags = src->flags;
+ dst->timeout = src->timeout / HZ;
+ dst->netmask = src->netmask;
+ dst->num_dests = src->num_dests;
+ ip_vs_copy_stats(&kstats, &src->stats);
+ ip_vs_export_stats_user(&dst->stats, &kstats);
+}
+
+static inline int
+__ip_vs_get_service_entries(struct net *net,
+ const struct ip_vs_get_services *get,
+ struct ip_vs_get_services __user *uptr)
+{
+ int idx, count=0;
+ struct ip_vs_service *svc;
+ struct ip_vs_service_entry entry;
+ int ret = 0;
+
+ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+ hlist_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
+ /* Only expose IPv4 entries to old interface */
+ if (svc->af != AF_INET || !net_eq(svc->net, net))
+ continue;
+
+ if (count >= get->num_services)
+ goto out;
+ memset(&entry, 0, sizeof(entry));
+ ip_vs_copy_service(&entry, svc);
+ if (copy_to_user(&uptr->entrytable[count],
+ &entry, sizeof(entry))) {
+ ret = -EFAULT;
+ goto out;
+ }
+ count++;
+ }
+ }
+
+ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+ hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
+ /* Only expose IPv4 entries to old interface */
+ if (svc->af != AF_INET || !net_eq(svc->net, net))
+ continue;
+
+ if (count >= get->num_services)
+ goto out;
+ memset(&entry, 0, sizeof(entry));
+ ip_vs_copy_service(&entry, svc);
+ if (copy_to_user(&uptr->entrytable[count],
+ &entry, sizeof(entry))) {
+ ret = -EFAULT;
+ goto out;
+ }
+ count++;
+ }
+ }
+out:
+ return ret;
+}
+
+static inline int
+__ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get,
+ struct ip_vs_get_dests __user *uptr)
+{
+ struct ip_vs_service *svc;
+ union nf_inet_addr addr = { .ip = get->addr };
+ int ret = 0;
+
+ rcu_read_lock();
+ if (get->fwmark)
+ svc = __ip_vs_svc_fwm_find(net, AF_INET, get->fwmark);
+ else
+ svc = __ip_vs_service_find(net, AF_INET, get->protocol, &addr,
+ get->port);
+ rcu_read_unlock();
+
+ if (svc) {
+ int count = 0;
+ struct ip_vs_dest *dest;
+ struct ip_vs_dest_entry entry;
+ struct ip_vs_kstats kstats;
+
+ memset(&entry, 0, sizeof(entry));
+ list_for_each_entry(dest, &svc->destinations, n_list) {
+ if (count >= get->num_dests)
+ break;
+
+ /* Cannot expose heterogeneous members via sockopt
+ * interface
+ */
+ if (dest->af != svc->af)
+ continue;
+
+ entry.addr = dest->addr.ip;
+ entry.port = dest->port;
+ entry.conn_flags = atomic_read(&dest->conn_flags);
+ entry.weight = atomic_read(&dest->weight);
+ entry.u_threshold = dest->u_threshold;
+ entry.l_threshold = dest->l_threshold;
+ entry.activeconns = atomic_read(&dest->activeconns);
+ entry.inactconns = atomic_read(&dest->inactconns);
+ entry.persistconns = atomic_read(&dest->persistconns);
+ ip_vs_copy_stats(&kstats, &dest->stats);
+ ip_vs_export_stats_user(&entry.stats, &kstats);
+ if (copy_to_user(&uptr->entrytable[count],
+ &entry, sizeof(entry))) {
+ ret = -EFAULT;
+ break;
+ }
+ count++;
+ }
+ } else
+ ret = -ESRCH;
+ return ret;
+}
+
+static inline void
+__ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u)
+{
+#if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
+ struct ip_vs_proto_data *pd;
+#endif
+
+ memset(u, 0, sizeof (*u));
+
+#ifdef CONFIG_IP_VS_PROTO_TCP
+ pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+ u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
+ u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
+#endif
+#ifdef CONFIG_IP_VS_PROTO_UDP
+ pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
+ u->udp_timeout =
+ pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
+#endif
+}
+
+static const unsigned char get_arglen[CMDID(IP_VS_SO_GET_MAX) + 1] = {
+ [CMDID(IP_VS_SO_GET_VERSION)] = 64,
+ [CMDID(IP_VS_SO_GET_INFO)] = sizeof(struct ip_vs_getinfo),
+ [CMDID(IP_VS_SO_GET_SERVICES)] = sizeof(struct ip_vs_get_services),
+ [CMDID(IP_VS_SO_GET_SERVICE)] = sizeof(struct ip_vs_service_entry),
+ [CMDID(IP_VS_SO_GET_DESTS)] = sizeof(struct ip_vs_get_dests),
+ [CMDID(IP_VS_SO_GET_TIMEOUT)] = sizeof(struct ip_vs_timeout_user),
+ [CMDID(IP_VS_SO_GET_DAEMON)] = 2 * sizeof(struct ip_vs_daemon_user),
+};
+
+union ip_vs_get_arglen {
+ char field_IP_VS_SO_GET_VERSION[64];
+ struct ip_vs_getinfo field_IP_VS_SO_GET_INFO;
+ struct ip_vs_get_services field_IP_VS_SO_GET_SERVICES;
+ struct ip_vs_service_entry field_IP_VS_SO_GET_SERVICE;
+ struct ip_vs_get_dests field_IP_VS_SO_GET_DESTS;
+ struct ip_vs_timeout_user field_IP_VS_SO_GET_TIMEOUT;
+ struct ip_vs_daemon_user field_IP_VS_SO_GET_DAEMON[2];
+};
+
+#define MAX_GET_ARGLEN sizeof(union ip_vs_get_arglen)
+
+static int
+do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
+{
+ unsigned char arg[MAX_GET_ARGLEN];
+ int ret = 0;
+ unsigned int copylen;
+ struct net *net = sock_net(sk);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ BUG_ON(!net);
+ BUILD_BUG_ON(sizeof(arg) > 255);
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX)
+ return -EINVAL;
+
+ copylen = get_arglen[CMDID(cmd)];
+ if (*len < (int) copylen) {
+ IP_VS_DBG(1, "get_ctl: len %d < %u\n", *len, copylen);
+ return -EINVAL;
+ }
+
+ if (copy_from_user(arg, user, copylen) != 0)
+ return -EFAULT;
+ /*
+	 * Handle daemons first since they use their own locking
+ */
+ if (cmd == IP_VS_SO_GET_DAEMON) {
+ struct ip_vs_daemon_user d[2];
+
+ memset(&d, 0, sizeof(d));
+ mutex_lock(&ipvs->sync_mutex);
+ if (ipvs->sync_state & IP_VS_STATE_MASTER) {
+ d[0].state = IP_VS_STATE_MASTER;
+ strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn,
+ sizeof(d[0].mcast_ifn));
+ d[0].syncid = ipvs->master_syncid;
+ }
+ if (ipvs->sync_state & IP_VS_STATE_BACKUP) {
+ d[1].state = IP_VS_STATE_BACKUP;
+ strlcpy(d[1].mcast_ifn, ipvs->backup_mcast_ifn,
+ sizeof(d[1].mcast_ifn));
+ d[1].syncid = ipvs->backup_syncid;
+ }
+ if (copy_to_user(user, &d, sizeof(d)) != 0)
+ ret = -EFAULT;
+ mutex_unlock(&ipvs->sync_mutex);
+ return ret;
+ }
+
+ mutex_lock(&__ip_vs_mutex);
+ switch (cmd) {
+ case IP_VS_SO_GET_VERSION:
+ {
+ char buf[64];
+
+ sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
+ NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
+ if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
+ ret = -EFAULT;
+ goto out;
+ }
+ *len = strlen(buf)+1;
+ }
+ break;
+
+ case IP_VS_SO_GET_INFO:
+ {
+ struct ip_vs_getinfo info;
+ info.version = IP_VS_VERSION_CODE;
+ info.size = ip_vs_conn_tab_size;
+ info.num_services = ipvs->num_services;
+ if (copy_to_user(user, &info, sizeof(info)) != 0)
+ ret = -EFAULT;
+ }
+ break;
+
+ case IP_VS_SO_GET_SERVICES:
+ {
+ struct ip_vs_get_services *get;
+ int size;
+
+ get = (struct ip_vs_get_services *)arg;
+ size = sizeof(*get) +
+ sizeof(struct ip_vs_service_entry) * get->num_services;
+ if (*len != size) {
+ pr_err("length: %u != %u\n", *len, size);
+ ret = -EINVAL;
+ goto out;
+ }
+ ret = __ip_vs_get_service_entries(net, get, user);
+ }
+ break;
+
+ case IP_VS_SO_GET_SERVICE:
+ {
+ struct ip_vs_service_entry *entry;
+ struct ip_vs_service *svc;
+ union nf_inet_addr addr;
+
+ entry = (struct ip_vs_service_entry *)arg;
+ addr.ip = entry->addr;
+ rcu_read_lock();
+ if (entry->fwmark)
+ svc = __ip_vs_svc_fwm_find(net, AF_INET, entry->fwmark);
+ else
+ svc = __ip_vs_service_find(net, AF_INET,
+ entry->protocol, &addr,
+ entry->port);
+ rcu_read_unlock();
+ if (svc) {
+ ip_vs_copy_service(entry, svc);
+ if (copy_to_user(user, entry, sizeof(*entry)) != 0)
+ ret = -EFAULT;
+ } else
+ ret = -ESRCH;
+ }
+ break;
+
+ case IP_VS_SO_GET_DESTS:
+ {
+ struct ip_vs_get_dests *get;
+ int size;
+
+ get = (struct ip_vs_get_dests *)arg;
+ size = sizeof(*get) +
+ sizeof(struct ip_vs_dest_entry) * get->num_dests;
+ if (*len != size) {
+ pr_err("length: %u != %u\n", *len, size);
+ ret = -EINVAL;
+ goto out;
+ }
+ ret = __ip_vs_get_dest_entries(net, get, user);
+ }
+ break;
+
+ case IP_VS_SO_GET_TIMEOUT:
+ {
+ struct ip_vs_timeout_user t;
+
+ __ip_vs_get_timeouts(net, &t);
+ if (copy_to_user(user, &t, sizeof(t)) != 0)
+ ret = -EFAULT;
+ }
+ break;
+
+ default:
+ ret = -EINVAL;
+ }
+
+out:
+ mutex_unlock(&__ip_vs_mutex);
+ return ret;
+}
+
+
+static struct nf_sockopt_ops ip_vs_sockopts = {
+ .pf = PF_INET,
+ .set_optmin = IP_VS_BASE_CTL,
+ .set_optmax = IP_VS_SO_SET_MAX+1,
+ .set = do_ip_vs_set_ctl,
+ .get_optmin = IP_VS_BASE_CTL,
+ .get_optmax = IP_VS_SO_GET_MAX+1,
+ .get = do_ip_vs_get_ctl,
+ .owner = THIS_MODULE,
+};
+
+/*
+ * Generic Netlink interface
+ */
+
+/* IPVS genetlink family */
+static struct genl_family ip_vs_genl_family = {
+ .id = GENL_ID_GENERATE,
+ .hdrsize = 0,
+ .name = IPVS_GENL_NAME,
+ .version = IPVS_GENL_VERSION,
+ .maxattr = IPVS_CMD_MAX,
+	.netnsok	= true,         /* Make ipvsadm work with netns */
+};
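+
+/* Editor's note, illustrative only (not part of the original source):
+ * userspace talks to this family over generic netlink.  A minimal query
+ * sketch assuming the external libnl-3/libnl-genl-3 library (error
+ * handling and attribute parsing omitted):
+ *
+ *	struct nl_sock *sk = nl_socket_alloc();
+ *	genl_connect(sk);
+ *	int fam = genl_ctrl_resolve(sk, IPVS_GENL_NAME);
+ *	struct nl_msg *msg = nlmsg_alloc();
+ *	genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, fam, 0,
+ *		    NLM_F_REQUEST, IPVS_CMD_GET_INFO, IPVS_GENL_VERSION);
+ *	nl_send_auto(sk, msg);
+ *	nl_recvmsgs_default(sk);
+ */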
+
+/* Policy used for first-level command attributes */
+static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = {
+ [IPVS_CMD_ATTR_SERVICE] = { .type = NLA_NESTED },
+ [IPVS_CMD_ATTR_DEST] = { .type = NLA_NESTED },
+ [IPVS_CMD_ATTR_DAEMON] = { .type = NLA_NESTED },
+ [IPVS_CMD_ATTR_TIMEOUT_TCP] = { .type = NLA_U32 },
+ [IPVS_CMD_ATTR_TIMEOUT_TCP_FIN] = { .type = NLA_U32 },
+ [IPVS_CMD_ATTR_TIMEOUT_UDP] = { .type = NLA_U32 },
+};
+
+/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */
+static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = {
+ [IPVS_DAEMON_ATTR_STATE] = { .type = NLA_U32 },
+ [IPVS_DAEMON_ATTR_MCAST_IFN] = { .type = NLA_NUL_STRING,
+ .len = IP_VS_IFNAME_MAXLEN },
+ [IPVS_DAEMON_ATTR_SYNC_ID] = { .type = NLA_U32 },
+};
+
+/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */
+static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
+ [IPVS_SVC_ATTR_AF] = { .type = NLA_U16 },
+ [IPVS_SVC_ATTR_PROTOCOL] = { .type = NLA_U16 },
+ [IPVS_SVC_ATTR_ADDR] = { .type = NLA_BINARY,
+ .len = sizeof(union nf_inet_addr) },
+ [IPVS_SVC_ATTR_PORT] = { .type = NLA_U16 },
+ [IPVS_SVC_ATTR_FWMARK] = { .type = NLA_U32 },
+ [IPVS_SVC_ATTR_SCHED_NAME] = { .type = NLA_NUL_STRING,
+ .len = IP_VS_SCHEDNAME_MAXLEN },
+ [IPVS_SVC_ATTR_PE_NAME] = { .type = NLA_NUL_STRING,
+ .len = IP_VS_PENAME_MAXLEN },
+ [IPVS_SVC_ATTR_FLAGS] = { .type = NLA_BINARY,
+ .len = sizeof(struct ip_vs_flags) },
+ [IPVS_SVC_ATTR_TIMEOUT] = { .type = NLA_U32 },
+ [IPVS_SVC_ATTR_NETMASK] = { .type = NLA_U32 },
+ [IPVS_SVC_ATTR_STATS] = { .type = NLA_NESTED },
+};
+
+/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */
+static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {
+ [IPVS_DEST_ATTR_ADDR] = { .type = NLA_BINARY,
+ .len = sizeof(union nf_inet_addr) },
+ [IPVS_DEST_ATTR_PORT] = { .type = NLA_U16 },
+ [IPVS_DEST_ATTR_FWD_METHOD] = { .type = NLA_U32 },
+ [IPVS_DEST_ATTR_WEIGHT] = { .type = NLA_U32 },
+ [IPVS_DEST_ATTR_U_THRESH] = { .type = NLA_U32 },
+ [IPVS_DEST_ATTR_L_THRESH] = { .type = NLA_U32 },
+ [IPVS_DEST_ATTR_ACTIVE_CONNS] = { .type = NLA_U32 },
+ [IPVS_DEST_ATTR_INACT_CONNS] = { .type = NLA_U32 },
+ [IPVS_DEST_ATTR_PERSIST_CONNS] = { .type = NLA_U32 },
+ [IPVS_DEST_ATTR_STATS] = { .type = NLA_NESTED },
+ [IPVS_DEST_ATTR_ADDR_FAMILY] = { .type = NLA_U16 },
+};
+
+static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
+ struct ip_vs_kstats *kstats)
+{
+ struct nlattr *nl_stats = nla_nest_start(skb, container_type);
+
+ if (!nl_stats)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, IPVS_STATS_ATTR_CONNS, (u32)kstats->conns) ||
+ nla_put_u32(skb, IPVS_STATS_ATTR_INPKTS, (u32)kstats->inpkts) ||
+ nla_put_u32(skb, IPVS_STATS_ATTR_OUTPKTS, (u32)kstats->outpkts) ||
+ nla_put_u64(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes) ||
+ nla_put_u64(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes) ||
+ nla_put_u32(skb, IPVS_STATS_ATTR_CPS, (u32)kstats->cps) ||
+ nla_put_u32(skb, IPVS_STATS_ATTR_INPPS, (u32)kstats->inpps) ||
+ nla_put_u32(skb, IPVS_STATS_ATTR_OUTPPS, (u32)kstats->outpps) ||
+ nla_put_u32(skb, IPVS_STATS_ATTR_INBPS, (u32)kstats->inbps) ||
+ nla_put_u32(skb, IPVS_STATS_ATTR_OUTBPS, (u32)kstats->outbps))
+ goto nla_put_failure;
+ nla_nest_end(skb, nl_stats);
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nl_stats);
+ return -EMSGSIZE;
+}
+
+static int ip_vs_genl_fill_stats64(struct sk_buff *skb, int container_type,
+ struct ip_vs_kstats *kstats)
+{
+ struct nlattr *nl_stats = nla_nest_start(skb, container_type);
+
+ if (!nl_stats)
+ return -EMSGSIZE;
+
+ if (nla_put_u64(skb, IPVS_STATS_ATTR_CONNS, kstats->conns) ||
+ nla_put_u64(skb, IPVS_STATS_ATTR_INPKTS, kstats->inpkts) ||
+ nla_put_u64(skb, IPVS_STATS_ATTR_OUTPKTS, kstats->outpkts) ||
+ nla_put_u64(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes) ||
+ nla_put_u64(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes) ||
+ nla_put_u64(skb, IPVS_STATS_ATTR_CPS, kstats->cps) ||
+ nla_put_u64(skb, IPVS_STATS_ATTR_INPPS, kstats->inpps) ||
+ nla_put_u64(skb, IPVS_STATS_ATTR_OUTPPS, kstats->outpps) ||
+ nla_put_u64(skb, IPVS_STATS_ATTR_INBPS, kstats->inbps) ||
+ nla_put_u64(skb, IPVS_STATS_ATTR_OUTBPS, kstats->outbps))
+ goto nla_put_failure;
+ nla_nest_end(skb, nl_stats);
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nl_stats);
+ return -EMSGSIZE;
+}
+
+static int ip_vs_genl_fill_service(struct sk_buff *skb,
+ struct ip_vs_service *svc)
+{
+ struct ip_vs_scheduler *sched;
+ struct ip_vs_pe *pe;
+ struct nlattr *nl_service;
+ struct ip_vs_flags flags = { .flags = svc->flags,
+ .mask = ~0 };
+ struct ip_vs_kstats kstats;
+
+ nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE);
+ if (!nl_service)
+ return -EMSGSIZE;
+
+ if (nla_put_u16(skb, IPVS_SVC_ATTR_AF, svc->af))
+ goto nla_put_failure;
+ if (svc->fwmark) {
+ if (nla_put_u32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark))
+ goto nla_put_failure;
+ } else {
+ if (nla_put_u16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol) ||
+ nla_put(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr) ||
+ nla_put_be16(skb, IPVS_SVC_ATTR_PORT, svc->port))
+ goto nla_put_failure;
+ }
+
+ sched = rcu_dereference_protected(svc->scheduler, 1);
+ pe = rcu_dereference_protected(svc->pe, 1);
+ if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched->name) ||
+ (pe && nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, pe->name)) ||
+ nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) ||
+ nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) ||
+ nla_put_be32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask))
+ goto nla_put_failure;
+ ip_vs_copy_stats(&kstats, &svc->stats);
+ if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &kstats))
+ goto nla_put_failure;
+ if (ip_vs_genl_fill_stats64(skb, IPVS_SVC_ATTR_STATS64, &kstats))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nl_service);
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nl_service);
+ return -EMSGSIZE;
+}
+
+static int ip_vs_genl_dump_service(struct sk_buff *skb,
+ struct ip_vs_service *svc,
+ struct netlink_callback *cb)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &ip_vs_genl_family, NLM_F_MULTI,
+ IPVS_CMD_NEW_SERVICE);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (ip_vs_genl_fill_service(skb, svc) < 0)
+ goto nla_put_failure;
+
+ genlmsg_end(skb, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(skb, hdr);
+ return -EMSGSIZE;
+}
+
+static int ip_vs_genl_dump_services(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ int idx = 0, i;
+ int start = cb->args[0];
+ struct ip_vs_service *svc;
+ struct net *net = skb_sknet(skb);
+
+ mutex_lock(&__ip_vs_mutex);
+ for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
+ hlist_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
+ if (++idx <= start || !net_eq(svc->net, net))
+ continue;
+ if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
+ idx--;
+ goto nla_put_failure;
+ }
+ }
+ }
+
+ for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
+ hlist_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) {
+ if (++idx <= start || !net_eq(svc->net, net))
+ continue;
+ if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
+ idx--;
+ goto nla_put_failure;
+ }
+ }
+ }
+
+nla_put_failure:
+ mutex_unlock(&__ip_vs_mutex);
+ cb->args[0] = idx;
+
+ return skb->len;
+}
+
+static int ip_vs_genl_parse_service(struct net *net,
+ struct ip_vs_service_user_kern *usvc,
+ struct nlattr *nla, int full_entry,
+ struct ip_vs_service **ret_svc)
+{
+ struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
+ struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr;
+ struct ip_vs_service *svc;
+
+ /* Parse mandatory identifying service fields first */
+ if (nla == NULL ||
+ nla_parse_nested(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy))
+ return -EINVAL;
+
+ nla_af = attrs[IPVS_SVC_ATTR_AF];
+ nla_protocol = attrs[IPVS_SVC_ATTR_PROTOCOL];
+ nla_addr = attrs[IPVS_SVC_ATTR_ADDR];
+ nla_port = attrs[IPVS_SVC_ATTR_PORT];
+ nla_fwmark = attrs[IPVS_SVC_ATTR_FWMARK];
+
+ if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr))))
+ return -EINVAL;
+
+ memset(usvc, 0, sizeof(*usvc));
+
+ usvc->af = nla_get_u16(nla_af);
+#ifdef CONFIG_IP_VS_IPV6
+ if (usvc->af != AF_INET && usvc->af != AF_INET6)
+#else
+ if (usvc->af != AF_INET)
+#endif
+ return -EAFNOSUPPORT;
+
+ if (nla_fwmark) {
+ usvc->protocol = IPPROTO_TCP;
+ usvc->fwmark = nla_get_u32(nla_fwmark);
+ } else {
+ usvc->protocol = nla_get_u16(nla_protocol);
+ nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr));
+ usvc->port = nla_get_be16(nla_port);
+ usvc->fwmark = 0;
+ }
+
+ rcu_read_lock();
+ if (usvc->fwmark)
+ svc = __ip_vs_svc_fwm_find(net, usvc->af, usvc->fwmark);
+ else
+ svc = __ip_vs_service_find(net, usvc->af, usvc->protocol,
+ &usvc->addr, usvc->port);
+ rcu_read_unlock();
+ *ret_svc = svc;
+
+ /* If a full entry was requested, check for the additional fields */
+ if (full_entry) {
+ struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout,
+ *nla_netmask;
+ struct ip_vs_flags flags;
+
+ nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME];
+ nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME];
+ nla_flags = attrs[IPVS_SVC_ATTR_FLAGS];
+ nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT];
+ nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK];
+
+ if (!(nla_sched && nla_flags && nla_timeout && nla_netmask))
+ return -EINVAL;
+
+ nla_memcpy(&flags, nla_flags, sizeof(flags));
+
+ /* prefill flags from service if it already exists */
+ if (svc)
+ usvc->flags = svc->flags;
+
+ /* set new flags from userland */
+ usvc->flags = (usvc->flags & ~flags.mask) |
+ (flags.flags & flags.mask);
+ usvc->sched_name = nla_data(nla_sched);
+ usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL;
+ usvc->timeout = nla_get_u32(nla_timeout);
+ usvc->netmask = nla_get_be32(nla_netmask);
+ }
+
+ return 0;
+}
+
+static struct ip_vs_service *ip_vs_genl_find_service(struct net *net,
+ struct nlattr *nla)
+{
+ struct ip_vs_service_user_kern usvc;
+ struct ip_vs_service *svc;
+ int ret;
+
+ ret = ip_vs_genl_parse_service(net, &usvc, nla, 0, &svc);
+ return ret ? ERR_PTR(ret) : svc;
+}
+
+static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
+{
+ struct nlattr *nl_dest;
+ struct ip_vs_kstats kstats;
+
+ nl_dest = nla_nest_start(skb, IPVS_CMD_ATTR_DEST);
+ if (!nl_dest)
+ return -EMSGSIZE;
+
+ if (nla_put(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr) ||
+ nla_put_be16(skb, IPVS_DEST_ATTR_PORT, dest->port) ||
+ nla_put_u32(skb, IPVS_DEST_ATTR_FWD_METHOD,
+ (atomic_read(&dest->conn_flags) &
+ IP_VS_CONN_F_FWD_MASK)) ||
+ nla_put_u32(skb, IPVS_DEST_ATTR_WEIGHT,
+ atomic_read(&dest->weight)) ||
+ nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) ||
+ nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) ||
+ nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
+ atomic_read(&dest->activeconns)) ||
+ nla_put_u32(skb, IPVS_DEST_ATTR_INACT_CONNS,
+ atomic_read(&dest->inactconns)) ||
+ nla_put_u32(skb, IPVS_DEST_ATTR_PERSIST_CONNS,
+ atomic_read(&dest->persistconns)) ||
+ nla_put_u16(skb, IPVS_DEST_ATTR_ADDR_FAMILY, dest->af))
+ goto nla_put_failure;
+ ip_vs_copy_stats(&kstats, &dest->stats);
+ if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &kstats))
+ goto nla_put_failure;
+ if (ip_vs_genl_fill_stats64(skb, IPVS_DEST_ATTR_STATS64, &kstats))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nl_dest);
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nl_dest);
+ return -EMSGSIZE;
+}
+
+static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest,
+ struct netlink_callback *cb)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &ip_vs_genl_family, NLM_F_MULTI,
+ IPVS_CMD_NEW_DEST);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (ip_vs_genl_fill_dest(skb, dest) < 0)
+ goto nla_put_failure;
+
+ genlmsg_end(skb, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(skb, hdr);
+ return -EMSGSIZE;
+}
+
+static int ip_vs_genl_dump_dests(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ int idx = 0;
+ int start = cb->args[0];
+ struct ip_vs_service *svc;
+ struct ip_vs_dest *dest;
+ struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
+ struct net *net = skb_sknet(skb);
+
+ mutex_lock(&__ip_vs_mutex);
+
+ /* Try to find the service for which to dump destinations */
+ if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs,
+ IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy))
+ goto out_err;
+
+
+ svc = ip_vs_genl_find_service(net, attrs[IPVS_CMD_ATTR_SERVICE]);
+ if (IS_ERR(svc) || svc == NULL)
+ goto out_err;
+
+ /* Dump the destinations */
+ list_for_each_entry(dest, &svc->destinations, n_list) {
+ if (++idx <= start)
+ continue;
+ if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) {
+ idx--;
+ goto nla_put_failure;
+ }
+ }
+
+nla_put_failure:
+ cb->args[0] = idx;
+
+out_err:
+ mutex_unlock(&__ip_vs_mutex);
+
+ return skb->len;
+}
+
+static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
+ struct nlattr *nla, int full_entry)
+{
+ struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1];
+ struct nlattr *nla_addr, *nla_port;
+ struct nlattr *nla_addr_family;
+
+ /* Parse mandatory identifying destination fields first */
+ if (nla == NULL ||
+ nla_parse_nested(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy))
+ return -EINVAL;
+
+ nla_addr = attrs[IPVS_DEST_ATTR_ADDR];
+ nla_port = attrs[IPVS_DEST_ATTR_PORT];
+ nla_addr_family = attrs[IPVS_DEST_ATTR_ADDR_FAMILY];
+
+ if (!(nla_addr && nla_port))
+ return -EINVAL;
+
+ memset(udest, 0, sizeof(*udest));
+
+ nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr));
+ udest->port = nla_get_be16(nla_port);
+
+ if (nla_addr_family)
+ udest->af = nla_get_u16(nla_addr_family);
+ else
+ udest->af = 0;
+
+ /* If a full entry was requested, check for the additional fields */
+ if (full_entry) {
+ struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh,
+ *nla_l_thresh;
+
+ nla_fwd = attrs[IPVS_DEST_ATTR_FWD_METHOD];
+ nla_weight = attrs[IPVS_DEST_ATTR_WEIGHT];
+ nla_u_thresh = attrs[IPVS_DEST_ATTR_U_THRESH];
+ nla_l_thresh = attrs[IPVS_DEST_ATTR_L_THRESH];
+
+ if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh))
+ return -EINVAL;
+
+ udest->conn_flags = nla_get_u32(nla_fwd)
+ & IP_VS_CONN_F_FWD_MASK;
+ udest->weight = nla_get_u32(nla_weight);
+ udest->u_threshold = nla_get_u32(nla_u_thresh);
+ udest->l_threshold = nla_get_u32(nla_l_thresh);
+ }
+
+ return 0;
+}
+
+static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state,
+ const char *mcast_ifn, __u32 syncid)
+{
+ struct nlattr *nl_daemon;
+
+ nl_daemon = nla_nest_start(skb, IPVS_CMD_ATTR_DAEMON);
+ if (!nl_daemon)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state) ||
+ nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn) ||
+ nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid))
+ goto nla_put_failure;
+ nla_nest_end(skb, nl_daemon);
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nl_daemon);
+ return -EMSGSIZE;
+}
+
+static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state,
+ const char *mcast_ifn, __u32 syncid,
+ struct netlink_callback *cb)
+{
+ void *hdr;
+ hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &ip_vs_genl_family, NLM_F_MULTI,
+ IPVS_CMD_NEW_DAEMON);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid))
+ goto nla_put_failure;
+
+ genlmsg_end(skb, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(skb, hdr);
+ return -EMSGSIZE;
+}
+
+static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct net *net = skb_sknet(skb);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ mutex_lock(&ipvs->sync_mutex);
+ if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
+ if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
+ ipvs->master_mcast_ifn,
+ ipvs->master_syncid, cb) < 0)
+ goto nla_put_failure;
+
+ cb->args[0] = 1;
+ }
+
+ if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
+ if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP,
+ ipvs->backup_mcast_ifn,
+ ipvs->backup_syncid, cb) < 0)
+ goto nla_put_failure;
+
+ cb->args[1] = 1;
+ }
+
+nla_put_failure:
+ mutex_unlock(&ipvs->sync_mutex);
+
+ return skb->len;
+}
+
+static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs)
+{
+ if (!(attrs[IPVS_DAEMON_ATTR_STATE] &&
+ attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
+ attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
+ return -EINVAL;
+
+ /* The synchronization protocol is incompatible with mixed family
+ * services
+ */
+ if (net_ipvs(net)->mixed_address_family_dests > 0)
+ return -EINVAL;
+
+ return start_sync_thread(net,
+ nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]),
+ nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
+ nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]));
+}
+
+static int ip_vs_genl_del_daemon(struct net *net, struct nlattr **attrs)
+{
+ if (!attrs[IPVS_DAEMON_ATTR_STATE])
+ return -EINVAL;
+
+ return stop_sync_thread(net,
+ nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
+}
+
+static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs)
+{
+ struct ip_vs_timeout_user t;
+
+ __ip_vs_get_timeouts(net, &t);
+
+ if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP])
+ t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]);
+
+ if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN])
+ t.tcp_fin_timeout =
+ nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]);
+
+ if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP])
+ t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]);
+
+ return ip_vs_set_timeout(net, &t);
+}
+
+static int ip_vs_genl_set_daemon(struct sk_buff *skb, struct genl_info *info)
+{
+ int ret = 0, cmd;
+ struct net *net;
+ struct netns_ipvs *ipvs;
+
+ net = skb_sknet(skb);
+ ipvs = net_ipvs(net);
+ cmd = info->genlhdr->cmd;
+
+ if (cmd == IPVS_CMD_NEW_DAEMON || cmd == IPVS_CMD_DEL_DAEMON) {
+ struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1];
+
+ mutex_lock(&ipvs->sync_mutex);
+ if (!info->attrs[IPVS_CMD_ATTR_DAEMON] ||
+ nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX,
+ info->attrs[IPVS_CMD_ATTR_DAEMON],
+ ip_vs_daemon_policy)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (cmd == IPVS_CMD_NEW_DAEMON)
+ ret = ip_vs_genl_new_daemon(net, daemon_attrs);
+ else
+ ret = ip_vs_genl_del_daemon(net, daemon_attrs);
+out:
+ mutex_unlock(&ipvs->sync_mutex);
+ }
+ return ret;
+}
+
+static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
+{
+ struct ip_vs_service *svc = NULL;
+ struct ip_vs_service_user_kern usvc;
+ struct ip_vs_dest_user_kern udest;
+ int ret = 0, cmd;
+ int need_full_svc = 0, need_full_dest = 0;
+ struct net *net;
+
+ net = skb_sknet(skb);
+ cmd = info->genlhdr->cmd;
+
+ mutex_lock(&__ip_vs_mutex);
+
+ if (cmd == IPVS_CMD_FLUSH) {
+ ret = ip_vs_flush(net, false);
+ goto out;
+ } else if (cmd == IPVS_CMD_SET_CONFIG) {
+ ret = ip_vs_genl_set_config(net, info->attrs);
+ goto out;
+ } else if (cmd == IPVS_CMD_ZERO &&
+ !info->attrs[IPVS_CMD_ATTR_SERVICE]) {
+ ret = ip_vs_zero_all(net);
+ goto out;
+ }
+
+ /* All following commands require a service argument, so check if we
+ * received a valid one. We need a full service specification when
+	 * adding / editing a service; only the identifying members otherwise. */
+ if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE)
+ need_full_svc = 1;
+
+ ret = ip_vs_genl_parse_service(net, &usvc,
+ info->attrs[IPVS_CMD_ATTR_SERVICE],
+ need_full_svc, &svc);
+ if (ret)
+ goto out;
+
+ /* Unless we're adding a new service, the service must already exist */
+ if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) {
+ ret = -ESRCH;
+ goto out;
+ }
+
+ /* Destination commands require a valid destination argument. For
+ * adding / editing a destination, we need a full destination
+ * specification. */
+ if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST ||
+ cmd == IPVS_CMD_DEL_DEST) {
+ if (cmd != IPVS_CMD_DEL_DEST)
+ need_full_dest = 1;
+
+ ret = ip_vs_genl_parse_dest(&udest,
+ info->attrs[IPVS_CMD_ATTR_DEST],
+ need_full_dest);
+ if (ret)
+ goto out;
+
+ /* Old protocols did not allow the user to specify address
+ * family, so we set it to zero instead. We also didn't
+ * allow heterogeneous pools in the old code, so it's safe
+ * to assume that this will have the same address family as
+ * the service.
+ */
+ if (udest.af == 0)
+ udest.af = svc->af;
+
+ if (udest.af != svc->af && cmd != IPVS_CMD_DEL_DEST) {
+ /* The synchronization protocol is incompatible
+ * with mixed family services
+ */
+ if (net_ipvs(net)->sync_state) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* Which connection types do we support? */
+ switch (udest.conn_flags) {
+ case IP_VS_CONN_F_TUNNEL:
+ /* We are able to forward this */
+ break;
+ default:
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+ }
+
+ switch (cmd) {
+ case IPVS_CMD_NEW_SERVICE:
+ if (svc == NULL)
+ ret = ip_vs_add_service(net, &usvc, &svc);
+ else
+ ret = -EEXIST;
+ break;
+ case IPVS_CMD_SET_SERVICE:
+ ret = ip_vs_edit_service(svc, &usvc);
+ break;
+ case IPVS_CMD_DEL_SERVICE:
+ ret = ip_vs_del_service(svc);
+ /* do not use svc, it can be freed */
+ break;
+ case IPVS_CMD_NEW_DEST:
+ ret = ip_vs_add_dest(svc, &udest);
+ break;
+ case IPVS_CMD_SET_DEST:
+ ret = ip_vs_edit_dest(svc, &udest);
+ break;
+ case IPVS_CMD_DEL_DEST:
+ ret = ip_vs_del_dest(svc, &udest);
+ break;
+ case IPVS_CMD_ZERO:
+ ret = ip_vs_zero_service(svc);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+
+out:
+ mutex_unlock(&__ip_vs_mutex);
+
+ return ret;
+}
+
+static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
+{
+ struct sk_buff *msg;
+ void *reply;
+ int ret, cmd, reply_cmd;
+ struct net *net;
+
+ net = skb_sknet(skb);
+ cmd = info->genlhdr->cmd;
+
+ if (cmd == IPVS_CMD_GET_SERVICE)
+ reply_cmd = IPVS_CMD_NEW_SERVICE;
+ else if (cmd == IPVS_CMD_GET_INFO)
+ reply_cmd = IPVS_CMD_SET_INFO;
+ else if (cmd == IPVS_CMD_GET_CONFIG)
+ reply_cmd = IPVS_CMD_SET_CONFIG;
+ else {
+ pr_err("unknown Generic Netlink command\n");
+ return -EINVAL;
+ }
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ mutex_lock(&__ip_vs_mutex);
+
+ reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd);
+ if (reply == NULL)
+ goto nla_put_failure;
+
+ switch (cmd) {
+ case IPVS_CMD_GET_SERVICE:
+ {
+ struct ip_vs_service *svc;
+
+ svc = ip_vs_genl_find_service(net,
+ info->attrs[IPVS_CMD_ATTR_SERVICE]);
+ if (IS_ERR(svc)) {
+ ret = PTR_ERR(svc);
+ goto out_err;
+ } else if (svc) {
+ ret = ip_vs_genl_fill_service(msg, svc);
+ if (ret)
+ goto nla_put_failure;
+ } else {
+ ret = -ESRCH;
+ goto out_err;
+ }
+
+ break;
+ }
+
+ case IPVS_CMD_GET_CONFIG:
+ {
+ struct ip_vs_timeout_user t;
+
+ __ip_vs_get_timeouts(net, &t);
+#ifdef CONFIG_IP_VS_PROTO_TCP
+ if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP,
+ t.tcp_timeout) ||
+ nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN,
+ t.tcp_fin_timeout))
+ goto nla_put_failure;
+#endif
+#ifdef CONFIG_IP_VS_PROTO_UDP
+ if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout))
+ goto nla_put_failure;
+#endif
+
+ break;
+ }
+
+ case IPVS_CMD_GET_INFO:
+ if (nla_put_u32(msg, IPVS_INFO_ATTR_VERSION,
+ IP_VS_VERSION_CODE) ||
+ nla_put_u32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE,
+ ip_vs_conn_tab_size))
+ goto nla_put_failure;
+ break;
+ }
+
+ genlmsg_end(msg, reply);
+ ret = genlmsg_reply(msg, info);
+ goto out;
+
+nla_put_failure:
+ pr_err("not enough space in Netlink message\n");
+ ret = -EMSGSIZE;
+
+out_err:
+ nlmsg_free(msg);
+out:
+ mutex_unlock(&__ip_vs_mutex);
+
+ return ret;
+}
+
+
+static const struct genl_ops ip_vs_genl_ops[] = {
+ {
+ .cmd = IPVS_CMD_NEW_SERVICE,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ip_vs_cmd_policy,
+ .doit = ip_vs_genl_set_cmd,
+ },
+ {
+ .cmd = IPVS_CMD_SET_SERVICE,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ip_vs_cmd_policy,
+ .doit = ip_vs_genl_set_cmd,
+ },
+ {
+ .cmd = IPVS_CMD_DEL_SERVICE,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ip_vs_cmd_policy,
+ .doit = ip_vs_genl_set_cmd,
+ },
+ {
+ .cmd = IPVS_CMD_GET_SERVICE,
+ .flags = GENL_ADMIN_PERM,
+ .doit = ip_vs_genl_get_cmd,
+ .dumpit = ip_vs_genl_dump_services,
+ .policy = ip_vs_cmd_policy,
+ },
+ {
+ .cmd = IPVS_CMD_NEW_DEST,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ip_vs_cmd_policy,
+ .doit = ip_vs_genl_set_cmd,
+ },
+ {
+ .cmd = IPVS_CMD_SET_DEST,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ip_vs_cmd_policy,
+ .doit = ip_vs_genl_set_cmd,
+ },
+ {
+ .cmd = IPVS_CMD_DEL_DEST,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ip_vs_cmd_policy,
+ .doit = ip_vs_genl_set_cmd,
+ },
+ {
+ .cmd = IPVS_CMD_GET_DEST,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ip_vs_cmd_policy,
+ .dumpit = ip_vs_genl_dump_dests,
+ },
+ {
+ .cmd = IPVS_CMD_NEW_DAEMON,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ip_vs_cmd_policy,
+ .doit = ip_vs_genl_set_daemon,
+ },
+ {
+ .cmd = IPVS_CMD_DEL_DAEMON,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ip_vs_cmd_policy,
+ .doit = ip_vs_genl_set_daemon,
+ },
+ {
+ .cmd = IPVS_CMD_GET_DAEMON,
+ .flags = GENL_ADMIN_PERM,
+ .dumpit = ip_vs_genl_dump_daemons,
+ },
+ {
+ .cmd = IPVS_CMD_SET_CONFIG,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ip_vs_cmd_policy,
+ .doit = ip_vs_genl_set_cmd,
+ },
+ {
+ .cmd = IPVS_CMD_GET_CONFIG,
+ .flags = GENL_ADMIN_PERM,
+ .doit = ip_vs_genl_get_cmd,
+ },
+ {
+ .cmd = IPVS_CMD_GET_INFO,
+ .flags = GENL_ADMIN_PERM,
+ .doit = ip_vs_genl_get_cmd,
+ },
+ {
+ .cmd = IPVS_CMD_ZERO,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ip_vs_cmd_policy,
+ .doit = ip_vs_genl_set_cmd,
+ },
+ {
+ .cmd = IPVS_CMD_FLUSH,
+ .flags = GENL_ADMIN_PERM,
+ .doit = ip_vs_genl_set_cmd,
+ },
+};
+
+static int __init ip_vs_genl_register(void)
+{
+ return genl_register_family_with_ops(&ip_vs_genl_family,
+ ip_vs_genl_ops);
+}
+
+static void ip_vs_genl_unregister(void)
+{
+ genl_unregister_family(&ip_vs_genl_family);
+}
+
+/* End of Generic Netlink interface definitions */
+
+/*
+ * per netns init/exit functions.
+ */
+#ifdef CONFIG_SYSCTL
+static int __net_init ip_vs_control_net_init_sysctl(struct net *net)
+{
+ int idx;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ctl_table *tbl;
+
+ atomic_set(&ipvs->dropentry, 0);
+ spin_lock_init(&ipvs->dropentry_lock);
+ spin_lock_init(&ipvs->droppacket_lock);
+ spin_lock_init(&ipvs->securetcp_lock);
+
+ if (!net_eq(net, &init_net)) {
+ tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL);
+ if (tbl == NULL)
+ return -ENOMEM;
+
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ tbl[0].procname = NULL;
+ } else
+ tbl = vs_vars;
+ /* Initialize sysctl defaults */
+ idx = 0;
+ ipvs->sysctl_amemthresh = 1024;
+ tbl[idx++].data = &ipvs->sysctl_amemthresh;
+ ipvs->sysctl_am_droprate = 10;
+ tbl[idx++].data = &ipvs->sysctl_am_droprate;
+ tbl[idx++].data = &ipvs->sysctl_drop_entry;
+ tbl[idx++].data = &ipvs->sysctl_drop_packet;
+#ifdef CONFIG_IP_VS_NFCT
+ tbl[idx++].data = &ipvs->sysctl_conntrack;
+#endif
+ tbl[idx++].data = &ipvs->sysctl_secure_tcp;
+ ipvs->sysctl_snat_reroute = 1;
+ tbl[idx++].data = &ipvs->sysctl_snat_reroute;
+ ipvs->sysctl_sync_ver = 1;
+ tbl[idx++].data = &ipvs->sysctl_sync_ver;
+ ipvs->sysctl_sync_ports = 1;
+ tbl[idx++].data = &ipvs->sysctl_sync_ports;
+ tbl[idx++].data = &ipvs->sysctl_sync_persist_mode;
+ ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32;
+ tbl[idx++].data = &ipvs->sysctl_sync_qlen_max;
+ ipvs->sysctl_sync_sock_size = 0;
+ tbl[idx++].data = &ipvs->sysctl_sync_sock_size;
+ tbl[idx++].data = &ipvs->sysctl_cache_bypass;
+ tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn;
+ tbl[idx++].data = &ipvs->sysctl_sloppy_tcp;
+ tbl[idx++].data = &ipvs->sysctl_sloppy_sctp;
+ tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template;
+ ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD;
+ ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD;
+ tbl[idx].data = &ipvs->sysctl_sync_threshold;
+ tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold);
+ ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD;
+ tbl[idx++].data = &ipvs->sysctl_sync_refresh_period;
+ ipvs->sysctl_sync_retries = clamp_t(int, DEFAULT_SYNC_RETRIES, 0, 3);
+ tbl[idx++].data = &ipvs->sysctl_sync_retries;
+ tbl[idx++].data = &ipvs->sysctl_nat_icmp_send;
+ ipvs->sysctl_pmtu_disc = 1;
+ tbl[idx++].data = &ipvs->sysctl_pmtu_disc;
+ tbl[idx++].data = &ipvs->sysctl_backup_only;
+ ipvs->sysctl_conn_reuse_mode = 1;
+ tbl[idx++].data = &ipvs->sysctl_conn_reuse_mode;
+
+
+ ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl);
+ if (ipvs->sysctl_hdr == NULL) {
+ if (!net_eq(net, &init_net))
+ kfree(tbl);
+ return -ENOMEM;
+ }
+ ip_vs_start_estimator(net, &ipvs->tot_stats);
+ ipvs->sysctl_tbl = tbl;
+ /* Schedule defense work */
+ INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
+ schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
+
+ return 0;
+}
+
+static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ cancel_delayed_work_sync(&ipvs->defense_work);
+ cancel_work_sync(&ipvs->defense_work.work);
+ unregister_net_sysctl_table(ipvs->sysctl_hdr);
+ ip_vs_stop_estimator(net, &ipvs->tot_stats);
+
+ if (!net_eq(net, &init_net))
+ kfree(ipvs->sysctl_tbl);
+}
+
+#else
+
+static int __net_init ip_vs_control_net_init_sysctl(struct net *net) { return 0; }
+static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) { }
+
+#endif
+
+static struct notifier_block ip_vs_dst_notifier = {
+ .notifier_call = ip_vs_dst_event,
+};
+
+int __net_init ip_vs_control_net_init(struct net *net)
+{
+ int i, idx;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ /* Initialize rs_table */
+ for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
+ INIT_HLIST_HEAD(&ipvs->rs_table[idx]);
+
+ INIT_LIST_HEAD(&ipvs->dest_trash);
+ spin_lock_init(&ipvs->dest_trash_lock);
+ setup_timer(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire,
+ (unsigned long) net);
+ atomic_set(&ipvs->ftpsvc_counter, 0);
+ atomic_set(&ipvs->nullsvc_counter, 0);
+
+ /* procfs stats */
+ ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
+ if (!ipvs->tot_stats.cpustats)
+ return -ENOMEM;
+
+ for_each_possible_cpu(i) {
+ struct ip_vs_cpu_stats *ipvs_tot_stats;
+ ipvs_tot_stats = per_cpu_ptr(ipvs->tot_stats.cpustats, i);
+ u64_stats_init(&ipvs_tot_stats->syncp);
+ }
+
+ spin_lock_init(&ipvs->tot_stats.lock);
+
+ proc_create("ip_vs", 0, net->proc_net, &ip_vs_info_fops);
+ proc_create("ip_vs_stats", 0, net->proc_net, &ip_vs_stats_fops);
+ proc_create("ip_vs_stats_percpu", 0, net->proc_net,
+ &ip_vs_stats_percpu_fops);
+
+ if (ip_vs_control_net_init_sysctl(net))
+ goto err;
+
+ return 0;
+
+err:
+ free_percpu(ipvs->tot_stats.cpustats);
+ return -ENOMEM;
+}
+
+void __net_exit ip_vs_control_net_cleanup(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ ip_vs_trash_cleanup(net);
+ ip_vs_control_net_cleanup_sysctl(net);
+ remove_proc_entry("ip_vs_stats_percpu", net->proc_net);
+ remove_proc_entry("ip_vs_stats", net->proc_net);
+ remove_proc_entry("ip_vs", net->proc_net);
+ free_percpu(ipvs->tot_stats.cpustats);
+}
+
+int __init ip_vs_register_nl_ioctl(void)
+{
+ int ret;
+
+ ret = nf_register_sockopt(&ip_vs_sockopts);
+ if (ret) {
+ pr_err("cannot register sockopt.\n");
+ goto err_sock;
+ }
+
+ ret = ip_vs_genl_register();
+ if (ret) {
+ pr_err("cannot register Generic Netlink interface.\n");
+ goto err_genl;
+ }
+ return 0;
+
+err_genl:
+ nf_unregister_sockopt(&ip_vs_sockopts);
+err_sock:
+ return ret;
+}
+
+void ip_vs_unregister_nl_ioctl(void)
+{
+ ip_vs_genl_unregister();
+ nf_unregister_sockopt(&ip_vs_sockopts);
+}
+
+int __init ip_vs_control_init(void)
+{
+ int idx;
+ int ret;
+
+ EnterFunction(2);
+
+ /* Initialize svc_table, ip_vs_svc_fwm_table */
+ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+ INIT_HLIST_HEAD(&ip_vs_svc_table[idx]);
+ INIT_HLIST_HEAD(&ip_vs_svc_fwm_table[idx]);
+ }
+
+ smp_wmb(); /* Do we really need it now ? */
+
+ ret = register_netdevice_notifier(&ip_vs_dst_notifier);
+ if (ret < 0)
+ return ret;
+
+ LeaveFunction(2);
+ return 0;
+}
+
+
+void ip_vs_control_cleanup(void)
+{
+ EnterFunction(2);
+ unregister_netdevice_notifier(&ip_vs_dst_notifier);
+ LeaveFunction(2);
+}
diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c
new file mode 100644
index 000000000..6be5c538b
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_dh.c
@@ -0,0 +1,276 @@
+/*
+ * IPVS: Destination Hashing scheduling module
+ *
+ * Authors: Wensong Zhang <wensong@gnuchina.org>
+ *
+ * Inspired by the consistent hashing scheduler patch from
+ * Thomas Proell <proellt@gmx.de>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+/*
+ * The dh algorithm is to select server by the hash key of destination IP
+ * address. The pseudo code is as follows:
+ *
+ * n <- servernode[dest_ip];
+ * if (n is dead) OR
+ * (n is overloaded) OR (n.weight <= 0) then
+ * return NULL;
+ *
+ * return n;
+ *
+ * Note that servernode is a 256-bucket hash table that maps the hash
+ * index derived from the packet destination IP address to the current
+ * server array. If the dh scheduler is used in a cache cluster, it is
+ * good to combine it with the cache_bypass feature. When the statically
+ * assigned server is dead or overloaded, the load balancer can bypass
+ * the cache server and send requests to the original server directly.
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/ip.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ * IPVS DH bucket
+ */
+struct ip_vs_dh_bucket {
+ struct ip_vs_dest __rcu *dest; /* real server (cache) */
+};
+
+/*
+ * for IPVS DH entry hash table
+ */
+#ifndef CONFIG_IP_VS_DH_TAB_BITS
+#define CONFIG_IP_VS_DH_TAB_BITS 8
+#endif
+#define IP_VS_DH_TAB_BITS CONFIG_IP_VS_DH_TAB_BITS
+#define IP_VS_DH_TAB_SIZE (1 << IP_VS_DH_TAB_BITS)
+#define IP_VS_DH_TAB_MASK (IP_VS_DH_TAB_SIZE - 1)
+
+struct ip_vs_dh_state {
+ struct ip_vs_dh_bucket buckets[IP_VS_DH_TAB_SIZE];
+ struct rcu_head rcu_head;
+};
+
+/*
+ * Returns hash value for IPVS DH entry
+ */
+static inline unsigned int ip_vs_dh_hashkey(int af, const union nf_inet_addr *addr)
+{
+ __be32 addr_fold = addr->ip;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ addr_fold = addr->ip6[0]^addr->ip6[1]^
+ addr->ip6[2]^addr->ip6[3];
+#endif
+ return (ntohl(addr_fold)*2654435761UL) & IP_VS_DH_TAB_MASK;
+}
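+
+/* Editor's note, illustrative only (not part of the original source): a
+ * worked example of the multiplicative hash above with the default 8-bit
+ * table (IP_VS_DH_TAB_MASK == 0xFF).  For the example destination
+ * 192.168.1.10 (host order 0xC0A8010A), only the low byte of the product
+ * survives the mask:
+ *
+ *	(0xC0A8010A * 2654435761UL) & 0xFF == (0x0A * 0xB1) & 0xFF == 0xEA
+ *
+ * so that destination always maps to bucket 234, and therefore to whichever
+ * real server ip_vs_dh_reassign() placed there.
+ */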
+
+
+/*
+ * Get ip_vs_dest associated with supplied parameters.
+ */
+static inline struct ip_vs_dest *
+ip_vs_dh_get(int af, struct ip_vs_dh_state *s, const union nf_inet_addr *addr)
+{
+ return rcu_dereference(s->buckets[ip_vs_dh_hashkey(af, addr)].dest);
+}
+
+
+/*
+ * Assign all the hash buckets of the specified table with the service.
+ */
+static int
+ip_vs_dh_reassign(struct ip_vs_dh_state *s, struct ip_vs_service *svc)
+{
+ int i;
+ struct ip_vs_dh_bucket *b;
+ struct list_head *p;
+ struct ip_vs_dest *dest;
+ bool empty;
+
+ b = &s->buckets[0];
+ p = &svc->destinations;
+ empty = list_empty(p);
+ for (i=0; i<IP_VS_DH_TAB_SIZE; i++) {
+ dest = rcu_dereference_protected(b->dest, 1);
+ if (dest)
+ ip_vs_dest_put(dest);
+ if (empty)
+ RCU_INIT_POINTER(b->dest, NULL);
+ else {
+ if (p == &svc->destinations)
+ p = p->next;
+
+ dest = list_entry(p, struct ip_vs_dest, n_list);
+ ip_vs_dest_hold(dest);
+ RCU_INIT_POINTER(b->dest, dest);
+
+ p = p->next;
+ }
+ b++;
+ }
+ return 0;
+}
+
+
+/*
+ * Flush all the hash buckets of the specified table.
+ */
+static void ip_vs_dh_flush(struct ip_vs_dh_state *s)
+{
+ int i;
+ struct ip_vs_dh_bucket *b;
+ struct ip_vs_dest *dest;
+
+ b = &s->buckets[0];
+ for (i=0; i<IP_VS_DH_TAB_SIZE; i++) {
+ dest = rcu_dereference_protected(b->dest, 1);
+ if (dest) {
+ ip_vs_dest_put(dest);
+ RCU_INIT_POINTER(b->dest, NULL);
+ }
+ b++;
+ }
+}
+
+
+static int ip_vs_dh_init_svc(struct ip_vs_service *svc)
+{
+ struct ip_vs_dh_state *s;
+
+ /* allocate the DH table for this service */
+ s = kzalloc(sizeof(struct ip_vs_dh_state), GFP_KERNEL);
+ if (s == NULL)
+ return -ENOMEM;
+
+ svc->sched_data = s;
+ IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for "
+ "current service\n",
+ sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
+
+ /* assign the hash buckets with current dests */
+ ip_vs_dh_reassign(s, svc);
+
+ return 0;
+}
+
+
+static void ip_vs_dh_done_svc(struct ip_vs_service *svc)
+{
+ struct ip_vs_dh_state *s = svc->sched_data;
+
+ /* got to clean up hash buckets here */
+ ip_vs_dh_flush(s);
+
+ /* release the table itself */
+ kfree_rcu(s, rcu_head);
+ IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n",
+ sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
+}
+
+
+static int ip_vs_dh_dest_changed(struct ip_vs_service *svc,
+ struct ip_vs_dest *dest)
+{
+ struct ip_vs_dh_state *s = svc->sched_data;
+
+ /* assign the hash buckets with the updated service */
+ ip_vs_dh_reassign(s, svc);
+
+ return 0;
+}
+
+
+/*
+ * If the dest flag IP_VS_DEST_F_OVERLOAD is set, consider
+ * the server overloaded here.
+ */
+static inline int is_overloaded(struct ip_vs_dest *dest)
+{
+ return dest->flags & IP_VS_DEST_F_OVERLOAD;
+}
+
+
+/*
+ * Destination hashing scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
+{
+ struct ip_vs_dest *dest;
+ struct ip_vs_dh_state *s;
+
+ IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
+
+ s = (struct ip_vs_dh_state *) svc->sched_data;
+ dest = ip_vs_dh_get(svc->af, s, &iph->daddr);
+ if (!dest
+ || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
+ || atomic_read(&dest->weight) <= 0
+ || is_overloaded(dest)) {
+ ip_vs_scheduler_err(svc, "no destination available");
+ return NULL;
+ }
+
+ IP_VS_DBG_BUF(6, "DH: destination IP address %s --> server %s:%d\n",
+ IP_VS_DBG_ADDR(svc->af, &iph->daddr),
+ IP_VS_DBG_ADDR(dest->af, &dest->addr),
+ ntohs(dest->port));
+
+ return dest;
+}
+
+
+/*
+ * IPVS DH Scheduler structure
+ */
+static struct ip_vs_scheduler ip_vs_dh_scheduler =
+{
+ .name = "dh",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_dh_scheduler.n_list),
+ .init_service = ip_vs_dh_init_svc,
+ .done_service = ip_vs_dh_done_svc,
+ .add_dest = ip_vs_dh_dest_changed,
+ .del_dest = ip_vs_dh_dest_changed,
+ .schedule = ip_vs_dh_schedule,
+};
+
+
+static int __init ip_vs_dh_init(void)
+{
+ return register_ip_vs_scheduler(&ip_vs_dh_scheduler);
+}
+
+
+static void __exit ip_vs_dh_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_dh_scheduler);
+ synchronize_rcu();
+}
+
+
+module_init(ip_vs_dh_init);
+module_exit(ip_vs_dh_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c
new file mode 100644
index 000000000..ef0eb0a8d
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_est.c
@@ -0,0 +1,209 @@
+/*
+ * ip_vs_est.c: simple rate estimator for IPVS
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com>
+ * Network name space (netns) aware.
+ * Global data moved to netns, i.e. struct netns_ipvs.
+ * Affected data: est_list and est_lock.
+ * estimation_timer() runs with timer per netns.
+ * get_stats() does the per-CPU summing.
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/types.h>
+#include <linux/interrupt.h>
+#include <linux/sysctl.h>
+#include <linux/list.h>
+
+#include <net/ip_vs.h>
+
+/*
+  This code estimates the rate over a short interval (such as 8
+  seconds) for virtual services and real servers. To measure the rate
+  over a long interval, it is easy to implement a user-level daemon
+  which periodically reads those statistical counters and measures the
+  rate.
+
+  Currently, the measurement is activated by a slow timer handler.
+  Hopefully this measurement will not introduce too much load.
+
+ We measure rate during the last 8 seconds every 2 seconds:
+
+ avgrate = avgrate*(1-W) + rate*W
+
+ where W = 2^(-2)
+
+ NOTES.
+
+ * Average bps is scaled by 2^5, while average pps and cps are scaled by 2^10.
+
+ * Netlink users can see 64-bit values but sockopt users are restricted
+ to 32-bit values for conns, packets, bps, cps and pps.
+
+ * A lot of code is taken from net/core/gen_estimator.c
+ */
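+/*
+ * A worked example of the fixed-point update below (illustrative numbers).
+ * With W = 2^(-2) the moving average avgrate = avgrate*(1-W) + rate*W
+ * becomes avgrate += (rate - avgrate) >> 2. The connection delta of the
+ * last 2 seconds is shifted left by 9 rather than 10 because cps is kept
+ * scaled by 2^10 and the delta already covers two seconds:
+ *
+ *	100 new conns in 2s  ->  rate = 100 << 9 = 51200  (= 50 cps << 10)
+ *	e->cps += ((s64)rate - (s64)e->cps) >> 2;
+ *
+ * The scaled value is decoded again with a right shift by 10 in
+ * ip_vs_read_estimator() at the end of this file.
+ */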
+
+
+/*
+ * Make a summary from each cpu
+ */
+static void ip_vs_read_cpu_stats(struct ip_vs_kstats *sum,
+ struct ip_vs_cpu_stats __percpu *stats)
+{
+ int i;
+ bool add = false;
+
+ for_each_possible_cpu(i) {
+ struct ip_vs_cpu_stats *s = per_cpu_ptr(stats, i);
+ unsigned int start;
+ u64 conns, inpkts, outpkts, inbytes, outbytes;
+
+ if (add) {
+ do {
+ start = u64_stats_fetch_begin(&s->syncp);
+ conns = s->cnt.conns;
+ inpkts = s->cnt.inpkts;
+ outpkts = s->cnt.outpkts;
+ inbytes = s->cnt.inbytes;
+ outbytes = s->cnt.outbytes;
+ } while (u64_stats_fetch_retry(&s->syncp, start));
+ sum->conns += conns;
+ sum->inpkts += inpkts;
+ sum->outpkts += outpkts;
+ sum->inbytes += inbytes;
+ sum->outbytes += outbytes;
+ } else {
+ add = true;
+ do {
+ start = u64_stats_fetch_begin(&s->syncp);
+ sum->conns = s->cnt.conns;
+ sum->inpkts = s->cnt.inpkts;
+ sum->outpkts = s->cnt.outpkts;
+ sum->inbytes = s->cnt.inbytes;
+ sum->outbytes = s->cnt.outbytes;
+ } while (u64_stats_fetch_retry(&s->syncp, start));
+ }
+ }
+}
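+/*
+ * In ip_vs_read_cpu_stats() above, the u64_stats_fetch_begin/retry pair
+ * re-reads a CPU's counters if a writer updated them mid-read (on 64-bit
+ * kernels it is essentially a no-op). The add flag makes the first CPU
+ * initialise the sums, so no separate memset of *sum is needed.
+ */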
+
+
+static void estimation_timer(unsigned long arg)
+{
+ struct ip_vs_estimator *e;
+ struct ip_vs_stats *s;
+ u64 rate;
+ struct net *net = (struct net *)arg;
+ struct netns_ipvs *ipvs;
+
+ ipvs = net_ipvs(net);
+ spin_lock(&ipvs->est_lock);
+ list_for_each_entry(e, &ipvs->est_list, list) {
+ s = container_of(e, struct ip_vs_stats, est);
+
+ spin_lock(&s->lock);
+ ip_vs_read_cpu_stats(&s->kstats, s->cpustats);
+
+		/* scaled by 2^10, but divided by the 2 second interval */
+ rate = (s->kstats.conns - e->last_conns) << 9;
+ e->last_conns = s->kstats.conns;
+ e->cps += ((s64)rate - (s64)e->cps) >> 2;
+
+ rate = (s->kstats.inpkts - e->last_inpkts) << 9;
+ e->last_inpkts = s->kstats.inpkts;
+ e->inpps += ((s64)rate - (s64)e->inpps) >> 2;
+
+ rate = (s->kstats.outpkts - e->last_outpkts) << 9;
+ e->last_outpkts = s->kstats.outpkts;
+ e->outpps += ((s64)rate - (s64)e->outpps) >> 2;
+
+		/* scaled by 2^5, but divided by the 2 second interval */
+ rate = (s->kstats.inbytes - e->last_inbytes) << 4;
+ e->last_inbytes = s->kstats.inbytes;
+ e->inbps += ((s64)rate - (s64)e->inbps) >> 2;
+
+ rate = (s->kstats.outbytes - e->last_outbytes) << 4;
+ e->last_outbytes = s->kstats.outbytes;
+ e->outbps += ((s64)rate - (s64)e->outbps) >> 2;
+ spin_unlock(&s->lock);
+ }
+ spin_unlock(&ipvs->est_lock);
+ mod_timer(&ipvs->est_timer, jiffies + 2*HZ);
+}
+
+void ip_vs_start_estimator(struct net *net, struct ip_vs_stats *stats)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_estimator *est = &stats->est;
+
+ INIT_LIST_HEAD(&est->list);
+
+ spin_lock_bh(&ipvs->est_lock);
+ list_add(&est->list, &ipvs->est_list);
+ spin_unlock_bh(&ipvs->est_lock);
+}
+
+void ip_vs_stop_estimator(struct net *net, struct ip_vs_stats *stats)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_estimator *est = &stats->est;
+
+ spin_lock_bh(&ipvs->est_lock);
+ list_del(&est->list);
+ spin_unlock_bh(&ipvs->est_lock);
+}
+
+void ip_vs_zero_estimator(struct ip_vs_stats *stats)
+{
+ struct ip_vs_estimator *est = &stats->est;
+ struct ip_vs_kstats *k = &stats->kstats;
+
+ /* reset counters, caller must hold the stats->lock lock */
+ est->last_inbytes = k->inbytes;
+ est->last_outbytes = k->outbytes;
+ est->last_conns = k->conns;
+ est->last_inpkts = k->inpkts;
+ est->last_outpkts = k->outpkts;
+ est->cps = 0;
+ est->inpps = 0;
+ est->outpps = 0;
+ est->inbps = 0;
+ est->outbps = 0;
+}
+
+/* Get decoded rates */
+void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats)
+{
+ struct ip_vs_estimator *e = &stats->est;
+
+ dst->cps = (e->cps + 0x1FF) >> 10;
+ dst->inpps = (e->inpps + 0x1FF) >> 10;
+ dst->outpps = (e->outpps + 0x1FF) >> 10;
+ dst->inbps = (e->inbps + 0xF) >> 5;
+ dst->outbps = (e->outbps + 0xF) >> 5;
+}
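+/*
+ * Decoding sketch (illustrative values): the additions of 0x1FF and 0xF
+ * above bias the value up before the shift so it rounds approximately to
+ * the nearest integer instead of truncating, e.g. a stored cps of 51200
+ * decodes as (51200 + 0x1FF) >> 10 = 50 conns/s.
+ */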
+
+int __net_init ip_vs_estimator_net_init(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ INIT_LIST_HEAD(&ipvs->est_list);
+ spin_lock_init(&ipvs->est_lock);
+ setup_timer(&ipvs->est_timer, estimation_timer, (unsigned long)net);
+ mod_timer(&ipvs->est_timer, jiffies + 2 * HZ);
+ return 0;
+}
+
+void __net_exit ip_vs_estimator_net_cleanup(struct net *net)
+{
+ del_timer_sync(&net_ipvs(net)->est_timer);
+}
diff --git a/net/netfilter/ipvs/ip_vs_fo.c b/net/netfilter/ipvs/ip_vs_fo.c
new file mode 100644
index 000000000..e09874d02
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_fo.c
@@ -0,0 +1,79 @@
+/*
+ * IPVS: Weighted Fail Over module
+ *
+ * Authors: Kenny Mathis <kmathis@chokepoint.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ * Kenny Mathis : added initial functionality based on weight
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+/* Weighted Fail Over Module */
+static struct ip_vs_dest *
+ip_vs_fo_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
+{
+ struct ip_vs_dest *dest, *hweight = NULL;
+ int hw = 0; /* Track highest weight */
+
+ IP_VS_DBG(6, "ip_vs_fo_schedule(): Scheduling...\n");
+
+ /* Basic failover functionality
+	 * Find the real server with the highest weight and send it traffic
+ */
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
+ if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+ atomic_read(&dest->weight) > hw) {
+ hweight = dest;
+ hw = atomic_read(&dest->weight);
+ }
+ }
+
+ if (hweight) {
+ IP_VS_DBG_BUF(6, "FO: server %s:%u activeconns %d weight %d\n",
+ IP_VS_DBG_ADDR(hweight->af, &hweight->addr),
+ ntohs(hweight->port),
+ atomic_read(&hweight->activeconns),
+ atomic_read(&hweight->weight));
+ return hweight;
+ }
+
+ ip_vs_scheduler_err(svc, "no destination available");
+ return NULL;
+}
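+/*
+ * Behavioural sketch with made-up weights: given real servers of weight
+ * 100, 80 and 0, all traffic goes to the weight-100 server; if it gets
+ * flagged IP_VS_DEST_F_OVERLOAD the weight-80 server takes over, and a
+ * weight-0 (quiesced) server is never selected.
+ */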
+
+static struct ip_vs_scheduler ip_vs_fo_scheduler = {
+ .name = "fo",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_fo_scheduler.n_list),
+ .schedule = ip_vs_fo_schedule,
+};
+
+static int __init ip_vs_fo_init(void)
+{
+ return register_ip_vs_scheduler(&ip_vs_fo_scheduler);
+}
+
+static void __exit ip_vs_fo_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_fo_scheduler);
+ synchronize_rcu();
+}
+
+module_init(ip_vs_fo_init);
+module_exit(ip_vs_fo_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
new file mode 100644
index 000000000..5d3daae98
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -0,0 +1,503 @@
+/*
+ * ip_vs_ftp.c: IPVS ftp application module
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * Changes:
+ *
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Most code here is taken from ip_masq_ftp.c in kernel 2.2. The difference
+ * is that the ip_vs_ftp module handles the reverse direction compared to
+ * ip_masq_ftp.
+ *
+ * IP_MASQ_FTP ftp masquerading module
+ *
+ * Version: @(#)ip_masq_ftp.c 0.04 02/05/96
+ *
+ * Author: Wouter Gadeyne
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <linux/gfp.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <asm/unaligned.h>
+
+#include <net/ip_vs.h>
+
+
+#define SERVER_STRING "227 "
+#define CLIENT_STRING "PORT"
+
+
+/*
+ * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by the helper.
+ * The first port is set to the default port.
+ */
+static unsigned int ports_count = 1;
+static unsigned short ports[IP_VS_APP_MAX_PORTS] = {21, 0};
+module_param_array(ports, ushort, &ports_count, 0444);
+MODULE_PARM_DESC(ports, "Ports to monitor for FTP control commands");
+
+
+/* Dummy variable */
+static int ip_vs_ftp_pasv;
+
+
+static int
+ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
+{
+ /* We use connection tracking for the command connection */
+ cp->flags |= IP_VS_CONN_F_NFCT;
+ return 0;
+}
+
+
+static int
+ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
+{
+ return 0;
+}
+
+
+/*
+ * Get <addr,port> from the string "xxx,xxx,xxx,xxx,ppp,ppp", which
+ * starts with the "pattern", skips everything up to and including the
+ * "skip" character and stops at the "term" character.
+ * <addr,port> is returned in network byte order; see the worked
+ * example after this function.
+ */
+static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
+ const char *pattern, size_t plen,
+ char skip, char term,
+ __be32 *addr, __be16 *port,
+ char **start, char **end)
+{
+ char *s, c;
+ unsigned char p[6];
+ int i = 0;
+
+ if (data_limit - data < plen) {
+ /* check if there is partial match */
+ if (strncasecmp(data, pattern, data_limit - data) == 0)
+ return -1;
+ else
+ return 0;
+ }
+
+ if (strncasecmp(data, pattern, plen) != 0) {
+ return 0;
+ }
+ s = data + plen;
+ if (skip) {
+ int found = 0;
+
+ for (;; s++) {
+ if (s == data_limit)
+ return -1;
+ if (!found) {
+ if (*s == skip)
+ found = 1;
+ } else if (*s != skip) {
+ break;
+ }
+ }
+ }
+
+ for (data = s; ; data++) {
+ if (data == data_limit)
+ return -1;
+ if (*data == term)
+ break;
+ }
+ *end = data;
+
+ memset(p, 0, sizeof(p));
+ for (data = s; ; data++) {
+ c = *data;
+ if (c == term)
+ break;
+ if (c >= '0' && c <= '9') {
+ p[i] = p[i]*10 + c - '0';
+ } else if (c == ',' && i < 5) {
+ i++;
+ } else {
+ /* unexpected character */
+ return -1;
+ }
+ }
+
+ if (i != 5)
+ return -1;
+
+ *start = s;
+ *addr = get_unaligned((__be32 *) p);
+ *port = get_unaligned((__be16 *) (p + 4));
+ return 1;
+}
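+/*
+ * A worked example, assuming the PASV reply handling below: for the line
+ * "227 Entering Passive Mode (192,168,1,10,19,137)\r\n" with pattern
+ * "227 ", skip '(' and term ')', the six parsed octets are
+ * {192,168,1,10,19,137}, so *addr becomes 192.168.1.10 and *port becomes
+ * 19*256 + 137 = 5001, both already in network byte order.
+ */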
+
+/*
+ * Look at outgoing ftp packets to catch the response to a PASV command
+ * from the server (inside-to-outside).
+ * When we see one, we build a connection entry with the client address,
+ * client port 0 (unknown at the moment), the server address and the
+ * server port. Mark the current connection entry as a control channel
+ * of the new entry. All this work is done just so that the data
+ * connection can be scheduled to the right server later.
+ *
+ * The outgoing packet should be something like
+ * "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)".
+ * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number.
+ */
+static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
+ struct sk_buff *skb, int *diff)
+{
+ struct iphdr *iph;
+ struct tcphdr *th;
+ char *data, *data_limit;
+ char *start, *end;
+ union nf_inet_addr from;
+ __be16 port;
+ struct ip_vs_conn *n_cp;
+ char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */
+ unsigned int buf_len;
+ int ret = 0;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+ struct net *net;
+
+ *diff = 0;
+
+#ifdef CONFIG_IP_VS_IPV6
+ /* This application helper doesn't work with IPv6 yet,
+ * so turn this into a no-op for IPv6 packets
+ */
+ if (cp->af == AF_INET6)
+ return 1;
+#endif
+
+ /* Only useful for established sessions */
+ if (cp->state != IP_VS_TCP_S_ESTABLISHED)
+ return 1;
+
+ /* Linear packets are much easier to deal with. */
+ if (!skb_make_writable(skb, skb->len))
+ return 0;
+
+ if (cp->app_data == &ip_vs_ftp_pasv) {
+ iph = ip_hdr(skb);
+ th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
+ data = (char *)th + (th->doff << 2);
+ data_limit = skb_tail_pointer(skb);
+
+ if (ip_vs_ftp_get_addrport(data, data_limit,
+ SERVER_STRING,
+ sizeof(SERVER_STRING)-1,
+ '(', ')',
+ &from.ip, &port,
+ &start, &end) != 1)
+ return 1;
+
+ IP_VS_DBG(7, "PASV response (%pI4:%d) -> %pI4:%d detected\n",
+ &from.ip, ntohs(port), &cp->caddr.ip, 0);
+
+ /*
+		 * Now update or create a connection entry for it
+ */
+ {
+ struct ip_vs_conn_param p;
+ ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET,
+ iph->protocol, &from, port,
+ &cp->caddr, 0, &p);
+ n_cp = ip_vs_conn_out_get(&p);
+ }
+ if (!n_cp) {
+ struct ip_vs_conn_param p;
+ ip_vs_conn_fill_param(ip_vs_conn_net(cp),
+ AF_INET, IPPROTO_TCP, &cp->caddr,
+ 0, &cp->vaddr, port, &p);
+ /* As above, this is ipv4 only */
+ n_cp = ip_vs_conn_new(&p, AF_INET, &from, port,
+ IP_VS_CONN_F_NO_CPORT |
+ IP_VS_CONN_F_NFCT,
+ cp->dest, skb->mark);
+ if (!n_cp)
+ return 0;
+
+ /* add its controller */
+ ip_vs_control_add(n_cp, cp);
+ }
+
+ /*
+ * Replace the old passive address with the new one
+ */
+ from.ip = n_cp->vaddr.ip;
+ port = n_cp->vport;
+ snprintf(buf, sizeof(buf), "%u,%u,%u,%u,%u,%u",
+ ((unsigned char *)&from.ip)[0],
+ ((unsigned char *)&from.ip)[1],
+ ((unsigned char *)&from.ip)[2],
+ ((unsigned char *)&from.ip)[3],
+ ntohs(port) >> 8,
+ ntohs(port) & 0xFF);
+
+ buf_len = strlen(buf);
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct && !nf_ct_is_untracked(ct) && nfct_nat(ct)) {
+ /* If mangling fails this function will return 0
+ * which will cause the packet to be dropped.
+ * Mangling can only fail under memory pressure,
+ * hopefully it will succeed on the retransmitted
+ * packet.
+ */
+ rcu_read_lock();
+ ret = nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
+ iph->ihl * 4,
+ start-data, end-start,
+ buf, buf_len);
+ rcu_read_unlock();
+ if (ret) {
+ ip_vs_nfct_expect_related(skb, ct, n_cp,
+ IPPROTO_TCP, 0, 0);
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ /* csum is updated */
+ ret = 1;
+ }
+ }
+
+ /*
+ * Not setting 'diff' is intentional, otherwise the sequence
+ * would be adjusted twice.
+ */
+
+ net = skb_net(skb);
+ cp->app_data = NULL;
+ ip_vs_tcp_conn_listen(net, n_cp);
+ ip_vs_conn_put(n_cp);
+ return ret;
+ }
+ return 1;
+}
+
+
+/*
+ * Look at incoming ftp packets to catch the PASV/PORT command
+ * (outside-to-inside).
+ *
+ * The incoming packet having the PORT command should be something like
+ * "PORT xxx,xxx,xxx,xxx,ppp,ppp\n".
+ * xxx,xxx,xxx,xxx is the client address, ppp,ppp is the client port number.
+ * In this case, we create a connection entry using the client address and
+ * port, so that the active ftp data connection from the server can reach
+ * the client.
+ */
+static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
+ struct sk_buff *skb, int *diff)
+{
+ struct iphdr *iph;
+ struct tcphdr *th;
+ char *data, *data_start, *data_limit;
+ char *start, *end;
+ union nf_inet_addr to;
+ __be16 port;
+ struct ip_vs_conn *n_cp;
+ struct net *net;
+
+ /* no diff required for incoming packets */
+ *diff = 0;
+
+#ifdef CONFIG_IP_VS_IPV6
+ /* This application helper doesn't work with IPv6 yet,
+ * so turn this into a no-op for IPv6 packets
+ */
+ if (cp->af == AF_INET6)
+ return 1;
+#endif
+
+ /* Only useful for established sessions */
+ if (cp->state != IP_VS_TCP_S_ESTABLISHED)
+ return 1;
+
+ /* Linear packets are much easier to deal with. */
+ if (!skb_make_writable(skb, skb->len))
+ return 0;
+
+ /*
+	 * Detect whether the client is requesting passive mode
+ */
+ iph = ip_hdr(skb);
+ th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
+
+	/* Since there may be options in the TCP header and doff gives
+	   the header length in 32-bit words, the data address is
+	   correctly calculated as th + doff*4 */
+ data = data_start = (char *)th + (th->doff << 2);
+ data_limit = skb_tail_pointer(skb);
+
+ while (data <= data_limit - 6) {
+ if (strncasecmp(data, "PASV\r\n", 6) == 0) {
+ /* Passive mode on */
+ IP_VS_DBG(7, "got PASV at %td of %td\n",
+ data - data_start,
+ data_limit - data_start);
+ cp->app_data = &ip_vs_ftp_pasv;
+ return 1;
+ }
+ data++;
+ }
+
+ /*
+	 * To support a virtual FTP server, the scenario is as follows:
+ * FTP client ----> Load Balancer ----> FTP server
+ * First detect the port number in the application data,
+	 * then create a new connection entry for the incoming data
+ * connection.
+ */
+ if (ip_vs_ftp_get_addrport(data_start, data_limit,
+ CLIENT_STRING, sizeof(CLIENT_STRING)-1,
+ ' ', '\r', &to.ip, &port,
+ &start, &end) != 1)
+ return 1;
+
+ IP_VS_DBG(7, "PORT %pI4:%d detected\n", &to.ip, ntohs(port));
+
+ /* Passive mode off */
+ cp->app_data = NULL;
+
+ /*
+ * Now update or create a connection entry for it
+ */
+ IP_VS_DBG(7, "protocol %s %pI4:%d %pI4:%d\n",
+ ip_vs_proto_name(iph->protocol),
+ &to.ip, ntohs(port), &cp->vaddr.ip, 0);
+
+ {
+ struct ip_vs_conn_param p;
+ ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET,
+ iph->protocol, &to, port, &cp->vaddr,
+ htons(ntohs(cp->vport)-1), &p);
+ n_cp = ip_vs_conn_in_get(&p);
+ if (!n_cp) {
+ /* This is ipv4 only */
+ n_cp = ip_vs_conn_new(&p, AF_INET, &cp->daddr,
+ htons(ntohs(cp->dport)-1),
+ IP_VS_CONN_F_NFCT, cp->dest,
+ skb->mark);
+ if (!n_cp)
+ return 0;
+
+ /* add its controller */
+ ip_vs_control_add(n_cp, cp);
+ }
+ }
+
+ /*
+	 * Move the new data connection to listen state
+ */
+ net = skb_net(skb);
+ ip_vs_tcp_conn_listen(net, n_cp);
+ ip_vs_conn_put(n_cp);
+
+ return 1;
+}
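+/*
+ * A worked example of the template built above (illustrative addresses):
+ * for "PORT 10,1,1,5,19,137" arriving on virtual service 192.168.0.1:21,
+ * a connection entry client 10.1.1.5:5001 -> 192.168.0.1:20 (vport-1,
+ * the ftp-data port) is created and bound to dport-1 on the chosen real
+ * server, so the active ftp data connection opened by the real server
+ * can reach the client.
+ */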
+
+
+static struct ip_vs_app ip_vs_ftp = {
+ .name = "ftp",
+ .type = IP_VS_APP_TYPE_FTP,
+ .protocol = IPPROTO_TCP,
+ .module = THIS_MODULE,
+ .incs_list = LIST_HEAD_INIT(ip_vs_ftp.incs_list),
+ .init_conn = ip_vs_ftp_init_conn,
+ .done_conn = ip_vs_ftp_done_conn,
+ .bind_conn = NULL,
+ .unbind_conn = NULL,
+ .pkt_out = ip_vs_ftp_out,
+ .pkt_in = ip_vs_ftp_in,
+};
+
+/*
+ * per netns ip_vs_ftp initialization
+ */
+static int __net_init __ip_vs_ftp_init(struct net *net)
+{
+ int i, ret;
+ struct ip_vs_app *app;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ if (!ipvs)
+ return -ENOENT;
+
+ app = register_ip_vs_app(net, &ip_vs_ftp);
+ if (IS_ERR(app))
+ return PTR_ERR(app);
+
+ for (i = 0; i < ports_count; i++) {
+ if (!ports[i])
+ continue;
+ ret = register_ip_vs_app_inc(net, app, app->protocol, ports[i]);
+ if (ret)
+ goto err_unreg;
+ pr_info("%s: loaded support on port[%d] = %d\n",
+ app->name, i, ports[i]);
+ }
+ return 0;
+
+err_unreg:
+ unregister_ip_vs_app(net, &ip_vs_ftp);
+ return ret;
+}
+/*
+ * netns exit
+ */
+static void __ip_vs_ftp_exit(struct net *net)
+{
+ unregister_ip_vs_app(net, &ip_vs_ftp);
+}
+
+static struct pernet_operations ip_vs_ftp_ops = {
+ .init = __ip_vs_ftp_init,
+ .exit = __ip_vs_ftp_exit,
+};
+
+static int __init ip_vs_ftp_init(void)
+{
+ int rv;
+
+ rv = register_pernet_subsys(&ip_vs_ftp_ops);
+ /* rcu_barrier() is called by netns on error */
+ return rv;
+}
+
+/*
+ * ip_vs_ftp finish.
+ */
+static void __exit ip_vs_ftp_exit(void)
+{
+ unregister_pernet_subsys(&ip_vs_ftp_ops);
+ /* rcu_barrier() is called by netns */
+}
+
+
+module_init(ip_vs_ftp_init);
+module_exit(ip_vs_ftp_exit);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
new file mode 100644
index 000000000..127f14046
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -0,0 +1,633 @@
+/*
+ * IPVS: Locality-Based Least-Connection scheduling module
+ *
+ * Authors: Wensong Zhang <wensong@gnuchina.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ * Martin Hamilton : fixed the terrible locking bugs
+ * *lock(tbl->lock) ==> *lock(&tbl->lock)
+ * Wensong Zhang : fixed the uninitialized tbl->lock bug
+ * Wensong Zhang : added doing full expiration check to
+ * collect stale entries of 24+ hours when
+ * no partial expire check in a half hour
+ * Julian Anastasov : replaced del_timer call with del_timer_sync
+ * to avoid the possible race between timer
+ * handler and del_timer thread in SMP
+ *
+ */
+
+/*
+ * The lblc algorithm is as follows (pseudo code):
+ *
+ * if cachenode[dest_ip] is null then
+ * n, cachenode[dest_ip] <- {weighted least-conn node};
+ * else
+ * n <- cachenode[dest_ip];
+ * if (n is dead) OR
+ * (n.conns>n.weight AND
+ * there is a node m with m.conns<m.weight/2) then
+ * n, cachenode[dest_ip] <- {weighted least-conn node};
+ *
+ * return n;
+ *
+ * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing
+ * me to write this module.
+ */
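+/*
+ * A small worked example of the reassignment test above (numbers are
+ * illustrative): if the cached node n has n.conns = 12 and n.weight = 8
+ * while some other node m has m.conns = 3 and m.weight = 10, then both
+ * n.conns > n.weight and m.conns < m.weight/2 hold, so the cache entry
+ * for dest_ip is rewritten to the current weighted least-connection node
+ * instead of sticking to n.
+ */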
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/ip.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/jiffies.h>
+
+/* for sysctl */
+#include <linux/fs.h>
+#include <linux/sysctl.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ * It is for garbage collection of stale IPVS lblc entries,
+ * when the table is full.
+ */
+#define CHECK_EXPIRE_INTERVAL (60*HZ)
+#define ENTRY_TIMEOUT (6*60*HZ)
+
+#define DEFAULT_EXPIRATION (24*60*60*HZ)
+
+/*
+ * It is for full expiration check.
+ * When there is no partial expiration check (garbage collection)
+ * in a half hour, do a full expiration check to collect stale
+ * entries that haven't been touched for a day.
+ */
+#define COUNT_FOR_FULL_EXPIRATION 30
+
+
+/*
+ * for IPVS lblc entry hash table
+ */
+#ifndef CONFIG_IP_VS_LBLC_TAB_BITS
+#define CONFIG_IP_VS_LBLC_TAB_BITS 10
+#endif
+#define IP_VS_LBLC_TAB_BITS CONFIG_IP_VS_LBLC_TAB_BITS
+#define IP_VS_LBLC_TAB_SIZE (1 << IP_VS_LBLC_TAB_BITS)
+#define IP_VS_LBLC_TAB_MASK (IP_VS_LBLC_TAB_SIZE - 1)
+
+
+/*
+ * IPVS lblc entry represents an association between destination
+ * IP address and its destination server
+ */
+struct ip_vs_lblc_entry {
+ struct hlist_node list;
+ int af; /* address family */
+ union nf_inet_addr addr; /* destination IP address */
+ struct ip_vs_dest *dest; /* real server (cache) */
+ unsigned long lastuse; /* last used time */
+ struct rcu_head rcu_head;
+};
+
+
+/*
+ * IPVS lblc hash table
+ */
+struct ip_vs_lblc_table {
+ struct rcu_head rcu_head;
+ struct hlist_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */
+ struct timer_list periodic_timer; /* collect stale entries */
+ atomic_t entries; /* number of entries */
+ int max_size; /* maximum size of entries */
+ int rover; /* rover for expire check */
+ int counter; /* counter for no expire */
+ bool dead;
+};
+
+
+/*
+ * IPVS LBLC sysctl table
+ */
+#ifdef CONFIG_SYSCTL
+static struct ctl_table vs_vars_table[] = {
+ {
+ .procname = "lblc_expiration",
+ .data = NULL,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ { }
+};
+#endif
+
+static void ip_vs_lblc_rcu_free(struct rcu_head *head)
+{
+ struct ip_vs_lblc_entry *en = container_of(head,
+ struct ip_vs_lblc_entry,
+ rcu_head);
+
+ ip_vs_dest_put_and_free(en->dest);
+ kfree(en);
+}
+
+static inline void ip_vs_lblc_del(struct ip_vs_lblc_entry *en)
+{
+ hlist_del_rcu(&en->list);
+ call_rcu(&en->rcu_head, ip_vs_lblc_rcu_free);
+}
+
+/*
+ * Returns hash value for IPVS LBLC entry
+ */
+static inline unsigned int
+ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr)
+{
+ __be32 addr_fold = addr->ip;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ addr_fold = addr->ip6[0]^addr->ip6[1]^
+ addr->ip6[2]^addr->ip6[3];
+#endif
+ return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLC_TAB_MASK;
+}
+
+
+/*
+ * Hash an entry into the ip_vs_lblc_table.
+ */
+static void
+ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
+{
+ unsigned int hash = ip_vs_lblc_hashkey(en->af, &en->addr);
+
+ hlist_add_head_rcu(&en->list, &tbl->bucket[hash]);
+ atomic_inc(&tbl->entries);
+}
+
+
+/* Get ip_vs_lblc_entry associated with supplied parameters. */
+static inline struct ip_vs_lblc_entry *
+ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl,
+ const union nf_inet_addr *addr)
+{
+ unsigned int hash = ip_vs_lblc_hashkey(af, addr);
+ struct ip_vs_lblc_entry *en;
+
+ hlist_for_each_entry_rcu(en, &tbl->bucket[hash], list)
+ if (ip_vs_addr_equal(af, &en->addr, addr))
+ return en;
+
+ return NULL;
+}
+
+
+/*
+ * Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP
+ * address to a server. Called under spin lock.
+ */
+static inline struct ip_vs_lblc_entry *
+ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr,
+ u16 af, struct ip_vs_dest *dest)
+{
+ struct ip_vs_lblc_entry *en;
+
+ en = ip_vs_lblc_get(af, tbl, daddr);
+ if (en) {
+ if (en->dest == dest)
+ return en;
+ ip_vs_lblc_del(en);
+ }
+ en = kmalloc(sizeof(*en), GFP_ATOMIC);
+ if (!en)
+ return NULL;
+
+ en->af = af;
+ ip_vs_addr_copy(af, &en->addr, daddr);
+ en->lastuse = jiffies;
+
+ ip_vs_dest_hold(dest);
+ en->dest = dest;
+
+ ip_vs_lblc_hash(tbl, en);
+
+ return en;
+}
+
+
+/*
+ * Flush all the entries of the specified table.
+ */
+static void ip_vs_lblc_flush(struct ip_vs_service *svc)
+{
+ struct ip_vs_lblc_table *tbl = svc->sched_data;
+ struct ip_vs_lblc_entry *en;
+ struct hlist_node *next;
+ int i;
+
+ spin_lock_bh(&svc->sched_lock);
+ tbl->dead = 1;
+ for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) {
+ hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) {
+ ip_vs_lblc_del(en);
+ atomic_dec(&tbl->entries);
+ }
+ }
+ spin_unlock_bh(&svc->sched_lock);
+}
+
+static int sysctl_lblc_expiration(struct ip_vs_service *svc)
+{
+#ifdef CONFIG_SYSCTL
+ struct netns_ipvs *ipvs = net_ipvs(svc->net);
+ return ipvs->sysctl_lblc_expiration;
+#else
+ return DEFAULT_EXPIRATION;
+#endif
+}
+
+static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
+{
+ struct ip_vs_lblc_table *tbl = svc->sched_data;
+ struct ip_vs_lblc_entry *en;
+ struct hlist_node *next;
+ unsigned long now = jiffies;
+ int i, j;
+
+ for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) {
+ j = (j + 1) & IP_VS_LBLC_TAB_MASK;
+
+ spin_lock(&svc->sched_lock);
+ hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
+ if (time_before(now,
+ en->lastuse +
+ sysctl_lblc_expiration(svc)))
+ continue;
+
+ ip_vs_lblc_del(en);
+ atomic_dec(&tbl->entries);
+ }
+ spin_unlock(&svc->sched_lock);
+ }
+ tbl->rover = j;
+}
+
+
+/*
+ * Periodic timer handler for the IPVS lblc table
+ * It is used to collect stale entries when the number of entries
+ * exceeds the maximum size of the table.
+ *
+ * Fixme: we probably need a more complicated algorithm to collect
+ * entries that have not been used for a long time even
+ * if the number of entries doesn't exceed the maximum size
+ * of the table.
+ * The full expiration check is for this purpose now.
+ */
+static void ip_vs_lblc_check_expire(unsigned long data)
+{
+ struct ip_vs_service *svc = (struct ip_vs_service *) data;
+ struct ip_vs_lblc_table *tbl = svc->sched_data;
+ unsigned long now = jiffies;
+ int goal;
+ int i, j;
+ struct ip_vs_lblc_entry *en;
+ struct hlist_node *next;
+
+ if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
+ /* do full expiration check */
+ ip_vs_lblc_full_check(svc);
+ tbl->counter = 1;
+ goto out;
+ }
+
+ if (atomic_read(&tbl->entries) <= tbl->max_size) {
+ tbl->counter++;
+ goto out;
+ }
+
+ goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
+ if (goal > tbl->max_size/2)
+ goal = tbl->max_size/2;
+
+ for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) {
+ j = (j + 1) & IP_VS_LBLC_TAB_MASK;
+
+ spin_lock(&svc->sched_lock);
+ hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
+ if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
+ continue;
+
+ ip_vs_lblc_del(en);
+ atomic_dec(&tbl->entries);
+ goal--;
+ }
+ spin_unlock(&svc->sched_lock);
+ if (goal <= 0)
+ break;
+ }
+ tbl->rover = j;
+
+ out:
+ mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
+}
+
+
+static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
+{
+ int i;
+ struct ip_vs_lblc_table *tbl;
+
+ /*
+ * Allocate the ip_vs_lblc_table for this service
+ */
+ tbl = kmalloc(sizeof(*tbl), GFP_KERNEL);
+ if (tbl == NULL)
+ return -ENOMEM;
+
+ svc->sched_data = tbl;
+ IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for "
+ "current service\n", sizeof(*tbl));
+
+ /*
+ * Initialize the hash buckets
+ */
+ for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) {
+ INIT_HLIST_HEAD(&tbl->bucket[i]);
+ }
+ tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
+ tbl->rover = 0;
+ tbl->counter = 1;
+ tbl->dead = 0;
+
+ /*
+ * Hook periodic timer for garbage collection
+ */
+ setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire,
+ (unsigned long)svc);
+ mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
+
+ return 0;
+}
+
+
+static void ip_vs_lblc_done_svc(struct ip_vs_service *svc)
+{
+ struct ip_vs_lblc_table *tbl = svc->sched_data;
+
+ /* remove periodic timer */
+ del_timer_sync(&tbl->periodic_timer);
+
+ /* got to clean up table entries here */
+ ip_vs_lblc_flush(svc);
+
+ /* release the table itself */
+ kfree_rcu(tbl, rcu_head);
+ IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n",
+ sizeof(*tbl));
+}
+
+
+static inline struct ip_vs_dest *
+__ip_vs_lblc_schedule(struct ip_vs_service *svc)
+{
+ struct ip_vs_dest *dest, *least;
+ int loh, doh;
+
+ /*
+ * We use the following formula to estimate the load:
+ * (dest overhead) / dest->weight
+ *
+ * Remember -- no floats in kernel mode!!!
+ * The comparison of h1*w2 > h2*w1 is equivalent to that of
+ * h1/w1 > h2/w2
+ * if every weight is larger than zero.
+ *
+ * The server with weight=0 is quiesced and will not receive any
+ * new connection.
+ */
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+ continue;
+ if (atomic_read(&dest->weight) > 0) {
+ least = dest;
+ loh = ip_vs_dest_conn_overhead(least);
+ goto nextstage;
+ }
+ }
+ return NULL;
+
+ /*
+ * Find the destination with the least load.
+ */
+ nextstage:
+ list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+ continue;
+
+ doh = ip_vs_dest_conn_overhead(dest);
+ if ((__s64)loh * atomic_read(&dest->weight) >
+ (__s64)doh * atomic_read(&least->weight)) {
+ least = dest;
+ loh = doh;
+ }
+ }
+
+ IP_VS_DBG_BUF(6, "LBLC: server %s:%d "
+ "activeconns %d refcnt %d weight %d overhead %d\n",
+ IP_VS_DBG_ADDR(least->af, &least->addr),
+ ntohs(least->port),
+ atomic_read(&least->activeconns),
+ atomic_read(&least->refcnt),
+ atomic_read(&least->weight), loh);
+
+ return least;
+}
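+/*
+ * Worked example for the weighted comparison in __ip_vs_lblc_schedule()
+ * above (illustrative numbers): with loh = 10, least->weight = 2 and
+ * doh = 12, dest->weight = 3, the test 10*3 > 12*2 (30 > 24) holds,
+ * i.e. 10/2 > 12/3, so dest replaces least as the least loaded server.
+ */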
+
+
+/*
+ * If this destination server is overloaded and there is a less loaded
+ * server, then return true.
+ */
+static inline int
+is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
+{
+ if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
+ struct ip_vs_dest *d;
+
+ list_for_each_entry_rcu(d, &svc->destinations, n_list) {
+ if (atomic_read(&d->activeconns)*2
+ < atomic_read(&d->weight)) {
+ return 1;
+ }
+ }
+ }
+ return 0;
+}
+
+
+/*
+ * Locality-Based (weighted) Least-Connection scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
+{
+ struct ip_vs_lblc_table *tbl = svc->sched_data;
+ struct ip_vs_dest *dest = NULL;
+ struct ip_vs_lblc_entry *en;
+
+ IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
+
+ /* First look in our cache */
+ en = ip_vs_lblc_get(svc->af, tbl, &iph->daddr);
+ if (en) {
+ /* We only hold a read lock, but this is atomic */
+ en->lastuse = jiffies;
+
+ /*
+ * If the destination is not available, i.e. it's in the trash,
+ * we must ignore it, as it may be removed from under our feet,
+ * if someone drops our reference count. Our caller only makes
+ * sure that destinations, that are not in the trash, are not
+ * moved to the trash, while we are scheduling. But anyone can
+ * free up entries from the trash at any time.
+ */
+
+ dest = en->dest;
+ if ((dest->flags & IP_VS_DEST_F_AVAILABLE) &&
+ atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc))
+ goto out;
+ }
+
+ /* No cache entry or it is invalid, time to schedule */
+ dest = __ip_vs_lblc_schedule(svc);
+ if (!dest) {
+ ip_vs_scheduler_err(svc, "no destination available");
+ return NULL;
+ }
+
+ /* If we fail to create a cache entry, we'll just use the valid dest */
+ spin_lock_bh(&svc->sched_lock);
+ if (!tbl->dead)
+ ip_vs_lblc_new(tbl, &iph->daddr, svc->af, dest);
+ spin_unlock_bh(&svc->sched_lock);
+
+out:
+ IP_VS_DBG_BUF(6, "LBLC: destination IP address %s --> server %s:%d\n",
+ IP_VS_DBG_ADDR(svc->af, &iph->daddr),
+ IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port));
+
+ return dest;
+}
+
+
+/*
+ * IPVS LBLC Scheduler structure
+ */
+static struct ip_vs_scheduler ip_vs_lblc_scheduler = {
+ .name = "lblc",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list),
+ .init_service = ip_vs_lblc_init_svc,
+ .done_service = ip_vs_lblc_done_svc,
+ .schedule = ip_vs_lblc_schedule,
+};
+
+/*
+ * per netns init.
+ */
+#ifdef CONFIG_SYSCTL
+static int __net_init __ip_vs_lblc_init(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ if (!ipvs)
+ return -ENOENT;
+
+ if (!net_eq(net, &init_net)) {
+ ipvs->lblc_ctl_table = kmemdup(vs_vars_table,
+ sizeof(vs_vars_table),
+ GFP_KERNEL);
+ if (ipvs->lblc_ctl_table == NULL)
+ return -ENOMEM;
+
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ ipvs->lblc_ctl_table[0].procname = NULL;
+
+ } else
+ ipvs->lblc_ctl_table = vs_vars_table;
+ ipvs->sysctl_lblc_expiration = DEFAULT_EXPIRATION;
+ ipvs->lblc_ctl_table[0].data = &ipvs->sysctl_lblc_expiration;
+
+ ipvs->lblc_ctl_header =
+ register_net_sysctl(net, "net/ipv4/vs", ipvs->lblc_ctl_table);
+ if (!ipvs->lblc_ctl_header) {
+ if (!net_eq(net, &init_net))
+ kfree(ipvs->lblc_ctl_table);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void __net_exit __ip_vs_lblc_exit(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ unregister_net_sysctl_table(ipvs->lblc_ctl_header);
+
+ if (!net_eq(net, &init_net))
+ kfree(ipvs->lblc_ctl_table);
+}
+
+#else
+
+static int __net_init __ip_vs_lblc_init(struct net *net) { return 0; }
+static void __net_exit __ip_vs_lblc_exit(struct net *net) { }
+
+#endif
+
+static struct pernet_operations ip_vs_lblc_ops = {
+ .init = __ip_vs_lblc_init,
+ .exit = __ip_vs_lblc_exit,
+};
+
+static int __init ip_vs_lblc_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&ip_vs_lblc_ops);
+ if (ret)
+ return ret;
+
+ ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler);
+ if (ret)
+ unregister_pernet_subsys(&ip_vs_lblc_ops);
+ return ret;
+}
+
+static void __exit ip_vs_lblc_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler);
+ unregister_pernet_subsys(&ip_vs_lblc_ops);
+ rcu_barrier();
+}
+
+
+module_init(ip_vs_lblc_init);
+module_exit(ip_vs_lblc_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
new file mode 100644
index 000000000..2229d2d8b
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -0,0 +1,818 @@
+/*
+ * IPVS: Locality-Based Least-Connection with Replication scheduler
+ *
+ * Authors: Wensong Zhang <wensong@gnuchina.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ * Julian Anastasov : Added the missing (dest->weight>0)
+ * condition in the ip_vs_dest_set_max.
+ *
+ */
+
+/*
+ * The lblc/r algorithm is as follows (pseudo code):
+ *
+ * if serverSet[dest_ip] is null then
+ * n, serverSet[dest_ip] <- {weighted least-conn node};
+ * else
+ * n <- {least-conn (alive) node in serverSet[dest_ip]};
+ * if (n is null) OR
+ * (n.conns>n.weight AND
+ * there is a node m with m.conns<m.weight/2) then
+ * n <- {weighted least-conn node};
+ * add n to serverSet[dest_ip];
+ * if |serverSet[dest_ip]| > 1 AND
+ * now - serverSet[dest_ip].lastMod > T then
+ * m <- {most conn node in serverSet[dest_ip]};
+ * remove m from serverSet[dest_ip];
+ * if serverSet[dest_ip] changed then
+ * serverSet[dest_ip].lastMod <- now;
+ *
+ * return n;
+ *
+ */
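+/*
+ * An informal reading of the pseudo code above (timings illustrative):
+ * each destination IP keeps a small set of real servers, and a request
+ * goes to the least loaded alive member of that set. If even that member
+ * is busy (conns > weight) while some other node is mostly idle
+ * (conns < weight/2), the overall weighted least-connection node is
+ * added to the set. Once the set has more than one member and has not
+ * been modified for longer than lblcr_expiration (24 hours by default),
+ * the busiest member is dropped, so the set slowly shrinks back towards
+ * a single server.
+ */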
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/ip.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/jiffies.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+
+/* for sysctl */
+#include <linux/fs.h>
+#include <linux/sysctl.h>
+#include <net/net_namespace.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ * It is for garbage collection of stale IPVS lblcr entries,
+ * when the table is full.
+ */
+#define CHECK_EXPIRE_INTERVAL (60*HZ)
+#define ENTRY_TIMEOUT (6*60*HZ)
+
+#define DEFAULT_EXPIRATION (24*60*60*HZ)
+
+/*
+ * It is for full expiration check.
+ * When there is no partial expiration check (garbage collection)
+ * in a half hour, do a full expiration check to collect stale
+ * entries that haven't been touched for a day.
+ */
+#define COUNT_FOR_FULL_EXPIRATION 30
+
+/*
+ * for IPVS lblcr entry hash table
+ */
+#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS
+#define CONFIG_IP_VS_LBLCR_TAB_BITS 10
+#endif
+#define IP_VS_LBLCR_TAB_BITS CONFIG_IP_VS_LBLCR_TAB_BITS
+#define IP_VS_LBLCR_TAB_SIZE (1 << IP_VS_LBLCR_TAB_BITS)
+#define IP_VS_LBLCR_TAB_MASK (IP_VS_LBLCR_TAB_SIZE - 1)
+
+
+/*
+ * IPVS destination set structure and operations
+ */
+struct ip_vs_dest_set_elem {
+ struct list_head list; /* list link */
+ struct ip_vs_dest *dest; /* destination server */
+ struct rcu_head rcu_head;
+};
+
+struct ip_vs_dest_set {
+ atomic_t size; /* set size */
+ unsigned long lastmod; /* last modified time */
+ struct list_head list; /* destination list */
+};
+
+
+static void ip_vs_dest_set_insert(struct ip_vs_dest_set *set,
+ struct ip_vs_dest *dest, bool check)
+{
+ struct ip_vs_dest_set_elem *e;
+
+ if (check) {
+ list_for_each_entry(e, &set->list, list) {
+ if (e->dest == dest)
+ return;
+ }
+ }
+
+ e = kmalloc(sizeof(*e), GFP_ATOMIC);
+ if (e == NULL)
+ return;
+
+ ip_vs_dest_hold(dest);
+ e->dest = dest;
+
+ list_add_rcu(&e->list, &set->list);
+ atomic_inc(&set->size);
+
+ set->lastmod = jiffies;
+}
+
+static void ip_vs_lblcr_elem_rcu_free(struct rcu_head *head)
+{
+ struct ip_vs_dest_set_elem *e;
+
+ e = container_of(head, struct ip_vs_dest_set_elem, rcu_head);
+ ip_vs_dest_put_and_free(e->dest);
+ kfree(e);
+}
+
+static void
+ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
+{
+ struct ip_vs_dest_set_elem *e;
+
+ list_for_each_entry(e, &set->list, list) {
+ if (e->dest == dest) {
+ /* HIT */
+ atomic_dec(&set->size);
+ set->lastmod = jiffies;
+ list_del_rcu(&e->list);
+ call_rcu(&e->rcu_head, ip_vs_lblcr_elem_rcu_free);
+ break;
+ }
+ }
+}
+
+static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
+{
+ struct ip_vs_dest_set_elem *e, *ep;
+
+ list_for_each_entry_safe(e, ep, &set->list, list) {
+ list_del_rcu(&e->list);
+ call_rcu(&e->rcu_head, ip_vs_lblcr_elem_rcu_free);
+ }
+}
+
+/* get weighted least-connection node in the destination set */
+static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
+{
+ register struct ip_vs_dest_set_elem *e;
+ struct ip_vs_dest *dest, *least;
+ int loh, doh;
+
+ /* select the first destination server, whose weight > 0 */
+ list_for_each_entry_rcu(e, &set->list, list) {
+ least = e->dest;
+ if (least->flags & IP_VS_DEST_F_OVERLOAD)
+ continue;
+
+ if ((atomic_read(&least->weight) > 0)
+ && (least->flags & IP_VS_DEST_F_AVAILABLE)) {
+ loh = ip_vs_dest_conn_overhead(least);
+ goto nextstage;
+ }
+ }
+ return NULL;
+
+ /* find the destination with the weighted least load */
+ nextstage:
+ list_for_each_entry_continue_rcu(e, &set->list, list) {
+ dest = e->dest;
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+ continue;
+
+ doh = ip_vs_dest_conn_overhead(dest);
+ if (((__s64)loh * atomic_read(&dest->weight) >
+ (__s64)doh * atomic_read(&least->weight))
+ && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
+ least = dest;
+ loh = doh;
+ }
+ }
+
+ IP_VS_DBG_BUF(6, "%s(): server %s:%d "
+ "activeconns %d refcnt %d weight %d overhead %d\n",
+ __func__,
+ IP_VS_DBG_ADDR(least->af, &least->addr),
+ ntohs(least->port),
+ atomic_read(&least->activeconns),
+ atomic_read(&least->refcnt),
+ atomic_read(&least->weight), loh);
+ return least;
+}
+
+
+/* get weighted most-connection node in the destination set */
+static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
+{
+ register struct ip_vs_dest_set_elem *e;
+ struct ip_vs_dest *dest, *most;
+ int moh, doh;
+
+ if (set == NULL)
+ return NULL;
+
+ /* select the first destination server, whose weight > 0 */
+ list_for_each_entry(e, &set->list, list) {
+ most = e->dest;
+ if (atomic_read(&most->weight) > 0) {
+ moh = ip_vs_dest_conn_overhead(most);
+ goto nextstage;
+ }
+ }
+ return NULL;
+
+ /* find the destination with the weighted most load */
+ nextstage:
+ list_for_each_entry_continue(e, &set->list, list) {
+ dest = e->dest;
+ doh = ip_vs_dest_conn_overhead(dest);
+ /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */
+ if (((__s64)moh * atomic_read(&dest->weight) <
+ (__s64)doh * atomic_read(&most->weight))
+ && (atomic_read(&dest->weight) > 0)) {
+ most = dest;
+ moh = doh;
+ }
+ }
+
+ IP_VS_DBG_BUF(6, "%s(): server %s:%d "
+ "activeconns %d refcnt %d weight %d overhead %d\n",
+ __func__,
+ IP_VS_DBG_ADDR(most->af, &most->addr), ntohs(most->port),
+ atomic_read(&most->activeconns),
+ atomic_read(&most->refcnt),
+ atomic_read(&most->weight), moh);
+ return most;
+}
+
+
+/*
+ * IPVS lblcr entry represents an association between destination
+ * IP address and its destination server set
+ */
+struct ip_vs_lblcr_entry {
+ struct hlist_node list;
+ int af; /* address family */
+ union nf_inet_addr addr; /* destination IP address */
+ struct ip_vs_dest_set set; /* destination server set */
+ unsigned long lastuse; /* last used time */
+ struct rcu_head rcu_head;
+};
+
+
+/*
+ * IPVS lblcr hash table
+ */
+struct ip_vs_lblcr_table {
+ struct rcu_head rcu_head;
+ struct hlist_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */
+ atomic_t entries; /* number of entries */
+ int max_size; /* maximum size of entries */
+ struct timer_list periodic_timer; /* collect stale entries */
+ int rover; /* rover for expire check */
+ int counter; /* counter for no expire */
+ bool dead;
+};
+
+
+#ifdef CONFIG_SYSCTL
+/*
+ * IPVS LBLCR sysctl table
+ */
+
+static struct ctl_table vs_vars_table[] = {
+ {
+ .procname = "lblcr_expiration",
+ .data = NULL,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ { }
+};
+#endif
+
+static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
+{
+ hlist_del_rcu(&en->list);
+ ip_vs_dest_set_eraseall(&en->set);
+ kfree_rcu(en, rcu_head);
+}
+
+
+/*
+ * Returns hash value for IPVS LBLCR entry
+ */
+static inline unsigned int
+ip_vs_lblcr_hashkey(int af, const union nf_inet_addr *addr)
+{
+ __be32 addr_fold = addr->ip;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ addr_fold = addr->ip6[0]^addr->ip6[1]^
+ addr->ip6[2]^addr->ip6[3];
+#endif
+ return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLCR_TAB_MASK;
+}
+
+
+/*
+ * Hash an entry into the ip_vs_lblcr_table.
+ */
+static void
+ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
+{
+ unsigned int hash = ip_vs_lblcr_hashkey(en->af, &en->addr);
+
+ hlist_add_head_rcu(&en->list, &tbl->bucket[hash]);
+ atomic_inc(&tbl->entries);
+}
+
+
+/* Get ip_vs_lblcr_entry associated with supplied parameters. */
+static inline struct ip_vs_lblcr_entry *
+ip_vs_lblcr_get(int af, struct ip_vs_lblcr_table *tbl,
+ const union nf_inet_addr *addr)
+{
+ unsigned int hash = ip_vs_lblcr_hashkey(af, addr);
+ struct ip_vs_lblcr_entry *en;
+
+ hlist_for_each_entry_rcu(en, &tbl->bucket[hash], list)
+ if (ip_vs_addr_equal(af, &en->addr, addr))
+ return en;
+
+ return NULL;
+}
+
+
+/*
+ * Create or update an ip_vs_lblcr_entry, which is a mapping of a destination
+ * IP address to a server. Called under spin lock.
+ */
+static inline struct ip_vs_lblcr_entry *
+ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, const union nf_inet_addr *daddr,
+ u16 af, struct ip_vs_dest *dest)
+{
+ struct ip_vs_lblcr_entry *en;
+
+ en = ip_vs_lblcr_get(af, tbl, daddr);
+ if (!en) {
+ en = kmalloc(sizeof(*en), GFP_ATOMIC);
+ if (!en)
+ return NULL;
+
+ en->af = af;
+ ip_vs_addr_copy(af, &en->addr, daddr);
+ en->lastuse = jiffies;
+
+ /* initialize its dest set */
+ atomic_set(&(en->set.size), 0);
+ INIT_LIST_HEAD(&en->set.list);
+
+ ip_vs_dest_set_insert(&en->set, dest, false);
+
+ ip_vs_lblcr_hash(tbl, en);
+ return en;
+ }
+
+ ip_vs_dest_set_insert(&en->set, dest, true);
+
+ return en;
+}
+
+
+/*
+ * Flush all the entries of the specified table.
+ */
+static void ip_vs_lblcr_flush(struct ip_vs_service *svc)
+{
+ struct ip_vs_lblcr_table *tbl = svc->sched_data;
+ int i;
+ struct ip_vs_lblcr_entry *en;
+ struct hlist_node *next;
+
+ spin_lock_bh(&svc->sched_lock);
+ tbl->dead = 1;
+ for (i = 0; i < IP_VS_LBLCR_TAB_SIZE; i++) {
+ hlist_for_each_entry_safe(en, next, &tbl->bucket[i], list) {
+ ip_vs_lblcr_free(en);
+ }
+ }
+ spin_unlock_bh(&svc->sched_lock);
+}
+
+static int sysctl_lblcr_expiration(struct ip_vs_service *svc)
+{
+#ifdef CONFIG_SYSCTL
+ struct netns_ipvs *ipvs = net_ipvs(svc->net);
+ return ipvs->sysctl_lblcr_expiration;
+#else
+ return DEFAULT_EXPIRATION;
+#endif
+}
+
+static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc)
+{
+ struct ip_vs_lblcr_table *tbl = svc->sched_data;
+ unsigned long now = jiffies;
+ int i, j;
+ struct ip_vs_lblcr_entry *en;
+ struct hlist_node *next;
+
+ for (i = 0, j = tbl->rover; i < IP_VS_LBLCR_TAB_SIZE; i++) {
+ j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
+
+ spin_lock(&svc->sched_lock);
+ hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
+ if (time_after(en->lastuse +
+ sysctl_lblcr_expiration(svc), now))
+ continue;
+
+ ip_vs_lblcr_free(en);
+ atomic_dec(&tbl->entries);
+ }
+ spin_unlock(&svc->sched_lock);
+ }
+ tbl->rover = j;
+}
+
+
+/*
+ * Periodic timer handler for the IPVS lblcr table
+ * It is used to collect stale entries when the number of entries
+ * exceeds the maximum size of the table.
+ *
+ * Fixme: we probably need a more complicated algorithm to collect
+ * entries that have not been used for a long time even
+ * if the number of entries doesn't exceed the maximum size
+ * of the table.
+ * The full expiration check is for this purpose now.
+ */
+static void ip_vs_lblcr_check_expire(unsigned long data)
+{
+ struct ip_vs_service *svc = (struct ip_vs_service *) data;
+ struct ip_vs_lblcr_table *tbl = svc->sched_data;
+ unsigned long now = jiffies;
+ int goal;
+ int i, j;
+ struct ip_vs_lblcr_entry *en;
+ struct hlist_node *next;
+
+ if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
+ /* do full expiration check */
+ ip_vs_lblcr_full_check(svc);
+ tbl->counter = 1;
+ goto out;
+ }
+
+ if (atomic_read(&tbl->entries) <= tbl->max_size) {
+ tbl->counter++;
+ goto out;
+ }
+
+ goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
+ if (goal > tbl->max_size/2)
+ goal = tbl->max_size/2;
+
+ for (i = 0, j = tbl->rover; i < IP_VS_LBLCR_TAB_SIZE; i++) {
+ j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
+
+ spin_lock(&svc->sched_lock);
+ hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
+ if (time_before(now, en->lastuse+ENTRY_TIMEOUT))
+ continue;
+
+ ip_vs_lblcr_free(en);
+ atomic_dec(&tbl->entries);
+ goal--;
+ }
+ spin_unlock(&svc->sched_lock);
+ if (goal <= 0)
+ break;
+ }
+ tbl->rover = j;
+
+ out:
+ mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
+}
+
+static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
+{
+ int i;
+ struct ip_vs_lblcr_table *tbl;
+
+ /*
+ * Allocate the ip_vs_lblcr_table for this service
+ */
+ tbl = kmalloc(sizeof(*tbl), GFP_KERNEL);
+ if (tbl == NULL)
+ return -ENOMEM;
+
+ svc->sched_data = tbl;
+ IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for "
+ "current service\n", sizeof(*tbl));
+
+ /*
+ * Initialize the hash buckets
+ */
+ for (i = 0; i < IP_VS_LBLCR_TAB_SIZE; i++) {
+ INIT_HLIST_HEAD(&tbl->bucket[i]);
+ }
+ tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
+ tbl->rover = 0;
+ tbl->counter = 1;
+ tbl->dead = 0;
+
+ /*
+ * Hook periodic timer for garbage collection
+ */
+ setup_timer(&tbl->periodic_timer, ip_vs_lblcr_check_expire,
+ (unsigned long)svc);
+ mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);
+
+ return 0;
+}
+
+
+static void ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
+{
+ struct ip_vs_lblcr_table *tbl = svc->sched_data;
+
+ /* remove periodic timer */
+ del_timer_sync(&tbl->periodic_timer);
+
+ /* got to clean up table entries here */
+ ip_vs_lblcr_flush(svc);
+
+ /* release the table itself */
+ kfree_rcu(tbl, rcu_head);
+ IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n",
+ sizeof(*tbl));
+}
+
+
+static inline struct ip_vs_dest *
+__ip_vs_lblcr_schedule(struct ip_vs_service *svc)
+{
+ struct ip_vs_dest *dest, *least;
+ int loh, doh;
+
+ /*
+ * We use the following formula to estimate the load:
+ * (dest overhead) / dest->weight
+ *
+ * Remember -- no floats in kernel mode!!!
+ * The comparison of h1*w2 > h2*w1 is equivalent to that of
+ * h1/w1 > h2/w2
+ * if every weight is larger than zero.
+ *
+ * The server with weight=0 is quiesced and will not receive any
+ * new connection.
+ */
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+ continue;
+
+ if (atomic_read(&dest->weight) > 0) {
+ least = dest;
+ loh = ip_vs_dest_conn_overhead(least);
+ goto nextstage;
+ }
+ }
+ return NULL;
+
+ /*
+ * Find the destination with the least load.
+ */
+ nextstage:
+ list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+ continue;
+
+ doh = ip_vs_dest_conn_overhead(dest);
+ if ((__s64)loh * atomic_read(&dest->weight) >
+ (__s64)doh * atomic_read(&least->weight)) {
+ least = dest;
+ loh = doh;
+ }
+ }
+
+ IP_VS_DBG_BUF(6, "LBLCR: server %s:%d "
+ "activeconns %d refcnt %d weight %d overhead %d\n",
+ IP_VS_DBG_ADDR(least->af, &least->addr),
+ ntohs(least->port),
+ atomic_read(&least->activeconns),
+ atomic_read(&least->refcnt),
+ atomic_read(&least->weight), loh);
+
+ return least;
+}
+
+
+/*
+ * If this destination server is overloaded and there is a less loaded
+ * server, then return true.
+ */
+static inline int
+is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
+{
+ if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
+ struct ip_vs_dest *d;
+
+ list_for_each_entry_rcu(d, &svc->destinations, n_list) {
+ if (atomic_read(&d->activeconns)*2
+ < atomic_read(&d->weight)) {
+ return 1;
+ }
+ }
+ }
+ return 0;
+}
+
+
+/*
+ * Locality-Based (weighted) Least-Connection with Replication scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
+{
+ struct ip_vs_lblcr_table *tbl = svc->sched_data;
+ struct ip_vs_dest *dest;
+ struct ip_vs_lblcr_entry *en;
+
+ IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
+
+ /* First look in our cache */
+ en = ip_vs_lblcr_get(svc->af, tbl, &iph->daddr);
+ if (en) {
+ en->lastuse = jiffies;
+
+ /* Get the least loaded destination */
+ dest = ip_vs_dest_set_min(&en->set);
+
+ /* More than one destination + enough time passed by, cleanup */
+ if (atomic_read(&en->set.size) > 1 &&
+ time_after(jiffies, en->set.lastmod +
+ sysctl_lblcr_expiration(svc))) {
+ spin_lock_bh(&svc->sched_lock);
+ if (atomic_read(&en->set.size) > 1) {
+ struct ip_vs_dest *m;
+
+ m = ip_vs_dest_set_max(&en->set);
+ if (m)
+ ip_vs_dest_set_erase(&en->set, m);
+ }
+ spin_unlock_bh(&svc->sched_lock);
+ }
+
+ /* If the destination is not overloaded, use it */
+ if (dest && !is_overloaded(dest, svc))
+ goto out;
+
+ /* The cache entry is invalid, time to schedule */
+ dest = __ip_vs_lblcr_schedule(svc);
+ if (!dest) {
+ ip_vs_scheduler_err(svc, "no destination available");
+ return NULL;
+ }
+
+ /* Update our cache entry */
+ spin_lock_bh(&svc->sched_lock);
+ if (!tbl->dead)
+ ip_vs_dest_set_insert(&en->set, dest, true);
+ spin_unlock_bh(&svc->sched_lock);
+ goto out;
+ }
+
+ /* No cache entry, time to schedule */
+ dest = __ip_vs_lblcr_schedule(svc);
+ if (!dest) {
+ IP_VS_DBG(1, "no destination available\n");
+ return NULL;
+ }
+
+ /* If we fail to create a cache entry, we'll just use the valid dest */
+ spin_lock_bh(&svc->sched_lock);
+ if (!tbl->dead)
+ ip_vs_lblcr_new(tbl, &iph->daddr, svc->af, dest);
+ spin_unlock_bh(&svc->sched_lock);
+
+out:
+ IP_VS_DBG_BUF(6, "LBLCR: destination IP address %s --> server %s:%d\n",
+ IP_VS_DBG_ADDR(svc->af, &iph->daddr),
+ IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port));
+
+ return dest;
+}
+
+
+/*
+ * IPVS LBLCR Scheduler structure
+ */
+static struct ip_vs_scheduler ip_vs_lblcr_scheduler =
+{
+ .name = "lblcr",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_lblcr_scheduler.n_list),
+ .init_service = ip_vs_lblcr_init_svc,
+ .done_service = ip_vs_lblcr_done_svc,
+ .schedule = ip_vs_lblcr_schedule,
+};
+
+/*
+ * per netns init.
+ */
+#ifdef CONFIG_SYSCTL
+static int __net_init __ip_vs_lblcr_init(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ if (!ipvs)
+ return -ENOENT;
+
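+	/* Every non-init netns gets its own copy of the sysctl table, so
+	 * that the .data pointer can refer to this netns' expiration value.
+	 */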
+ if (!net_eq(net, &init_net)) {
+ ipvs->lblcr_ctl_table = kmemdup(vs_vars_table,
+ sizeof(vs_vars_table),
+ GFP_KERNEL);
+ if (ipvs->lblcr_ctl_table == NULL)
+ return -ENOMEM;
+
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ ipvs->lblcr_ctl_table[0].procname = NULL;
+ } else
+ ipvs->lblcr_ctl_table = vs_vars_table;
+ ipvs->sysctl_lblcr_expiration = DEFAULT_EXPIRATION;
+ ipvs->lblcr_ctl_table[0].data = &ipvs->sysctl_lblcr_expiration;
+
+ ipvs->lblcr_ctl_header =
+ register_net_sysctl(net, "net/ipv4/vs", ipvs->lblcr_ctl_table);
+ if (!ipvs->lblcr_ctl_header) {
+ if (!net_eq(net, &init_net))
+ kfree(ipvs->lblcr_ctl_table);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void __net_exit __ip_vs_lblcr_exit(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ unregister_net_sysctl_table(ipvs->lblcr_ctl_header);
+
+ if (!net_eq(net, &init_net))
+ kfree(ipvs->lblcr_ctl_table);
+}
+
+#else
+
+static int __net_init __ip_vs_lblcr_init(struct net *net) { return 0; }
+static void __net_exit __ip_vs_lblcr_exit(struct net *net) { }
+
+#endif
+
+static struct pernet_operations ip_vs_lblcr_ops = {
+ .init = __ip_vs_lblcr_init,
+ .exit = __ip_vs_lblcr_exit,
+};
+
+static int __init ip_vs_lblcr_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&ip_vs_lblcr_ops);
+ if (ret)
+ return ret;
+
+ ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
+ if (ret)
+ unregister_pernet_subsys(&ip_vs_lblcr_ops);
+ return ret;
+}
+
+static void __exit ip_vs_lblcr_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
+ unregister_pernet_subsys(&ip_vs_lblcr_ops);
+ rcu_barrier();
+}
+
+
+module_init(ip_vs_lblcr_init);
+module_exit(ip_vs_lblcr_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_lc.c b/net/netfilter/ipvs/ip_vs_lc.c
new file mode 100644
index 000000000..19a0769a9
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_lc.c
@@ -0,0 +1,93 @@
+/*
+ * IPVS: Least-Connection Scheduling module
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ * Wensong Zhang : added the ip_vs_lc_update_svc
+ * Wensong Zhang : added any dest with weight=0 is quiesced
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+/*
+ * Least Connection scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
+{
+ struct ip_vs_dest *dest, *least = NULL;
+ unsigned int loh = 0, doh;
+
+ IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
+
+	/*
+	 * Simply select the server with the least connection overhead
+	 * (see ip_vs_dest_conn_overhead(), which weights active
+	 * connections much more heavily than inactive ones), except
+	 * servers whose weight is zero.
+	 * A weight of zero means the server is quiesced: existing
+	 * connections to it are still served, but no new connections
+	 * are assigned to it.
+	 */
+
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
+ if ((dest->flags & IP_VS_DEST_F_OVERLOAD) ||
+ atomic_read(&dest->weight) == 0)
+ continue;
+ doh = ip_vs_dest_conn_overhead(dest);
+ if (!least || doh < loh) {
+ least = dest;
+ loh = doh;
+ }
+ }
+
+ if (!least)
+ ip_vs_scheduler_err(svc, "no destination available");
+ else
+ IP_VS_DBG_BUF(6, "LC: server %s:%u activeconns %d "
+ "inactconns %d\n",
+ IP_VS_DBG_ADDR(least->af, &least->addr),
+ ntohs(least->port),
+ atomic_read(&least->activeconns),
+ atomic_read(&least->inactconns));
+
+ return least;
+}
+
+
+static struct ip_vs_scheduler ip_vs_lc_scheduler = {
+ .name = "lc",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_lc_scheduler.n_list),
+ .schedule = ip_vs_lc_schedule,
+};
+
+
+static int __init ip_vs_lc_init(void)
+{
+	return register_ip_vs_scheduler(&ip_vs_lc_scheduler);
+}
+
+static void __exit ip_vs_lc_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_lc_scheduler);
+ synchronize_rcu();
+}
+
+module_init(ip_vs_lc_init);
+module_exit(ip_vs_lc_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c
new file mode 100644
index 000000000..5882bbfd1
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_nfct.c
@@ -0,0 +1,299 @@
+/*
+ * ip_vs_nfct.c: Netfilter connection tracking support for IPVS
+ *
+ * Portions Copyright (C) 2001-2002
+ * Antefacto Ltd, 181 Parnell St, Dublin 1, Ireland.
+ *
+ * Portions Copyright (C) 2003-2010
+ * Julian Anastasov
+ *
+ *
+ * This code is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * Authors:
+ * Ben North <ben@redfrontdoor.org>
+ * Julian Anastasov <ja@ssi.bg> Reorganize and sync with latest kernels
+ * Hannes Eder <heder@google.com> Extend NFCT support for FTP, ipvs match
+ *
+ *
+ * Current status:
+ *
+ * - provide conntrack confirmation for new and related connections, by
+ * this way we can see their proper conntrack state in all hooks
+ * - support for all forwarding methods, not only NAT
+ * - FTP support (NAT), ability to support other NAT apps with expectations
+ * - to correctly create expectations for related NAT connections, the proper
+ *   NF conntrack support must already be installed, e.g. ip_vs_ftp requires
+ *   nf_conntrack_ftp ... iptables_nat for the same ports (but no iptables
+ *   NAT rules are needed)
+ * - alter reply for NAT when forwarding packet in original direction:
+ * conntrack from client in NEW or RELATED (Passive FTP DATA) state or
+ * when RELATED conntrack is created from real server (Active FTP DATA)
+ * - if iptables_nat is not loaded, Passive FTP will not work (the
+ *   PASV response cannot be NAT-ed) but Active FTP should work
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/compiler.h>
+#include <linux/vmalloc.h>
+#include <linux/skbuff.h>
+#include <net/ip.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/ip_vs.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+
+
+#define FMT_TUPLE "%pI4:%u->%pI4:%u/%u"
+#define ARG_TUPLE(T) &(T)->src.u3.ip, ntohs((T)->src.u.all), \
+ &(T)->dst.u3.ip, ntohs((T)->dst.u.all), \
+ (T)->dst.protonum
+
+#define FMT_CONN "%pI4:%u->%pI4:%u->%pI4:%u/%u:%u"
+#define ARG_CONN(C) &((C)->caddr.ip), ntohs((C)->cport), \
+ &((C)->vaddr.ip), ntohs((C)->vport), \
+ &((C)->daddr.ip), ntohs((C)->dport), \
+ (C)->protocol, (C)->state
+
+void
+ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ struct nf_conntrack_tuple new_tuple;
+
+ if (ct == NULL || nf_ct_is_confirmed(ct) || nf_ct_is_untracked(ct) ||
+ nf_ct_is_dying(ct))
+ return;
+
+ /* Never alter conntrack for non-NAT conns */
+ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
+ return;
+
+ /* Alter reply only in original direction */
+ if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
+ return;
+
+ /* Applications may adjust TCP seqs */
+ if (cp->app && nf_ct_protonum(ct) == IPPROTO_TCP &&
+ !nfct_seqadj(ct) && !nfct_seqadj_ext_add(ct))
+ return;
+
+ /*
+ * The connection is not yet in the hashtable, so we update it.
+ * CIP->VIP will remain the same, so leave the tuple in
+ * IP_CT_DIR_ORIGINAL untouched. When the reply comes back from the
+ * real-server we will see RIP->DIP.
+ */
+ new_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+ /*
+ * This will also take care of UDP and other protocols.
+ */
+ if (outin) {
+ new_tuple.src.u3 = cp->daddr;
+ if (new_tuple.dst.protonum != IPPROTO_ICMP &&
+ new_tuple.dst.protonum != IPPROTO_ICMPV6)
+ new_tuple.src.u.tcp.port = cp->dport;
+ } else {
+ new_tuple.dst.u3 = cp->vaddr;
+ if (new_tuple.dst.protonum != IPPROTO_ICMP &&
+ new_tuple.dst.protonum != IPPROTO_ICMPV6)
+ new_tuple.dst.u.tcp.port = cp->vport;
+ }
+ IP_VS_DBG(7, "%s: Updating conntrack ct=%p, status=0x%lX, "
+ "ctinfo=%d, old reply=" FMT_TUPLE
+ ", new reply=" FMT_TUPLE ", cp=" FMT_CONN "\n",
+ __func__, ct, ct->status, ctinfo,
+ ARG_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple),
+ ARG_TUPLE(&new_tuple), ARG_CONN(cp));
+ nf_conntrack_alter_reply(ct, &new_tuple);
+}
+
+int ip_vs_confirm_conntrack(struct sk_buff *skb)
+{
+ return nf_conntrack_confirm(skb);
+}
+
+/*
+ * Called from init_conntrack() as expectfn handler.
+ */
+static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
+ struct nf_conntrack_expect *exp)
+{
+ struct nf_conntrack_tuple *orig, new_reply;
+ struct ip_vs_conn *cp;
+ struct ip_vs_conn_param p;
+ struct net *net = nf_ct_net(ct);
+
+ if (exp->tuple.src.l3num != PF_INET)
+ return;
+
+ /*
+ * We assume that no NF locks are held before this callback.
+ * ip_vs_conn_out_get and ip_vs_conn_in_get should match their
+ * expectations even if they use wildcard values, now we provide the
+ * actual values from the newly created original conntrack direction.
+ * The conntrack is confirmed when packet reaches IPVS hooks.
+ */
+
+ /* RS->CLIENT */
+ orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+ ip_vs_conn_fill_param(net, exp->tuple.src.l3num, orig->dst.protonum,
+ &orig->src.u3, orig->src.u.tcp.port,
+ &orig->dst.u3, orig->dst.u.tcp.port, &p);
+ cp = ip_vs_conn_out_get(&p);
+ if (cp) {
+ /* Change reply CLIENT->RS to CLIENT->VS */
+ new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+ IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
+ FMT_TUPLE ", found inout cp=" FMT_CONN "\n",
+ __func__, ct, ct->status,
+ ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+ ARG_CONN(cp));
+ new_reply.dst.u3 = cp->vaddr;
+ new_reply.dst.u.tcp.port = cp->vport;
+ IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE
+ ", inout cp=" FMT_CONN "\n",
+ __func__, ct,
+ ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+ ARG_CONN(cp));
+ goto alter;
+ }
+
+ /* CLIENT->VS */
+ cp = ip_vs_conn_in_get(&p);
+ if (cp) {
+ /* Change reply VS->CLIENT to RS->CLIENT */
+ new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+ IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
+ FMT_TUPLE ", found outin cp=" FMT_CONN "\n",
+ __func__, ct, ct->status,
+ ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+ ARG_CONN(cp));
+ new_reply.src.u3 = cp->daddr;
+ new_reply.src.u.tcp.port = cp->dport;
+ IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", "
+ FMT_TUPLE ", outin cp=" FMT_CONN "\n",
+ __func__, ct,
+ ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+ ARG_CONN(cp));
+ goto alter;
+ }
+
+ IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuple=" FMT_TUPLE
+ " - unknown expect\n",
+ __func__, ct, ct->status, ARG_TUPLE(orig));
+ return;
+
+alter:
+ /* Never alter conntrack for non-NAT conns */
+ if (IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ)
+ nf_conntrack_alter_reply(ct, &new_reply);
+ ip_vs_conn_put(cp);
+ return;
+}
+
+/*
+ * Create NF conntrack expectation with wildcard (optional) source port.
+ * Then the default callback function will alter the reply and will confirm
+ * the conntrack entry when the first packet comes.
+ * Use port 0 to expect connection from any port.
+ */
+void ip_vs_nfct_expect_related(struct sk_buff *skb, struct nf_conn *ct,
+ struct ip_vs_conn *cp, u_int8_t proto,
+ const __be16 port, int from_rs)
+{
+ struct nf_conntrack_expect *exp;
+
+ if (ct == NULL || nf_ct_is_untracked(ct))
+ return;
+
+ exp = nf_ct_expect_alloc(ct);
+ if (!exp)
+ return;
+
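+	/*
+	 * from_rs selects the expected direction: from the real server
+	 * (cp->daddr) towards the client, or from the client (cp->caddr)
+	 * towards the virtual address.  The destination port is the
+	 * client port or the virtual port respectively, while the source
+	 * port is the optional wildcard @port (0 = any).
+	 */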
+ nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
+ from_rs ? &cp->daddr : &cp->caddr,
+ from_rs ? &cp->caddr : &cp->vaddr,
+ proto, port ? &port : NULL,
+ from_rs ? &cp->cport : &cp->vport);
+
+ exp->expectfn = ip_vs_nfct_expect_callback;
+
+ IP_VS_DBG(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n",
+ __func__, ct, ARG_TUPLE(&exp->tuple));
+ nf_ct_expect_related(exp);
+ nf_ct_expect_put(exp);
+}
+EXPORT_SYMBOL(ip_vs_nfct_expect_related);
+
+/*
+ * Our connection was terminated, try to drop the conntrack immediately
+ */
+void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp)
+{
+ struct nf_conntrack_tuple_hash *h;
+ struct nf_conn *ct;
+ struct nf_conntrack_tuple tuple;
+
+ if (!cp->cport)
+ return;
+
+ tuple = (struct nf_conntrack_tuple) {
+ .dst = { .protonum = cp->protocol, .dir = IP_CT_DIR_ORIGINAL } };
+ tuple.src.u3 = cp->caddr;
+ tuple.src.u.all = cp->cport;
+ tuple.src.l3num = cp->af;
+ tuple.dst.u3 = cp->vaddr;
+ tuple.dst.u.all = cp->vport;
+
+ IP_VS_DBG(7, "%s: dropping conntrack with tuple=" FMT_TUPLE
+ " for conn " FMT_CONN "\n",
+ __func__, ARG_TUPLE(&tuple), ARG_CONN(cp));
+
+ h = nf_conntrack_find_get(ip_vs_conn_net(cp), NF_CT_DEFAULT_ZONE,
+ &tuple);
+ if (h) {
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ /* Show what happens instead of calling nf_ct_kill() */
+ if (del_timer(&ct->timeout)) {
+ IP_VS_DBG(7, "%s: ct=%p, deleted conntrack timer for tuple="
+ FMT_TUPLE "\n",
+ __func__, ct, ARG_TUPLE(&tuple));
+ if (ct->timeout.function)
+ ct->timeout.function(ct->timeout.data);
+ } else {
+ IP_VS_DBG(7, "%s: ct=%p, no conntrack timer for tuple="
+ FMT_TUPLE "\n",
+ __func__, ct, ARG_TUPLE(&tuple));
+ }
+ nf_ct_put(ct);
+ } else {
+ IP_VS_DBG(7, "%s: no conntrack for tuple=" FMT_TUPLE "\n",
+ __func__, ARG_TUPLE(&tuple));
+ }
+}
+
diff --git a/net/netfilter/ipvs/ip_vs_nq.c b/net/netfilter/ipvs/ip_vs_nq.c
new file mode 100644
index 000000000..a8b63401e
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_nq.c
@@ -0,0 +1,143 @@
+/*
+ * IPVS: Never Queue scheduling module
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+/*
+ * The NQ algorithm adopts a two-speed model. When there is an idle server
+ * available, the job will be sent to the idle server, instead of waiting
+ * for a fast one. When there is no idle server available, the job will be
+ * sent to the server that minimizes its expected delay (the Shortest
+ * Expected Delay scheduling algorithm).
+ *
+ * See the following paper for more information:
+ * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing
+ * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88,
+ * pages 986-994, 1988.
+ *
+ * Thanks must go to Marko Buuri <marko@buuri.name> for telling me about NQ.
+ *
+ * The difference between NQ and SED is that NQ can improve overall
+ * system utilization.
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+
+static inline int
+ip_vs_nq_dest_overhead(struct ip_vs_dest *dest)
+{
+ /*
+ * We only use the active connection number in the cost
+ * calculation here.
+ */
+ return atomic_read(&dest->activeconns) + 1;
+}
+
+
+/*
+ *	Never Queue scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
+{
+ struct ip_vs_dest *dest, *least = NULL;
+ int loh = 0, doh;
+
+ IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
+
+ /*
+ * We calculate the load of each dest server as follows:
+ * (server expected overhead) / dest->weight
+ *
+ * Remember -- no floats in kernel mode!!!
+ * The comparison of h1*w2 > h2*w1 is equivalent to that of
+ * h1/w1 > h2/w2
+ * if every weight is larger than zero.
+ *
+ * The server with weight=0 is quiesced and will not receive any
+ * new connections.
+ */
+
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
+
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD ||
+ !atomic_read(&dest->weight))
+ continue;
+
+ doh = ip_vs_nq_dest_overhead(dest);
+
+ /* return the server directly if it is idle */
+ if (atomic_read(&dest->activeconns) == 0) {
+ least = dest;
+ loh = doh;
+ goto out;
+ }
+
+ if (!least ||
+ ((__s64)loh * atomic_read(&dest->weight) >
+ (__s64)doh * atomic_read(&least->weight))) {
+ least = dest;
+ loh = doh;
+ }
+ }
+
+ if (!least) {
+ ip_vs_scheduler_err(svc, "no destination available");
+ return NULL;
+ }
+
+ out:
+ IP_VS_DBG_BUF(6, "NQ: server %s:%u "
+ "activeconns %d refcnt %d weight %d overhead %d\n",
+ IP_VS_DBG_ADDR(least->af, &least->addr),
+ ntohs(least->port),
+ atomic_read(&least->activeconns),
+ atomic_read(&least->refcnt),
+ atomic_read(&least->weight), loh);
+
+ return least;
+}
+
+
+static struct ip_vs_scheduler ip_vs_nq_scheduler =
+{
+ .name = "nq",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_nq_scheduler.n_list),
+ .schedule = ip_vs_nq_schedule,
+};
+
+
+static int __init ip_vs_nq_init(void)
+{
+ return register_ip_vs_scheduler(&ip_vs_nq_scheduler);
+}
+
+static void __exit ip_vs_nq_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_nq_scheduler);
+ synchronize_rcu();
+}
+
+module_init(ip_vs_nq_init);
+module_exit(ip_vs_nq_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c
new file mode 100644
index 000000000..0df17caa8
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_pe.c
@@ -0,0 +1,110 @@
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <asm/string.h>
+#include <linux/kmod.h>
+#include <linux/sysctl.h>
+
+#include <net/ip_vs.h>
+
+/* IPVS pe list */
+static LIST_HEAD(ip_vs_pe);
+
+/* mutex protecting the IPVS PE list. */
+static DEFINE_MUTEX(ip_vs_pe_mutex);
+
+/* Get pe in the pe list by name */
+struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name)
+{
+ struct ip_vs_pe *pe;
+
+ IP_VS_DBG(10, "%s(): pe_name \"%s\"\n", __func__,
+ pe_name);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(pe, &ip_vs_pe, n_list) {
+ /* Test and get the modules atomically */
+ if (pe->module &&
+ !try_module_get(pe->module)) {
+ /* This pe is just deleted */
+ continue;
+ }
+		if (strcmp(pe_name, pe->name) == 0) {
+ /* HIT */
+ rcu_read_unlock();
+ return pe;
+ }
+ module_put(pe->module);
+ }
+ rcu_read_unlock();
+
+ return NULL;
+}
+
+/* Lookup pe and try to load it if it doesn't exist */
+struct ip_vs_pe *ip_vs_pe_getbyname(const char *name)
+{
+ struct ip_vs_pe *pe;
+
+ /* Search for the pe by name */
+ pe = __ip_vs_pe_getbyname(name);
+
+ /* If pe not found, load the module and search again */
+ if (!pe) {
+ request_module("ip_vs_pe_%s", name);
+ pe = __ip_vs_pe_getbyname(name);
+ }
+
+ return pe;
+}
+
+/* Register a pe in the pe list */
+int register_ip_vs_pe(struct ip_vs_pe *pe)
+{
+ struct ip_vs_pe *tmp;
+
+ /* increase the module use count */
+ ip_vs_use_count_inc();
+
+ mutex_lock(&ip_vs_pe_mutex);
+ /* Make sure that the pe with this name doesn't exist
+ * in the pe list.
+ */
+ list_for_each_entry(tmp, &ip_vs_pe, n_list) {
+ if (strcmp(tmp->name, pe->name) == 0) {
+ mutex_unlock(&ip_vs_pe_mutex);
+ ip_vs_use_count_dec();
+			pr_err("%s(): [%s] pe already exists "
+			       "in the system\n", __func__, pe->name);
+ return -EINVAL;
+ }
+ }
+ /* Add it into the d-linked pe list */
+ list_add_rcu(&pe->n_list, &ip_vs_pe);
+ mutex_unlock(&ip_vs_pe_mutex);
+
+ pr_info("[%s] pe registered.\n", pe->name);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(register_ip_vs_pe);
+
+/* Unregister a pe from the pe list */
+int unregister_ip_vs_pe(struct ip_vs_pe *pe)
+{
+ mutex_lock(&ip_vs_pe_mutex);
+ /* Remove it from the d-linked pe list */
+ list_del_rcu(&pe->n_list);
+ mutex_unlock(&ip_vs_pe_mutex);
+
+ /* decrease the module use count */
+ ip_vs_use_count_dec();
+
+ pr_info("[%s] pe unregistered.\n", pe->name);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(unregister_ip_vs_pe);
diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c
new file mode 100644
index 000000000..bed5f7042
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_pe_sip.c
@@ -0,0 +1,171 @@
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+#include <net/netfilter/nf_conntrack.h>
+#include <linux/netfilter/nf_conntrack_sip.h>
+
+#ifdef CONFIG_IP_VS_DEBUG
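+/* Copy a (possibly truncated) Call-ID into the debug buffer at offset
+ * *idx, NUL-terminate it and advance *idx, so that several Call-IDs can
+ * be formatted within a single debug message.
+ */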
+static const char *ip_vs_dbg_callid(char *buf, size_t buf_len,
+ const char *callid, size_t callid_len,
+ int *idx)
+{
+ size_t max_len = 64;
+ size_t len = min3(max_len, callid_len, buf_len - *idx - 1);
+ memcpy(buf + *idx, callid, len);
+ buf[*idx+len] = '\0';
+ *idx += len + 1;
+ return buf + *idx - len;
+}
+
+#define IP_VS_DEBUG_CALLID(callid, len) \
+ ip_vs_dbg_callid(ip_vs_dbg_buf, sizeof(ip_vs_dbg_buf), \
+ callid, len, &ip_vs_dbg_idx)
+#endif
+
+static int get_callid(const char *dptr, unsigned int dataoff,
+ unsigned int datalen,
+ unsigned int *matchoff, unsigned int *matchlen)
+{
+ /* Find callid */
+ while (1) {
+ int ret = ct_sip_get_header(NULL, dptr, dataoff, datalen,
+ SIP_HDR_CALL_ID, matchoff,
+ matchlen);
+ if (ret > 0)
+ break;
+ if (!ret)
+ return -EINVAL;
+ dataoff += *matchoff;
+ }
+
+ /* Too large is useless */
+ if (*matchlen > IP_VS_PEDATA_MAXLEN)
+ return -EINVAL;
+
+ /* SIP headers are always followed by a line terminator */
+ if (*matchoff + *matchlen == datalen)
+ return -EINVAL;
+
+ /* RFC 2543 allows lines to be terminated with CR, LF or CRLF,
+ * RFC 3261 allows only CRLF, we support both. */
+ if (*(dptr + *matchoff + *matchlen) != '\r' &&
+ *(dptr + *matchoff + *matchlen) != '\n')
+ return -EINVAL;
+
+ IP_VS_DBG_BUF(9, "SIP callid %s (%d bytes)\n",
+ IP_VS_DEBUG_CALLID(dptr + *matchoff, *matchlen),
+ *matchlen);
+ return 0;
+}
+
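+/*
+ * Use the SIP Call-ID of the packet as the connection's persistence
+ * data, so that all requests belonging to the same call are scheduled
+ * to the same real server.
+ */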
+static int
+ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb)
+{
+ struct ip_vs_iphdr iph;
+ unsigned int dataoff, datalen, matchoff, matchlen;
+ const char *dptr;
+ int retc;
+
+ ip_vs_fill_iph_skb(p->af, skb, &iph);
+
+ /* Only useful with UDP */
+ if (iph.protocol != IPPROTO_UDP)
+ return -EINVAL;
+ /* todo: IPv6 fragments:
+	 *       I think this should only be done for the first fragment. /HS
+ */
+ dataoff = iph.len + sizeof(struct udphdr);
+
+ if (dataoff >= skb->len)
+ return -EINVAL;
+ retc = skb_linearize(skb);
+ if (retc < 0)
+ return retc;
+ dptr = skb->data + dataoff;
+ datalen = skb->len - dataoff;
+
+ if (get_callid(dptr, dataoff, datalen, &matchoff, &matchlen))
+ return -EINVAL;
+
+ /* N.B: pe_data is only set on success,
+ * this allows fallback to the default persistence logic on failure
+ */
+ p->pe_data = kmemdup(dptr + matchoff, matchlen, GFP_ATOMIC);
+ if (!p->pe_data)
+ return -ENOMEM;
+
+ p->pe_data_len = matchlen;
+
+ return 0;
+}
+
+static bool ip_vs_sip_ct_match(const struct ip_vs_conn_param *p,
+ struct ip_vs_conn *ct)
+
+{
+ bool ret = false;
+
+ if (ct->af == p->af &&
+ ip_vs_addr_equal(p->af, p->caddr, &ct->caddr) &&
+ /* protocol should only be IPPROTO_IP if
+ * d_addr is a fwmark */
+ ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af,
+ p->vaddr, &ct->vaddr) &&
+ ct->vport == p->vport &&
+ ct->flags & IP_VS_CONN_F_TEMPLATE &&
+ ct->protocol == p->protocol &&
+ ct->pe_data && ct->pe_data_len == p->pe_data_len &&
+ !memcmp(ct->pe_data, p->pe_data, p->pe_data_len))
+ ret = true;
+
+ IP_VS_DBG_BUF(9, "SIP template match %s %s->%s:%d %s\n",
+ ip_vs_proto_name(p->protocol),
+ IP_VS_DEBUG_CALLID(p->pe_data, p->pe_data_len),
+ IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
+ ret ? "hit" : "not hit");
+
+ return ret;
+}
+
+static u32 ip_vs_sip_hashkey_raw(const struct ip_vs_conn_param *p,
+ u32 initval, bool inverse)
+{
+ return jhash(p->pe_data, p->pe_data_len, initval);
+}
+
+static int ip_vs_sip_show_pe_data(const struct ip_vs_conn *cp, char *buf)
+{
+ memcpy(buf, cp->pe_data, cp->pe_data_len);
+ return cp->pe_data_len;
+}
+
+static struct ip_vs_pe ip_vs_sip_pe =
+{
+ .name = "sip",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_sip_pe.n_list),
+ .fill_param = ip_vs_sip_fill_param,
+ .ct_match = ip_vs_sip_ct_match,
+ .hashkey_raw = ip_vs_sip_hashkey_raw,
+ .show_pe_data = ip_vs_sip_show_pe_data,
+};
+
+static int __init ip_vs_sip_init(void)
+{
+ return register_ip_vs_pe(&ip_vs_sip_pe);
+}
+
+static void __exit ip_vs_sip_cleanup(void)
+{
+ unregister_ip_vs_pe(&ip_vs_sip_pe);
+ synchronize_rcu();
+}
+
+module_init(ip_vs_sip_init);
+module_exit(ip_vs_sip_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c
new file mode 100644
index 000000000..939f7fbe9
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_proto.c
@@ -0,0 +1,409 @@
+/*
+ * ip_vs_proto.c: transport protocol load balancing support for IPVS
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Julian Anastasov <ja@ssi.bg>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/gfp.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <linux/stat.h>
+#include <linux/proc_fs.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ * IPVS protocols can only be registered/unregistered when the ipvs
+ * module is loaded/unloaded, so no lock is needed in accessing the
+ * ipvs protocol table.
+ */
+
+#define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */
+#define IP_VS_PROTO_HASH(proto) ((proto) & (IP_VS_PROTO_TAB_SIZE-1))
+
+static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE];
+
+
+/*
+ * register an ipvs protocol
+ */
+static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp)
+{
+ unsigned int hash = IP_VS_PROTO_HASH(pp->protocol);
+
+ pp->next = ip_vs_proto_table[hash];
+ ip_vs_proto_table[hash] = pp;
+
+ if (pp->init != NULL)
+ pp->init(pp);
+
+ return 0;
+}
+
+/*
+ * register an ipvs protocols netns related data
+ */
+static int
+register_ip_vs_proto_netns(struct net *net, struct ip_vs_protocol *pp)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ unsigned int hash = IP_VS_PROTO_HASH(pp->protocol);
+ struct ip_vs_proto_data *pd =
+ kzalloc(sizeof(struct ip_vs_proto_data), GFP_KERNEL);
+
+ if (!pd)
+ return -ENOMEM;
+
+	pd->pp = pp;	/* cache the protocol for fast access */
+ pd->next = ipvs->proto_data_table[hash];
+ ipvs->proto_data_table[hash] = pd;
+ atomic_set(&pd->appcnt, 0); /* Init app counter */
+
+ if (pp->init_netns != NULL) {
+ int ret = pp->init_netns(net, pd);
+ if (ret) {
+			/* unlink and free proto data */
+ ipvs->proto_data_table[hash] = pd->next;
+ kfree(pd);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * unregister an ipvs protocol
+ */
+static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp)
+{
+ struct ip_vs_protocol **pp_p;
+ unsigned int hash = IP_VS_PROTO_HASH(pp->protocol);
+
+ pp_p = &ip_vs_proto_table[hash];
+ for (; *pp_p; pp_p = &(*pp_p)->next) {
+ if (*pp_p == pp) {
+ *pp_p = pp->next;
+ if (pp->exit != NULL)
+ pp->exit(pp);
+ return 0;
+ }
+ }
+
+ return -ESRCH;
+}
+
+/*
+ * unregister an ipvs protocols netns data
+ */
+static int
+unregister_ip_vs_proto_netns(struct net *net, struct ip_vs_proto_data *pd)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_proto_data **pd_p;
+ unsigned int hash = IP_VS_PROTO_HASH(pd->pp->protocol);
+
+ pd_p = &ipvs->proto_data_table[hash];
+ for (; *pd_p; pd_p = &(*pd_p)->next) {
+ if (*pd_p == pd) {
+ *pd_p = pd->next;
+ if (pd->pp->exit_netns != NULL)
+ pd->pp->exit_netns(net, pd);
+ kfree(pd);
+ return 0;
+ }
+ }
+
+ return -ESRCH;
+}
+
+/*
+ * get ip_vs_protocol object by its proto.
+ */
+struct ip_vs_protocol *ip_vs_proto_get(unsigned short proto)
+{
+ struct ip_vs_protocol *pp;
+ unsigned int hash = IP_VS_PROTO_HASH(proto);
+
+ for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) {
+ if (pp->protocol == proto)
+ return pp;
+ }
+
+ return NULL;
+}
+EXPORT_SYMBOL(ip_vs_proto_get);
+
+/*
+ * get ip_vs_protocol object data by netns and proto
+ */
+static struct ip_vs_proto_data *
+__ipvs_proto_data_get(struct netns_ipvs *ipvs, unsigned short proto)
+{
+ struct ip_vs_proto_data *pd;
+ unsigned int hash = IP_VS_PROTO_HASH(proto);
+
+ for (pd = ipvs->proto_data_table[hash]; pd; pd = pd->next) {
+ if (pd->pp->protocol == proto)
+ return pd;
+ }
+
+ return NULL;
+}
+
+struct ip_vs_proto_data *
+ip_vs_proto_data_get(struct net *net, unsigned short proto)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ return __ipvs_proto_data_get(ipvs, proto);
+}
+EXPORT_SYMBOL(ip_vs_proto_data_get);
+
+/*
+ * Propagate event for state change to all protocols
+ */
+void ip_vs_protocol_timeout_change(struct netns_ipvs *ipvs, int flags)
+{
+ struct ip_vs_proto_data *pd;
+ int i;
+
+ for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
+ for (pd = ipvs->proto_data_table[i]; pd; pd = pd->next) {
+ if (pd->pp->timeout_change)
+ pd->pp->timeout_change(pd, flags);
+ }
+ }
+}
+
+
+int *
+ip_vs_create_timeout_table(int *table, int size)
+{
+ return kmemdup(table, size, GFP_KERNEL);
+}
+
+
+/*
+ * Set timeout value for state specified by name
+ */
+int
+ip_vs_set_state_timeout(int *table, int num, const char *const *names,
+ const char *name, int to)
+{
+ int i;
+
+ if (!table || !name || !to)
+ return -EINVAL;
+
+ for (i = 0; i < num; i++) {
+ if (strcmp(names[i], name))
+ continue;
+ table[i] = to * HZ;
+ return 0;
+ }
+ return -ENOENT;
+}
+
+
+const char *ip_vs_state_name(__u16 proto, int state)
+{
+ struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
+
+ if (pp == NULL || pp->state_name == NULL)
+ return (IPPROTO_IP == proto) ? "NONE" : "ERR!";
+ return pp->state_name(state);
+}
+
+
+static void
+ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp,
+ const struct sk_buff *skb,
+ int offset,
+ const char *msg)
+{
+ char buf[128];
+ struct iphdr _iph, *ih;
+
+ ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
+ if (ih == NULL)
+ sprintf(buf, "TRUNCATED");
+ else if (ih->frag_off & htons(IP_OFFSET))
+ sprintf(buf, "%pI4->%pI4 frag", &ih->saddr, &ih->daddr);
+ else {
+ __be16 _ports[2], *pptr;
+
+ pptr = skb_header_pointer(skb, offset + ih->ihl*4,
+ sizeof(_ports), _ports);
+ if (pptr == NULL)
+ sprintf(buf, "TRUNCATED %pI4->%pI4",
+ &ih->saddr, &ih->daddr);
+ else
+ sprintf(buf, "%pI4:%u->%pI4:%u",
+ &ih->saddr, ntohs(pptr[0]),
+ &ih->daddr, ntohs(pptr[1]));
+ }
+
+ pr_debug("%s: %s %s\n", msg, pp->name, buf);
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+static void
+ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp,
+ const struct sk_buff *skb,
+ int offset,
+ const char *msg)
+{
+ char buf[192];
+ struct ipv6hdr _iph, *ih;
+
+ ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
+ if (ih == NULL)
+ sprintf(buf, "TRUNCATED");
+ else if (ih->nexthdr == IPPROTO_FRAGMENT)
+ sprintf(buf, "%pI6c->%pI6c frag", &ih->saddr, &ih->daddr);
+ else {
+ __be16 _ports[2], *pptr;
+
+ pptr = skb_header_pointer(skb, offset + sizeof(struct ipv6hdr),
+ sizeof(_ports), _ports);
+ if (pptr == NULL)
+ sprintf(buf, "TRUNCATED %pI6c->%pI6c",
+ &ih->saddr, &ih->daddr);
+ else
+ sprintf(buf, "%pI6c:%u->%pI6c:%u",
+ &ih->saddr, ntohs(pptr[0]),
+ &ih->daddr, ntohs(pptr[1]));
+ }
+
+ pr_debug("%s: %s %s\n", msg, pp->name, buf);
+}
+#endif
+
+
+void
+ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp,
+ const struct sk_buff *skb,
+ int offset,
+ const char *msg)
+{
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg);
+ else
+#endif
+ ip_vs_tcpudp_debug_packet_v4(pp, skb, offset, msg);
+}
+
+/*
+ * per network name-space init
+ */
+int __net_init ip_vs_protocol_net_init(struct net *net)
+{
+ int i, ret;
+ static struct ip_vs_protocol *protos[] = {
+#ifdef CONFIG_IP_VS_PROTO_TCP
+ &ip_vs_protocol_tcp,
+#endif
+#ifdef CONFIG_IP_VS_PROTO_UDP
+ &ip_vs_protocol_udp,
+#endif
+#ifdef CONFIG_IP_VS_PROTO_SCTP
+ &ip_vs_protocol_sctp,
+#endif
+#ifdef CONFIG_IP_VS_PROTO_AH
+ &ip_vs_protocol_ah,
+#endif
+#ifdef CONFIG_IP_VS_PROTO_ESP
+ &ip_vs_protocol_esp,
+#endif
+ };
+
+ for (i = 0; i < ARRAY_SIZE(protos); i++) {
+ ret = register_ip_vs_proto_netns(net, protos[i]);
+ if (ret < 0)
+ goto cleanup;
+ }
+ return 0;
+
+cleanup:
+ ip_vs_protocol_net_cleanup(net);
+ return ret;
+}
+
+void __net_exit ip_vs_protocol_net_cleanup(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_proto_data *pd;
+ int i;
+
+ /* unregister all the ipvs proto data for this netns */
+ for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
+ while ((pd = ipvs->proto_data_table[i]) != NULL)
+ unregister_ip_vs_proto_netns(net, pd);
+ }
+}
+
+int __init ip_vs_protocol_init(void)
+{
+ char protocols[64];
+#define REGISTER_PROTOCOL(p) \
+ do { \
+ register_ip_vs_protocol(p); \
+ strcat(protocols, ", "); \
+ strcat(protocols, (p)->name); \
+ } while (0)
+
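+	/*
+	 * Each REGISTER_PROTOCOL() appends ", <name>"; the pr_info() below
+	 * prints from &protocols[2] to skip that leading separator, and
+	 * protocols[2] = '\0' keeps it an empty string when no protocol
+	 * is compiled in.
+	 */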
+ protocols[0] = '\0';
+ protocols[2] = '\0';
+#ifdef CONFIG_IP_VS_PROTO_TCP
+ REGISTER_PROTOCOL(&ip_vs_protocol_tcp);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_UDP
+ REGISTER_PROTOCOL(&ip_vs_protocol_udp);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_SCTP
+ REGISTER_PROTOCOL(&ip_vs_protocol_sctp);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_AH
+ REGISTER_PROTOCOL(&ip_vs_protocol_ah);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_ESP
+ REGISTER_PROTOCOL(&ip_vs_protocol_esp);
+#endif
+ pr_info("Registered protocols (%s)\n", &protocols[2]);
+
+ return 0;
+}
+
+
+void ip_vs_protocol_cleanup(void)
+{
+ struct ip_vs_protocol *pp;
+ int i;
+
+ /* unregister all the ipvs protocols */
+ for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
+ while ((pp = ip_vs_proto_table[i]) != NULL)
+ unregister_ip_vs_protocol(pp);
+ }
+}
diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
new file mode 100644
index 000000000..5de3dd312
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
@@ -0,0 +1,165 @@
+/*
+ * ip_vs_proto_ah_esp.c: AH/ESP IPSec load balancing support for IPVS
+ *
+ * Authors: Julian Anastasov <ja@ssi.bg>, February 2002
+ * Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation;
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <net/ip_vs.h>
+
+
+/* TODO:
+
+struct isakmp_hdr {
+ __u8 icookie[8];
+ __u8 rcookie[8];
+ __u8 np;
+ __u8 version;
+ __u8 xchgtype;
+ __u8 flags;
+ __u32 msgid;
+ __u32 length;
+};
+
+*/
+
+#define PORT_ISAKMP 500
+
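+/*
+ * AH/ESP packets are matched by mapping them onto the ISAKMP (UDP port
+ * 500) connection between the same pair of addresses, so IPsec traffic
+ * follows the real server that was chosen for the key-exchange traffic.
+ */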
+static void
+ah_esp_conn_fill_param_proto(struct net *net, int af,
+ const struct ip_vs_iphdr *iph, int inverse,
+ struct ip_vs_conn_param *p)
+{
+ if (likely(!inverse))
+ ip_vs_conn_fill_param(net, af, IPPROTO_UDP,
+ &iph->saddr, htons(PORT_ISAKMP),
+ &iph->daddr, htons(PORT_ISAKMP), p);
+ else
+ ip_vs_conn_fill_param(net, af, IPPROTO_UDP,
+ &iph->daddr, htons(PORT_ISAKMP),
+ &iph->saddr, htons(PORT_ISAKMP), p);
+}
+
+static struct ip_vs_conn *
+ah_esp_conn_in_get(int af, const struct sk_buff *skb,
+ const struct ip_vs_iphdr *iph,
+ int inverse)
+{
+ struct ip_vs_conn *cp;
+ struct ip_vs_conn_param p;
+ struct net *net = skb_net(skb);
+
+ ah_esp_conn_fill_param_proto(net, af, iph, inverse, &p);
+ cp = ip_vs_conn_in_get(&p);
+ if (!cp) {
+ /*
+ * We are not sure if the packet is from our
+ * service, so our conn_schedule hook should return NF_ACCEPT
+ */
+ IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for outin packet "
+ "%s%s %s->%s\n",
+ inverse ? "ICMP+" : "",
+ ip_vs_proto_get(iph->protocol)->name,
+ IP_VS_DBG_ADDR(af, &iph->saddr),
+ IP_VS_DBG_ADDR(af, &iph->daddr));
+ }
+
+ return cp;
+}
+
+
+static struct ip_vs_conn *
+ah_esp_conn_out_get(int af, const struct sk_buff *skb,
+ const struct ip_vs_iphdr *iph, int inverse)
+{
+ struct ip_vs_conn *cp;
+ struct ip_vs_conn_param p;
+ struct net *net = skb_net(skb);
+
+ ah_esp_conn_fill_param_proto(net, af, iph, inverse, &p);
+ cp = ip_vs_conn_out_get(&p);
+ if (!cp) {
+ IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet "
+ "%s%s %s->%s\n",
+ inverse ? "ICMP+" : "",
+ ip_vs_proto_get(iph->protocol)->name,
+ IP_VS_DBG_ADDR(af, &iph->saddr),
+ IP_VS_DBG_ADDR(af, &iph->daddr));
+ }
+
+ return cp;
+}
+
+
+static int
+ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
+ int *verdict, struct ip_vs_conn **cpp,
+ struct ip_vs_iphdr *iph)
+{
+	/*
+	 * AH/ESP is handled only as traffic related to an existing ISAKMP
+	 * connection; there is nothing to schedule here, so pass the
+	 * packet to the IP stack.
+	 */
+ *verdict = NF_ACCEPT;
+ return 0;
+}
+
+#ifdef CONFIG_IP_VS_PROTO_AH
+struct ip_vs_protocol ip_vs_protocol_ah = {
+ .name = "AH",
+ .protocol = IPPROTO_AH,
+ .num_states = 1,
+ .dont_defrag = 1,
+ .init = NULL,
+ .exit = NULL,
+ .conn_schedule = ah_esp_conn_schedule,
+ .conn_in_get = ah_esp_conn_in_get,
+ .conn_out_get = ah_esp_conn_out_get,
+ .snat_handler = NULL,
+ .dnat_handler = NULL,
+ .csum_check = NULL,
+ .state_transition = NULL,
+ .register_app = NULL,
+ .unregister_app = NULL,
+ .app_conn_bind = NULL,
+ .debug_packet = ip_vs_tcpudp_debug_packet,
+ .timeout_change = NULL, /* ISAKMP */
+};
+#endif
+
+#ifdef CONFIG_IP_VS_PROTO_ESP
+struct ip_vs_protocol ip_vs_protocol_esp = {
+ .name = "ESP",
+ .protocol = IPPROTO_ESP,
+ .num_states = 1,
+ .dont_defrag = 1,
+ .init = NULL,
+ .exit = NULL,
+ .conn_schedule = ah_esp_conn_schedule,
+ .conn_in_get = ah_esp_conn_in_get,
+ .conn_out_get = ah_esp_conn_out_get,
+ .snat_handler = NULL,
+ .dnat_handler = NULL,
+ .csum_check = NULL,
+ .state_transition = NULL,
+ .register_app = NULL,
+ .unregister_app = NULL,
+ .app_conn_bind = NULL,
+ .debug_packet = ip_vs_tcpudp_debug_packet,
+ .timeout_change = NULL, /* ISAKMP */
+};
+#endif
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
new file mode 100644
index 000000000..5b84c0b56
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -0,0 +1,591 @@
+#include <linux/kernel.h>
+#include <linux/ip.h>
+#include <linux/sctp.h>
+#include <net/ip.h>
+#include <net/ip6_checksum.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/sctp/checksum.h>
+#include <net/ip_vs.h>
+
+static int
+sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
+ int *verdict, struct ip_vs_conn **cpp,
+ struct ip_vs_iphdr *iph)
+{
+ struct net *net;
+ struct ip_vs_service *svc;
+ struct netns_ipvs *ipvs;
+ sctp_chunkhdr_t _schunkh, *sch;
+ sctp_sctphdr_t *sh, _sctph;
+
+ sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph);
+ if (sh == NULL) {
+ *verdict = NF_DROP;
+ return 0;
+ }
+
+ sch = skb_header_pointer(skb, iph->len + sizeof(sctp_sctphdr_t),
+ sizeof(_schunkh), &_schunkh);
+ if (sch == NULL) {
+ *verdict = NF_DROP;
+ return 0;
+ }
+
+ net = skb_net(skb);
+ ipvs = net_ipvs(net);
+ rcu_read_lock();
+ if ((sch->type == SCTP_CID_INIT || sysctl_sloppy_sctp(ipvs)) &&
+ (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol,
+ &iph->daddr, sh->dest))) {
+ int ignored;
+
+ if (ip_vs_todrop(ipvs)) {
+ /*
+ * It seems that we are very loaded.
+ * We have to drop this packet :(
+ */
+ rcu_read_unlock();
+ *verdict = NF_DROP;
+ return 0;
+ }
+ /*
+ * Let the virtual server select a real server for the
+ * incoming connection, and create a connection entry.
+ */
+ *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph);
+ if (!*cpp && ignored <= 0) {
+ if (!ignored)
+ *verdict = ip_vs_leave(svc, skb, pd, iph);
+ else
+ *verdict = NF_DROP;
+ rcu_read_unlock();
+ return 0;
+ }
+ }
+ rcu_read_unlock();
+ /* NF_ACCEPT */
+ return 1;
+}
+
+static void sctp_nat_csum(struct sk_buff *skb, sctp_sctphdr_t *sctph,
+ unsigned int sctphoff)
+{
+ sctph->checksum = sctp_compute_cksum(skb, sctphoff);
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+}
+
+static int
+sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
+{
+ sctp_sctphdr_t *sctph;
+ unsigned int sctphoff = iph->len;
+ bool payload_csum = false;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6 && iph->fragoffs)
+ return 1;
+#endif
+
+ /* csum_check requires unshared skb */
+ if (!skb_make_writable(skb, sctphoff + sizeof(*sctph)))
+ return 0;
+
+ if (unlikely(cp->app != NULL)) {
+ int ret;
+
+ /* Some checks before mangling */
+ if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
+ return 0;
+
+ /* Call application helper if needed */
+ ret = ip_vs_app_pkt_out(cp, skb);
+ if (ret == 0)
+ return 0;
+ /* ret=2: csum update is needed after payload mangling */
+ if (ret == 2)
+ payload_csum = true;
+ }
+
+ sctph = (void *) skb_network_header(skb) + sctphoff;
+
+ /* Only update csum if we really have to */
+ if (sctph->source != cp->vport || payload_csum ||
+ skb->ip_summed == CHECKSUM_PARTIAL) {
+ sctph->source = cp->vport;
+ sctp_nat_csum(skb, sctph, sctphoff);
+ } else {
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
+
+ return 1;
+}
+
+static int
+sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
+{
+ sctp_sctphdr_t *sctph;
+ unsigned int sctphoff = iph->len;
+ bool payload_csum = false;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6 && iph->fragoffs)
+ return 1;
+#endif
+
+ /* csum_check requires unshared skb */
+ if (!skb_make_writable(skb, sctphoff + sizeof(*sctph)))
+ return 0;
+
+ if (unlikely(cp->app != NULL)) {
+ int ret;
+
+ /* Some checks before mangling */
+ if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
+ return 0;
+
+ /* Call application helper if needed */
+ ret = ip_vs_app_pkt_in(cp, skb);
+ if (ret == 0)
+ return 0;
+ /* ret=2: csum update is needed after payload mangling */
+ if (ret == 2)
+ payload_csum = true;
+ }
+
+ sctph = (void *) skb_network_header(skb) + sctphoff;
+
+ /* Only update csum if we really have to */
+ if (sctph->dest != cp->dport || payload_csum ||
+ (skb->ip_summed == CHECKSUM_PARTIAL &&
+ !(skb_dst(skb)->dev->features & NETIF_F_SCTP_CSUM))) {
+ sctph->dest = cp->dport;
+ sctp_nat_csum(skb, sctph, sctphoff);
+ } else if (skb->ip_summed != CHECKSUM_PARTIAL) {
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
+
+ return 1;
+}
+
+static int
+sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
+{
+ unsigned int sctphoff;
+ struct sctphdr *sh, _sctph;
+ __le32 cmp, val;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ sctphoff = sizeof(struct ipv6hdr);
+ else
+#endif
+ sctphoff = ip_hdrlen(skb);
+
+ sh = skb_header_pointer(skb, sctphoff, sizeof(_sctph), &_sctph);
+ if (sh == NULL)
+ return 0;
+
+ cmp = sh->checksum;
+ val = sctp_compute_cksum(skb, sctphoff);
+
+ if (val != cmp) {
+ /* CRC failure, dump it. */
+ IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
+ "Failed checksum for");
+ return 0;
+ }
+ return 1;
+}
+
+enum ipvs_sctp_event_t {
+ IP_VS_SCTP_DATA = 0, /* DATA, SACK, HEARTBEATs */
+ IP_VS_SCTP_INIT,
+ IP_VS_SCTP_INIT_ACK,
+ IP_VS_SCTP_COOKIE_ECHO,
+ IP_VS_SCTP_COOKIE_ACK,
+ IP_VS_SCTP_SHUTDOWN,
+ IP_VS_SCTP_SHUTDOWN_ACK,
+ IP_VS_SCTP_SHUTDOWN_COMPLETE,
+ IP_VS_SCTP_ERROR,
+ IP_VS_SCTP_ABORT,
+ IP_VS_SCTP_EVENT_LAST
+};
+
+/* RFC 2960, 3.2 Chunk Field Descriptions */
+static __u8 sctp_events[] = {
+ [SCTP_CID_DATA] = IP_VS_SCTP_DATA,
+ [SCTP_CID_INIT] = IP_VS_SCTP_INIT,
+ [SCTP_CID_INIT_ACK] = IP_VS_SCTP_INIT_ACK,
+ [SCTP_CID_SACK] = IP_VS_SCTP_DATA,
+ [SCTP_CID_HEARTBEAT] = IP_VS_SCTP_DATA,
+ [SCTP_CID_HEARTBEAT_ACK] = IP_VS_SCTP_DATA,
+ [SCTP_CID_ABORT] = IP_VS_SCTP_ABORT,
+ [SCTP_CID_SHUTDOWN] = IP_VS_SCTP_SHUTDOWN,
+ [SCTP_CID_SHUTDOWN_ACK] = IP_VS_SCTP_SHUTDOWN_ACK,
+ [SCTP_CID_ERROR] = IP_VS_SCTP_ERROR,
+ [SCTP_CID_COOKIE_ECHO] = IP_VS_SCTP_COOKIE_ECHO,
+ [SCTP_CID_COOKIE_ACK] = IP_VS_SCTP_COOKIE_ACK,
+ [SCTP_CID_ECN_ECNE] = IP_VS_SCTP_DATA,
+ [SCTP_CID_ECN_CWR] = IP_VS_SCTP_DATA,
+ [SCTP_CID_SHUTDOWN_COMPLETE] = IP_VS_SCTP_SHUTDOWN_COMPLETE,
+};
+
+/* SCTP States:
+ * See RFC 2960, 4. SCTP Association State Diagram
+ *
+ * New states (not in diagram):
+ * - INIT1 state: use shorter timeout for dropped INIT packets
+ * - REJECTED state: use shorter timeout if INIT is rejected with ABORT
+ * - INIT, COOKIE_SENT, COOKIE_REPLIED, COOKIE states: for better debugging
+ *
+ * The states are as seen by the real server. In the diagram, INIT1, INIT,
+ * COOKIE_SENT and COOKIE_REPLIED processing happens in CLOSED state.
+ *
+ * States as per packets from client (C) and server (S):
+ *
+ * Setup of client connection:
+ * IP_VS_SCTP_S_INIT1: First C:INIT sent, wait for S:INIT-ACK
+ * IP_VS_SCTP_S_INIT: Next C:INIT sent, wait for S:INIT-ACK
+ * IP_VS_SCTP_S_COOKIE_SENT: S:INIT-ACK sent, wait for C:COOKIE-ECHO
+ * IP_VS_SCTP_S_COOKIE_REPLIED: C:COOKIE-ECHO sent, wait for S:COOKIE-ACK
+ *
+ * Setup of server connection:
+ * IP_VS_SCTP_S_COOKIE_WAIT: S:INIT sent, wait for C:INIT-ACK
+ * IP_VS_SCTP_S_COOKIE: C:INIT-ACK sent, wait for S:COOKIE-ECHO
+ * IP_VS_SCTP_S_COOKIE_ECHOED: S:COOKIE-ECHO sent, wait for C:COOKIE-ACK
+ */
+
+#define sNO IP_VS_SCTP_S_NONE
+#define sI1 IP_VS_SCTP_S_INIT1
+#define sIN IP_VS_SCTP_S_INIT
+#define sCS IP_VS_SCTP_S_COOKIE_SENT
+#define sCR IP_VS_SCTP_S_COOKIE_REPLIED
+#define sCW IP_VS_SCTP_S_COOKIE_WAIT
+#define sCO IP_VS_SCTP_S_COOKIE
+#define sCE IP_VS_SCTP_S_COOKIE_ECHOED
+#define sES IP_VS_SCTP_S_ESTABLISHED
+#define sSS IP_VS_SCTP_S_SHUTDOWN_SENT
+#define sSR IP_VS_SCTP_S_SHUTDOWN_RECEIVED
+#define sSA IP_VS_SCTP_S_SHUTDOWN_ACK_SENT
+#define sRJ IP_VS_SCTP_S_REJECTED
+#define sCL IP_VS_SCTP_S_CLOSED
+
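+/* Rows (events): d=DATA, i=INIT, i_a=INIT-ACK, c_e=COOKIE-ECHO,
+ * c_a=COOKIE-ACK, s=SHUTDOWN, s_a=SHUTDOWN-ACK, s_c=SHUTDOWN-COMPLETE,
+ * err=ERROR, ab=ABORT; columns are the current connection state.
+ */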
+static const __u8 sctp_states
+ [IP_VS_DIR_LAST][IP_VS_SCTP_EVENT_LAST][IP_VS_SCTP_S_LAST] = {
+ { /* INPUT */
+/* sNO, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL*/
+/* d */{sES, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* i */{sI1, sIN, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sIN, sIN},
+/* i_a */{sCW, sCW, sCW, sCS, sCR, sCO, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* c_e */{sCR, sIN, sIN, sCR, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* c_a */{sES, sI1, sIN, sCS, sCR, sCW, sCO, sES, sES, sSS, sSR, sSA, sRJ, sCL},
+/* s */{sSR, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sSR, sSS, sSR, sSA, sRJ, sCL},
+/* s_a */{sCL, sIN, sIN, sCS, sCR, sCW, sCO, sCE, sES, sCL, sSR, sCL, sRJ, sCL},
+/* s_c */{sCL, sCL, sCL, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sCL, sRJ, sCL},
+/* err */{sCL, sI1, sIN, sCS, sCR, sCW, sCO, sCL, sES, sSS, sSR, sSA, sRJ, sCL},
+/* ab */{sCL, sCL, sCL, sCL, sCL, sRJ, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
+ },
+ { /* OUTPUT */
+/* sNO, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL*/
+/* d */{sES, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* i */{sCW, sCW, sCW, sCW, sCW, sCW, sCW, sCW, sES, sCW, sCW, sCW, sCW, sCW},
+/* i_a */{sCS, sCS, sCS, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* c_e */{sCE, sCE, sCE, sCE, sCE, sCE, sCE, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* c_a */{sES, sES, sES, sES, sES, sES, sES, sES, sES, sSS, sSR, sSA, sRJ, sCL},
+/* s */{sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSS, sSR, sSA, sRJ, sCL},
+/* s_a */{sSA, sSA, sSA, sSA, sSA, sCW, sCO, sCE, sES, sSA, sSA, sSA, sRJ, sCL},
+/* s_c */{sCL, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* err */{sCL, sCL, sCL, sCL, sCL, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* ab */{sCL, sRJ, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
+ },
+ { /* INPUT-ONLY */
+/* sNO, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL*/
+/* d */{sES, sI1, sIN, sCS, sCR, sES, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* i */{sI1, sIN, sIN, sIN, sIN, sIN, sCO, sCE, sES, sSS, sSR, sSA, sIN, sIN},
+/* i_a */{sCE, sCE, sCE, sCE, sCE, sCE, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* c_e */{sES, sES, sES, sES, sES, sES, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* c_a */{sES, sI1, sIN, sES, sES, sCW, sES, sES, sES, sSS, sSR, sSA, sRJ, sCL},
+/* s */{sSR, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sSR, sSS, sSR, sSA, sRJ, sCL},
+/* s_a */{sCL, sIN, sIN, sCS, sCR, sCW, sCO, sCE, sCL, sCL, sSR, sCL, sRJ, sCL},
+/* s_c */{sCL, sCL, sCL, sCL, sCL, sCW, sCO, sCE, sES, sSS, sCL, sCL, sRJ, sCL},
+/* err */{sCL, sI1, sIN, sCS, sCR, sCW, sCO, sCE, sES, sSS, sSR, sSA, sRJ, sCL},
+/* ab */{sCL, sCL, sCL, sCL, sCL, sRJ, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
+ },
+};
+
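+/* RFC 4960 suggests a default RTO.Max of 60 seconds; allow one extra second */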
+#define IP_VS_SCTP_MAX_RTO ((60 + 1) * HZ)
+
+/* Timeout table[state] */
+static const int sctp_timeouts[IP_VS_SCTP_S_LAST + 1] = {
+ [IP_VS_SCTP_S_NONE] = 2 * HZ,
+ [IP_VS_SCTP_S_INIT1] = (0 + 3 + 1) * HZ,
+ [IP_VS_SCTP_S_INIT] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_COOKIE_SENT] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_COOKIE_REPLIED] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_COOKIE_WAIT] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_COOKIE] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_COOKIE_ECHOED] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_ESTABLISHED] = 15 * 60 * HZ,
+ [IP_VS_SCTP_S_SHUTDOWN_SENT] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_SHUTDOWN_RECEIVED] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_SHUTDOWN_ACK_SENT] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_REJECTED] = (0 + 3 + 1) * HZ,
+ [IP_VS_SCTP_S_CLOSED] = IP_VS_SCTP_MAX_RTO,
+ [IP_VS_SCTP_S_LAST] = 2 * HZ,
+};
+
+static const char *sctp_state_name_table[IP_VS_SCTP_S_LAST + 1] = {
+ [IP_VS_SCTP_S_NONE] = "NONE",
+ [IP_VS_SCTP_S_INIT1] = "INIT1",
+ [IP_VS_SCTP_S_INIT] = "INIT",
+ [IP_VS_SCTP_S_COOKIE_SENT] = "C-SENT",
+ [IP_VS_SCTP_S_COOKIE_REPLIED] = "C-REPLIED",
+ [IP_VS_SCTP_S_COOKIE_WAIT] = "C-WAIT",
+ [IP_VS_SCTP_S_COOKIE] = "COOKIE",
+ [IP_VS_SCTP_S_COOKIE_ECHOED] = "C-ECHOED",
+ [IP_VS_SCTP_S_ESTABLISHED] = "ESTABLISHED",
+ [IP_VS_SCTP_S_SHUTDOWN_SENT] = "S-SENT",
+ [IP_VS_SCTP_S_SHUTDOWN_RECEIVED] = "S-RECEIVED",
+ [IP_VS_SCTP_S_SHUTDOWN_ACK_SENT] = "S-ACK-SENT",
+ [IP_VS_SCTP_S_REJECTED] = "REJECTED",
+ [IP_VS_SCTP_S_CLOSED] = "CLOSED",
+ [IP_VS_SCTP_S_LAST] = "BUG!",
+};
+
+
+static const char *sctp_state_name(int state)
+{
+ if (state >= IP_VS_SCTP_S_LAST)
+ return "ERR!";
+ if (sctp_state_name_table[state])
+ return sctp_state_name_table[state];
+ return "?";
+}
+
+static inline void
+set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
+ int direction, const struct sk_buff *skb)
+{
+ sctp_chunkhdr_t _sctpch, *sch;
+ unsigned char chunk_type;
+ int event, next_state;
+ int ihl, cofs;
+
+#ifdef CONFIG_IP_VS_IPV6
+ ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr);
+#else
+ ihl = ip_hdrlen(skb);
+#endif
+
+ cofs = ihl + sizeof(sctp_sctphdr_t);
+ sch = skb_header_pointer(skb, cofs, sizeof(_sctpch), &_sctpch);
+ if (sch == NULL)
+ return;
+
+ chunk_type = sch->type;
+ /*
+ * Section 3: Multiple chunks can be bundled into one SCTP packet
+ * up to the MTU size, except for the INIT, INIT ACK, and
+ * SHUTDOWN COMPLETE chunks. These chunks MUST NOT be bundled with
+ * any other chunk in a packet.
+ *
+ * Section 3.3.7: DATA chunks MUST NOT be bundled with ABORT. Control
+ * chunks (except for INIT, INIT ACK, and SHUTDOWN COMPLETE) MAY be
+ * bundled with an ABORT, but they MUST be placed before the ABORT
+ * in the SCTP packet or they will be ignored by the receiver.
+ */
+ if ((sch->type == SCTP_CID_COOKIE_ECHO) ||
+ (sch->type == SCTP_CID_COOKIE_ACK)) {
+ int clen = ntohs(sch->length);
+
+ if (clen >= sizeof(sctp_chunkhdr_t)) {
+ sch = skb_header_pointer(skb, cofs + ALIGN(clen, 4),
+ sizeof(_sctpch), &_sctpch);
+ if (sch && sch->type == SCTP_CID_ABORT)
+ chunk_type = sch->type;
+ }
+ }
+
+ event = (chunk_type < sizeof(sctp_events)) ?
+ sctp_events[chunk_type] : IP_VS_SCTP_DATA;
+
+ /* Update direction to INPUT_ONLY if necessary
+ * or delete NO_OUTPUT flag if output packet detected
+ */
+ if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
+ if (direction == IP_VS_DIR_OUTPUT)
+ cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
+ else
+ direction = IP_VS_DIR_INPUT_ONLY;
+ }
+
+ next_state = sctp_states[direction][event][cp->state];
+
+ if (next_state != cp->state) {
+ struct ip_vs_dest *dest = cp->dest;
+
+ IP_VS_DBG_BUF(8, "%s %s %s:%d->"
+ "%s:%d state: %s->%s conn->refcnt:%d\n",
+ pd->pp->name,
+ ((direction == IP_VS_DIR_OUTPUT) ?
+ "output " : "input "),
+ IP_VS_DBG_ADDR(cp->daf, &cp->daddr),
+ ntohs(cp->dport),
+ IP_VS_DBG_ADDR(cp->af, &cp->caddr),
+ ntohs(cp->cport),
+ sctp_state_name(cp->state),
+ sctp_state_name(next_state),
+ atomic_read(&cp->refcnt));
+ if (dest) {
+ if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
+ (next_state != IP_VS_SCTP_S_ESTABLISHED)) {
+ atomic_dec(&dest->activeconns);
+ atomic_inc(&dest->inactconns);
+ cp->flags |= IP_VS_CONN_F_INACTIVE;
+ } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
+ (next_state == IP_VS_SCTP_S_ESTABLISHED)) {
+ atomic_inc(&dest->activeconns);
+ atomic_dec(&dest->inactconns);
+ cp->flags &= ~IP_VS_CONN_F_INACTIVE;
+ }
+ }
+ }
+ if (likely(pd))
+ cp->timeout = pd->timeout_table[cp->state = next_state];
+	else	/* no proto data, fall back to the compile-time defaults */
+ cp->timeout = sctp_timeouts[cp->state = next_state];
+}
+
+static void
+sctp_state_transition(struct ip_vs_conn *cp, int direction,
+ const struct sk_buff *skb, struct ip_vs_proto_data *pd)
+{
+ spin_lock_bh(&cp->lock);
+ set_sctp_state(pd, cp, direction, skb);
+ spin_unlock_bh(&cp->lock);
+}
+
+static inline __u16 sctp_app_hashkey(__be16 port)
+{
+ return (((__force u16)port >> SCTP_APP_TAB_BITS) ^ (__force u16)port)
+ & SCTP_APP_TAB_MASK;
+}
+
+static int sctp_register_app(struct net *net, struct ip_vs_app *inc)
+{
+ struct ip_vs_app *i;
+ __u16 hash;
+ __be16 port = inc->port;
+ int ret = 0;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP);
+
+ hash = sctp_app_hashkey(port);
+
+ list_for_each_entry(i, &ipvs->sctp_apps[hash], p_list) {
+ if (i->port == port) {
+ ret = -EEXIST;
+ goto out;
+ }
+ }
+ list_add_rcu(&inc->p_list, &ipvs->sctp_apps[hash]);
+ atomic_inc(&pd->appcnt);
+out:
+
+ return ret;
+}
+
+static void sctp_unregister_app(struct net *net, struct ip_vs_app *inc)
+{
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP);
+
+ atomic_dec(&pd->appcnt);
+ list_del_rcu(&inc->p_list);
+}
+
+static int sctp_app_conn_bind(struct ip_vs_conn *cp)
+{
+ struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
+ int hash;
+ struct ip_vs_app *inc;
+ int result = 0;
+
+ /* Default binding: bind app only for NAT */
+ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
+ return 0;
+ /* Lookup application incarnations and bind the right one */
+ hash = sctp_app_hashkey(cp->vport);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(inc, &ipvs->sctp_apps[hash], p_list) {
+ if (inc->port == cp->vport) {
+ if (unlikely(!ip_vs_app_inc_get(inc)))
+ break;
+ rcu_read_unlock();
+
+ IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->"
+ "%s:%u to app %s on port %u\n",
+ __func__,
+ IP_VS_DBG_ADDR(cp->af, &cp->caddr),
+ ntohs(cp->cport),
+ IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
+ ntohs(cp->vport),
+ inc->name, ntohs(inc->port));
+ cp->app = inc;
+ if (inc->init_conn)
+ result = inc->init_conn(inc, cp);
+ goto out;
+ }
+ }
+ rcu_read_unlock();
+out:
+ return result;
+}
+
+/* ---------------------------------------------
+ *   timeouts are per-netns now.
+ * ---------------------------------------------
+ */
+static int __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ ip_vs_init_hash_table(ipvs->sctp_apps, SCTP_APP_TAB_SIZE);
+ pd->timeout_table = ip_vs_create_timeout_table((int *)sctp_timeouts,
+ sizeof(sctp_timeouts));
+ if (!pd->timeout_table)
+ return -ENOMEM;
+ return 0;
+}
+
+static void __ip_vs_sctp_exit(struct net *net, struct ip_vs_proto_data *pd)
+{
+ kfree(pd->timeout_table);
+}
+
+struct ip_vs_protocol ip_vs_protocol_sctp = {
+ .name = "SCTP",
+ .protocol = IPPROTO_SCTP,
+ .num_states = IP_VS_SCTP_S_LAST,
+ .dont_defrag = 0,
+ .init = NULL,
+ .exit = NULL,
+ .init_netns = __ip_vs_sctp_init,
+ .exit_netns = __ip_vs_sctp_exit,
+ .register_app = sctp_register_app,
+ .unregister_app = sctp_unregister_app,
+ .conn_schedule = sctp_conn_schedule,
+ .conn_in_get = ip_vs_conn_in_get_proto,
+ .conn_out_get = ip_vs_conn_out_get_proto,
+ .snat_handler = sctp_snat_handler,
+ .dnat_handler = sctp_dnat_handler,
+ .csum_check = sctp_csum_check,
+ .state_name = sctp_state_name,
+ .state_transition = sctp_state_transition,
+ .app_conn_bind = sctp_app_conn_bind,
+ .debug_packet = ip_vs_tcpudp_debug_packet,
+ .timeout_change = NULL,
+};
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
new file mode 100644
index 000000000..8e92beb0c
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -0,0 +1,712 @@
+/*
+ * ip_vs_proto_tcp.c: TCP load balancing support for IPVS
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Julian Anastasov <ja@ssi.bg>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com>
+ *
+ * Network name space (netns) aware.
+ * Global data moved to netns, i.e. struct netns_ipvs.
+ * The tcp_timeouts table has a copy per netns, kept in the
+ * per-protocol ip_vs_proto_data and handled per netns.
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/ip.h>
+#include <linux/tcp.h> /* for tcphdr */
+#include <net/ip.h>
+#include <net/tcp.h> /* for csum_tcpudp_magic */
+#include <net/ip6_checksum.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <net/ip_vs.h>
+
+static int
+tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
+ int *verdict, struct ip_vs_conn **cpp,
+ struct ip_vs_iphdr *iph)
+{
+ struct net *net;
+ struct ip_vs_service *svc;
+ struct tcphdr _tcph, *th;
+ struct netns_ipvs *ipvs;
+
+ th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
+ if (th == NULL) {
+ *verdict = NF_DROP;
+ return 0;
+ }
+ net = skb_net(skb);
+ ipvs = net_ipvs(net);
+ /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
+ rcu_read_lock();
+ if ((th->syn || sysctl_sloppy_tcp(ipvs)) && !th->rst &&
+ (svc = ip_vs_service_find(net, af, skb->mark, iph->protocol,
+ &iph->daddr, th->dest))) {
+ int ignored;
+
+ if (ip_vs_todrop(ipvs)) {
+ /*
+ * It seems that we are very loaded.
+ * We have to drop this packet :(
+ */
+ rcu_read_unlock();
+ *verdict = NF_DROP;
+ return 0;
+ }
+
+ /*
+ * Let the virtual server select a real server for the
+ * incoming connection, and create a connection entry.
+ */
+ *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph);
+ if (!*cpp && ignored <= 0) {
+ if (!ignored)
+ *verdict = ip_vs_leave(svc, skb, pd, iph);
+ else
+ *verdict = NF_DROP;
+ rcu_read_unlock();
+ return 0;
+ }
+ }
+ rcu_read_unlock();
+ /* NF_ACCEPT */
+ return 1;
+}
+
+
+static inline void
+tcp_fast_csum_update(int af, struct tcphdr *tcph,
+ const union nf_inet_addr *oldip,
+ const union nf_inet_addr *newip,
+ __be16 oldport, __be16 newport)
+{
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ tcph->check =
+ csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
+ ip_vs_check_diff2(oldport, newport,
+ ~csum_unfold(tcph->check))));
+ else
+#endif
+ tcph->check =
+ csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
+ ip_vs_check_diff2(oldport, newport,
+ ~csum_unfold(tcph->check))));
+}
+
+
+static inline void
+tcp_partial_csum_update(int af, struct tcphdr *tcph,
+ const union nf_inet_addr *oldip,
+ const union nf_inet_addr *newip,
+ __be16 oldlen, __be16 newlen)
+{
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ tcph->check =
+ ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
+ ip_vs_check_diff2(oldlen, newlen,
+ csum_unfold(tcph->check))));
+ else
+#endif
+ tcph->check =
+ ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
+ ip_vs_check_diff2(oldlen, newlen,
+ csum_unfold(tcph->check))));
+}
+
+
+static int
+tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
+{
+ struct tcphdr *tcph;
+ unsigned int tcphoff = iph->len;
+ int oldlen;
+ int payload_csum = 0;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6 && iph->fragoffs)
+ return 1;
+#endif
+ oldlen = skb->len - tcphoff;
+
+ /* csum_check requires unshared skb */
+ if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
+ return 0;
+
+ if (unlikely(cp->app != NULL)) {
+ int ret;
+
+ /* Some checks before mangling */
+ if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
+ return 0;
+
+ /* Call application helper if needed */
+ if (!(ret = ip_vs_app_pkt_out(cp, skb)))
+ return 0;
+ /* ret=2: csum update is needed after payload mangling */
+ if (ret == 1)
+ oldlen = skb->len - tcphoff;
+ else
+ payload_csum = 1;
+ }
+
+ tcph = (void *)skb_network_header(skb) + tcphoff;
+ tcph->source = cp->vport;
+
+ /* Adjust TCP checksums */
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
+ htons(oldlen),
+ htons(skb->len - tcphoff));
+ } else if (!payload_csum) {
+ /* Only port and addr are changed, do fast csum update */
+ tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
+ cp->dport, cp->vport);
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ skb->ip_summed = (cp->app && pp->csum_check) ?
+ CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
+ } else {
+ /* full checksum calculation */
+ tcph->check = 0;
+ skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6)
+ tcph->check = csum_ipv6_magic(&cp->vaddr.in6,
+ &cp->caddr.in6,
+ skb->len - tcphoff,
+ cp->protocol, skb->csum);
+ else
+#endif
+ tcph->check = csum_tcpudp_magic(cp->vaddr.ip,
+ cp->caddr.ip,
+ skb->len - tcphoff,
+ cp->protocol,
+ skb->csum);
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+ IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
+ pp->name, tcph->check,
+ (char*)&(tcph->check) - (char*)tcph);
+ }
+ return 1;
+}
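
The handler above takes one of two paths: a "fast" incremental checksum update when only addresses and ports change, or a full recomputation after the application helper mangles the payload. The following stand-alone user-space sketch (not part of the kernel sources; every name and value is made up for the demo) shows the incremental update rule from RFC 1624 that the ip_vs_check_diff*()/csum_fold() calls implement:

#include <stdint.h>
#include <stdio.h>

/* Plain 16-bit ones'-complement checksum over 16-bit words. */
static uint16_t csum16(const uint16_t *p, unsigned int words)
{
	uint32_t sum = 0;

	while (words--)
		sum += *p++;
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

/* RFC 1624 eqn. 3: new_check = ~(~old_check + ~old_field + new_field) */
static uint16_t csum_update16(uint16_t check, uint16_t old, uint16_t new_val)
{
	uint32_t sum = (uint16_t)~check;

	sum += (uint16_t)~old + new_val;
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	uint16_t pkt[4] = { 0x1234, 0x5678, 0x0050, 0x0000 }; /* toy header */
	uint16_t check = csum16(pkt, 4);

	pkt[2] = 0x1f90;			/* rewrite one "port" field */
	check = csum_update16(check, 0x0050, 0x1f90);

	/* The incremental result matches a full recomputation. */
	printf("incremental=%#x full=%#x\n", check, csum16(pkt, 4));
	return 0;
}

The point of the fast path is that rewriting a 16-bit field needs only the old and new values of that field, not another pass over the payload; the full recomputation is reserved for the payload-mangling case.
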
+
+
+static int
+tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
+{
+ struct tcphdr *tcph;
+ unsigned int tcphoff = iph->len;
+ int oldlen;
+ int payload_csum = 0;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6 && iph->fragoffs)
+ return 1;
+#endif
+ oldlen = skb->len - tcphoff;
+
+ /* csum_check requires unshared skb */
+ if (!skb_make_writable(skb, tcphoff+sizeof(*tcph)))
+ return 0;
+
+ if (unlikely(cp->app != NULL)) {
+ int ret;
+
+ /* Some checks before mangling */
+ if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
+ return 0;
+
+ /*
+ * Attempt ip_vs_app call.
+ * It will fix ip_vs_conn and iph ack_seq stuff
+ */
+ if (!(ret = ip_vs_app_pkt_in(cp, skb)))
+ return 0;
+ /* ret=2: csum update is needed after payload mangling */
+ if (ret == 1)
+ oldlen = skb->len - tcphoff;
+ else
+ payload_csum = 1;
+ }
+
+ tcph = (void *)skb_network_header(skb) + tcphoff;
+ tcph->dest = cp->dport;
+
+ /*
+ * Adjust TCP checksums
+ */
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
+ htons(oldlen),
+ htons(skb->len - tcphoff));
+ } else if (!payload_csum) {
+ /* Only port and addr are changed, do fast csum update */
+ tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
+ cp->vport, cp->dport);
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ skb->ip_summed = (cp->app && pp->csum_check) ?
+ CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
+ } else {
+ /* full checksum calculation */
+ tcph->check = 0;
+ skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6)
+ tcph->check = csum_ipv6_magic(&cp->caddr.in6,
+ &cp->daddr.in6,
+ skb->len - tcphoff,
+ cp->protocol, skb->csum);
+ else
+#endif
+ tcph->check = csum_tcpudp_magic(cp->caddr.ip,
+ cp->daddr.ip,
+ skb->len - tcphoff,
+ cp->protocol,
+ skb->csum);
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
+ return 1;
+}
+
+
+static int
+tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
+{
+ unsigned int tcphoff;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ tcphoff = sizeof(struct ipv6hdr);
+ else
+#endif
+ tcphoff = ip_hdrlen(skb);
+
+ switch (skb->ip_summed) {
+ case CHECKSUM_NONE:
+ skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
+ /* fall through */
+ case CHECKSUM_COMPLETE:
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6) {
+ if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
+ &ipv6_hdr(skb)->daddr,
+ skb->len - tcphoff,
+ ipv6_hdr(skb)->nexthdr,
+ skb->csum)) {
+ IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
+ "Failed checksum for");
+ return 0;
+ }
+ } else
+#endif
+ if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
+ ip_hdr(skb)->daddr,
+ skb->len - tcphoff,
+ ip_hdr(skb)->protocol,
+ skb->csum)) {
+ IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
+ "Failed checksum for");
+ return 0;
+ }
+ break;
+ default:
+ /* No need to checksum. */
+ break;
+ }
+
+ return 1;
+}
+
+
+#define TCP_DIR_INPUT 0
+#define TCP_DIR_OUTPUT 4
+#define TCP_DIR_INPUT_ONLY 8
+
+static const int tcp_state_off[IP_VS_DIR_LAST] = {
+ [IP_VS_DIR_INPUT] = TCP_DIR_INPUT,
+ [IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT,
+ [IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY,
+};
+
+/*
+ * Timeout table[state]
+ */
+static const int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
+ [IP_VS_TCP_S_NONE] = 2*HZ,
+ [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ,
+ [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ,
+ [IP_VS_TCP_S_SYN_RECV] = 1*60*HZ,
+ [IP_VS_TCP_S_FIN_WAIT] = 2*60*HZ,
+ [IP_VS_TCP_S_TIME_WAIT] = 2*60*HZ,
+ [IP_VS_TCP_S_CLOSE] = 10*HZ,
+ [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ,
+ [IP_VS_TCP_S_LAST_ACK] = 30*HZ,
+ [IP_VS_TCP_S_LISTEN] = 2*60*HZ,
+ [IP_VS_TCP_S_SYNACK] = 120*HZ,
+ [IP_VS_TCP_S_LAST] = 2*HZ,
+};
+
+static const char *const tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
+ [IP_VS_TCP_S_NONE] = "NONE",
+ [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED",
+ [IP_VS_TCP_S_SYN_SENT] = "SYN_SENT",
+ [IP_VS_TCP_S_SYN_RECV] = "SYN_RECV",
+ [IP_VS_TCP_S_FIN_WAIT] = "FIN_WAIT",
+ [IP_VS_TCP_S_TIME_WAIT] = "TIME_WAIT",
+ [IP_VS_TCP_S_CLOSE] = "CLOSE",
+ [IP_VS_TCP_S_CLOSE_WAIT] = "CLOSE_WAIT",
+ [IP_VS_TCP_S_LAST_ACK] = "LAST_ACK",
+ [IP_VS_TCP_S_LISTEN] = "LISTEN",
+ [IP_VS_TCP_S_SYNACK] = "SYNACK",
+ [IP_VS_TCP_S_LAST] = "BUG!",
+};
+
+#define sNO IP_VS_TCP_S_NONE
+#define sES IP_VS_TCP_S_ESTABLISHED
+#define sSS IP_VS_TCP_S_SYN_SENT
+#define sSR IP_VS_TCP_S_SYN_RECV
+#define sFW IP_VS_TCP_S_FIN_WAIT
+#define sTW IP_VS_TCP_S_TIME_WAIT
+#define sCL IP_VS_TCP_S_CLOSE
+#define sCW IP_VS_TCP_S_CLOSE_WAIT
+#define sLA IP_VS_TCP_S_LAST_ACK
+#define sLI IP_VS_TCP_S_LISTEN
+#define sSA IP_VS_TCP_S_SYNACK
+
+struct tcp_states_t {
+ int next_state[IP_VS_TCP_S_LAST];
+};
+
+static const char * tcp_state_name(int state)
+{
+ if (state >= IP_VS_TCP_S_LAST)
+ return "ERR!";
+ return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?";
+}
+
+static struct tcp_states_t tcp_states [] = {
+/* INPUT */
+/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
+/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
+/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
+/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
+/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},
+
+/* OUTPUT */
+/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
+/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
+/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
+/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
+/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
+
+/* INPUT-ONLY */
+/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
+/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
+/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
+/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
+/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
+};
+
+static struct tcp_states_t tcp_states_dos [] = {
+/* INPUT */
+/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
+/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
+/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
+/*ack*/ {{sES, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
+/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
+
+/* OUTPUT */
+/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
+/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
+/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
+/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
+/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
+
+/* INPUT-ONLY */
+/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
+/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
+/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
+/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
+/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
+};
+
+static void tcp_timeout_change(struct ip_vs_proto_data *pd, int flags)
+{
+ int on = (flags & 1); /* secure_tcp */
+
+ /*
+ ** FIXME: change secure_tcp to independent sysctl var
+ ** or make it per-service or per-app because it is valid
+ ** for most if not for all of the applications. Something
+ ** like "capabilities" (flags) for each object.
+ */
+ pd->tcp_state_table = (on ? tcp_states_dos : tcp_states);
+}
+
+static inline int tcp_state_idx(struct tcphdr *th)
+{
+ if (th->rst)
+ return 3;
+ if (th->syn)
+ return 0;
+ if (th->fin)
+ return 1;
+ if (th->ack)
+ return 2;
+ return -1;
+}
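
In set_tcp_state() below, tcp_state_off[] selects one of three four-row blocks (input, output, input-only) and tcp_state_idx() selects the event row (syn, fin, ack, rst), so the next state is read as tcp_state_table[state_off + state_idx].next_state[cp->state]. A reduced stand-alone sketch of that flat-table layout, with toy states and only two rows filled in; this is illustrative code, not part of the kernel sources:

#include <stdio.h>

enum { EV_SYN, EV_FIN, EV_ACK, EV_RST };			/* row inside a block */
enum { DIR_INPUT = 0, DIR_OUTPUT = 4, DIR_INPUT_ONLY = 8 };	/* block offsets */
enum { S_NONE, S_EST, S_CLOSE, S_NR };				/* reduced state set */

struct row { int next_state[S_NR]; };

static const struct row table[12] = {
	/* INPUT block; unset rows default to S_NONE in this demo */
	[DIR_INPUT + EV_SYN] = {{ S_EST,   S_EST,   S_EST   }},
	[DIR_INPUT + EV_RST] = {{ S_CLOSE, S_CLOSE, S_CLOSE }},
	/* the OUTPUT and INPUT_ONLY blocks would follow the same pattern */
};

static int next_state(int dir_off, int event, int cur)
{
	return table[dir_off + event].next_state[cur];
}

int main(void)
{
	/* syn seen on input while in S_NONE -> S_EST (prints 1) */
	printf("%d\n", next_state(DIR_INPUT, EV_SYN, S_NONE));
	return 0;
}
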
+
+static inline void
+set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
+ int direction, struct tcphdr *th)
+{
+ int state_idx;
+ int new_state = IP_VS_TCP_S_CLOSE;
+ int state_off = tcp_state_off[direction];
+
+ /*
+ * Update state offset to INPUT_ONLY if necessary
+ * or delete NO_OUTPUT flag if output packet detected
+ */
+ if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
+ if (state_off == TCP_DIR_OUTPUT)
+ cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
+ else
+ state_off = TCP_DIR_INPUT_ONLY;
+ }
+
+ if ((state_idx = tcp_state_idx(th)) < 0) {
+ IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx);
+ goto tcp_state_out;
+ }
+
+ new_state =
+ pd->tcp_state_table[state_off+state_idx].next_state[cp->state];
+
+ tcp_state_out:
+ if (new_state != cp->state) {
+ struct ip_vs_dest *dest = cp->dest;
+
+ IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->"
+ "%s:%d state: %s->%s conn->refcnt:%d\n",
+ pd->pp->name,
+ ((state_off == TCP_DIR_OUTPUT) ?
+ "output " : "input "),
+ th->syn ? 'S' : '.',
+ th->fin ? 'F' : '.',
+ th->ack ? 'A' : '.',
+ th->rst ? 'R' : '.',
+ IP_VS_DBG_ADDR(cp->daf, &cp->daddr),
+ ntohs(cp->dport),
+ IP_VS_DBG_ADDR(cp->af, &cp->caddr),
+ ntohs(cp->cport),
+ tcp_state_name(cp->state),
+ tcp_state_name(new_state),
+ atomic_read(&cp->refcnt));
+
+ if (dest) {
+ if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
+ (new_state != IP_VS_TCP_S_ESTABLISHED)) {
+ atomic_dec(&dest->activeconns);
+ atomic_inc(&dest->inactconns);
+ cp->flags |= IP_VS_CONN_F_INACTIVE;
+ } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
+ (new_state == IP_VS_TCP_S_ESTABLISHED)) {
+ atomic_inc(&dest->activeconns);
+ atomic_dec(&dest->inactconns);
+ cp->flags &= ~IP_VS_CONN_F_INACTIVE;
+ }
+ }
+ }
+
+ if (likely(pd))
+ cp->timeout = pd->timeout_table[cp->state = new_state];
+ else /* What to do ? */
+ cp->timeout = tcp_timeouts[cp->state = new_state];
+}
+
+/*
+ * Handle state transitions
+ */
+static void
+tcp_state_transition(struct ip_vs_conn *cp, int direction,
+ const struct sk_buff *skb,
+ struct ip_vs_proto_data *pd)
+{
+ struct tcphdr _tcph, *th;
+
+#ifdef CONFIG_IP_VS_IPV6
+ int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr);
+#else
+ int ihl = ip_hdrlen(skb);
+#endif
+
+ th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph);
+ if (th == NULL)
+ return;
+
+ spin_lock_bh(&cp->lock);
+ set_tcp_state(pd, cp, direction, th);
+ spin_unlock_bh(&cp->lock);
+}
+
+static inline __u16 tcp_app_hashkey(__be16 port)
+{
+ return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port)
+ & TCP_APP_TAB_MASK;
+}
+
+
+static int tcp_register_app(struct net *net, struct ip_vs_app *inc)
+{
+ struct ip_vs_app *i;
+ __u16 hash;
+ __be16 port = inc->port;
+ int ret = 0;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+
+ hash = tcp_app_hashkey(port);
+
+ list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) {
+ if (i->port == port) {
+ ret = -EEXIST;
+ goto out;
+ }
+ }
+ list_add_rcu(&inc->p_list, &ipvs->tcp_apps[hash]);
+ atomic_inc(&pd->appcnt);
+
+ out:
+ return ret;
+}
+
+
+static void
+tcp_unregister_app(struct net *net, struct ip_vs_app *inc)
+{
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+
+ atomic_dec(&pd->appcnt);
+ list_del_rcu(&inc->p_list);
+}
+
+
+static int
+tcp_app_conn_bind(struct ip_vs_conn *cp)
+{
+ struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
+ int hash;
+ struct ip_vs_app *inc;
+ int result = 0;
+
+ /* Default binding: bind app only for NAT */
+ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
+ return 0;
+
+ /* Lookup application incarnations and bind the right one */
+ hash = tcp_app_hashkey(cp->vport);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(inc, &ipvs->tcp_apps[hash], p_list) {
+ if (inc->port == cp->vport) {
+ if (unlikely(!ip_vs_app_inc_get(inc)))
+ break;
+ rcu_read_unlock();
+
+ IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
+ "%s:%u to app %s on port %u\n",
+ __func__,
+ IP_VS_DBG_ADDR(cp->af, &cp->caddr),
+ ntohs(cp->cport),
+ IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
+ ntohs(cp->vport),
+ inc->name, ntohs(inc->port));
+
+ cp->app = inc;
+ if (inc->init_conn)
+ result = inc->init_conn(inc, cp);
+ goto out;
+ }
+ }
+ rcu_read_unlock();
+
+ out:
+ return result;
+}
+
+
+/*
+ * Set LISTEN timeout. (ip_vs_conn_put will setup timer)
+ */
+void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp)
+{
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
+
+ spin_lock_bh(&cp->lock);
+ cp->state = IP_VS_TCP_S_LISTEN;
+ cp->timeout = (pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN]
+ : tcp_timeouts[IP_VS_TCP_S_LISTEN]);
+ spin_unlock_bh(&cp->lock);
+}
+
+/* ---------------------------------------------
+ * timeouts are now per-netns.
+ * ---------------------------------------------
+ */
+static int __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE);
+ pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts,
+ sizeof(tcp_timeouts));
+ if (!pd->timeout_table)
+ return -ENOMEM;
+ pd->tcp_state_table = tcp_states;
+ return 0;
+}
+
+static void __ip_vs_tcp_exit(struct net *net, struct ip_vs_proto_data *pd)
+{
+ kfree(pd->timeout_table);
+}
+
+
+struct ip_vs_protocol ip_vs_protocol_tcp = {
+ .name = "TCP",
+ .protocol = IPPROTO_TCP,
+ .num_states = IP_VS_TCP_S_LAST,
+ .dont_defrag = 0,
+ .init = NULL,
+ .exit = NULL,
+ .init_netns = __ip_vs_tcp_init,
+ .exit_netns = __ip_vs_tcp_exit,
+ .register_app = tcp_register_app,
+ .unregister_app = tcp_unregister_app,
+ .conn_schedule = tcp_conn_schedule,
+ .conn_in_get = ip_vs_conn_in_get_proto,
+ .conn_out_get = ip_vs_conn_out_get_proto,
+ .snat_handler = tcp_snat_handler,
+ .dnat_handler = tcp_dnat_handler,
+ .csum_check = tcp_csum_check,
+ .state_name = tcp_state_name,
+ .state_transition = tcp_state_transition,
+ .app_conn_bind = tcp_app_conn_bind,
+ .debug_packet = ip_vs_tcpudp_debug_packet,
+ .timeout_change = tcp_timeout_change,
+};
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
new file mode 100644
index 000000000..b62a3c0ff
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -0,0 +1,499 @@
+/*
+ * ip_vs_proto_udp.c: UDP load balancing support for IPVS
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Julian Anastasov <ja@ssi.bg>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com>
+ * Network name space (netns) aware.
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/kernel.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/udp.h>
+
+#include <net/ip_vs.h>
+#include <net/ip.h>
+#include <net/ip6_checksum.h>
+
+static int
+udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
+ int *verdict, struct ip_vs_conn **cpp,
+ struct ip_vs_iphdr *iph)
+{
+ struct net *net;
+ struct ip_vs_service *svc;
+ struct udphdr _udph, *uh;
+
+ /* IPv6 fragments, only first fragment will hit this */
+ uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph);
+ if (uh == NULL) {
+ *verdict = NF_DROP;
+ return 0;
+ }
+ net = skb_net(skb);
+ rcu_read_lock();
+ svc = ip_vs_service_find(net, af, skb->mark, iph->protocol,
+ &iph->daddr, uh->dest);
+ if (svc) {
+ int ignored;
+
+ if (ip_vs_todrop(net_ipvs(net))) {
+ /*
+ * It seems that we are very loaded.
+ * We have to drop this packet :(
+ */
+ rcu_read_unlock();
+ *verdict = NF_DROP;
+ return 0;
+ }
+
+ /*
+ * Let the virtual server select a real server for the
+ * incoming connection, and create a connection entry.
+ */
+ *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph);
+ if (!*cpp && ignored <= 0) {
+ if (!ignored)
+ *verdict = ip_vs_leave(svc, skb, pd, iph);
+ else
+ *verdict = NF_DROP;
+ rcu_read_unlock();
+ return 0;
+ }
+ }
+ rcu_read_unlock();
+ /* NF_ACCEPT */
+ return 1;
+}
+
+
+static inline void
+udp_fast_csum_update(int af, struct udphdr *uhdr,
+ const union nf_inet_addr *oldip,
+ const union nf_inet_addr *newip,
+ __be16 oldport, __be16 newport)
+{
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ uhdr->check =
+ csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
+ ip_vs_check_diff2(oldport, newport,
+ ~csum_unfold(uhdr->check))));
+ else
+#endif
+ uhdr->check =
+ csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
+ ip_vs_check_diff2(oldport, newport,
+ ~csum_unfold(uhdr->check))));
+ if (!uhdr->check)
+ uhdr->check = CSUM_MANGLED_0;
+}
+
+static inline void
+udp_partial_csum_update(int af, struct udphdr *uhdr,
+ const union nf_inet_addr *oldip,
+ const union nf_inet_addr *newip,
+ __be16 oldlen, __be16 newlen)
+{
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ uhdr->check =
+ ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
+ ip_vs_check_diff2(oldlen, newlen,
+ csum_unfold(uhdr->check))));
+ else
+#endif
+ uhdr->check =
+ ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
+ ip_vs_check_diff2(oldlen, newlen,
+ csum_unfold(uhdr->check))));
+}
+
+
+static int
+udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
+{
+ struct udphdr *udph;
+ unsigned int udphoff = iph->len;
+ int oldlen;
+ int payload_csum = 0;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6 && iph->fragoffs)
+ return 1;
+#endif
+ oldlen = skb->len - udphoff;
+
+ /* csum_check requires unshared skb */
+ if (!skb_make_writable(skb, udphoff+sizeof(*udph)))
+ return 0;
+
+ if (unlikely(cp->app != NULL)) {
+ int ret;
+
+ /* Some checks before mangling */
+ if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
+ return 0;
+
+ /*
+ * Call application helper if needed
+ */
+ if (!(ret = ip_vs_app_pkt_out(cp, skb)))
+ return 0;
+ /* ret=2: csum update is needed after payload mangling */
+ if (ret == 1)
+ oldlen = skb->len - udphoff;
+ else
+ payload_csum = 1;
+ }
+
+ udph = (void *)skb_network_header(skb) + udphoff;
+ udph->source = cp->vport;
+
+ /*
+ * Adjust UDP checksums
+ */
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
+ htons(oldlen),
+ htons(skb->len - udphoff));
+ } else if (!payload_csum && (udph->check != 0)) {
+ /* Only port and addr are changed, do fast csum update */
+ udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr,
+ cp->dport, cp->vport);
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ skb->ip_summed = (cp->app && pp->csum_check) ?
+ CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
+ } else {
+ /* full checksum calculation */
+ udph->check = 0;
+ skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0);
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6)
+ udph->check = csum_ipv6_magic(&cp->vaddr.in6,
+ &cp->caddr.in6,
+ skb->len - udphoff,
+ cp->protocol, skb->csum);
+ else
+#endif
+ udph->check = csum_tcpudp_magic(cp->vaddr.ip,
+ cp->caddr.ip,
+ skb->len - udphoff,
+ cp->protocol,
+ skb->csum);
+ /* A UDP checksum of 0 means "no checksum"; send 0xffff
+ * (CSUM_MANGLED_0) instead when the computed sum is 0.
+ */
+ if (udph->check == 0)
+ udph->check = CSUM_MANGLED_0;
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
+ pp->name, udph->check,
+ (char*)&(udph->check) - (char*)udph);
+ }
+ return 1;
+}
+
+
+static int
+udp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
+ struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
+{
+ struct udphdr *udph;
+ unsigned int udphoff = iph->len;
+ int oldlen;
+ int payload_csum = 0;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6 && iph->fragoffs)
+ return 1;
+#endif
+ oldlen = skb->len - udphoff;
+
+ /* csum_check requires unshared skb */
+ if (!skb_make_writable(skb, udphoff+sizeof(*udph)))
+ return 0;
+
+ if (unlikely(cp->app != NULL)) {
+ int ret;
+
+ /* Some checks before mangling */
+ if (pp->csum_check && !pp->csum_check(cp->af, skb, pp))
+ return 0;
+
+ /*
+ * Attempt ip_vs_app call.
+ * It will fix ip_vs_conn
+ */
+ if (!(ret = ip_vs_app_pkt_in(cp, skb)))
+ return 0;
+ /* ret=2: csum update is needed after payload mangling */
+ if (ret == 1)
+ oldlen = skb->len - udphoff;
+ else
+ payload_csum = 1;
+ }
+
+ udph = (void *)skb_network_header(skb) + udphoff;
+ udph->dest = cp->dport;
+
+ /*
+ * Adjust UDP checksums
+ */
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ udp_partial_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr,
+ htons(oldlen),
+ htons(skb->len - udphoff));
+ } else if (!payload_csum && (udph->check != 0)) {
+ /* Only port and addr are changed, do fast csum update */
+ udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr,
+ cp->vport, cp->dport);
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ skb->ip_summed = (cp->app && pp->csum_check) ?
+ CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
+ } else {
+ /* full checksum calculation */
+ udph->check = 0;
+ skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0);
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6)
+ udph->check = csum_ipv6_magic(&cp->caddr.in6,
+ &cp->daddr.in6,
+ skb->len - udphoff,
+ cp->protocol, skb->csum);
+ else
+#endif
+ udph->check = csum_tcpudp_magic(cp->caddr.ip,
+ cp->daddr.ip,
+ skb->len - udphoff,
+ cp->protocol,
+ skb->csum);
+ if (udph->check == 0)
+ udph->check = CSUM_MANGLED_0;
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
+ return 1;
+}
+
+
+static int
+udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
+{
+ struct udphdr _udph, *uh;
+ unsigned int udphoff;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ udphoff = sizeof(struct ipv6hdr);
+ else
+#endif
+ udphoff = ip_hdrlen(skb);
+
+ uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph);
+ if (uh == NULL)
+ return 0;
+
+ if (uh->check != 0) {
+ switch (skb->ip_summed) {
+ case CHECKSUM_NONE:
+ skb->csum = skb_checksum(skb, udphoff,
+ skb->len - udphoff, 0);
+ /* fall through */
+ case CHECKSUM_COMPLETE:
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6) {
+ if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
+ &ipv6_hdr(skb)->daddr,
+ skb->len - udphoff,
+ ipv6_hdr(skb)->nexthdr,
+ skb->csum)) {
+ IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
+ "Failed checksum for");
+ return 0;
+ }
+ } else
+#endif
+ if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
+ ip_hdr(skb)->daddr,
+ skb->len - udphoff,
+ ip_hdr(skb)->protocol,
+ skb->csum)) {
+ IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
+ "Failed checksum for");
+ return 0;
+ }
+ break;
+ default:
+ /* No need to checksum. */
+ break;
+ }
+ }
+ return 1;
+}
+
+static inline __u16 udp_app_hashkey(__be16 port)
+{
+ return (((__force u16)port >> UDP_APP_TAB_BITS) ^ (__force u16)port)
+ & UDP_APP_TAB_MASK;
+}
+
+
+static int udp_register_app(struct net *net, struct ip_vs_app *inc)
+{
+ struct ip_vs_app *i;
+ __u16 hash;
+ __be16 port = inc->port;
+ int ret = 0;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
+
+ hash = udp_app_hashkey(port);
+
+ list_for_each_entry(i, &ipvs->udp_apps[hash], p_list) {
+ if (i->port == port) {
+ ret = -EEXIST;
+ goto out;
+ }
+ }
+ list_add_rcu(&inc->p_list, &ipvs->udp_apps[hash]);
+ atomic_inc(&pd->appcnt);
+
+ out:
+ return ret;
+}
+
+
+static void
+udp_unregister_app(struct net *net, struct ip_vs_app *inc)
+{
+ struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
+
+ atomic_dec(&pd->appcnt);
+ list_del_rcu(&inc->p_list);
+}
+
+
+static int udp_app_conn_bind(struct ip_vs_conn *cp)
+{
+ struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
+ int hash;
+ struct ip_vs_app *inc;
+ int result = 0;
+
+ /* Default binding: bind app only for NAT */
+ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
+ return 0;
+
+ /* Lookup application incarnations and bind the right one */
+ hash = udp_app_hashkey(cp->vport);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(inc, &ipvs->udp_apps[hash], p_list) {
+ if (inc->port == cp->vport) {
+ if (unlikely(!ip_vs_app_inc_get(inc)))
+ break;
+ rcu_read_unlock();
+
+ IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
+ "%s:%u to app %s on port %u\n",
+ __func__,
+ IP_VS_DBG_ADDR(cp->af, &cp->caddr),
+ ntohs(cp->cport),
+ IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
+ ntohs(cp->vport),
+ inc->name, ntohs(inc->port));
+
+ cp->app = inc;
+ if (inc->init_conn)
+ result = inc->init_conn(inc, cp);
+ goto out;
+ }
+ }
+ rcu_read_unlock();
+
+ out:
+ return result;
+}
+
+
+static const int udp_timeouts[IP_VS_UDP_S_LAST+1] = {
+ [IP_VS_UDP_S_NORMAL] = 5*60*HZ,
+ [IP_VS_UDP_S_LAST] = 2*HZ,
+};
+
+static const char *const udp_state_name_table[IP_VS_UDP_S_LAST+1] = {
+ [IP_VS_UDP_S_NORMAL] = "UDP",
+ [IP_VS_UDP_S_LAST] = "BUG!",
+};
+
+static const char * udp_state_name(int state)
+{
+ if (state >= IP_VS_UDP_S_LAST)
+ return "ERR!";
+ return udp_state_name_table[state] ? udp_state_name_table[state] : "?";
+}
+
+static void
+udp_state_transition(struct ip_vs_conn *cp, int direction,
+ const struct sk_buff *skb,
+ struct ip_vs_proto_data *pd)
+{
+ if (unlikely(!pd)) {
+ pr_err("UDP no ns data\n");
+ return;
+ }
+
+ cp->timeout = pd->timeout_table[IP_VS_UDP_S_NORMAL];
+}
+
+static int __udp_init(struct net *net, struct ip_vs_proto_data *pd)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ ip_vs_init_hash_table(ipvs->udp_apps, UDP_APP_TAB_SIZE);
+ pd->timeout_table = ip_vs_create_timeout_table((int *)udp_timeouts,
+ sizeof(udp_timeouts));
+ if (!pd->timeout_table)
+ return -ENOMEM;
+ return 0;
+}
+
+static void __udp_exit(struct net *net, struct ip_vs_proto_data *pd)
+{
+ kfree(pd->timeout_table);
+}
+
+
+struct ip_vs_protocol ip_vs_protocol_udp = {
+ .name = "UDP",
+ .protocol = IPPROTO_UDP,
+ .num_states = IP_VS_UDP_S_LAST,
+ .dont_defrag = 0,
+ .init = NULL,
+ .exit = NULL,
+ .init_netns = __udp_init,
+ .exit_netns = __udp_exit,
+ .conn_schedule = udp_conn_schedule,
+ .conn_in_get = ip_vs_conn_in_get_proto,
+ .conn_out_get = ip_vs_conn_out_get_proto,
+ .snat_handler = udp_snat_handler,
+ .dnat_handler = udp_dnat_handler,
+ .csum_check = udp_csum_check,
+ .state_transition = udp_state_transition,
+ .state_name = udp_state_name,
+ .register_app = udp_register_app,
+ .unregister_app = udp_unregister_app,
+ .app_conn_bind = udp_app_conn_bind,
+ .debug_packet = ip_vs_tcpudp_debug_packet,
+ .timeout_change = NULL,
+};
diff --git a/net/netfilter/ipvs/ip_vs_rr.c b/net/netfilter/ipvs/ip_vs_rr.c
new file mode 100644
index 000000000..58bacfc46
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_rr.c
@@ -0,0 +1,130 @@
+/*
+ * IPVS: Round-Robin Scheduling module
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Peter Kese <peter.kese@ijs.si>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Fixes/Changes:
+ * Wensong Zhang : changed the ip_vs_rr_schedule to return dest
+ * Julian Anastasov : fixed the NULL pointer access bug in debugging
+ * Wensong Zhang : changed some cosmetic things for debugging
+ * Wensong Zhang : changed for the d-linked destination list
+ * Wensong Zhang : added the ip_vs_rr_update_svc
+ * Wensong Zhang : added any dest with weight=0 is quiesced
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+
+static int ip_vs_rr_init_svc(struct ip_vs_service *svc)
+{
+ svc->sched_data = &svc->destinations;
+ return 0;
+}
+
+
+static int ip_vs_rr_del_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest)
+{
+ struct list_head *p;
+
+ spin_lock_bh(&svc->sched_lock);
+ p = (struct list_head *) svc->sched_data;
+ /* dest is already unlinked, so p->prev is not valid but
+ * p->next is valid, use it to reach previous entry.
+ */
+ if (p == &dest->n_list)
+ svc->sched_data = p->next->prev;
+ spin_unlock_bh(&svc->sched_lock);
+ return 0;
+}
+
+
+/*
+ * Round-Robin Scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
+{
+ struct list_head *p;
+ struct ip_vs_dest *dest, *last;
+ int pass = 0;
+
+ IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
+
+ spin_lock_bh(&svc->sched_lock);
+ p = (struct list_head *) svc->sched_data;
+ last = dest = list_entry(p, struct ip_vs_dest, n_list);
+
+ do {
+ list_for_each_entry_continue_rcu(dest,
+ &svc->destinations,
+ n_list) {
+ if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+ atomic_read(&dest->weight) > 0)
+ /* HIT */
+ goto out;
+ if (dest == last)
+ goto stop;
+ }
+ pass++;
+ /* Previous dest could be unlinked, do not loop forever.
+ * If we stay at head there is no need for 2nd pass.
+ */
+ } while (pass < 2 && p != &svc->destinations);
+
+stop:
+ spin_unlock_bh(&svc->sched_lock);
+ ip_vs_scheduler_err(svc, "no destination available");
+ return NULL;
+
+ out:
+ svc->sched_data = &dest->n_list;
+ spin_unlock_bh(&svc->sched_lock);
+ IP_VS_DBG_BUF(6, "RR: server %s:%u "
+ "activeconns %d refcnt %d weight %d\n",
+ IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port),
+ atomic_read(&dest->activeconns),
+ atomic_read(&dest->refcnt), atomic_read(&dest->weight));
+
+ return dest;
+}
+
+
+static struct ip_vs_scheduler ip_vs_rr_scheduler = {
+ .name = "rr", /* name */
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list),
+ .init_service = ip_vs_rr_init_svc,
+ .add_dest = NULL,
+ .del_dest = ip_vs_rr_del_dest,
+ .schedule = ip_vs_rr_schedule,
+};
+
+static int __init ip_vs_rr_init(void)
+{
+ return register_ip_vs_scheduler(&ip_vs_rr_scheduler);
+}
+
+static void __exit ip_vs_rr_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_rr_scheduler);
+ synchronize_rcu();
+}
+
+module_init(ip_vs_rr_init);
+module_exit(ip_vs_rr_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c
new file mode 100644
index 000000000..199760c71
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_sched.c
@@ -0,0 +1,254 @@
+/*
+ * IPVS An implementation of the IP virtual server support for the
+ * LINUX operating system. IPVS is now implemented as a module
+ * over the Netfilter framework. IPVS can be used to build a
+ * high-performance and highly available server based on a
+ * cluster of servers.
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Peter Kese <peter.kese@ijs.si>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <asm/string.h>
+#include <linux/kmod.h>
+#include <linux/sysctl.h>
+
+#include <net/ip_vs.h>
+
+EXPORT_SYMBOL(ip_vs_scheduler_err);
+/*
+ * IPVS scheduler list
+ */
+static LIST_HEAD(ip_vs_schedulers);
+
+/* mutex protecting the scheduler list */
+static DEFINE_MUTEX(ip_vs_sched_mutex);
+
+
+/*
+ * Bind a service with a scheduler
+ */
+int ip_vs_bind_scheduler(struct ip_vs_service *svc,
+ struct ip_vs_scheduler *scheduler)
+{
+ int ret;
+
+ if (scheduler->init_service) {
+ ret = scheduler->init_service(svc);
+ if (ret) {
+ pr_err("%s(): init error\n", __func__);
+ return ret;
+ }
+ }
+ rcu_assign_pointer(svc->scheduler, scheduler);
+ return 0;
+}
+
+
+/*
+ * Unbind a service from its scheduler
+ */
+void ip_vs_unbind_scheduler(struct ip_vs_service *svc,
+ struct ip_vs_scheduler *sched)
+{
+ struct ip_vs_scheduler *cur_sched;
+
+ cur_sched = rcu_dereference_protected(svc->scheduler, 1);
+ /* This check proves that old 'sched' was installed */
+ if (!cur_sched)
+ return;
+
+ if (sched->done_service)
+ sched->done_service(svc);
+ /* svc->scheduler can not be set to NULL */
+}
+
+
+/*
+ * Get scheduler in the scheduler list by name
+ */
+static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name)
+{
+ struct ip_vs_scheduler *sched;
+
+ IP_VS_DBG(2, "%s(): sched_name \"%s\"\n", __func__, sched_name);
+
+ mutex_lock(&ip_vs_sched_mutex);
+
+ list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
+ /*
+ * Test and get the modules atomically
+ */
+ if (sched->module && !try_module_get(sched->module)) {
+ /*
+ * This scheduler is just deleted
+ */
+ continue;
+ }
+ if (strcmp(sched_name, sched->name)==0) {
+ /* HIT */
+ mutex_unlock(&ip_vs_sched_mutex);
+ return sched;
+ }
+ module_put(sched->module);
+ }
+
+ mutex_unlock(&ip_vs_sched_mutex);
+ return NULL;
+}
+
+
+/*
+ * Lookup scheduler and try to load it if it doesn't exist
+ */
+struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name)
+{
+ struct ip_vs_scheduler *sched;
+
+ /*
+ * Search for the scheduler by sched_name
+ */
+ sched = ip_vs_sched_getbyname(sched_name);
+
+ /*
+ * If scheduler not found, load the module and search again
+ */
+ if (sched == NULL) {
+ request_module("ip_vs_%s", sched_name);
+ sched = ip_vs_sched_getbyname(sched_name);
+ }
+
+ return sched;
+}
+
+void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler)
+{
+ if (scheduler && scheduler->module)
+ module_put(scheduler->module);
+}
+
+/*
+ * Common error output helper for schedulers
+ */
+
+void ip_vs_scheduler_err(struct ip_vs_service *svc, const char *msg)
+{
+ struct ip_vs_scheduler *sched;
+
+ sched = rcu_dereference(svc->scheduler);
+ if (svc->fwmark) {
+ IP_VS_ERR_RL("%s: FWM %u 0x%08X - %s\n",
+ sched->name, svc->fwmark, svc->fwmark, msg);
+#ifdef CONFIG_IP_VS_IPV6
+ } else if (svc->af == AF_INET6) {
+ IP_VS_ERR_RL("%s: %s [%pI6c]:%d - %s\n",
+ sched->name, ip_vs_proto_name(svc->protocol),
+ &svc->addr.in6, ntohs(svc->port), msg);
+#endif
+ } else {
+ IP_VS_ERR_RL("%s: %s %pI4:%d - %s\n",
+ sched->name, ip_vs_proto_name(svc->protocol),
+ &svc->addr.ip, ntohs(svc->port), msg);
+ }
+}
+
+/*
+ * Register a scheduler in the scheduler list
+ */
+int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
+{
+ struct ip_vs_scheduler *sched;
+
+ if (!scheduler) {
+ pr_err("%s(): NULL arg\n", __func__);
+ return -EINVAL;
+ }
+
+ if (!scheduler->name) {
+ pr_err("%s(): NULL scheduler_name\n", __func__);
+ return -EINVAL;
+ }
+
+ /* increase the module use count */
+ ip_vs_use_count_inc();
+
+ mutex_lock(&ip_vs_sched_mutex);
+
+ if (!list_empty(&scheduler->n_list)) {
+ mutex_unlock(&ip_vs_sched_mutex);
+ ip_vs_use_count_dec();
+ pr_err("%s(): [%s] scheduler already linked\n",
+ __func__, scheduler->name);
+ return -EINVAL;
+ }
+
+ /*
+ * Make sure that the scheduler with this name doesn't exist
+ * in the scheduler list.
+ */
+ list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
+ if (strcmp(scheduler->name, sched->name) == 0) {
+ mutex_unlock(&ip_vs_sched_mutex);
+ ip_vs_use_count_dec();
+ pr_err("%s(): [%s] scheduler already exists "
+ "in the system\n", __func__, scheduler->name);
+ return -EINVAL;
+ }
+ }
+ /*
+ * Add it into the d-linked scheduler list
+ */
+ list_add(&scheduler->n_list, &ip_vs_schedulers);
+ mutex_unlock(&ip_vs_sched_mutex);
+
+ pr_info("[%s] scheduler registered.\n", scheduler->name);
+
+ return 0;
+}
+
+
+/*
+ * Unregister a scheduler from the scheduler list
+ */
+int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
+{
+ if (!scheduler) {
+ pr_err("%s(): NULL arg\n", __func__);
+ return -EINVAL;
+ }
+
+ mutex_lock(&ip_vs_sched_mutex);
+ if (list_empty(&scheduler->n_list)) {
+ mutex_unlock(&ip_vs_sched_mutex);
+ pr_err("%s(): [%s] scheduler is not in the list. failed\n",
+ __func__, scheduler->name);
+ return -EINVAL;
+ }
+
+ /*
+ * Remove it from the d-linked scheduler list
+ */
+ list_del(&scheduler->n_list);
+ mutex_unlock(&ip_vs_sched_mutex);
+
+ /* decrease the module use count */
+ ip_vs_use_count_dec();
+
+ pr_info("[%s] scheduler unregistered.\n", scheduler->name);
+
+ return 0;
+}
diff --git a/net/netfilter/ipvs/ip_vs_sed.c b/net/netfilter/ipvs/ip_vs_sed.c
new file mode 100644
index 000000000..f8e2d00f5
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_sed.c
@@ -0,0 +1,144 @@
+/*
+ * IPVS: Shortest Expected Delay scheduling module
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+/*
+ * The SED algorithm attempts to minimize each job's expected delay until
+ * completion. The expected delay that the job will experience is
+ * (Ci + 1) / Ui if sent to the ith server, in which Ci is the number of
+ * jobs on the ith server and Ui is the fixed service rate (weight) of
+ * the ith server. The SED algorithm adopts a greedy policy in which each
+ * job does what is in its own best interest, i.e. joins the queue that
+ * would minimize its expected delay of completion.
+ *
+ * See the following paper for more information:
+ * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing
+ * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88,
+ * pages 986-994, 1988.
+ *
+ * Thanks must go to Marko Buuri <marko@buuri.name> for talking SED to me.
+ *
+ * The difference between SED and WLC is that SED includes the incoming
+ * job in the cost function (the increment of 1). SED may outperform
+ * WLC when scheduling big jobs on highly heterogeneous systems
+ * (where the server weights vary a lot).
+ *
+ */
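
A minimal user-space sketch of the selection rule described above: choose the destination with the smallest (Ci + 1) / Ui, compared by cross multiplication so that no floating point is needed, just as the scheduler below does. The destinations, connection counts and weights are invented for the demo; this is not part of the kernel sources:

#include <stdint.h>
#include <stdio.h>

struct dest { const char *name; int activeconns; int weight; };

static int sed_overhead(const struct dest *d)
{
	return d->activeconns + 1;		/* Ci + 1 */
}

int main(void)
{
	struct dest pool[] = {
		{ "rs1", 10, 1 },
		{ "rs2", 25, 3 },
		{ "rs3", 40, 5 },
	};
	struct dest *least = &pool[0];
	int loh = sed_overhead(least);

	for (unsigned int i = 1; i < sizeof(pool) / sizeof(pool[0]); i++) {
		int doh = sed_overhead(&pool[i]);

		/* loh/lw > doh/dw  <=>  loh*dw > doh*lw  (weights > 0) */
		if ((int64_t)loh * pool[i].weight >
		    (int64_t)doh * least->weight) {
			least = &pool[i];
			loh = doh;
		}
	}
	/* picks rs3: 41/5 beats 26/3 and 11/1 */
	printf("selected %s (overhead %d, weight %d)\n",
	       least->name, loh, least->weight);
	return 0;
}
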
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+
+static inline int
+ip_vs_sed_dest_overhead(struct ip_vs_dest *dest)
+{
+ /*
+ * We only use the active connection number in the cost
+ * calculation here.
+ */
+ return atomic_read(&dest->activeconns) + 1;
+}
+
+
+/*
+ * Shortest Expected Delay scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
+{
+ struct ip_vs_dest *dest, *least;
+ int loh, doh;
+
+ IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
+
+ /*
+ * We calculate the load of each dest server as follows:
+ * (server expected overhead) / dest->weight
+ *
+ * Remember -- no floats in kernel mode!!!
+ * The comparison of h1*w2 > h2*w1 is equivalent to that of
+ * h1/w1 > h2/w2
+ * if every weight is larger than zero.
+ *
+ * The server with weight=0 is quiesced and will not receive any
+ * new connections.
+ */
+
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
+ if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+ atomic_read(&dest->weight) > 0) {
+ least = dest;
+ loh = ip_vs_sed_dest_overhead(least);
+ goto nextstage;
+ }
+ }
+ ip_vs_scheduler_err(svc, "no destination available");
+ return NULL;
+
+ /*
+ * Find the destination with the least load.
+ */
+ nextstage:
+ list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+ continue;
+ doh = ip_vs_sed_dest_overhead(dest);
+ if ((__s64)loh * atomic_read(&dest->weight) >
+ (__s64)doh * atomic_read(&least->weight)) {
+ least = dest;
+ loh = doh;
+ }
+ }
+
+ IP_VS_DBG_BUF(6, "SED: server %s:%u "
+ "activeconns %d refcnt %d weight %d overhead %d\n",
+ IP_VS_DBG_ADDR(least->af, &least->addr),
+ ntohs(least->port),
+ atomic_read(&least->activeconns),
+ atomic_read(&least->refcnt),
+ atomic_read(&least->weight), loh);
+
+ return least;
+}
+
+
+static struct ip_vs_scheduler ip_vs_sed_scheduler =
+{
+ .name = "sed",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_sed_scheduler.n_list),
+ .schedule = ip_vs_sed_schedule,
+};
+
+
+static int __init ip_vs_sed_init(void)
+{
+ return register_ip_vs_scheduler(&ip_vs_sed_scheduler);
+}
+
+static void __exit ip_vs_sed_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_sed_scheduler);
+ synchronize_rcu();
+}
+
+module_init(ip_vs_sed_init);
+module_exit(ip_vs_sed_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
new file mode 100644
index 000000000..98a13433b
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_sh.c
@@ -0,0 +1,385 @@
+/*
+ * IPVS: Source Hashing scheduling module
+ *
+ * Authors: Wensong Zhang <wensong@gnuchina.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+/*
+ * The sh algorithm is to select server by the hash key of source IP
+ * address. The pseudo code is as follows:
+ *
+ * n <- servernode[src_ip];
+ * if (n is dead) OR
+ * (n is overloaded) or (n.weight <= 0) then
+ * return NULL;
+ *
+ * return n;
+ *
+ * Note that servernode is a 256-bucket hash table that maps the hash
+ * index derived from the packet source IP address to the current server
+ * array. If the sh scheduler is used in a cache cluster, it is good to
+ * combine it with the cache_bypass feature. When the statically assigned
+ * server is dead or overloaded, the load balancer can bypass the cache
+ * server and send requests to the original server directly.
+ *
+ * The weight destination attribute can be used to control the
+ * distribution of connections to the destinations in servernode. The
+ * greater the weight, the more connections the destination
+ * will receive.
+ *
+ */
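
A stand-alone sketch of the scheme described above: fill a 256-bucket table by walking the destination list, handing each destination as many consecutive buckets as its weight (the d_count logic in ip_vs_sh_reassign() below), then map a source address to a bucket with a multiplicative hash. Only the 2654435761 multiplier and the 256-bucket table size are taken from this module; the destinations and client address are made up, and this is not kernel code:

#include <stdint.h>
#include <stdio.h>

#define TAB_SIZE 256
#define TAB_MASK (TAB_SIZE - 1)

struct dest { const char *name; int weight; };

static unsigned int sh_hash(uint32_t saddr)	/* saddr in host byte order */
{
	return (saddr * 2654435761U) & TAB_MASK;
}

int main(void)
{
	struct dest pool[] = { { "rs1", 1 }, { "rs2", 2 }, { "rs3", 1 } };
	const int ndest = sizeof(pool) / sizeof(pool[0]);
	const struct dest *bucket[TAB_SIZE];
	int d = 0, filled = 0;

	/* weighted fill: stay on a dest until its weight worth of buckets is used */
	for (int i = 0; i < TAB_SIZE; i++) {
		bucket[i] = &pool[d];
		if (++filled >= pool[d].weight) {
			d = (d + 1) % ndest;
			filled = 0;
		}
	}

	uint32_t client = 0xc0a80017;		/* 192.168.0.23 */
	printf("client -> %s\n", bucket[sh_hash(client)]->name);
	return 0;
}
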
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/ip.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+
+#include <net/ip_vs.h>
+
+#include <net/tcp.h>
+#include <linux/udp.h>
+#include <linux/sctp.h>
+
+
+/*
+ * IPVS SH bucket
+ */
+struct ip_vs_sh_bucket {
+ struct ip_vs_dest __rcu *dest; /* real server (cache) */
+};
+
+/*
+ * for IPVS SH entry hash table
+ */
+#ifndef CONFIG_IP_VS_SH_TAB_BITS
+#define CONFIG_IP_VS_SH_TAB_BITS 8
+#endif
+#define IP_VS_SH_TAB_BITS CONFIG_IP_VS_SH_TAB_BITS
+#define IP_VS_SH_TAB_SIZE (1 << IP_VS_SH_TAB_BITS)
+#define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1)
+
+struct ip_vs_sh_state {
+ struct rcu_head rcu_head;
+ struct ip_vs_sh_bucket buckets[IP_VS_SH_TAB_SIZE];
+};
+
+/* Helper function to determine if server is unavailable */
+static inline bool is_unavailable(struct ip_vs_dest *dest)
+{
+ return atomic_read(&dest->weight) <= 0 ||
+ dest->flags & IP_VS_DEST_F_OVERLOAD;
+}
+
+/*
+ * Returns hash value for IPVS SH entry
+ */
+static inline unsigned int
+ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr,
+ __be16 port, unsigned int offset)
+{
+ __be32 addr_fold = addr->ip;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ addr_fold = addr->ip6[0]^addr->ip6[1]^
+ addr->ip6[2]^addr->ip6[3];
+#endif
+ return (offset + (ntohs(port) + ntohl(addr_fold))*2654435761UL) &
+ IP_VS_SH_TAB_MASK;
+}
+
+
+/*
+ * Get ip_vs_dest associated with supplied parameters.
+ */
+static inline struct ip_vs_dest *
+ip_vs_sh_get(struct ip_vs_service *svc, struct ip_vs_sh_state *s,
+ const union nf_inet_addr *addr, __be16 port)
+{
+ unsigned int hash = ip_vs_sh_hashkey(svc->af, addr, port, 0);
+ struct ip_vs_dest *dest = rcu_dereference(s->buckets[hash].dest);
+
+ return (!dest || is_unavailable(dest)) ? NULL : dest;
+}
+
+
+/* As ip_vs_sh_get, but with fallback if selected server is unavailable
+ *
+ * The fallback strategy loops around the table starting from a "random"
+ * point (in fact, it is chosen to be the original hash value to make the
+ * algorithm deterministic) to find a new server.
+ */
+static inline struct ip_vs_dest *
+ip_vs_sh_get_fallback(struct ip_vs_service *svc, struct ip_vs_sh_state *s,
+ const union nf_inet_addr *addr, __be16 port)
+{
+ unsigned int offset, roffset;
+ unsigned int hash, ihash;
+ struct ip_vs_dest *dest;
+
+ /* first try the dest it's supposed to go to */
+ ihash = ip_vs_sh_hashkey(svc->af, addr, port, 0);
+ dest = rcu_dereference(s->buckets[ihash].dest);
+ if (!dest)
+ return NULL;
+ if (!is_unavailable(dest))
+ return dest;
+
+ IP_VS_DBG_BUF(6, "SH: selected unavailable server %s:%d, reselecting",
+ IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port));
+
+ /* if the original dest is unavailable, loop around the table
+ * starting from ihash to find a new dest
+ */
+ for (offset = 0; offset < IP_VS_SH_TAB_SIZE; offset++) {
+ roffset = (offset + ihash) % IP_VS_SH_TAB_SIZE;
+ hash = ip_vs_sh_hashkey(svc->af, addr, port, roffset);
+ dest = rcu_dereference(s->buckets[hash].dest);
+ if (!dest)
+ break;
+ if (!is_unavailable(dest))
+ return dest;
+ IP_VS_DBG_BUF(6, "SH: selected unavailable "
+ "server %s:%d (offset %d), reselecting",
+ IP_VS_DBG_ADDR(dest->af, &dest->addr),
+ ntohs(dest->port), roffset);
+ }
+
+ return NULL;
+}
+
+/*
+ * Assign all the hash buckets of the specified table with the service.
+ */
+static int
+ip_vs_sh_reassign(struct ip_vs_sh_state *s, struct ip_vs_service *svc)
+{
+ int i;
+ struct ip_vs_sh_bucket *b;
+ struct list_head *p;
+ struct ip_vs_dest *dest;
+ int d_count;
+ bool empty;
+
+ b = &s->buckets[0];
+ p = &svc->destinations;
+ empty = list_empty(p);
+ d_count = 0;
+ for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
+ dest = rcu_dereference_protected(b->dest, 1);
+ if (dest)
+ ip_vs_dest_put(dest);
+ if (empty)
+ RCU_INIT_POINTER(b->dest, NULL);
+ else {
+ if (p == &svc->destinations)
+ p = p->next;
+
+ dest = list_entry(p, struct ip_vs_dest, n_list);
+ ip_vs_dest_hold(dest);
+ RCU_INIT_POINTER(b->dest, dest);
+
+ IP_VS_DBG_BUF(6, "assigned i: %d dest: %s weight: %d\n",
+ i, IP_VS_DBG_ADDR(dest->af, &dest->addr),
+ atomic_read(&dest->weight));
+
+ /* Don't move to next dest until filling weight */
+ if (++d_count >= atomic_read(&dest->weight)) {
+ p = p->next;
+ d_count = 0;
+ }
+
+ }
+ b++;
+ }
+ return 0;
+}
+
+
+/*
+ * Flush all the hash buckets of the specified table.
+ */
+static void ip_vs_sh_flush(struct ip_vs_sh_state *s)
+{
+ int i;
+ struct ip_vs_sh_bucket *b;
+ struct ip_vs_dest *dest;
+
+ b = &s->buckets[0];
+ for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
+ dest = rcu_dereference_protected(b->dest, 1);
+ if (dest) {
+ ip_vs_dest_put(dest);
+ RCU_INIT_POINTER(b->dest, NULL);
+ }
+ b++;
+ }
+}
+
+
+static int ip_vs_sh_init_svc(struct ip_vs_service *svc)
+{
+ struct ip_vs_sh_state *s;
+
+ /* allocate the SH table for this service */
+ s = kzalloc(sizeof(struct ip_vs_sh_state), GFP_KERNEL);
+ if (s == NULL)
+ return -ENOMEM;
+
+ svc->sched_data = s;
+ IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for "
+ "current service\n",
+ sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
+
+ /* assign the hash buckets with current dests */
+ ip_vs_sh_reassign(s, svc);
+
+ return 0;
+}
+
+
+static void ip_vs_sh_done_svc(struct ip_vs_service *svc)
+{
+ struct ip_vs_sh_state *s = svc->sched_data;
+
+ /* got to clean up hash buckets here */
+ ip_vs_sh_flush(s);
+
+ /* release the table itself */
+ kfree_rcu(s, rcu_head);
+ IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n",
+ sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
+}
+
+
+static int ip_vs_sh_dest_changed(struct ip_vs_service *svc,
+ struct ip_vs_dest *dest)
+{
+ struct ip_vs_sh_state *s = svc->sched_data;
+
+ /* assign the hash buckets with the updated service */
+ ip_vs_sh_reassign(s, svc);
+
+ return 0;
+}
+
+
+/* Helper function to get port number */
+static inline __be16
+ip_vs_sh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph)
+{
+ __be16 port;
+ struct tcphdr _tcph, *th;
+ struct udphdr _udph, *uh;
+ sctp_sctphdr_t _sctph, *sh;
+
+ switch (iph->protocol) {
+ case IPPROTO_TCP:
+ th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
+ if (unlikely(th == NULL))
+ return 0;
+ port = th->source;
+ break;
+ case IPPROTO_UDP:
+ uh = skb_header_pointer(skb, iph->len, sizeof(_udph), &_udph);
+ if (unlikely(uh == NULL))
+ return 0;
+ port = uh->source;
+ break;
+ case IPPROTO_SCTP:
+ sh = skb_header_pointer(skb, iph->len, sizeof(_sctph), &_sctph);
+ if (unlikely(sh == NULL))
+ return 0;
+ port = sh->source;
+ break;
+ default:
+ port = 0;
+ }
+
+ return port;
+}
+
+
+/*
+ * Source Hashing scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
+{
+ struct ip_vs_dest *dest;
+ struct ip_vs_sh_state *s;
+ __be16 port = 0;
+
+ IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
+
+ if (svc->flags & IP_VS_SVC_F_SCHED_SH_PORT)
+ port = ip_vs_sh_get_port(skb, iph);
+
+ s = (struct ip_vs_sh_state *) svc->sched_data;
+
+ if (svc->flags & IP_VS_SVC_F_SCHED_SH_FALLBACK)
+ dest = ip_vs_sh_get_fallback(svc, s, &iph->saddr, port);
+ else
+ dest = ip_vs_sh_get(svc, s, &iph->saddr, port);
+
+ if (!dest) {
+ ip_vs_scheduler_err(svc, "no destination available");
+ return NULL;
+ }
+
+ IP_VS_DBG_BUF(6, "SH: source IP address %s --> server %s:%d\n",
+ IP_VS_DBG_ADDR(svc->af, &iph->saddr),
+ IP_VS_DBG_ADDR(dest->af, &dest->addr),
+ ntohs(dest->port));
+
+ return dest;
+}
+
+
+/*
+ * IPVS SH Scheduler structure
+ */
+static struct ip_vs_scheduler ip_vs_sh_scheduler =
+{
+ .name = "sh",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list),
+ .init_service = ip_vs_sh_init_svc,
+ .done_service = ip_vs_sh_done_svc,
+ .add_dest = ip_vs_sh_dest_changed,
+ .del_dest = ip_vs_sh_dest_changed,
+ .upd_dest = ip_vs_sh_dest_changed,
+ .schedule = ip_vs_sh_schedule,
+};
+
+
+static int __init ip_vs_sh_init(void)
+{
+ return register_ip_vs_scheduler(&ip_vs_sh_scheduler);
+}
+
+
+static void __exit ip_vs_sh_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_sh_scheduler);
+ synchronize_rcu();
+}
+
+
+module_init(ip_vs_sh_init);
+module_exit(ip_vs_sh_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
new file mode 100644
index 000000000..19b9cce6c
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -0,0 +1,1975 @@
+/*
+ * IPVS An implementation of the IP virtual server support for the
+ * LINUX operating system. IPVS is now implemented as a module
+ * over the NetFilter framework. IPVS can be used to build a
+ * high-performance and highly available server based on a
+ * cluster of servers.
+ *
+ * Version 1 is capable of handling both version 0 and 1 messages.
+ * Version 0 is the plain old format.
+ * Note: Version 0 receivers will just drop Ver 1 messages.
+ * Version 1 is capable of handling IPv6, persistence data,
+ * time-outs, and firewall marks.
+ * In ver.1 "ip_vs_sync_conn_options" will be sent in netw. order.
+ * Ver. 0 can be turned on by sysctl -w net.ipv4.vs.sync_version=0
+ *
+ * Definitions Message: is a complete datagram
+ * Sync_conn: is a part of a Message
+ * Param Data is an option to a Sync_conn.
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * ip_vs_sync: sync connection info from master load balancer to backups
+ * through multicast
+ *
+ * Changes:
+ * Alexandre Cassen : Added master & backup support at a time.
+ * Alexandre Cassen : Added SyncID support for incoming sync
+ * messages filtering.
+ * Justin Ossevoort : Fix endian problem on sync message size.
+ * Hans Schillstrom : Added Version 1: i.e. IPv6,
+ * Persistence support, fwmark and time-out.
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/inetdevice.h>
+#include <linux/net.h>
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/igmp.h> /* for ip_mc_join_group */
+#include <linux/udp.h>
+#include <linux/err.h>
+#include <linux/kthread.h>
+#include <linux/wait.h>
+#include <linux/kernel.h>
+
+#include <asm/unaligned.h> /* Used for ntoh_seq and hton_seq */
+
+#include <net/ip.h>
+#include <net/sock.h>
+
+#include <net/ip_vs.h>
+
+#define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */
+#define IP_VS_SYNC_PORT 8848 /* multicast port */
+
+#define SYNC_PROTO_VER 1 /* Protocol version in header */
+
+static struct lock_class_key __ipvs_sync_key;
+/*
+ * IPVS sync connection entry
+ * Version 0, i.e. original version.
+ */
+struct ip_vs_sync_conn_v0 {
+ __u8 reserved;
+
+ /* Protocol, addresses and port numbers */
+ __u8 protocol; /* Which protocol (TCP/UDP) */
+ __be16 cport;
+ __be16 vport;
+ __be16 dport;
+ __be32 caddr; /* client address */
+ __be32 vaddr; /* virtual address */
+ __be32 daddr; /* destination address */
+
+ /* Flags and state transition */
+ __be16 flags; /* status flags */
+ __be16 state; /* state info */
+
+ /* The sequence options start here */
+};
+
+struct ip_vs_sync_conn_options {
+ struct ip_vs_seq in_seq; /* incoming seq. struct */
+ struct ip_vs_seq out_seq; /* outgoing seq. struct */
+};
+
+/*
+ Sync Connection format (sync_conn)
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Type | Protocol | Ver. | Size |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Flags |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | State | cport |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | vport | dport |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | fwmark |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | timeout (in sec.) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | ... |
+ | IP-Addresses (v4 or v6) |
+ | ... |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ Optional Parameters.
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Param. Type | Param. Length | Param. data |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |
+ | ... |
+ | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | | Param Type | Param. Length |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Param data |
+ | Last Param data should be padded for 32 bit alignment |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*/
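+
+/*
+ * Worked example of the header above (illustrative, see the structures
+ * and SVER_* macros below): an IPv4 sync_conn without optional
+ * parameters occupies sizeof(struct ip_vs_sync_v4) = 36 bytes and uses
+ * version 0 in the Ver. nibble, so the sender builds
+ *
+ *	ver_size = htons((0 << SVER_SHIFT) | 36);
+ *
+ * and a receiver recovers size = ntohs(ver_size) & SVER_MASK and
+ * ver = ntohs(ver_size) >> SVER_SHIFT.
+ */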
+
+/*
+ * Type 0, IPv4 sync connection format
+ */
+struct ip_vs_sync_v4 {
+ __u8 type;
+ __u8 protocol; /* Which protocol (TCP/UDP) */
+ __be16 ver_size; /* Version msb 4 bits */
+ /* Flags and state transition */
+ __be32 flags; /* status flags */
+ __be16 state; /* state info */
+ /* Protocol, addresses and port numbers */
+ __be16 cport;
+ __be16 vport;
+ __be16 dport;
+ __be32 fwmark; /* Firewall mark from skb */
+ __be32 timeout; /* cp timeout */
+ __be32 caddr; /* client address */
+ __be32 vaddr; /* virtual address */
+ __be32 daddr; /* destination address */
+ /* The sequence options start here */
+ /* PE data padded to 32bit alignment after seq. options */
+};
+/*
+ * Type 2 messages IPv6
+ */
+struct ip_vs_sync_v6 {
+ __u8 type;
+ __u8 protocol; /* Which protocol (TCP/UDP) */
+ __be16 ver_size; /* Version msb 4 bits */
+ /* Flags and state transition */
+ __be32 flags; /* status flags */
+ __be16 state; /* state info */
+ /* Protocol, addresses and port numbers */
+ __be16 cport;
+ __be16 vport;
+ __be16 dport;
+ __be32 fwmark; /* Firewall mark from skb */
+ __be32 timeout; /* cp timeout */
+ struct in6_addr caddr; /* client address */
+ struct in6_addr vaddr; /* virtual address */
+ struct in6_addr daddr; /* destination address */
+ /* The sequence options start here */
+ /* PE data padded to 32bit alignment after seq. options */
+};
+
+union ip_vs_sync_conn {
+ struct ip_vs_sync_v4 v4;
+ struct ip_vs_sync_v6 v6;
+};
+
+/* Bits in Type field in above */
+#define STYPE_INET6 0
+#define STYPE_F_INET6 (1 << STYPE_INET6)
+
+#define SVER_SHIFT 12 /* Shift to get version */
+#define SVER_MASK 0x0fff /* Mask to strip version */
+
+#define IPVS_OPT_SEQ_DATA 1
+#define IPVS_OPT_PE_DATA 2
+#define IPVS_OPT_PE_NAME 3
+#define IPVS_OPT_PARAM 7
+
+#define IPVS_OPT_F_SEQ_DATA (1 << (IPVS_OPT_SEQ_DATA-1))
+#define IPVS_OPT_F_PE_DATA (1 << (IPVS_OPT_PE_DATA-1))
+#define IPVS_OPT_F_PE_NAME (1 << (IPVS_OPT_PE_NAME-1))
+#define IPVS_OPT_F_PARAM (1 << (IPVS_OPT_PARAM-1))
+
+struct ip_vs_sync_thread_data {
+ struct net *net;
+ struct socket *sock;
+ char *buf;
+ int id;
+};
+
+/* Version 0 definition of packet sizes */
+#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn_v0))
+#define FULL_CONN_SIZE \
+(sizeof(struct ip_vs_sync_conn_v0) + sizeof(struct ip_vs_sync_conn_options))
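+
+/* For reference, derived from the structures above: SIMPLE_CONN_SIZE is
+ * 24 bytes and FULL_CONN_SIZE is 48 bytes, since the appended
+ * ip_vs_sync_conn_options holds two sequence structs of three 32-bit
+ * words each (see ntoh_seq() below).
+ */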
+
+
+/*
+ The master multicasts messages (datagrams) to the backup load balancers
+ in the following format.
+
+ Version 1:
+ Note, first byte should be Zero, so ver 0 receivers will drop the packet.
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | 0 | SyncID | Size |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Count Conns | Version | Reserved, set to Zero |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | |
+ | IPVS Sync Connection (1) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | . |
+ ~ . ~
+ | . |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | |
+ | IPVS Sync Connection (n) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+ Version 0 Header
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Count Conns | SyncID | Size |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | IPVS Sync Connection (1) |
+*/
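+
+/*
+ * Example (illustrative numbers): a Version 1 datagram carrying two
+ * IPv4 connections without optional parameters consists of the 8-byte
+ * struct ip_vs_sync_mesg header plus 2 * 36 bytes of sync_conn data,
+ * i.e. Size = 80, Count Conns = 2, Version = 1, and a zero first byte
+ * so that Version 0 backups drop it.
+ */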
+
+#define SYNC_MESG_HEADER_LEN 4
+#define MAX_CONNS_PER_SYNCBUFF 255 /* nr_conns in ip_vs_sync_mesg is 8 bit */
+
+/* Version 0 header */
+struct ip_vs_sync_mesg_v0 {
+ __u8 nr_conns;
+ __u8 syncid;
+ __be16 size;
+
+ /* ip_vs_sync_conn entries start here */
+};
+
+/* Version 1 header */
+struct ip_vs_sync_mesg {
+ __u8 reserved; /* must be zero */
+ __u8 syncid;
+ __be16 size;
+ __u8 nr_conns;
+ __s8 version; /* SYNC_PROTO_VER */
+ __u16 spare;
+ /* ip_vs_sync_conn entries start here */
+};
+
+struct ip_vs_sync_buff {
+ struct list_head list;
+ unsigned long firstuse;
+
+ /* pointers for the message data */
+ struct ip_vs_sync_mesg *mesg;
+ unsigned char *head;
+ unsigned char *end;
+};
+
+/*
+ * Copy of struct ip_vs_seq
+ * From unaligned network order to aligned host order
+ */
+static void ntoh_seq(struct ip_vs_seq *no, struct ip_vs_seq *ho)
+{
+ ho->init_seq = get_unaligned_be32(&no->init_seq);
+ ho->delta = get_unaligned_be32(&no->delta);
+ ho->previous_delta = get_unaligned_be32(&no->previous_delta);
+}
+
+/*
+ * Copy of struct ip_vs_seq
+ * From Aligned host order to unaligned network order
+ */
+static void hton_seq(struct ip_vs_seq *ho, struct ip_vs_seq *no)
+{
+ put_unaligned_be32(ho->init_seq, &no->init_seq);
+ put_unaligned_be32(ho->delta, &no->delta);
+ put_unaligned_be32(ho->previous_delta, &no->previous_delta);
+}
+
+static inline struct ip_vs_sync_buff *
+sb_dequeue(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms)
+{
+ struct ip_vs_sync_buff *sb;
+
+ spin_lock_bh(&ipvs->sync_lock);
+ if (list_empty(&ms->sync_queue)) {
+ sb = NULL;
+ __set_current_state(TASK_INTERRUPTIBLE);
+ } else {
+ sb = list_entry(ms->sync_queue.next, struct ip_vs_sync_buff,
+ list);
+ list_del(&sb->list);
+ ms->sync_queue_len--;
+ if (!ms->sync_queue_len)
+ ms->sync_queue_delay = 0;
+ }
+ spin_unlock_bh(&ipvs->sync_lock);
+
+ return sb;
+}
+
+/*
+ * Create a new sync buffer for Version 1 proto.
+ */
+static inline struct ip_vs_sync_buff *
+ip_vs_sync_buff_create(struct netns_ipvs *ipvs)
+{
+ struct ip_vs_sync_buff *sb;
+
+ if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
+ return NULL;
+
+ sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC);
+ if (!sb->mesg) {
+ kfree(sb);
+ return NULL;
+ }
+ sb->mesg->reserved = 0; /* old nr_conns i.e. must be zero now */
+ sb->mesg->version = SYNC_PROTO_VER;
+ sb->mesg->syncid = ipvs->master_syncid;
+ sb->mesg->size = htons(sizeof(struct ip_vs_sync_mesg));
+ sb->mesg->nr_conns = 0;
+ sb->mesg->spare = 0;
+ sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg);
+ sb->end = (unsigned char *)sb->mesg + ipvs->send_mesg_maxlen;
+
+ sb->firstuse = jiffies;
+ return sb;
+}
+
+static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
+{
+ kfree(sb->mesg);
+ kfree(sb);
+}
+
+static inline void sb_queue_tail(struct netns_ipvs *ipvs,
+ struct ipvs_master_sync_state *ms)
+{
+ struct ip_vs_sync_buff *sb = ms->sync_buff;
+
+ spin_lock(&ipvs->sync_lock);
+ if (ipvs->sync_state & IP_VS_STATE_MASTER &&
+ ms->sync_queue_len < sysctl_sync_qlen_max(ipvs)) {
+ if (!ms->sync_queue_len)
+ schedule_delayed_work(&ms->master_wakeup_work,
+ max(IPVS_SYNC_SEND_DELAY, 1));
+ ms->sync_queue_len++;
+ list_add_tail(&sb->list, &ms->sync_queue);
+ if ((++ms->sync_queue_delay) == IPVS_SYNC_WAKEUP_RATE)
+ wake_up_process(ms->master_thread);
+ } else
+ ip_vs_sync_buff_release(sb);
+ spin_unlock(&ipvs->sync_lock);
+}
+
+/*
+ * Get the current sync buffer if it has been created for more
+ * than the specified time or the specified time is zero.
+ */
+static inline struct ip_vs_sync_buff *
+get_curr_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms,
+ unsigned long time)
+{
+ struct ip_vs_sync_buff *sb;
+
+ spin_lock_bh(&ipvs->sync_buff_lock);
+ sb = ms->sync_buff;
+ if (sb && time_after_eq(jiffies - sb->firstuse, time)) {
+ ms->sync_buff = NULL;
+ __set_current_state(TASK_RUNNING);
+ } else
+ sb = NULL;
+ spin_unlock_bh(&ipvs->sync_buff_lock);
+ return sb;
+}
+
+static inline int
+select_master_thread_id(struct netns_ipvs *ipvs, struct ip_vs_conn *cp)
+{
+ return ((long) cp >> (1 + ilog2(sizeof(*cp)))) & ipvs->threads_mask;
+}
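+
+/*
+ * Example (illustrative): with sync_ports = 4, threads_mask is 3 and the
+ * pointer bits just above the connection's natural alignment select one
+ * of four master threads, spreading connections roughly evenly over the
+ * sync sockets.
+ */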
+
+/*
+ * Create a new sync buffer for Version 0 proto.
+ */
+static inline struct ip_vs_sync_buff *
+ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs)
+{
+ struct ip_vs_sync_buff *sb;
+ struct ip_vs_sync_mesg_v0 *mesg;
+
+ if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
+ return NULL;
+
+ sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC);
+ if (!sb->mesg) {
+ kfree(sb);
+ return NULL;
+ }
+ mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg;
+ mesg->nr_conns = 0;
+ mesg->syncid = ipvs->master_syncid;
+ mesg->size = htons(sizeof(struct ip_vs_sync_mesg_v0));
+ sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0);
+ sb->end = (unsigned char *)mesg + ipvs->send_mesg_maxlen;
+ sb->firstuse = jiffies;
+ return sb;
+}
+
+/* Check if connection is controlled by persistence */
+static inline bool in_persistence(struct ip_vs_conn *cp)
+{
+ for (cp = cp->control; cp; cp = cp->control) {
+ if (cp->flags & IP_VS_CONN_F_TEMPLATE)
+ return true;
+ }
+ return false;
+}
+
+/* Check if conn should be synced.
+ * pkts: conn packets, use sysctl_sync_threshold to avoid packet check
+ * - (1) sync_refresh_period: reduce sync rate. Additionally, retry
+ * sync_retries times with period of sync_refresh_period/8
+ * - (2) if both sync_refresh_period and sync_period are 0 send sync only
+ * for state changes or only once when pkts matches sync_threshold
+ * - (3) templates: rate can be reduced only with sync_refresh_period or
+ * with (2)
+ */
+static int ip_vs_sync_conn_needed(struct netns_ipvs *ipvs,
+ struct ip_vs_conn *cp, int pkts)
+{
+ unsigned long orig = ACCESS_ONCE(cp->sync_endtime);
+ unsigned long now = jiffies;
+ unsigned long n = (now + cp->timeout) & ~3UL;
+ unsigned int sync_refresh_period;
+ int sync_period;
+ int force;
+
+ /* Check if we sync in current state */
+ if (unlikely(cp->flags & IP_VS_CONN_F_TEMPLATE))
+ force = 0;
+ else if (unlikely(sysctl_sync_persist_mode(ipvs) && in_persistence(cp)))
+ return 0;
+ else if (likely(cp->protocol == IPPROTO_TCP)) {
+ if (!((1 << cp->state) &
+ ((1 << IP_VS_TCP_S_ESTABLISHED) |
+ (1 << IP_VS_TCP_S_FIN_WAIT) |
+ (1 << IP_VS_TCP_S_CLOSE) |
+ (1 << IP_VS_TCP_S_CLOSE_WAIT) |
+ (1 << IP_VS_TCP_S_TIME_WAIT))))
+ return 0;
+ force = cp->state != cp->old_state;
+ if (force && cp->state != IP_VS_TCP_S_ESTABLISHED)
+ goto set;
+ } else if (unlikely(cp->protocol == IPPROTO_SCTP)) {
+ if (!((1 << cp->state) &
+ ((1 << IP_VS_SCTP_S_ESTABLISHED) |
+ (1 << IP_VS_SCTP_S_SHUTDOWN_SENT) |
+ (1 << IP_VS_SCTP_S_SHUTDOWN_RECEIVED) |
+ (1 << IP_VS_SCTP_S_SHUTDOWN_ACK_SENT) |
+ (1 << IP_VS_SCTP_S_CLOSED))))
+ return 0;
+ force = cp->state != cp->old_state;
+ if (force && cp->state != IP_VS_SCTP_S_ESTABLISHED)
+ goto set;
+ } else {
+ /* UDP or another protocol with single state */
+ force = 0;
+ }
+
+ sync_refresh_period = sysctl_sync_refresh_period(ipvs);
+ if (sync_refresh_period > 0) {
+ long diff = n - orig;
+ long min_diff = max(cp->timeout >> 1, 10UL * HZ);
+
+ /* Avoid sync if difference is below sync_refresh_period
+ * and below the half timeout.
+ */
+ if (abs(diff) < min_t(long, sync_refresh_period, min_diff)) {
+ int retries = orig & 3;
+
+ if (retries >= sysctl_sync_retries(ipvs))
+ return 0;
+ if (time_before(now, orig - cp->timeout +
+ (sync_refresh_period >> 3)))
+ return 0;
+ n |= retries + 1;
+ }
+ }
+ sync_period = sysctl_sync_period(ipvs);
+ if (sync_period > 0) {
+ if (!(cp->flags & IP_VS_CONN_F_TEMPLATE) &&
+ pkts % sync_period != sysctl_sync_threshold(ipvs))
+ return 0;
+ } else if (sync_refresh_period <= 0 &&
+ pkts != sysctl_sync_threshold(ipvs))
+ return 0;
+
+set:
+ cp->old_state = cp->state;
+ n = cmpxchg(&cp->sync_endtime, orig, n);
+ return n == orig || force;
+}
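+
+/*
+ * Example of the rate limiting above (illustrative sysctl values): with
+ * sync_refresh_period = 10 an established connection is re-announced at
+ * most every 10s (or half its timeout, if smaller), plus up to
+ * sync_retries extra attempts spaced sync_refresh_period/8 apart.  With
+ * sync_refresh_period = 0, sync_period = 50 and sync_threshold = 3, a
+ * sync message is sent for packets 3, 53, 103, ... of the connection.
+ */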
+
+/*
+ * Version 0, can be enabled via sysctl (net.ipv4.vs.sync_version=0).
+ * Add an ip_vs_conn information into the current sync_buff.
+ */
+static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp,
+ int pkts)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_sync_mesg_v0 *m;
+ struct ip_vs_sync_conn_v0 *s;
+ struct ip_vs_sync_buff *buff;
+ struct ipvs_master_sync_state *ms;
+ int id;
+ int len;
+
+ if (unlikely(cp->af != AF_INET))
+ return;
+ /* Do not sync ONE PACKET */
+ if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
+ return;
+
+ if (!ip_vs_sync_conn_needed(ipvs, cp, pkts))
+ return;
+
+ spin_lock_bh(&ipvs->sync_buff_lock);
+ if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
+ spin_unlock_bh(&ipvs->sync_buff_lock);
+ return;
+ }
+
+ id = select_master_thread_id(ipvs, cp);
+ ms = &ipvs->ms[id];
+ buff = ms->sync_buff;
+ if (buff) {
+ m = (struct ip_vs_sync_mesg_v0 *) buff->mesg;
+ /* Send buffer if it is for v1 */
+ if (!m->nr_conns) {
+ sb_queue_tail(ipvs, ms);
+ ms->sync_buff = NULL;
+ buff = NULL;
+ }
+ }
+ if (!buff) {
+ buff = ip_vs_sync_buff_create_v0(ipvs);
+ if (!buff) {
+ spin_unlock_bh(&ipvs->sync_buff_lock);
+ pr_err("ip_vs_sync_buff_create failed.\n");
+ return;
+ }
+ ms->sync_buff = buff;
+ }
+
+ len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
+ SIMPLE_CONN_SIZE;
+ m = (struct ip_vs_sync_mesg_v0 *) buff->mesg;
+ s = (struct ip_vs_sync_conn_v0 *) buff->head;
+
+ /* copy members */
+ s->reserved = 0;
+ s->protocol = cp->protocol;
+ s->cport = cp->cport;
+ s->vport = cp->vport;
+ s->dport = cp->dport;
+ s->caddr = cp->caddr.ip;
+ s->vaddr = cp->vaddr.ip;
+ s->daddr = cp->daddr.ip;
+ s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
+ s->state = htons(cp->state);
+ if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
+ struct ip_vs_sync_conn_options *opt =
+ (struct ip_vs_sync_conn_options *)&s[1];
+ memcpy(opt, &cp->in_seq, sizeof(*opt));
+ }
+
+ m->nr_conns++;
+ m->size = htons(ntohs(m->size) + len);
+ buff->head += len;
+
+ /* check if there is a space for next one */
+ if (buff->head + FULL_CONN_SIZE > buff->end) {
+ sb_queue_tail(ipvs, ms);
+ ms->sync_buff = NULL;
+ }
+ spin_unlock_bh(&ipvs->sync_buff_lock);
+
+ /* synchronize its controller if it has */
+ cp = cp->control;
+ if (cp) {
+ if (cp->flags & IP_VS_CONN_F_TEMPLATE)
+ pkts = atomic_add_return(1, &cp->in_pkts);
+ else
+ pkts = sysctl_sync_threshold(ipvs);
+ ip_vs_sync_conn(net, cp->control, pkts);
+ }
+}
+
+/*
+ * Add an ip_vs_conn information into the current sync_buff.
+ * Called by ip_vs_in.
+ * Sending Version 1 messages
+ */
+void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp, int pkts)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_sync_mesg *m;
+ union ip_vs_sync_conn *s;
+ struct ip_vs_sync_buff *buff;
+ struct ipvs_master_sync_state *ms;
+ int id;
+ __u8 *p;
+ unsigned int len, pe_name_len, pad;
+
+ /* Handle old version of the protocol */
+ if (sysctl_sync_ver(ipvs) == 0) {
+ ip_vs_sync_conn_v0(net, cp, pkts);
+ return;
+ }
+ /* Do not sync ONE PACKET */
+ if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
+ goto control;
+sloop:
+ if (!ip_vs_sync_conn_needed(ipvs, cp, pkts))
+ goto control;
+
+ /* Sanity checks */
+ pe_name_len = 0;
+ if (cp->pe_data_len) {
+ if (!cp->pe_data || !cp->dest) {
+ IP_VS_ERR_RL("SYNC, connection pe_data invalid\n");
+ return;
+ }
+ pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN);
+ }
+
+ spin_lock_bh(&ipvs->sync_buff_lock);
+ if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
+ spin_unlock_bh(&ipvs->sync_buff_lock);
+ return;
+ }
+
+ id = select_master_thread_id(ipvs, cp);
+ ms = &ipvs->ms[id];
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6)
+ len = sizeof(struct ip_vs_sync_v6);
+ else
+#endif
+ len = sizeof(struct ip_vs_sync_v4);
+
+ if (cp->flags & IP_VS_CONN_F_SEQ_MASK)
+ len += sizeof(struct ip_vs_sync_conn_options) + 2;
+
+ if (cp->pe_data_len)
+ len += cp->pe_data_len + 2; /* + Param hdr field */
+ if (pe_name_len)
+ len += pe_name_len + 2;
+
+ /* check if there is a space for this one */
+ pad = 0;
+ buff = ms->sync_buff;
+ if (buff) {
+ m = buff->mesg;
+ pad = (4 - (size_t) buff->head) & 3;
+ /* Send buffer if it is for v0 */
+ if (buff->head + len + pad > buff->end || m->reserved) {
+ sb_queue_tail(ipvs, ms);
+ ms->sync_buff = NULL;
+ buff = NULL;
+ pad = 0;
+ }
+ }
+
+ if (!buff) {
+ buff = ip_vs_sync_buff_create(ipvs);
+ if (!buff) {
+ spin_unlock_bh(&ipvs->sync_buff_lock);
+ pr_err("ip_vs_sync_buff_create failed.\n");
+ return;
+ }
+ ms->sync_buff = buff;
+ m = buff->mesg;
+ }
+
+ p = buff->head;
+ buff->head += pad + len;
+ m->size = htons(ntohs(m->size) + pad + len);
+ /* Add any padding left over from the previous sync_conn */
+ while (pad--)
+ *(p++) = 0;
+
+ s = (union ip_vs_sync_conn *)p;
+
+ /* Set message type & copy members */
+ s->v4.type = (cp->af == AF_INET6 ? STYPE_F_INET6 : 0);
+ s->v4.ver_size = htons(len & SVER_MASK); /* Version 0 */
+ s->v4.flags = htonl(cp->flags & ~IP_VS_CONN_F_HASHED);
+ s->v4.state = htons(cp->state);
+ s->v4.protocol = cp->protocol;
+ s->v4.cport = cp->cport;
+ s->v4.vport = cp->vport;
+ s->v4.dport = cp->dport;
+ s->v4.fwmark = htonl(cp->fwmark);
+ s->v4.timeout = htonl(cp->timeout / HZ);
+ m->nr_conns++;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6) {
+ p += sizeof(struct ip_vs_sync_v6);
+ s->v6.caddr = cp->caddr.in6;
+ s->v6.vaddr = cp->vaddr.in6;
+ s->v6.daddr = cp->daddr.in6;
+ } else
+#endif
+ {
+ p += sizeof(struct ip_vs_sync_v4); /* options ptr */
+ s->v4.caddr = cp->caddr.ip;
+ s->v4.vaddr = cp->vaddr.ip;
+ s->v4.daddr = cp->daddr.ip;
+ }
+ if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
+ *(p++) = IPVS_OPT_SEQ_DATA;
+ *(p++) = sizeof(struct ip_vs_sync_conn_options);
+ hton_seq((struct ip_vs_seq *)p, &cp->in_seq);
+ p += sizeof(struct ip_vs_seq);
+ hton_seq((struct ip_vs_seq *)p, &cp->out_seq);
+ p += sizeof(struct ip_vs_seq);
+ }
+ /* Handle pe data */
+ if (cp->pe_data_len && cp->pe_data) {
+ *(p++) = IPVS_OPT_PE_DATA;
+ *(p++) = cp->pe_data_len;
+ memcpy(p, cp->pe_data, cp->pe_data_len);
+ p += cp->pe_data_len;
+ if (pe_name_len) {
+ /* Add PE_NAME */
+ *(p++) = IPVS_OPT_PE_NAME;
+ *(p++) = pe_name_len;
+ memcpy(p, cp->pe->name, pe_name_len);
+ p += pe_name_len;
+ }
+ }
+
+ spin_unlock_bh(&ipvs->sync_buff_lock);
+
+control:
+ /* synchronize its controller if it has */
+ cp = cp->control;
+ if (!cp)
+ return;
+ if (cp->flags & IP_VS_CONN_F_TEMPLATE)
+ pkts = atomic_add_return(1, &cp->in_pkts);
+ else
+ pkts = sysctl_sync_threshold(ipvs);
+ goto sloop;
+}
+
+/*
+ * fill_param used by version 1
+ */
+static inline int
+ip_vs_conn_fill_param_sync(struct net *net, int af, union ip_vs_sync_conn *sc,
+ struct ip_vs_conn_param *p,
+ __u8 *pe_data, unsigned int pe_data_len,
+ __u8 *pe_name, unsigned int pe_name_len)
+{
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ ip_vs_conn_fill_param(net, af, sc->v6.protocol,
+ (const union nf_inet_addr *)&sc->v6.caddr,
+ sc->v6.cport,
+ (const union nf_inet_addr *)&sc->v6.vaddr,
+ sc->v6.vport, p);
+ else
+#endif
+ ip_vs_conn_fill_param(net, af, sc->v4.protocol,
+ (const union nf_inet_addr *)&sc->v4.caddr,
+ sc->v4.cport,
+ (const union nf_inet_addr *)&sc->v4.vaddr,
+ sc->v4.vport, p);
+ /* Handle pe data */
+ if (pe_data_len) {
+ if (pe_name_len) {
+ char buff[IP_VS_PENAME_MAXLEN+1];
+
+ memcpy(buff, pe_name, pe_name_len);
+ buff[pe_name_len]=0;
+ p->pe = __ip_vs_pe_getbyname(buff);
+ if (!p->pe) {
+ IP_VS_DBG(3, "BACKUP, no %s engine found/loaded\n",
+ buff);
+ return 1;
+ }
+ } else {
+ IP_VS_ERR_RL("BACKUP, Invalid PE parameters\n");
+ return 1;
+ }
+
+ p->pe_data = kmemdup(pe_data, pe_data_len, GFP_ATOMIC);
+ if (!p->pe_data) {
+ module_put(p->pe->module);
+ return -ENOMEM;
+ }
+ p->pe_data_len = pe_data_len;
+ }
+ return 0;
+}
+
+/*
+ * Connection Add / Update.
+ * Common for version 0 and 1 reception of backup sync_conns.
+ * Param: ...
+ * timeout is in sec.
+ */
+static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
+ unsigned int flags, unsigned int state,
+ unsigned int protocol, unsigned int type,
+ const union nf_inet_addr *daddr, __be16 dport,
+ unsigned long timeout, __u32 fwmark,
+ struct ip_vs_sync_conn_options *opt)
+{
+ struct ip_vs_dest *dest;
+ struct ip_vs_conn *cp;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
+ cp = ip_vs_conn_in_get(param);
+ if (cp && ((cp->dport != dport) ||
+ !ip_vs_addr_equal(cp->daf, &cp->daddr, daddr))) {
+ if (!(flags & IP_VS_CONN_F_INACTIVE)) {
+ ip_vs_conn_expire_now(cp);
+ __ip_vs_conn_put(cp);
+ cp = NULL;
+ } else {
+ /* This is the expiration message for the
+ * connection that was already replaced, so we
+ * just ignore it.
+ */
+ __ip_vs_conn_put(cp);
+ kfree(param->pe_data);
+ return;
+ }
+ }
+ } else {
+ cp = ip_vs_ct_in_get(param);
+ }
+
+ if (cp) {
+ /* Free pe_data */
+ kfree(param->pe_data);
+
+ dest = cp->dest;
+ spin_lock_bh(&cp->lock);
+ if ((cp->flags ^ flags) & IP_VS_CONN_F_INACTIVE &&
+ !(flags & IP_VS_CONN_F_TEMPLATE) && dest) {
+ if (flags & IP_VS_CONN_F_INACTIVE) {
+ atomic_dec(&dest->activeconns);
+ atomic_inc(&dest->inactconns);
+ } else {
+ atomic_inc(&dest->activeconns);
+ atomic_dec(&dest->inactconns);
+ }
+ }
+ flags &= IP_VS_CONN_F_BACKUP_UPD_MASK;
+ flags |= cp->flags & ~IP_VS_CONN_F_BACKUP_UPD_MASK;
+ cp->flags = flags;
+ spin_unlock_bh(&cp->lock);
+ if (!dest)
+ ip_vs_try_bind_dest(cp);
+ } else {
+ /*
+ * Find the appropriate destination for the connection.
+ * If it is not found the connection will remain unbound
+ * but still handled.
+ */
+ rcu_read_lock();
+ /* This function is only invoked by the synchronization
+ * code. We do not currently support heterogeneous pools
+ * with synchronization, so we can make the assumption that
+ * the svc_af is the same as the dest_af
+ */
+ dest = ip_vs_find_dest(net, type, type, daddr, dport,
+ param->vaddr, param->vport, protocol,
+ fwmark, flags);
+
+ cp = ip_vs_conn_new(param, type, daddr, dport, flags, dest,
+ fwmark);
+ rcu_read_unlock();
+ if (!cp) {
+ kfree(param->pe_data);
+ IP_VS_DBG(2, "BACKUP, add new conn. failed\n");
+ return;
+ }
+ if (!(flags & IP_VS_CONN_F_TEMPLATE))
+ kfree(param->pe_data);
+ }
+
+ if (opt)
+ memcpy(&cp->in_seq, opt, sizeof(*opt));
+ atomic_set(&cp->in_pkts, sysctl_sync_threshold(ipvs));
+ cp->state = state;
+ cp->old_state = cp->state;
+ /*
+ * For Ver 0 message style:
+ *  - it is not possible to recover the right timeout for templates
+ *  - the right fwmark virtual service can not be found. If needed,
+ *    we can do it for non-fwmark persistent services.
+ * For Ver 1 message style there is no such problem.
+ */
+ if (timeout) {
+ if (timeout > MAX_SCHEDULE_TIMEOUT / HZ)
+ timeout = MAX_SCHEDULE_TIMEOUT / HZ;
+ cp->timeout = timeout*HZ;
+ } else {
+ struct ip_vs_proto_data *pd;
+
+ pd = ip_vs_proto_data_get(net, protocol);
+ if (!(flags & IP_VS_CONN_F_TEMPLATE) && pd && pd->timeout_table)
+ cp->timeout = pd->timeout_table[state];
+ else
+ cp->timeout = (3*60*HZ);
+ }
+ ip_vs_conn_put(cp);
+}
+
+/*
+ * Process received multicast message for Version 0
+ */
+static void ip_vs_process_message_v0(struct net *net, const char *buffer,
+ const size_t buflen)
+{
+ struct ip_vs_sync_mesg_v0 *m = (struct ip_vs_sync_mesg_v0 *)buffer;
+ struct ip_vs_sync_conn_v0 *s;
+ struct ip_vs_sync_conn_options *opt;
+ struct ip_vs_protocol *pp;
+ struct ip_vs_conn_param param;
+ char *p;
+ int i;
+
+ p = (char *)buffer + sizeof(struct ip_vs_sync_mesg_v0);
+ for (i=0; i<m->nr_conns; i++) {
+ unsigned int flags, state;
+
+ if (p + SIMPLE_CONN_SIZE > buffer+buflen) {
+ IP_VS_ERR_RL("BACKUP v0, bogus conn\n");
+ return;
+ }
+ s = (struct ip_vs_sync_conn_v0 *) p;
+ flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC;
+ flags &= ~IP_VS_CONN_F_HASHED;
+ if (flags & IP_VS_CONN_F_SEQ_MASK) {
+ opt = (struct ip_vs_sync_conn_options *)&s[1];
+ p += FULL_CONN_SIZE;
+ if (p > buffer+buflen) {
+ IP_VS_ERR_RL("BACKUP v0, Dropping buffer bogus conn options\n");
+ return;
+ }
+ } else {
+ opt = NULL;
+ p += SIMPLE_CONN_SIZE;
+ }
+
+ state = ntohs(s->state);
+ if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
+ pp = ip_vs_proto_get(s->protocol);
+ if (!pp) {
+ IP_VS_DBG(2, "BACKUP v0, Unsupported protocol %u\n",
+ s->protocol);
+ continue;
+ }
+ if (state >= pp->num_states) {
+ IP_VS_DBG(2, "BACKUP v0, Invalid %s state %u\n",
+ pp->name, state);
+ continue;
+ }
+ } else {
+ /* protocol in templates is not used for state/timeout */
+ if (state > 0) {
+ IP_VS_DBG(2, "BACKUP v0, Invalid template state %u\n",
+ state);
+ state = 0;
+ }
+ }
+
+ ip_vs_conn_fill_param(net, AF_INET, s->protocol,
+ (const union nf_inet_addr *)&s->caddr,
+ s->cport,
+ (const union nf_inet_addr *)&s->vaddr,
+ s->vport, &param);
+
+ /* Send timeout as Zero */
+ ip_vs_proc_conn(net, &param, flags, state, s->protocol, AF_INET,
+ (union nf_inet_addr *)&s->daddr, s->dport,
+ 0, 0, opt);
+ }
+}
+
+/*
+ * Handle options
+ */
+static inline int ip_vs_proc_seqopt(__u8 *p, unsigned int plen,
+ __u32 *opt_flags,
+ struct ip_vs_sync_conn_options *opt)
+{
+ struct ip_vs_sync_conn_options *topt;
+
+ topt = (struct ip_vs_sync_conn_options *)p;
+
+ if (plen != sizeof(struct ip_vs_sync_conn_options)) {
+ IP_VS_DBG(2, "BACKUP, bogus conn options length\n");
+ return -EINVAL;
+ }
+ if (*opt_flags & IPVS_OPT_F_SEQ_DATA) {
+ IP_VS_DBG(2, "BACKUP, conn options found twice\n");
+ return -EINVAL;
+ }
+ ntoh_seq(&topt->in_seq, &opt->in_seq);
+ ntoh_seq(&topt->out_seq, &opt->out_seq);
+ *opt_flags |= IPVS_OPT_F_SEQ_DATA;
+ return 0;
+}
+
+static int ip_vs_proc_str(__u8 *p, unsigned int plen, unsigned int *data_len,
+ __u8 **data, unsigned int maxlen,
+ __u32 *opt_flags, __u32 flag)
+{
+ if (plen > maxlen) {
+ IP_VS_DBG(2, "BACKUP, bogus par.data len > %d\n", maxlen);
+ return -EINVAL;
+ }
+ if (*opt_flags & flag) {
+ IP_VS_DBG(2, "BACKUP, Par.data found twice 0x%x\n", flag);
+ return -EINVAL;
+ }
+ *data_len = plen;
+ *data = p;
+ *opt_flags |= flag;
+ return 0;
+}
+/*
+ * Process a Version 1 sync. connection
+ */
+static inline int ip_vs_proc_sync_conn(struct net *net, __u8 *p, __u8 *msg_end)
+{
+ struct ip_vs_sync_conn_options opt;
+ union ip_vs_sync_conn *s;
+ struct ip_vs_protocol *pp;
+ struct ip_vs_conn_param param;
+ __u32 flags;
+ unsigned int af, state, pe_data_len=0, pe_name_len=0;
+ __u8 *pe_data=NULL, *pe_name=NULL;
+ __u32 opt_flags=0;
+ int retc=0;
+
+ s = (union ip_vs_sync_conn *) p;
+
+ if (s->v6.type & STYPE_F_INET6) {
+#ifdef CONFIG_IP_VS_IPV6
+ af = AF_INET6;
+ p += sizeof(struct ip_vs_sync_v6);
+#else
+ IP_VS_DBG(3,"BACKUP, IPv6 msg received, and IPVS is not compiled for IPv6\n");
+ retc = 10;
+ goto out;
+#endif
+ } else if (!s->v4.type) {
+ af = AF_INET;
+ p += sizeof(struct ip_vs_sync_v4);
+ } else {
+ return -10;
+ }
+ if (p > msg_end)
+ return -20;
+
+ /* Process optional params check Type & Len. */
+ while (p < msg_end) {
+ int ptype;
+ int plen;
+
+ if (p+2 > msg_end)
+ return -30;
+ ptype = *(p++);
+ plen = *(p++);
+
+ if (!plen || ((p + plen) > msg_end))
+ return -40;
+ /* Handle seq option p = param data */
+ switch (ptype & ~IPVS_OPT_F_PARAM) {
+ case IPVS_OPT_SEQ_DATA:
+ if (ip_vs_proc_seqopt(p, plen, &opt_flags, &opt))
+ return -50;
+ break;
+
+ case IPVS_OPT_PE_DATA:
+ if (ip_vs_proc_str(p, plen, &pe_data_len, &pe_data,
+ IP_VS_PEDATA_MAXLEN, &opt_flags,
+ IPVS_OPT_F_PE_DATA))
+ return -60;
+ break;
+
+ case IPVS_OPT_PE_NAME:
+ if (ip_vs_proc_str(p, plen,&pe_name_len, &pe_name,
+ IP_VS_PENAME_MAXLEN, &opt_flags,
+ IPVS_OPT_F_PE_NAME))
+ return -70;
+ break;
+
+ default:
+ /* Param data mandatory ? */
+ if (!(ptype & IPVS_OPT_F_PARAM)) {
+ IP_VS_DBG(3, "BACKUP, Unknown mandatory param %d found\n",
+ ptype & ~IPVS_OPT_F_PARAM);
+ retc = 20;
+ goto out;
+ }
+ }
+ p += plen; /* Next option */
+ }
+
+ /* Get flags and Mask off unsupported */
+ flags = ntohl(s->v4.flags) & IP_VS_CONN_F_BACKUP_MASK;
+ flags |= IP_VS_CONN_F_SYNC;
+ state = ntohs(s->v4.state);
+
+ if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
+ pp = ip_vs_proto_get(s->v4.protocol);
+ if (!pp) {
+ IP_VS_DBG(3,"BACKUP, Unsupported protocol %u\n",
+ s->v4.protocol);
+ retc = 30;
+ goto out;
+ }
+ if (state >= pp->num_states) {
+ IP_VS_DBG(3, "BACKUP, Invalid %s state %u\n",
+ pp->name, state);
+ retc = 40;
+ goto out;
+ }
+ } else {
+ /* protocol in templates is not used for state/timeout */
+ if (state > 0) {
+ IP_VS_DBG(3, "BACKUP, Invalid template state %u\n",
+ state);
+ state = 0;
+ }
+ }
+ if (ip_vs_conn_fill_param_sync(net, af, s, &param, pe_data,
+ pe_data_len, pe_name, pe_name_len)) {
+ retc = 50;
+ goto out;
+ }
+ /* If only IPv4 is supported, just silently skip IPv6 entries */
+ if (af == AF_INET)
+ ip_vs_proc_conn(net, &param, flags, state, s->v4.protocol, af,
+ (union nf_inet_addr *)&s->v4.daddr, s->v4.dport,
+ ntohl(s->v4.timeout), ntohl(s->v4.fwmark),
+ (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL)
+ );
+#ifdef CONFIG_IP_VS_IPV6
+ else
+ ip_vs_proc_conn(net, &param, flags, state, s->v6.protocol, af,
+ (union nf_inet_addr *)&s->v6.daddr, s->v6.dport,
+ ntohl(s->v6.timeout), ntohl(s->v6.fwmark),
+ (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL)
+ );
+#endif
+ ip_vs_pe_put(param.pe);
+ return 0;
+ /* Error exit */
+out:
+ IP_VS_DBG(2, "BACKUP, Single msg dropped err:%d\n", retc);
+ return retc;
+
+}
+/*
+ * Process received multicast message and create the corresponding
+ * ip_vs_conn entries.
+ * Handles Version 0 & 1
+ */
+static void ip_vs_process_message(struct net *net, __u8 *buffer,
+ const size_t buflen)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct ip_vs_sync_mesg *m2 = (struct ip_vs_sync_mesg *)buffer;
+ __u8 *p, *msg_end;
+ int i, nr_conns;
+
+ if (buflen < sizeof(struct ip_vs_sync_mesg_v0)) {
+ IP_VS_DBG(2, "BACKUP, message header too short\n");
+ return;
+ }
+
+ if (buflen != ntohs(m2->size)) {
+ IP_VS_DBG(2, "BACKUP, bogus message size\n");
+ return;
+ }
+ /* SyncID sanity check */
+ if (ipvs->backup_syncid != 0 && m2->syncid != ipvs->backup_syncid) {
+ IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid);
+ return;
+ }
+ /* Handle version 1 message */
+ if ((m2->version == SYNC_PROTO_VER) && (m2->reserved == 0)
+ && (m2->spare == 0)) {
+
+ msg_end = buffer + sizeof(struct ip_vs_sync_mesg);
+ nr_conns = m2->nr_conns;
+
+ for (i=0; i<nr_conns; i++) {
+ union ip_vs_sync_conn *s;
+ unsigned int size;
+ int retc;
+
+ p = msg_end;
+ if (p + sizeof(s->v4) > buffer+buflen) {
+ IP_VS_ERR_RL("BACKUP, Dropping buffer, too small\n");
+ return;
+ }
+ s = (union ip_vs_sync_conn *)p;
+ size = ntohs(s->v4.ver_size) & SVER_MASK;
+ msg_end = p + size;
+ /* Basic sanity checks */
+ if (msg_end > buffer+buflen) {
+ IP_VS_ERR_RL("BACKUP, Dropping buffer, msg > buffer\n");
+ return;
+ }
+ if (ntohs(s->v4.ver_size) >> SVER_SHIFT) {
+ IP_VS_ERR_RL("BACKUP, Dropping buffer, Unknown version %d\n",
+ ntohs(s->v4.ver_size) >> SVER_SHIFT);
+ return;
+ }
+ /* Process a single sync_conn */
+ retc = ip_vs_proc_sync_conn(net, p, msg_end);
+ if (retc < 0) {
+ IP_VS_ERR_RL("BACKUP, Dropping buffer, Err: %d in decoding\n",
+ retc);
+ return;
+ }
+ /* Make sure we have 32 bit alignment */
+ msg_end = p + ((size + 3) & ~3);
+ }
+ } else {
+ /* Old type of message */
+ ip_vs_process_message_v0(net, buffer, buflen);
+ return;
+ }
+}
+
+
+/*
+ * Setup sndbuf (mode=1) or rcvbuf (mode=0)
+ */
+static void set_sock_size(struct sock *sk, int mode, int val)
+{
+ /* setsockopt(sock, SOL_SOCKET, SO_SNDBUF, &val, sizeof(val)); */
+ /* setsockopt(sock, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val)); */
+ lock_sock(sk);
+ if (mode) {
+ val = clamp_t(int, val, (SOCK_MIN_SNDBUF + 1) / 2,
+ sysctl_wmem_max);
+ sk->sk_sndbuf = val * 2;
+ sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+ } else {
+ val = clamp_t(int, val, (SOCK_MIN_RCVBUF + 1) / 2,
+ sysctl_rmem_max);
+ sk->sk_rcvbuf = val * 2;
+ sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+ }
+ release_sock(sk);
+}
+
+/*
+ * Setup loopback of outgoing multicasts on a sending socket
+ */
+static void set_mcast_loop(struct sock *sk, u_char loop)
+{
+ struct inet_sock *inet = inet_sk(sk);
+
+ /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */
+ lock_sock(sk);
+ inet->mc_loop = loop ? 1 : 0;
+ release_sock(sk);
+}
+
+/*
+ * Specify TTL for outgoing multicasts on a sending socket
+ */
+static void set_mcast_ttl(struct sock *sk, u_char ttl)
+{
+ struct inet_sock *inet = inet_sk(sk);
+
+ /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */
+ lock_sock(sk);
+ inet->mc_ttl = ttl;
+ release_sock(sk);
+}
+
+/*
+ * Specify the default interface for outgoing multicasts
+ */
+static int set_mcast_if(struct sock *sk, char *ifname)
+{
+ struct net_device *dev;
+ struct inet_sock *inet = inet_sk(sk);
+ struct net *net = sock_net(sk);
+
+ dev = __dev_get_by_name(net, ifname);
+ if (!dev)
+ return -ENODEV;
+
+ if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
+ return -EINVAL;
+
+ lock_sock(sk);
+ inet->mc_index = dev->ifindex;
+ /* inet->mc_addr = 0; */
+ release_sock(sk);
+
+ return 0;
+}
+
+
+/*
+ * Set the maximum length of sync message according to the
+ * specified interface's MTU.
+ */
+static int set_sync_mesg_maxlen(struct net *net, int sync_state)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct net_device *dev;
+ int num;
+
+ if (sync_state == IP_VS_STATE_MASTER) {
+ dev = __dev_get_by_name(net, ipvs->master_mcast_ifn);
+ if (!dev)
+ return -ENODEV;
+
+ num = (dev->mtu - sizeof(struct iphdr) -
+ sizeof(struct udphdr) -
+ SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE;
+ ipvs->send_mesg_maxlen = SYNC_MESG_HEADER_LEN +
+ SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF);
+ IP_VS_DBG(7, "setting the maximum length of sync sending "
+ "message %d.\n", ipvs->send_mesg_maxlen);
+ } else if (sync_state == IP_VS_STATE_BACKUP) {
+ dev = __dev_get_by_name(net, ipvs->backup_mcast_ifn);
+ if (!dev)
+ return -ENODEV;
+
+ ipvs->recv_mesg_maxlen = dev->mtu -
+ sizeof(struct iphdr) - sizeof(struct udphdr);
+ IP_VS_DBG(7, "setting the maximum length of sync receiving "
+ "message %d.\n", ipvs->recv_mesg_maxlen);
+ }
+
+ return 0;
+}
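+
+/*
+ * Worked example (illustrative): with a 1500-byte MTU on the master
+ * interface, num = (1500 - 20 - 8 - 4 - 20) / 24 = 60, so
+ * send_mesg_maxlen = 4 + 24 * 60 = 1444 bytes, which still fits in one
+ * datagram after the IP and UDP headers are added.  On the backup side
+ * recv_mesg_maxlen becomes 1500 - 20 - 8 = 1472 bytes.
+ */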
+
+
+/*
+ * Join a multicast group.
+ * the group is specified by a class D multicast address (224.0.0.0/4)
+ * in the in_addr structure passed in as a parameter.
+ */
+static int
+join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
+{
+ struct net *net = sock_net(sk);
+ struct ip_mreqn mreq;
+ struct net_device *dev;
+ int ret;
+
+ memset(&mreq, 0, sizeof(mreq));
+ memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));
+
+ dev = __dev_get_by_name(net, ifname);
+ if (!dev)
+ return -ENODEV;
+ if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
+ return -EINVAL;
+
+ mreq.imr_ifindex = dev->ifindex;
+
+ rtnl_lock();
+ lock_sock(sk);
+ ret = ip_mc_join_group(sk, &mreq);
+ release_sock(sk);
+ rtnl_unlock();
+
+ return ret;
+}
+
+
+static int bind_mcastif_addr(struct socket *sock, char *ifname)
+{
+ struct net *net = sock_net(sock->sk);
+ struct net_device *dev;
+ __be32 addr;
+ struct sockaddr_in sin;
+
+ dev = __dev_get_by_name(net, ifname);
+ if (!dev)
+ return -ENODEV;
+
+ addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
+ if (!addr)
+ pr_err("You probably need to specify IP address on "
+ "multicast interface.\n");
+
+ IP_VS_DBG(7, "binding socket with (%s) %pI4\n",
+ ifname, &addr);
+
+ /* Now bind the socket with the address of multicast interface */
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = addr;
+ sin.sin_port = 0;
+
+ return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin));
+}
+
+/*
+ * Set up sending multicast socket over UDP
+ */
+static struct socket *make_send_sock(struct net *net, int id)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ /* multicast addr */
+ struct sockaddr_in mcast_addr = {
+ .sin_family = AF_INET,
+ .sin_port = cpu_to_be16(IP_VS_SYNC_PORT + id),
+ .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP),
+ };
+ struct socket *sock;
+ int result;
+
+ /* First create a socket move it to right name space later */
+ result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
+ if (result < 0) {
+ pr_err("Error during creation of socket; terminating\n");
+ return ERR_PTR(result);
+ }
+ /*
+ * Kernel sockets that are part of a namespace should not hold a
+ * reference to the namespace, so that the namespace can be stopped.
+ * After sk_change_net() the socket must be released using
+ * sk_release_kernel().
+ */
+ sk_change_net(sock->sk, net);
+ result = set_mcast_if(sock->sk, ipvs->master_mcast_ifn);
+ if (result < 0) {
+ pr_err("Error setting outbound mcast interface\n");
+ goto error;
+ }
+
+ set_mcast_loop(sock->sk, 0);
+ set_mcast_ttl(sock->sk, 1);
+ result = sysctl_sync_sock_size(ipvs);
+ if (result > 0)
+ set_sock_size(sock->sk, 1, result);
+
+ result = bind_mcastif_addr(sock, ipvs->master_mcast_ifn);
+ if (result < 0) {
+ pr_err("Error binding address of the mcast interface\n");
+ goto error;
+ }
+
+ result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr,
+ sizeof(struct sockaddr), 0);
+ if (result < 0) {
+ pr_err("Error connecting to the multicast addr\n");
+ goto error;
+ }
+
+ return sock;
+
+error:
+ sk_release_kernel(sock->sk);
+ return ERR_PTR(result);
+}
+
+
+/*
+ * Set up receiving multicast socket over UDP
+ */
+static struct socket *make_receive_sock(struct net *net, int id)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ /* multicast addr */
+ struct sockaddr_in mcast_addr = {
+ .sin_family = AF_INET,
+ .sin_port = cpu_to_be16(IP_VS_SYNC_PORT + id),
+ .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP),
+ };
+ struct socket *sock;
+ int result;
+
+ /* First create a socket */
+ result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
+ if (result < 0) {
+ pr_err("Error during creation of socket; terminating\n");
+ return ERR_PTR(result);
+ }
+ /*
+ * Kernel sockets that are part of a namespace should not hold a
+ * reference to the namespace, so that the namespace can be stopped.
+ * After sk_change_net() the socket must be released using
+ * sk_release_kernel().
+ */
+ sk_change_net(sock->sk, net);
+ /* it is equivalent to the REUSEADDR option in user-space */
+ sock->sk->sk_reuse = SK_CAN_REUSE;
+ result = sysctl_sync_sock_size(ipvs);
+ if (result > 0)
+ set_sock_size(sock->sk, 0, result);
+
+ result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr,
+ sizeof(struct sockaddr));
+ if (result < 0) {
+ pr_err("Error binding to the multicast addr\n");
+ goto error;
+ }
+
+ /* join the multicast group */
+ result = join_mcast_group(sock->sk,
+ (struct in_addr *) &mcast_addr.sin_addr,
+ ipvs->backup_mcast_ifn);
+ if (result < 0) {
+ pr_err("Error joining to the multicast group\n");
+ goto error;
+ }
+
+ return sock;
+
+error:
+ sk_release_kernel(sock->sk);
+ return ERR_PTR(result);
+}
+
+
+static int
+ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
+{
+ struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL};
+ struct kvec iov;
+ int len;
+
+ EnterFunction(7);
+ iov.iov_base = (void *)buffer;
+ iov.iov_len = length;
+
+ len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length));
+
+ LeaveFunction(7);
+ return len;
+}
+
+static int
+ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg)
+{
+ int msize;
+ int ret;
+
+ msize = ntohs(msg->size);
+
+ ret = ip_vs_send_async(sock, (char *)msg, msize);
+ if (ret >= 0 || ret == -EAGAIN)
+ return ret;
+ pr_err("ip_vs_send_async error %d\n", ret);
+ return 0;
+}
+
+static int
+ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
+{
+ struct msghdr msg = {NULL,};
+ struct kvec iov;
+ int len;
+
+ EnterFunction(7);
+
+ /* Receive a packet */
+ iov.iov_base = buffer;
+ iov.iov_len = (size_t)buflen;
+
+ len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, MSG_DONTWAIT);
+
+ if (len < 0)
+ return len;
+
+ LeaveFunction(7);
+ return len;
+}
+
+/* Wakeup the master thread for sending */
+static void master_wakeup_work_handler(struct work_struct *work)
+{
+ struct ipvs_master_sync_state *ms =
+ container_of(work, struct ipvs_master_sync_state,
+ master_wakeup_work.work);
+ struct netns_ipvs *ipvs = ms->ipvs;
+
+ spin_lock_bh(&ipvs->sync_lock);
+ if (ms->sync_queue_len &&
+ ms->sync_queue_delay < IPVS_SYNC_WAKEUP_RATE) {
+ ms->sync_queue_delay = IPVS_SYNC_WAKEUP_RATE;
+ wake_up_process(ms->master_thread);
+ }
+ spin_unlock_bh(&ipvs->sync_lock);
+}
+
+/* Get next buffer to send */
+static inline struct ip_vs_sync_buff *
+next_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms)
+{
+ struct ip_vs_sync_buff *sb;
+
+ sb = sb_dequeue(ipvs, ms);
+ if (sb)
+ return sb;
+ /* Do not delay entries in buffer for more than 2 seconds */
+ return get_curr_sync_buff(ipvs, ms, IPVS_SYNC_FLUSH_TIME);
+}
+
+static int sync_thread_master(void *data)
+{
+ struct ip_vs_sync_thread_data *tinfo = data;
+ struct netns_ipvs *ipvs = net_ipvs(tinfo->net);
+ struct ipvs_master_sync_state *ms = &ipvs->ms[tinfo->id];
+ struct sock *sk = tinfo->sock->sk;
+ struct ip_vs_sync_buff *sb;
+
+ pr_info("sync thread started: state = MASTER, mcast_ifn = %s, "
+ "syncid = %d, id = %d\n",
+ ipvs->master_mcast_ifn, ipvs->master_syncid, tinfo->id);
+
+ for (;;) {
+ sb = next_sync_buff(ipvs, ms);
+ if (unlikely(kthread_should_stop()))
+ break;
+ if (!sb) {
+ schedule_timeout(IPVS_SYNC_CHECK_PERIOD);
+ continue;
+ }
+ while (ip_vs_send_sync_msg(tinfo->sock, sb->mesg) < 0) {
+ /* (Ab)use interruptible sleep to avoid increasing
+ * the load avg.
+ */
+ __wait_event_interruptible(*sk_sleep(sk),
+ sock_writeable(sk) ||
+ kthread_should_stop());
+ if (unlikely(kthread_should_stop()))
+ goto done;
+ }
+ ip_vs_sync_buff_release(sb);
+ }
+
+done:
+ __set_current_state(TASK_RUNNING);
+ if (sb)
+ ip_vs_sync_buff_release(sb);
+
+ /* clean up the sync_buff queue */
+ while ((sb = sb_dequeue(ipvs, ms)))
+ ip_vs_sync_buff_release(sb);
+ __set_current_state(TASK_RUNNING);
+
+ /* clean up the current sync_buff */
+ sb = get_curr_sync_buff(ipvs, ms, 0);
+ if (sb)
+ ip_vs_sync_buff_release(sb);
+
+ /* release the sending multicast socket */
+ sk_release_kernel(tinfo->sock->sk);
+ kfree(tinfo);
+
+ return 0;
+}
+
+
+static int sync_thread_backup(void *data)
+{
+ struct ip_vs_sync_thread_data *tinfo = data;
+ struct netns_ipvs *ipvs = net_ipvs(tinfo->net);
+ int len;
+
+ pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, "
+ "syncid = %d, id = %d\n",
+ ipvs->backup_mcast_ifn, ipvs->backup_syncid, tinfo->id);
+
+ while (!kthread_should_stop()) {
+ wait_event_interruptible(*sk_sleep(tinfo->sock->sk),
+ !skb_queue_empty(&tinfo->sock->sk->sk_receive_queue)
+ || kthread_should_stop());
+
+ /* do we have data now? */
+ while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) {
+ len = ip_vs_receive(tinfo->sock, tinfo->buf,
+ ipvs->recv_mesg_maxlen);
+ if (len <= 0) {
+ if (len != -EAGAIN)
+ pr_err("receiving message error\n");
+ break;
+ }
+
+ ip_vs_process_message(tinfo->net, tinfo->buf, len);
+ }
+ }
+
+ /* release the receiving multicast socket */
+ sk_release_kernel(tinfo->sock->sk);
+ kfree(tinfo->buf);
+ kfree(tinfo);
+
+ return 0;
+}
+
+
+int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid)
+{
+ struct ip_vs_sync_thread_data *tinfo;
+ struct task_struct **array = NULL, *task;
+ struct socket *sock;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ char *name;
+ int (*threadfn)(void *data);
+ int id, count;
+ int result = -ENOMEM;
+
+ IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
+ IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n",
+ sizeof(struct ip_vs_sync_conn_v0));
+
+ if (!ipvs->sync_state) {
+ count = clamp(sysctl_sync_ports(ipvs), 1, IPVS_SYNC_PORTS_MAX);
+ ipvs->threads_mask = count - 1;
+ } else
+ count = ipvs->threads_mask + 1;
+
+ if (state == IP_VS_STATE_MASTER) {
+ if (ipvs->ms)
+ return -EEXIST;
+
+ strlcpy(ipvs->master_mcast_ifn, mcast_ifn,
+ sizeof(ipvs->master_mcast_ifn));
+ ipvs->master_syncid = syncid;
+ name = "ipvs-m:%d:%d";
+ threadfn = sync_thread_master;
+ } else if (state == IP_VS_STATE_BACKUP) {
+ if (ipvs->backup_threads)
+ return -EEXIST;
+
+ strlcpy(ipvs->backup_mcast_ifn, mcast_ifn,
+ sizeof(ipvs->backup_mcast_ifn));
+ ipvs->backup_syncid = syncid;
+ name = "ipvs-b:%d:%d";
+ threadfn = sync_thread_backup;
+ } else {
+ return -EINVAL;
+ }
+
+ if (state == IP_VS_STATE_MASTER) {
+ struct ipvs_master_sync_state *ms;
+
+ ipvs->ms = kzalloc(count * sizeof(ipvs->ms[0]), GFP_KERNEL);
+ if (!ipvs->ms)
+ goto out;
+ ms = ipvs->ms;
+ for (id = 0; id < count; id++, ms++) {
+ INIT_LIST_HEAD(&ms->sync_queue);
+ ms->sync_queue_len = 0;
+ ms->sync_queue_delay = 0;
+ INIT_DELAYED_WORK(&ms->master_wakeup_work,
+ master_wakeup_work_handler);
+ ms->ipvs = ipvs;
+ }
+ } else {
+ array = kzalloc(count * sizeof(struct task_struct *),
+ GFP_KERNEL);
+ if (!array)
+ goto out;
+ }
+ set_sync_mesg_maxlen(net, state);
+
+ tinfo = NULL;
+ for (id = 0; id < count; id++) {
+ if (state == IP_VS_STATE_MASTER)
+ sock = make_send_sock(net, id);
+ else
+ sock = make_receive_sock(net, id);
+ if (IS_ERR(sock)) {
+ result = PTR_ERR(sock);
+ goto outtinfo;
+ }
+ tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
+ if (!tinfo)
+ goto outsocket;
+ tinfo->net = net;
+ tinfo->sock = sock;
+ if (state == IP_VS_STATE_BACKUP) {
+ tinfo->buf = kmalloc(ipvs->recv_mesg_maxlen,
+ GFP_KERNEL);
+ if (!tinfo->buf)
+ goto outtinfo;
+ } else {
+ tinfo->buf = NULL;
+ }
+ tinfo->id = id;
+
+ task = kthread_run(threadfn, tinfo, name, ipvs->gen, id);
+ if (IS_ERR(task)) {
+ result = PTR_ERR(task);
+ goto outtinfo;
+ }
+ tinfo = NULL;
+ if (state == IP_VS_STATE_MASTER)
+ ipvs->ms[id].master_thread = task;
+ else
+ array[id] = task;
+ }
+
+ /* mark as active */
+
+ if (state == IP_VS_STATE_BACKUP)
+ ipvs->backup_threads = array;
+ spin_lock_bh(&ipvs->sync_buff_lock);
+ ipvs->sync_state |= state;
+ spin_unlock_bh(&ipvs->sync_buff_lock);
+
+ /* increase the module use count */
+ ip_vs_use_count_inc();
+
+ return 0;
+
+outsocket:
+ sk_release_kernel(sock->sk);
+
+outtinfo:
+ if (tinfo) {
+ sk_release_kernel(tinfo->sock->sk);
+ kfree(tinfo->buf);
+ kfree(tinfo);
+ }
+ count = id;
+ while (count-- > 0) {
+ if (state == IP_VS_STATE_MASTER)
+ kthread_stop(ipvs->ms[count].master_thread);
+ else
+ kthread_stop(array[count]);
+ }
+ kfree(array);
+
+out:
+ if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
+ kfree(ipvs->ms);
+ ipvs->ms = NULL;
+ }
+ return result;
+}
+
+
+int stop_sync_thread(struct net *net, int state)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct task_struct **array;
+ int id;
+ int retc = -EINVAL;
+
+ IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
+
+ if (state == IP_VS_STATE_MASTER) {
+ if (!ipvs->ms)
+ return -ESRCH;
+
+ /*
+ * The lock synchronizes with sb_queue_tail(), so that we don't
+ * add sync buffers to the queue, when we are already in
+ * progress of stopping the master sync daemon.
+ */
+
+ spin_lock_bh(&ipvs->sync_buff_lock);
+ spin_lock(&ipvs->sync_lock);
+ ipvs->sync_state &= ~IP_VS_STATE_MASTER;
+ spin_unlock(&ipvs->sync_lock);
+ spin_unlock_bh(&ipvs->sync_buff_lock);
+
+ retc = 0;
+ for (id = ipvs->threads_mask; id >= 0; id--) {
+ struct ipvs_master_sync_state *ms = &ipvs->ms[id];
+ int ret;
+
+ pr_info("stopping master sync thread %d ...\n",
+ task_pid_nr(ms->master_thread));
+ cancel_delayed_work_sync(&ms->master_wakeup_work);
+ ret = kthread_stop(ms->master_thread);
+ if (retc >= 0)
+ retc = ret;
+ }
+ kfree(ipvs->ms);
+ ipvs->ms = NULL;
+ } else if (state == IP_VS_STATE_BACKUP) {
+ if (!ipvs->backup_threads)
+ return -ESRCH;
+
+ ipvs->sync_state &= ~IP_VS_STATE_BACKUP;
+ array = ipvs->backup_threads;
+ retc = 0;
+ for (id = ipvs->threads_mask; id >= 0; id--) {
+ int ret;
+
+ pr_info("stopping backup sync thread %d ...\n",
+ task_pid_nr(array[id]));
+ ret = kthread_stop(array[id]);
+ if (retc >= 0)
+ retc = ret;
+ }
+ kfree(array);
+ ipvs->backup_threads = NULL;
+ }
+
+ /* decrease the module use count */
+ ip_vs_use_count_dec();
+
+ return retc;
+}
+
+/*
+ * Initialize data struct for each netns
+ */
+int __net_init ip_vs_sync_net_init(struct net *net)
+{
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ __mutex_init(&ipvs->sync_mutex, "ipvs->sync_mutex", &__ipvs_sync_key);
+ spin_lock_init(&ipvs->sync_lock);
+ spin_lock_init(&ipvs->sync_buff_lock);
+ return 0;
+}
+
+void ip_vs_sync_net_cleanup(struct net *net)
+{
+ int retc;
+ struct netns_ipvs *ipvs = net_ipvs(net);
+
+ mutex_lock(&ipvs->sync_mutex);
+ retc = stop_sync_thread(net, IP_VS_STATE_MASTER);
+ if (retc && retc != -ESRCH)
+ pr_err("Failed to stop Master Daemon\n");
+
+ retc = stop_sync_thread(net, IP_VS_STATE_BACKUP);
+ if (retc && retc != -ESRCH)
+ pr_err("Failed to stop Backup Daemon\n");
+ mutex_unlock(&ipvs->sync_mutex);
+}
diff --git a/net/netfilter/ipvs/ip_vs_wlc.c b/net/netfilter/ipvs/ip_vs_wlc.c
new file mode 100644
index 000000000..6b366fd90
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_wlc.c
@@ -0,0 +1,116 @@
+/*
+ * IPVS: Weighted Least-Connection Scheduling module
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Peter Kese <peter.kese@ijs.si>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ * Wensong Zhang : changed the ip_vs_wlc_schedule to return dest
+ * Wensong Zhang : changed to use the inactconns in scheduling
+ * Wensong Zhang : changed some cosmetic things for debugging
+ * Wensong Zhang : changed for the d-linked destination list
+ * Wensong Zhang : added the ip_vs_wlc_update_svc
+ * Wensong Zhang : added any dest with weight=0 is quiesced
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+/*
+ * Weighted Least Connection scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
+{
+ struct ip_vs_dest *dest, *least;
+ int loh, doh;
+
+ IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n");
+
+ /*
+ * We calculate the load of each dest server as follows:
+ * (dest overhead) / dest->weight
+ *
+ * Remember -- no floats in kernel mode!!!
+ * The comparison of h1*w2 > h2*w1 is equivalent to that of
+ * h1/w1 > h2/w2
+ * if every weight is larger than zero.
+ *
+ * The server with weight=0 is quiesced and will not receive any
+ * new connections.
+ */
+
+ list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
+ if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+ atomic_read(&dest->weight) > 0) {
+ least = dest;
+ loh = ip_vs_dest_conn_overhead(least);
+ goto nextstage;
+ }
+ }
+ ip_vs_scheduler_err(svc, "no destination available");
+ return NULL;
+
+ /*
+ * Find the destination with the least load.
+ */
+ nextstage:
+ list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {
+ if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+ continue;
+ doh = ip_vs_dest_conn_overhead(dest);
+ if ((__s64)loh * atomic_read(&dest->weight) >
+ (__s64)doh * atomic_read(&least->weight)) {
+ least = dest;
+ loh = doh;
+ }
+ }
+
+ IP_VS_DBG_BUF(6, "WLC: server %s:%u "
+ "activeconns %d refcnt %d weight %d overhead %d\n",
+ IP_VS_DBG_ADDR(least->af, &least->addr),
+ ntohs(least->port),
+ atomic_read(&least->activeconns),
+ atomic_read(&least->refcnt),
+ atomic_read(&least->weight), loh);
+
+ return least;
+}
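+
+/*
+ * Example of the integer comparison above (illustrative numbers): with
+ * overheads 10 and 30 and weights 1 and 2, "10 * 2 > 30 * 1" is false,
+ * so the first server stays selected, the same outcome as comparing the
+ * loads 10/1 = 10 and 30/2 = 15, but without floating point.
+ */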
+
+
+static struct ip_vs_scheduler ip_vs_wlc_scheduler =
+{
+ .name = "wlc",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_wlc_scheduler.n_list),
+ .schedule = ip_vs_wlc_schedule,
+};
+
+
+static int __init ip_vs_wlc_init(void)
+{
+ return register_ip_vs_scheduler(&ip_vs_wlc_scheduler);
+}
+
+static void __exit ip_vs_wlc_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler);
+ synchronize_rcu();
+}
+
+module_init(ip_vs_wlc_init);
+module_exit(ip_vs_wlc_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c
new file mode 100644
index 000000000..17e6d4406
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_wrr.c
@@ -0,0 +1,270 @@
+/*
+ * IPVS: Weighted Round-Robin Scheduling module
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ * Wensong Zhang : changed the ip_vs_wrr_schedule to return dest
+ * Wensong Zhang : changed some cosmetic things for debugging
+ * Wensong Zhang : changed for the d-linked destination list
+ * Wensong Zhang : added the ip_vs_wrr_update_svc
+ * Julian Anastasov : fixed the bug of returning destination
+ * with weight 0 when all weights are zero
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/net.h>
+#include <linux/gcd.h>
+
+#include <net/ip_vs.h>
+
+/* The WRR algorithm depends on some calculations:
+ * - mw: maximum weight
+ * - di: weight step, greatest common divisor from all weights
+ * - cw: current required weight
+ * As result, all weights are in the [di..mw] range with a step=di.
+ *
+ * First, we start with cw = mw and select dests with weight >= cw.
+ * Then cw is reduced with di and all dests are checked again.
+ * Last pass should be with cw = di. We have mw/di passes in total:
+ *
+ * pass 1: cw = max weight
+ * pass 2: cw = max weight - di
+ * pass 3: cw = max weight - 2 * di
+ * ...
+ * last pass: cw = di
+ *
+ * Weights are supposed to be >= di, but since we run in parallel with
+ * weight changes it is possible for some dest weight to be reduced
+ * below di, which is bad if it is the only available dest.
+ *
+ * So, we modify how mw is calculated: it is now reduced by (di - 1),
+ * so that the last cw is 1, to catch such dests with weight below di:
+ * pass 1: cw = max weight - (di - 1)
+ * pass 2: cw = max weight - di - (di - 1)
+ * pass 3: cw = max weight - 2 * di - (di - 1)
+ * ...
+ * last pass: cw = 1
+ *
+ */
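+
+/* A worked example: with two dests of weight 4 and 2, di = gcd(4, 2) = 2
+ * and mw = 4 - (di - 1) = 3, giving two passes per cycle (cw = 3, then
+ * cw = 1).  The weight-4 dest qualifies in both passes and the weight-2
+ * dest only in the last one, so they are picked in a 2:1 ratio, i.e.
+ * proportionally to their weights.
+ */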
+
+/*
+ * current destination pointer for weighted round-robin scheduling
+ */
+struct ip_vs_wrr_mark {
+ struct ip_vs_dest *cl; /* current dest or head */
+ int cw; /* current weight */
+ int mw; /* maximum weight */
+ int di; /* decreasing interval */
+ struct rcu_head rcu_head;
+};
+
+
+static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc)
+{
+ struct ip_vs_dest *dest;
+ int weight;
+ int g = 0;
+
+ list_for_each_entry(dest, &svc->destinations, n_list) {
+ weight = atomic_read(&dest->weight);
+ if (weight > 0) {
+ if (g > 0)
+ g = gcd(weight, g);
+ else
+ g = weight;
+ }
+ }
+ return g ? g : 1;
+}
+
+
+/*
+ * Get the maximum weight of the service destinations.
+ */
+static int ip_vs_wrr_max_weight(struct ip_vs_service *svc)
+{
+ struct ip_vs_dest *dest;
+ int new_weight, weight = 0;
+
+ list_for_each_entry(dest, &svc->destinations, n_list) {
+ new_weight = atomic_read(&dest->weight);
+ if (new_weight > weight)
+ weight = new_weight;
+ }
+
+ return weight;
+}
+
+
+static int ip_vs_wrr_init_svc(struct ip_vs_service *svc)
+{
+ struct ip_vs_wrr_mark *mark;
+
+ /*
+ * Allocate the mark variable for WRR scheduling
+ */
+ mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_KERNEL);
+ if (mark == NULL)
+ return -ENOMEM;
+
+ mark->cl = list_entry(&svc->destinations, struct ip_vs_dest, n_list);
+ mark->di = ip_vs_wrr_gcd_weight(svc);
+ mark->mw = ip_vs_wrr_max_weight(svc) - (mark->di - 1);
+ mark->cw = mark->mw;
+ svc->sched_data = mark;
+
+ return 0;
+}
+
+
+static void ip_vs_wrr_done_svc(struct ip_vs_service *svc)
+{
+ struct ip_vs_wrr_mark *mark = svc->sched_data;
+
+ /*
+ * Release the mark variable
+ */
+ kfree_rcu(mark, rcu_head);
+}
+
+
+static int ip_vs_wrr_dest_changed(struct ip_vs_service *svc,
+ struct ip_vs_dest *dest)
+{
+ struct ip_vs_wrr_mark *mark = svc->sched_data;
+
+ spin_lock_bh(&svc->sched_lock);
+ mark->cl = list_entry(&svc->destinations, struct ip_vs_dest, n_list);
+ mark->di = ip_vs_wrr_gcd_weight(svc);
+ mark->mw = ip_vs_wrr_max_weight(svc) - (mark->di - 1);
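+	/* Keep the current pass where possible: restart from mw when cw is
+	 * out of range or zero, otherwise snap cw onto the new di grid
+	 * (a multiple of di, plus 1) so the pass sequence stays aligned
+	 * with the recomputed step.
+	 */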
+ if (mark->cw > mark->mw || !mark->cw)
+ mark->cw = mark->mw;
+ else if (mark->di > 1)
+ mark->cw = (mark->cw / mark->di) * mark->di + 1;
+ spin_unlock_bh(&svc->sched_lock);
+ return 0;
+}
+
+
+/*
+ * Weighted Round-Robin Scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
+{
+ struct ip_vs_dest *dest, *last, *stop = NULL;
+ struct ip_vs_wrr_mark *mark = svc->sched_data;
+ bool last_pass = false, restarted = false;
+
+ IP_VS_DBG(6, "%s(): Scheduling...\n", __func__);
+
+ spin_lock_bh(&svc->sched_lock);
+ dest = mark->cl;
+ /* No available dests? */
+ if (mark->mw == 0)
+ goto err_noavail;
+ last = dest;
+ /* Stop only after all dests were checked for weight >= 1 (last pass) */
+ while (1) {
+ list_for_each_entry_continue_rcu(dest,
+ &svc->destinations,
+ n_list) {
+ if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+ atomic_read(&dest->weight) >= mark->cw)
+ goto found;
+ if (dest == stop)
+ goto err_over;
+ }
+ mark->cw -= mark->di;
+ if (mark->cw <= 0) {
+ mark->cw = mark->mw;
+ /* Stop if we tried last pass from first dest:
+ * 1. last_pass: we started checks when cw > di but
+ * then all dests were checked for w >= 1
+ * 2. last was head: the first and only traversal
+ * was for weight >= 1, for all dests.
+ */
+ if (last_pass ||
+ &last->n_list == &svc->destinations)
+ goto err_over;
+ restarted = true;
+ }
+ last_pass = mark->cw <= mark->di;
+ if (last_pass && restarted &&
+ &last->n_list != &svc->destinations) {
+ /* First traversal was for w >= 1 but only
+ * for dests after 'last', now do the same
+ * for all dests up to 'last'.
+ */
+ stop = last;
+ }
+ }
+
+found:
+ IP_VS_DBG_BUF(6, "WRR: server %s:%u "
+ "activeconns %d refcnt %d weight %d\n",
+ IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port),
+ atomic_read(&dest->activeconns),
+ atomic_read(&dest->refcnt),
+ atomic_read(&dest->weight));
+ mark->cl = dest;
+
+ out:
+ spin_unlock_bh(&svc->sched_lock);
+ return dest;
+
+err_noavail:
+ mark->cl = dest;
+ dest = NULL;
+ ip_vs_scheduler_err(svc, "no destination available");
+ goto out;
+
+err_over:
+ mark->cl = dest;
+ dest = NULL;
+ ip_vs_scheduler_err(svc, "no destination available: "
+ "all destinations are overloaded");
+ goto out;
+}
+
+
+static struct ip_vs_scheduler ip_vs_wrr_scheduler = {
+ .name = "wrr",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list),
+ .init_service = ip_vs_wrr_init_svc,
+ .done_service = ip_vs_wrr_done_svc,
+ .add_dest = ip_vs_wrr_dest_changed,
+ .del_dest = ip_vs_wrr_dest_changed,
+ .upd_dest = ip_vs_wrr_dest_changed,
+ .schedule = ip_vs_wrr_schedule,
+};
+
+static int __init ip_vs_wrr_init(void)
+{
+	return register_ip_vs_scheduler(&ip_vs_wrr_scheduler);
+}
+
+static void __exit ip_vs_wrr_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler);
+ synchronize_rcu();
+}
+
+module_init(ip_vs_wrr_init);
+module_exit(ip_vs_wrr_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
new file mode 100644
index 000000000..19986ec5f
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -0,0 +1,1380 @@
+/*
+ * ip_vs_xmit.c: various packet transmitters for IPVS
+ *
+ * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
+ * Julian Anastasov <ja@ssi.bg>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ * Description of forwarding methods:
+ * - all transmitters are called from LOCAL_IN (remote clients) and
+ * LOCAL_OUT (local clients), but for ICMP they can also be called
+ * from FORWARD
+ * - not all connections have a destination server, for example,
+ * connections in the backup server when fwmark is used
+ * - bypass connections use the daddr from the packet
+ * - we can use dst without a ref while sending inside an RCU section;
+ * we take a ref when returning NF_ACCEPT for a NAT-ed packet via
+ * loopback
+ * LOCAL_OUT rules:
+ * - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING)
+ * - skb->pkt_type is not set yet
+ * - the only place where we can see skb->sk != NULL
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/tcp.h> /* for tcphdr */
+#include <net/ip.h>
+#include <net/tcp.h> /* for csum_tcpudp_magic */
+#include <net/udp.h>
+#include <net/icmp.h> /* for icmp_send */
+#include <net/route.h> /* for ip_route_output */
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <net/ip_tunnels.h>
+#include <net/addrconf.h>
+#include <linux/icmpv6.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <net/ip_vs.h>
+
+enum {
+ IP_VS_RT_MODE_LOCAL = 1, /* Allow local dest */
+ IP_VS_RT_MODE_NON_LOCAL = 2, /* Allow non-local dest */
+ IP_VS_RT_MODE_RDR = 4, /* Allow redirect from remote daddr to
+ * local
+ */
+ IP_VS_RT_MODE_CONNECT = 8, /* Always bind route to saddr */
+ IP_VS_RT_MODE_KNOWN_NH = 16,/* Route via remote addr */
+ IP_VS_RT_MODE_TUNNEL = 32,/* Tunnel mode */
+};
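+
+/* Transmitters pass a bitmask of the modes above to the route lookup
+ * helpers; e.g. the NAT transmitter passes LOCAL | NON_LOCAL | RDR and
+ * the IPv4 tunnel transmitter LOCAL | NON_LOCAL | CONNECT | TUNNEL.
+ */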
+
+static inline struct ip_vs_dest_dst *ip_vs_dest_dst_alloc(void)
+{
+ return kmalloc(sizeof(struct ip_vs_dest_dst), GFP_ATOMIC);
+}
+
+static inline void ip_vs_dest_dst_free(struct ip_vs_dest_dst *dest_dst)
+{
+ kfree(dest_dst);
+}
+
+/*
+ * Destination cache to speed up outgoing route lookup
+ */
+static inline void
+__ip_vs_dst_set(struct ip_vs_dest *dest, struct ip_vs_dest_dst *dest_dst,
+ struct dst_entry *dst, u32 dst_cookie)
+{
+ struct ip_vs_dest_dst *old;
+
+ old = rcu_dereference_protected(dest->dest_dst,
+ lockdep_is_held(&dest->dst_lock));
+
+ if (dest_dst) {
+ dest_dst->dst_cache = dst;
+ dest_dst->dst_cookie = dst_cookie;
+ }
+ rcu_assign_pointer(dest->dest_dst, dest_dst);
+
+ if (old)
+ call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free);
+}
+
+static inline struct ip_vs_dest_dst *
+__ip_vs_dst_check(struct ip_vs_dest *dest)
+{
+ struct ip_vs_dest_dst *dest_dst = rcu_dereference(dest->dest_dst);
+ struct dst_entry *dst;
+
+ if (!dest_dst)
+ return NULL;
+ dst = dest_dst->dst_cache;
+ if (dst->obsolete &&
+ dst->ops->check(dst, dest_dst->dst_cookie) == NULL)
+ return NULL;
+ return dest_dst;
+}
+
+static inline bool
+__mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu)
+{
+	if (IP6CB(skb)->frag_max_size) {
+		/* frag_max_size tells us that this packet has been
+		 * defragmented by the netfilter IPv6 conntrack module.
+		 */
+		if (IP6CB(skb)->frag_max_size > mtu)
+			return true; /* largest fragment violates MTU */
+	} else if (skb->len > mtu && !skb_is_gso(skb)) {
+		return true; /* packet size violates MTU */
+	}
+ return false;
+}
+
+/* Get route to daddr, update *saddr, optionally bind route to saddr */
+static struct rtable *do_output_route4(struct net *net, __be32 daddr,
+ int rt_mode, __be32 *saddr)
+{
+ struct flowi4 fl4;
+ struct rtable *rt;
+ int loop = 0;
+
+ memset(&fl4, 0, sizeof(fl4));
+ fl4.daddr = daddr;
+ fl4.saddr = (rt_mode & IP_VS_RT_MODE_CONNECT) ? *saddr : 0;
+ fl4.flowi4_flags = (rt_mode & IP_VS_RT_MODE_KNOWN_NH) ?
+ FLOWI_FLAG_KNOWN_NH : 0;
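+
+	/* In CONNECT mode the lookup may run twice: the first pass lets the
+	 * routing code pick a source address, the second binds the route to
+	 * that saddr.  A stale *saddr that makes the lookup fail with
+	 * -EINVAL is cleared once and the lookup is retried.
+	 */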
+
+retry:
+ rt = ip_route_output_key(net, &fl4);
+ if (IS_ERR(rt)) {
+ /* Invalid saddr ? */
+ if (PTR_ERR(rt) == -EINVAL && *saddr &&
+ rt_mode & IP_VS_RT_MODE_CONNECT && !loop) {
+ *saddr = 0;
+ flowi4_update_output(&fl4, 0, 0, daddr, 0);
+ goto retry;
+ }
+ IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr);
+ return NULL;
+ } else if (!*saddr && rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) {
+ ip_rt_put(rt);
+ *saddr = fl4.saddr;
+ flowi4_update_output(&fl4, 0, 0, daddr, fl4.saddr);
+ loop++;
+ goto retry;
+ }
+ *saddr = fl4.saddr;
+ return rt;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+static inline int __ip_vs_is_local_route6(struct rt6_info *rt)
+{
+ return rt->dst.dev && rt->dst.dev->flags & IFF_LOOPBACK;
+}
+#endif
+
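+/* Decide whether using the new route would illegally cross the
+ * local/non-local boundary: a local destination requires
+ * IP_VS_RT_MODE_LOCAL (and IP_VS_RT_MODE_RDR unless the packet already
+ * arrived on a local route), while a non-local destination requires
+ * IP_VS_RT_MODE_NON_LOCAL and the source must not be a loopback address.
+ */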
+static inline bool crosses_local_route_boundary(int skb_af, struct sk_buff *skb,
+ int rt_mode,
+ bool new_rt_is_local)
+{
+ bool rt_mode_allow_local = !!(rt_mode & IP_VS_RT_MODE_LOCAL);
+	bool rt_mode_allow_non_local = !!(rt_mode & IP_VS_RT_MODE_NON_LOCAL);
+ bool rt_mode_allow_redirect = !!(rt_mode & IP_VS_RT_MODE_RDR);
+ bool source_is_loopback;
+ bool old_rt_is_local;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (skb_af == AF_INET6) {
+ int addr_type = ipv6_addr_type(&ipv6_hdr(skb)->saddr);
+
+ source_is_loopback =
+ (!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
+ (addr_type & IPV6_ADDR_LOOPBACK);
+ old_rt_is_local = __ip_vs_is_local_route6(
+ (struct rt6_info *)skb_dst(skb));
+ } else
+#endif
+ {
+ source_is_loopback = ipv4_is_loopback(ip_hdr(skb)->saddr);
+ old_rt_is_local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
+ }
+
+ if (unlikely(new_rt_is_local)) {
+ if (!rt_mode_allow_local)
+ return true;
+ if (!rt_mode_allow_redirect && !old_rt_is_local)
+ return true;
+ } else {
+ if (!rt_mode_allow_non_local)
+ return true;
+ if (source_is_loopback)
+ return true;
+ }
+ return false;
+}
+
+static inline void maybe_update_pmtu(int skb_af, struct sk_buff *skb, int mtu)
+{
+ struct sock *sk = skb->sk;
+ struct rtable *ort = skb_rtable(skb);
+
+ if (!skb->dev && sk && sk_fullsock(sk))
+ ort->dst.ops->update_pmtu(&ort->dst, sk, NULL, mtu);
+}
+
+static inline bool ensure_mtu_is_adequate(int skb_af, int rt_mode,
+ struct ip_vs_iphdr *ipvsh,
+ struct sk_buff *skb, int mtu)
+{
+#ifdef CONFIG_IP_VS_IPV6
+ if (skb_af == AF_INET6) {
+ struct net *net = dev_net(skb_dst(skb)->dev);
+
+ if (unlikely(__mtu_check_toobig_v6(skb, mtu))) {
+ if (!skb->dev)
+ skb->dev = net->loopback_dev;
+ /* only send ICMP too big on first fragment */
+ if (!ipvsh->fragoffs)
+ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ IP_VS_DBG(1, "frag needed for %pI6c\n",
+ &ipv6_hdr(skb)->saddr);
+ return false;
+ }
+ } else
+#endif
+ {
+ struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
+
+ /* If we're going to tunnel the packet and pmtu discovery
+ * is disabled, we'll just fragment it anyway
+ */
+ if ((rt_mode & IP_VS_RT_MODE_TUNNEL) && !sysctl_pmtu_disc(ipvs))
+ return true;
+
+ if (unlikely(ip_hdr(skb)->frag_off & htons(IP_DF) &&
+ skb->len > mtu && !skb_is_gso(skb))) {
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+ htonl(mtu));
+ IP_VS_DBG(1, "frag needed for %pI4\n",
+ &ip_hdr(skb)->saddr);
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/* Get route to destination or remote server */
+static int
+__ip_vs_get_out_rt(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest,
+ __be32 daddr, int rt_mode, __be32 *ret_saddr,
+ struct ip_vs_iphdr *ipvsh)
+{
+ struct net *net = dev_net(skb_dst(skb)->dev);
+ struct ip_vs_dest_dst *dest_dst;
+ struct rtable *rt; /* Route to the other host */
+ int mtu;
+ int local, noref = 1;
+
+ if (dest) {
+ dest_dst = __ip_vs_dst_check(dest);
+ if (likely(dest_dst))
+ rt = (struct rtable *) dest_dst->dst_cache;
+ else {
+ dest_dst = ip_vs_dest_dst_alloc();
+ spin_lock_bh(&dest->dst_lock);
+ if (!dest_dst) {
+ __ip_vs_dst_set(dest, NULL, NULL, 0);
+ spin_unlock_bh(&dest->dst_lock);
+ goto err_unreach;
+ }
+ rt = do_output_route4(net, dest->addr.ip, rt_mode,
+ &dest_dst->dst_saddr.ip);
+ if (!rt) {
+ __ip_vs_dst_set(dest, NULL, NULL, 0);
+ spin_unlock_bh(&dest->dst_lock);
+ ip_vs_dest_dst_free(dest_dst);
+ goto err_unreach;
+ }
+ __ip_vs_dst_set(dest, dest_dst, &rt->dst, 0);
+ spin_unlock_bh(&dest->dst_lock);
+ IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n",
+ &dest->addr.ip, &dest_dst->dst_saddr.ip,
+ atomic_read(&rt->dst.__refcnt));
+ }
+ if (ret_saddr)
+ *ret_saddr = dest_dst->dst_saddr.ip;
+ } else {
+ __be32 saddr = htonl(INADDR_ANY);
+
+ noref = 0;
+
+ /* For such unconfigured boxes avoid many route lookups
+ * for performance reasons because we do not remember saddr
+ */
+ rt_mode &= ~IP_VS_RT_MODE_CONNECT;
+ rt = do_output_route4(net, daddr, rt_mode, &saddr);
+ if (!rt)
+ goto err_unreach;
+ if (ret_saddr)
+ *ret_saddr = saddr;
+ }
+
+ local = (rt->rt_flags & RTCF_LOCAL) ? 1 : 0;
+ if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode,
+ local))) {
+ IP_VS_DBG_RL("We are crossing local and non-local addresses"
+ " daddr=%pI4\n", &daddr);
+ goto err_put;
+ }
+
+ if (unlikely(local)) {
+ /* skb to local stack, preserve old route */
+ if (!noref)
+ ip_rt_put(rt);
+ return local;
+ }
+
+ if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL))) {
+ mtu = dst_mtu(&rt->dst);
+ } else {
+ mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
+ if (mtu < 68) {
+ IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__);
+ goto err_put;
+ }
+ maybe_update_pmtu(skb_af, skb, mtu);
+ }
+
+ if (!ensure_mtu_is_adequate(skb_af, rt_mode, ipvsh, skb, mtu))
+ goto err_put;
+
+ skb_dst_drop(skb);
+ if (noref) {
+ if (!local)
+ skb_dst_set_noref(skb, &rt->dst);
+ else
+ skb_dst_set(skb, dst_clone(&rt->dst));
+ } else
+ skb_dst_set(skb, &rt->dst);
+
+ return local;
+
+err_put:
+ if (!noref)
+ ip_rt_put(rt);
+ return -1;
+
+err_unreach:
+ dst_link_failure(skb);
+ return -1;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+static struct dst_entry *
+__ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr,
+ struct in6_addr *ret_saddr, int do_xfrm)
+{
+ struct dst_entry *dst;
+ struct flowi6 fl6 = {
+ .daddr = *daddr,
+ };
+
+ dst = ip6_route_output(net, NULL, &fl6);
+ if (dst->error)
+ goto out_err;
+ if (!ret_saddr)
+ return dst;
+ if (ipv6_addr_any(&fl6.saddr) &&
+ ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev,
+ &fl6.daddr, 0, &fl6.saddr) < 0)
+ goto out_err;
+ if (do_xfrm) {
+ dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
+ if (IS_ERR(dst)) {
+ dst = NULL;
+ goto out_err;
+ }
+ }
+ *ret_saddr = fl6.saddr;
+ return dst;
+
+out_err:
+ dst_release(dst);
+ IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr);
+ return NULL;
+}
+
+/*
+ * Get route to destination or remote server
+ */
+static int
+__ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest,
+ struct in6_addr *daddr, struct in6_addr *ret_saddr,
+ struct ip_vs_iphdr *ipvsh, int do_xfrm, int rt_mode)
+{
+ struct net *net = dev_net(skb_dst(skb)->dev);
+ struct ip_vs_dest_dst *dest_dst;
+ struct rt6_info *rt; /* Route to the other host */
+ struct dst_entry *dst;
+ int mtu;
+ int local, noref = 1;
+
+ if (dest) {
+ dest_dst = __ip_vs_dst_check(dest);
+ if (likely(dest_dst))
+ rt = (struct rt6_info *) dest_dst->dst_cache;
+ else {
+ u32 cookie;
+
+ dest_dst = ip_vs_dest_dst_alloc();
+ spin_lock_bh(&dest->dst_lock);
+ if (!dest_dst) {
+ __ip_vs_dst_set(dest, NULL, NULL, 0);
+ spin_unlock_bh(&dest->dst_lock);
+ goto err_unreach;
+ }
+ dst = __ip_vs_route_output_v6(net, &dest->addr.in6,
+ &dest_dst->dst_saddr.in6,
+ do_xfrm);
+ if (!dst) {
+ __ip_vs_dst_set(dest, NULL, NULL, 0);
+ spin_unlock_bh(&dest->dst_lock);
+ ip_vs_dest_dst_free(dest_dst);
+ goto err_unreach;
+ }
+ rt = (struct rt6_info *) dst;
+ cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
+ __ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie);
+ spin_unlock_bh(&dest->dst_lock);
+ IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
+ &dest->addr.in6, &dest_dst->dst_saddr.in6,
+ atomic_read(&rt->dst.__refcnt));
+ }
+ if (ret_saddr)
+ *ret_saddr = dest_dst->dst_saddr.in6;
+ } else {
+ noref = 0;
+ dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm);
+ if (!dst)
+ goto err_unreach;
+ rt = (struct rt6_info *) dst;
+ }
+
+ local = __ip_vs_is_local_route6(rt);
+
+ if (unlikely(crosses_local_route_boundary(skb_af, skb, rt_mode,
+ local))) {
+ IP_VS_DBG_RL("We are crossing local and non-local addresses"
+ " daddr=%pI6\n", daddr);
+ goto err_put;
+ }
+
+ if (unlikely(local)) {
+ /* skb to local stack, preserve old route */
+ if (!noref)
+ dst_release(&rt->dst);
+ return local;
+ }
+
+ /* MTU checking */
+ if (likely(!(rt_mode & IP_VS_RT_MODE_TUNNEL)))
+ mtu = dst_mtu(&rt->dst);
+ else {
+ mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr);
+ if (mtu < IPV6_MIN_MTU) {
+ IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__,
+ IPV6_MIN_MTU);
+ goto err_put;
+ }
+ maybe_update_pmtu(skb_af, skb, mtu);
+ }
+
+ if (!ensure_mtu_is_adequate(skb_af, rt_mode, ipvsh, skb, mtu))
+ goto err_put;
+
+ skb_dst_drop(skb);
+ if (noref) {
+ if (!local)
+ skb_dst_set_noref(skb, &rt->dst);
+ else
+ skb_dst_set(skb, dst_clone(&rt->dst));
+ } else
+ skb_dst_set(skb, &rt->dst);
+
+ return local;
+
+err_put:
+ if (!noref)
+ dst_release(&rt->dst);
+ return -1;
+
+err_unreach:
+ dst_link_failure(skb);
+ return -1;
+}
+#endif
+
+
+/* return NF_ACCEPT to allow forwarding or other NF_xxx on error */
+static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb,
+ struct ip_vs_conn *cp)
+{
+ int ret = NF_ACCEPT;
+
+ skb->ipvs_property = 1;
+ if (unlikely(cp->flags & IP_VS_CONN_F_NFCT))
+ ret = ip_vs_confirm_conntrack(skb);
+ if (ret == NF_ACCEPT) {
+ nf_reset(skb);
+ skb_forward_csum(skb);
+ }
+ return ret;
+}
+
+/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
+static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb,
+ struct ip_vs_conn *cp, int local)
+{
+ int ret = NF_STOLEN;
+
+ skb->ipvs_property = 1;
+ if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
+ ip_vs_notrack(skb);
+ else
+ ip_vs_update_conntrack(skb, cp, 1);
+ if (!local) {
+ skb_forward_csum(skb);
+ NF_HOOK(pf, NF_INET_LOCAL_OUT, NULL, skb,
+ NULL, skb_dst(skb)->dev, dst_output_sk);
+ } else
+ ret = NF_ACCEPT;
+ return ret;
+}
+
+/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
+static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb,
+ struct ip_vs_conn *cp, int local)
+{
+ int ret = NF_STOLEN;
+
+ skb->ipvs_property = 1;
+ if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
+ ip_vs_notrack(skb);
+ if (!local) {
+ skb_forward_csum(skb);
+ NF_HOOK(pf, NF_INET_LOCAL_OUT, NULL, skb,
+ NULL, skb_dst(skb)->dev, dst_output_sk);
+ } else
+ ret = NF_ACCEPT;
+ return ret;
+}
+
+
+/*
+ * NULL transmitter (do nothing except return NF_ACCEPT)
+ */
+int
+ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
+{
+ /* we do not touch skb and do not need pskb ptr */
+ return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
+}
+
+
+/*
+ * Bypass transmitter
+ * Let packets bypass the destination when the destination is not
+ * available; it may only be used in a transparent cache cluster.
+ */
+int
+ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
+{
+ struct iphdr *iph = ip_hdr(skb);
+
+ EnterFunction(10);
+
+ rcu_read_lock();
+ if (__ip_vs_get_out_rt(cp->af, skb, NULL, iph->daddr,
+ IP_VS_RT_MODE_NON_LOCAL, NULL, ipvsh) < 0)
+ goto tx_error;
+
+ ip_send_check(iph);
+
+ /* Another hack: avoid icmp_send in ip_fragment */
+ skb->ignore_df = 1;
+
+ ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);
+ rcu_read_unlock();
+
+ LeaveFunction(10);
+ return NF_STOLEN;
+
+ tx_error:
+ kfree_skb(skb);
+ rcu_read_unlock();
+ LeaveFunction(10);
+ return NF_STOLEN;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+int
+ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
+{
+ EnterFunction(10);
+
+ rcu_read_lock();
+ if (__ip_vs_get_out_rt_v6(cp->af, skb, NULL, &ipvsh->daddr.in6, NULL,
+ ipvsh, 0, IP_VS_RT_MODE_NON_LOCAL) < 0)
+ goto tx_error;
+
+ /* Another hack: avoid icmp_send in ip_fragment */
+ skb->ignore_df = 1;
+
+ ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);
+ rcu_read_unlock();
+
+ LeaveFunction(10);
+ return NF_STOLEN;
+
+ tx_error:
+ kfree_skb(skb);
+ rcu_read_unlock();
+ LeaveFunction(10);
+ return NF_STOLEN;
+}
+#endif
+
+/*
+ * NAT transmitter (only for outside-to-inside nat forwarding)
+ * Not used for related ICMP
+ */
+int
+ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
+{
+ struct rtable *rt; /* Route to the other host */
+ int local, rc, was_input;
+
+ EnterFunction(10);
+
+ rcu_read_lock();
+ /* check if it is a connection of no-client-port */
+ if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
+ __be16 _pt, *p;
+
+ p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);
+ if (p == NULL)
+ goto tx_error;
+ ip_vs_conn_fill_cport(cp, *p);
+ IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
+ }
+
+ was_input = rt_is_input_route(skb_rtable(skb));
+ local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip,
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_RDR, NULL, ipvsh);
+ if (local < 0)
+ goto tx_error;
+ rt = skb_rtable(skb);
+ /*
+ * Avoid duplicate tuple in reply direction for NAT traffic
+ * to local address when connection is sync-ed
+ */
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ if (cp->flags & IP_VS_CONN_F_SYNC && local) {
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+
+ if (ct && !nf_ct_is_untracked(ct)) {
+ IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, 0,
+ "ip_vs_nat_xmit(): "
+ "stopping DNAT to local address");
+ goto tx_error;
+ }
+ }
+#endif
+
+ /* From world but DNAT to loopback address? */
+ if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) {
+ IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, 0, "ip_vs_nat_xmit(): "
+ "stopping DNAT to loopback address");
+ goto tx_error;
+ }
+
+ /* copy-on-write the packet before mangling it */
+ if (!skb_make_writable(skb, sizeof(struct iphdr)))
+ goto tx_error;
+
+ if (skb_cow(skb, rt->dst.dev->hard_header_len))
+ goto tx_error;
+
+ /* mangle the packet */
+ if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
+ goto tx_error;
+ ip_hdr(skb)->daddr = cp->daddr.ip;
+ ip_send_check(ip_hdr(skb));
+
+ IP_VS_DBG_PKT(10, AF_INET, pp, skb, 0, "After DNAT");
+
+	/* FIXME: when the application helper enlarges the packet and the
+	   length is larger than the MTU of the outgoing device, there will
+	   still be an MTU problem. */
+
+ /* Another hack: avoid icmp_send in ip_fragment */
+ skb->ignore_df = 1;
+
+ rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
+ rcu_read_unlock();
+
+ LeaveFunction(10);
+ return rc;
+
+ tx_error:
+ kfree_skb(skb);
+ rcu_read_unlock();
+ LeaveFunction(10);
+ return NF_STOLEN;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+int
+ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
+{
+ struct rt6_info *rt; /* Route to the other host */
+ int local, rc;
+
+ EnterFunction(10);
+
+ rcu_read_lock();
+ /* check if it is a connection of no-client-port */
+ if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT && !ipvsh->fragoffs)) {
+ __be16 _pt, *p;
+ p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);
+ if (p == NULL)
+ goto tx_error;
+ ip_vs_conn_fill_cport(cp, *p);
+ IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
+ }
+
+ local = __ip_vs_get_out_rt_v6(cp->af, skb, cp->dest, &cp->daddr.in6,
+ NULL, ipvsh, 0,
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_RDR);
+ if (local < 0)
+ goto tx_error;
+ rt = (struct rt6_info *) skb_dst(skb);
+ /*
+ * Avoid duplicate tuple in reply direction for NAT traffic
+ * to local address when connection is sync-ed
+ */
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ if (cp->flags & IP_VS_CONN_F_SYNC && local) {
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+
+ if (ct && !nf_ct_is_untracked(ct)) {
+ IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, 0,
+ "ip_vs_nat_xmit_v6(): "
+ "stopping DNAT to local address");
+ goto tx_error;
+ }
+ }
+#endif
+
+ /* From world but DNAT to loopback address? */
+ if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
+ ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) {
+ IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, 0,
+ "ip_vs_nat_xmit_v6(): "
+ "stopping DNAT to loopback address");
+ goto tx_error;
+ }
+
+ /* copy-on-write the packet before mangling it */
+ if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
+ goto tx_error;
+
+ if (skb_cow(skb, rt->dst.dev->hard_header_len))
+ goto tx_error;
+
+ /* mangle the packet */
+ if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
+ goto tx_error;
+ ipv6_hdr(skb)->daddr = cp->daddr.in6;
+
+ IP_VS_DBG_PKT(10, AF_INET6, pp, skb, 0, "After DNAT");
+
+	/* FIXME: when the application helper enlarges the packet and the
+	   length is larger than the MTU of the outgoing device, there will
+	   still be an MTU problem. */
+
+ /* Another hack: avoid icmp_send in ip_fragment */
+ skb->ignore_df = 1;
+
+ rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
+ rcu_read_unlock();
+
+ LeaveFunction(10);
+ return rc;
+
+tx_error:
+ LeaveFunction(10);
+ kfree_skb(skb);
+ rcu_read_unlock();
+ return NF_STOLEN;
+}
+#endif
+
+/* When forwarding a packet, we must ensure that we've got enough headroom
+ * for the encapsulation packet in the skb. This also gives us an
+ * opportunity to figure out what the payload_len, dsfield, ttl, and df
+ * values should be, so that we won't need to look at the old IP header
+ * again.
+ */
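+/* For example, the tunnel transmitters below reserve
+ * LL_RESERVED_SPACE(tdev) plus the size of the outer header
+ * (struct iphdr or struct ipv6hdr) as max_headroom before calling this.
+ */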
+static struct sk_buff *
+ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af,
+ unsigned int max_headroom, __u8 *next_protocol,
+ __u32 *payload_len, __u8 *dsfield, __u8 *ttl,
+ __be16 *df)
+{
+ struct sk_buff *new_skb = NULL;
+ struct iphdr *old_iph = NULL;
+#ifdef CONFIG_IP_VS_IPV6
+ struct ipv6hdr *old_ipv6h = NULL;
+#endif
+
+ if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) {
+ new_skb = skb_realloc_headroom(skb, max_headroom);
+ if (!new_skb)
+ goto error;
+ if (skb->sk)
+ skb_set_owner_w(new_skb, skb->sk);
+ consume_skb(skb);
+ skb = new_skb;
+ }
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (skb_af == AF_INET6) {
+ old_ipv6h = ipv6_hdr(skb);
+ *next_protocol = IPPROTO_IPV6;
+ if (payload_len)
+ *payload_len =
+ ntohs(old_ipv6h->payload_len) +
+ sizeof(*old_ipv6h);
+ *dsfield = ipv6_get_dsfield(old_ipv6h);
+ *ttl = old_ipv6h->hop_limit;
+ if (df)
+ *df = 0;
+ } else
+#endif
+ {
+ old_iph = ip_hdr(skb);
+ /* Copy DF, reset fragment offset and MF */
+ if (df)
+ *df = (old_iph->frag_off & htons(IP_DF));
+ *next_protocol = IPPROTO_IPIP;
+
+ /* fix old IP header checksum */
+ ip_send_check(old_iph);
+ *dsfield = ipv4_get_dsfield(old_iph);
+ *ttl = old_iph->ttl;
+ if (payload_len)
+ *payload_len = ntohs(old_iph->tot_len);
+ }
+
+ return skb;
+error:
+ kfree_skb(skb);
+ return ERR_PTR(-ENOMEM);
+}
+
+static inline int __tun_gso_type_mask(int encaps_af, int orig_af)
+{
+ if (encaps_af == AF_INET) {
+ if (orig_af == AF_INET)
+ return SKB_GSO_IPIP;
+
+ return SKB_GSO_SIT;
+ }
+
+ /* GSO: we need to provide proper SKB_GSO_ value for IPv6:
+ * SKB_GSO_SIT/IPV6
+ */
+ return 0;
+}
+
+/*
+ * IP Tunneling transmitter
+ *
+ * This function encapsulates the packet in a new IP packet, whose
+ * destination will be set to cp->daddr. Most of the code in this
+ * function is taken from ipip.c.
+ *
+ * It is used in VS/TUN cluster. The load balancer selects a real
+ * server from a cluster based on a scheduling algorithm,
+ * encapsulates the request packet and forwards it to the selected
+ * server. For example, all real servers are configured with
+ * "ifconfig tunl0 <Virtual IP Address> up". When the server receives
+ * the encapsulated packet, it will decapsulate the packet, process
+ * the request and return the response packets directly to the client
+ * without passing through the load balancer. This can greatly
+ * increase the scalability of the virtual server.
+ *
+ * Used for ANY protocol
+ */
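+/* The outer header built below copies the inner packet's DSCP and TTL
+ * and, when pmtu_disc is enabled, preserves the inner DF bit so that
+ * Path MTU discovery keeps working across the tunnel.
+ */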
+int
+ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
+{
+ struct net *net = skb_net(skb);
+ struct netns_ipvs *ipvs = net_ipvs(net);
+ struct rtable *rt; /* Route to the other host */
+ __be32 saddr; /* Source for tunnel */
+ struct net_device *tdev; /* Device to other host */
+ __u8 next_protocol = 0;
+ __u8 dsfield = 0;
+ __u8 ttl = 0;
+ __be16 df = 0;
+ __be16 *dfp = NULL;
+ struct iphdr *iph; /* Our new IP header */
+ unsigned int max_headroom; /* The extra header space needed */
+ int ret, local;
+
+ EnterFunction(10);
+
+ rcu_read_lock();
+ local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip,
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_CONNECT |
+ IP_VS_RT_MODE_TUNNEL, &saddr, ipvsh);
+ if (local < 0)
+ goto tx_error;
+ if (local) {
+ rcu_read_unlock();
+ return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
+ }
+
+ rt = skb_rtable(skb);
+ tdev = rt->dst.dev;
+
+ /*
+ * Okay, now see if we can stuff it in the buffer as-is.
+ */
+ max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
+
+ /* We only care about the df field if sysctl_pmtu_disc(ipvs) is set */
+ dfp = sysctl_pmtu_disc(ipvs) ? &df : NULL;
+ skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
+ &next_protocol, NULL, &dsfield,
+ &ttl, dfp);
+ if (IS_ERR(skb))
+ goto tx_error;
+
+ skb = iptunnel_handle_offloads(
+ skb, false, __tun_gso_type_mask(AF_INET, cp->af));
+ if (IS_ERR(skb))
+ goto tx_error;
+
+ skb->transport_header = skb->network_header;
+
+ skb_push(skb, sizeof(struct iphdr));
+ skb_reset_network_header(skb);
+ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+
+ /*
+ * Push down and install the IPIP header.
+ */
+ iph = ip_hdr(skb);
+ iph->version = 4;
+ iph->ihl = sizeof(struct iphdr)>>2;
+ iph->frag_off = df;
+ iph->protocol = next_protocol;
+ iph->tos = dsfield;
+ iph->daddr = cp->daddr.ip;
+ iph->saddr = saddr;
+ iph->ttl = ttl;
+ ip_select_ident(net, skb, NULL);
+
+ /* Another hack: avoid icmp_send in ip_fragment */
+ skb->ignore_df = 1;
+
+ ret = ip_vs_tunnel_xmit_prepare(skb, cp);
+ if (ret == NF_ACCEPT)
+ ip_local_out(skb);
+ else if (ret == NF_DROP)
+ kfree_skb(skb);
+ rcu_read_unlock();
+
+ LeaveFunction(10);
+
+ return NF_STOLEN;
+
+ tx_error:
+ if (!IS_ERR(skb))
+ kfree_skb(skb);
+ rcu_read_unlock();
+ LeaveFunction(10);
+ return NF_STOLEN;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+int
+ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
+{
+ struct rt6_info *rt; /* Route to the other host */
+ struct in6_addr saddr; /* Source for tunnel */
+ struct net_device *tdev; /* Device to other host */
+ __u8 next_protocol = 0;
+ __u32 payload_len = 0;
+ __u8 dsfield = 0;
+ __u8 ttl = 0;
+ struct ipv6hdr *iph; /* Our new IP header */
+ unsigned int max_headroom; /* The extra header space needed */
+ int ret, local;
+
+ EnterFunction(10);
+
+ rcu_read_lock();
+ local = __ip_vs_get_out_rt_v6(cp->af, skb, cp->dest, &cp->daddr.in6,
+ &saddr, ipvsh, 1,
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_TUNNEL);
+ if (local < 0)
+ goto tx_error;
+ if (local) {
+ rcu_read_unlock();
+ return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);
+ }
+
+ rt = (struct rt6_info *) skb_dst(skb);
+ tdev = rt->dst.dev;
+
+ /*
+ * Okay, now see if we can stuff it in the buffer as-is.
+ */
+ max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr);
+
+ skb = ip_vs_prepare_tunneled_skb(skb, cp->af, max_headroom,
+ &next_protocol, &payload_len,
+ &dsfield, &ttl, NULL);
+ if (IS_ERR(skb))
+ goto tx_error;
+
+ skb = iptunnel_handle_offloads(
+ skb, false, __tun_gso_type_mask(AF_INET6, cp->af));
+ if (IS_ERR(skb))
+ goto tx_error;
+
+ skb->transport_header = skb->network_header;
+
+ skb_push(skb, sizeof(struct ipv6hdr));
+ skb_reset_network_header(skb);
+ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+
+ /*
+ * Push down and install the IPIP header.
+ */
+ iph = ipv6_hdr(skb);
+ iph->version = 6;
+ iph->nexthdr = next_protocol;
+ iph->payload_len = htons(payload_len);
+ memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl));
+ ipv6_change_dsfield(iph, 0, dsfield);
+ iph->daddr = cp->daddr.in6;
+ iph->saddr = saddr;
+ iph->hop_limit = ttl;
+
+ /* Another hack: avoid icmp_send in ip_fragment */
+ skb->ignore_df = 1;
+
+ ret = ip_vs_tunnel_xmit_prepare(skb, cp);
+ if (ret == NF_ACCEPT)
+ ip6_local_out(skb);
+ else if (ret == NF_DROP)
+ kfree_skb(skb);
+ rcu_read_unlock();
+
+ LeaveFunction(10);
+
+ return NF_STOLEN;
+
+tx_error:
+ if (!IS_ERR(skb))
+ kfree_skb(skb);
+ rcu_read_unlock();
+ LeaveFunction(10);
+ return NF_STOLEN;
+}
+#endif
+
+
+/*
+ * Direct Routing transmitter
+ * Used for ANY protocol
+ */
+int
+ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
+{
+ int local;
+
+ EnterFunction(10);
+
+ rcu_read_lock();
+ local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip,
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_KNOWN_NH, NULL, ipvsh);
+ if (local < 0)
+ goto tx_error;
+ if (local) {
+ rcu_read_unlock();
+ return ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 1);
+ }
+
+ ip_send_check(ip_hdr(skb));
+
+ /* Another hack: avoid icmp_send in ip_fragment */
+ skb->ignore_df = 1;
+
+ ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);
+ rcu_read_unlock();
+
+ LeaveFunction(10);
+ return NF_STOLEN;
+
+ tx_error:
+ kfree_skb(skb);
+ rcu_read_unlock();
+ LeaveFunction(10);
+ return NF_STOLEN;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+int
+ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
+{
+ int local;
+
+ EnterFunction(10);
+
+ rcu_read_lock();
+ local = __ip_vs_get_out_rt_v6(cp->af, skb, cp->dest, &cp->daddr.in6,
+ NULL, ipvsh, 0,
+ IP_VS_RT_MODE_LOCAL |
+ IP_VS_RT_MODE_NON_LOCAL);
+ if (local < 0)
+ goto tx_error;
+ if (local) {
+ rcu_read_unlock();
+ return ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 1);
+ }
+
+ /* Another hack: avoid icmp_send in ip_fragment */
+ skb->ignore_df = 1;
+
+ ip_vs_send_or_cont(NFPROTO_IPV6, skb, cp, 0);
+ rcu_read_unlock();
+
+ LeaveFunction(10);
+ return NF_STOLEN;
+
+tx_error:
+ kfree_skb(skb);
+ rcu_read_unlock();
+ LeaveFunction(10);
+ return NF_STOLEN;
+}
+#endif
+
+
+/*
+ * ICMP packet transmitter
+ * called by the ip_vs_in_icmp
+ */
+int
+ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp, int offset, unsigned int hooknum,
+ struct ip_vs_iphdr *iph)
+{
+ struct rtable *rt; /* Route to the other host */
+ int rc;
+ int local;
+ int rt_mode, was_input;
+
+ EnterFunction(10);
+
+ /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
+ forwarded directly here, because there is no need to
+ translate address/port back */
+ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
+ if (cp->packet_xmit)
+ rc = cp->packet_xmit(skb, cp, pp, iph);
+ else
+ rc = NF_ACCEPT;
+ /* do not touch skb anymore */
+ atomic_inc(&cp->in_pkts);
+ goto out;
+ }
+
+ /*
+ * mangle and send the packet here (only for VS/NAT)
+ */
+ was_input = rt_is_input_route(skb_rtable(skb));
+
+ /* LOCALNODE from FORWARD hook is not supported */
+ rt_mode = (hooknum != NF_INET_FORWARD) ?
+ IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
+ rcu_read_lock();
+ local = __ip_vs_get_out_rt(cp->af, skb, cp->dest, cp->daddr.ip, rt_mode,
+ NULL, iph);
+ if (local < 0)
+ goto tx_error;
+ rt = skb_rtable(skb);
+
+ /*
+ * Avoid duplicate tuple in reply direction for NAT traffic
+ * to local address when connection is sync-ed
+ */
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ if (cp->flags & IP_VS_CONN_F_SYNC && local) {
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+
+ if (ct && !nf_ct_is_untracked(ct)) {
+ IP_VS_DBG(10, "%s(): "
+ "stopping DNAT to local address %pI4\n",
+ __func__, &cp->daddr.ip);
+ goto tx_error;
+ }
+ }
+#endif
+
+ /* From world but DNAT to loopback address? */
+ if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) {
+ IP_VS_DBG(1, "%s(): "
+ "stopping DNAT to loopback %pI4\n",
+ __func__, &cp->daddr.ip);
+ goto tx_error;
+ }
+
+ /* copy-on-write the packet before mangling it */
+ if (!skb_make_writable(skb, offset))
+ goto tx_error;
+
+ if (skb_cow(skb, rt->dst.dev->hard_header_len))
+ goto tx_error;
+
+ ip_vs_nat_icmp(skb, pp, cp, 0);
+
+ /* Another hack: avoid icmp_send in ip_fragment */
+ skb->ignore_df = 1;
+
+ rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
+ rcu_read_unlock();
+ goto out;
+
+ tx_error:
+ kfree_skb(skb);
+ rcu_read_unlock();
+ rc = NF_STOLEN;
+ out:
+ LeaveFunction(10);
+ return rc;
+}
+
+#ifdef CONFIG_IP_VS_IPV6
+int
+ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
+ struct ip_vs_protocol *pp, int offset, unsigned int hooknum,
+ struct ip_vs_iphdr *ipvsh)
+{
+ struct rt6_info *rt; /* Route to the other host */
+ int rc;
+ int local;
+ int rt_mode;
+
+ EnterFunction(10);
+
+ /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
+ forwarded directly here, because there is no need to
+ translate address/port back */
+ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
+ if (cp->packet_xmit)
+ rc = cp->packet_xmit(skb, cp, pp, ipvsh);
+ else
+ rc = NF_ACCEPT;
+ /* do not touch skb anymore */
+ atomic_inc(&cp->in_pkts);
+ goto out;
+ }
+
+ /*
+ * mangle and send the packet here (only for VS/NAT)
+ */
+
+ /* LOCALNODE from FORWARD hook is not supported */
+ rt_mode = (hooknum != NF_INET_FORWARD) ?
+ IP_VS_RT_MODE_LOCAL | IP_VS_RT_MODE_NON_LOCAL |
+ IP_VS_RT_MODE_RDR : IP_VS_RT_MODE_NON_LOCAL;
+ rcu_read_lock();
+ local = __ip_vs_get_out_rt_v6(cp->af, skb, cp->dest, &cp->daddr.in6,
+ NULL, ipvsh, 0, rt_mode);
+ if (local < 0)
+ goto tx_error;
+ rt = (struct rt6_info *) skb_dst(skb);
+ /*
+ * Avoid duplicate tuple in reply direction for NAT traffic
+ * to local address when connection is sync-ed
+ */
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ if (cp->flags & IP_VS_CONN_F_SYNC && local) {
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+
+ if (ct && !nf_ct_is_untracked(ct)) {
+ IP_VS_DBG(10, "%s(): "
+ "stopping DNAT to local address %pI6\n",
+ __func__, &cp->daddr.in6);
+ goto tx_error;
+ }
+ }
+#endif
+
+ /* From world but DNAT to loopback address? */
+ if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) &&
+ ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) {
+ IP_VS_DBG(1, "%s(): "
+ "stopping DNAT to loopback %pI6\n",
+ __func__, &cp->daddr.in6);
+ goto tx_error;
+ }
+
+ /* copy-on-write the packet before mangling it */
+ if (!skb_make_writable(skb, offset))
+ goto tx_error;
+
+ if (skb_cow(skb, rt->dst.dev->hard_header_len))
+ goto tx_error;
+
+ ip_vs_nat_icmp_v6(skb, pp, cp, 0);
+
+ /* Another hack: avoid icmp_send in ip_fragment */
+ skb->ignore_df = 1;
+
+ rc = ip_vs_nat_send_or_cont(NFPROTO_IPV6, skb, cp, local);
+ rcu_read_unlock();
+ goto out;
+
+tx_error:
+ kfree_skb(skb);
+ rcu_read_unlock();
+ rc = NF_STOLEN;
+out:
+ LeaveFunction(10);
+ return rc;
+}
+#endif