From e5fd91f1ef340da553f7a79da9540c3db711c937 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Fabian=20Silva=20Delgado?= Date: Tue, 8 Sep 2015 01:01:14 -0300 Subject: Linux-libre 4.2-gnu --- net/netfilter/ipvs/ip_vs_core.c | 16 +++++++-- net/netfilter/ipvs/ip_vs_ctl.c | 78 +++++++++++++++++++++++++--------------- net/netfilter/ipvs/ip_vs_sched.c | 12 +++---- net/netfilter/ipvs/ip_vs_sync.c | 32 ++++++----------- net/netfilter/ipvs/ip_vs_xmit.c | 60 ++++++++++++++++++++++++++----- 5 files changed, 131 insertions(+), 67 deletions(-) (limited to 'net/netfilter/ipvs') diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 5d2b806a8..38fbc194b 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -319,7 +319,13 @@ ip_vs_sched_persist(struct ip_vs_service *svc, * return *ignored=0 i.e. ICMP and NF_DROP */ sched = rcu_dereference(svc->scheduler); - dest = sched->schedule(svc, skb, iph); + if (sched) { + /* read svc->sched_data after svc->scheduler */ + smp_rmb(); + dest = sched->schedule(svc, skb, iph); + } else { + dest = NULL; + } if (!dest) { IP_VS_DBG(1, "p-schedule: no dest found.\n"); kfree(param.pe_data); @@ -467,7 +473,13 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, } sched = rcu_dereference(svc->scheduler); - dest = sched->schedule(svc, skb, iph); + if (sched) { + /* read svc->sched_data after svc->scheduler */ + smp_rmb(); + dest = sched->schedule(svc, skb, iph); + } else { + dest = NULL; + } if (dest == NULL) { IP_VS_DBG(1, "Schedule: no dest found.\n"); return NULL; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 285eae3a1..24c554201 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -842,15 +842,16 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, __ip_vs_dst_cache_reset(dest); spin_unlock_bh(&dest->dst_lock); - sched = rcu_dereference_protected(svc->scheduler, 1); if (add) { ip_vs_start_estimator(svc->net, &dest->stats); list_add_rcu(&dest->n_list, &svc->destinations); svc->num_dests++; - if (sched->add_dest) + sched = rcu_dereference_protected(svc->scheduler, 1); + if (sched && sched->add_dest) sched->add_dest(svc, dest); } else { - if (sched->upd_dest) + sched = rcu_dereference_protected(svc->scheduler, 1); + if (sched && sched->upd_dest) sched->upd_dest(svc, dest); } } @@ -1084,7 +1085,7 @@ static void __ip_vs_unlink_dest(struct ip_vs_service *svc, struct ip_vs_scheduler *sched; sched = rcu_dereference_protected(svc->scheduler, 1); - if (sched->del_dest) + if (sched && sched->del_dest) sched->del_dest(svc, dest); } } @@ -1175,11 +1176,14 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, ip_vs_use_count_inc(); /* Lookup the scheduler by 'u->sched_name' */ - sched = ip_vs_scheduler_get(u->sched_name); - if (sched == NULL) { - pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name); - ret = -ENOENT; - goto out_err; + if (strcmp(u->sched_name, "none")) { + sched = ip_vs_scheduler_get(u->sched_name); + if (!sched) { + pr_info("Scheduler module ip_vs_%s not found\n", + u->sched_name); + ret = -ENOENT; + goto out_err; + } } if (u->pe_name && *u->pe_name) { @@ -1240,10 +1244,12 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, spin_lock_init(&svc->stats.lock); /* Bind the scheduler */ - ret = ip_vs_bind_scheduler(svc, sched); - if (ret) - goto out_err; - sched = NULL; + if (sched) { + ret = ip_vs_bind_scheduler(svc, sched); + if (ret) + goto out_err; + sched = NULL; + } /* Bind the ct retriever */ RCU_INIT_POINTER(svc->pe, pe); @@ -1291,17 +1297,20 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, static int ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) { - struct ip_vs_scheduler *sched, *old_sched; + struct ip_vs_scheduler *sched = NULL, *old_sched; struct ip_vs_pe *pe = NULL, *old_pe = NULL; int ret = 0; /* * Lookup the scheduler, by 'u->sched_name' */ - sched = ip_vs_scheduler_get(u->sched_name); - if (sched == NULL) { - pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name); - return -ENOENT; + if (strcmp(u->sched_name, "none")) { + sched = ip_vs_scheduler_get(u->sched_name); + if (!sched) { + pr_info("Scheduler module ip_vs_%s not found\n", + u->sched_name); + return -ENOENT; + } } old_sched = sched; @@ -1329,14 +1338,20 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) old_sched = rcu_dereference_protected(svc->scheduler, 1); if (sched != old_sched) { + if (old_sched) { + ip_vs_unbind_scheduler(svc, old_sched); + RCU_INIT_POINTER(svc->scheduler, NULL); + /* Wait all svc->sched_data users */ + synchronize_rcu(); + } /* Bind the new scheduler */ - ret = ip_vs_bind_scheduler(svc, sched); - if (ret) { - old_sched = sched; - goto out; + if (sched) { + ret = ip_vs_bind_scheduler(svc, sched); + if (ret) { + ip_vs_scheduler_put(sched); + goto out; + } } - /* Unbind the old scheduler on success */ - ip_vs_unbind_scheduler(svc, old_sched); } /* @@ -1982,6 +1997,7 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v) const struct ip_vs_iter *iter = seq->private; const struct ip_vs_dest *dest; struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler); + char *sched_name = sched ? sched->name : "none"; if (iter->table == ip_vs_svc_table) { #ifdef CONFIG_IP_VS_IPV6 @@ -1990,18 +2006,18 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v) ip_vs_proto_name(svc->protocol), &svc->addr.in6, ntohs(svc->port), - sched->name); + sched_name); else #endif seq_printf(seq, "%s %08X:%04X %s %s ", ip_vs_proto_name(svc->protocol), ntohl(svc->addr.ip), ntohs(svc->port), - sched->name, + sched_name, (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); } else { seq_printf(seq, "FWM %08X %s %s", - svc->fwmark, sched->name, + svc->fwmark, sched_name, (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); } @@ -2427,13 +2443,15 @@ ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) { struct ip_vs_scheduler *sched; struct ip_vs_kstats kstats; + char *sched_name; sched = rcu_dereference_protected(src->scheduler, 1); + sched_name = sched ? sched->name : "none"; dst->protocol = src->protocol; dst->addr = src->addr.ip; dst->port = src->port; dst->fwmark = src->fwmark; - strlcpy(dst->sched_name, sched->name, sizeof(dst->sched_name)); + strlcpy(dst->sched_name, sched_name, sizeof(dst->sched_name)); dst->flags = src->flags; dst->timeout = src->timeout / HZ; dst->netmask = src->netmask; @@ -2892,6 +2910,7 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb, struct ip_vs_flags flags = { .flags = svc->flags, .mask = ~0 }; struct ip_vs_kstats kstats; + char *sched_name; nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE); if (!nl_service) @@ -2910,8 +2929,9 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb, } sched = rcu_dereference_protected(svc->scheduler, 1); + sched_name = sched ? sched->name : "none"; pe = rcu_dereference_protected(svc->pe, 1); - if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched->name) || + if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched_name) || (pe && nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, pe->name)) || nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) || nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) || diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c index 199760c71..7e8141647 100644 --- a/net/netfilter/ipvs/ip_vs_sched.c +++ b/net/netfilter/ipvs/ip_vs_sched.c @@ -74,7 +74,7 @@ void ip_vs_unbind_scheduler(struct ip_vs_service *svc, if (sched->done_service) sched->done_service(svc); - /* svc->scheduler can not be set to NULL */ + /* svc->scheduler can be set to NULL only by caller */ } @@ -147,21 +147,21 @@ void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler) void ip_vs_scheduler_err(struct ip_vs_service *svc, const char *msg) { - struct ip_vs_scheduler *sched; + struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler); + char *sched_name = sched ? sched->name : "none"; - sched = rcu_dereference(svc->scheduler); if (svc->fwmark) { IP_VS_ERR_RL("%s: FWM %u 0x%08X - %s\n", - sched->name, svc->fwmark, svc->fwmark, msg); + sched_name, svc->fwmark, svc->fwmark, msg); #ifdef CONFIG_IP_VS_IPV6 } else if (svc->af == AF_INET6) { IP_VS_ERR_RL("%s: %s [%pI6c]:%d - %s\n", - sched->name, ip_vs_proto_name(svc->protocol), + sched_name, ip_vs_proto_name(svc->protocol), &svc->addr.in6, ntohs(svc->port), msg); #endif } else { IP_VS_ERR_RL("%s: %s %pI4:%d - %s\n", - sched->name, ip_vs_proto_name(svc->protocol), + sched_name, ip_vs_proto_name(svc->protocol), &svc->addr.ip, ntohs(svc->port), msg); } } diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 19b9cce6c..d99ad93eb 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -612,7 +612,7 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp, pkts = atomic_add_return(1, &cp->in_pkts); else pkts = sysctl_sync_threshold(ipvs); - ip_vs_sync_conn(net, cp->control, pkts); + ip_vs_sync_conn(net, cp, pkts); } } @@ -1457,18 +1457,12 @@ static struct socket *make_send_sock(struct net *net, int id) struct socket *sock; int result; - /* First create a socket move it to right name space later */ - result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); + /* First create a socket */ + result = sock_create_kern(net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); if (result < 0) { pr_err("Error during creation of socket; terminating\n"); return ERR_PTR(result); } - /* - * Kernel sockets that are a part of a namespace, should not - * hold a reference to a namespace in order to allow to stop it. - * After sk_change_net should be released using sk_release_kernel. - */ - sk_change_net(sock->sk, net); result = set_mcast_if(sock->sk, ipvs->master_mcast_ifn); if (result < 0) { pr_err("Error setting outbound mcast interface\n"); @@ -1497,7 +1491,7 @@ static struct socket *make_send_sock(struct net *net, int id) return sock; error: - sk_release_kernel(sock->sk); + sock_release(sock); return ERR_PTR(result); } @@ -1518,17 +1512,11 @@ static struct socket *make_receive_sock(struct net *net, int id) int result; /* First create a socket */ - result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); + result = sock_create_kern(net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); if (result < 0) { pr_err("Error during creation of socket; terminating\n"); return ERR_PTR(result); } - /* - * Kernel sockets that are a part of a namespace, should not - * hold a reference to a namespace in order to allow to stop it. - * After sk_change_net should be released using sk_release_kernel. - */ - sk_change_net(sock->sk, net); /* it is equivalent to the REUSEADDR option in user-space */ sock->sk->sk_reuse = SK_CAN_REUSE; result = sysctl_sync_sock_size(ipvs); @@ -1554,7 +1542,7 @@ static struct socket *make_receive_sock(struct net *net, int id) return sock; error: - sk_release_kernel(sock->sk); + sock_release(sock); return ERR_PTR(result); } @@ -1692,7 +1680,7 @@ done: ip_vs_sync_buff_release(sb); /* release the sending multicast socket */ - sk_release_kernel(tinfo->sock->sk); + sock_release(tinfo->sock); kfree(tinfo); return 0; @@ -1729,7 +1717,7 @@ static int sync_thread_backup(void *data) } /* release the sending multicast socket */ - sk_release_kernel(tinfo->sock->sk); + sock_release(tinfo->sock); kfree(tinfo->buf); kfree(tinfo); @@ -1854,11 +1842,11 @@ int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid) return 0; outsocket: - sk_release_kernel(sock->sk); + sock_release(sock); outtinfo: if (tinfo) { - sk_release_kernel(tinfo->sock->sk); + sock_release(tinfo->sock); kfree(tinfo->buf); kfree(tinfo); } diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 19986ec5f..258a0b0e8 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -130,7 +130,6 @@ static struct rtable *do_output_route4(struct net *net, __be32 daddr, memset(&fl4, 0, sizeof(fl4)); fl4.daddr = daddr; - fl4.saddr = (rt_mode & IP_VS_RT_MODE_CONNECT) ? *saddr : 0; fl4.flowi4_flags = (rt_mode & IP_VS_RT_MODE_KNOWN_NH) ? FLOWI_FLAG_KNOWN_NH : 0; @@ -364,13 +363,16 @@ err_unreach: #ifdef CONFIG_IP_VS_IPV6 static struct dst_entry * __ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr, - struct in6_addr *ret_saddr, int do_xfrm) + struct in6_addr *ret_saddr, int do_xfrm, int rt_mode) { struct dst_entry *dst; struct flowi6 fl6 = { .daddr = *daddr, }; + if (rt_mode & IP_VS_RT_MODE_KNOWN_NH) + fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH; + dst = ip6_route_output(net, NULL, &fl6); if (dst->error) goto out_err; @@ -427,7 +429,7 @@ __ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, } dst = __ip_vs_route_output_v6(net, &dest->addr.in6, &dest_dst->dst_saddr.in6, - do_xfrm); + do_xfrm, rt_mode); if (!dst) { __ip_vs_dst_set(dest, NULL, NULL, 0); spin_unlock_bh(&dest->dst_lock); @@ -435,7 +437,7 @@ __ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, goto err_unreach; } rt = (struct rt6_info *) dst; - cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0; + cookie = rt6_get_cookie(rt); __ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie); spin_unlock_bh(&dest->dst_lock); IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n", @@ -446,7 +448,8 @@ __ip_vs_get_out_rt_v6(int skb_af, struct sk_buff *skb, struct ip_vs_dest *dest, *ret_saddr = dest_dst->dst_saddr.in6; } else { noref = 0; - dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm); + dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm, + rt_mode); if (!dst) goto err_unreach; rt = (struct rt6_info *) dst; @@ -501,6 +504,13 @@ err_put: return -1; err_unreach: + /* The ip6_link_failure function requires the dev field to be set + * in order to get the net (further for the sake of fwmark + * reflection). + */ + if (!skb->dev) + skb->dev = skb_dst(skb)->dev; + dst_link_failure(skb); return -1; } @@ -519,10 +529,27 @@ static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb, if (ret == NF_ACCEPT) { nf_reset(skb); skb_forward_csum(skb); + if (!skb->sk) + skb_sender_cpu_clear(skb); } return ret; } +/* In the event of a remote destination, it's possible that we would have + * matches against an old socket (particularly a TIME-WAIT socket). This + * causes havoc down the line (ip_local_out et. al. expect regular sockets + * and invalid memory accesses will happen) so simply drop the association + * in this case. +*/ +static inline void ip_vs_drop_early_demux_sk(struct sk_buff *skb) +{ + /* If dev is set, the packet came from the LOCAL_IN callback and + * not from a local TCP socket. + */ + if (skb->dev) + skb_orphan(skb); +} + /* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */ static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb, struct ip_vs_conn *cp, int local) @@ -534,12 +561,23 @@ static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb, ip_vs_notrack(skb); else ip_vs_update_conntrack(skb, cp, 1); + + /* Remove the early_demux association unless it's bound for the + * exact same port and address on this host after translation. + */ + if (!local || cp->vport != cp->dport || + !ip_vs_addr_equal(cp->af, &cp->vaddr, &cp->daddr)) + ip_vs_drop_early_demux_sk(skb); + if (!local) { skb_forward_csum(skb); + if (!skb->sk) + skb_sender_cpu_clear(skb); NF_HOOK(pf, NF_INET_LOCAL_OUT, NULL, skb, NULL, skb_dst(skb)->dev, dst_output_sk); } else ret = NF_ACCEPT; + return ret; } @@ -553,7 +591,10 @@ static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb, if (likely(!(cp->flags & IP_VS_CONN_F_NFCT))) ip_vs_notrack(skb); if (!local) { + ip_vs_drop_early_demux_sk(skb); skb_forward_csum(skb); + if (!skb->sk) + skb_sender_cpu_clear(skb); NF_HOOK(pf, NF_INET_LOCAL_OUT, NULL, skb, NULL, skb_dst(skb)->dev, dst_output_sk); } else @@ -781,7 +822,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* From world but DNAT to loopback address? */ if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) && - ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) { + ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) { IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, 0, "ip_vs_nat_xmit_v6(): " "stopping DNAT to loopback address"); @@ -841,6 +882,8 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af, struct ipv6hdr *old_ipv6h = NULL; #endif + ip_vs_drop_early_demux_sk(skb); + if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) { new_skb = skb_realloc_headroom(skb, max_headroom); if (!new_skb) @@ -1164,7 +1207,8 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, local = __ip_vs_get_out_rt_v6(cp->af, skb, cp->dest, &cp->daddr.in6, NULL, ipvsh, 0, IP_VS_RT_MODE_LOCAL | - IP_VS_RT_MODE_NON_LOCAL); + IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_KNOWN_NH); if (local < 0) goto tx_error; if (local) { @@ -1346,7 +1390,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* From world but DNAT to loopback address? */ if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) && - ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) { + ipv6_addr_type(&cp->daddr.in6) & IPV6_ADDR_LOOPBACK) { IP_VS_DBG(1, "%s(): " "stopping DNAT to loopback %pI6\n", __func__, &cp->daddr.in6); -- cgit v1.2.3