path: root/net/ipv4
author     André Fabian Silva Delgado <emulatorman@parabola.nu>  2016-06-10 05:30:17 -0300
committer  André Fabian Silva Delgado <emulatorman@parabola.nu>  2016-06-10 05:30:17 -0300
commit     d635711daa98be86d4c7fd01499c34f566b54ccb (patch)
tree       aa5cc3760a27c3d57146498cb82fa549547de06c /net/ipv4
parent     c91265cd0efb83778f015b4d4b1129bd2cfd075e (diff)
Linux-libre 4.6.2-gnu
Diffstat (limited to 'net/ipv4')
-rw-r--r-- net/ipv4/Kconfig | 9
-rw-r--r-- net/ipv4/Makefile | 1
-rw-r--r-- net/ipv4/af_inet.c | 69
-rw-r--r-- net/ipv4/arp.c | 41
-rw-r--r-- net/ipv4/devinet.c | 66
-rw-r--r-- net/ipv4/fib_frontend.c | 6
-rw-r--r-- net/ipv4/fib_semantics.c | 2
-rw-r--r-- net/ipv4/fou.c | 58
-rw-r--r-- net/ipv4/gre_offload.c | 117
-rw-r--r-- net/ipv4/icmp.c | 5
-rw-r--r-- net/ipv4/igmp.c | 78
-rw-r--r-- net/ipv4/inet_connection_sock.c | 254
-rw-r--r-- net/ipv4/inet_diag.c | 7
-rw-r--r-- net/ipv4/inet_hashtables.c | 239
-rw-r--r-- net/ipv4/inet_lro.c | 374
-rw-r--r-- net/ipv4/ip_forward.c | 1
-rw-r--r-- net/ipv4/ip_fragment.c | 29
-rw-r--r-- net/ipv4/ip_gre.c | 84
-rw-r--r-- net/ipv4/ip_input.c | 30
-rw-r--r-- net/ipv4/ip_options.c | 14
-rw-r--r-- net/ipv4/ip_output.c | 3
-rw-r--r-- net/ipv4/ip_sockglue.c | 10
-rw-r--r-- net/ipv4/ip_tunnel.c | 78
-rw-r--r-- net/ipv4/ip_tunnel_core.c | 43
-rw-r--r-- net/ipv4/ip_vti.c | 18
-rw-r--r-- net/ipv4/ipip.c | 4
-rw-r--r-- net/ipv4/netfilter/arp_tables.c | 109
-rw-r--r-- net/ipv4/netfilter/arptable_filter.c | 44
-rw-r--r-- net/ipv4/netfilter/ip_tables.c | 111
-rw-r--r-- net/ipv4/netfilter/ipt_SYNPROXY.c | 55
-rw-r--r-- net/ipv4/netfilter/iptable_filter.c | 44
-rw-r--r-- net/ipv4/netfilter/iptable_mangle.c | 41
-rw-r--r-- net/ipv4/netfilter/iptable_nat.c | 41
-rw-r--r-- net/ipv4/netfilter/iptable_raw.c | 38
-rw-r--r-- net/ipv4/netfilter/iptable_security.c | 44
-rw-r--r-- net/ipv4/netfilter/nf_defrag_ipv4.c | 4
-rw-r--r-- net/ipv4/netfilter/nf_nat_l3proto_ipv4.c | 30
-rw-r--r-- net/ipv4/netfilter/nft_masq_ipv4.c | 7
-rw-r--r-- net/ipv4/ping.c | 11
-rw-r--r-- net/ipv4/proc.c | 2
-rw-r--r-- net/ipv4/raw.c | 4
-rw-r--r-- net/ipv4/route.c | 19
-rw-r--r-- net/ipv4/syncookies.c | 7
-rw-r--r-- net/ipv4/sysctl_net_ipv4.c | 236
-rw-r--r-- net/ipv4/tcp.c | 84
-rw-r--r-- net/ipv4/tcp_fastopen.c | 79
-rw-r--r-- net/ipv4/tcp_input.c | 180
-rw-r--r-- net/ipv4/tcp_ipv4.c | 46
-rw-r--r-- net/ipv4/tcp_metrics.c | 3
-rw-r--r-- net/ipv4/tcp_minisocks.c | 5
-rw-r--r-- net/ipv4/tcp_offload.c | 8
-rw-r--r-- net/ipv4/tcp_output.c | 32
-rw-r--r-- net/ipv4/tcp_probe.c | 8
-rw-r--r-- net/ipv4/tcp_timer.c | 23
-rw-r--r-- net/ipv4/udp.c | 41
-rw-r--r-- net/ipv4/udp_offload.c | 122
56 files changed, 1594 insertions, 1524 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 7ccd693db..eb51c43c2 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -186,6 +186,7 @@ config NET_IPGRE_DEMUX
config NET_IP_TUNNEL
tristate
+ select DST_CACHE
default n
config NET_IPGRE
@@ -405,14 +406,6 @@ config INET_XFRM_MODE_BEET
If unsure, say Y.
-config INET_LRO
- tristate "Large Receive Offload (ipv4/tcp)"
- default y
- ---help---
- Support for Large Receive Offload (ipv4/tcp).
-
- If unsure, say Y.
-
config INET_DIAG
tristate "INET: socket monitoring interface"
default y
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 62c049b64..bfa133691 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -32,7 +32,6 @@ obj-$(CONFIG_INET_ESP) += esp4.o
obj-$(CONFIG_INET_IPCOMP) += ipcomp.o
obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o
obj-$(CONFIG_INET_XFRM_MODE_BEET) += xfrm4_mode_beet.o
-obj-$(CONFIG_INET_LRO) += inet_lro.o
obj-$(CONFIG_INET_TUNNEL) += tunnel4.o
obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o
obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 5c5db6636..9e481992d 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -370,7 +370,11 @@ lookup_protocol:
*/
inet->inet_sport = htons(inet->inet_num);
/* Add to protocol hash chains. */
- sk->sk_prot->hash(sk);
+ err = sk->sk_prot->hash(sk);
+ if (err) {
+ sk_common_release(sk);
+ goto out;
+ }
}
if (sk->sk_prot->init) {
@@ -1091,12 +1095,6 @@ void inet_unregister_protosw(struct inet_protosw *p)
}
EXPORT_SYMBOL(inet_unregister_protosw);
-/*
- * Shall we try to damage output packets if routing dev changes?
- */
-
-int sysctl_ip_dynaddr __read_mostly;
-
static int inet_sk_reselect_saddr(struct sock *sk)
{
struct inet_sock *inet = inet_sk(sk);
@@ -1127,7 +1125,7 @@ static int inet_sk_reselect_saddr(struct sock *sk)
if (new_saddr == old_saddr)
return 0;
- if (sysctl_ip_dynaddr > 1) {
+ if (sock_net(sk)->ipv4.sysctl_ip_dynaddr > 1) {
pr_info("%s(): shifting inet->saddr from %pI4 to %pI4\n",
__func__, &old_saddr, &new_saddr);
}
@@ -1142,8 +1140,7 @@ static int inet_sk_reselect_saddr(struct sock *sk)
* Besides that, it does not check for connection
* uniqueness. Wait for troubles.
*/
- __sk_prot_rehash(sk);
- return 0;
+ return __sk_prot_rehash(sk);
}
int inet_sk_rebuild_header(struct sock *sk)
@@ -1183,7 +1180,7 @@ int inet_sk_rebuild_header(struct sock *sk)
* Other protocols have to map its equivalent state to TCP_SYN_SENT.
* DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme
*/
- if (!sysctl_ip_dynaddr ||
+ if (!sock_net(sk)->ipv4.sysctl_ip_dynaddr ||
sk->sk_state != TCP_SYN_SENT ||
(sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
(err = inet_sk_reselect_saddr(sk)) != 0)
@@ -1383,6 +1380,45 @@ out:
return pp;
}
+static struct sk_buff **ipip_gro_receive(struct sk_buff **head,
+ struct sk_buff *skb)
+{
+ if (NAPI_GRO_CB(skb)->encap_mark) {
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
+ }
+
+ NAPI_GRO_CB(skb)->encap_mark = 1;
+
+ return inet_gro_receive(head, skb);
+}
+
+#define SECONDS_PER_DAY 86400
+
+/* inet_current_timestamp - Return IP network timestamp
+ *
+ * Return milliseconds since midnight in network byte order.
+ */
+__be32 inet_current_timestamp(void)
+{
+ u32 secs;
+ u32 msecs;
+ struct timespec64 ts;
+
+ ktime_get_real_ts64(&ts);
+
+ /* Get secs since midnight. */
+ (void)div_u64_rem(ts.tv_sec, SECONDS_PER_DAY, &secs);
+ /* Convert to msecs. */
+ msecs = secs * MSEC_PER_SEC;
+ /* Convert nsec to msec. */
+ msecs += (u32)ts.tv_nsec / NSEC_PER_MSEC;
+
+ /* Convert to network byte order. */
+ return htonl(msecs);
+}
+EXPORT_SYMBOL(inet_current_timestamp);
+
int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
{
if (sk->sk_family == AF_INET)
@@ -1425,6 +1461,13 @@ out_unlock:
return err;
}
+static int ipip_gro_complete(struct sk_buff *skb, int nhoff)
+{
+ skb->encapsulation = 1;
+ skb_shinfo(skb)->gso_type |= SKB_GSO_IPIP;
+ return inet_gro_complete(skb, nhoff);
+}
+
int inet_ctl_sock_create(struct sock **sk, unsigned short family,
unsigned short type, unsigned char protocol,
struct net *net)
@@ -1652,8 +1695,8 @@ static struct packet_offload ip_packet_offload __read_mostly = {
static const struct net_offload ipip_offload = {
.callbacks = {
.gso_segment = inet_gso_segment,
- .gro_receive = inet_gro_receive,
- .gro_complete = inet_gro_complete,
+ .gro_receive = ipip_gro_receive,
+ .gro_complete = ipip_gro_complete,
},
};
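
The inet_current_timestamp() helper added above centralizes the "milliseconds since midnight UT" arithmetic that the ICMP timestamp handler used to open-code (see the icmp.c hunk further down, which now calls it). A minimal standalone sketch of the same computation, written as ordinary userspace C rather than the kernel implementation — the function name ip_timestamp_now is invented for illustration:

#include <stdint.h>
#include <time.h>
#include <arpa/inet.h>

#define SECONDS_PER_DAY 86400

/* Milliseconds since midnight UTC, in network byte order, mirroring
 * the computation in inet_current_timestamp() above. */
static uint32_t ip_timestamp_now(void)
{
	struct timespec ts;
	uint32_t msecs;

	clock_gettime(CLOCK_REALTIME, &ts);
	msecs  = (uint32_t)(ts.tv_sec % SECONDS_PER_DAY) * 1000u;  /* secs since midnight -> ms */
	msecs += (uint32_t)(ts.tv_nsec / 1000000L);                /* add the sub-second part */
	return htonl(msecs);
}
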
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 59b3e0e8f..c34c7544d 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -665,7 +665,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
*/
if (!in_dev)
- goto out;
+ goto out_free_skb;
arp = arp_hdr(skb);
@@ -673,7 +673,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
default:
if (arp->ar_pro != htons(ETH_P_IP) ||
htons(dev_type) != arp->ar_hrd)
- goto out;
+ goto out_free_skb;
break;
case ARPHRD_ETHER:
case ARPHRD_FDDI:
@@ -690,17 +690,17 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
if ((arp->ar_hrd != htons(ARPHRD_ETHER) &&
arp->ar_hrd != htons(ARPHRD_IEEE802)) ||
arp->ar_pro != htons(ETH_P_IP))
- goto out;
+ goto out_free_skb;
break;
case ARPHRD_AX25:
if (arp->ar_pro != htons(AX25_P_IP) ||
arp->ar_hrd != htons(ARPHRD_AX25))
- goto out;
+ goto out_free_skb;
break;
case ARPHRD_NETROM:
if (arp->ar_pro != htons(AX25_P_IP) ||
arp->ar_hrd != htons(ARPHRD_NETROM))
- goto out;
+ goto out_free_skb;
break;
}
@@ -708,7 +708,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
if (arp->ar_op != htons(ARPOP_REPLY) &&
arp->ar_op != htons(ARPOP_REQUEST))
- goto out;
+ goto out_free_skb;
/*
* Extract fields
@@ -733,7 +733,15 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
*/
if (ipv4_is_multicast(tip) ||
(!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip)))
- goto out;
+ goto out_free_skb;
+
+ /*
+ * For some 802.11 wireless deployments (and possibly other networks),
+ * there will be an ARP proxy and gratuitous ARP frames are attacks
+ * and thus should not be accepted.
+ */
+ if (sip == tip && IN_DEV_ORCONF(in_dev, DROP_GRATUITOUS_ARP))
+ goto out_free_skb;
/*
* Special case: We must set Frame Relay source Q.922 address
@@ -770,7 +778,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
!arp_ignore(in_dev, sip, tip))
arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip,
sha, dev->dev_addr, sha, reply_dst);
- goto out;
+ goto out_consume_skb;
}
if (arp->ar_op == htons(ARPOP_REQUEST) &&
@@ -795,7 +803,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
neigh_release(n);
}
}
- goto out;
+ goto out_consume_skb;
} else if (IN_DEV_FORWARD(in_dev)) {
if (addr_type == RTN_UNICAST &&
(arp_fwd_proxy(in_dev, dev, rt) ||
@@ -818,7 +826,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
in_dev->arp_parms, skb);
goto out_free_dst;
}
- goto out;
+ goto out_consume_skb;
}
}
}
@@ -868,11 +876,16 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
neigh_release(n);
}
-out:
+out_consume_skb:
consume_skb(skb);
+
out_free_dst:
dst_release(reply_dst);
- return 0;
+ return NET_RX_SUCCESS;
+
+out_free_skb:
+ kfree_skb(skb);
+ return NET_RX_DROP;
}
static void parp_redo(struct sk_buff *skb)
@@ -916,11 +929,11 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev,
consumeskb:
consume_skb(skb);
- return 0;
+ return NET_RX_SUCCESS;
freeskb:
kfree_skb(skb);
out_of_mem:
- return 0;
+ return NET_RX_DROP;
}
/*
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 0212591b0..e333bc86b 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1198,6 +1198,7 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope)
__be32 addr = 0;
struct in_device *in_dev;
struct net *net = dev_net(dev);
+ int master_idx;
rcu_read_lock();
in_dev = __in_dev_get_rcu(dev);
@@ -1218,12 +1219,33 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope)
if (addr)
goto out_unlock;
no_in_dev:
+ master_idx = l3mdev_master_ifindex_rcu(dev);
+
+ /* For VRFs, the VRF device takes the place of the loopback device,
+ * with addresses on it being preferred. Note in such cases the
+ * loopback device will be among the devices that fail the master_idx
+ * equality check in the loop below.
+ */
+ if (master_idx &&
+ (dev = dev_get_by_index_rcu(net, master_idx)) &&
+ (in_dev = __in_dev_get_rcu(dev))) {
+ for_primary_ifa(in_dev) {
+ if (ifa->ifa_scope != RT_SCOPE_LINK &&
+ ifa->ifa_scope <= scope) {
+ addr = ifa->ifa_local;
+ goto out_unlock;
+ }
+ } endfor_ifa(in_dev);
+ }
/* Not loopback addresses on loopback should be preferred
in this case. It is important that lo is the first interface
in dev_base list.
*/
for_each_netdev_rcu(net, dev) {
+ if (l3mdev_master_ifindex_rcu(dev) != master_idx)
+ continue;
+
in_dev = __in_dev_get_rcu(dev);
if (!in_dev)
continue;
@@ -1735,17 +1757,20 @@ static int inet_netconf_msgsize_devconf(int type)
{
int size = NLMSG_ALIGN(sizeof(struct netconfmsg))
+ nla_total_size(4); /* NETCONFA_IFINDEX */
+ bool all = false;
+
+ if (type == NETCONFA_ALL)
+ all = true;
- /* type -1 is used for ALL */
- if (type == -1 || type == NETCONFA_FORWARDING)
+ if (all || type == NETCONFA_FORWARDING)
size += nla_total_size(4);
- if (type == -1 || type == NETCONFA_RP_FILTER)
+ if (all || type == NETCONFA_RP_FILTER)
size += nla_total_size(4);
- if (type == -1 || type == NETCONFA_MC_FORWARDING)
+ if (all || type == NETCONFA_MC_FORWARDING)
size += nla_total_size(4);
- if (type == -1 || type == NETCONFA_PROXY_NEIGH)
+ if (all || type == NETCONFA_PROXY_NEIGH)
size += nla_total_size(4);
- if (type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
+ if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
size += nla_total_size(4);
return size;
@@ -1758,36 +1783,39 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
{
struct nlmsghdr *nlh;
struct netconfmsg *ncm;
+ bool all = false;
nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg),
flags);
if (!nlh)
return -EMSGSIZE;
+ if (type == NETCONFA_ALL)
+ all = true;
+
ncm = nlmsg_data(nlh);
ncm->ncm_family = AF_INET;
if (nla_put_s32(skb, NETCONFA_IFINDEX, ifindex) < 0)
goto nla_put_failure;
- /* type -1 is used for ALL */
- if ((type == -1 || type == NETCONFA_FORWARDING) &&
+ if ((all || type == NETCONFA_FORWARDING) &&
nla_put_s32(skb, NETCONFA_FORWARDING,
IPV4_DEVCONF(*devconf, FORWARDING)) < 0)
goto nla_put_failure;
- if ((type == -1 || type == NETCONFA_RP_FILTER) &&
+ if ((all || type == NETCONFA_RP_FILTER) &&
nla_put_s32(skb, NETCONFA_RP_FILTER,
IPV4_DEVCONF(*devconf, RP_FILTER)) < 0)
goto nla_put_failure;
- if ((type == -1 || type == NETCONFA_MC_FORWARDING) &&
+ if ((all || type == NETCONFA_MC_FORWARDING) &&
nla_put_s32(skb, NETCONFA_MC_FORWARDING,
IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0)
goto nla_put_failure;
- if ((type == -1 || type == NETCONFA_PROXY_NEIGH) &&
+ if ((all || type == NETCONFA_PROXY_NEIGH) &&
nla_put_s32(skb, NETCONFA_PROXY_NEIGH,
IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0)
goto nla_put_failure;
- if ((type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) &&
+ if ((all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) &&
nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
IPV4_DEVCONF(*devconf, IGNORE_ROUTES_WITH_LINKDOWN)) < 0)
goto nla_put_failure;
@@ -1875,14 +1903,14 @@ static int inet_netconf_get_devconf(struct sk_buff *in_skb,
}
err = -ENOBUFS;
- skb = nlmsg_new(inet_netconf_msgsize_devconf(-1), GFP_ATOMIC);
+ skb = nlmsg_new(inet_netconf_msgsize_devconf(NETCONFA_ALL), GFP_ATOMIC);
if (!skb)
goto errout;
err = inet_netconf_fill_devconf(skb, ifindex, devconf,
NETLINK_CB(in_skb).portid,
nlh->nlmsg_seq, RTM_NEWNETCONF, 0,
- -1);
+ NETCONFA_ALL);
if (err < 0) {
/* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */
WARN_ON(err == -EMSGSIZE);
@@ -1926,7 +1954,7 @@ static int inet_netconf_dump_devconf(struct sk_buff *skb,
cb->nlh->nlmsg_seq,
RTM_NEWNETCONF,
NLM_F_MULTI,
- -1) < 0) {
+ NETCONFA_ALL) < 0) {
rcu_read_unlock();
goto done;
}
@@ -1942,7 +1970,7 @@ cont:
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
RTM_NEWNETCONF, NLM_F_MULTI,
- -1) < 0)
+ NETCONFA_ALL) < 0)
goto done;
else
h++;
@@ -1953,7 +1981,7 @@ cont:
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
RTM_NEWNETCONF, NLM_F_MULTI,
- -1) < 0)
+ NETCONFA_ALL) < 0)
goto done;
else
h++;
@@ -2189,6 +2217,8 @@ static struct devinet_sysctl_table {
"igmpv3_unsolicited_report_interval"),
DEVINET_SYSCTL_RW_ENTRY(IGNORE_ROUTES_WITH_LINKDOWN,
"ignore_routes_with_linkdown"),
+ DEVINET_SYSCTL_RW_ENTRY(DROP_GRATUITOUS_ARP,
+ "drop_gratuitous_arp"),
DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"),
DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"),
@@ -2196,6 +2226,8 @@ static struct devinet_sysctl_table {
"promote_secondaries"),
DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET,
"route_localnet"),
+ DEVINET_SYSCTL_FLUSHING_ENTRY(DROP_UNICAST_IN_L2_MULTICAST,
+ "drop_unicast_in_l2_multicast"),
},
};
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 8a9246dec..63566ec54 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -904,7 +904,11 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
if (ifa->ifa_flags & IFA_F_SECONDARY) {
prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
if (!prim) {
- pr_warn("%s: bug: prim == NULL\n", __func__);
+ /* if the device has been deleted, we don't perform
+ * address promotion
+ */
+ if (!in_dev->dead)
+ pr_warn("%s: bug: prim == NULL\n", __func__);
return;
}
if (iprim && iprim != prim) {
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index d97268e8f..2b68418c7 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -975,6 +975,8 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg)
val = 65535 - 40;
if (type == RTAX_MTU && val > 65535 - 15)
val = 65535 - 15;
+ if (type == RTAX_HOPLIMIT && val > 255)
+ val = 255;
if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
return -EINVAL;
fi->fib_metrics[type - 1] = val;
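
The fib_semantics.c change extends the existing metric sanitization so that an RTAX_HOPLIMIT value supplied over netlink is clamped to the 8-bit range of the IP TTL field. A hedged standalone sketch of the clamping step (the 65535-40 clamp applies to RTAX_ADVMSS in the kernel source, though that condition line falls just outside the quoted hunk; clamp_route_metric is an invented name):

#include <stdint.h>
#include <linux/rtnetlink.h>	/* RTAX_ADVMSS, RTAX_MTU, RTAX_HOPLIMIT */

static uint32_t clamp_route_metric(int type, uint32_t val)
{
	if (type == RTAX_ADVMSS && val > 65535 - 40)
		val = 65535 - 40;	/* leave room for IP + TCP headers */
	if (type == RTAX_MTU && val > 65535 - 15)
		val = 65535 - 15;
	if (type == RTAX_HOPLIMIT && val > 255)
		val = 255;		/* TTL is an 8-bit field */
	return val;
}
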
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 976f0dcf6..a6962ccad 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -48,7 +48,7 @@ static inline struct fou *fou_from_sock(struct sock *sk)
return sk->sk_user_data;
}
-static void fou_recv_pull(struct sk_buff *skb, size_t len)
+static int fou_recv_pull(struct sk_buff *skb, size_t len)
{
struct iphdr *iph = ip_hdr(skb);
@@ -59,6 +59,7 @@ static void fou_recv_pull(struct sk_buff *skb, size_t len)
__skb_pull(skb, len);
skb_postpull_rcsum(skb, udp_hdr(skb), len);
skb_reset_transport_header(skb);
+ return iptunnel_pull_offloads(skb);
}
static int fou_udp_recv(struct sock *sk, struct sk_buff *skb)
@@ -68,9 +69,14 @@ static int fou_udp_recv(struct sock *sk, struct sk_buff *skb)
if (!fou)
return 1;
- fou_recv_pull(skb, sizeof(struct udphdr));
+ if (fou_recv_pull(skb, sizeof(struct udphdr)))
+ goto drop;
return -fou->protocol;
+
+drop:
+ kfree_skb(skb);
+ return 0;
}
static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr,
@@ -170,6 +176,9 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb)
__skb_pull(skb, sizeof(struct udphdr) + hdrlen);
skb_reset_transport_header(skb);
+ if (iptunnel_pull_offloads(skb))
+ goto drop;
+
return -guehdr->proto_ctype;
drop:
@@ -186,6 +195,17 @@ static struct sk_buff **fou_gro_receive(struct sk_buff **head,
u8 proto = NAPI_GRO_CB(skb)->proto;
const struct net_offload **offloads;
+ /* We can clear the encap_mark for FOU as we are essentially doing
+ * one of two possible things. We are either adding an L4 tunnel
+ * header to the outer L3 tunnel header, or we are simply
+ * treating the GRE tunnel header as though it is a UDP protocol
+ * specific header such as VXLAN or GENEVE.
+ */
+ NAPI_GRO_CB(skb)->encap_mark = 0;
+
+ /* Flag this frame as already having an outer encap header */
+ NAPI_GRO_CB(skb)->is_fou = 1;
+
rcu_read_lock();
offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
ops = rcu_dereference(offloads[proto]);
@@ -208,8 +228,6 @@ static int fou_gro_complete(struct sk_buff *skb, int nhoff,
int err = -ENOSYS;
const struct net_offload **offloads;
- udp_tunnel_gro_complete(skb, nhoff);
-
rcu_read_lock();
offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
ops = rcu_dereference(offloads[proto]);
@@ -218,6 +236,8 @@ static int fou_gro_complete(struct sk_buff *skb, int nhoff,
err = ops->callbacks.gro_complete(skb, nhoff);
+ skb_set_inner_mac_header(skb, nhoff);
+
out_unlock:
rcu_read_unlock();
@@ -319,8 +339,6 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head,
skb_gro_pull(skb, hdrlen);
- flush = 0;
-
for (p = *head; p; p = p->next) {
const struct guehdr *guehdr2;
@@ -345,6 +363,17 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head,
}
}
+ /* We can clear the encap_mark for GUE as we are essentially doing
+ * one of two possible things. We are either adding an L4 tunnel
+ * header to the outer L3 tunnel header, or we are simply
+ * treating the GRE tunnel header as though it is a UDP protocol
+ * specific header such as VXLAN or GENEVE.
+ */
+ NAPI_GRO_CB(skb)->encap_mark = 0;
+
+ /* Flag this frame as already having an outer encap header */
+ NAPI_GRO_CB(skb)->is_fou = 1;
+
rcu_read_lock();
offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
ops = rcu_dereference(offloads[guehdr->proto_ctype]);
@@ -352,6 +381,7 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head,
goto out_unlock;
pp = ops->callbacks.gro_receive(head, skb);
+ flush = 0;
out_unlock:
rcu_read_unlock();
@@ -384,6 +414,8 @@ static int gue_gro_complete(struct sk_buff *skb, int nhoff,
err = ops->callbacks.gro_complete(skb, nhoff + guehlen);
+ skb_set_inner_mac_header(skb, nhoff + guehlen);
+
out_unlock:
rcu_read_unlock();
return err;
@@ -774,7 +806,6 @@ static void fou_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e,
uh->dest = e->dport;
uh->source = sport;
uh->len = htons(skb->len);
- uh->check = 0;
udp_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM), skb,
fl4->saddr, fl4->daddr, skb->len);
@@ -784,11 +815,11 @@ static void fou_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e,
int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
u8 *protocol, struct flowi4 *fl4)
{
- bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM);
- int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
+ int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? SKB_GSO_UDP_TUNNEL_CSUM :
+ SKB_GSO_UDP_TUNNEL;
__be16 sport;
- skb = iptunnel_handle_offloads(skb, csum, type);
+ skb = iptunnel_handle_offloads(skb, type);
if (IS_ERR(skb))
return PTR_ERR(skb);
@@ -804,8 +835,8 @@ EXPORT_SYMBOL(fou_build_header);
int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
u8 *protocol, struct flowi4 *fl4)
{
- bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM);
- int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
+ int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? SKB_GSO_UDP_TUNNEL_CSUM :
+ SKB_GSO_UDP_TUNNEL;
struct guehdr *guehdr;
size_t hdrlen, optlen = 0;
__be16 sport;
@@ -814,7 +845,6 @@ int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
if ((e->flags & TUNNEL_ENCAP_FLAG_REMCSUM) &&
skb->ip_summed == CHECKSUM_PARTIAL) {
- csum = false;
optlen += GUE_PLEN_REMCSUM;
type |= SKB_GSO_TUNNEL_REMCSUM;
need_priv = true;
@@ -822,7 +852,7 @@ int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
optlen += need_priv ? GUE_LEN_PRIV : 0;
- skb = iptunnel_handle_offloads(skb, csum, type);
+ skb = iptunnel_handle_offloads(skb, type);
if (IS_ERR(skb))
return PTR_ERR(skb);
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index 5a8ee3282..6a5bd4317 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -18,15 +18,13 @@
static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
netdev_features_t features)
{
+ int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
struct sk_buff *segs = ERR_PTR(-EINVAL);
- netdev_features_t enc_features;
- int ghl;
- struct gre_base_hdr *greh;
u16 mac_offset = skb->mac_header;
- int mac_len = skb->mac_len;
__be16 protocol = skb->protocol;
- int tnl_hlen;
- bool csum;
+ u16 mac_len = skb->mac_len;
+ int gre_offset, outer_hlen;
+ bool need_csum, ufo;
if (unlikely(skb_shinfo(skb)->gso_type &
~(SKB_GSO_TCPV4 |
@@ -43,74 +41,75 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
if (!skb->encapsulation)
goto out;
- if (unlikely(!pskb_may_pull(skb, sizeof(*greh))))
+ if (unlikely(tnl_hlen < sizeof(struct gre_base_hdr)))
goto out;
- greh = (struct gre_base_hdr *)skb_transport_header(skb);
-
- ghl = skb_inner_mac_header(skb) - skb_transport_header(skb);
- if (unlikely(ghl < sizeof(*greh)))
+ if (unlikely(!pskb_may_pull(skb, tnl_hlen)))
goto out;
- csum = !!(greh->flags & GRE_CSUM);
- if (csum)
- skb->encap_hdr_csum = 1;
-
/* setup inner skb. */
- skb->protocol = greh->protocol;
skb->encapsulation = 0;
-
- if (unlikely(!pskb_may_pull(skb, ghl)))
- goto out;
-
- __skb_pull(skb, ghl);
+ SKB_GSO_CB(skb)->encap_level = 0;
+ __skb_pull(skb, tnl_hlen);
skb_reset_mac_header(skb);
skb_set_network_header(skb, skb_inner_network_offset(skb));
skb->mac_len = skb_inner_network_offset(skb);
+ skb->protocol = skb->inner_protocol;
+
+ need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_GRE_CSUM);
+ skb->encap_hdr_csum = need_csum;
+
+ ufo = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
+
+ features &= skb->dev->hw_enc_features;
+
+ /* The only checksum offload we care about from here on out is the
+ * outer one so strip the existing checksum feature flags based
+ * on the fact that we will be computing our checksum in software.
+ */
+ if (ufo) {
+ features &= ~NETIF_F_CSUM_MASK;
+ if (!need_csum)
+ features |= NETIF_F_HW_CSUM;
+ }
/* segment inner packet. */
- enc_features = skb->dev->hw_enc_features & features;
- segs = skb_mac_gso_segment(skb, enc_features);
+ segs = skb_mac_gso_segment(skb, features);
if (IS_ERR_OR_NULL(segs)) {
- skb_gso_error_unwind(skb, protocol, ghl, mac_offset, mac_len);
+ skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset,
+ mac_len);
goto out;
}
+ outer_hlen = skb_tnl_header_len(skb);
+ gre_offset = outer_hlen - tnl_hlen;
skb = segs;
- tnl_hlen = skb_tnl_header_len(skb);
do {
- __skb_push(skb, ghl);
- if (csum) {
- __be32 *pcsum;
-
- if (skb_has_shared_frag(skb)) {
- int err;
-
- err = __skb_linearize(skb);
- if (err) {
- kfree_skb_list(segs);
- segs = ERR_PTR(err);
- goto out;
- }
- }
+ struct gre_base_hdr *greh;
+ __be32 *pcsum;
- skb_reset_transport_header(skb);
-
- greh = (struct gre_base_hdr *)
- skb_transport_header(skb);
- pcsum = (__be32 *)(greh + 1);
- *pcsum = 0;
- *(__sum16 *)pcsum = gso_make_checksum(skb, 0);
+ /* Set up inner headers if we are offloading inner checksum */
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ skb_reset_inner_headers(skb);
+ skb->encapsulation = 1;
}
- __skb_push(skb, tnl_hlen - ghl);
- skb_reset_inner_headers(skb);
- skb->encapsulation = 1;
+ skb->mac_len = mac_len;
+ skb->protocol = protocol;
+ __skb_push(skb, outer_hlen);
skb_reset_mac_header(skb);
skb_set_network_header(skb, mac_len);
- skb->mac_len = mac_len;
- skb->protocol = protocol;
+ skb_set_transport_header(skb, gre_offset);
+
+ if (!need_csum)
+ continue;
+
+ greh = (struct gre_base_hdr *)skb_transport_header(skb);
+ pcsum = (__be32 *)(greh + 1);
+
+ *pcsum = 0;
+ *(__sum16 *)pcsum = gso_make_checksum(skb, 0);
} while ((skb = skb->next));
out:
return segs;
@@ -128,6 +127,11 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head,
struct packet_offload *ptype;
__be16 type;
+ if (NAPI_GRO_CB(skb)->encap_mark)
+ goto out;
+
+ NAPI_GRO_CB(skb)->encap_mark = 1;
+
off = skb_gro_offset(skb);
hlen = off + sizeof(*greh);
greh = skb_gro_header_fast(skb, off);
@@ -146,6 +150,14 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head,
if ((greh->flags & ~(GRE_KEY|GRE_CSUM)) != 0)
goto out;
+ /* We can only support GRE_CSUM if we can track the location of
+ * the GRE header. In the case of FOU/GUE we cannot because the
+ * outer UDP header displaces the GRE header leaving us in a state
+ * of limbo.
+ */
+ if ((greh->flags & GRE_CSUM) && NAPI_GRO_CB(skb)->is_fou)
+ goto out;
+
type = greh->protocol;
rcu_read_lock();
@@ -177,8 +189,6 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head,
null_compute_pseudo);
}
- flush = 0;
-
for (p = *head; p; p = p->next) {
const struct gre_base_hdr *greh2;
@@ -215,6 +225,7 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head,
skb_gro_postpull_rcsum(skb, greh, grehlen);
pp = ptype->callbacks.gro_receive(head, skb);
+ flush = 0;
out_unlock:
rcu_read_unlock();
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 36e26977c..633348977 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -931,7 +931,6 @@ static bool icmp_echo(struct sk_buff *skb)
*/
static bool icmp_timestamp(struct sk_buff *skb)
{
- struct timespec tv;
struct icmp_bxm icmp_param;
/*
* Too short.
@@ -942,9 +941,7 @@ static bool icmp_timestamp(struct sk_buff *skb)
/*
* Fill in the current time as ms since midnight UT:
*/
- getnstimeofday(&tv);
- icmp_param.data.times[1] = htonl((tv.tv_sec % 86400) * MSEC_PER_SEC +
- tv.tv_nsec / NSEC_PER_MSEC);
+ icmp_param.data.times[1] = inet_current_timestamp();
icmp_param.data.times[2] = icmp_param.data.times[1];
if (skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4))
BUG();
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index b3086cf27..9b4ca87f7 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -107,12 +107,6 @@
#include <linux/seq_file.h>
#endif
-#define IP_MAX_MEMBERSHIPS 20
-#define IP_MAX_MSF 10
-
-/* IGMP reports for link-local multicast groups are enabled by default */
-int sysctl_igmp_llm_reports __read_mostly = 1;
-
#ifdef CONFIG_IP_MULTICAST
/* Parameter names and values are taken from igmp-v2-06 draft */
@@ -432,6 +426,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
int type, int gdeleted, int sdeleted)
{
struct net_device *dev = pmc->interface->dev;
+ struct net *net = dev_net(dev);
struct igmpv3_report *pih;
struct igmpv3_grec *pgr = NULL;
struct ip_sf_list *psf, *psf_next, *psf_prev, **psf_list;
@@ -439,7 +434,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
if (pmc->multiaddr == IGMP_ALL_HOSTS)
return skb;
- if (ipv4_is_local_multicast(pmc->multiaddr) && !sysctl_igmp_llm_reports)
+ if (ipv4_is_local_multicast(pmc->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports)
return skb;
isquery = type == IGMPV3_MODE_IS_INCLUDE ||
@@ -542,6 +537,7 @@ empty_source:
static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc)
{
struct sk_buff *skb = NULL;
+ struct net *net = dev_net(in_dev->dev);
int type;
if (!pmc) {
@@ -550,7 +546,7 @@ static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc)
if (pmc->multiaddr == IGMP_ALL_HOSTS)
continue;
if (ipv4_is_local_multicast(pmc->multiaddr) &&
- !sysctl_igmp_llm_reports)
+ !net->ipv4.sysctl_igmp_llm_reports)
continue;
spin_lock_bh(&pmc->lock);
if (pmc->sfcount[MCAST_EXCLUDE])
@@ -686,7 +682,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
if (type == IGMPV3_HOST_MEMBERSHIP_REPORT)
return igmpv3_send_report(in_dev, pmc);
- if (ipv4_is_local_multicast(group) && !sysctl_igmp_llm_reports)
+ if (ipv4_is_local_multicast(group) && !net->ipv4.sysctl_igmp_llm_reports)
return 0;
if (type == IGMP_HOST_LEAVE_MESSAGE)
@@ -765,9 +761,10 @@ static void igmp_ifc_timer_expire(unsigned long data)
static void igmp_ifc_event(struct in_device *in_dev)
{
+ struct net *net = dev_net(in_dev->dev);
if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev))
return;
- in_dev->mr_ifc_count = in_dev->mr_qrv ?: sysctl_igmp_qrv;
+ in_dev->mr_ifc_count = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
igmp_ifc_start_timer(in_dev, 1);
}
@@ -857,12 +854,13 @@ static int igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs)
static bool igmp_heard_report(struct in_device *in_dev, __be32 group)
{
struct ip_mc_list *im;
+ struct net *net = dev_net(in_dev->dev);
/* Timers are only set for non-local groups */
if (group == IGMP_ALL_HOSTS)
return false;
- if (ipv4_is_local_multicast(group) && !sysctl_igmp_llm_reports)
+ if (ipv4_is_local_multicast(group) && !net->ipv4.sysctl_igmp_llm_reports)
return false;
rcu_read_lock();
@@ -886,6 +884,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
__be32 group = ih->group;
int max_delay;
int mark = 0;
+ struct net *net = dev_net(in_dev->dev);
if (len == 8) {
@@ -971,7 +970,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
if (im->multiaddr == IGMP_ALL_HOSTS)
continue;
if (ipv4_is_local_multicast(im->multiaddr) &&
- !sysctl_igmp_llm_reports)
+ !net->ipv4.sysctl_igmp_llm_reports)
continue;
spin_lock_bh(&im->lock);
if (im->tm_running)
@@ -1087,6 +1086,7 @@ static void ip_mc_filter_del(struct in_device *in_dev, __be32 addr)
static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im)
{
struct ip_mc_list *pmc;
+ struct net *net = dev_net(in_dev->dev);
/* this is an "ip_mc_list" for convenience; only the fields below
* are actually used. In particular, the refcnt and users are not
@@ -1101,7 +1101,7 @@ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im)
pmc->interface = im->interface;
in_dev_hold(in_dev);
pmc->multiaddr = im->multiaddr;
- pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv;
+ pmc->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
pmc->sfmode = im->sfmode;
if (pmc->sfmode == MCAST_INCLUDE) {
struct ip_sf_list *psf;
@@ -1186,6 +1186,7 @@ static void igmp_group_dropped(struct ip_mc_list *im)
{
struct in_device *in_dev = im->interface;
#ifdef CONFIG_IP_MULTICAST
+ struct net *net = dev_net(in_dev->dev);
int reporter;
#endif
@@ -1197,7 +1198,7 @@ static void igmp_group_dropped(struct ip_mc_list *im)
#ifdef CONFIG_IP_MULTICAST
if (im->multiaddr == IGMP_ALL_HOSTS)
return;
- if (ipv4_is_local_multicast(im->multiaddr) && !sysctl_igmp_llm_reports)
+ if (ipv4_is_local_multicast(im->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports)
return;
reporter = im->reporter;
@@ -1222,6 +1223,9 @@ static void igmp_group_dropped(struct ip_mc_list *im)
static void igmp_group_added(struct ip_mc_list *im)
{
struct in_device *in_dev = im->interface;
+#ifdef CONFIG_IP_MULTICAST
+ struct net *net = dev_net(in_dev->dev);
+#endif
if (im->loaded == 0) {
im->loaded = 1;
@@ -1231,7 +1235,7 @@ static void igmp_group_added(struct ip_mc_list *im)
#ifdef CONFIG_IP_MULTICAST
if (im->multiaddr == IGMP_ALL_HOSTS)
return;
- if (ipv4_is_local_multicast(im->multiaddr) && !sysctl_igmp_llm_reports)
+ if (ipv4_is_local_multicast(im->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports)
return;
if (in_dev->dead)
@@ -1244,7 +1248,7 @@ static void igmp_group_added(struct ip_mc_list *im)
}
/* else, v3 */
- im->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv;
+ im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
igmp_ifc_event(in_dev);
#endif
}
@@ -1313,6 +1317,9 @@ static void ip_mc_hash_remove(struct in_device *in_dev,
void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
{
struct ip_mc_list *im;
+#ifdef CONFIG_IP_MULTICAST
+ struct net *net = dev_net(in_dev->dev);
+#endif
ASSERT_RTNL();
@@ -1339,7 +1346,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
spin_lock_init(&im->lock);
#ifdef CONFIG_IP_MULTICAST
setup_timer(&im->timer, igmp_timer_expire, (unsigned long)im);
- im->unsolicit_count = sysctl_igmp_qrv;
+ im->unsolicit_count = net->ipv4.sysctl_igmp_qrv;
#endif
im->next_rcu = in_dev->mc_list;
@@ -1532,6 +1539,7 @@ static void ip_mc_rejoin_groups(struct in_device *in_dev)
#ifdef CONFIG_IP_MULTICAST
struct ip_mc_list *im;
int type;
+ struct net *net = dev_net(in_dev->dev);
ASSERT_RTNL();
@@ -1539,7 +1547,7 @@ static void ip_mc_rejoin_groups(struct in_device *in_dev)
if (im->multiaddr == IGMP_ALL_HOSTS)
continue;
if (ipv4_is_local_multicast(im->multiaddr) &&
- !sysctl_igmp_llm_reports)
+ !net->ipv4.sysctl_igmp_llm_reports)
continue;
/* a failover is happening and switches
@@ -1638,6 +1646,9 @@ void ip_mc_down(struct in_device *in_dev)
void ip_mc_init_dev(struct in_device *in_dev)
{
+#ifdef CONFIG_IP_MULTICAST
+ struct net *net = dev_net(in_dev->dev);
+#endif
ASSERT_RTNL();
#ifdef CONFIG_IP_MULTICAST
@@ -1645,7 +1656,7 @@ void ip_mc_init_dev(struct in_device *in_dev)
(unsigned long)in_dev);
setup_timer(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire,
(unsigned long)in_dev);
- in_dev->mr_qrv = sysctl_igmp_qrv;
+ in_dev->mr_qrv = net->ipv4.sysctl_igmp_qrv;
#endif
spin_lock_init(&in_dev->mc_tomb_lock);
@@ -1656,11 +1667,14 @@ void ip_mc_init_dev(struct in_device *in_dev)
void ip_mc_up(struct in_device *in_dev)
{
struct ip_mc_list *pmc;
+#ifdef CONFIG_IP_MULTICAST
+ struct net *net = dev_net(in_dev->dev);
+#endif
ASSERT_RTNL();
#ifdef CONFIG_IP_MULTICAST
- in_dev->mr_qrv = sysctl_igmp_qrv;
+ in_dev->mr_qrv = net->ipv4.sysctl_igmp_qrv;
#endif
ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS);
@@ -1726,11 +1740,6 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
/*
* Join a socket to a group
*/
-int sysctl_igmp_max_memberships __read_mostly = IP_MAX_MEMBERSHIPS;
-int sysctl_igmp_max_msf __read_mostly = IP_MAX_MSF;
-#ifdef CONFIG_IP_MULTICAST
-int sysctl_igmp_qrv __read_mostly = IGMP_QUERY_ROBUSTNESS_VARIABLE;
-#endif
static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode,
__be32 *psfsrc)
@@ -1755,6 +1764,7 @@ static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode,
if (!psf->sf_count[MCAST_INCLUDE] && !psf->sf_count[MCAST_EXCLUDE]) {
#ifdef CONFIG_IP_MULTICAST
struct in_device *in_dev = pmc->interface;
+ struct net *net = dev_net(in_dev->dev);
#endif
/* no more filters for this source */
@@ -1765,7 +1775,7 @@ static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode,
#ifdef CONFIG_IP_MULTICAST
if (psf->sf_oldin &&
!IGMP_V1_SEEN(in_dev) && !IGMP_V2_SEEN(in_dev)) {
- psf->sf_crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv;
+ psf->sf_crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
psf->sf_next = pmc->tomb;
pmc->tomb = psf;
rv = 1;
@@ -1823,12 +1833,13 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
pmc->sfcount[MCAST_INCLUDE]) {
#ifdef CONFIG_IP_MULTICAST
struct ip_sf_list *psf;
+ struct net *net = dev_net(in_dev->dev);
#endif
/* filter mode change */
pmc->sfmode = MCAST_INCLUDE;
#ifdef CONFIG_IP_MULTICAST
- pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv;
+ pmc->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
in_dev->mr_ifc_count = pmc->crcount;
for (psf = pmc->sources; psf; psf = psf->sf_next)
psf->sf_crcount = 0;
@@ -1995,6 +2006,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
} else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) {
#ifdef CONFIG_IP_MULTICAST
struct ip_sf_list *psf;
+ struct net *net = dev_net(pmc->interface->dev);
in_dev = pmc->interface;
#endif
@@ -2006,7 +2018,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
#ifdef CONFIG_IP_MULTICAST
/* else no filters; keep old mode for reports */
- pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv;
+ pmc->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
in_dev->mr_ifc_count = pmc->crcount;
for (psf = pmc->sources; psf; psf = psf->sf_next)
psf->sf_crcount = 0;
@@ -2073,7 +2085,7 @@ int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr)
count++;
}
err = -ENOBUFS;
- if (count >= sysctl_igmp_max_memberships)
+ if (count >= net->ipv4.sysctl_igmp_max_memberships)
goto done;
iml = sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL);
if (!iml)
@@ -2245,7 +2257,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
}
/* else, add a new source to the filter */
- if (psl && psl->sl_count >= sysctl_igmp_max_msf) {
+ if (psl && psl->sl_count >= net->ipv4.sysctl_igmp_max_msf) {
err = -ENOBUFS;
goto done;
}
@@ -2918,6 +2930,12 @@ static int __net_init igmp_net_init(struct net *net)
goto out_sock;
}
+ /* Sysctl initialization */
+ net->ipv4.sysctl_igmp_max_memberships = 20;
+ net->ipv4.sysctl_igmp_max_msf = 10;
+ /* IGMP reports for link-local multicast groups are enabled by default */
+ net->ipv4.sysctl_igmp_llm_reports = 1;
+ net->ipv4.sysctl_igmp_qrv = 2;
return 0;
out_sock:
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 641489148..bc5196ea1 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -24,6 +24,7 @@
#include <net/tcp_states.h>
#include <net/xfrm.h>
#include <net/tcp.h>
+#include <net/sock_reuseport.h>
#ifdef INET_CSK_DEBUG
const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
@@ -67,7 +68,8 @@ int inet_csk_bind_conflict(const struct sock *sk,
if ((!reuse || !sk2->sk_reuse ||
sk2->sk_state == TCP_LISTEN) &&
(!reuseport || !sk2->sk_reuseport ||
- (sk2->sk_state != TCP_TIME_WAIT &&
+ rcu_access_pointer(sk->sk_reuseport_cb) ||
+ (sk2->sk_state != TCP_TIME_WAIT &&
!uid_eq(uid, sock_i_uid(sk2))))) {
if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
@@ -89,161 +91,154 @@ EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);
/* Obtain a reference to a local port for the given sock,
* if snum is zero it means select any available local port.
+ * We try to allocate an odd port (and leave even ports for connect())
*/
int inet_csk_get_port(struct sock *sk, unsigned short snum)
{
- struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+ bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
+ struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
+ int ret = 1, attempts = 5, port = snum;
+ int smallest_size = -1, smallest_port;
struct inet_bind_hashbucket *head;
- struct inet_bind_bucket *tb;
- int ret, attempts = 5;
struct net *net = sock_net(sk);
- int smallest_size = -1, smallest_rover;
+ int i, low, high, attempt_half;
+ struct inet_bind_bucket *tb;
kuid_t uid = sock_i_uid(sk);
- int attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
+ u32 remaining, offset;
- local_bh_disable();
- if (!snum) {
- int remaining, rover, low, high;
+ if (port) {
+have_port:
+ head = &hinfo->bhash[inet_bhashfn(net, port,
+ hinfo->bhash_size)];
+ spin_lock_bh(&head->lock);
+ inet_bind_bucket_for_each(tb, &head->chain)
+ if (net_eq(ib_net(tb), net) && tb->port == port)
+ goto tb_found;
+ goto tb_not_found;
+ }
again:
- inet_get_local_port_range(net, &low, &high);
- if (attempt_half) {
- int half = low + ((high - low) >> 1);
-
- if (attempt_half == 1)
- high = half;
- else
- low = half;
- }
- remaining = (high - low) + 1;
- smallest_rover = rover = prandom_u32() % remaining + low;
-
- smallest_size = -1;
- do {
- if (inet_is_local_reserved_port(net, rover))
- goto next_nolock;
- head = &hashinfo->bhash[inet_bhashfn(net, rover,
- hashinfo->bhash_size)];
- spin_lock(&head->lock);
- inet_bind_bucket_for_each(tb, &head->chain)
- if (net_eq(ib_net(tb), net) && tb->port == rover) {
- if (((tb->fastreuse > 0 &&
- sk->sk_reuse &&
- sk->sk_state != TCP_LISTEN) ||
- (tb->fastreuseport > 0 &&
- sk->sk_reuseport &&
- uid_eq(tb->fastuid, uid))) &&
- (tb->num_owners < smallest_size || smallest_size == -1)) {
- smallest_size = tb->num_owners;
- smallest_rover = rover;
- }
- if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
- snum = rover;
- goto tb_found;
- }
- goto next;
+ attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
+other_half_scan:
+ inet_get_local_port_range(net, &low, &high);
+ high++; /* [32768, 60999] -> [32768, 61000[ */
+ if (high - low < 4)
+ attempt_half = 0;
+ if (attempt_half) {
+ int half = low + (((high - low) >> 2) << 1);
+
+ if (attempt_half == 1)
+ high = half;
+ else
+ low = half;
+ }
+ remaining = high - low;
+ if (likely(remaining > 1))
+ remaining &= ~1U;
+
+ offset = prandom_u32() % remaining;
+ /* __inet_hash_connect() favors ports having @low parity
+ * We do the opposite to not pollute connect() users.
+ */
+ offset |= 1U;
+ smallest_size = -1;
+ smallest_port = low; /* avoid compiler warning */
+
+other_parity_scan:
+ port = low + offset;
+ for (i = 0; i < remaining; i += 2, port += 2) {
+ if (unlikely(port >= high))
+ port -= remaining;
+ if (inet_is_local_reserved_port(net, port))
+ continue;
+ head = &hinfo->bhash[inet_bhashfn(net, port,
+ hinfo->bhash_size)];
+ spin_lock_bh(&head->lock);
+ inet_bind_bucket_for_each(tb, &head->chain)
+ if (net_eq(ib_net(tb), net) && tb->port == port) {
+ if (((tb->fastreuse > 0 && reuse) ||
+ (tb->fastreuseport > 0 &&
+ sk->sk_reuseport &&
+ !rcu_access_pointer(sk->sk_reuseport_cb) &&
+ uid_eq(tb->fastuid, uid))) &&
+ (tb->num_owners < smallest_size || smallest_size == -1)) {
+ smallest_size = tb->num_owners;
+ smallest_port = port;
}
- break;
- next:
- spin_unlock(&head->lock);
- next_nolock:
- if (++rover > high)
- rover = low;
- } while (--remaining > 0);
-
- /* Exhausted local port range during search? It is not
- * possible for us to be holding one of the bind hash
- * locks if this test triggers, because if 'remaining'
- * drops to zero, we broke out of the do/while loop at
- * the top level, not from the 'break;' statement.
- */
- ret = 1;
- if (remaining <= 0) {
- if (smallest_size != -1) {
- snum = smallest_rover;
- goto have_snum;
- }
- if (attempt_half == 1) {
- /* OK we now try the upper half of the range */
- attempt_half = 2;
- goto again;
+ if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false))
+ goto tb_found;
+ goto next_port;
}
- goto fail;
- }
- /* OK, here is the one we will use. HEAD is
- * non-NULL and we hold it's mutex.
- */
- snum = rover;
- } else {
-have_snum:
- head = &hashinfo->bhash[inet_bhashfn(net, snum,
- hashinfo->bhash_size)];
- spin_lock(&head->lock);
- inet_bind_bucket_for_each(tb, &head->chain)
- if (net_eq(ib_net(tb), net) && tb->port == snum)
- goto tb_found;
+ goto tb_not_found;
+next_port:
+ spin_unlock_bh(&head->lock);
+ cond_resched();
+ }
+
+ if (smallest_size != -1) {
+ port = smallest_port;
+ goto have_port;
}
- tb = NULL;
- goto tb_not_found;
+ offset--;
+ if (!(offset & 1))
+ goto other_parity_scan;
+
+ if (attempt_half == 1) {
+ /* OK we now try the upper half of the range */
+ attempt_half = 2;
+ goto other_half_scan;
+ }
+ return ret;
+
+tb_not_found:
+ tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
+ net, head, port);
+ if (!tb)
+ goto fail_unlock;
tb_found:
if (!hlist_empty(&tb->owners)) {
if (sk->sk_reuse == SK_FORCE_REUSE)
goto success;
- if (((tb->fastreuse > 0 &&
- sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
+ if (((tb->fastreuse > 0 && reuse) ||
(tb->fastreuseport > 0 &&
+ !rcu_access_pointer(sk->sk_reuseport_cb) &&
sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
- smallest_size == -1) {
+ smallest_size == -1)
goto success;
- } else {
- ret = 1;
- if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
- if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
- (tb->fastreuseport > 0 &&
- sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
- smallest_size != -1 && --attempts >= 0) {
- spin_unlock(&head->lock);
- goto again;
- }
-
- goto fail_unlock;
+ if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
+ if ((reuse ||
+ (tb->fastreuseport > 0 &&
+ sk->sk_reuseport &&
+ !rcu_access_pointer(sk->sk_reuseport_cb) &&
+ uid_eq(tb->fastuid, uid))) &&
+ smallest_size != -1 && --attempts >= 0) {
+ spin_unlock_bh(&head->lock);
+ goto again;
}
+ goto fail_unlock;
}
- }
-tb_not_found:
- ret = 1;
- if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
- net, head, snum)) == NULL)
- goto fail_unlock;
- if (hlist_empty(&tb->owners)) {
- if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
- tb->fastreuse = 1;
- else
+ if (!reuse)
tb->fastreuse = 0;
+ if (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))
+ tb->fastreuseport = 0;
+ } else {
+ tb->fastreuse = reuse;
if (sk->sk_reuseport) {
tb->fastreuseport = 1;
tb->fastuid = uid;
- } else
- tb->fastreuseport = 0;
- } else {
- if (tb->fastreuse &&
- (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
- tb->fastreuse = 0;
- if (tb->fastreuseport &&
- (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid)))
+ } else {
tb->fastreuseport = 0;
+ }
}
success:
if (!inet_csk(sk)->icsk_bind_hash)
- inet_bind_hash(sk, tb, snum);
+ inet_bind_hash(sk, tb, port);
WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
ret = 0;
fail_unlock:
- spin_unlock(&head->lock);
-fail:
- local_bh_enable();
+ spin_unlock_bh(&head->lock);
return ret;
}
EXPORT_SYMBOL_GPL(inet_csk_get_port);
@@ -482,10 +477,6 @@ EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);
#define AF_INET_FAMILY(fam) true
#endif
-/* Only thing we need from tcp.h */
-extern int sysctl_tcp_synack_retries;
-
-
/* Decide when to expire the request and when to resend SYN-ACK */
static inline void syn_ack_recalc(struct request_sock *req, const int thresh,
const int max_retries,
@@ -557,6 +548,7 @@ static void reqsk_timer_handler(unsigned long data)
{
struct request_sock *req = (struct request_sock *)data;
struct sock *sk_listener = req->rsk_listener;
+ struct net *net = sock_net(sk_listener);
struct inet_connection_sock *icsk = inet_csk(sk_listener);
struct request_sock_queue *queue = &icsk->icsk_accept_queue;
int qlen, expire = 0, resend = 0;
@@ -566,7 +558,7 @@ static void reqsk_timer_handler(unsigned long data)
if (sk_state_load(sk_listener) != TCP_LISTEN)
goto drop;
- max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
+ max_retries = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_synack_retries;
thresh = max_retries;
/* Normally all the openreqs are young and become mature
* (i.e. converted to established socket) for first timeout.
@@ -737,6 +729,7 @@ int inet_csk_listen_start(struct sock *sk, int backlog)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct inet_sock *inet = inet_sk(sk);
+ int err = -EADDRINUSE;
reqsk_queue_alloc(&icsk->icsk_accept_queue);
@@ -754,13 +747,14 @@ int inet_csk_listen_start(struct sock *sk, int backlog)
inet->inet_sport = htons(inet->inet_num);
sk_dst_reset(sk);
- sk->sk_prot->hash(sk);
+ err = sk->sk_prot->hash(sk);
- return 0;
+ if (likely(!err))
+ return 0;
}
sk->sk_state = TCP_CLOSE;
- return -EADDRINUSE;
+ return err;
}
EXPORT_SYMBOL_GPL(inet_csk_listen_start);
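
The rewritten inet_csk_get_port() above now walks the local port range in steps of two, preferring odd port numbers and leaving even ones to __inet_hash_connect() (which performs the mirror-image even-parity scan in the inet_hashtables.c hunks below), so explicit bind() users and connect()'s ephemeral-port allocation tend not to contend for the same buckets. A rough userspace sketch of that parity-restricted scan, assuming try_port() as a placeholder for the real bind-bucket lookup and conflict check and assuming an even range start (true for the default 32768..60999 range):

#include <stdlib.h>

/* Placeholder for the real bucket lookup + bind_conflict() check;
 * returns non-zero when the port can be used. */
static int try_port(int port) { (void)port; return 0; }

/* Scan [low, high] visiting only ports of the requested parity,
 * starting at a random offset and wrapping around once. */
static int scan_ports(int low, int high, int want_odd)
{
	int span = high + 1 - low;	/* size of the inclusive range */
	int offset, i, port;

	if (span > 1)
		span &= ~1;		/* even span keeps the parity stable while stepping by 2 */

	offset = rand() % span;
	offset = want_odd ? (offset | 1) : (offset & ~1);	/* odd offsets -> odd ports when low is even */

	for (i = 0, port = low + offset; i < span; i += 2, port += 2) {
		if (port >= low + span)
			port -= span;	/* wrap around inside the range */
		if (try_port(port))
			return port;
	}
	return -1;			/* no free port of this parity */
}

In this sketch, bind()-style allocation would pass want_odd = 1 and connect()-style allocation want_odd = 0; the kernel additionally falls back to the other parity and then to the other half of the range, as the goto labels in the hunk above show.
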
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 6029157a1..5fdb02f55 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -357,18 +357,18 @@ struct sock *inet_diag_find_one_icsk(struct net *net,
struct sock *sk;
if (req->sdiag_family == AF_INET)
- sk = inet_lookup(net, hashinfo, req->id.idiag_dst[0],
+ sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[0],
req->id.idiag_dport, req->id.idiag_src[0],
req->id.idiag_sport, req->id.idiag_if);
#if IS_ENABLED(CONFIG_IPV6)
else if (req->sdiag_family == AF_INET6) {
if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) &&
ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src))
- sk = inet_lookup(net, hashinfo, req->id.idiag_dst[3],
+ sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[3],
req->id.idiag_dport, req->id.idiag_src[3],
req->id.idiag_sport, req->id.idiag_if);
else
- sk = inet6_lookup(net, hashinfo,
+ sk = inet6_lookup(net, hashinfo, NULL, 0,
(struct in6_addr *)req->id.idiag_dst,
req->id.idiag_dport,
(struct in6_addr *)req->id.idiag_src,
@@ -879,6 +879,7 @@ next_normal:
}
spin_unlock_bh(lock);
+ cond_resched();
}
done:
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index ccc598079..0d9e9d7bb 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -20,10 +20,12 @@
#include <linux/wait.h>
#include <linux/vmalloc.h>
+#include <net/addrconf.h>
#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
#include <net/secure_seq.h>
#include <net/ip.h>
+#include <net/sock_reuseport.h>
static u32 inet_ehashfn(const struct net *net, const __be32 laddr,
const __u16 lport, const __be32 faddr,
@@ -205,6 +207,7 @@ static inline int compute_score(struct sock *sk, struct net *net,
struct sock *__inet_lookup_listener(struct net *net,
struct inet_hashinfo *hashinfo,
+ struct sk_buff *skb, int doff,
const __be32 saddr, __be16 sport,
const __be32 daddr, const unsigned short hnum,
const int dif)
@@ -214,6 +217,7 @@ struct sock *__inet_lookup_listener(struct net *net,
unsigned int hash = inet_lhashfn(net, hnum);
struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
int score, hiscore, matches = 0, reuseport = 0;
+ bool select_ok = true;
u32 phash = 0;
rcu_read_lock();
@@ -229,6 +233,15 @@ begin:
if (reuseport) {
phash = inet_ehashfn(net, daddr, hnum,
saddr, sport);
+ if (select_ok) {
+ struct sock *sk2;
+ sk2 = reuseport_select_sock(sk, phash,
+ skb, doff);
+ if (sk2) {
+ result = sk2;
+ goto found;
+ }
+ }
matches = 1;
}
} else if (score == hiscore && reuseport) {
@@ -246,11 +259,13 @@ begin:
if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
goto begin;
if (result) {
+found:
if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
result = NULL;
else if (unlikely(compute_score(result, net, hnum, daddr,
dif) < hiscore)) {
sock_put(result);
+ select_ok = false;
goto begin;
}
}
@@ -449,32 +464,76 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk)
}
EXPORT_SYMBOL_GPL(inet_ehash_nolisten);
-void __inet_hash(struct sock *sk, struct sock *osk)
+static int inet_reuseport_add_sock(struct sock *sk,
+ struct inet_listen_hashbucket *ilb,
+ int (*saddr_same)(const struct sock *sk1,
+ const struct sock *sk2,
+ bool match_wildcard))
+{
+ struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash;
+ struct sock *sk2;
+ struct hlist_nulls_node *node;
+ kuid_t uid = sock_i_uid(sk);
+
+ sk_nulls_for_each_rcu(sk2, node, &ilb->head) {
+ if (sk2 != sk &&
+ sk2->sk_family == sk->sk_family &&
+ ipv6_only_sock(sk2) == ipv6_only_sock(sk) &&
+ sk2->sk_bound_dev_if == sk->sk_bound_dev_if &&
+ inet_csk(sk2)->icsk_bind_hash == tb &&
+ sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
+ saddr_same(sk, sk2, false))
+ return reuseport_add_sock(sk, sk2);
+ }
+
+ /* Initial allocation may have already happened via setsockopt */
+ if (!rcu_access_pointer(sk->sk_reuseport_cb))
+ return reuseport_alloc(sk);
+ return 0;
+}
+
+int __inet_hash(struct sock *sk, struct sock *osk,
+ int (*saddr_same)(const struct sock *sk1,
+ const struct sock *sk2,
+ bool match_wildcard))
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct inet_listen_hashbucket *ilb;
+ int err = 0;
if (sk->sk_state != TCP_LISTEN) {
inet_ehash_nolisten(sk, osk);
- return;
+ return 0;
}
WARN_ON(!sk_unhashed(sk));
ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
spin_lock(&ilb->lock);
+ if (sk->sk_reuseport) {
+ err = inet_reuseport_add_sock(sk, ilb, saddr_same);
+ if (err)
+ goto unlock;
+ }
__sk_nulls_add_node_rcu(sk, &ilb->head);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+unlock:
spin_unlock(&ilb->lock);
+
+ return err;
}
EXPORT_SYMBOL(__inet_hash);
-void inet_hash(struct sock *sk)
+int inet_hash(struct sock *sk)
{
+ int err = 0;
+
if (sk->sk_state != TCP_CLOSE) {
local_bh_disable();
- __inet_hash(sk, NULL);
+ err = __inet_hash(sk, NULL, ipv4_rcv_saddr_equal);
local_bh_enable();
}
+
+ return err;
}
EXPORT_SYMBOL_GPL(inet_hash);
@@ -493,6 +552,8 @@ void inet_unhash(struct sock *sk)
lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
spin_lock_bh(lock);
+ if (rcu_access_pointer(sk->sk_reuseport_cb))
+ reuseport_detach_sock(sk);
done = __sk_nulls_del_node_init_rcu(sk);
if (done)
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
@@ -506,106 +567,106 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
struct sock *, __u16, struct inet_timewait_sock **))
{
struct inet_hashinfo *hinfo = death_row->hashinfo;
- const unsigned short snum = inet_sk(sk)->inet_num;
+ struct inet_timewait_sock *tw = NULL;
struct inet_bind_hashbucket *head;
- struct inet_bind_bucket *tb;
- int ret;
+ int port = inet_sk(sk)->inet_num;
struct net *net = sock_net(sk);
+ struct inet_bind_bucket *tb;
+ u32 remaining, offset;
+ int ret, i, low, high;
+ static u32 hint;
+
+ if (port) {
+ head = &hinfo->bhash[inet_bhashfn(net, port,
+ hinfo->bhash_size)];
+ tb = inet_csk(sk)->icsk_bind_hash;
+ spin_lock_bh(&head->lock);
+ if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
+ inet_ehash_nolisten(sk, NULL);
+ spin_unlock_bh(&head->lock);
+ return 0;
+ }
+ spin_unlock(&head->lock);
+ /* No definite answer... Walk to established hash table */
+ ret = check_established(death_row, sk, port, NULL);
+ local_bh_enable();
+ return ret;
+ }
- if (!snum) {
- int i, remaining, low, high, port;
- static u32 hint;
- u32 offset = hint + port_offset;
- struct inet_timewait_sock *tw = NULL;
+ inet_get_local_port_range(net, &low, &high);
+ high++; /* [32768, 60999] -> [32768, 61000[ */
+ remaining = high - low;
+ if (likely(remaining > 1))
+ remaining &= ~1U;
- inet_get_local_port_range(net, &low, &high);
- remaining = (high - low) + 1;
+ offset = (hint + port_offset) % remaining;
+ /* In first pass we try ports of @low parity.
+ * inet_csk_get_port() does the opposite choice.
+ */
+ offset &= ~1U;
+other_parity_scan:
+ port = low + offset;
+ for (i = 0; i < remaining; i += 2, port += 2) {
+ if (unlikely(port >= high))
+ port -= remaining;
+ if (inet_is_local_reserved_port(net, port))
+ continue;
+ head = &hinfo->bhash[inet_bhashfn(net, port,
+ hinfo->bhash_size)];
+ spin_lock_bh(&head->lock);
- /* By starting with offset being an even number,
- * we tend to leave about 50% of ports for other uses,
- * like bind(0).
+ /* Does not bother with rcv_saddr checks, because
+ * the established check is already unique enough.
*/
- offset &= ~1;
-
- local_bh_disable();
- for (i = 0; i < remaining; i++) {
- port = low + (i + offset) % remaining;
- if (inet_is_local_reserved_port(net, port))
- continue;
- head = &hinfo->bhash[inet_bhashfn(net, port,
- hinfo->bhash_size)];
- spin_lock(&head->lock);
-
- /* Does not bother with rcv_saddr checks,
- * because the established check is already
- * unique enough.
- */
- inet_bind_bucket_for_each(tb, &head->chain) {
- if (net_eq(ib_net(tb), net) &&
- tb->port == port) {
- if (tb->fastreuse >= 0 ||
- tb->fastreuseport >= 0)
- goto next_port;
- WARN_ON(hlist_empty(&tb->owners));
- if (!check_established(death_row, sk,
- port, &tw))
- goto ok;
+ inet_bind_bucket_for_each(tb, &head->chain) {
+ if (net_eq(ib_net(tb), net) && tb->port == port) {
+ if (tb->fastreuse >= 0 ||
+ tb->fastreuseport >= 0)
goto next_port;
- }
+ WARN_ON(hlist_empty(&tb->owners));
+ if (!check_established(death_row, sk,
+ port, &tw))
+ goto ok;
+ goto next_port;
}
-
- tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
- net, head, port);
- if (!tb) {
- spin_unlock(&head->lock);
- break;
- }
- tb->fastreuse = -1;
- tb->fastreuseport = -1;
- goto ok;
-
- next_port:
- spin_unlock(&head->lock);
}
- local_bh_enable();
-
- return -EADDRNOTAVAIL;
-ok:
- hint += (i + 2) & ~1;
-
- /* Head lock still held and bh's disabled */
- inet_bind_hash(sk, tb, port);
- if (sk_unhashed(sk)) {
- inet_sk(sk)->inet_sport = htons(port);
- inet_ehash_nolisten(sk, (struct sock *)tw);
+ tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
+ net, head, port);
+ if (!tb) {
+ spin_unlock_bh(&head->lock);
+ return -ENOMEM;
}
- if (tw)
- inet_twsk_bind_unhash(tw, hinfo);
- spin_unlock(&head->lock);
+ tb->fastreuse = -1;
+ tb->fastreuseport = -1;
+ goto ok;
+next_port:
+ spin_unlock_bh(&head->lock);
+ cond_resched();
+ }
- if (tw)
- inet_twsk_deschedule_put(tw);
+ offset++;
+ if ((offset & 1) && remaining > 1)
+ goto other_parity_scan;
- ret = 0;
- goto out;
- }
+ return -EADDRNOTAVAIL;
- head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)];
- tb = inet_csk(sk)->icsk_bind_hash;
- spin_lock_bh(&head->lock);
- if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
- inet_ehash_nolisten(sk, NULL);
- spin_unlock_bh(&head->lock);
- return 0;
- } else {
- spin_unlock(&head->lock);
- /* No definite answer... Walk to established hash table */
- ret = check_established(death_row, sk, snum, NULL);
-out:
- local_bh_enable();
- return ret;
+ok:
+ hint += i + 2;
+
+ /* Head lock still held and bh's disabled */
+ inet_bind_hash(sk, tb, port);
+ if (sk_unhashed(sk)) {
+ inet_sk(sk)->inet_sport = htons(port);
+ inet_ehash_nolisten(sk, (struct sock *)tw);
}
+ if (tw)
+ inet_twsk_bind_unhash(tw, hinfo);
+ spin_unlock(&head->lock);
+ if (tw)
+ inet_twsk_deschedule_put(tw);
+ local_bh_enable();
+ return 0;
}
/*
diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c
deleted file mode 100644
index f17ea49b2..000000000
--- a/net/ipv4/inet_lro.c
+++ /dev/null
@@ -1,374 +0,0 @@
-/*
- * linux/net/ipv4/inet_lro.c
- *
- * Large Receive Offload (ipv4 / tcp)
- *
- * (C) Copyright IBM Corp. 2007
- *
- * Authors:
- * Jan-Bernd Themann <themann@de.ibm.com>
- * Christoph Raisch <raisch@de.ibm.com>
- *
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-
-#include <linux/module.h>
-#include <linux/if_vlan.h>
-#include <linux/inet_lro.h>
-#include <net/checksum.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jan-Bernd Themann <themann@de.ibm.com>");
-MODULE_DESCRIPTION("Large Receive Offload (ipv4 / tcp)");
-
-#define TCP_HDR_LEN(tcph) (tcph->doff << 2)
-#define IP_HDR_LEN(iph) (iph->ihl << 2)
-#define TCP_PAYLOAD_LENGTH(iph, tcph) \
- (ntohs(iph->tot_len) - IP_HDR_LEN(iph) - TCP_HDR_LEN(tcph))
-
-#define IPH_LEN_WO_OPTIONS 5
-#define TCPH_LEN_WO_OPTIONS 5
-#define TCPH_LEN_W_TIMESTAMP 8
-
-#define LRO_MAX_PG_HLEN 64
-
-#define LRO_INC_STATS(lro_mgr, attr) { lro_mgr->stats.attr++; }
-
-/*
- * Basic tcp checks whether packet is suitable for LRO
- */
-
-static int lro_tcp_ip_check(const struct iphdr *iph, const struct tcphdr *tcph,
- int len, const struct net_lro_desc *lro_desc)
-{
- /* check ip header: don't aggregate padded frames */
- if (ntohs(iph->tot_len) != len)
- return -1;
-
- if (TCP_PAYLOAD_LENGTH(iph, tcph) == 0)
- return -1;
-
- if (iph->ihl != IPH_LEN_WO_OPTIONS)
- return -1;
-
- if (tcph->cwr || tcph->ece || tcph->urg || !tcph->ack ||
- tcph->rst || tcph->syn || tcph->fin)
- return -1;
-
- if (INET_ECN_is_ce(ipv4_get_dsfield(iph)))
- return -1;
-
- if (tcph->doff != TCPH_LEN_WO_OPTIONS &&
- tcph->doff != TCPH_LEN_W_TIMESTAMP)
- return -1;
-
- /* check tcp options (only timestamp allowed) */
- if (tcph->doff == TCPH_LEN_W_TIMESTAMP) {
- __be32 *topt = (__be32 *)(tcph + 1);
-
- if (*topt != htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
- | (TCPOPT_TIMESTAMP << 8)
- | TCPOLEN_TIMESTAMP))
- return -1;
-
- /* timestamp should be in right order */
- topt++;
- if (lro_desc && after(ntohl(lro_desc->tcp_rcv_tsval),
- ntohl(*topt)))
- return -1;
-
- /* timestamp reply should not be zero */
- topt++;
- if (*topt == 0)
- return -1;
- }
-
- return 0;
-}
-
-static void lro_update_tcp_ip_header(struct net_lro_desc *lro_desc)
-{
- struct iphdr *iph = lro_desc->iph;
- struct tcphdr *tcph = lro_desc->tcph;
- __be32 *p;
- __wsum tcp_hdr_csum;
-
- tcph->ack_seq = lro_desc->tcp_ack;
- tcph->window = lro_desc->tcp_window;
-
- if (lro_desc->tcp_saw_tstamp) {
- p = (__be32 *)(tcph + 1);
- *(p+2) = lro_desc->tcp_rcv_tsecr;
- }
-
- csum_replace2(&iph->check, iph->tot_len, htons(lro_desc->ip_tot_len));
- iph->tot_len = htons(lro_desc->ip_tot_len);
-
- tcph->check = 0;
- tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), 0);
- lro_desc->data_csum = csum_add(lro_desc->data_csum, tcp_hdr_csum);
- tcph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
- lro_desc->ip_tot_len -
- IP_HDR_LEN(iph), IPPROTO_TCP,
- lro_desc->data_csum);
-}
-
-static __wsum lro_tcp_data_csum(struct iphdr *iph, struct tcphdr *tcph, int len)
-{
- __wsum tcp_csum;
- __wsum tcp_hdr_csum;
- __wsum tcp_ps_hdr_csum;
-
- tcp_csum = ~csum_unfold(tcph->check);
- tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), tcp_csum);
-
- tcp_ps_hdr_csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
- len + TCP_HDR_LEN(tcph),
- IPPROTO_TCP, 0);
-
- return csum_sub(csum_sub(tcp_csum, tcp_hdr_csum),
- tcp_ps_hdr_csum);
-}
-
-static void lro_init_desc(struct net_lro_desc *lro_desc, struct sk_buff *skb,
- struct iphdr *iph, struct tcphdr *tcph)
-{
- int nr_frags;
- __be32 *ptr;
- u32 tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph);
-
- nr_frags = skb_shinfo(skb)->nr_frags;
- lro_desc->parent = skb;
- lro_desc->next_frag = &(skb_shinfo(skb)->frags[nr_frags]);
- lro_desc->iph = iph;
- lro_desc->tcph = tcph;
- lro_desc->tcp_next_seq = ntohl(tcph->seq) + tcp_data_len;
- lro_desc->tcp_ack = tcph->ack_seq;
- lro_desc->tcp_window = tcph->window;
-
- lro_desc->pkt_aggr_cnt = 1;
- lro_desc->ip_tot_len = ntohs(iph->tot_len);
-
- if (tcph->doff == 8) {
- ptr = (__be32 *)(tcph+1);
- lro_desc->tcp_saw_tstamp = 1;
- lro_desc->tcp_rcv_tsval = *(ptr+1);
- lro_desc->tcp_rcv_tsecr = *(ptr+2);
- }
-
- lro_desc->mss = tcp_data_len;
- lro_desc->active = 1;
-
- lro_desc->data_csum = lro_tcp_data_csum(iph, tcph,
- tcp_data_len);
-}
-
-static inline void lro_clear_desc(struct net_lro_desc *lro_desc)
-{
- memset(lro_desc, 0, sizeof(struct net_lro_desc));
-}
-
-static void lro_add_common(struct net_lro_desc *lro_desc, struct iphdr *iph,
- struct tcphdr *tcph, int tcp_data_len)
-{
- struct sk_buff *parent = lro_desc->parent;
- __be32 *topt;
-
- lro_desc->pkt_aggr_cnt++;
- lro_desc->ip_tot_len += tcp_data_len;
- lro_desc->tcp_next_seq += tcp_data_len;
- lro_desc->tcp_window = tcph->window;
- lro_desc->tcp_ack = tcph->ack_seq;
-
- /* don't update tcp_rcv_tsval, would not work with PAWS */
- if (lro_desc->tcp_saw_tstamp) {
- topt = (__be32 *) (tcph + 1);
- lro_desc->tcp_rcv_tsecr = *(topt + 2);
- }
-
- lro_desc->data_csum = csum_block_add(lro_desc->data_csum,
- lro_tcp_data_csum(iph, tcph,
- tcp_data_len),
- parent->len);
-
- parent->len += tcp_data_len;
- parent->data_len += tcp_data_len;
- if (tcp_data_len > lro_desc->mss)
- lro_desc->mss = tcp_data_len;
-}
-
-static void lro_add_packet(struct net_lro_desc *lro_desc, struct sk_buff *skb,
- struct iphdr *iph, struct tcphdr *tcph)
-{
- struct sk_buff *parent = lro_desc->parent;
- int tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph);
-
- lro_add_common(lro_desc, iph, tcph, tcp_data_len);
-
- skb_pull(skb, (skb->len - tcp_data_len));
- parent->truesize += skb->truesize;
-
- if (lro_desc->last_skb)
- lro_desc->last_skb->next = skb;
- else
- skb_shinfo(parent)->frag_list = skb;
-
- lro_desc->last_skb = skb;
-}
-
-
-static int lro_check_tcp_conn(struct net_lro_desc *lro_desc,
- struct iphdr *iph,
- struct tcphdr *tcph)
-{
- if ((lro_desc->iph->saddr != iph->saddr) ||
- (lro_desc->iph->daddr != iph->daddr) ||
- (lro_desc->tcph->source != tcph->source) ||
- (lro_desc->tcph->dest != tcph->dest))
- return -1;
- return 0;
-}
-
-static struct net_lro_desc *lro_get_desc(struct net_lro_mgr *lro_mgr,
- struct net_lro_desc *lro_arr,
- struct iphdr *iph,
- struct tcphdr *tcph)
-{
- struct net_lro_desc *lro_desc = NULL;
- struct net_lro_desc *tmp;
- int max_desc = lro_mgr->max_desc;
- int i;
-
- for (i = 0; i < max_desc; i++) {
- tmp = &lro_arr[i];
- if (tmp->active)
- if (!lro_check_tcp_conn(tmp, iph, tcph)) {
- lro_desc = tmp;
- goto out;
- }
- }
-
- for (i = 0; i < max_desc; i++) {
- if (!lro_arr[i].active) {
- lro_desc = &lro_arr[i];
- goto out;
- }
- }
-
- LRO_INC_STATS(lro_mgr, no_desc);
-out:
- return lro_desc;
-}
-
-static void lro_flush(struct net_lro_mgr *lro_mgr,
- struct net_lro_desc *lro_desc)
-{
- if (lro_desc->pkt_aggr_cnt > 1)
- lro_update_tcp_ip_header(lro_desc);
-
- skb_shinfo(lro_desc->parent)->gso_size = lro_desc->mss;
-
- if (lro_mgr->features & LRO_F_NAPI)
- netif_receive_skb(lro_desc->parent);
- else
- netif_rx(lro_desc->parent);
-
- LRO_INC_STATS(lro_mgr, flushed);
- lro_clear_desc(lro_desc);
-}
-
-static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb,
- void *priv)
-{
- struct net_lro_desc *lro_desc;
- struct iphdr *iph;
- struct tcphdr *tcph;
- u64 flags;
- int vlan_hdr_len = 0;
-
- if (!lro_mgr->get_skb_header ||
- lro_mgr->get_skb_header(skb, (void *)&iph, (void *)&tcph,
- &flags, priv))
- goto out;
-
- if (!(flags & LRO_IPV4) || !(flags & LRO_TCP))
- goto out;
-
- lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph);
- if (!lro_desc)
- goto out;
-
- if ((skb->protocol == htons(ETH_P_8021Q)) &&
- !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID))
- vlan_hdr_len = VLAN_HLEN;
-
- if (!lro_desc->active) { /* start new lro session */
- if (lro_tcp_ip_check(iph, tcph, skb->len - vlan_hdr_len, NULL))
- goto out;
-
- skb->ip_summed = lro_mgr->ip_summed_aggr;
- lro_init_desc(lro_desc, skb, iph, tcph);
- LRO_INC_STATS(lro_mgr, aggregated);
- return 0;
- }
-
- if (lro_desc->tcp_next_seq != ntohl(tcph->seq))
- goto out2;
-
- if (lro_tcp_ip_check(iph, tcph, skb->len, lro_desc))
- goto out2;
-
- lro_add_packet(lro_desc, skb, iph, tcph);
- LRO_INC_STATS(lro_mgr, aggregated);
-
- if ((lro_desc->pkt_aggr_cnt >= lro_mgr->max_aggr) ||
- lro_desc->parent->len > (0xFFFF - lro_mgr->dev->mtu))
- lro_flush(lro_mgr, lro_desc);
-
- return 0;
-
-out2: /* send aggregated SKBs to stack */
- lro_flush(lro_mgr, lro_desc);
-
-out:
- return 1;
-}
-
-void lro_receive_skb(struct net_lro_mgr *lro_mgr,
- struct sk_buff *skb,
- void *priv)
-{
- if (__lro_proc_skb(lro_mgr, skb, priv)) {
- if (lro_mgr->features & LRO_F_NAPI)
- netif_receive_skb(skb);
- else
- netif_rx(skb);
- }
-}
-EXPORT_SYMBOL(lro_receive_skb);
-
-void lro_flush_all(struct net_lro_mgr *lro_mgr)
-{
- int i;
- struct net_lro_desc *lro_desc = lro_mgr->lro_arr;
-
- for (i = 0; i < lro_mgr->max_desc; i++) {
- if (lro_desc[i].active)
- lro_flush(lro_mgr, &lro_desc[i]);
- }
-}
-EXPORT_SYMBOL(lro_flush_all);
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index da0d7ce85..af18f1e48 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -71,7 +71,6 @@ static int ip_forward_finish(struct net *net, struct sock *sk, struct sk_buff *s
if (unlikely(opt->optlen))
ip_forward_options(skb);
- skb_sender_cpu_clear(skb);
return dst_output(net, sk, skb);
}
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 187c6fcc3..efbd47d1a 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -54,8 +54,6 @@
* code now. If you change something here, _PLEASE_ update ipv6/reassembly.c
* as well. Or notify me, at least. --ANK
*/
-
-static int sysctl_ipfrag_max_dist __read_mostly = 64;
static const char ip_frag_cache_name[] = "ip4-frags";
struct ipfrag_skb_cb
@@ -150,7 +148,7 @@ static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
qp->daddr = arg->iph->daddr;
qp->vif = arg->vif;
qp->user = arg->user;
- qp->peer = sysctl_ipfrag_max_dist ?
+ qp->peer = q->net->max_dist ?
inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, arg->vif, 1) :
NULL;
}
@@ -275,7 +273,7 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph,
static int ip_frag_too_far(struct ipq *qp)
{
struct inet_peer *peer = qp->peer;
- unsigned int max = sysctl_ipfrag_max_dist;
+ unsigned int max = qp->q.net->max_dist;
unsigned int start, end;
int rc;
@@ -749,6 +747,14 @@ static struct ctl_table ip4_frags_ns_ctl_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
+ {
+ .procname = "ipfrag_max_dist",
+ .data = &init_net.ipv4.frags.max_dist,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero
+ },
{ }
};
@@ -762,14 +768,6 @@ static struct ctl_table ip4_frags_ctl_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
- {
- .procname = "ipfrag_max_dist",
- .data = &sysctl_ipfrag_max_dist,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &zero
- },
{ }
};
@@ -790,10 +788,7 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net)
table[1].data = &net->ipv4.frags.low_thresh;
table[1].extra2 = &net->ipv4.frags.high_thresh;
table[2].data = &net->ipv4.frags.timeout;
-
- /* Don't export sysctls to unprivileged users */
- if (net->user_ns != &init_user_ns)
- table[0].procname = NULL;
+ table[3].data = &net->ipv4.frags.max_dist;
}
hdr = register_net_sysctl(net, "net/ipv4", table);
@@ -865,6 +860,8 @@ static int __net_init ipv4_frags_init_net(struct net *net)
*/
net->ipv4.frags.timeout = IP_FRAG_TIME;
+ net->ipv4.frags.max_dist = 64;
+
res = inet_frags_init_net(&net->ipv4.frags);
if (res)
return res;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 41ba68de4..4cc84212c 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -179,6 +179,7 @@ static __be16 tnl_flags_to_gre_flags(__be16 tflags)
return flags;
}
+/* Fills in tpi and returns header length to be pulled. */
static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
bool *csum_err)
{
@@ -238,7 +239,7 @@ static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
return -EINVAL;
}
}
- return iptunnel_pull_header(skb, hdr_len, tpi->proto);
+ return hdr_len;
}
static void ipgre_err(struct sk_buff *skb, u32 info,
@@ -341,7 +342,7 @@ static void gre_err(struct sk_buff *skb, u32 info)
struct tnl_ptk_info tpi;
bool csum_err = false;
- if (parse_gre_header(skb, &tpi, &csum_err)) {
+ if (parse_gre_header(skb, &tpi, &csum_err) < 0) {
if (!csum_err) /* ignore csum errors. */
return;
}
@@ -397,7 +398,10 @@ static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
iph->saddr, iph->daddr, tpi->key);
if (tunnel) {
- skb_pop_mac_header(skb);
+ if (tunnel->dev->type != ARPHRD_NONE)
+ skb_pop_mac_header(skb);
+ else
+ skb_reset_mac_header(skb);
if (tunnel->collect_md) {
__be16 flags;
__be64 tun_id;
@@ -419,6 +423,7 @@ static int gre_rcv(struct sk_buff *skb)
{
struct tnl_ptk_info tpi;
bool csum_err = false;
+ int hdr_len;
#ifdef CONFIG_NET_IPGRE_BROADCAST
if (ipv4_is_multicast(ip_hdr(skb)->daddr)) {
@@ -428,7 +433,10 @@ static int gre_rcv(struct sk_buff *skb)
}
#endif
- if (parse_gre_header(skb, &tpi, &csum_err) < 0)
+ hdr_len = parse_gre_header(skb, &tpi, &csum_err);
+ if (hdr_len < 0)
+ goto drop;
+ if (iptunnel_pull_header(skb, hdr_len, tpi.proto, false) < 0)
goto drop;
if (ipgre_rcv(skb, &tpi) == PACKET_RCVD)
@@ -440,6 +448,17 @@ drop:
return 0;
}
+static __sum16 gre_checksum(struct sk_buff *skb)
+{
+ __wsum csum;
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL)
+ csum = lco_csum(skb);
+ else
+ csum = skb_checksum(skb, 0, skb->len, 0);
+ return csum_fold(csum);
+}
+
static void build_header(struct sk_buff *skb, int hdr_len, __be16 flags,
__be16 proto, __be32 key, __be32 seq)
{
@@ -467,8 +486,7 @@ static void build_header(struct sk_buff *skb, int hdr_len, __be16 flags,
!(skb_shinfo(skb)->gso_type &
(SKB_GSO_GRE | SKB_GSO_GRE_CSUM))) {
*ptr = 0;
- *(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0,
- skb->len, 0));
+ *(__sum16 *)ptr = gre_checksum(skb);
}
}
}
@@ -493,8 +511,7 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
static struct sk_buff *gre_handle_offloads(struct sk_buff *skb,
bool csum)
{
- return iptunnel_handle_offloads(skb, csum,
- csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
+ return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
}
static struct rtable *gre_get_rt(struct sk_buff *skb,
@@ -514,15 +531,17 @@ static struct rtable *gre_get_rt(struct sk_buff *skb,
return ip_route_output_key(net, fl);
}
-static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev)
+static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
+ __be16 proto)
{
struct ip_tunnel_info *tun_info;
const struct ip_tunnel_key *key;
+ struct rtable *rt = NULL;
struct flowi4 fl;
- struct rtable *rt;
int min_headroom;
int tunnel_hlen;
__be16 df, flags;
+ bool use_cache;
int err;
tun_info = skb_tunnel_info(skb);
@@ -531,9 +550,17 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev)
goto err_free_skb;
key = &tun_info->key;
- rt = gre_get_rt(skb, dev, &fl, key);
- if (IS_ERR(rt))
- goto err_free_skb;
+ use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
+ if (use_cache)
+ rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl.saddr);
+ if (!rt) {
+ rt = gre_get_rt(skb, dev, &fl, key);
+ if (IS_ERR(rt))
+ goto err_free_skb;
+ if (use_cache)
+ dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
+ fl.saddr);
+ }
tunnel_hlen = ip_gre_calc_hlen(key->tun_flags);
@@ -557,7 +584,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev)
}
flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY);
- build_header(skb, tunnel_hlen, flags, htons(ETH_P_TEB),
+ build_header(skb, tunnel_hlen, flags, proto,
tunnel_id_to_key(tun_info->key.tun_id), 0);
df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
@@ -598,7 +625,7 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
const struct iphdr *tnl_params;
if (tunnel->collect_md) {
- gre_fb_xmit(skb, dev);
+ gre_fb_xmit(skb, dev, skb->protocol);
return NETDEV_TX_OK;
}
@@ -642,7 +669,7 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
struct ip_tunnel *tunnel = netdev_priv(dev);
if (tunnel->collect_md) {
- gre_fb_xmit(skb, dev);
+ gre_fb_xmit(skb, dev, htons(ETH_P_TEB));
return NETDEV_TX_OK;
}
@@ -844,9 +871,16 @@ static void __gre_tunnel_init(struct net_device *dev)
dev->hw_features |= GRE_FEATURES;
if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
- /* TCP offload with GRE SEQ is not supported. */
- dev->features |= NETIF_F_GSO_SOFTWARE;
- dev->hw_features |= NETIF_F_GSO_SOFTWARE;
+ /* TCP offload with GRE SEQ is not supported, nor
+ * can we support 2 levels of outer headers requiring
+ * an update.
+ */
+ if (!(tunnel->parms.o_flags & TUNNEL_CSUM) ||
+ (tunnel->encap.type == TUNNEL_ENCAP_NONE)) {
+ dev->features |= NETIF_F_GSO_SOFTWARE;
+ dev->hw_features |= NETIF_F_GSO_SOFTWARE;
+ }
+
/* Can use a lockless transmit, unless we generate
* output sequences
*/
@@ -868,7 +902,7 @@ static int ipgre_tunnel_init(struct net_device *dev)
netif_keep_dst(dev);
dev->addr_len = 4;
- if (iph->daddr) {
+ if (iph->daddr && !tunnel->collect_md) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
if (ipv4_is_multicast(iph->daddr)) {
if (!iph->saddr)
@@ -877,8 +911,9 @@ static int ipgre_tunnel_init(struct net_device *dev)
dev->header_ops = &ipgre_header_ops;
}
#endif
- } else
+ } else if (!tunnel->collect_md) {
dev->header_ops = &ipgre_header_ops;
+ }
return ip_tunnel_init(dev);
}
@@ -921,6 +956,11 @@ static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
if (flags & (GRE_VERSION|GRE_ROUTING))
return -EINVAL;
+ if (data[IFLA_GRE_COLLECT_METADATA] &&
+ data[IFLA_GRE_ENCAP_TYPE] &&
+ nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]) != TUNNEL_ENCAP_NONE)
+ return -EINVAL;
+
return 0;
}
@@ -994,6 +1034,8 @@ static void ipgre_netlink_parms(struct net_device *dev,
struct ip_tunnel *t = netdev_priv(dev);
t->collect_md = true;
+ if (dev->type == ARPHRD_IPGRE)
+ dev->type = ARPHRD_NONE;
}
}
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index d77eb0c3b..e3d782746 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -308,15 +308,12 @@ drop:
return true;
}
-int sysctl_ip_early_demux __read_mostly = 1;
-EXPORT_SYMBOL(sysctl_ip_early_demux);
-
static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
const struct iphdr *iph = ip_hdr(skb);
struct rtable *rt;
- if (sysctl_ip_early_demux &&
+ if (net->ipv4.sysctl_ip_early_demux &&
!skb_dst(skb) &&
!skb->sk &&
!ip_is_fragment(iph)) {
@@ -362,8 +359,31 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
rt = skb_rtable(skb);
if (rt->rt_type == RTN_MULTICAST) {
IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INMCAST, skb->len);
- } else if (rt->rt_type == RTN_BROADCAST)
+ } else if (rt->rt_type == RTN_BROADCAST) {
IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INBCAST, skb->len);
+ } else if (skb->pkt_type == PACKET_BROADCAST ||
+ skb->pkt_type == PACKET_MULTICAST) {
+ struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
+
+ /* RFC 1122 3.3.6:
+ *
+ * When a host sends a datagram to a link-layer broadcast
+ * address, the IP destination address MUST be a legal IP
+ * broadcast or IP multicast address.
+ *
+ * A host SHOULD silently discard a datagram that is received
+ * via a link-layer broadcast (see Section 2.4) but does not
+ * specify an IP multicast or broadcast destination address.
+ *
+ * This doesn't explicitly say L2 *broadcast*, but broadcast is
+ * in a way a form of multicast and the most common use case for
+ * this is 802.11 protecting against cross-station spoofing (the
+ * so-called "hole-196" attack) so do it for both.
+ */
+ if (in_dev &&
+ IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST))
+ goto drop;
+ }
return dst_input(skb);
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index bd2467923..4d158ff1d 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -58,10 +58,9 @@ void ip_options_build(struct sk_buff *skb, struct ip_options *opt,
if (opt->ts_needaddr)
ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, skb, rt);
if (opt->ts_needtime) {
- struct timespec tv;
__be32 midtime;
- getnstimeofday(&tv);
- midtime = htonl((tv.tv_sec % 86400) * MSEC_PER_SEC + tv.tv_nsec / NSEC_PER_MSEC);
+
+ midtime = inet_current_timestamp();
memcpy(iph+opt->ts+iph[opt->ts+2]-5, &midtime, 4);
}
return;
@@ -415,11 +414,10 @@ int ip_options_compile(struct net *net,
break;
}
if (timeptr) {
- struct timespec tv;
- u32 midtime;
- getnstimeofday(&tv);
- midtime = (tv.tv_sec % 86400) * MSEC_PER_SEC + tv.tv_nsec / NSEC_PER_MSEC;
- put_unaligned_be32(midtime, timeptr);
+ __be32 midtime;
+
+ midtime = inet_current_timestamp();
+ memcpy(timeptr, &midtime, 4);
opt->is_changed = 1;
}
} else if ((optptr[3]&0xF) != IPOPT_TS_PRESPEC) {
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 565bf64b2..124bf0a66 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -79,9 +79,6 @@
#include <linux/netlink.h>
#include <linux/tcp.h>
-int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
-EXPORT_SYMBOL(sysctl_ip_default_ttl);
-
static int
ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
unsigned int mtu,
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index a50124260..035ad645a 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -573,6 +573,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
int optname, char __user *optval, unsigned int optlen)
{
struct inet_sock *inet = inet_sk(sk);
+ struct net *net = sock_net(sk);
int val = 0, err;
bool needs_rtnl = setsockopt_needs_rtnl(optname);
@@ -912,7 +913,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
}
/* numsrc >= (1G-4) overflow in 32 bits */
if (msf->imsf_numsrc >= 0x3ffffffcU ||
- msf->imsf_numsrc > sysctl_igmp_max_msf) {
+ msf->imsf_numsrc > net->ipv4.sysctl_igmp_max_msf) {
kfree(msf);
err = -ENOBUFS;
break;
@@ -1067,7 +1068,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
/* numsrc >= (4G-140)/128 overflow in 32 bits */
if (gsf->gf_numsrc >= 0x1ffffff ||
- gsf->gf_numsrc > sysctl_igmp_max_msf) {
+ gsf->gf_numsrc > net->ipv4.sysctl_igmp_max_msf) {
err = -ENOBUFS;
goto mc_msf_out;
}
@@ -1342,10 +1343,13 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
val = inet->tos;
break;
case IP_TTL:
+ {
+ struct net *net = sock_net(sk);
val = (inet->uc_ttl == -1 ?
- sysctl_ip_default_ttl :
+ net->ipv4.sysctl_ip_default_ttl :
inet->uc_ttl);
break;
+ }
case IP_HDRINCL:
val = inet->hdrincl;
break;
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 336e6892a..a69ed94bd 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -68,61 +68,6 @@ static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
IP_TNL_HASH_BITS);
}
-static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
- struct dst_entry *dst, __be32 saddr)
-{
- struct dst_entry *old_dst;
-
- dst_clone(dst);
- old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
- dst_release(old_dst);
- idst->saddr = saddr;
-}
-
-static noinline void tunnel_dst_set(struct ip_tunnel *t,
- struct dst_entry *dst, __be32 saddr)
-{
- __tunnel_dst_set(raw_cpu_ptr(t->dst_cache), dst, saddr);
-}
-
-static void tunnel_dst_reset(struct ip_tunnel *t)
-{
- tunnel_dst_set(t, NULL, 0);
-}
-
-void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
-{
- int i;
-
- for_each_possible_cpu(i)
- __tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL, 0);
-}
-EXPORT_SYMBOL(ip_tunnel_dst_reset_all);
-
-static struct rtable *tunnel_rtable_get(struct ip_tunnel *t,
- u32 cookie, __be32 *saddr)
-{
- struct ip_tunnel_dst *idst;
- struct dst_entry *dst;
-
- rcu_read_lock();
- idst = raw_cpu_ptr(t->dst_cache);
- dst = rcu_dereference(idst->dst);
- if (dst && !atomic_inc_not_zero(&dst->__refcnt))
- dst = NULL;
- if (dst) {
- if (!dst->obsolete || dst->ops->check(dst, cookie)) {
- *saddr = idst->saddr;
- } else {
- tunnel_dst_reset(t);
- dst_release(dst);
- dst = NULL;
- }
- }
- rcu_read_unlock();
- return (struct rtable *)dst;
-}
-
static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
__be16 flags, __be32 key)
{
@@ -381,11 +326,12 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
if (!IS_ERR(rt)) {
tdev = rt->dst.dev;
- tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
ip_rt_put(rt);
}
if (dev->type != ARPHRD_ETHER)
dev->flags |= IFF_POINTOPOINT;
+
+ dst_cache_reset(&tunnel->dst_cache);
}
if (!tdev && tunnel->parms.link)
@@ -731,7 +677,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
goto tx_error;
- rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL;
+ rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) :
+ NULL;
if (!rt) {
rt = ip_route_output_key(tunnel->net, &fl4);
@@ -741,7 +688,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
goto tx_error;
}
if (connected)
- tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
+ dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
+ fl4.saddr);
}
if (rt->dst.dev == dev) {
@@ -837,7 +785,7 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn,
if (set_mtu)
dev->mtu = mtu;
}
- ip_tunnel_dst_reset_all(t);
+ dst_cache_reset(&t->dst_cache);
netdev_state_change(dev);
}
@@ -976,7 +924,7 @@ static void ip_tunnel_dev_free(struct net_device *dev)
struct ip_tunnel *tunnel = netdev_priv(dev);
gro_cells_destroy(&tunnel->gro_cells);
- free_percpu(tunnel->dst_cache);
+ dst_cache_destroy(&tunnel->dst_cache);
free_percpu(dev->tstats);
free_netdev(dev);
}
@@ -1170,15 +1118,15 @@ int ip_tunnel_init(struct net_device *dev)
if (!dev->tstats)
return -ENOMEM;
- tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
- if (!tunnel->dst_cache) {
+ err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
+ if (err) {
free_percpu(dev->tstats);
- return -ENOMEM;
+ return err;
}
err = gro_cells_init(&tunnel->gro_cells, dev);
if (err) {
- free_percpu(tunnel->dst_cache);
+ dst_cache_destroy(&tunnel->dst_cache);
free_percpu(dev->tstats);
return err;
}
@@ -1208,7 +1156,7 @@ void ip_tunnel_uninit(struct net_device *dev)
if (itn->fb_tunnel_dev != dev)
ip_tunnel_del(itn, netdev_priv(dev));
- ip_tunnel_dst_reset_all(tunnel);
+ dst_cache_reset(&tunnel->dst_cache);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 859d415c0..6165f30c4 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -86,7 +86,8 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
}
EXPORT_SYMBOL_GPL(iptunnel_xmit);
-int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto)
+int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto,
+ bool xnet)
{
if (unlikely(!pskb_may_pull(skb, hdr_len)))
return -ENOMEM;
@@ -109,14 +110,12 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto)
skb->protocol = inner_proto;
}
- nf_reset(skb);
- secpath_reset(skb);
skb_clear_hash_if_not_l4(skb);
- skb_dst_drop(skb);
skb->vlan_tci = 0;
skb_set_queue_mapping(skb, 0);
- skb->pkt_type = PACKET_HOST;
- return 0;
+ skb_scrub_packet(skb, xnet);
+
+ return iptunnel_pull_offloads(skb);
}
EXPORT_SYMBOL_GPL(iptunnel_pull_header);
@@ -148,7 +147,6 @@ struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md,
EXPORT_SYMBOL_GPL(iptunnel_metadata_reply);
struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb,
- bool csum_help,
int gso_type_mask)
{
int err;
@@ -166,20 +164,15 @@ struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb,
return skb;
}
- /* If packet is not gso and we are resolving any partial checksum,
- * clear encapsulation flag. This allows setting CHECKSUM_PARTIAL
- * on the outer header without confusing devices that implement
- * NETIF_F_IP_CSUM with encapsulation.
- */
- if (csum_help)
- skb->encapsulation = 0;
-
- if (skb->ip_summed == CHECKSUM_PARTIAL && csum_help) {
- err = skb_checksum_help(skb);
- if (unlikely(err))
- goto error;
- } else if (skb->ip_summed != CHECKSUM_PARTIAL)
+ if (skb->ip_summed != CHECKSUM_PARTIAL) {
skb->ip_summed = CHECKSUM_NONE;
+ /* We clear encapsulation here to prevent badly-written
+ * drivers potentially deciding to offload an inner checksum
+ * if we set CHECKSUM_PARTIAL on the outer header.
+ * This should go away when the drivers are all fixed.
+ */
+ skb->encapsulation = 0;
+ }
return skb;
error:
@@ -379,8 +372,8 @@ static int ip6_tun_fill_encap_info(struct sk_buff *skb,
if (nla_put_be64(skb, LWTUNNEL_IP6_ID, tun_info->key.tun_id) ||
nla_put_in6_addr(skb, LWTUNNEL_IP6_DST, &tun_info->key.u.ipv6.dst) ||
nla_put_in6_addr(skb, LWTUNNEL_IP6_SRC, &tun_info->key.u.ipv6.src) ||
- nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.tos) ||
- nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.ttl) ||
+ nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.tos) ||
+ nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.ttl) ||
nla_put_be16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags))
return -ENOMEM;
@@ -406,6 +399,12 @@ static const struct lwtunnel_encap_ops ip6_tun_lwt_ops = {
void __init ip_tunnel_core_init(void)
{
+ /* If you land here, make sure whether increasing ip_tunnel_info's
+ * options_len is a reasonable choice with its usage in front ends
+ * (f.e., it's part of flow keys, etc).
+ */
+ BUILD_BUG_ON(IP_TUNNEL_OPTS_MAX != 255);
+
lwtunnel_encap_add_ops(&ip_tun_lwt_ops, LWTUNNEL_ENCAP_IP);
lwtunnel_encap_add_ops(&ip6_tun_lwt_ops, LWTUNNEL_ENCAP_IP6);
}
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 5cf10b777..a917903d5 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -156,6 +156,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
struct dst_entry *dst = skb_dst(skb);
struct net_device *tdev; /* Device to other host */
int err;
+ int mtu;
if (!dst) {
dev->stats.tx_carrier_errors++;
@@ -192,6 +193,23 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
tunnel->err_count = 0;
}
+ mtu = dst_mtu(dst);
+ if (skb->len > mtu) {
+ skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
+ if (skb->protocol == htons(ETH_P_IP)) {
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+ htonl(mtu));
+ } else {
+ if (mtu < IPV6_MIN_MTU)
+ mtu = IPV6_MIN_MTU;
+
+ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ }
+
+ dst_release(dst);
+ goto tx_error;
+ }
+
skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(dev)));
skb_dst_set(skb, dst);
skb->dev = skb_dst(skb)->dev;
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 4044da61e..ec51d0216 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -195,7 +195,7 @@ static int ipip_rcv(struct sk_buff *skb)
if (tunnel) {
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
goto drop;
- if (iptunnel_pull_header(skb, 0, tpi.proto))
+ if (iptunnel_pull_header(skb, 0, tpi.proto, false))
goto drop;
return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, log_ecn_error);
}
@@ -219,7 +219,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
if (unlikely(skb->protocol != htons(ETH_P_IP)))
goto tx_error;
- skb = iptunnel_handle_offloads(skb, false, SKB_GSO_IPIP);
+ skb = iptunnel_handle_offloads(skb, SKB_GSO_IPIP);
if (IS_ERR(skb))
goto out;
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index b488cac9c..4133b0f51 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -359,11 +359,12 @@ unsigned int arpt_do_table(struct sk_buff *skb,
}
/* All zeroes == unconditional rule. */
-static inline bool unconditional(const struct arpt_arp *arp)
+static inline bool unconditional(const struct arpt_entry *e)
{
static const struct arpt_arp uncond;
- return memcmp(arp, &uncond, sizeof(uncond)) == 0;
+ return e->target_offset == sizeof(struct arpt_entry) &&
+ memcmp(&e->arp, &uncond, sizeof(uncond)) == 0;
}
/* Figures out from what hook each rule can be called: returns 0 if
@@ -402,11 +403,10 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
|= ((1 << hook) | (1 << NF_ARP_NUMHOOKS));
/* Unconditional return/END. */
- if ((e->target_offset == sizeof(struct arpt_entry) &&
+ if ((unconditional(e) &&
(strcmp(t->target.u.user.name,
XT_STANDARD_TARGET) == 0) &&
- t->verdict < 0 && unconditional(&e->arp)) ||
- visited) {
+ t->verdict < 0) || visited) {
unsigned int oldpos, size;
if ((strcmp(t->target.u.user.name,
@@ -474,14 +474,12 @@ next:
return 1;
}
-static inline int check_entry(const struct arpt_entry *e, const char *name)
+static inline int check_entry(const struct arpt_entry *e)
{
const struct xt_entry_target *t;
- if (!arp_checkentry(&e->arp)) {
- duprintf("arp_tables: arp check failed %p %s.\n", e, name);
+ if (!arp_checkentry(&e->arp))
return -EINVAL;
- }
if (e->target_offset + sizeof(struct xt_entry_target) > e->next_offset)
return -EINVAL;
@@ -522,10 +520,6 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
struct xt_target *target;
int ret;
- ret = check_entry(e, name);
- if (ret)
- return ret;
-
e->counters.pcnt = xt_percpu_counter_alloc();
if (IS_ERR_VALUE(e->counters.pcnt))
return -ENOMEM;
@@ -557,7 +551,7 @@ static bool check_underflow(const struct arpt_entry *e)
const struct xt_entry_target *t;
unsigned int verdict;
- if (!unconditional(&e->arp))
+ if (!unconditional(e))
return false;
t = arpt_get_target_c(e);
if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0)
@@ -576,9 +570,11 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e,
unsigned int valid_hooks)
{
unsigned int h;
+ int err;
if ((unsigned long)e % __alignof__(struct arpt_entry) != 0 ||
- (unsigned char *)e + sizeof(struct arpt_entry) >= limit) {
+ (unsigned char *)e + sizeof(struct arpt_entry) >= limit ||
+ (unsigned char *)e + e->next_offset > limit) {
duprintf("Bad offset %p\n", e);
return -EINVAL;
}
@@ -590,6 +586,10 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e,
return -EINVAL;
}
+ err = check_entry(e);
+ if (err)
+ return err;
+
/* Check hooks & underflows */
for (h = 0; h < NF_ARP_NUMHOOKS; h++) {
if (!(valid_hooks & (1 << h)))
@@ -598,9 +598,9 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e,
newinfo->hook_entry[h] = hook_entries[h];
if ((unsigned char *)e - base == underflows[h]) {
if (!check_underflow(e)) {
- pr_err("Underflows must be unconditional and "
- "use the STANDARD target with "
- "ACCEPT/DROP\n");
+ pr_debug("Underflows must be unconditional and "
+ "use the STANDARD target with "
+ "ACCEPT/DROP\n");
return -EINVAL;
}
newinfo->underflow[h] = underflows[h];
@@ -969,6 +969,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr,
sizeof(struct arpt_get_entries) + get.size);
return -EINVAL;
}
+ get.name[sizeof(get.name) - 1] = '\0';
t = xt_find_table_lock(net, NFPROTO_ARP, get.name);
if (!IS_ERR_OR_NULL(t)) {
@@ -1233,7 +1234,8 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
duprintf("check_compat_entry_size_and_hooks %p\n", e);
if ((unsigned long)e % __alignof__(struct compat_arpt_entry) != 0 ||
- (unsigned char *)e + sizeof(struct compat_arpt_entry) >= limit) {
+ (unsigned char *)e + sizeof(struct compat_arpt_entry) >= limit ||
+ (unsigned char *)e + e->next_offset > limit) {
duprintf("Bad offset %p, limit = %p\n", e, limit);
return -EINVAL;
}
@@ -1246,7 +1248,7 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
}
/* For purposes of check_entry casting the compat entry is fine */
- ret = check_entry((struct arpt_entry *)e, name);
+ ret = check_entry((struct arpt_entry *)e);
if (ret)
return ret;
@@ -1662,6 +1664,7 @@ static int compat_get_entries(struct net *net,
*len, sizeof(get) + get.size);
return -EINVAL;
}
+ get.name[sizeof(get.name) - 1] = '\0';
xt_compat_lock(NFPROTO_ARP);
t = xt_find_table_lock(net, NFPROTO_ARP, get.name);
@@ -1780,9 +1783,29 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len
return ret;
}
-struct xt_table *arpt_register_table(struct net *net,
- const struct xt_table *table,
- const struct arpt_replace *repl)
+static void __arpt_unregister_table(struct xt_table *table)
+{
+ struct xt_table_info *private;
+ void *loc_cpu_entry;
+ struct module *table_owner = table->me;
+ struct arpt_entry *iter;
+
+ private = xt_unregister_table(table);
+
+ /* Decrease module usage counts and free resources */
+ loc_cpu_entry = private->entries;
+ xt_entry_foreach(iter, loc_cpu_entry, private->size)
+ cleanup_entry(iter);
+ if (private->number > private->initial_entries)
+ module_put(table_owner);
+ xt_free_table_info(private);
+}
+
+int arpt_register_table(struct net *net,
+ const struct xt_table *table,
+ const struct arpt_replace *repl,
+ const struct nf_hook_ops *ops,
+ struct xt_table **res)
{
int ret;
struct xt_table_info *newinfo;
@@ -1791,10 +1814,8 @@ struct xt_table *arpt_register_table(struct net *net,
struct xt_table *new_table;
newinfo = xt_alloc_table_info(repl->size);
- if (!newinfo) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!newinfo)
+ return -ENOMEM;
loc_cpu_entry = newinfo->entries;
memcpy(loc_cpu_entry, repl->entries, repl->size);
@@ -1809,30 +1830,28 @@ struct xt_table *arpt_register_table(struct net *net,
ret = PTR_ERR(new_table);
goto out_free;
}
- return new_table;
+
+ /* set res now, will see skbs right after nf_register_net_hooks */
+ WRITE_ONCE(*res, new_table);
+
+ ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks));
+ if (ret != 0) {
+ __arpt_unregister_table(new_table);
+ *res = NULL;
+ }
+
+ return ret;
out_free:
xt_free_table_info(newinfo);
-out:
- return ERR_PTR(ret);
+ return ret;
}
-void arpt_unregister_table(struct xt_table *table)
+void arpt_unregister_table(struct net *net, struct xt_table *table,
+ const struct nf_hook_ops *ops)
{
- struct xt_table_info *private;
- void *loc_cpu_entry;
- struct module *table_owner = table->me;
- struct arpt_entry *iter;
-
- private = xt_unregister_table(table);
-
- /* Decrease module usage counts and free resources */
- loc_cpu_entry = private->entries;
- xt_entry_foreach(iter, loc_cpu_entry, private->size)
- cleanup_entry(iter);
- if (private->number > private->initial_entries)
- module_put(table_owner);
- xt_free_table_info(private);
+ nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
+ __arpt_unregister_table(table);
}
/* The built-in targets: standard (NULL) and error. */
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
index 1897ee160..8f8713b43 100644
--- a/net/ipv4/netfilter/arptable_filter.c
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -17,12 +17,15 @@ MODULE_DESCRIPTION("arptables filter table");
#define FILTER_VALID_HOOKS ((1 << NF_ARP_IN) | (1 << NF_ARP_OUT) | \
(1 << NF_ARP_FORWARD))
+static int __net_init arptable_filter_table_init(struct net *net);
+
static const struct xt_table packet_filter = {
.name = "filter",
.valid_hooks = FILTER_VALID_HOOKS,
.me = THIS_MODULE,
.af = NFPROTO_ARP,
.priority = NF_IP_PRI_FILTER,
+ .table_init = arptable_filter_table_init,
};
/* The work comes in here from netfilter.c */
@@ -35,26 +38,32 @@ arptable_filter_hook(void *priv, struct sk_buff *skb,
static struct nf_hook_ops *arpfilter_ops __read_mostly;
-static int __net_init arptable_filter_net_init(struct net *net)
+static int __net_init arptable_filter_table_init(struct net *net)
{
struct arpt_replace *repl;
-
+ int err;
+
+ if (net->ipv4.arptable_filter)
+ return 0;
+
repl = arpt_alloc_initial_table(&packet_filter);
if (repl == NULL)
return -ENOMEM;
- net->ipv4.arptable_filter =
- arpt_register_table(net, &packet_filter, repl);
+ err = arpt_register_table(net, &packet_filter, repl, arpfilter_ops,
+ &net->ipv4.arptable_filter);
kfree(repl);
- return PTR_ERR_OR_ZERO(net->ipv4.arptable_filter);
+ return err;
}
static void __net_exit arptable_filter_net_exit(struct net *net)
{
- arpt_unregister_table(net->ipv4.arptable_filter);
+ if (!net->ipv4.arptable_filter)
+ return;
+ arpt_unregister_table(net, net->ipv4.arptable_filter, arpfilter_ops);
+ net->ipv4.arptable_filter = NULL;
}
static struct pernet_operations arptable_filter_net_ops = {
- .init = arptable_filter_net_init,
.exit = arptable_filter_net_exit,
};
@@ -62,26 +71,29 @@ static int __init arptable_filter_init(void)
{
int ret;
+ arpfilter_ops = xt_hook_ops_alloc(&packet_filter, arptable_filter_hook);
+ if (IS_ERR(arpfilter_ops))
+ return PTR_ERR(arpfilter_ops);
+
ret = register_pernet_subsys(&arptable_filter_net_ops);
- if (ret < 0)
+ if (ret < 0) {
+ kfree(arpfilter_ops);
return ret;
+ }
- arpfilter_ops = xt_hook_link(&packet_filter, arptable_filter_hook);
- if (IS_ERR(arpfilter_ops)) {
- ret = PTR_ERR(arpfilter_ops);
- goto cleanup_table;
+ ret = arptable_filter_table_init(&init_net);
+ if (ret) {
+ unregister_pernet_subsys(&arptable_filter_net_ops);
+ kfree(arpfilter_ops);
}
- return ret;
-cleanup_table:
- unregister_pernet_subsys(&arptable_filter_net_ops);
return ret;
}
static void __exit arptable_filter_fini(void)
{
- xt_hook_unlink(&packet_filter, arpfilter_ops);
unregister_pernet_subsys(&arptable_filter_net_ops);
+ kfree(arpfilter_ops);
}
module_init(arptable_filter_init);
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index b99affad6..631c100a1 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -168,11 +168,12 @@ get_entry(const void *base, unsigned int offset)
/* All zeroes == unconditional rule. */
/* Mildly perf critical (only if packet tracing is on) */
-static inline bool unconditional(const struct ipt_ip *ip)
+static inline bool unconditional(const struct ipt_entry *e)
{
static const struct ipt_ip uncond;
- return memcmp(ip, &uncond, sizeof(uncond)) == 0;
+ return e->target_offset == sizeof(struct ipt_entry) &&
+ memcmp(&e->ip, &uncond, sizeof(uncond)) == 0;
#undef FWINV
}
@@ -229,11 +230,10 @@ get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e,
} else if (s == e) {
(*rulenum)++;
- if (s->target_offset == sizeof(struct ipt_entry) &&
+ if (unconditional(s) &&
strcmp(t->target.u.kernel.target->name,
XT_STANDARD_TARGET) == 0 &&
- t->verdict < 0 &&
- unconditional(&s->ip)) {
+ t->verdict < 0) {
/* Tail of chains: STANDARD target (return/policy) */
*comment = *chainname == hookname
? comments[NF_IP_TRACE_COMMENT_POLICY]
@@ -476,11 +476,10 @@ mark_source_chains(const struct xt_table_info *newinfo,
e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS));
/* Unconditional return/END. */
- if ((e->target_offset == sizeof(struct ipt_entry) &&
+ if ((unconditional(e) &&
(strcmp(t->target.u.user.name,
XT_STANDARD_TARGET) == 0) &&
- t->verdict < 0 && unconditional(&e->ip)) ||
- visited) {
+ t->verdict < 0) || visited) {
unsigned int oldpos, size;
if ((strcmp(t->target.u.user.name,
@@ -569,14 +568,12 @@ static void cleanup_match(struct xt_entry_match *m, struct net *net)
}
static int
-check_entry(const struct ipt_entry *e, const char *name)
+check_entry(const struct ipt_entry *e)
{
const struct xt_entry_target *t;
- if (!ip_checkentry(&e->ip)) {
- duprintf("ip check failed %p %s.\n", e, name);
+ if (!ip_checkentry(&e->ip))
return -EINVAL;
- }
if (e->target_offset + sizeof(struct xt_entry_target) >
e->next_offset)
@@ -666,10 +663,6 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
struct xt_mtchk_param mtpar;
struct xt_entry_match *ematch;
- ret = check_entry(e, name);
- if (ret)
- return ret;
-
e->counters.pcnt = xt_percpu_counter_alloc();
if (IS_ERR_VALUE(e->counters.pcnt))
return -ENOMEM;
@@ -721,7 +714,7 @@ static bool check_underflow(const struct ipt_entry *e)
const struct xt_entry_target *t;
unsigned int verdict;
- if (!unconditional(&e->ip))
+ if (!unconditional(e))
return false;
t = ipt_get_target_c(e);
if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0)
@@ -741,9 +734,11 @@ check_entry_size_and_hooks(struct ipt_entry *e,
unsigned int valid_hooks)
{
unsigned int h;
+ int err;
if ((unsigned long)e % __alignof__(struct ipt_entry) != 0 ||
- (unsigned char *)e + sizeof(struct ipt_entry) >= limit) {
+ (unsigned char *)e + sizeof(struct ipt_entry) >= limit ||
+ (unsigned char *)e + e->next_offset > limit) {
duprintf("Bad offset %p\n", e);
return -EINVAL;
}
@@ -755,6 +750,10 @@ check_entry_size_and_hooks(struct ipt_entry *e,
return -EINVAL;
}
+ err = check_entry(e);
+ if (err)
+ return err;
+
/* Check hooks & underflows */
for (h = 0; h < NF_INET_NUMHOOKS; h++) {
if (!(valid_hooks & (1 << h)))
@@ -763,9 +762,9 @@ check_entry_size_and_hooks(struct ipt_entry *e,
newinfo->hook_entry[h] = hook_entries[h];
if ((unsigned char *)e - base == underflows[h]) {
if (!check_underflow(e)) {
- pr_err("Underflows must be unconditional and "
- "use the STANDARD target with "
- "ACCEPT/DROP\n");
+ pr_debug("Underflows must be unconditional and "
+ "use the STANDARD target with "
+ "ACCEPT/DROP\n");
return -EINVAL;
}
newinfo->underflow[h] = underflows[h];
@@ -1157,6 +1156,7 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr,
*len, sizeof(get) + get.size);
return -EINVAL;
}
+ get.name[sizeof(get.name) - 1] = '\0';
t = xt_find_table_lock(net, AF_INET, get.name);
if (!IS_ERR_OR_NULL(t)) {
@@ -1493,7 +1493,8 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
duprintf("check_compat_entry_size_and_hooks %p\n", e);
if ((unsigned long)e % __alignof__(struct compat_ipt_entry) != 0 ||
- (unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit) {
+ (unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit ||
+ (unsigned char *)e + e->next_offset > limit) {
duprintf("Bad offset %p, limit = %p\n", e, limit);
return -EINVAL;
}
@@ -1506,7 +1507,7 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
}
/* For purposes of check_entry casting the compat entry is fine */
- ret = check_entry((struct ipt_entry *)e, name);
+ ret = check_entry((struct ipt_entry *)e);
if (ret)
return ret;
@@ -1935,6 +1936,7 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr,
*len, sizeof(get) + get.size);
return -EINVAL;
}
+ get.name[sizeof(get.name) - 1] = '\0';
xt_compat_lock(AF_INET);
t = xt_find_table_lock(net, AF_INET, get.name);
@@ -2062,9 +2064,27 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
return ret;
}
-struct xt_table *ipt_register_table(struct net *net,
- const struct xt_table *table,
- const struct ipt_replace *repl)
+static void __ipt_unregister_table(struct net *net, struct xt_table *table)
+{
+ struct xt_table_info *private;
+ void *loc_cpu_entry;
+ struct module *table_owner = table->me;
+ struct ipt_entry *iter;
+
+ private = xt_unregister_table(table);
+
+ /* Decrease module usage counts and free resources */
+ loc_cpu_entry = private->entries;
+ xt_entry_foreach(iter, loc_cpu_entry, private->size)
+ cleanup_entry(iter, net);
+ if (private->number > private->initial_entries)
+ module_put(table_owner);
+ xt_free_table_info(private);
+}
+
+int ipt_register_table(struct net *net, const struct xt_table *table,
+ const struct ipt_replace *repl,
+ const struct nf_hook_ops *ops, struct xt_table **res)
{
int ret;
struct xt_table_info *newinfo;
@@ -2073,10 +2093,8 @@ struct xt_table *ipt_register_table(struct net *net,
struct xt_table *new_table;
newinfo = xt_alloc_table_info(repl->size);
- if (!newinfo) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!newinfo)
+ return -ENOMEM;
loc_cpu_entry = newinfo->entries;
memcpy(loc_cpu_entry, repl->entries, repl->size);
@@ -2091,30 +2109,27 @@ struct xt_table *ipt_register_table(struct net *net,
goto out_free;
}
- return new_table;
+ /* set res now, will see skbs right after nf_register_net_hooks */
+ WRITE_ONCE(*res, new_table);
+
+ ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks));
+ if (ret != 0) {
+ __ipt_unregister_table(net, new_table);
+ *res = NULL;
+ }
+
+ return ret;
out_free:
xt_free_table_info(newinfo);
-out:
- return ERR_PTR(ret);
+ return ret;
}
-void ipt_unregister_table(struct net *net, struct xt_table *table)
+void ipt_unregister_table(struct net *net, struct xt_table *table,
+ const struct nf_hook_ops *ops)
{
- struct xt_table_info *private;
- void *loc_cpu_entry;
- struct module *table_owner = table->me;
- struct ipt_entry *iter;
-
- private = xt_unregister_table(table);
-
- /* Decrease module usage counts and free resources */
- loc_cpu_entry = private->entries;
- xt_entry_foreach(iter, loc_cpu_entry, private->size)
- cleanup_entry(iter, net);
- if (private->number > private->initial_entries)
- module_put(table_owner);
- xt_free_table_info(private);
+ nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
+ __ipt_unregister_table(net, table);
}
/* Returns 1 if the type and code is matched by the range, 0 otherwise */
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
index 5fdc55651..db5b87509 100644
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -18,7 +18,8 @@
#include <net/netfilter/nf_conntrack_synproxy.h>
static struct iphdr *
-synproxy_build_ip(struct sk_buff *skb, __be32 saddr, __be32 daddr)
+synproxy_build_ip(struct net *net, struct sk_buff *skb, __be32 saddr,
+ __be32 daddr)
{
struct iphdr *iph;
@@ -29,7 +30,7 @@ synproxy_build_ip(struct sk_buff *skb, __be32 saddr, __be32 daddr)
iph->tos = 0;
iph->id = 0;
iph->frag_off = htons(IP_DF);
- iph->ttl = sysctl_ip_default_ttl;
+ iph->ttl = net->ipv4.sysctl_ip_default_ttl;
iph->protocol = IPPROTO_TCP;
iph->check = 0;
iph->saddr = saddr;
@@ -39,14 +40,12 @@ synproxy_build_ip(struct sk_buff *skb, __be32 saddr, __be32 daddr)
}
static void
-synproxy_send_tcp(const struct synproxy_net *snet,
+synproxy_send_tcp(struct net *net,
const struct sk_buff *skb, struct sk_buff *nskb,
struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo,
struct iphdr *niph, struct tcphdr *nth,
unsigned int tcp_hdr_size)
{
- struct net *net = nf_ct_net(snet->tmpl);
-
nth->check = ~tcp_v4_check(tcp_hdr_size, niph->saddr, niph->daddr, 0);
nskb->ip_summed = CHECKSUM_PARTIAL;
nskb->csum_start = (unsigned char *)nth - nskb->head;
@@ -71,7 +70,7 @@ free_nskb:
}
static void
-synproxy_send_client_synack(const struct synproxy_net *snet,
+synproxy_send_client_synack(struct net *net,
const struct sk_buff *skb, const struct tcphdr *th,
const struct synproxy_options *opts)
{
@@ -90,7 +89,7 @@ synproxy_send_client_synack(const struct synproxy_net *snet,
return;
skb_reserve(nskb, MAX_TCP_HEADER);
- niph = synproxy_build_ip(nskb, iph->daddr, iph->saddr);
+ niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr);
skb_reset_transport_header(nskb);
nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
@@ -108,15 +107,16 @@ synproxy_send_client_synack(const struct synproxy_net *snet,
synproxy_build_options(nth, opts);
- synproxy_send_tcp(snet, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
+ synproxy_send_tcp(net, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
niph, nth, tcp_hdr_size);
}
static void
-synproxy_send_server_syn(const struct synproxy_net *snet,
+synproxy_send_server_syn(struct net *net,
const struct sk_buff *skb, const struct tcphdr *th,
const struct synproxy_options *opts, u32 recv_seq)
{
+ struct synproxy_net *snet = synproxy_pernet(net);
struct sk_buff *nskb;
struct iphdr *iph, *niph;
struct tcphdr *nth;
@@ -131,7 +131,7 @@ synproxy_send_server_syn(const struct synproxy_net *snet,
return;
skb_reserve(nskb, MAX_TCP_HEADER);
- niph = synproxy_build_ip(nskb, iph->saddr, iph->daddr);
+ niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr);
skb_reset_transport_header(nskb);
nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
@@ -152,12 +152,12 @@ synproxy_send_server_syn(const struct synproxy_net *snet,
synproxy_build_options(nth, opts);
- synproxy_send_tcp(snet, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW,
+ synproxy_send_tcp(net, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW,
niph, nth, tcp_hdr_size);
}
static void
-synproxy_send_server_ack(const struct synproxy_net *snet,
+synproxy_send_server_ack(struct net *net,
const struct ip_ct_tcp *state,
const struct sk_buff *skb, const struct tcphdr *th,
const struct synproxy_options *opts)
@@ -176,7 +176,7 @@ synproxy_send_server_ack(const struct synproxy_net *snet,
return;
skb_reserve(nskb, MAX_TCP_HEADER);
- niph = synproxy_build_ip(nskb, iph->daddr, iph->saddr);
+ niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr);
skb_reset_transport_header(nskb);
nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
@@ -192,11 +192,11 @@ synproxy_send_server_ack(const struct synproxy_net *snet,
synproxy_build_options(nth, opts);
- synproxy_send_tcp(snet, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
+ synproxy_send_tcp(net, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
}
static void
-synproxy_send_client_ack(const struct synproxy_net *snet,
+synproxy_send_client_ack(struct net *net,
const struct sk_buff *skb, const struct tcphdr *th,
const struct synproxy_options *opts)
{
@@ -214,7 +214,7 @@ synproxy_send_client_ack(const struct synproxy_net *snet,
return;
skb_reserve(nskb, MAX_TCP_HEADER);
- niph = synproxy_build_ip(nskb, iph->saddr, iph->daddr);
+ niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr);
skb_reset_transport_header(nskb);
nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
@@ -230,15 +230,16 @@ synproxy_send_client_ack(const struct synproxy_net *snet,
synproxy_build_options(nth, opts);
- synproxy_send_tcp(snet, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
+ synproxy_send_tcp(net, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
niph, nth, tcp_hdr_size);
}
static bool
-synproxy_recv_client_ack(const struct synproxy_net *snet,
+synproxy_recv_client_ack(struct net *net,
const struct sk_buff *skb, const struct tcphdr *th,
struct synproxy_options *opts, u32 recv_seq)
{
+ struct synproxy_net *snet = synproxy_pernet(net);
int mss;
mss = __cookie_v4_check(ip_hdr(skb), th, ntohl(th->ack_seq) - 1);
@@ -254,7 +255,7 @@ synproxy_recv_client_ack(const struct synproxy_net *snet,
if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP)
synproxy_check_timestamp_cookie(opts);
- synproxy_send_server_syn(snet, skb, th, opts, recv_seq);
+ synproxy_send_server_syn(net, skb, th, opts, recv_seq);
return true;
}
@@ -262,7 +263,8 @@ static unsigned int
synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_synproxy_info *info = par->targinfo;
- struct synproxy_net *snet = synproxy_pernet(par->net);
+ struct net *net = par->net;
+ struct synproxy_net *snet = synproxy_pernet(net);
struct synproxy_options opts = {};
struct tcphdr *th, _th;
@@ -291,12 +293,12 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
XT_SYNPROXY_OPT_SACK_PERM |
XT_SYNPROXY_OPT_ECN);
- synproxy_send_client_synack(snet, skb, th, &opts);
+ synproxy_send_client_synack(net, skb, th, &opts);
return NF_DROP;
} else if (th->ack && !(th->fin || th->rst || th->syn)) {
/* ACK from client */
- synproxy_recv_client_ack(snet, skb, th, &opts, ntohl(th->seq));
+ synproxy_recv_client_ack(net, skb, th, &opts, ntohl(th->seq));
return NF_DROP;
}
@@ -307,7 +309,8 @@ static unsigned int ipv4_synproxy_hook(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *nhs)
{
- struct synproxy_net *snet = synproxy_pernet(nhs->net);
+ struct net *net = nhs->net;
+ struct synproxy_net *snet = synproxy_pernet(net);
enum ip_conntrack_info ctinfo;
struct nf_conn *ct;
struct nf_conn_synproxy *synproxy;
@@ -364,7 +367,7 @@ static unsigned int ipv4_synproxy_hook(void *priv,
* therefore we need to add 1 to make the SYN sequence
* number match the one of first SYN.
*/
- if (synproxy_recv_client_ack(snet, skb, th, &opts,
+ if (synproxy_recv_client_ack(net, skb, th, &opts,
ntohl(th->seq) + 1))
this_cpu_inc(snet->stats->cookie_retrans);
@@ -390,12 +393,12 @@ static unsigned int ipv4_synproxy_hook(void *priv,
XT_SYNPROXY_OPT_SACK_PERM);
swap(opts.tsval, opts.tsecr);
- synproxy_send_server_ack(snet, state, skb, th, &opts);
+ synproxy_send_server_ack(net, state, skb, th, &opts);
nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq));
swap(opts.tsval, opts.tsecr);
- synproxy_send_client_ack(snet, skb, th, &opts);
+ synproxy_send_client_ack(net, skb, th, &opts);
consume_skb(skb);
return NF_STOLEN;
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index 397ef2dd1..7667f223d 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -23,6 +23,7 @@ MODULE_DESCRIPTION("iptables filter table");
#define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \
(1 << NF_INET_FORWARD) | \
(1 << NF_INET_LOCAL_OUT))
+static int __net_init iptable_filter_table_init(struct net *net);
static const struct xt_table packet_filter = {
.name = "filter",
@@ -30,6 +31,7 @@ static const struct xt_table packet_filter = {
.me = THIS_MODULE,
.af = NFPROTO_IPV4,
.priority = NF_IP_PRI_FILTER,
+ .table_init = iptable_filter_table_init,
};
static unsigned int
@@ -48,12 +50,16 @@ iptable_filter_hook(void *priv, struct sk_buff *skb,
static struct nf_hook_ops *filter_ops __read_mostly;
/* Default to forward because I got too much mail already. */
-static bool forward = true;
+static bool forward __read_mostly = true;
module_param(forward, bool, 0000);
-static int __net_init iptable_filter_net_init(struct net *net)
+static int __net_init iptable_filter_table_init(struct net *net)
{
struct ipt_replace *repl;
+ int err;
+
+ if (net->ipv4.iptable_filter)
+ return 0;
repl = ipt_alloc_initial_table(&packet_filter);
if (repl == NULL)
@@ -62,15 +68,26 @@ static int __net_init iptable_filter_net_init(struct net *net)
((struct ipt_standard *)repl->entries)[1].target.verdict =
forward ? -NF_ACCEPT - 1 : -NF_DROP - 1;
- net->ipv4.iptable_filter =
- ipt_register_table(net, &packet_filter, repl);
+ err = ipt_register_table(net, &packet_filter, repl, filter_ops,
+ &net->ipv4.iptable_filter);
kfree(repl);
- return PTR_ERR_OR_ZERO(net->ipv4.iptable_filter);
+ return err;
+}
+
+static int __net_init iptable_filter_net_init(struct net *net)
+{
+ if (net == &init_net || !forward)
+ return iptable_filter_table_init(net);
+
+ return 0;
}
static void __net_exit iptable_filter_net_exit(struct net *net)
{
- ipt_unregister_table(net, net->ipv4.iptable_filter);
+ if (!net->ipv4.iptable_filter)
+ return;
+ ipt_unregister_table(net, net->ipv4.iptable_filter, filter_ops);
+ net->ipv4.iptable_filter = NULL;
}
static struct pernet_operations iptable_filter_net_ops = {
@@ -82,24 +99,21 @@ static int __init iptable_filter_init(void)
{
int ret;
+ filter_ops = xt_hook_ops_alloc(&packet_filter, iptable_filter_hook);
+ if (IS_ERR(filter_ops))
+ return PTR_ERR(filter_ops);
+
ret = register_pernet_subsys(&iptable_filter_net_ops);
if (ret < 0)
- return ret;
-
- /* Register hooks */
- filter_ops = xt_hook_link(&packet_filter, iptable_filter_hook);
- if (IS_ERR(filter_ops)) {
- ret = PTR_ERR(filter_ops);
- unregister_pernet_subsys(&iptable_filter_net_ops);
- }
+ kfree(filter_ops);
return ret;
}
static void __exit iptable_filter_fini(void)
{
- xt_hook_unlink(&packet_filter, filter_ops);
unregister_pernet_subsys(&iptable_filter_net_ops);
+ kfree(filter_ops);
}
module_init(iptable_filter_init);
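
The iptable_filter hunks above, and the mangle/nat/raw/security hunks that follow, converge on one registration pattern: the hook ops are allocated once at module init with xt_hook_ops_alloc(), the per-net exit path unregisters only if the table was ever created, and table creation moves behind the new .table_init callback so it can run lazily per namespace. A condensed sketch with placeholder "example_*" names (each real table keeps its own xt_table and hook function):

static struct nf_hook_ops *example_ops __read_mostly;

static int __net_init example_table_init(struct net *net)
{
	struct ipt_replace *repl;
	int ret;

	if (net->ipv4.iptable_filter)		/* already initialised */
		return 0;

	repl = ipt_alloc_initial_table(&packet_filter);
	if (!repl)
		return -ENOMEM;
	/* ipt_register_table() now takes the ops and an out-pointer */
	ret = ipt_register_table(net, &packet_filter, repl, example_ops,
				 &net->ipv4.iptable_filter);
	kfree(repl);
	return ret;
}

static int __init example_module_init(void)
{
	int ret;

	example_ops = xt_hook_ops_alloc(&packet_filter, iptable_filter_hook);
	if (IS_ERR(example_ops))
		return PTR_ERR(example_ops);

	ret = register_pernet_subsys(&iptable_filter_net_ops);
	if (ret < 0)
		kfree(example_ops);
	return ret;
}
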
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index ba5d392a1..57fc97cda 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -28,12 +28,15 @@ MODULE_DESCRIPTION("iptables mangle table");
(1 << NF_INET_LOCAL_OUT) | \
(1 << NF_INET_POST_ROUTING))
+static int __net_init iptable_mangle_table_init(struct net *net);
+
static const struct xt_table packet_mangler = {
.name = "mangle",
.valid_hooks = MANGLE_VALID_HOOKS,
.me = THIS_MODULE,
.af = NFPROTO_IPV4,
.priority = NF_IP_PRI_MANGLE,
+ .table_init = iptable_mangle_table_init,
};
static unsigned int
@@ -92,27 +95,32 @@ iptable_mangle_hook(void *priv,
}
static struct nf_hook_ops *mangle_ops __read_mostly;
-
-static int __net_init iptable_mangle_net_init(struct net *net)
+static int __net_init iptable_mangle_table_init(struct net *net)
{
struct ipt_replace *repl;
+ int ret;
+
+ if (net->ipv4.iptable_mangle)
+ return 0;
repl = ipt_alloc_initial_table(&packet_mangler);
if (repl == NULL)
return -ENOMEM;
- net->ipv4.iptable_mangle =
- ipt_register_table(net, &packet_mangler, repl);
+ ret = ipt_register_table(net, &packet_mangler, repl, mangle_ops,
+ &net->ipv4.iptable_mangle);
kfree(repl);
- return PTR_ERR_OR_ZERO(net->ipv4.iptable_mangle);
+ return ret;
}
static void __net_exit iptable_mangle_net_exit(struct net *net)
{
- ipt_unregister_table(net, net->ipv4.iptable_mangle);
+ if (!net->ipv4.iptable_mangle)
+ return;
+ ipt_unregister_table(net, net->ipv4.iptable_mangle, mangle_ops);
+ net->ipv4.iptable_mangle = NULL;
}
static struct pernet_operations iptable_mangle_net_ops = {
- .init = iptable_mangle_net_init,
.exit = iptable_mangle_net_exit,
};
@@ -120,15 +128,22 @@ static int __init iptable_mangle_init(void)
{
int ret;
+ mangle_ops = xt_hook_ops_alloc(&packet_mangler, iptable_mangle_hook);
+ if (IS_ERR(mangle_ops)) {
+ ret = PTR_ERR(mangle_ops);
+ return ret;
+ }
+
ret = register_pernet_subsys(&iptable_mangle_net_ops);
- if (ret < 0)
+ if (ret < 0) {
+ kfree(mangle_ops);
return ret;
+ }
- /* Register hooks */
- mangle_ops = xt_hook_link(&packet_mangler, iptable_mangle_hook);
- if (IS_ERR(mangle_ops)) {
- ret = PTR_ERR(mangle_ops);
+ ret = iptable_mangle_table_init(&init_net);
+ if (ret) {
unregister_pernet_subsys(&iptable_mangle_net_ops);
+ kfree(mangle_ops);
}
return ret;
@@ -136,8 +151,8 @@ static int __init iptable_mangle_init(void)
static void __exit iptable_mangle_fini(void)
{
- xt_hook_unlink(&packet_mangler, mangle_ops);
unregister_pernet_subsys(&iptable_mangle_net_ops);
+ kfree(mangle_ops);
}
module_init(iptable_mangle_init);
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index ae2cd2752..138a24bc7 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -18,6 +18,8 @@
#include <net/netfilter/nf_nat_core.h>
#include <net/netfilter/nf_nat_l3proto.h>
+static int __net_init iptable_nat_table_init(struct net *net);
+
static const struct xt_table nf_nat_ipv4_table = {
.name = "nat",
.valid_hooks = (1 << NF_INET_PRE_ROUTING) |
@@ -26,6 +28,7 @@ static const struct xt_table nf_nat_ipv4_table = {
(1 << NF_INET_LOCAL_IN),
.me = THIS_MODULE,
.af = NFPROTO_IPV4,
+ .table_init = iptable_nat_table_init,
};
static unsigned int iptable_nat_do_chain(void *priv,
@@ -95,50 +98,50 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
},
};
-static int __net_init iptable_nat_net_init(struct net *net)
+static int __net_init iptable_nat_table_init(struct net *net)
{
struct ipt_replace *repl;
+ int ret;
+
+ if (net->ipv4.nat_table)
+ return 0;
repl = ipt_alloc_initial_table(&nf_nat_ipv4_table);
if (repl == NULL)
return -ENOMEM;
- net->ipv4.nat_table = ipt_register_table(net, &nf_nat_ipv4_table, repl);
+ ret = ipt_register_table(net, &nf_nat_ipv4_table, repl,
+ nf_nat_ipv4_ops, &net->ipv4.nat_table);
kfree(repl);
- return PTR_ERR_OR_ZERO(net->ipv4.nat_table);
+ return ret;
}
static void __net_exit iptable_nat_net_exit(struct net *net)
{
- ipt_unregister_table(net, net->ipv4.nat_table);
+ if (!net->ipv4.nat_table)
+ return;
+ ipt_unregister_table(net, net->ipv4.nat_table, nf_nat_ipv4_ops);
+ net->ipv4.nat_table = NULL;
}
static struct pernet_operations iptable_nat_net_ops = {
- .init = iptable_nat_net_init,
.exit = iptable_nat_net_exit,
};
static int __init iptable_nat_init(void)
{
- int err;
+ int ret = register_pernet_subsys(&iptable_nat_net_ops);
- err = register_pernet_subsys(&iptable_nat_net_ops);
- if (err < 0)
- goto err1;
+ if (ret)
+ return ret;
- err = nf_register_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops));
- if (err < 0)
- goto err2;
- return 0;
-
-err2:
- unregister_pernet_subsys(&iptable_nat_net_ops);
-err1:
- return err;
+ ret = iptable_nat_table_init(&init_net);
+ if (ret)
+ unregister_pernet_subsys(&iptable_nat_net_ops);
+ return ret;
}
static void __exit iptable_nat_exit(void)
{
- nf_unregister_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops));
unregister_pernet_subsys(&iptable_nat_net_ops);
}
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index 1ba02811a..2642ecd26 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -10,12 +10,15 @@
#define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT))
+static int __net_init iptable_raw_table_init(struct net *net);
+
static const struct xt_table packet_raw = {
.name = "raw",
.valid_hooks = RAW_VALID_HOOKS,
.me = THIS_MODULE,
.af = NFPROTO_IPV4,
.priority = NF_IP_PRI_RAW,
+ .table_init = iptable_raw_table_init,
};
/* The work comes in here from netfilter.c. */
@@ -34,26 +37,32 @@ iptable_raw_hook(void *priv, struct sk_buff *skb,
static struct nf_hook_ops *rawtable_ops __read_mostly;
-static int __net_init iptable_raw_net_init(struct net *net)
+static int __net_init iptable_raw_table_init(struct net *net)
{
struct ipt_replace *repl;
+ int ret;
+
+ if (net->ipv4.iptable_raw)
+ return 0;
repl = ipt_alloc_initial_table(&packet_raw);
if (repl == NULL)
return -ENOMEM;
- net->ipv4.iptable_raw =
- ipt_register_table(net, &packet_raw, repl);
+ ret = ipt_register_table(net, &packet_raw, repl, rawtable_ops,
+ &net->ipv4.iptable_raw);
kfree(repl);
- return PTR_ERR_OR_ZERO(net->ipv4.iptable_raw);
+ return ret;
}
static void __net_exit iptable_raw_net_exit(struct net *net)
{
- ipt_unregister_table(net, net->ipv4.iptable_raw);
+ if (!net->ipv4.iptable_raw)
+ return;
+ ipt_unregister_table(net, net->ipv4.iptable_raw, rawtable_ops);
+ net->ipv4.iptable_raw = NULL;
}
static struct pernet_operations iptable_raw_net_ops = {
- .init = iptable_raw_net_init,
.exit = iptable_raw_net_exit,
};
@@ -61,15 +70,20 @@ static int __init iptable_raw_init(void)
{
int ret;
+ rawtable_ops = xt_hook_ops_alloc(&packet_raw, iptable_raw_hook);
+ if (IS_ERR(rawtable_ops))
+ return PTR_ERR(rawtable_ops);
+
ret = register_pernet_subsys(&iptable_raw_net_ops);
- if (ret < 0)
+ if (ret < 0) {
+ kfree(rawtable_ops);
return ret;
+ }
- /* Register hooks */
- rawtable_ops = xt_hook_link(&packet_raw, iptable_raw_hook);
- if (IS_ERR(rawtable_ops)) {
- ret = PTR_ERR(rawtable_ops);
+ ret = iptable_raw_table_init(&init_net);
+ if (ret) {
unregister_pernet_subsys(&iptable_raw_net_ops);
+ kfree(rawtable_ops);
}
return ret;
@@ -77,8 +91,8 @@ static int __init iptable_raw_init(void)
static void __exit iptable_raw_fini(void)
{
- xt_hook_unlink(&packet_raw, rawtable_ops);
unregister_pernet_subsys(&iptable_raw_net_ops);
+ kfree(rawtable_ops);
}
module_init(iptable_raw_init);
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
index c2e23d5e9..ff226596e 100644
--- a/net/ipv4/netfilter/iptable_security.c
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -28,12 +28,15 @@ MODULE_DESCRIPTION("iptables security table, for MAC rules");
(1 << NF_INET_FORWARD) | \
(1 << NF_INET_LOCAL_OUT)
+static int __net_init iptable_security_table_init(struct net *net);
+
static const struct xt_table security_table = {
.name = "security",
.valid_hooks = SECURITY_VALID_HOOKS,
.me = THIS_MODULE,
.af = NFPROTO_IPV4,
.priority = NF_IP_PRI_SECURITY,
+ .table_init = iptable_security_table_init,
};
static unsigned int
@@ -51,26 +54,33 @@ iptable_security_hook(void *priv, struct sk_buff *skb,
static struct nf_hook_ops *sectbl_ops __read_mostly;
-static int __net_init iptable_security_net_init(struct net *net)
+static int __net_init iptable_security_table_init(struct net *net)
{
struct ipt_replace *repl;
+ int ret;
+
+ if (net->ipv4.iptable_security)
+ return 0;
repl = ipt_alloc_initial_table(&security_table);
if (repl == NULL)
return -ENOMEM;
- net->ipv4.iptable_security =
- ipt_register_table(net, &security_table, repl);
+ ret = ipt_register_table(net, &security_table, repl, sectbl_ops,
+ &net->ipv4.iptable_security);
kfree(repl);
- return PTR_ERR_OR_ZERO(net->ipv4.iptable_security);
+ return ret;
}
static void __net_exit iptable_security_net_exit(struct net *net)
{
- ipt_unregister_table(net, net->ipv4.iptable_security);
+ if (!net->ipv4.iptable_security)
+ return;
+
+ ipt_unregister_table(net, net->ipv4.iptable_security, sectbl_ops);
+ net->ipv4.iptable_security = NULL;
}
static struct pernet_operations iptable_security_net_ops = {
- .init = iptable_security_net_init,
.exit = iptable_security_net_exit,
};
@@ -78,27 +88,29 @@ static int __init iptable_security_init(void)
{
int ret;
+ sectbl_ops = xt_hook_ops_alloc(&security_table, iptable_security_hook);
+ if (IS_ERR(sectbl_ops))
+ return PTR_ERR(sectbl_ops);
+
ret = register_pernet_subsys(&iptable_security_net_ops);
- if (ret < 0)
+ if (ret < 0) {
+ kfree(sectbl_ops);
return ret;
-
- sectbl_ops = xt_hook_link(&security_table, iptable_security_hook);
- if (IS_ERR(sectbl_ops)) {
- ret = PTR_ERR(sectbl_ops);
- goto cleanup_table;
}
- return ret;
+ ret = iptable_security_table_init(&init_net);
+ if (ret) {
+ unregister_pernet_subsys(&iptable_security_net_ops);
+ kfree(sectbl_ops);
+ }
-cleanup_table:
- unregister_pernet_subsys(&iptable_security_net_ops);
return ret;
}
static void __exit iptable_security_fini(void)
{
- xt_hook_unlink(&security_table, sectbl_ops);
unregister_pernet_subsys(&iptable_security_net_ops);
+ kfree(sectbl_ops);
}
module_init(iptable_security_init);
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index a04dee536..d88da36b3 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -31,10 +31,8 @@ static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb,
err = ip_defrag(net, skb, user);
local_bh_enable();
- if (!err) {
- ip_send_check(ip_hdr(skb));
+ if (!err)
skb->ignore_df = 1;
- }
return err;
}
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
index 61c7cc22e..f8aad03d6 100644
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -127,29 +127,15 @@ static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb,
u8 proto, void *data, __sum16 *check,
int datalen, int oldlen)
{
- const struct iphdr *iph = ip_hdr(skb);
- struct rtable *rt = skb_rtable(skb);
-
if (skb->ip_summed != CHECKSUM_PARTIAL) {
- if (!(rt->rt_flags & RTCF_LOCAL) &&
- (!skb->dev || skb->dev->features &
- (NETIF_F_IP_CSUM | NETIF_F_HW_CSUM))) {
- skb->ip_summed = CHECKSUM_PARTIAL;
- skb->csum_start = skb_headroom(skb) +
- skb_network_offset(skb) +
- ip_hdrlen(skb);
- skb->csum_offset = (void *)check - data;
- *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
- datalen, proto, 0);
- } else {
- *check = 0;
- *check = csum_tcpudp_magic(iph->saddr, iph->daddr,
- datalen, proto,
- csum_partial(data, datalen,
- 0));
- if (proto == IPPROTO_UDP && !*check)
- *check = CSUM_MANGLED_0;
- }
+ const struct iphdr *iph = ip_hdr(skb);
+
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum_start = skb_headroom(skb) + skb_network_offset(skb) +
+ ip_hdrlen(skb);
+ skb->csum_offset = (void *)check - data;
+ *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, datalen,
+ proto, 0);
} else
inet_proto_csum_replace2(check, skb,
htons(oldlen), htons(datalen), true);
diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c
index b72ffc58e..51ced81b6 100644
--- a/net/ipv4/netfilter/nft_masq_ipv4.c
+++ b/net/ipv4/netfilter/nft_masq_ipv4.c
@@ -25,7 +25,12 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr,
memset(&range, 0, sizeof(range));
range.flags = priv->flags;
-
+ if (priv->sreg_proto_min) {
+ range.min_proto.all =
+ *(__be16 *)&regs->data[priv->sreg_proto_min];
+ range.max_proto.all =
+ *(__be16 *)&regs->data[priv->sreg_proto_max];
+ }
regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, pkt->hook,
&range, pkt->out);
}
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index d3a27165f..cf9700b1a 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -145,10 +145,12 @@ fail:
}
EXPORT_SYMBOL_GPL(ping_get_port);
-void ping_hash(struct sock *sk)
+int ping_hash(struct sock *sk)
{
pr_debug("ping_hash(sk->port=%u)\n", inet_sk(sk)->inet_num);
BUG(); /* "Please do not press this button again." */
+
+ return 0;
}
void ping_unhash(struct sock *sk)
@@ -1140,13 +1142,6 @@ static int ping_v4_seq_show(struct seq_file *seq, void *v)
return 0;
}
-static const struct seq_operations ping_v4_seq_ops = {
- .show = ping_v4_seq_show,
- .start = ping_v4_seq_start,
- .next = ping_seq_next,
- .stop = ping_seq_stop,
-};
-
static int ping_seq_open(struct inode *inode, struct file *file)
{
struct ping_seq_afinfo *afinfo = PDE_DATA(inode);
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 3abd9d7a3..9f665b63a 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -390,7 +390,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "\nIp: %d %d",
IPV4_DEVCONF_ALL(net, FORWARDING) ? 1 : 2,
- sysctl_ip_default_ttl);
+ net->ipv4.sysctl_ip_default_ttl);
BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0);
for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 7113bae4e..8d22de740 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -93,7 +93,7 @@ static struct raw_hashinfo raw_v4_hashinfo = {
.lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock),
};
-void raw_hash_sk(struct sock *sk)
+int raw_hash_sk(struct sock *sk)
{
struct raw_hashinfo *h = sk->sk_prot->h.raw_hash;
struct hlist_head *head;
@@ -104,6 +104,8 @@ void raw_hash_sk(struct sock *sk)
sk_add_node(sk, head);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
write_unlock_bh(&h->lock);
+
+ return 0;
}
EXPORT_SYMBOL_GPL(raw_hash_sk);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 02c62299d..60398a937 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1438,9 +1438,9 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
#endif
}
-static struct rtable *rt_dst_alloc(struct net_device *dev,
- unsigned int flags, u16 type,
- bool nopolicy, bool noxfrm, bool will_cache)
+struct rtable *rt_dst_alloc(struct net_device *dev,
+ unsigned int flags, u16 type,
+ bool nopolicy, bool noxfrm, bool will_cache)
{
struct rtable *rt;
@@ -1468,6 +1468,7 @@ static struct rtable *rt_dst_alloc(struct net_device *dev,
return rt;
}
+EXPORT_SYMBOL(rt_dst_alloc);
/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
@@ -2045,6 +2046,18 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
*/
if (fi && res->prefixlen < 4)
fi = NULL;
+ } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
+ (orig_oif != dev_out->ifindex)) {
+ /* For local routes that require a particular output interface
+ * we do not want to cache the result. Caching the result
+ * causes incorrect behaviour when there are multiple source
+ * addresses on the interface, the end result being that if the
+ * intended recipient is waiting on that interface for the
+ * packet he won't receive it because it will be delivered on
+ * the loopback interface and the IP_PKTINFO ipi_ifindex will
+ * be set to the loopback interface as well.
+ */
+ fi = NULL;
}
fnhe = NULL;
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 643a86c49..4c04f0933 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -19,8 +19,6 @@
#include <net/tcp.h>
#include <net/route.h>
-extern int sysctl_tcp_syncookies;
-
static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly;
#define COOKIEBITS 24 /* Upper bits store count */
@@ -50,8 +48,7 @@ static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly;
#define TSBITS 6
#define TSMASK (((__u32)1 << TSBITS) - 1)
-static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS],
- ipv4_cookie_scratch);
+static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], ipv4_cookie_scratch);
static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
u32 count, int c)
@@ -307,7 +304,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
__u8 rcv_wscale;
struct flowi4 fl4;
- if (!sysctl_tcp_syncookies || !th->ack || th->rst)
+ if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies || !th->ack || th->rst)
goto out;
if (tcp_synq_no_recent_overflow(sk))
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 4d367b413..1e1fe6086 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -283,31 +283,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .procname = "ip_default_ttl",
- .data = &sysctl_ip_default_ttl,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &ip_ttl_min,
- .extra2 = &ip_ttl_max,
- },
- {
- .procname = "tcp_syn_retries",
- .data = &sysctl_tcp_syn_retries,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &tcp_syn_retries_min,
- .extra2 = &tcp_syn_retries_max
- },
- {
- .procname = "tcp_synack_retries",
- .data = &sysctl_tcp_synack_retries,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
.procname = "tcp_max_orphans",
.data = &sysctl_tcp_max_orphans,
.maxlen = sizeof(int),
@@ -322,51 +297,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .procname = "ip_early_demux",
- .data = &sysctl_ip_early_demux,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
- .procname = "ip_dynaddr",
- .data = &sysctl_ip_dynaddr,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
- .procname = "tcp_retries1",
- .data = &sysctl_tcp_retries1,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra2 = &tcp_retr1_max
- },
- {
- .procname = "tcp_retries2",
- .data = &sysctl_tcp_retries2,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
- .procname = "tcp_fin_timeout",
- .data = &sysctl_tcp_fin_timeout,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
-#ifdef CONFIG_SYN_COOKIES
- {
- .procname = "tcp_syncookies",
- .data = &sysctl_tcp_syncookies,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
-#endif
- {
.procname = "tcp_fastopen",
.data = &sysctl_tcp_fastopen,
.maxlen = sizeof(int),
@@ -415,30 +345,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .procname = "igmp_max_memberships",
- .data = &sysctl_igmp_max_memberships,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
- .procname = "igmp_max_msf",
- .data = &sysctl_igmp_max_msf,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
-#ifdef CONFIG_IP_MULTICAST
- {
- .procname = "igmp_qrv",
- .data = &sysctl_igmp_qrv,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &one
- },
-#endif
- {
.procname = "inet_peer_threshold",
.data = &inet_peer_threshold,
.maxlen = sizeof(int),
@@ -460,13 +366,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec_jiffies,
},
{
- .procname = "tcp_orphan_retries",
- .data = &sysctl_tcp_orphan_retries,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
.procname = "tcp_fack",
.data = &sysctl_tcp_fack,
.maxlen = sizeof(int),
@@ -481,13 +380,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec,
},
{
- .procname = "tcp_reordering",
- .data = &sysctl_tcp_reordering,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
.procname = "tcp_max_reordering",
.data = &sysctl_tcp_max_reordering,
.maxlen = sizeof(int),
@@ -517,13 +409,6 @@ static struct ctl_table ipv4_table[] = {
.extra1 = &one,
},
{
- .procname = "tcp_notsent_lowat",
- .data = &sysctl_tcp_notsent_lowat,
- .maxlen = sizeof(sysctl_tcp_notsent_lowat),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
.procname = "tcp_rmem",
.data = &sysctl_tcp_rmem,
.maxlen = sizeof(sysctl_tcp_rmem),
@@ -845,6 +730,29 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_dointvec
},
{
+ .procname = "ip_dynaddr",
+ .data = &init_net.ipv4.sysctl_ip_dynaddr,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "ip_early_demux",
+ .data = &init_net.ipv4.sysctl_ip_early_demux,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "ip_default_ttl",
+ .data = &init_net.ipv4.sysctl_ip_default_ttl,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &ip_ttl_min,
+ .extra2 = &ip_ttl_max,
+ },
+ {
.procname = "ip_local_port_range",
.maxlen = sizeof(init_net.ipv4.ip_local_ports.range),
.data = &init_net.ipv4.ip_local_ports.range,
@@ -934,12 +842,36 @@ static struct ctl_table ipv4_net_table[] = {
},
{
.procname = "igmp_link_local_mcast_reports",
- .data = &sysctl_igmp_llm_reports,
+ .data = &init_net.ipv4.sysctl_igmp_llm_reports,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "igmp_max_memberships",
+ .data = &init_net.ipv4.sysctl_igmp_max_memberships,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec
},
{
+ .procname = "igmp_max_msf",
+ .data = &init_net.ipv4.sysctl_igmp_max_msf,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+#ifdef CONFIG_IP_MULTICAST
+ {
+ .procname = "igmp_qrv",
+ .data = &init_net.ipv4.sysctl_igmp_qrv,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one
+ },
+#endif
+ {
.procname = "tcp_keepalive_time",
.data = &init_net.ipv4.sysctl_tcp_keepalive_time,
.maxlen = sizeof(int),
@@ -960,6 +892,74 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
+ {
+ .procname = "tcp_syn_retries",
+ .data = &init_net.ipv4.sysctl_tcp_syn_retries,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &tcp_syn_retries_min,
+ .extra2 = &tcp_syn_retries_max
+ },
+ {
+ .procname = "tcp_synack_retries",
+ .data = &init_net.ipv4.sysctl_tcp_synack_retries,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+#ifdef CONFIG_SYN_COOKIES
+ {
+ .procname = "tcp_syncookies",
+ .data = &init_net.ipv4.sysctl_tcp_syncookies,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+#endif
+ {
+ .procname = "tcp_reordering",
+ .data = &init_net.ipv4.sysctl_tcp_reordering,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_retries1",
+ .data = &init_net.ipv4.sysctl_tcp_retries1,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra2 = &tcp_retr1_max
+ },
+ {
+ .procname = "tcp_retries2",
+ .data = &init_net.ipv4.sysctl_tcp_retries2,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_orphan_retries",
+ .data = &init_net.ipv4.sysctl_tcp_orphan_retries,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_fin_timeout",
+ .data = &init_net.ipv4.sysctl_tcp_fin_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "tcp_notsent_lowat",
+ .data = &init_net.ipv4.sysctl_tcp_notsent_lowat,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
{ }
};
@@ -988,6 +988,10 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
if (!net->ipv4.sysctl_local_reserved_ports)
goto err_ports;
+ net->ipv4.sysctl_ip_default_ttl = IPDEFTTL;
+ net->ipv4.sysctl_ip_dynaddr = 0;
+ net->ipv4.sysctl_ip_early_demux = 1;
+
return 0;
err_ports:
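
The sysctl_net_ipv4.c hunks move a batch of previously global knobs (ip_default_ttl, ip_early_demux, tcp_syn_retries, tcp_syncookies, tcp_reordering, the igmp limits, and so on) into the per-namespace ipv4_net_table, with defaults seeded in ipv4_sysctl_init_net(). Readers throughout the stack now dereference the socket's namespace instead of a global, as the syncookies and tcp.c hunks elsewhere in this diff show. A minimal, purely illustrative read-side helper:

/* hypothetical helper; mirrors the pattern used in tcp_syn_flood_action()
 * and cookie_v4_check() after this change
 */
static bool example_syncookies_enabled(const struct sock *sk)
{
	return sock_net(sk)->ipv4.sysctl_tcp_syncookies != 0;
}
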
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0da50fe0a..32c9d8c25 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -247,6 +247,7 @@
#define pr_fmt(fmt) "TCP: " fmt
+#include <crypto/hash.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
@@ -266,7 +267,6 @@
#include <linux/swap.h>
#include <linux/cache.h>
#include <linux/err.h>
-#include <linux/crypto.h>
#include <linux/time.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
@@ -283,8 +283,6 @@
#include <asm/unaligned.h>
#include <net/busy_poll.h>
-int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
-
int sysctl_tcp_min_tso_segs __read_mostly = 2;
int sysctl_tcp_autocorking __read_mostly = 1;
@@ -407,7 +405,7 @@ void tcp_init_sock(struct sock *sk)
tp->mss_cache = TCP_MSS_DEFAULT;
u64_stats_init(&tp->syncp);
- tp->reordering = sysctl_tcp_reordering;
+ tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
tcp_enable_early_retrans(tp);
tcp_assign_congestion_control(sk);
@@ -559,20 +557,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
return -EINVAL;
slow = lock_sock_fast(sk);
- if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
- answ = 0;
- else if (sock_flag(sk, SOCK_URGINLINE) ||
- !tp->urg_data ||
- before(tp->urg_seq, tp->copied_seq) ||
- !before(tp->urg_seq, tp->rcv_nxt)) {
-
- answ = tp->rcv_nxt - tp->copied_seq;
-
- /* Subtract 1, if FIN was received */
- if (answ && sock_flag(sk, SOCK_DONE))
- answ--;
- } else
- answ = tp->urg_seq - tp->copied_seq;
+ answ = tcp_inq(sk);
unlock_sock_fast(sk, slow);
break;
case SIOCATMARK:
@@ -1467,8 +1452,10 @@ static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
offset = seq - TCP_SKB_CB(skb)->seq;
- if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
+ if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
+ pr_err_once("%s: found a SYN, please report !\n", __func__);
offset--;
+ }
if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) {
*off = offset;
return skb;
@@ -1658,8 +1645,10 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
break;
offset = *seq - TCP_SKB_CB(skb)->seq;
- if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
+ if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
+ pr_err_once("%s: found a SYN, please report !\n", __func__);
offset--;
+ }
if (offset < skb->len)
goto found_ok_skb;
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
@@ -2364,6 +2353,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
+ struct net *net = sock_net(sk);
int val;
int err = 0;
@@ -2620,7 +2610,7 @@ stealth_integrity_out_1:
case TCP_LINGER2:
if (val < 0)
tp->linger2 = -1;
- else if (val > sysctl_tcp_fin_timeout / HZ)
+ else if (val > net->ipv4.sysctl_tcp_fin_timeout / HZ)
tp->linger2 = 0;
else
tp->linger2 = val * HZ;
@@ -2749,6 +2739,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
const struct inet_connection_sock *icsk = inet_csk(sk);
u32 now = tcp_time_stamp;
unsigned int start;
+ int notsent_bytes;
u64 rate64;
u32 rate;
@@ -2829,6 +2820,13 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
} while (u64_stats_fetch_retry_irq(&tp->syncp, start));
info->tcpi_segs_out = tp->segs_out;
info->tcpi_segs_in = tp->segs_in;
+
+ notsent_bytes = READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt);
+ info->tcpi_notsent_bytes = max(0, notsent_bytes);
+
+ info->tcpi_min_rtt = tcp_min_rtt(tp);
+ info->tcpi_data_segs_in = tp->data_segs_in;
+ info->tcpi_data_segs_out = tp->data_segs_out;
}
EXPORT_SYMBOL_GPL(tcp_get_info);
@@ -2837,6 +2835,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);
int val, len;
if (get_user(len, optlen))
@@ -2871,12 +2870,12 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
val = keepalive_probes(tp);
break;
case TCP_SYNCNT:
- val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
+ val = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
break;
case TCP_LINGER2:
val = tp->linger2;
if (val >= 0)
- val = (val ? : sysctl_tcp_fin_timeout) / HZ;
+ val = (val ? : net->ipv4.sysctl_tcp_fin_timeout) / HZ;
break;
case TCP_DEFER_ACCEPT:
val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
@@ -3053,17 +3052,26 @@ static bool tcp_md5sig_pool_populated = false;
static void __tcp_alloc_md5sig_pool(void)
{
+ struct crypto_ahash *hash;
int cpu;
+ hash = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(hash))
+ return;
+
for_each_possible_cpu(cpu) {
- if (!per_cpu(tcp_md5sig_pool, cpu).md5_desc.tfm) {
- struct crypto_hash *hash;
+ struct ahash_request *req;
- hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(hash))
- return;
- per_cpu(tcp_md5sig_pool, cpu).md5_desc.tfm = hash;
- }
+ if (per_cpu(tcp_md5sig_pool, cpu).md5_req)
+ continue;
+
+ req = ahash_request_alloc(hash, GFP_KERNEL);
+ if (!req)
+ return;
+
+ ahash_request_set_callback(req, 0, NULL, NULL);
+
+ per_cpu(tcp_md5sig_pool, cpu).md5_req = req;
}
/* before setting tcp_md5sig_pool_populated, we must commit all writes
* to memory. See smp_rmb() in tcp_get_md5sig_pool()
@@ -3113,7 +3121,6 @@ int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
{
struct scatterlist sg;
struct tcphdr hdr;
- int err;
/* We are not allowed to change tcphdr, make a local copy */
memcpy(&hdr, th, sizeof(hdr));
@@ -3121,8 +3128,8 @@ int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
/* options aren't included in the hash */
sg_init_one(&sg, &hdr, sizeof(hdr));
- err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(hdr));
- return err;
+ ahash_request_set_crypt(hp->md5_req, &sg, NULL, sizeof(hdr));
+ return crypto_ahash_update(hp->md5_req);
}
EXPORT_SYMBOL(tcp_md5_hash_header);
@@ -3131,7 +3138,7 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
{
struct scatterlist sg;
const struct tcphdr *tp = tcp_hdr(skb);
- struct hash_desc *desc = &hp->md5_desc;
+ struct ahash_request *req = hp->md5_req;
unsigned int i;
const unsigned int head_data_len = skb_headlen(skb) > header_len ?
skb_headlen(skb) - header_len : 0;
@@ -3141,7 +3148,8 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
sg_init_table(&sg, 1);
sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len);
- if (crypto_hash_update(desc, &sg, head_data_len))
+ ahash_request_set_crypt(req, &sg, NULL, head_data_len);
+ if (crypto_ahash_update(req))
return 1;
for (i = 0; i < shi->nr_frags; ++i) {
@@ -3151,7 +3159,8 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
sg_set_page(&sg, page, skb_frag_size(f),
offset_in_page(offset));
- if (crypto_hash_update(desc, &sg, skb_frag_size(f)))
+ ahash_request_set_crypt(req, &sg, NULL, skb_frag_size(f));
+ if (crypto_ahash_update(req))
return 1;
}
@@ -3168,7 +3177,8 @@ int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *ke
struct scatterlist sg;
sg_init_one(&sg, key->key, key->keylen);
- return crypto_hash_update(&hp->md5_desc, &sg, key->keylen);
+ ahash_request_set_crypt(hp->md5_req, &sg, NULL, key->keylen);
+ return crypto_ahash_update(hp->md5_req);
}
EXPORT_SYMBOL(tcp_md5_hash_key);
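
The tcp.c MD5 hunks replace the legacy crypto_hash interface with the ahash API: a single MD5 transform is allocated with crypto_alloc_ahash(), each CPU keeps an ahash_request instead of a hash_desc, and every update becomes ahash_request_set_crypt() followed by crypto_ahash_update(); the final step (in tcp_ipv4.c below) sets the output buffer the same way before crypto_ahash_final(). A trimmed sketch of that calling sequence; the function and buffer names are placeholders and error handling is collapsed to a single status:

#include <crypto/hash.h>
#include <linux/scatterlist.h>

/* sketch of the ahash call sequence used by the new MD5 pool code */
static int example_md5_digest(struct ahash_request *req,
			      const void *buf, unsigned int len, u8 *out)
{
	struct scatterlist sg;

	if (crypto_ahash_init(req))
		return 1;

	sg_init_one(&sg, buf, len);
	ahash_request_set_crypt(req, &sg, NULL, len);
	if (crypto_ahash_update(req))
		return 1;

	ahash_request_set_crypt(req, NULL, out, 0);
	return crypto_ahash_final(req) ? 1 : 0;
}
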
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 55be6ac70..cffd8f9ed 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -1,3 +1,4 @@
+#include <linux/crypto.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/kernel.h>
@@ -124,6 +125,49 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req,
return false;
}
+
+/* If an incoming SYN or SYNACK frame contains a payload and/or FIN,
+ * queue this additional data / FIN.
+ */
+void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (TCP_SKB_CB(skb)->end_seq == tp->rcv_nxt)
+ return;
+
+ skb = skb_clone(skb, GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ skb_dst_drop(skb);
+ /* segs_in has been initialized to 1 in tcp_create_openreq_child().
+ * Hence, reset segs_in to 0 before calling tcp_segs_in()
+ * to avoid double counting. Also, tcp_segs_in() expects
+ * skb->len to include the tcp_hdrlen. Hence, it should
+ * be called before __skb_pull().
+ */
+ tp->segs_in = 0;
+ tcp_segs_in(tp, skb);
+ __skb_pull(skb, tcp_hdrlen(skb));
+ skb_set_owner_r(skb, sk);
+
+ TCP_SKB_CB(skb)->seq++;
+ TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_SYN;
+
+ tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
+ tp->syn_data_acked = 1;
+
+ /* u64_stats_update_begin(&tp->syncp) not needed here,
+ * as we certainly are not changing upper 32bit value (0)
+ */
+ tp->bytes_received = skb->len;
+
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+ tcp_fin(sk);
+}
+
static struct sock *tcp_fastopen_create_child(struct sock *sk,
struct sk_buff *skb,
struct dst_entry *dst,
@@ -132,7 +176,6 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
struct tcp_sock *tp;
struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
struct sock *child;
- u32 end_seq;
bool own_req;
req->num_retrans = 0;
@@ -178,35 +221,11 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
tcp_init_metrics(child);
tcp_init_buffer_space(child);
- /* Queue the data carried in the SYN packet.
- * We used to play tricky games with skb_get().
- * With lockless listener, it is a dead end.
- * Do not think about it.
- *
- * XXX (TFO) - we honor a zero-payload TFO request for now,
- * (any reason not to?) but no need to queue the skb since
- * there is no data. How about SYN+FIN?
- */
- end_seq = TCP_SKB_CB(skb)->end_seq;
- if (end_seq != TCP_SKB_CB(skb)->seq + 1) {
- struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
-
- if (likely(skb2)) {
- skb_dst_drop(skb2);
- __skb_pull(skb2, tcp_hdrlen(skb));
- skb_set_owner_r(skb2, child);
- __skb_queue_tail(&child->sk_receive_queue, skb2);
- tp->syn_data_acked = 1;
-
- /* u64_stats_update_begin(&tp->syncp) not needed here,
- * as we certainly are not changing upper 32bit value (0)
- */
- tp->bytes_received = end_seq - TCP_SKB_CB(skb)->seq - 1;
- } else {
- end_seq = TCP_SKB_CB(skb)->seq + 1;
- }
- }
- tcp_rsk(req)->rcv_nxt = tp->rcv_nxt = end_seq;
+ tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
+
+ tcp_fastopen_add_skb(child, skb);
+
+ tcp_rsk(req)->rcv_nxt = tp->rcv_nxt;
/* tcp_conn_request() is sending the SYNACK,
* and queues the child into listener accept queue.
*/
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b3a47bc08..131005f82 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -83,9 +83,7 @@ EXPORT_SYMBOL(sysctl_tcp_timestamps);
int sysctl_tcp_window_scaling __read_mostly = 1;
int sysctl_tcp_sack __read_mostly = 1;
int sysctl_tcp_fack __read_mostly = 1;
-int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH;
int sysctl_tcp_max_reordering __read_mostly = 300;
-EXPORT_SYMBOL(sysctl_tcp_reordering);
int sysctl_tcp_dsack __read_mostly = 1;
int sysctl_tcp_app_win __read_mostly = 31;
int sysctl_tcp_adv_win_scale __read_mostly = 1;
@@ -129,6 +127,10 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))
+#define REXMIT_NONE 0 /* no loss recovery to do */
+#define REXMIT_LOST 1 /* retransmit packets marked lost */
+#define REXMIT_NEW 2 /* FRTO-style transmit of unsent/new packets */
+
/* Adapt the MSS value used to make delayed ack decision to the
* real world.
*/
@@ -1213,6 +1215,7 @@ static u8 tcp_sacktag_one(struct sock *sk,
sacked |= TCPCB_SACKED_ACKED;
state->flag |= FLAG_DATA_SACKED;
tp->sacked_out += pcount;
+ tp->delivered += pcount; /* Out-of-order packets delivered */
fack_count += pcount;
@@ -1309,6 +1312,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
if (skb == tcp_highest_sack(sk))
tcp_advance_highest_sack(sk, skb);
+ tcp_skb_collapse_tstamp(prev, skb);
tcp_unlink_write_queue(skb, sk);
sk_wmem_free_skb(sk, skb);
@@ -1824,8 +1828,12 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend)
static void tcp_add_reno_sack(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
+ u32 prior_sacked = tp->sacked_out;
+
tp->sacked_out++;
tcp_check_reno_reordering(sk, 0);
+ if (tp->sacked_out > prior_sacked)
+ tp->delivered++; /* Some out-of-order packet is delivered */
tcp_verify_left_out(tp);
}
@@ -1837,6 +1845,7 @@ static void tcp_remove_reno_sacks(struct sock *sk, int acked)
if (acked > 0) {
/* One ACK acked hole. The rest eat duplicate ACKs. */
+ tp->delivered += max_t(int, acked - tp->sacked_out, 1);
if (acked - 1 >= tp->sacked_out)
tp->sacked_out = 0;
else
@@ -1876,6 +1885,7 @@ void tcp_enter_loss(struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);
struct sk_buff *skb;
bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
bool is_reneg; /* is receiver reneging on SACKs? */
@@ -1926,9 +1936,9 @@ void tcp_enter_loss(struct sock *sk)
* suggests that the degree of reordering is over-estimated.
*/
if (icsk->icsk_ca_state <= TCP_CA_Disorder &&
- tp->sacked_out >= sysctl_tcp_reordering)
+ tp->sacked_out >= net->ipv4.sysctl_tcp_reordering)
tp->reordering = min_t(unsigned int, tp->reordering,
- sysctl_tcp_reordering);
+ net->ipv4.sysctl_tcp_reordering);
tcp_set_ca_state(sk, TCP_CA_Loss);
tp->high_seq = tp->snd_nxt;
tcp_ecn_queue_cwr(tp);
@@ -2112,6 +2122,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
__u32 packets_out;
+ int tcp_reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
/* Trick#1: The loss is proven. */
if (tp->lost_out)
@@ -2126,7 +2137,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
*/
packets_out = tp->packets_out;
if (packets_out <= tp->reordering &&
- tp->sacked_out >= max_t(__u32, packets_out/2, sysctl_tcp_reordering) &&
+ tp->sacked_out >= max_t(__u32, packets_out/2, tcp_reordering) &&
!tcp_may_send_now(sk)) {
/* We have nothing to send. This connection is limited
* either by receiver window or by application.
@@ -2470,14 +2481,12 @@ static void tcp_init_cwnd_reduction(struct sock *sk)
tcp_ecn_queue_cwr(tp);
}
-static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked,
- int fast_rexmit, int flag)
+static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked,
+ int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
int sndcnt = 0;
int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
- int newly_acked_sacked = prior_unsacked -
- (tp->packets_out - tp->sacked_out);
if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd))
return;
@@ -2495,7 +2504,8 @@ static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked,
} else {
sndcnt = min(delta, newly_acked_sacked);
}
- sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0));
+ /* Force a fast retransmit upon entering fast recovery */
+ sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1));
tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
}
@@ -2540,7 +2550,7 @@ static void tcp_try_keep_open(struct sock *sk)
}
}
-static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked)
+static void tcp_try_to_open(struct sock *sk, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -2554,8 +2564,6 @@ static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked)
if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
tcp_try_keep_open(sk);
- } else {
- tcp_cwnd_reduction(sk, prior_unsacked, 0, flag);
}
}
@@ -2665,7 +2673,8 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
/* Process an ACK in CA_Loss state. Move to CA_Open if lost data are
* recovered or spurious. Otherwise retransmits more on partial ACKs.
*/
-static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
+static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
+ int *rexmit)
{
struct tcp_sock *tp = tcp_sk(sk);
bool recovered = !before(tp->snd_una, tp->high_seq);
@@ -2687,10 +2696,15 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
tp->frto = 0; /* Step 3.a. loss was real */
} else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
tp->high_seq = tp->snd_nxt;
- __tcp_push_pending_frames(sk, tcp_current_mss(sk),
- TCP_NAGLE_OFF);
- if (after(tp->snd_nxt, tp->high_seq))
- return; /* Step 2.b */
+ /* Step 2.b. Try send new data (but deferred until cwnd
+ * is updated in tcp_ack()). Otherwise fall back to
+ * the conventional recovery.
+ */
+ if (tcp_send_head(sk) &&
+ after(tcp_wnd_end(tp), tp->snd_nxt)) {
+ *rexmit = REXMIT_NEW;
+ return;
+ }
tp->frto = 0;
}
}
@@ -2709,12 +2723,11 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
else if (flag & FLAG_SND_UNA_ADVANCED)
tcp_reset_reno_sack(tp);
}
- tcp_xmit_retransmit_queue(sk);
+ *rexmit = REXMIT_LOST;
}
/* Undo during fast recovery after partial ACK. */
-static bool tcp_try_undo_partial(struct sock *sk, const int acked,
- const int prior_unsacked, int flag)
+static bool tcp_try_undo_partial(struct sock *sk, const int acked)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -2729,10 +2742,8 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked,
* can undo. Otherwise we clock out new packets but do not
* mark more packets lost or retransmit more.
*/
- if (tp->retrans_out) {
- tcp_cwnd_reduction(sk, prior_unsacked, 0, flag);
+ if (tp->retrans_out)
return true;
- }
if (!tcp_any_retrans_done(sk))
tp->retrans_stamp = 0;
@@ -2751,21 +2762,21 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked,
* taking into account both packets sitting in receiver's buffer and
* packets lost by network.
*
- * Besides that it does CWND reduction, when packet loss is detected
- * and changes state of machine.
+ * Besides that it updates the congestion state when packet loss or ECN
+ * is detected. But it does not reduce the cwnd, it is done by the
+ * congestion control later.
*
* It does _not_ decide what to send, it is made in function
* tcp_xmit_retransmit_queue().
*/
static void tcp_fastretrans_alert(struct sock *sk, const int acked,
- const int prior_unsacked,
- bool is_dupack, int flag)
+ bool is_dupack, int *ack_flag, int *rexmit)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
+ int fast_rexmit = 0, flag = *ack_flag;
bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
(tcp_fackets_out(tp) > tp->reordering));
- int fast_rexmit = 0;
if (WARN_ON(!tp->packets_out && tp->sacked_out))
tp->sacked_out = 0;
@@ -2812,8 +2823,10 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
/* Use RACK to detect loss */
if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS &&
- tcp_rack_mark_lost(sk))
+ tcp_rack_mark_lost(sk)) {
flag |= FLAG_LOST_RETRANS;
+ *ack_flag |= FLAG_LOST_RETRANS;
+ }
/* E. Process state. */
switch (icsk->icsk_ca_state) {
@@ -2822,7 +2835,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
if (tcp_is_reno(tp) && is_dupack)
tcp_add_reno_sack(sk);
} else {
- if (tcp_try_undo_partial(sk, acked, prior_unsacked, flag))
+ if (tcp_try_undo_partial(sk, acked))
return;
/* Partial ACK arrived. Force fast retransmit. */
do_lost = tcp_is_reno(tp) ||
@@ -2834,7 +2847,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
}
break;
case TCP_CA_Loss:
- tcp_process_loss(sk, flag, is_dupack);
+ tcp_process_loss(sk, flag, is_dupack, rexmit);
if (icsk->icsk_ca_state != TCP_CA_Open &&
!(flag & FLAG_LOST_RETRANS))
return;
@@ -2851,7 +2864,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
tcp_try_undo_dsack(sk);
if (!tcp_time_to_recover(sk, flag)) {
- tcp_try_to_open(sk, flag, prior_unsacked);
+ tcp_try_to_open(sk, flag);
return;
}
@@ -2873,8 +2886,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
if (do_lost)
tcp_update_scoreboard(sk, fast_rexmit);
- tcp_cwnd_reduction(sk, prior_unsacked, fast_rexmit, flag);
- tcp_xmit_retransmit_queue(sk);
+ *rexmit = REXMIT_LOST;
}
/* Kathleen Nichols' algorithm for tracking the minimum value of
@@ -3090,7 +3102,8 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
shinfo = skb_shinfo(skb);
if ((shinfo->tx_flags & SKBTX_ACK_TSTAMP) &&
- between(shinfo->tskey, prior_snd_una, tcp_sk(sk)->snd_una - 1))
+ !before(shinfo->tskey, prior_snd_una) &&
+ before(shinfo->tskey, tcp_sk(sk)->snd_una))
__skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
}
@@ -3099,7 +3112,7 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
* arrived at the other end.
*/
static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
- u32 prior_snd_una,
+ u32 prior_snd_una, int *acked,
struct tcp_sacktag_state *sack)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -3157,10 +3170,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
flag |= FLAG_ORIG_SACK_ACKED;
}
- if (sacked & TCPCB_SACKED_ACKED)
+ if (sacked & TCPCB_SACKED_ACKED) {
tp->sacked_out -= acked_pcount;
- else if (tcp_is_sack(tp) && !tcp_skb_spurious_retrans(tp, skb))
- tcp_rack_advance(tp, &skb->skb_mstamp, sacked);
+ } else if (tcp_is_sack(tp)) {
+ tp->delivered += acked_pcount;
+ if (!tcp_skb_spurious_retrans(tp, skb))
+ tcp_rack_advance(tp, &skb->skb_mstamp, sacked);
+ }
if (sacked & TCPCB_LOST)
tp->lost_out -= acked_pcount;
@@ -3269,6 +3285,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
}
}
#endif
+ *acked = pkts_acked;
return flag;
}
@@ -3302,21 +3319,36 @@ static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag)
/* Decide whether to run the increase function of congestion control. */
static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
{
- if (tcp_in_cwnd_reduction(sk))
- return false;
-
/* If reordering is high then always grow cwnd whenever data is
* delivered regardless of its ordering. Otherwise stay conservative
* and only grow cwnd on in-order delivery (RFC5681). A stretched ACK w/
* new SACK or ECE mark may first advance cwnd here and later reduce
* cwnd in tcp_fastretrans_alert() based on more states.
*/
- if (tcp_sk(sk)->reordering > sysctl_tcp_reordering)
+ if (tcp_sk(sk)->reordering > sock_net(sk)->ipv4.sysctl_tcp_reordering)
return flag & FLAG_FORWARD_PROGRESS;
return flag & FLAG_DATA_ACKED;
}
+/* The "ultimate" congestion control function that aims to replace the rigid
+ * cwnd increase and decrease control (tcp_cong_avoid,tcp_*cwnd_reduction).
+ * It's called toward the end of processing an ACK with precise rate
+ * information. All transmission or retransmission are delayed afterwards.
+ */
+static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
+ int flag)
+{
+ if (tcp_in_cwnd_reduction(sk)) {
+ /* Reduce cwnd if state mandates */
+ tcp_cwnd_reduction(sk, acked_sacked, flag);
+ } else if (tcp_may_raise_cwnd(sk, flag)) {
+ /* Advance cwnd if state allows */
+ tcp_cong_avoid(sk, ack, acked_sacked);
+ }
+ tcp_update_pacing_rate(sk);
+}
+
/* Check that window update is acceptable.
* The function assumes that snd_una<=ack<=snd_next.
*/
@@ -3512,6 +3544,27 @@ static inline void tcp_in_ack_event(struct sock *sk, u32 flags)
icsk->icsk_ca_ops->in_ack_event(sk, flags);
}
+/* Congestion control has updated the cwnd already. So if we're in
+ * loss recovery then now we do any new sends (for FRTO) or
+ * retransmits (for CA_Loss or CA_recovery) that make sense.
+ */
+static void tcp_xmit_recovery(struct sock *sk, int rexmit)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (rexmit == REXMIT_NONE)
+ return;
+
+ if (unlikely(rexmit == 2)) {
+ __tcp_push_pending_frames(sk, tcp_current_mss(sk),
+ TCP_NAGLE_OFF);
+ if (after(tp->snd_nxt, tp->high_seq))
+ return;
+ tp->frto = 0;
+ }
+ tcp_xmit_retransmit_queue(sk);
+}
+
/* This routine deals with incoming acks, but not outgoing ones. */
static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
{
@@ -3524,8 +3577,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
bool is_dupack = false;
u32 prior_fackets;
int prior_packets = tp->packets_out;
- const int prior_unsacked = tp->packets_out - tp->sacked_out;
+ u32 prior_delivered = tp->delivered;
int acked = 0; /* Number of packets newly acked */
+ int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
sack_state.first_sackt.v64 = 0;
@@ -3614,23 +3668,16 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
goto no_queue;
/* See if we can take anything off of the retransmit queue. */
- acked = tp->packets_out;
- flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una,
+ flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked,
&sack_state);
- acked -= tp->packets_out;
if (tcp_ack_is_dubious(sk, flag)) {
is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
- tcp_fastretrans_alert(sk, acked, prior_unsacked,
- is_dupack, flag);
+ tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
}
if (tp->tlp_high_seq)
tcp_process_tlp_ack(sk, ack, flag);
- /* Advance cwnd if state allows */
- if (tcp_may_raise_cwnd(sk, flag))
- tcp_cong_avoid(sk, ack, acked);
-
if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) {
struct dst_entry *dst = __sk_dst_get(sk);
if (dst)
@@ -3639,14 +3686,14 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (icsk->icsk_pending == ICSK_TIME_RETRANS)
tcp_schedule_loss_probe(sk);
- tcp_update_pacing_rate(sk);
+ tcp_cong_control(sk, ack, tp->delivered - prior_delivered, flag);
+ tcp_xmit_recovery(sk, rexmit);
return 1;
no_queue:
/* If data was DSACKed, see if we can undo a cwnd reduction. */
if (flag & FLAG_DSACKING_ACK)
- tcp_fastretrans_alert(sk, acked, prior_unsacked,
- is_dupack, flag);
+ tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
/* If this ack opens up a zero window, clear backoff. It was
* being used to time the probes, and is probably far higher than
* it needs to be for normal retransmission.
@@ -3669,8 +3716,8 @@ old_ack:
if (TCP_SKB_CB(skb)->sacked) {
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
&sack_state);
- tcp_fastretrans_alert(sk, acked, prior_unsacked,
- is_dupack, flag);
+ tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
+ tcp_xmit_recovery(sk, rexmit);
}
SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
@@ -4042,7 +4089,7 @@ void tcp_reset(struct sock *sk)
*
* If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
*/
-static void tcp_fin(struct sock *sk)
+void tcp_fin(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -5598,6 +5645,9 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
tp->syn_data_acked = tp->syn_data;
if (tp->syn_data_acked)
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
+
+ tcp_fastopen_add_skb(sk, synack);
+
return false;
}
@@ -6204,9 +6254,10 @@ static bool tcp_syn_flood_action(const struct sock *sk,
struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
const char *msg = "Dropping request";
bool want_cookie = false;
+ struct net *net = sock_net(sk);
#ifdef CONFIG_SYN_COOKIES
- if (sysctl_tcp_syncookies) {
+ if (net->ipv4.sysctl_tcp_syncookies) {
msg = "Sending cookies";
want_cookie = true;
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
@@ -6215,7 +6266,7 @@ static bool tcp_syn_flood_action(const struct sock *sk,
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
if (!queue->synflood_warned &&
- sysctl_tcp_syncookies != 2 &&
+ net->ipv4.sysctl_tcp_syncookies != 2 &&
xchg(&queue->synflood_warned, 1) == 0)
pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
proto, ntohs(tcp_hdr(skb)->dest), msg);
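
Besides switching to the per-namespace syncookies knob, tcp_syn_flood_action() keeps rate-limiting its log line with xchg() on queue->synflood_warned, so only the first path to overflow the queue prints it. A small userspace sketch of that warn-once idiom using C11 atomics; the names are illustrative and the kernel itself uses xchg() rather than <stdatomic.h>:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int synflood_warned;

static void syn_flood_action(int port)
{
	/* Only the caller that flips the flag from 0 to 1 prints the warning. */
	if (atomic_exchange(&synflood_warned, 1) == 0)
		fprintf(stderr,
			"Possible SYN flooding on port %d. Sending cookies.\n",
			port);
}

int main(void)
{
	syn_flood_action(80);
	syn_flood_action(80);	/* silent: flag already set */
	return 0;
}
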
@@ -6248,6 +6299,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
__u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
struct tcp_options_received tmp_opt;
struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);
struct sock *fastopen_sk = NULL;
struct dst_entry *dst = NULL;
struct request_sock *req;
@@ -6258,7 +6310,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
* limitations, they conserve resources and peer is
* evidently real one.
*/
- if ((sysctl_tcp_syncookies == 2 ||
+ if ((net->ipv4.sysctl_tcp_syncookies == 2 ||
inet_csk_reqsk_queue_is_full(sk)) && !isn) {
want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
if (!want_cookie)
@@ -6324,7 +6376,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
}
}
/* Kill the following clause, if you dislike this way. */
- else if (!sysctl_tcp_syncookies &&
+ else if (!net->ipv4.sysctl_tcp_syncookies &&
(sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
(sysctl_max_syn_backlog >> 2)) &&
!tcp_peer_is_proven(req, dst, false,
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index b8f3908dd..dfd153fbd 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -82,7 +82,7 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
-#include <linux/crypto.h>
+#include <crypto/hash.h>
#include <linux/scatterlist.h>
int sysctl_tcp_tw_reuse __read_mostly;
@@ -656,8 +656,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
* Incoming packet is checked with md5 hash with finding key,
* no RST generated if md5 hash doesn't match.
*/
- sk1 = __inet_lookup_listener(net,
- &tcp_hashinfo, ip_hdr(skb)->saddr,
+ sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
+ ip_hdr(skb)->saddr,
th->source, ip_hdr(skb)->daddr,
ntohs(th->source), inet_iif(skb));
/* don't send rst if it can't find key */
@@ -879,7 +879,6 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req)
kfree(inet_rsk(req)->opt);
}
-
#ifdef CONFIG_TCP_MD5SIG
/*
* RFC2385 MD5 checksumming requires a mapping of
@@ -1053,21 +1052,22 @@ static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
bp->len = cpu_to_be16(nbytes);
sg_init_one(&sg, bp, sizeof(*bp));
- return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
+ ahash_request_set_crypt(hp->md5_req, &sg, NULL, sizeof(*bp));
+ return crypto_ahash_update(hp->md5_req);
}
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
__be32 daddr, __be32 saddr, const struct tcphdr *th)
{
struct tcp_md5sig_pool *hp;
- struct hash_desc *desc;
+ struct ahash_request *req;
hp = tcp_get_md5sig_pool();
if (!hp)
goto clear_hash_noput;
- desc = &hp->md5_desc;
+ req = hp->md5_req;
- if (crypto_hash_init(desc))
+ if (crypto_ahash_init(req))
goto clear_hash;
if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
goto clear_hash;
@@ -1075,7 +1075,8 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
goto clear_hash;
if (tcp_md5_hash_key(hp, key))
goto clear_hash;
- if (crypto_hash_final(desc, md5_hash))
+ ahash_request_set_crypt(req, NULL, md5_hash, 0);
+ if (crypto_ahash_final(req))
goto clear_hash;
tcp_put_md5sig_pool();
@@ -1093,7 +1094,7 @@ int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
const struct sk_buff *skb)
{
struct tcp_md5sig_pool *hp;
- struct hash_desc *desc;
+ struct ahash_request *req;
const struct tcphdr *th = tcp_hdr(skb);
__be32 saddr, daddr;
@@ -1109,9 +1110,9 @@ int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
hp = tcp_get_md5sig_pool();
if (!hp)
goto clear_hash_noput;
- desc = &hp->md5_desc;
+ req = hp->md5_req;
- if (crypto_hash_init(desc))
+ if (crypto_ahash_init(req))
goto clear_hash;
if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
@@ -1122,7 +1123,8 @@ int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
goto clear_hash;
if (tcp_md5_hash_key(hp, key))
goto clear_hash;
- if (crypto_hash_final(desc, md5_hash))
+ ahash_request_set_crypt(req, NULL, md5_hash, 0);
+ if (crypto_ahash_final(req))
goto clear_hash;
tcp_put_md5sig_pool();
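
The two MD5 helpers above move from the legacy hash_desc/crypto_hash_* interface to the ahash request API: the per-CPU pool now hands out an ahash_request, data is attached with ahash_request_set_crypt() before each update, and the digest buffer is attached the same way before the final step. Condensed into one hypothetical helper as a kernel-style sketch (md5_hash_buf is not a kernel function, and it assumes req was allocated against an MD5 ahash transform, as tcp_get_md5sig_pool() arranges for hp->md5_req):

#include <crypto/hash.h>
#include <linux/scatterlist.h>

static int md5_hash_buf(struct ahash_request *req,
			const void *buf, unsigned int len, u8 *out)
{
	struct scatterlist sg;
	int ret;

	ret = crypto_ahash_init(req);			/* was crypto_hash_init(desc) */
	if (ret)
		return ret;

	sg_init_one(&sg, buf, len);
	ahash_request_set_crypt(req, &sg, NULL, len);	/* data in, no result yet */
	ret = crypto_ahash_update(req);			/* was crypto_hash_update() */
	if (ret)
		return ret;

	ahash_request_set_crypt(req, NULL, out, 0);	/* result buffer only */
	return crypto_ahash_final(req);			/* was crypto_hash_final() */
}
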
@@ -1612,7 +1614,8 @@ int tcp_v4_rcv(struct sk_buff *skb)
TCP_SKB_CB(skb)->sacked = 0;
lookup:
- sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
+ sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
+ th->dest);
if (!sk)
goto no_tcp_socket;
@@ -1675,7 +1678,7 @@ process:
sk_incoming_cpu_update(sk);
bh_lock_sock_nested(sk);
- tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
+ tcp_segs_in(tcp_sk(sk), skb);
ret = 0;
if (!sock_owned_by_user(sk)) {
if (!tcp_prequeue(sk, skb))
@@ -1728,7 +1731,8 @@ do_time_wait:
switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
case TCP_TW_SYN: {
struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
- &tcp_hashinfo,
+ &tcp_hashinfo, skb,
+ __tcp_hdrlen(th),
iph->saddr, th->source,
iph->daddr, th->dest,
inet_iif(skb));
@@ -2420,6 +2424,16 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
+ net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
+ net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
+ net->ipv4.sysctl_tcp_syncookies = 1;
+ net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
+ net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
+ net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
+ net->ipv4.sysctl_tcp_orphan_retries = 0;
+ net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
+ net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
+
return 0;
fail:
tcp_sk_exit(net);
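
tcp_sk_init() now seeds the formerly global knobs (SYN/SYN-ACK retries, syncookies, reordering, retries1/2, orphan retries, FIN timeout, notsent_lowat) per network namespace, so the value a process observes depends on which netns it runs in. A small userspace check that reads one of them through procfs; the path is the standard sysctl location, no new interface is assumed:

#include <stdio.h>

int main(void)
{
	/* /proc/sys/net/ipv4/* is resolved in the reader's network namespace,
	 * so two processes in different namespaces can see different values. */
	FILE *f = fopen("/proc/sys/net/ipv4/tcp_syn_retries", "r");
	int val;

	if (!f || fscanf(f, "%d", &val) != 1) {
		perror("tcp_syn_retries");
		return 1;
	}
	printf("tcp_syn_retries in this netns: %d\n", val);
	fclose(f);
	return 0;
}
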
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index a726d7853..7b7eec439 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -369,6 +369,7 @@ void tcp_update_metrics(struct sock *sk)
const struct inet_connection_sock *icsk = inet_csk(sk);
struct dst_entry *dst = __sk_dst_get(sk);
struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);
struct tcp_metrics_block *tm;
unsigned long rtt;
u32 val;
@@ -473,7 +474,7 @@ void tcp_update_metrics(struct sock *sk)
if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) {
val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
if (val < tp->reordering &&
- tp->reordering != sysctl_tcp_reordering)
+ tp->reordering != net->ipv4.sysctl_tcp_reordering)
tcp_metric_set(tm, TCP_METRIC_REORDERING,
tp->reordering);
}
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 9b02af213..acb366dd6 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -27,9 +27,6 @@
#include <net/inet_common.h>
#include <net/xfrm.h>
-int sysctl_tcp_syncookies __read_mostly = 1;
-EXPORT_SYMBOL(sysctl_tcp_syncookies);
-
int sysctl_tcp_abort_on_overflow __read_mostly;
struct inet_timewait_death_row tcp_death_row = {
@@ -815,7 +812,7 @@ int tcp_child_process(struct sock *parent, struct sock *child,
int ret = 0;
int state = child->sk_state;
- tcp_sk(child)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
+ tcp_segs_in(tcp_sk(child), skb);
if (!sock_owned_by_user(child)) {
ret = tcp_rcv_state_process(child, skb);
/* Wakeup parent, send SIGIO */
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 9864a2dba..773083b7f 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -135,7 +135,9 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
th->fin = th->psh = 0;
th->check = newcheck;
- if (skb->ip_summed != CHECKSUM_PARTIAL)
+ if (skb->ip_summed == CHECKSUM_PARTIAL)
+ gso_reset_checksum(skb, ~th->check);
+ else
th->check = gso_make_checksum(skb, ~th->check);
seq += mss;
@@ -169,7 +171,9 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
skb->data_len);
th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
(__force u32)delta));
- if (skb->ip_summed != CHECKSUM_PARTIAL)
+ if (skb->ip_summed == CHECKSUM_PARTIAL)
+ gso_reset_checksum(skb, ~th->check);
+ else
th->check = gso_make_checksum(skb, ~th->check);
out:
return segs;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 20f2812a9..c0c1dac81 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -62,9 +62,6 @@ int sysctl_tcp_tso_win_divisor __read_mostly = 3;
/* By default, RFC2861 behavior. */
int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
-unsigned int sysctl_tcp_notsent_lowat __read_mostly = UINT_MAX;
-EXPORT_SYMBOL(sysctl_tcp_notsent_lowat);
-
static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
int push_one, gfp_t gfp);
@@ -1013,8 +1010,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
if (likely(tcb->tcp_flags & TCPHDR_ACK))
tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
- if (skb->len != tcp_header_size)
+ if (skb->len != tcp_header_size) {
tcp_event_data_sent(tp, sk);
+ tp->data_segs_out += tcp_skb_pcount(skb);
+ }
if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
@@ -2449,6 +2448,20 @@ u32 __tcp_select_window(struct sock *sk)
return window;
}
+void tcp_skb_collapse_tstamp(struct sk_buff *skb,
+ const struct sk_buff *next_skb)
+{
+ const struct skb_shared_info *next_shinfo = skb_shinfo(next_skb);
+ u8 tsflags = next_shinfo->tx_flags & SKBTX_ANY_TSTAMP;
+
+ if (unlikely(tsflags)) {
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+ shinfo->tx_flags |= tsflags;
+ shinfo->tskey = next_shinfo->tskey;
+ }
+}
+
/* Collapses two adjacent SKB's during retransmission. */
static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
{
@@ -2492,6 +2505,8 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
tcp_adjust_pcount(sk, next_skb, tcp_skb_pcount(next_skb));
+ tcp_skb_collapse_tstamp(skb, next_skb);
+
sk_wmem_free_skb(sk, next_skb);
}
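
tcp_skb_collapse_tstamp() keeps TX timestamp requests alive when a retransmit collapse absorbs the skb that carried them: the SKBTX_ANY_TSTAMP bits and the tskey move to the surviving skb. The same merge modeled on plain structs in userspace, with illustrative field names and a stand-in flag mask:

#include <stdint.h>
#include <stdio.h>

#define TX_ANY_TSTAMP	0x06	/* stand-in for SKBTX_ANY_TSTAMP */

struct buf_info {
	uint8_t  tx_flags;
	uint32_t tskey;		/* identifies which byte range to timestamp */
};

static void collapse_tstamp(struct buf_info *keep, const struct buf_info *gone)
{
	uint8_t tsflags = gone->tx_flags & TX_ANY_TSTAMP;

	if (tsflags) {		/* only touch 'keep' if 'gone' requested stamps */
		keep->tx_flags |= tsflags;
		keep->tskey = gone->tskey;
	}
}

int main(void)
{
	struct buf_info keep = { 0, 0 }, gone = { 0x02, 4321 };

	collapse_tstamp(&keep, &gone);
	printf("flags=%#x tskey=%u\n", keep.tx_flags, (unsigned)keep.tskey);
	return 0;
}
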
@@ -2632,8 +2647,10 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
*/
if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) ||
skb_headroom(skb) >= 0xFFFF)) {
- struct sk_buff *nskb = __pskb_copy(skb, MAX_TCP_HEADER,
- GFP_ATOMIC);
+ struct sk_buff *nskb;
+
+ skb_mstamp_get(&skb->skb_mstamp);
+ nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
-ENOBUFS;
} else {
@@ -3491,6 +3508,7 @@ void tcp_send_probe0(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);
unsigned long probe_max;
int err;
@@ -3504,7 +3522,7 @@ void tcp_send_probe0(struct sock *sk)
}
if (err <= 0) {
- if (icsk->icsk_backoff < sysctl_tcp_retries2)
+ if (icsk->icsk_backoff < net->ipv4.sysctl_tcp_retries2)
icsk->icsk_backoff++;
icsk->icsk_probes_out++;
probe_max = TCP_RTO_MAX;
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index ebf5ff575..f6c50af24 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -187,13 +187,13 @@ static int tcpprobe_sprint(char *tbuf, int n)
{
const struct tcp_log *p
= tcp_probe.log + tcp_probe.tail;
- struct timespec tv
- = ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start));
+ struct timespec64 ts
+ = ktime_to_timespec64(ktime_sub(p->tstamp, tcp_probe.start));
return scnprintf(tbuf, n,
"%lu.%09lu %pISpc %pISpc %d %#x %#x %u %u %u %u %u\n",
- (unsigned long)tv.tv_sec,
- (unsigned long)tv.tv_nsec,
+ (unsigned long)ts.tv_sec,
+ (unsigned long)ts.tv_nsec,
&p->src, &p->dst, p->length, p->snd_nxt, p->snd_una,
p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt, p->rcv_wnd);
}
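
tcpprobe_sprint() switches its log timestamps from timespec to timespec64 so the seconds field is not confined to 32 bits (the y2038 concern); the printed "seconds.nanoseconds" format is unchanged. The equivalent formatting in userspace with clock_gettime(), which already uses a wide time_t on current systems, might look like this:

#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec start, now;

	clock_gettime(CLOCK_MONOTONIC, &start);
	clock_gettime(CLOCK_MONOTONIC, &now);

	/* Normalize the difference, then print as the probe does:
	 * seconds, a dot, nanoseconds zero-padded to nine digits. */
	long nsec = now.tv_nsec - start.tv_nsec;
	time_t sec = now.tv_sec - start.tv_sec;

	if (nsec < 0) {
		nsec += 1000000000L;
		sec--;
	}
	printf("%lu.%09lu\n", (unsigned long)sec, (unsigned long)nsec);
	return 0;
}
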
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index a4730a28b..49bc474f8 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -22,11 +22,6 @@
#include <linux/gfp.h>
#include <net/tcp.h>
-int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES;
-int sysctl_tcp_synack_retries __read_mostly = TCP_SYNACK_RETRIES;
-int sysctl_tcp_retries1 __read_mostly = TCP_RETR1;
-int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;
-int sysctl_tcp_orphan_retries __read_mostly;
int sysctl_tcp_thin_linear_timeouts __read_mostly;
static void tcp_write_err(struct sock *sk)
@@ -82,7 +77,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset)
/* Calculate maximal number or retries on an orphaned socket. */
static int tcp_orphan_retries(struct sock *sk, bool alive)
{
- int retries = sysctl_tcp_orphan_retries; /* May be zero. */
+ int retries = sock_net(sk)->ipv4.sysctl_tcp_orphan_retries; /* May be zero. */
/* We know from an ICMP that something is wrong. */
if (sk->sk_err_soft && !alive)
@@ -157,6 +152,7 @@ static int tcp_write_timeout(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);
int retry_until;
bool do_reset, syn_set = false;
@@ -169,10 +165,10 @@ static int tcp_write_timeout(struct sock *sk)
NET_INC_STATS_BH(sock_net(sk),
LINUX_MIB_TCPFASTOPENACTIVEFAIL);
}
- retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
+ retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
syn_set = true;
} else {
- if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) {
+ if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0, 0)) {
/* Some middle-boxes may black-hole Fast Open _after_
* the handshake. Therefore we conservatively disable
* Fast Open on this path on recurring timeouts with
@@ -181,7 +177,7 @@ static int tcp_write_timeout(struct sock *sk)
if (tp->syn_data_acked &&
tp->bytes_acked <= tp->rx_opt.mss_clamp) {
tcp_fastopen_cache_set(sk, 0, NULL, true, 0);
- if (icsk->icsk_retransmits == sysctl_tcp_retries1)
+ if (icsk->icsk_retransmits == net->ipv4.sysctl_tcp_retries1)
NET_INC_STATS_BH(sock_net(sk),
LINUX_MIB_TCPFASTOPENACTIVEFAIL);
}
@@ -191,7 +187,7 @@ static int tcp_write_timeout(struct sock *sk)
dst_negative_advice(sk);
}
- retry_until = sysctl_tcp_retries2;
+ retry_until = net->ipv4.sysctl_tcp_retries2;
if (sock_flag(sk, SOCK_DEAD)) {
const bool alive = icsk->icsk_rto < TCP_RTO_MAX;
@@ -305,7 +301,7 @@ static void tcp_probe_timer(struct sock *sk)
(s32)(tcp_time_stamp - start_ts) > icsk->icsk_user_timeout)
goto abort;
- max_probes = sysctl_tcp_retries2;
+ max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2;
if (sock_flag(sk, SOCK_DEAD)) {
const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX;
@@ -332,7 +328,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
int max_retries = icsk->icsk_syn_retries ? :
- sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */
+ sock_net(sk)->ipv4.sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */
struct request_sock *req;
req = tcp_sk(sk)->fastopen_rsk;
@@ -360,6 +356,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk)
void tcp_retransmit_timer(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
if (tp->fastopen_rsk) {
@@ -490,7 +487,7 @@ out_reset_timer:
icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
}
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
- if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0))
+ if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1 + 1, 0, 0))
__sk_dst_reset(sk);
out:;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index eb8933bc0..a2e7f55a1 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -339,8 +339,13 @@ found:
hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
spin_lock(&hslot2->lock);
- hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
- &hslot2->head);
+ if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
+ sk->sk_family == AF_INET6)
+ hlist_nulls_add_tail_rcu(&udp_sk(sk)->udp_portaddr_node,
+ &hslot2->head);
+ else
+ hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
+ &hslot2->head);
hslot2->count++;
spin_unlock(&hslot2->lock);
}
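
The udp_lib_get_port() hunk above only changes where IPv6 SO_REUSEPORT sockets are linked in the secondary (port+address) hash chain; the userspace side of reuseport is unchanged. For context, a minimal UDP listener that opts in (port 5353 and the wildcard address are placeholders):

#include <arpa/inet.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_in addr = { .sin_family = AF_INET,
				    .sin_port = htons(5353),
				    .sin_addr.s_addr = htonl(INADDR_ANY) };
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	int one = 1;

	/* Several sockets may bind the same port once they all set this;
	 * the kernel then spreads incoming datagrams across them. */
	if (fd < 0 ||
	    setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one)) < 0 ||
	    bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		perror("reuseport bind");
		return 1;
	}
	close(fd);
	return 0;
}
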
@@ -356,8 +361,8 @@ EXPORT_SYMBOL(udp_lib_get_port);
* match_wildcard == false: addresses must be exactly the same, i.e.
* 0.0.0.0 only equals to 0.0.0.0
*/
-static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2,
- bool match_wildcard)
+int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2,
+ bool match_wildcard)
{
struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2);
@@ -848,32 +853,20 @@ void udp_set_csum(bool nocheck, struct sk_buff *skb,
{
struct udphdr *uh = udp_hdr(skb);
- if (nocheck)
+ if (nocheck) {
uh->check = 0;
- else if (skb_is_gso(skb))
+ } else if (skb_is_gso(skb)) {
uh->check = ~udp_v4_check(len, saddr, daddr, 0);
- else if (skb_dst(skb) && skb_dst(skb)->dev &&
- (skb_dst(skb)->dev->features &
- (NETIF_F_IP_CSUM | NETIF_F_HW_CSUM))) {
-
- BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL);
-
+ } else if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ uh->check = 0;
+ uh->check = udp_v4_check(len, saddr, daddr, lco_csum(skb));
+ if (uh->check == 0)
+ uh->check = CSUM_MANGLED_0;
+ } else {
skb->ip_summed = CHECKSUM_PARTIAL;
skb->csum_start = skb_transport_header(skb) - skb->head;
skb->csum_offset = offsetof(struct udphdr, check);
uh->check = ~udp_v4_check(len, saddr, daddr, 0);
- } else {
- __wsum csum;
-
- BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL);
-
- uh->check = 0;
- csum = skb_checksum(skb, 0, len, 0);
- uh->check = udp_v4_check(len, saddr, daddr, csum);
- if (uh->check == 0)
- uh->check = CSUM_MANGLED_0;
-
- skb->ip_summed = CHECKSUM_UNNECESSARY;
}
}
EXPORT_SYMBOL(udp_set_csum);
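
The rewritten udp_set_csum() finishes the checksum itself when the inner packet is CHECKSUM_PARTIAL, folding the inner pseudo-checksum (lco_csum()) into the outer header, and it keeps the long-standing rule that a computed value of 0 must be transmitted as 0xFFFF, since 0 means "no checksum" for UDP over IPv4. A standalone demonstration of the underlying RFC 768/1071 arithmetic, including that mangling rule; the helper names are illustrative, not the kernel's:

#include <stdint.h>
#include <stdio.h>

/* Fold a 32-bit accumulator of 16-bit words into a ones-complement sum. */
static uint16_t fold16(uint32_t sum)
{
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}

/* Accumulate big-endian 16-bit words of a buffer into the running sum. */
static uint32_t sum_bytes(uint32_t sum, const void *data, size_t len)
{
	const uint8_t *p = data;

	while (len > 1) {
		sum += (uint32_t)p[0] << 8 | p[1];
		p += 2;
		len -= 2;
	}
	if (len)
		sum += (uint32_t)p[0] << 8;	/* odd trailing byte */
	return sum;
}

int main(void)
{
	uint8_t pseudo[12] = {
		192, 0, 2, 1,		/* source address */
		192, 0, 2, 2,		/* destination address */
		0, 17,			/* zero byte, protocol = UDP */
		0, 12,			/* UDP length: 8-byte header + 4 payload */
	};
	uint8_t udp[12] = {
		0x30, 0x39, 0x00, 0x35,	/* sport 12345, dport 53 */
		0x00, 0x0c, 0x00, 0x00,	/* length 12, checksum field = 0 */
		'p', 'i', 'n', 'g',	/* payload */
	};
	uint32_t sum = sum_bytes(0, pseudo, sizeof(pseudo));
	uint16_t check;

	sum = sum_bytes(sum, udp, sizeof(udp));
	check = ~fold16(sum);
	if (check == 0)			/* CSUM_MANGLED_0: 0 would mean "no checksum" */
		check = 0xffff;
	printf("udp checksum: %#06x\n", check);
	return 0;
}
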
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 4c519c1dc..e330c0e56 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -32,42 +32,65 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
netdev_features_t features),
__be16 new_protocol, bool is_ipv6)
{
+ int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
+ bool remcsum, need_csum, offload_csum, ufo;
struct sk_buff *segs = ERR_PTR(-EINVAL);
+ struct udphdr *uh = udp_hdr(skb);
u16 mac_offset = skb->mac_header;
- int mac_len = skb->mac_len;
- int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
__be16 protocol = skb->protocol;
- netdev_features_t enc_features;
+ u16 mac_len = skb->mac_len;
int udp_offset, outer_hlen;
- unsigned int oldlen;
- bool need_csum = !!(skb_shinfo(skb)->gso_type &
- SKB_GSO_UDP_TUNNEL_CSUM);
- bool remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM);
- bool offload_csum = false, dont_encap = (need_csum || remcsum);
-
- oldlen = (u16)~skb->len;
+ __wsum partial;
if (unlikely(!pskb_may_pull(skb, tnl_hlen)))
goto out;
+ /* Adjust partial header checksum to negate old length.
+ * We cannot rely on the value contained in uh->len as it is
+ * possible that the actual value exceeds the boundaries of the
+ * 16 bit length field due to the header being added outside of an
+ * IP or IPv6 frame that was already limited to 64K - 1.
+ */
+ partial = csum_sub(csum_unfold(uh->check),
+ (__force __wsum)htonl(skb->len));
+
+ /* setup inner skb. */
skb->encapsulation = 0;
+ SKB_GSO_CB(skb)->encap_level = 0;
__skb_pull(skb, tnl_hlen);
skb_reset_mac_header(skb);
skb_set_network_header(skb, skb_inner_network_offset(skb));
skb->mac_len = skb_inner_network_offset(skb);
skb->protocol = new_protocol;
+
+ need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM);
skb->encap_hdr_csum = need_csum;
+
+ remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM);
skb->remcsum_offload = remcsum;
+ ufo = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
+
/* Try to offload checksum if possible */
offload_csum = !!(need_csum &&
- ((skb->dev->features & NETIF_F_HW_CSUM) ||
- (skb->dev->features & (is_ipv6 ?
- NETIF_F_IPV6_CSUM : NETIF_F_IP_CSUM))));
+ (skb->dev->features &
+ (is_ipv6 ? (NETIF_F_HW_CSUM | NETIF_F_IPV6_CSUM) :
+ (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM))));
+
+ features &= skb->dev->hw_enc_features;
+
+ /* The only checksum offload we care about from here on out is the
+ * outer one so strip the existing checksum feature flags and
+ * instead set the flag based on our outer checksum offload value.
+ */
+ if (remcsum || ufo) {
+ features &= ~NETIF_F_CSUM_MASK;
+ if (!need_csum || offload_csum)
+ features |= NETIF_F_HW_CSUM;
+ }
/* segment inner packet. */
- enc_features = skb->dev->hw_enc_features & features;
- segs = gso_inner_segment(skb, enc_features);
+ segs = gso_inner_segment(skb, features);
if (IS_ERR_OR_NULL(segs)) {
skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset,
mac_len);
@@ -78,17 +101,13 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
udp_offset = outer_hlen - tnl_hlen;
skb = segs;
do {
- struct udphdr *uh;
- int len;
- __be32 delta;
+ __be16 len;
- if (dont_encap) {
- skb->encapsulation = 0;
+ if (remcsum)
skb->ip_summed = CHECKSUM_NONE;
- } else {
- /* Only set up inner headers if we might be offloading
- * inner checksum.
- */
+
+ /* Set up inner headers if we are offloading inner checksum */
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
skb_reset_inner_headers(skb);
skb->encapsulation = 1;
}
@@ -96,43 +115,27 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
skb->mac_len = mac_len;
skb->protocol = protocol;
- skb_push(skb, outer_hlen);
+ __skb_push(skb, outer_hlen);
skb_reset_mac_header(skb);
skb_set_network_header(skb, mac_len);
skb_set_transport_header(skb, udp_offset);
- len = skb->len - udp_offset;
+ len = htons(skb->len - udp_offset);
uh = udp_hdr(skb);
- uh->len = htons(len);
+ uh->len = len;
if (!need_csum)
continue;
- delta = htonl(oldlen + len);
+ uh->check = ~csum_fold(csum_add(partial, (__force __wsum)len));
- uh->check = ~csum_fold((__force __wsum)
- ((__force u32)uh->check +
- (__force u32)delta));
- if (offload_csum) {
- skb->ip_summed = CHECKSUM_PARTIAL;
- skb->csum_start = skb_transport_header(skb) - skb->head;
- skb->csum_offset = offsetof(struct udphdr, check);
- } else if (remcsum) {
- /* Need to calculate checksum from scratch,
- * inner checksums are never when doing
- * remote_checksum_offload.
- */
-
- skb->csum = skb_checksum(skb, udp_offset,
- skb->len - udp_offset,
- 0);
- uh->check = csum_fold(skb->csum);
- if (uh->check == 0)
- uh->check = CSUM_MANGLED_0;
- } else {
+ if (skb->encapsulation || !offload_csum) {
uh->check = gso_make_checksum(skb, ~uh->check);
-
if (uh->check == 0)
uh->check = CSUM_MANGLED_0;
+ } else {
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum_start = skb_transport_header(skb) - skb->head;
+ skb->csum_offset = offsetof(struct udphdr, check);
}
} while ((skb = skb->next));
out:
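
The central trick in the rewritten __skb_udp_tunnel_segment() is the 'partial' value: before segmentation it subtracts the original skb->len from the unfolded outer UDP checksum, and for every segment it only adds that segment's new length back and folds, rather than delta-patching against uh->len, which the comment above notes cannot be trusted once the tunneled frame exceeds what the 16-bit field can express. The ones-complement identity behind this, checked in plain C; the helpers below mirror the behaviour of the kernel's csum_add()/csum_sub() (note the kernel's csum_fold() additionally complements the folded value):

#include <stdint.h>
#include <stdio.h>

static uint32_t csum_add(uint32_t a, uint32_t b)
{
	uint64_t s = (uint64_t)a + b;

	return (uint32_t)(s + (s >> 32));	/* end-around carry */
}

static uint32_t csum_sub(uint32_t a, uint32_t b)
{
	return csum_add(a, ~b);			/* ones-complement subtraction */
}

static uint16_t fold16(uint32_t sum)
{
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}

int main(void)
{
	/* Pretend pseudo-header contribution (addresses + protocol) plus the
	 * original superframe length of a large GSO packet. */
	uint32_t base = 0x1a2b3c;
	uint32_t old_len = 9000, new_len = 1480;
	uint32_t full = csum_add(base, old_len);

	/* What the function precomputes once ... */
	uint32_t partial = csum_sub(full, old_len);

	/* ... and what it derives per segment, versus a from-scratch sum. */
	uint16_t per_seg   = fold16(csum_add(partial, new_len));
	uint16_t reference = fold16(csum_add(base, new_len));

	printf("per-segment %#06x, from scratch %#06x\n", per_seg, reference);
	return 0;
}
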
@@ -235,6 +238,13 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
skb->ip_summed = CHECKSUM_NONE;
+ /* If there is no outer header we can fake a checksum offload
+ * due to the fact that we have already done the checksum in
+ * software prior to segmenting the frame.
+ */
+ if (!skb->encap_hdr_csum)
+ features |= NETIF_F_HW_CSUM;
+
/* Fragment the skb. IP headers of the fragments are updated in
* inet_gso_segment()
*/
@@ -302,14 +312,14 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
unsigned int off = skb_gro_offset(skb);
int flush = 1;
- if (NAPI_GRO_CB(skb)->udp_mark ||
+ if (NAPI_GRO_CB(skb)->encap_mark ||
(skb->ip_summed != CHECKSUM_PARTIAL &&
NAPI_GRO_CB(skb)->csum_cnt == 0 &&
!NAPI_GRO_CB(skb)->csum_valid))
goto out;
- /* mark that this skb passed once through the udp gro layer */
- NAPI_GRO_CB(skb)->udp_mark = 1;
+ /* mark that this skb passed once through the tunnel gro layer */
+ NAPI_GRO_CB(skb)->encap_mark = 1;
rcu_read_lock();
uo_priv = rcu_dereference(udp_offload_base);
@@ -389,6 +399,11 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff)
uh->len = newlen;
+ /* Set encapsulation before calling into inner gro_complete() functions
+ * to make them set up the inner offsets.
+ */
+ skb->encapsulation = 1;
+
rcu_read_lock();
uo_priv = rcu_dereference(udp_offload_base);
@@ -411,9 +426,6 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff)
if (skb->remcsum_offload)
skb_shinfo(skb)->gso_type |= SKB_GSO_TUNNEL_REMCSUM;
- skb->encapsulation = 1;
- skb_set_inner_mac_header(skb, nhoff + sizeof(struct udphdr));
-
return err;
}