summaryrefslogtreecommitdiff
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/af_inet.c101
-rw-r--r--net/ipv4/arp.c2
-rw-r--r--net/ipv4/cipso_ipv4.c3
-rw-r--r--net/ipv4/fib_frontend.c1
-rw-r--r--net/ipv4/fib_semantics.c40
-rw-r--r--net/ipv4/fou.c196
-rw-r--r--net/ipv4/gre_demux.c61
-rw-r--r--net/ipv4/gre_offload.c49
-rw-r--r--net/ipv4/icmp.c18
-rw-r--r--net/ipv4/inet_connection_sock.c9
-rw-r--r--net/ipv4/inet_diag.c98
-rw-r--r--net/ipv4/inet_hashtables.c86
-rw-r--r--net/ipv4/inet_timewait_sock.c10
-rw-r--r--net/ipv4/ip_forward.c6
-rw-r--r--net/ipv4/ip_fragment.c14
-rw-r--r--net/ipv4/ip_gre.c281
-rw-r--r--net/ipv4/ip_input.c42
-rw-r--r--net/ipv4/ip_output.c4
-rw-r--r--net/ipv4/ip_sockglue.c34
-rw-r--r--net/ipv4/ip_tunnel.c45
-rw-r--r--net/ipv4/ip_tunnel_core.c50
-rw-r--r--net/ipv4/ipconfig.c4
-rw-r--r--net/ipv4/ipip.c7
-rw-r--r--net/ipv4/ipmr.c4
-rw-r--r--net/ipv4/netfilter/arp_tables.c233
-rw-r--r--net/ipv4/netfilter/ip_tables.c259
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c2
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c47
-rw-r--r--net/ipv4/ping.c7
-rw-r--r--net/ipv4/raw.c13
-rw-r--r--net/ipv4/route.c10
-rw-r--r--net/ipv4/syncookies.c4
-rw-r--r--net/ipv4/sysctl_net_ipv4.c11
-rw-r--r--net/ipv4/tcp.c65
-rw-r--r--net/ipv4/tcp_bic.c6
-rw-r--r--net/ipv4/tcp_cdg.c34
-rw-r--r--net/ipv4/tcp_cubic.c26
-rw-r--r--net/ipv4/tcp_fastopen.c14
-rw-r--r--net/ipv4/tcp_htcp.c10
-rw-r--r--net/ipv4/tcp_illinois.c21
-rw-r--r--net/ipv4/tcp_input.c300
-rw-r--r--net/ipv4/tcp_ipv4.c144
-rw-r--r--net/ipv4/tcp_lp.c6
-rw-r--r--net/ipv4/tcp_metrics.c6
-rw-r--r--net/ipv4/tcp_minisocks.c19
-rw-r--r--net/ipv4/tcp_offload.c43
-rw-r--r--net/ipv4/tcp_output.c175
-rw-r--r--net/ipv4/tcp_recovery.c4
-rw-r--r--net/ipv4/tcp_timer.c28
-rw-r--r--net/ipv4/tcp_vegas.c6
-rw-r--r--net/ipv4/tcp_vegas.h2
-rw-r--r--net/ipv4/tcp_veno.c7
-rw-r--r--net/ipv4/tcp_westwood.c7
-rw-r--r--net/ipv4/tcp_yeah.c7
-rw-r--r--net/ipv4/udp.c504
-rw-r--r--net/ipv4/udp_diag.c18
-rw-r--r--net/ipv4/udp_offload.c150
-rw-r--r--net/ipv4/udp_tunnel.c2
58 files changed, 1440 insertions, 1915 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 7ad0e567c..d39e9e47a 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -948,6 +948,7 @@ const struct proto_ops inet_dgram_ops = {
.recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
.sendpage = inet_sendpage,
+ .set_peek_off = sk_set_peek_off,
#ifdef CONFIG_COMPAT
.compat_setsockopt = compat_sock_common_setsockopt,
.compat_getsockopt = compat_sock_common_getsockopt,
@@ -1106,7 +1107,7 @@ static int inet_sk_reselect_saddr(struct sock *sk)
struct ip_options_rcu *inet_opt;
inet_opt = rcu_dereference_protected(inet->inet_opt,
- sock_owned_by_user(sk));
+ lockdep_sock_is_held(sk));
if (inet_opt && inet_opt->opt.srr)
daddr = inet_opt->opt.faddr;
@@ -1191,35 +1192,19 @@ int inet_sk_rebuild_header(struct sock *sk)
}
EXPORT_SYMBOL(inet_sk_rebuild_header);
-static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
- netdev_features_t features)
+struct sk_buff *inet_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
{
+ bool udpfrag = false, fixedid = false, encap;
struct sk_buff *segs = ERR_PTR(-EINVAL);
const struct net_offload *ops;
unsigned int offset = 0;
- bool udpfrag, encap;
struct iphdr *iph;
- int proto;
+ int proto, tot_len;
int nhoff;
int ihl;
int id;
- if (unlikely(skb_shinfo(skb)->gso_type &
- ~(SKB_GSO_TCPV4 |
- SKB_GSO_UDP |
- SKB_GSO_DODGY |
- SKB_GSO_TCP_ECN |
- SKB_GSO_GRE |
- SKB_GSO_GRE_CSUM |
- SKB_GSO_IPIP |
- SKB_GSO_SIT |
- SKB_GSO_TCPV6 |
- SKB_GSO_UDP_TUNNEL |
- SKB_GSO_UDP_TUNNEL_CSUM |
- SKB_GSO_TUNNEL_REMCSUM |
- 0)))
- goto out;
-
skb_reset_network_header(skb);
nhoff = skb_network_header(skb) - skb_mac_header(skb);
if (unlikely(!pskb_may_pull(skb, sizeof(*iph))))
@@ -1247,11 +1232,14 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
segs = ERR_PTR(-EPROTONOSUPPORT);
- if (skb->encapsulation &&
- skb_shinfo(skb)->gso_type & (SKB_GSO_SIT|SKB_GSO_IPIP))
- udpfrag = proto == IPPROTO_UDP && encap;
- else
- udpfrag = proto == IPPROTO_UDP && !skb->encapsulation;
+ if (!skb->encapsulation || encap) {
+ udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
+ fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID);
+
+ /* fixed ID is invalid if DF bit is not set */
+ if (fixedid && !(iph->frag_off & htons(IP_DF)))
+ goto out;
+ }
ops = rcu_dereference(inet_offloads[proto]);
if (likely(ops && ops->callbacks.gso_segment))
@@ -1264,15 +1252,25 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
do {
iph = (struct iphdr *)(skb_mac_header(skb) + nhoff);
if (udpfrag) {
- iph->id = htons(id);
iph->frag_off = htons(offset >> 3);
if (skb->next)
iph->frag_off |= htons(IP_MF);
offset += skb->len - nhoff - ihl;
+ tot_len = skb->len - nhoff;
+ } else if (skb_is_gso(skb)) {
+ if (!fixedid) {
+ iph->id = htons(id);
+ id += skb_shinfo(skb)->gso_segs;
+ }
+ tot_len = skb_shinfo(skb)->gso_size +
+ SKB_GSO_CB(skb)->data_offset +
+ skb->head - (unsigned char *)iph;
} else {
- iph->id = htons(id++);
+ if (!fixedid)
+ iph->id = htons(id++);
+ tot_len = skb->len - nhoff;
}
- iph->tot_len = htons(skb->len - nhoff);
+ iph->tot_len = htons(tot_len);
ip_send_check(iph);
if (encap)
skb_reset_inner_headers(skb);
@@ -1282,9 +1280,9 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb,
out:
return segs;
}
+EXPORT_SYMBOL(inet_gso_segment);
-static struct sk_buff **inet_gro_receive(struct sk_buff **head,
- struct sk_buff *skb)
+struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff *skb)
{
const struct net_offload *ops;
struct sk_buff **pp = NULL;
@@ -1324,6 +1322,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
for (p = *head; p; p = p->next) {
struct iphdr *iph2;
+ u16 flush_id;
if (!NAPI_GRO_CB(p)->same_flow)
continue;
@@ -1347,16 +1346,36 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head,
(iph->tos ^ iph2->tos) |
((iph->frag_off ^ iph2->frag_off) & htons(IP_DF));
- /* Save the IP ID check to be included later when we get to
- * the transport layer so only the inner most IP ID is checked.
- * This is because some GSO/TSO implementations do not
- * correctly increment the IP ID for the outer hdrs.
- */
- NAPI_GRO_CB(p)->flush_id =
- ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id);
NAPI_GRO_CB(p)->flush |= flush;
+
+ /* We need to store of the IP ID check to be included later
+ * when we can verify that this packet does in fact belong
+ * to a given flow.
+ */
+ flush_id = (u16)(id - ntohs(iph2->id));
+
+ /* This bit of code makes it much easier for us to identify
+ * the cases where we are doing atomic vs non-atomic IP ID
+ * checks. Specifically an atomic check can return IP ID
+ * values 0 - 0xFFFF, while a non-atomic check can only
+ * return 0 or 0xFFFF.
+ */
+ if (!NAPI_GRO_CB(p)->is_atomic ||
+ !(iph->frag_off & htons(IP_DF))) {
+ flush_id ^= NAPI_GRO_CB(p)->count;
+ flush_id = flush_id ? 0xFFFF : 0;
+ }
+
+ /* If the previous IP ID value was based on an atomic
+ * datagram we can overwrite the value and ignore it.
+ */
+ if (NAPI_GRO_CB(skb)->is_atomic)
+ NAPI_GRO_CB(p)->flush_id = flush_id;
+ else
+ NAPI_GRO_CB(p)->flush_id |= flush_id;
}
+ NAPI_GRO_CB(skb)->is_atomic = !!(iph->frag_off & htons(IP_DF));
NAPI_GRO_CB(skb)->flush |= flush;
skb_set_network_header(skb, off);
/* The above will be needed by the transport layer if there is one
@@ -1379,6 +1398,7 @@ out:
return pp;
}
+EXPORT_SYMBOL(inet_gro_receive);
static struct sk_buff **ipip_gro_receive(struct sk_buff **head,
struct sk_buff *skb)
@@ -1430,7 +1450,7 @@ int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
return -EINVAL;
}
-static int inet_gro_complete(struct sk_buff *skb, int nhoff)
+int inet_gro_complete(struct sk_buff *skb, int nhoff)
{
__be16 newlen = htons(skb->len - nhoff);
struct iphdr *iph = (struct iphdr *)(skb->data + nhoff);
@@ -1460,11 +1480,12 @@ out_unlock:
return err;
}
+EXPORT_SYMBOL(inet_gro_complete);
static int ipip_gro_complete(struct sk_buff *skb, int nhoff)
{
skb->encapsulation = 1;
- skb_shinfo(skb)->gso_type |= SKB_GSO_IPIP;
+ skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP4;
return inet_gro_complete(skb, nhoff);
}
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index c34c7544d..89a8cac47 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -436,7 +436,7 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
if (IS_ERR(rt))
return 1;
if (rt->dst.dev != dev) {
- NET_INC_STATS_BH(net, LINUX_MIB_ARPFILTER);
+ __NET_INC_STATS(net, LINUX_MIB_ARPFILTER);
flag = 1;
}
ip_rt_put(rt);
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index bdb2a07ec..40d6b8771 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -1933,7 +1933,8 @@ int cipso_v4_sock_setattr(struct sock *sk,
sk_inet = inet_sk(sk);
- old = rcu_dereference_protected(sk_inet->inet_opt, sock_owned_by_user(sk));
+ old = rcu_dereference_protected(sk_inet->inet_opt,
+ lockdep_sock_is_held(sk));
if (sk_inet->is_icsk) {
sk_conn = inet_csk(sk);
if (old)
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 63566ec54..ef2ebeb89 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -110,6 +110,7 @@ struct fib_table *fib_new_table(struct net *net, u32 id)
hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]);
return tb;
}
+EXPORT_SYMBOL_GPL(fib_new_table);
/* caller must hold either rtnl or rcu read lock */
struct fib_table *fib_get_table(struct net *net, u32 id)
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 2b68418c7..539fa264e 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -479,6 +479,9 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
if (!rtnh_ok(rtnh, remaining))
return -EINVAL;
+ if (rtnh->rtnh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN))
+ return -EINVAL;
+
nexthop_nh->nh_flags =
(cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
nexthop_nh->nh_oif = rtnh->rtnh_ifindex;
@@ -1003,6 +1006,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
goto err_inval;
+ if (cfg->fc_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN))
+ goto err_inval;
+
#ifdef CONFIG_IP_ROUTE_MULTIPATH
if (cfg->fc_mp) {
nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
@@ -1561,21 +1567,45 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
+static bool fib_good_nh(const struct fib_nh *nh)
+{
+ int state = NUD_REACHABLE;
+
+ if (nh->nh_scope == RT_SCOPE_LINK) {
+ struct neighbour *n;
+
+ rcu_read_lock_bh();
+
+ n = __ipv4_neigh_lookup_noref(nh->nh_dev, nh->nh_gw);
+ if (n)
+ state = n->nud_state;
+
+ rcu_read_unlock_bh();
+ }
+
+ return !!(state & NUD_VALID);
+}
void fib_select_multipath(struct fib_result *res, int hash)
{
struct fib_info *fi = res->fi;
+ struct net *net = fi->fib_net;
+ bool first = false;
for_nexthops(fi) {
if (hash > atomic_read(&nh->nh_upper_bound))
continue;
- res->nh_sel = nhsel;
- return;
+ if (!net->ipv4.sysctl_fib_multipath_use_neigh ||
+ fib_good_nh(nh)) {
+ res->nh_sel = nhsel;
+ return;
+ }
+ if (!first) {
+ res->nh_sel = nhsel;
+ first = true;
+ }
} endfor_nexthops(fi);
-
- /* Race condition: route has just become dead. */
- res->nh_sel = 0;
}
#endif
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index a6962ccad..5f9207c03 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -21,8 +21,8 @@ struct fou {
u8 protocol;
u8 flags;
__be16 port;
+ u8 family;
u16 type;
- struct udp_offload udp_offloads;
struct list_head list;
struct rcu_head rcu;
};
@@ -48,14 +48,17 @@ static inline struct fou *fou_from_sock(struct sock *sk)
return sk->sk_user_data;
}
-static int fou_recv_pull(struct sk_buff *skb, size_t len)
+static int fou_recv_pull(struct sk_buff *skb, struct fou *fou, size_t len)
{
- struct iphdr *iph = ip_hdr(skb);
-
/* Remove 'len' bytes from the packet (UDP header and
* FOU header if present).
*/
- iph->tot_len = htons(ntohs(iph->tot_len) - len);
+ if (fou->family == AF_INET)
+ ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(skb)->tot_len) - len);
+ else
+ ipv6_hdr(skb)->payload_len =
+ htons(ntohs(ipv6_hdr(skb)->payload_len) - len);
+
__skb_pull(skb, len);
skb_postpull_rcsum(skb, udp_hdr(skb), len);
skb_reset_transport_header(skb);
@@ -69,7 +72,7 @@ static int fou_udp_recv(struct sock *sk, struct sk_buff *skb)
if (!fou)
return 1;
- if (fou_recv_pull(skb, sizeof(struct udphdr)))
+ if (fou_recv_pull(skb, fou, sizeof(struct udphdr)))
goto drop;
return -fou->protocol;
@@ -142,7 +145,11 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb)
hdrlen = sizeof(struct guehdr) + optlen;
- ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(skb)->tot_len) - len);
+ if (fou->family == AF_INET)
+ ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(skb)->tot_len) - len);
+ else
+ ipv6_hdr(skb)->payload_len =
+ htons(ntohs(ipv6_hdr(skb)->payload_len) - len);
/* Pull csum through the guehdr now . This can be used if
* there is a remote checksum offload.
@@ -186,13 +193,13 @@ drop:
return 0;
}
-static struct sk_buff **fou_gro_receive(struct sk_buff **head,
- struct sk_buff *skb,
- struct udp_offload *uoff)
+static struct sk_buff **fou_gro_receive(struct sock *sk,
+ struct sk_buff **head,
+ struct sk_buff *skb)
{
const struct net_offload *ops;
struct sk_buff **pp = NULL;
- u8 proto = NAPI_GRO_CB(skb)->proto;
+ u8 proto = fou_from_sock(sk)->protocol;
const struct net_offload **offloads;
/* We can clear the encap_mark for FOU as we are essentially doing
@@ -220,11 +227,11 @@ out_unlock:
return pp;
}
-static int fou_gro_complete(struct sk_buff *skb, int nhoff,
- struct udp_offload *uoff)
+static int fou_gro_complete(struct sock *sk, struct sk_buff *skb,
+ int nhoff)
{
const struct net_offload *ops;
- u8 proto = NAPI_GRO_CB(skb)->proto;
+ u8 proto = fou_from_sock(sk)->protocol;
int err = -ENOSYS;
const struct net_offload **offloads;
@@ -267,9 +274,9 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off,
return guehdr;
}
-static struct sk_buff **gue_gro_receive(struct sk_buff **head,
- struct sk_buff *skb,
- struct udp_offload *uoff)
+static struct sk_buff **gue_gro_receive(struct sock *sk,
+ struct sk_buff **head,
+ struct sk_buff *skb)
{
const struct net_offload **offloads;
const struct net_offload *ops;
@@ -280,7 +287,7 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head,
void *data;
u16 doffset = 0;
int flush = 1;
- struct fou *fou = container_of(uoff, struct fou, udp_offloads);
+ struct fou *fou = fou_from_sock(sk);
struct gro_remcsum grc;
skb_gro_remcsum_init(&grc);
@@ -392,8 +399,7 @@ out:
return pp;
}
-static int gue_gro_complete(struct sk_buff *skb, int nhoff,
- struct udp_offload *uoff)
+static int gue_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff)
{
const struct net_offload **offloads;
struct guehdr *guehdr = (struct guehdr *)(skb->data + nhoff);
@@ -428,7 +434,8 @@ static int fou_add_to_port_list(struct net *net, struct fou *fou)
mutex_lock(&fn->fou_lock);
list_for_each_entry(fout, &fn->fou_list, list) {
- if (fou->port == fout->port) {
+ if (fou->port == fout->port &&
+ fou->family == fout->family) {
mutex_unlock(&fn->fou_lock);
return -EALREADY;
}
@@ -443,44 +450,20 @@ static int fou_add_to_port_list(struct net *net, struct fou *fou)
static void fou_release(struct fou *fou)
{
struct socket *sock = fou->sock;
- struct sock *sk = sock->sk;
- if (sk->sk_family == AF_INET)
- udp_del_offload(&fou->udp_offloads);
list_del(&fou->list);
udp_tunnel_sock_release(sock);
kfree_rcu(fou, rcu);
}
-static int fou_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg)
-{
- udp_sk(sk)->encap_rcv = fou_udp_recv;
- fou->protocol = cfg->protocol;
- fou->udp_offloads.callbacks.gro_receive = fou_gro_receive;
- fou->udp_offloads.callbacks.gro_complete = fou_gro_complete;
- fou->udp_offloads.port = cfg->udp_config.local_udp_port;
- fou->udp_offloads.ipproto = cfg->protocol;
-
- return 0;
-}
-
-static int gue_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg)
-{
- udp_sk(sk)->encap_rcv = gue_udp_recv;
- fou->udp_offloads.callbacks.gro_receive = gue_gro_receive;
- fou->udp_offloads.callbacks.gro_complete = gue_gro_complete;
- fou->udp_offloads.port = cfg->udp_config.local_udp_port;
-
- return 0;
-}
-
static int fou_create(struct net *net, struct fou_cfg *cfg,
struct socket **sockp)
{
struct socket *sock = NULL;
struct fou *fou = NULL;
struct sock *sk;
+ struct udp_tunnel_sock_cfg tunnel_cfg;
int err;
/* Open UDP socket */
@@ -497,44 +480,39 @@ static int fou_create(struct net *net, struct fou_cfg *cfg,
sk = sock->sk;
- fou->flags = cfg->flags;
fou->port = cfg->udp_config.local_udp_port;
+ fou->family = cfg->udp_config.family;
+ fou->flags = cfg->flags;
+ fou->type = cfg->type;
+ fou->sock = sock;
+
+ memset(&tunnel_cfg, 0, sizeof(tunnel_cfg));
+ tunnel_cfg.encap_type = 1;
+ tunnel_cfg.sk_user_data = fou;
+ tunnel_cfg.encap_destroy = NULL;
/* Initial for fou type */
switch (cfg->type) {
case FOU_ENCAP_DIRECT:
- err = fou_encap_init(sk, fou, cfg);
- if (err)
- goto error;
+ tunnel_cfg.encap_rcv = fou_udp_recv;
+ tunnel_cfg.gro_receive = fou_gro_receive;
+ tunnel_cfg.gro_complete = fou_gro_complete;
+ fou->protocol = cfg->protocol;
break;
case FOU_ENCAP_GUE:
- err = gue_encap_init(sk, fou, cfg);
- if (err)
- goto error;
+ tunnel_cfg.encap_rcv = gue_udp_recv;
+ tunnel_cfg.gro_receive = gue_gro_receive;
+ tunnel_cfg.gro_complete = gue_gro_complete;
break;
default:
err = -EINVAL;
goto error;
}
- fou->type = cfg->type;
-
- udp_sk(sk)->encap_type = 1;
- udp_encap_enable();
-
- sk->sk_user_data = fou;
- fou->sock = sock;
-
- inet_inc_convert_csum(sk);
+ setup_udp_tunnel_sock(net, sock, &tunnel_cfg);
sk->sk_allocation = GFP_ATOMIC;
- if (cfg->udp_config.family == AF_INET) {
- err = udp_add_offload(net, &fou->udp_offloads);
- if (err)
- goto error;
- }
-
err = fou_add_to_port_list(net, fou);
if (err)
goto error;
@@ -556,12 +534,13 @@ static int fou_destroy(struct net *net, struct fou_cfg *cfg)
{
struct fou_net *fn = net_generic(net, fou_net_id);
__be16 port = cfg->udp_config.local_udp_port;
+ u8 family = cfg->udp_config.family;
int err = -EINVAL;
struct fou *fou;
mutex_lock(&fn->fou_lock);
list_for_each_entry(fou, &fn->fou_list, list) {
- if (fou->port == port) {
+ if (fou->port == port && fou->family == family) {
fou_release(fou);
err = 0;
break;
@@ -599,8 +578,15 @@ static int parse_nl_config(struct genl_info *info,
if (info->attrs[FOU_ATTR_AF]) {
u8 family = nla_get_u8(info->attrs[FOU_ATTR_AF]);
- if (family != AF_INET)
- return -EINVAL;
+ switch (family) {
+ case AF_INET:
+ break;
+ case AF_INET6:
+ cfg->udp_config.ipv6_v6only = 1;
+ break;
+ default:
+ return -EAFNOSUPPORT;
+ }
cfg->udp_config.family = family;
}
@@ -691,6 +677,7 @@ static int fou_nl_cmd_get_port(struct sk_buff *skb, struct genl_info *info)
struct fou_cfg cfg;
struct fou *fout;
__be16 port;
+ u8 family;
int ret;
ret = parse_nl_config(info, &cfg);
@@ -700,6 +687,10 @@ static int fou_nl_cmd_get_port(struct sk_buff *skb, struct genl_info *info)
if (port == 0)
return -EINVAL;
+ family = cfg.udp_config.family;
+ if (family != AF_INET && family != AF_INET6)
+ return -EINVAL;
+
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
return -ENOMEM;
@@ -707,7 +698,7 @@ static int fou_nl_cmd_get_port(struct sk_buff *skb, struct genl_info *info)
ret = -ESRCH;
mutex_lock(&fn->fou_lock);
list_for_each_entry(fout, &fn->fou_list, list) {
- if (port == fout->port) {
+ if (port == fout->port && family == fout->family) {
ret = fou_dump_info(fout, info->snd_portid,
info->snd_seq, 0, msg,
info->genlhdr->cmd);
@@ -812,36 +803,48 @@ static void fou_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e,
*protocol = IPPROTO_UDP;
}
+int __fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
+ u8 *protocol, __be16 *sport, int type)
+{
+ int err;
+
+ err = iptunnel_handle_offloads(skb, type);
+ if (err)
+ return err;
+
+ *sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev),
+ skb, 0, 0, false);
+
+ return 0;
+}
+EXPORT_SYMBOL(__fou_build_header);
+
int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
u8 *protocol, struct flowi4 *fl4)
{
int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? SKB_GSO_UDP_TUNNEL_CSUM :
SKB_GSO_UDP_TUNNEL;
__be16 sport;
+ int err;
- skb = iptunnel_handle_offloads(skb, type);
-
- if (IS_ERR(skb))
- return PTR_ERR(skb);
+ err = __fou_build_header(skb, e, protocol, &sport, type);
+ if (err)
+ return err;
- sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev),
- skb, 0, 0, false);
fou_build_udp(skb, e, fl4, protocol, sport);
return 0;
}
EXPORT_SYMBOL(fou_build_header);
-int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
- u8 *protocol, struct flowi4 *fl4)
+int __gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
+ u8 *protocol, __be16 *sport, int type)
{
- int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? SKB_GSO_UDP_TUNNEL_CSUM :
- SKB_GSO_UDP_TUNNEL;
struct guehdr *guehdr;
size_t hdrlen, optlen = 0;
- __be16 sport;
void *data;
bool need_priv = false;
+ int err;
if ((e->flags & TUNNEL_ENCAP_FLAG_REMCSUM) &&
skb->ip_summed == CHECKSUM_PARTIAL) {
@@ -852,14 +855,13 @@ int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
optlen += need_priv ? GUE_LEN_PRIV : 0;
- skb = iptunnel_handle_offloads(skb, type);
-
- if (IS_ERR(skb))
- return PTR_ERR(skb);
+ err = iptunnel_handle_offloads(skb, type);
+ if (err)
+ return err;
/* Get source port (based on flow hash) before skb_push */
- sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev),
- skb, 0, 0, false);
+ *sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev),
+ skb, 0, 0, false);
hdrlen = sizeof(struct guehdr) + optlen;
@@ -904,6 +906,22 @@ int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
}
+ return 0;
+}
+EXPORT_SYMBOL(__gue_build_header);
+
+int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
+ u8 *protocol, struct flowi4 *fl4)
+{
+ int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? SKB_GSO_UDP_TUNNEL_CSUM :
+ SKB_GSO_UDP_TUNNEL;
+ __be16 sport;
+ int err;
+
+ err = __gue_build_header(skb, e, protocol, &sport, type);
+ if (err)
+ return err;
+
fou_build_udp(skb, e, fl4, protocol, sport);
return 0;
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
index d9c552a72..de1d119a4 100644
--- a/net/ipv4/gre_demux.c
+++ b/net/ipv4/gre_demux.c
@@ -60,6 +60,67 @@ int gre_del_protocol(const struct gre_protocol *proto, u8 version)
}
EXPORT_SYMBOL_GPL(gre_del_protocol);
+/* Fills in tpi and returns header length to be pulled. */
+int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
+ bool *csum_err, __be16 proto, int nhs)
+{
+ const struct gre_base_hdr *greh;
+ __be32 *options;
+ int hdr_len;
+
+ if (unlikely(!pskb_may_pull(skb, nhs + sizeof(struct gre_base_hdr))))
+ return -EINVAL;
+
+ greh = (struct gre_base_hdr *)(skb->data + nhs);
+ if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING)))
+ return -EINVAL;
+
+ tpi->flags = gre_flags_to_tnl_flags(greh->flags);
+ hdr_len = gre_calc_hlen(tpi->flags);
+
+ if (!pskb_may_pull(skb, nhs + hdr_len))
+ return -EINVAL;
+
+ greh = (struct gre_base_hdr *)(skb->data + nhs);
+ tpi->proto = greh->protocol;
+
+ options = (__be32 *)(greh + 1);
+ if (greh->flags & GRE_CSUM) {
+ if (skb_checksum_simple_validate(skb)) {
+ *csum_err = true;
+ return -EINVAL;
+ }
+
+ skb_checksum_try_convert(skb, IPPROTO_GRE, 0,
+ null_compute_pseudo);
+ options++;
+ }
+
+ if (greh->flags & GRE_KEY) {
+ tpi->key = *options;
+ options++;
+ } else {
+ tpi->key = 0;
+ }
+ if (unlikely(greh->flags & GRE_SEQ)) {
+ tpi->seq = *options;
+ options++;
+ } else {
+ tpi->seq = 0;
+ }
+ /* WCCP version 1 and 2 protocol decoding.
+ * - Change protocol to IPv4/IPv6
+ * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
+ */
+ if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) {
+ tpi->proto = proto;
+ if ((*(u8 *)options & 0xF0) != 0x40)
+ hdr_len += 4;
+ }
+ return hdr_len;
+}
+EXPORT_SYMBOL(gre_parse_header);
+
static int gre_rcv(struct sk_buff *skb)
{
const struct gre_protocol *proto;
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index 6a5bd4317..ecd1e09db 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -26,18 +26,6 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
int gre_offset, outer_hlen;
bool need_csum, ufo;
- if (unlikely(skb_shinfo(skb)->gso_type &
- ~(SKB_GSO_TCPV4 |
- SKB_GSO_TCPV6 |
- SKB_GSO_UDP |
- SKB_GSO_DODGY |
- SKB_GSO_TCP_ECN |
- SKB_GSO_GRE |
- SKB_GSO_GRE_CSUM |
- SKB_GSO_IPIP |
- SKB_GSO_SIT)))
- goto out;
-
if (!skb->encapsulation)
goto out;
@@ -86,7 +74,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
skb = segs;
do {
struct gre_base_hdr *greh;
- __be32 *pcsum;
+ __sum16 *pcsum;
/* Set up inner headers if we are offloading inner checksum */
if (skb->ip_summed == CHECKSUM_PARTIAL) {
@@ -106,10 +94,25 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
continue;
greh = (struct gre_base_hdr *)skb_transport_header(skb);
- pcsum = (__be32 *)(greh + 1);
+ pcsum = (__sum16 *)(greh + 1);
+
+ if (skb_is_gso(skb)) {
+ unsigned int partial_adj;
+
+ /* Adjust checksum to account for the fact that
+ * the partial checksum is based on actual size
+ * whereas headers should be based on MSS size.
+ */
+ partial_adj = skb->len + skb_headroom(skb) -
+ SKB_GSO_CB(skb)->data_offset -
+ skb_shinfo(skb)->gso_size;
+ *pcsum = ~csum_fold((__force __wsum)htonl(partial_adj));
+ } else {
+ *pcsum = 0;
+ }
- *pcsum = 0;
- *(__sum16 *)pcsum = gso_make_checksum(skb, 0);
+ *(pcsum + 1) = 0;
+ *pcsum = gso_make_checksum(skb, 0);
} while ((skb = skb->next));
out:
return segs;
@@ -275,6 +278,18 @@ static const struct net_offload gre_offload = {
static int __init gre_offload_init(void)
{
- return inet_add_offload(&gre_offload, IPPROTO_GRE);
+ int err;
+
+ err = inet_add_offload(&gre_offload, IPPROTO_GRE);
+#if IS_ENABLED(CONFIG_IPV6)
+ if (err)
+ return err;
+
+ err = inet6_add_offload(&gre_offload, IPPROTO_GRE);
+ if (err)
+ inet_del_offload(&gre_offload, IPPROTO_GRE);
+#endif
+
+ return err;
}
device_initcall(gre_offload_init);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 633348977..38abe70e5 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -363,7 +363,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
icmp_param->data_len+icmp_param->head_len,
icmp_param->head_len,
ipc, rt, MSG_DONTWAIT) < 0) {
- ICMP_INC_STATS_BH(sock_net(sk), ICMP_MIB_OUTERRORS);
+ __ICMP_INC_STATS(sock_net(sk), ICMP_MIB_OUTERRORS);
ip_flush_pending_frames(sk);
} else if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
struct icmphdr *icmph = icmp_hdr(skb);
@@ -744,7 +744,7 @@ static void icmp_socket_deliver(struct sk_buff *skb, u32 info)
* avoid additional coding at protocol handlers.
*/
if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) {
- ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS);
+ __ICMP_INC_STATS(dev_net(skb->dev), ICMP_MIB_INERRORS);
return;
}
@@ -865,7 +865,7 @@ static bool icmp_unreach(struct sk_buff *skb)
out:
return true;
out_err:
- ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
+ __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
return false;
}
@@ -877,7 +877,7 @@ out_err:
static bool icmp_redirect(struct sk_buff *skb)
{
if (skb->len < sizeof(struct iphdr)) {
- ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS);
+ __ICMP_INC_STATS(dev_net(skb->dev), ICMP_MIB_INERRORS);
return false;
}
@@ -956,7 +956,7 @@ static bool icmp_timestamp(struct sk_buff *skb)
return true;
out_err:
- ICMP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), ICMP_MIB_INERRORS);
+ __ICMP_INC_STATS(dev_net(skb_dst(skb)->dev), ICMP_MIB_INERRORS);
return false;
}
@@ -996,7 +996,7 @@ int icmp_rcv(struct sk_buff *skb)
skb_set_network_header(skb, nh);
}
- ICMP_INC_STATS_BH(net, ICMP_MIB_INMSGS);
+ __ICMP_INC_STATS(net, ICMP_MIB_INMSGS);
if (skb_checksum_simple_validate(skb))
goto csum_error;
@@ -1006,7 +1006,7 @@ int icmp_rcv(struct sk_buff *skb)
icmph = icmp_hdr(skb);
- ICMPMSGIN_INC_STATS_BH(net, icmph->type);
+ ICMPMSGIN_INC_STATS(net, icmph->type);
/*
* 18 is the highest 'known' ICMP type. Anything else is a mystery
*
@@ -1052,9 +1052,9 @@ drop:
kfree_skb(skb);
return 0;
csum_error:
- ICMP_INC_STATS_BH(net, ICMP_MIB_CSUMERRORS);
+ __ICMP_INC_STATS(net, ICMP_MIB_CSUMERRORS);
error:
- ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
+ __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
goto drop;
}
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index bc5196ea1..fa8c39804 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -427,7 +427,7 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk,
route_err:
ip_rt_put(rt);
no_route:
- IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
+ __IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
return NULL;
}
EXPORT_SYMBOL_GPL(inet_csk_route_req);
@@ -466,7 +466,7 @@ route_err:
ip_rt_put(rt);
no_route:
rcu_read_unlock();
- IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
+ __IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
return NULL;
}
EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);
@@ -661,6 +661,9 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num);
newsk->sk_write_space = sk_stream_write_space;
+ /* listeners have SOCK_RCU_FREE, not the children */
+ sock_reset_flag(newsk, SOCK_RCU_FREE);
+
newsk->sk_mark = inet_rsk(req)->ir_mark;
atomic64_set(&newsk->sk_cookie,
atomic64_read(&inet_rsk(req)->ir_cookie));
@@ -703,7 +706,9 @@ void inet_csk_destroy_sock(struct sock *sk)
sk_refcnt_debug_release(sk);
+ local_bh_disable();
percpu_counter_dec(sk->sk_prot->orphan_count);
+ local_bh_enable();
sock_put(sk);
}
EXPORT_SYMBOL(inet_csk_destroy_sock);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 5fdb02f55..25af12436 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -66,7 +66,7 @@ static void inet_diag_unlock_handler(const struct inet_diag_handler *handler)
mutex_unlock(&inet_diag_table_mutex);
}
-static void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk)
+void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk)
{
r->idiag_family = sk->sk_family;
@@ -89,6 +89,7 @@ static void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk)
r->id.idiag_dst[0] = sk->sk_daddr;
}
}
+EXPORT_SYMBOL_GPL(inet_diag_msg_common_fill);
static size_t inet_sk_attr_size(void)
{
@@ -104,13 +105,50 @@ static size_t inet_sk_attr_size(void)
+ 64;
}
+int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
+ struct inet_diag_msg *r, int ext,
+ struct user_namespace *user_ns)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+
+ if (nla_put_u8(skb, INET_DIAG_SHUTDOWN, sk->sk_shutdown))
+ goto errout;
+
+ /* IPv6 dual-stack sockets use inet->tos for IPv4 connections,
+ * hence this needs to be included regardless of socket family.
+ */
+ if (ext & (1 << (INET_DIAG_TOS - 1)))
+ if (nla_put_u8(skb, INET_DIAG_TOS, inet->tos) < 0)
+ goto errout;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (r->idiag_family == AF_INET6) {
+ if (ext & (1 << (INET_DIAG_TCLASS - 1)))
+ if (nla_put_u8(skb, INET_DIAG_TCLASS,
+ inet6_sk(sk)->tclass) < 0)
+ goto errout;
+
+ if (((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) &&
+ nla_put_u8(skb, INET_DIAG_SKV6ONLY, ipv6_only_sock(sk)))
+ goto errout;
+ }
+#endif
+
+ r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
+ r->idiag_inode = sock_i_ino(sk);
+
+ return 0;
+errout:
+ return 1;
+}
+EXPORT_SYMBOL_GPL(inet_diag_msg_attrs_fill);
+
int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
struct sk_buff *skb, const struct inet_diag_req_v2 *req,
struct user_namespace *user_ns,
u32 portid, u32 seq, u16 nlmsg_flags,
const struct nlmsghdr *unlh)
{
- const struct inet_sock *inet = inet_sk(sk);
const struct tcp_congestion_ops *ca_ops;
const struct inet_diag_handler *handler;
int ext = req->idiag_ext;
@@ -135,32 +173,9 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
r->idiag_timer = 0;
r->idiag_retrans = 0;
- if (nla_put_u8(skb, INET_DIAG_SHUTDOWN, sk->sk_shutdown))
+ if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns))
goto errout;
- /* IPv6 dual-stack sockets use inet->tos for IPv4 connections,
- * hence this needs to be included regardless of socket family.
- */
- if (ext & (1 << (INET_DIAG_TOS - 1)))
- if (nla_put_u8(skb, INET_DIAG_TOS, inet->tos) < 0)
- goto errout;
-
-#if IS_ENABLED(CONFIG_IPV6)
- if (r->idiag_family == AF_INET6) {
- if (ext & (1 << (INET_DIAG_TCLASS - 1)))
- if (nla_put_u8(skb, INET_DIAG_TCLASS,
- inet6_sk(sk)->tclass) < 0)
- goto errout;
-
- if (((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) &&
- nla_put_u8(skb, INET_DIAG_SKV6ONLY, ipv6_only_sock(sk)))
- goto errout;
- }
-#endif
-
- r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
- r->idiag_inode = sock_i_ino(sk);
-
if (ext & (1 << (INET_DIAG_MEMINFO - 1))) {
struct inet_diag_meminfo minfo = {
.idiag_rmem = sk_rmem_alloc_get(sk),
@@ -182,31 +197,32 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
goto out;
}
-#define EXPIRES_IN_MS(tmo) DIV_ROUND_UP((tmo - jiffies) * 1000, HZ)
-
if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
r->idiag_timer = 1;
r->idiag_retrans = icsk->icsk_retransmits;
- r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
+ r->idiag_expires =
+ jiffies_to_msecs(icsk->icsk_timeout - jiffies);
} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
r->idiag_timer = 4;
r->idiag_retrans = icsk->icsk_probes_out;
- r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
+ r->idiag_expires =
+ jiffies_to_msecs(icsk->icsk_timeout - jiffies);
} else if (timer_pending(&sk->sk_timer)) {
r->idiag_timer = 2;
r->idiag_retrans = icsk->icsk_probes_out;
- r->idiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires);
+ r->idiag_expires =
+ jiffies_to_msecs(sk->sk_timer.expires - jiffies);
} else {
r->idiag_timer = 0;
r->idiag_expires = 0;
}
-#undef EXPIRES_IN_MS
if ((ext & (1 << (INET_DIAG_INFO - 1))) && handler->idiag_info_size) {
- attr = nla_reserve(skb, INET_DIAG_INFO,
- handler->idiag_info_size);
+ attr = nla_reserve_64bit(skb, INET_DIAG_INFO,
+ handler->idiag_info_size,
+ INET_DIAG_PAD);
if (!attr)
goto errout;
@@ -356,6 +372,7 @@ struct sock *inet_diag_find_one_icsk(struct net *net,
{
struct sock *sk;
+ rcu_read_lock();
if (req->sdiag_family == AF_INET)
sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[0],
req->id.idiag_dport, req->id.idiag_src[0],
@@ -376,9 +393,11 @@ struct sock *inet_diag_find_one_icsk(struct net *net,
req->id.idiag_if);
}
#endif
- else
+ else {
+ rcu_read_unlock();
return ERR_PTR(-EINVAL);
-
+ }
+ rcu_read_unlock();
if (!sk)
return ERR_PTR(-ENOENT);
@@ -772,13 +791,12 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
struct inet_listen_hashbucket *ilb;
- struct hlist_nulls_node *node;
struct sock *sk;
num = 0;
ilb = &hashinfo->listening_hash[i];
spin_lock_bh(&ilb->lock);
- sk_nulls_for_each(sk, node, &ilb->head) {
+ sk_for_each(sk, &ilb->head) {
struct inet_sock *inet = inet_sk(sk);
if (!net_eq(sock_net(sk), net))
@@ -1061,7 +1079,9 @@ int inet_diag_handler_get_info(struct sk_buff *skb, struct sock *sk)
}
attr = handler->idiag_info_size
- ? nla_reserve(skb, INET_DIAG_INFO, handler->idiag_info_size)
+ ? nla_reserve_64bit(skb, INET_DIAG_INFO,
+ handler->idiag_info_size,
+ INET_DIAG_PAD)
: NULL;
if (attr)
info = nla_data(attr);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 0d9e9d7bb..77c20a489 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -198,13 +198,13 @@ static inline int compute_score(struct sock *sk, struct net *net,
}
/*
- * Don't inline this cruft. Here are some nice properties to exploit here. The
- * BSD API does not allow a listening sock to specify the remote port nor the
+ * Here are some nice properties to exploit here. The BSD API
+ * does not allow a listening sock to specify the remote port nor the
* remote address for the connection. So always assume those are both
* wildcarded during the search since they can never be otherwise.
*/
-
+/* called with rcu_read_lock() : No refcount taken on the socket */
struct sock *__inet_lookup_listener(struct net *net,
struct inet_hashinfo *hashinfo,
struct sk_buff *skb, int doff,
@@ -212,38 +212,27 @@ struct sock *__inet_lookup_listener(struct net *net,
const __be32 daddr, const unsigned short hnum,
const int dif)
{
- struct sock *sk, *result;
- struct hlist_nulls_node *node;
unsigned int hash = inet_lhashfn(net, hnum);
struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
- int score, hiscore, matches = 0, reuseport = 0;
- bool select_ok = true;
+ int score, hiscore = 0, matches = 0, reuseport = 0;
+ struct sock *sk, *result = NULL;
u32 phash = 0;
- rcu_read_lock();
-begin:
- result = NULL;
- hiscore = 0;
- sk_nulls_for_each_rcu(sk, node, &ilb->head) {
+ sk_for_each_rcu(sk, &ilb->head) {
score = compute_score(sk, net, hnum, daddr, dif);
if (score > hiscore) {
- result = sk;
- hiscore = score;
reuseport = sk->sk_reuseport;
if (reuseport) {
phash = inet_ehashfn(net, daddr, hnum,
saddr, sport);
- if (select_ok) {
- struct sock *sk2;
- sk2 = reuseport_select_sock(sk, phash,
- skb, doff);
- if (sk2) {
- result = sk2;
- goto found;
- }
- }
+ result = reuseport_select_sock(sk, phash,
+ skb, doff);
+ if (result)
+ return result;
matches = 1;
}
+ result = sk;
+ hiscore = score;
} else if (score == hiscore && reuseport) {
matches++;
if (reciprocal_scale(phash, matches) == 0)
@@ -251,25 +240,6 @@ begin:
phash = next_pseudo_random32(phash);
}
}
- /*
- * if the nulls value we got at the end of this lookup is
- * not the expected one, we must restart lookup.
- * We probably met an item that was moved to another chain.
- */
- if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
- goto begin;
- if (result) {
-found:
- if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
- result = NULL;
- else if (unlikely(compute_score(result, net, hnum, daddr,
- dif) < hiscore)) {
- sock_put(result);
- select_ok = false;
- goto begin;
- }
- }
- rcu_read_unlock();
return result;
}
EXPORT_SYMBOL_GPL(__inet_lookup_listener);
@@ -312,7 +282,6 @@ struct sock *__inet_lookup_established(struct net *net,
unsigned int slot = hash & hashinfo->ehash_mask;
struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
- rcu_read_lock();
begin:
sk_nulls_for_each_rcu(sk, node, &head->chain) {
if (sk->sk_hash != hash)
@@ -339,7 +308,6 @@ begin:
out:
sk = NULL;
found:
- rcu_read_unlock();
return sk;
}
EXPORT_SYMBOL_GPL(__inet_lookup_established);
@@ -392,7 +360,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
__sk_nulls_add_node_rcu(sk, &head->chain);
if (tw) {
sk_nulls_del_node_init_rcu((struct sock *)tw);
- NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
+ __NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED);
}
spin_unlock(lock);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
@@ -472,10 +440,9 @@ static int inet_reuseport_add_sock(struct sock *sk,
{
struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash;
struct sock *sk2;
- struct hlist_nulls_node *node;
kuid_t uid = sock_i_uid(sk);
- sk_nulls_for_each_rcu(sk2, node, &ilb->head) {
+ sk_for_each_rcu(sk2, &ilb->head) {
if (sk2 != sk &&
sk2->sk_family == sk->sk_family &&
ipv6_only_sock(sk2) == ipv6_only_sock(sk) &&
@@ -514,7 +481,12 @@ int __inet_hash(struct sock *sk, struct sock *osk,
if (err)
goto unlock;
}
- __sk_nulls_add_node_rcu(sk, &ilb->head);
+ if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
+ sk->sk_family == AF_INET6)
+ hlist_add_tail_rcu(&sk->sk_node, &ilb->head);
+ else
+ hlist_add_head_rcu(&sk->sk_node, &ilb->head);
+ sock_set_flag(sk, SOCK_RCU_FREE);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
unlock:
spin_unlock(&ilb->lock);
@@ -541,20 +513,25 @@ void inet_unhash(struct sock *sk)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
spinlock_t *lock;
+ bool listener = false;
int done;
if (sk_unhashed(sk))
return;
- if (sk->sk_state == TCP_LISTEN)
+ if (sk->sk_state == TCP_LISTEN) {
lock = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock;
- else
+ listener = true;
+ } else {
lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
-
+ }
spin_lock_bh(lock);
if (rcu_access_pointer(sk->sk_reuseport_cb))
reuseport_detach_sock(sk);
- done = __sk_nulls_del_node_init_rcu(sk);
+ if (listener)
+ done = __sk_del_node_init(sk);
+ else
+ done = __sk_nulls_del_node_init_rcu(sk);
if (done)
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
spin_unlock_bh(lock);
@@ -690,9 +667,8 @@ void inet_hashinfo_init(struct inet_hashinfo *h)
for (i = 0; i < INET_LHTABLE_SIZE; i++) {
spin_lock_init(&h->listening_hash[i].lock);
- INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head,
- i + LISTENING_NULLS_BASE);
- }
+ INIT_HLIST_HEAD(&h->listening_hash[i].head);
+ }
}
EXPORT_SYMBOL_GPL(inet_hashinfo_init);
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index c67f9bd76..206581674 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -94,7 +94,7 @@ static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw,
}
/*
- * Enter the time wait state. This is called with locally disabled BH.
+ * Enter the time wait state.
* Essentially we whip up a timewait bucket, copy the relevant info into it
* from the SK, and mess with hash chains and list linkage.
*/
@@ -112,7 +112,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
*/
bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num,
hashinfo->bhash_size)];
- spin_lock(&bhead->lock);
+ spin_lock_bh(&bhead->lock);
tw->tw_tb = icsk->icsk_bind_hash;
WARN_ON(!icsk->icsk_bind_hash);
inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
@@ -138,7 +138,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
if (__sk_nulls_del_node_init_rcu(sk))
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
- spin_unlock(lock);
+ spin_unlock_bh(lock);
}
EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
@@ -147,9 +147,9 @@ static void tw_timer_handler(unsigned long data)
struct inet_timewait_sock *tw = (struct inet_timewait_sock *)data;
if (tw->tw_kill)
- NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED);
+ __NET_INC_STATS(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED);
else
- NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED);
+ __NET_INC_STATS(twsk_net(tw), LINUX_MIB_TIMEWAITED);
inet_twsk_kill(tw);
}
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index af18f1e48..cbfb1808f 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -65,8 +65,8 @@ static int ip_forward_finish(struct net *net, struct sock *sk, struct sk_buff *s
{
struct ip_options *opt = &(IPCB(skb)->opt);
- IP_INC_STATS_BH(net, IPSTATS_MIB_OUTFORWDATAGRAMS);
- IP_ADD_STATS_BH(net, IPSTATS_MIB_OUTOCTETS, skb->len);
+ __IP_INC_STATS(net, IPSTATS_MIB_OUTFORWDATAGRAMS);
+ __IP_ADD_STATS(net, IPSTATS_MIB_OUTOCTETS, skb->len);
if (unlikely(opt->optlen))
ip_forward_options(skb);
@@ -157,7 +157,7 @@ sr_failed:
too_many_hops:
/* Tell the sender its packet died... */
- IP_INC_STATS_BH(net, IPSTATS_MIB_INHDRERRORS);
+ __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
drop:
kfree_skb(skb);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index efbd47d1a..bbe7f72db 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -204,14 +204,14 @@ static void ip_expire(unsigned long arg)
goto out;
ipq_kill(qp);
- IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
+ __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
if (!inet_frag_evicting(&qp->q)) {
struct sk_buff *head = qp->q.fragments;
const struct iphdr *iph;
int err;
- IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT);
+ __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);
if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !qp->q.fragments)
goto out;
@@ -291,7 +291,7 @@ static int ip_frag_too_far(struct ipq *qp)
struct net *net;
net = container_of(qp->q.net, struct net, ipv4.frags);
- IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
+ __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
}
return rc;
@@ -635,7 +635,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
ip_send_check(iph);
- IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
+ __IP_INC_STATS(net, IPSTATS_MIB_REASMOKS);
qp->q.fragments = NULL;
qp->q.fragments_tail = NULL;
return 0;
@@ -647,7 +647,7 @@ out_nomem:
out_oversize:
net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr);
out_fail:
- IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
+ __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
return err;
}
@@ -658,7 +658,7 @@ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
int vif = l3mdev_master_ifindex_rcu(dev);
struct ipq *qp;
- IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS);
+ __IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS);
skb_orphan(skb);
/* Lookup (or create) queue header */
@@ -675,7 +675,7 @@ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
return ret;
}
- IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS);
+ __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
kfree_skb(skb);
return -ENOMEM;
}
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 4cc84212c..1d000af7f 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -49,12 +49,6 @@
#include <net/gre.h>
#include <net/dst_metadata.h>
-#if IS_ENABLED(CONFIG_IPV6)
-#include <net/ipv6.h>
-#include <net/ip6_fib.h>
-#include <net/ip6_route.h>
-#endif
-
/*
Problems & solutions
--------------------
@@ -122,126 +116,6 @@ static int ipgre_tunnel_init(struct net_device *dev);
static int ipgre_net_id __read_mostly;
static int gre_tap_net_id __read_mostly;
-static int ip_gre_calc_hlen(__be16 o_flags)
-{
- int addend = 4;
-
- if (o_flags & TUNNEL_CSUM)
- addend += 4;
- if (o_flags & TUNNEL_KEY)
- addend += 4;
- if (o_flags & TUNNEL_SEQ)
- addend += 4;
- return addend;
-}
-
-static __be16 gre_flags_to_tnl_flags(__be16 flags)
-{
- __be16 tflags = 0;
-
- if (flags & GRE_CSUM)
- tflags |= TUNNEL_CSUM;
- if (flags & GRE_ROUTING)
- tflags |= TUNNEL_ROUTING;
- if (flags & GRE_KEY)
- tflags |= TUNNEL_KEY;
- if (flags & GRE_SEQ)
- tflags |= TUNNEL_SEQ;
- if (flags & GRE_STRICT)
- tflags |= TUNNEL_STRICT;
- if (flags & GRE_REC)
- tflags |= TUNNEL_REC;
- if (flags & GRE_VERSION)
- tflags |= TUNNEL_VERSION;
-
- return tflags;
-}
-
-static __be16 tnl_flags_to_gre_flags(__be16 tflags)
-{
- __be16 flags = 0;
-
- if (tflags & TUNNEL_CSUM)
- flags |= GRE_CSUM;
- if (tflags & TUNNEL_ROUTING)
- flags |= GRE_ROUTING;
- if (tflags & TUNNEL_KEY)
- flags |= GRE_KEY;
- if (tflags & TUNNEL_SEQ)
- flags |= GRE_SEQ;
- if (tflags & TUNNEL_STRICT)
- flags |= GRE_STRICT;
- if (tflags & TUNNEL_REC)
- flags |= GRE_REC;
- if (tflags & TUNNEL_VERSION)
- flags |= GRE_VERSION;
-
- return flags;
-}
-
-/* Fills in tpi and returns header length to be pulled. */
-static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
- bool *csum_err)
-{
- const struct gre_base_hdr *greh;
- __be32 *options;
- int hdr_len;
-
- if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr))))
- return -EINVAL;
-
- greh = (struct gre_base_hdr *)skb_transport_header(skb);
- if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING)))
- return -EINVAL;
-
- tpi->flags = gre_flags_to_tnl_flags(greh->flags);
- hdr_len = ip_gre_calc_hlen(tpi->flags);
-
- if (!pskb_may_pull(skb, hdr_len))
- return -EINVAL;
-
- greh = (struct gre_base_hdr *)skb_transport_header(skb);
- tpi->proto = greh->protocol;
-
- options = (__be32 *)(greh + 1);
- if (greh->flags & GRE_CSUM) {
- if (skb_checksum_simple_validate(skb)) {
- *csum_err = true;
- return -EINVAL;
- }
-
- skb_checksum_try_convert(skb, IPPROTO_GRE, 0,
- null_compute_pseudo);
- options++;
- }
-
- if (greh->flags & GRE_KEY) {
- tpi->key = *options;
- options++;
- } else {
- tpi->key = 0;
- }
- if (unlikely(greh->flags & GRE_SEQ)) {
- tpi->seq = *options;
- options++;
- } else {
- tpi->seq = 0;
- }
- /* WCCP version 1 and 2 protocol decoding.
- * - Change protocol to IP
- * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
- */
- if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) {
- tpi->proto = htons(ETH_P_IP);
- if ((*(u8 *)options & 0xF0) != 0x40) {
- hdr_len += 4;
- if (!pskb_may_pull(skb, hdr_len))
- return -EINVAL;
- }
- }
- return hdr_len;
-}
-
static void ipgre_err(struct sk_buff *skb, u32 info,
const struct tnl_ptk_info *tpi)
{
@@ -337,12 +211,14 @@ static void gre_err(struct sk_buff *skb, u32 info)
* by themselves???
*/
+ const struct iphdr *iph = (struct iphdr *)skb->data;
const int type = icmp_hdr(skb)->type;
const int code = icmp_hdr(skb)->code;
struct tnl_ptk_info tpi;
bool csum_err = false;
- if (parse_gre_header(skb, &tpi, &csum_err) < 0) {
+ if (gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP),
+ iph->ihl * 4) < 0) {
if (!csum_err) /* ignore csum errors. */
return;
}
@@ -380,24 +256,22 @@ static __be32 tunnel_id_to_key(__be64 x)
#endif
}
-static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
+static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
+ struct ip_tunnel_net *itn, int hdr_len, bool raw_proto)
{
- struct net *net = dev_net(skb->dev);
struct metadata_dst *tun_dst = NULL;
- struct ip_tunnel_net *itn;
const struct iphdr *iph;
struct ip_tunnel *tunnel;
- if (tpi->proto == htons(ETH_P_TEB))
- itn = net_generic(net, gre_tap_net_id);
- else
- itn = net_generic(net, ipgre_net_id);
-
iph = ip_hdr(skb);
tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
iph->saddr, iph->daddr, tpi->key);
if (tunnel) {
+ if (__iptunnel_pull_header(skb, hdr_len, tpi->proto,
+ raw_proto, false) < 0)
+ goto drop;
+
if (tunnel->dev->type != ARPHRD_NONE)
skb_pop_mac_header(skb);
else
@@ -416,7 +290,34 @@ static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
return PACKET_RCVD;
}
- return PACKET_REJECT;
+ return PACKET_NEXT;
+
+drop:
+ kfree_skb(skb);
+ return PACKET_RCVD;
+}
+
+static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
+ int hdr_len)
+{
+ struct net *net = dev_net(skb->dev);
+ struct ip_tunnel_net *itn;
+ int res;
+
+ if (tpi->proto == htons(ETH_P_TEB))
+ itn = net_generic(net, gre_tap_net_id);
+ else
+ itn = net_generic(net, ipgre_net_id);
+
+ res = __ipgre_rcv(skb, tpi, itn, hdr_len, false);
+ if (res == PACKET_NEXT && tpi->proto == htons(ETH_P_TEB)) {
+ /* ipgre tunnels in collect metadata mode should receive
+ * also ETH_P_TEB traffic.
+ */
+ itn = net_generic(net, ipgre_net_id);
+ res = __ipgre_rcv(skb, tpi, itn, hdr_len, true);
+ }
+ return res;
}
static int gre_rcv(struct sk_buff *skb)
@@ -433,13 +334,11 @@ static int gre_rcv(struct sk_buff *skb)
}
#endif
- hdr_len = parse_gre_header(skb, &tpi, &csum_err);
+ hdr_len = gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP), 0);
if (hdr_len < 0)
goto drop;
- if (iptunnel_pull_header(skb, hdr_len, tpi.proto, false) < 0)
- goto drop;
- if (ipgre_rcv(skb, &tpi) == PACKET_RCVD)
+ if (ipgre_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
return 0;
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
@@ -448,49 +347,6 @@ drop:
return 0;
}
-static __sum16 gre_checksum(struct sk_buff *skb)
-{
- __wsum csum;
-
- if (skb->ip_summed == CHECKSUM_PARTIAL)
- csum = lco_csum(skb);
- else
- csum = skb_checksum(skb, 0, skb->len, 0);
- return csum_fold(csum);
-}
-
-static void build_header(struct sk_buff *skb, int hdr_len, __be16 flags,
- __be16 proto, __be32 key, __be32 seq)
-{
- struct gre_base_hdr *greh;
-
- skb_push(skb, hdr_len);
-
- skb_reset_transport_header(skb);
- greh = (struct gre_base_hdr *)skb->data;
- greh->flags = tnl_flags_to_gre_flags(flags);
- greh->protocol = proto;
-
- if (flags & (TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_SEQ)) {
- __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4);
-
- if (flags & TUNNEL_SEQ) {
- *ptr = seq;
- ptr--;
- }
- if (flags & TUNNEL_KEY) {
- *ptr = key;
- ptr--;
- }
- if (flags & TUNNEL_CSUM &&
- !(skb_shinfo(skb)->gso_type &
- (SKB_GSO_GRE | SKB_GSO_GRE_CSUM))) {
- *ptr = 0;
- *(__sum16 *)ptr = gre_checksum(skb);
- }
- }
-}
-
static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
const struct iphdr *tnl_params,
__be16 proto)
@@ -501,15 +357,15 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
tunnel->o_seqno++;
/* Push GRE header. */
- build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags,
- proto, tunnel->parms.o_key, htonl(tunnel->o_seqno));
+ gre_build_header(skb, tunnel->tun_hlen,
+ tunnel->parms.o_flags, proto, tunnel->parms.o_key,
+ htonl(tunnel->o_seqno));
skb_set_inner_protocol(skb, proto);
ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
}
-static struct sk_buff *gre_handle_offloads(struct sk_buff *skb,
- bool csum)
+static int gre_handle_offloads(struct sk_buff *skb, bool csum)
{
return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
}
@@ -562,7 +418,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
fl.saddr);
}
- tunnel_hlen = ip_gre_calc_hlen(key->tun_flags);
+ tunnel_hlen = gre_calc_hlen(key->tun_flags);
min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
+ tunnel_hlen + sizeof(struct iphdr);
@@ -577,15 +433,12 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
}
/* Push Tunnel header. */
- skb = gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM));
- if (IS_ERR(skb)) {
- skb = NULL;
+ if (gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM)))
goto err_free_rt;
- }
flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY);
- build_header(skb, tunnel_hlen, flags, proto,
- tunnel_id_to_key(tun_info->key.tun_id), 0);
+ gre_build_header(skb, tunnel_hlen, flags, proto,
+ tunnel_id_to_key(tun_info->key.tun_id), 0);
df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
@@ -649,16 +502,14 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
tnl_params = &tunnel->parms.iph;
}
- skb = gre_handle_offloads(skb, !!(tunnel->parms.o_flags&TUNNEL_CSUM));
- if (IS_ERR(skb))
- goto out;
+ if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
+ goto free_skb;
__gre_xmit(skb, dev, tnl_params, skb->protocol);
return NETDEV_TX_OK;
free_skb:
kfree_skb(skb);
-out:
dev->stats.tx_dropped++;
return NETDEV_TX_OK;
}
@@ -673,9 +524,8 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
return NETDEV_TX_OK;
}
- skb = gre_handle_offloads(skb, !!(tunnel->parms.o_flags&TUNNEL_CSUM));
- if (IS_ERR(skb))
- goto out;
+ if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
+ goto free_skb;
if (skb_cow_head(skb, dev->needed_headroom))
goto free_skb;
@@ -685,7 +535,6 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
free_skb:
kfree_skb(skb);
-out:
dev->stats.tx_dropped++;
return NETDEV_TX_OK;
}
@@ -711,8 +560,8 @@ static int ipgre_tunnel_ioctl(struct net_device *dev,
if (err)
return err;
- p.i_flags = tnl_flags_to_gre_flags(p.i_flags);
- p.o_flags = tnl_flags_to_gre_flags(p.o_flags);
+ p.i_flags = gre_tnl_flags_to_gre_flags(p.i_flags);
+ p.o_flags = gre_tnl_flags_to_gre_flags(p.o_flags);
if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
return -EFAULT;
@@ -756,7 +605,7 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
iph = (struct iphdr *)skb_push(skb, t->hlen + sizeof(*iph));
greh = (struct gre_base_hdr *)(iph+1);
- greh->flags = tnl_flags_to_gre_flags(t->parms.o_flags);
+ greh->flags = gre_tnl_flags_to_gre_flags(t->parms.o_flags);
greh->protocol = htons(type);
memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
@@ -857,7 +706,7 @@ static void __gre_tunnel_init(struct net_device *dev)
int t_hlen;
tunnel = netdev_priv(dev);
- tunnel->tun_hlen = ip_gre_calc_hlen(tunnel->parms.o_flags);
+ tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
tunnel->parms.iph.protocol = IPPROTO_GRE;
tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
@@ -1180,8 +1029,10 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
struct ip_tunnel_parm *p = &t->parms;
if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
- nla_put_be16(skb, IFLA_GRE_IFLAGS, tnl_flags_to_gre_flags(p->i_flags)) ||
- nla_put_be16(skb, IFLA_GRE_OFLAGS, tnl_flags_to_gre_flags(p->o_flags)) ||
+ nla_put_be16(skb, IFLA_GRE_IFLAGS,
+ gre_tnl_flags_to_gre_flags(p->i_flags)) ||
+ nla_put_be16(skb, IFLA_GRE_OFLAGS,
+ gre_tnl_flags_to_gre_flags(p->o_flags)) ||
nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
nla_put_in_addr(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
@@ -1266,6 +1117,7 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
{
struct nlattr *tb[IFLA_MAX + 1];
struct net_device *dev;
+ LIST_HEAD(list_kill);
struct ip_tunnel *t;
int err;
@@ -1281,8 +1133,10 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
t->collect_md = true;
err = ipgre_newlink(net, dev, tb, NULL);
- if (err < 0)
- goto out;
+ if (err < 0) {
+ free_netdev(dev);
+ return ERR_PTR(err);
+ }
/* openvswitch users expect packet sizes to be unrestricted,
* so set the largest MTU we can.
@@ -1291,9 +1145,14 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
if (err)
goto out;
+ err = rtnl_configure_link(dev, NULL);
+ if (err < 0)
+ goto out;
+
return dev;
out:
- free_netdev(dev);
+ ip_tunnel_dellink(dev, &list_kill);
+ unregister_netdevice_many(&list_kill);
return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(gretap_fb_dev_create);
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index e3d782746..4b351af3e 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -218,17 +218,17 @@ static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_b
protocol = -ret;
goto resubmit;
}
- IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);
+ __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
} else {
if (!raw) {
if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
- IP_INC_STATS_BH(net, IPSTATS_MIB_INUNKNOWNPROTOS);
+ __IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS);
icmp_send(skb, ICMP_DEST_UNREACH,
ICMP_PROT_UNREACH, 0);
}
kfree_skb(skb);
} else {
- IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);
+ __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
consume_skb(skb);
}
}
@@ -273,7 +273,7 @@ static inline bool ip_rcv_options(struct sk_buff *skb)
--ANK (980813)
*/
if (skb_cow(skb, skb_headroom(skb))) {
- IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
+ __IP_INC_STATS(dev_net(dev), IPSTATS_MIB_INDISCARDS);
goto drop;
}
@@ -282,7 +282,7 @@ static inline bool ip_rcv_options(struct sk_buff *skb)
opt->optlen = iph->ihl*4 - sizeof(struct iphdr);
if (ip_options_compile(dev_net(dev), opt, skb)) {
- IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
+ __IP_INC_STATS(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
goto drop;
}
@@ -313,6 +313,13 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
const struct iphdr *iph = ip_hdr(skb);
struct rtable *rt;
+ /* if ingress device is enslaved to an L3 master device pass the
+ * skb to its handler for processing
+ */
+ skb = l3mdev_ip_rcv(skb);
+ if (!skb)
+ return NET_RX_SUCCESS;
+
if (net->ipv4.sysctl_ip_early_demux &&
!skb_dst(skb) &&
!skb->sk &&
@@ -337,7 +344,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
iph->tos, skb->dev);
if (unlikely(err)) {
if (err == -EXDEV)
- NET_INC_STATS_BH(net, LINUX_MIB_IPRPFILTER);
+ __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER);
goto drop;
}
}
@@ -358,9 +365,9 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
rt = skb_rtable(skb);
if (rt->rt_type == RTN_MULTICAST) {
- IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INMCAST, skb->len);
+ __IP_UPD_PO_STATS(net, IPSTATS_MIB_INMCAST, skb->len);
} else if (rt->rt_type == RTN_BROADCAST) {
- IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INBCAST, skb->len);
+ __IP_UPD_PO_STATS(net, IPSTATS_MIB_INBCAST, skb->len);
} else if (skb->pkt_type == PACKET_BROADCAST ||
skb->pkt_type == PACKET_MULTICAST) {
struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
@@ -409,11 +416,11 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
net = dev_net(dev);
- IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_IN, skb->len);
+ __IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len);
skb = skb_share_check(skb, GFP_ATOMIC);
if (!skb) {
- IP_INC_STATS_BH(net, IPSTATS_MIB_INDISCARDS);
+ __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS);
goto out;
}
@@ -439,9 +446,9 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1);
BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0);
BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE);
- IP_ADD_STATS_BH(net,
- IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK),
- max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs));
+ __IP_ADD_STATS(net,
+ IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK),
+ max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs));
if (!pskb_may_pull(skb, iph->ihl*4))
goto inhdr_error;
@@ -453,7 +460,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
len = ntohs(iph->tot_len);
if (skb->len < len) {
- IP_INC_STATS_BH(net, IPSTATS_MIB_INTRUNCATEDPKTS);
+ __IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS);
goto drop;
} else if (len < (iph->ihl*4))
goto inhdr_error;
@@ -463,7 +470,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
* Note this now means skb->len holds ntohs(iph->tot_len).
*/
if (pskb_trim_rcsum(skb, len)) {
- IP_INC_STATS_BH(net, IPSTATS_MIB_INDISCARDS);
+ __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS);
goto drop;
}
@@ -471,6 +478,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
/* Remove any debris in the socket control block */
memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+ IPCB(skb)->iif = skb->skb_iif;
/* Must drop socket now because of tproxy. */
skb_orphan(skb);
@@ -480,9 +488,9 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
ip_rcv_finish);
csum_error:
- IP_INC_STATS_BH(net, IPSTATS_MIB_CSUMERRORS);
+ __IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS);
inhdr_error:
- IP_INC_STATS_BH(net, IPSTATS_MIB_INHDRERRORS);
+ __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
drop:
kfree_skb(skb);
out:
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 124bf0a66..4bd492163 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -271,7 +271,7 @@ static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *sk
return dst_output(net, sk, skb);
}
#endif
- mtu = ip_skb_dst_mtu(skb);
+ mtu = ip_skb_dst_mtu(sk, skb);
if (skb_is_gso(skb))
return ip_finish_output_gso(net, sk, skb, mtu);
@@ -541,7 +541,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
iph = ip_hdr(skb);
- mtu = ip_skb_dst_mtu(skb);
+ mtu = ip_skb_dst_mtu(sk, skb);
if (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size < mtu)
mtu = IPCB(skb)->frag_max_size;
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 035ad645a..71a52f4d4 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -106,7 +106,8 @@ static void ip_cmsg_recv_checksum(struct msghdr *msg, struct sk_buff *skb,
return;
if (offset != 0)
- csum = csum_sub(csum, csum_partial(skb->data, offset, 0));
+ csum = csum_sub(csum, csum_partial(skb_transport_header(skb),
+ offset, 0));
put_cmsg(msg, SOL_IP, IP_CHECKSUM, sizeof(__wsum), &csum);
}
@@ -219,11 +220,12 @@ void ip_cmsg_recv_offset(struct msghdr *msg, struct sk_buff *skb,
}
EXPORT_SYMBOL(ip_cmsg_recv_offset);
-int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc,
+int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
bool allow_ipv6)
{
int err, val;
struct cmsghdr *cmsg;
+ struct net *net = sock_net(sk);
for_each_cmsghdr(cmsg, msg) {
if (!CMSG_OK(msg, cmsg))
@@ -244,6 +246,13 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc,
continue;
}
#endif
+ if (cmsg->cmsg_level == SOL_SOCKET) {
+ err = __sock_cmsg_send(sk, msg, cmsg, &ipc->sockc);
+ if (err)
+ return err;
+ continue;
+ }
+
if (cmsg->cmsg_level != SOL_IP)
continue;
switch (cmsg->cmsg_type) {
@@ -502,9 +511,10 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
copied = len;
}
err = skb_copy_datagram_msg(skb, 0, msg, copied);
- if (err)
- goto out_free_skb;
-
+ if (unlikely(err)) {
+ kfree_skb(skb);
+ return err;
+ }
sock_recv_timestamp(msg, sk, skb);
serr = SKB_EXT_ERR(skb);
@@ -536,8 +546,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
msg->msg_flags |= MSG_ERRQUEUE;
err = copied;
-out_free_skb:
- kfree_skb(skb);
+ consume_skb(skb);
out:
return err;
}
@@ -635,7 +644,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
if (err)
break;
old = rcu_dereference_protected(inet->inet_opt,
- sock_owned_by_user(sk));
+ lockdep_sock_is_held(sk));
if (inet->is_icsk) {
struct inet_connection_sock *icsk = inet_csk(sk);
#if IS_ENABLED(CONFIG_IPV6)
@@ -1185,7 +1194,12 @@ void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb)
ipv6_sk_rxinfo(sk);
if (prepare && skb_rtable(skb)) {
- pktinfo->ipi_ifindex = inet_iif(skb);
+ /* skb->cb is overloaded: prior to this point it is IP{6}CB
+ * which has interface index (iif) as the first member of the
+ * underlying inet{6}_skb_parm struct. This code then overlays
+ * PKTINFO_SKB_CB and in_pktinfo also has iif as the first
+ * element so the iif is picked up from the prior IPCB
+ */
pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb);
} else {
pktinfo->ipi_ifindex = 0;
@@ -1295,7 +1309,7 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
struct ip_options_rcu *inet_opt;
inet_opt = rcu_dereference_protected(inet->inet_opt,
- sock_owned_by_user(sk));
+ lockdep_sock_is_held(sk));
opt->optlen = 0;
if (inet_opt)
memcpy(optbuf, &inet_opt->opt,
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index a69ed94bd..d8f5e0a26 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -443,29 +443,6 @@ drop:
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
-static int ip_encap_hlen(struct ip_tunnel_encap *e)
-{
- const struct ip_tunnel_encap_ops *ops;
- int hlen = -EINVAL;
-
- if (e->type == TUNNEL_ENCAP_NONE)
- return 0;
-
- if (e->type >= MAX_IPTUN_ENCAP_OPS)
- return -EINVAL;
-
- rcu_read_lock();
- ops = rcu_dereference(iptun_encaps[e->type]);
- if (likely(ops && ops->encap_hlen))
- hlen = ops->encap_hlen(e);
- rcu_read_unlock();
-
- return hlen;
-}
-
-const struct ip_tunnel_encap_ops __rcu *
- iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
-
int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
unsigned int num)
{
@@ -519,28 +496,6 @@ int ip_tunnel_encap_setup(struct ip_tunnel *t,
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
-int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
- u8 *protocol, struct flowi4 *fl4)
-{
- const struct ip_tunnel_encap_ops *ops;
- int ret = -EINVAL;
-
- if (t->encap.type == TUNNEL_ENCAP_NONE)
- return 0;
-
- if (t->encap.type >= MAX_IPTUN_ENCAP_OPS)
- return -EINVAL;
-
- rcu_read_lock();
- ops = rcu_dereference(iptun_encaps[t->encap.type]);
- if (likely(ops && ops->build_header))
- ret = ops->build_header(skb, &t->encap, protocol, fl4);
- rcu_read_unlock();
-
- return ret;
-}
-EXPORT_SYMBOL(ip_tunnel_encap);
-
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
struct rtable *rt, __be16 df,
const struct iphdr *inner_iph)
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 6165f30c4..afd6b5968 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -37,6 +37,7 @@
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
+#include <net/ip6_tunnel.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
@@ -47,6 +48,14 @@
#include <net/rtnetlink.h>
#include <net/dst_metadata.h>
+const struct ip_tunnel_encap_ops __rcu *
+ iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
+EXPORT_SYMBOL(iptun_encaps);
+
+const struct ip6_tnl_encap_ops __rcu *
+ ip6tun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
+EXPORT_SYMBOL(ip6tun_encaps);
+
void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
__be32 src, __be32 dst, __u8 proto,
__u8 tos, __u8 ttl, __be16 df, bool xnet)
@@ -86,15 +95,15 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
}
EXPORT_SYMBOL_GPL(iptunnel_xmit);
-int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto,
- bool xnet)
+int __iptunnel_pull_header(struct sk_buff *skb, int hdr_len,
+ __be16 inner_proto, bool raw_proto, bool xnet)
{
if (unlikely(!pskb_may_pull(skb, hdr_len)))
return -ENOMEM;
skb_pull_rcsum(skb, hdr_len);
- if (inner_proto == htons(ETH_P_TEB)) {
+ if (!raw_proto && inner_proto == htons(ETH_P_TEB)) {
struct ethhdr *eh;
if (unlikely(!pskb_may_pull(skb, ETH_HLEN)))
@@ -117,7 +126,7 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto,
return iptunnel_pull_offloads(skb);
}
-EXPORT_SYMBOL_GPL(iptunnel_pull_header);
+EXPORT_SYMBOL_GPL(__iptunnel_pull_header);
struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md,
gfp_t flags)
@@ -146,8 +155,8 @@ struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md,
}
EXPORT_SYMBOL_GPL(iptunnel_metadata_reply);
-struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb,
- int gso_type_mask)
+int iptunnel_handle_offloads(struct sk_buff *skb,
+ int gso_type_mask)
{
int err;
@@ -157,11 +166,11 @@ struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb,
}
if (skb_is_gso(skb)) {
- err = skb_unclone(skb, GFP_ATOMIC);
+ err = skb_header_unclone(skb, GFP_ATOMIC);
if (unlikely(err))
- goto error;
+ return err;
skb_shinfo(skb)->gso_type |= gso_type_mask;
- return skb;
+ return 0;
}
if (skb->ip_summed != CHECKSUM_PARTIAL) {
@@ -174,10 +183,7 @@ struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb,
skb->encapsulation = 0;
}
- return skb;
-error:
- kfree_skb(skb);
- return ERR_PTR(err);
+ return 0;
}
EXPORT_SYMBOL_GPL(iptunnel_handle_offloads);
@@ -247,10 +253,10 @@ static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr,
tun_info->key.tun_id = nla_get_be64(tb[LWTUNNEL_IP_ID]);
if (tb[LWTUNNEL_IP_DST])
- tun_info->key.u.ipv4.dst = nla_get_be32(tb[LWTUNNEL_IP_DST]);
+ tun_info->key.u.ipv4.dst = nla_get_in_addr(tb[LWTUNNEL_IP_DST]);
if (tb[LWTUNNEL_IP_SRC])
- tun_info->key.u.ipv4.src = nla_get_be32(tb[LWTUNNEL_IP_SRC]);
+ tun_info->key.u.ipv4.src = nla_get_in_addr(tb[LWTUNNEL_IP_SRC]);
if (tb[LWTUNNEL_IP_TTL])
tun_info->key.ttl = nla_get_u8(tb[LWTUNNEL_IP_TTL]);
@@ -274,9 +280,10 @@ static int ip_tun_fill_encap_info(struct sk_buff *skb,
{
struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate);
- if (nla_put_be64(skb, LWTUNNEL_IP_ID, tun_info->key.tun_id) ||
- nla_put_be32(skb, LWTUNNEL_IP_DST, tun_info->key.u.ipv4.dst) ||
- nla_put_be32(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) ||
+ if (nla_put_be64(skb, LWTUNNEL_IP_ID, tun_info->key.tun_id,
+ LWTUNNEL_IP_PAD) ||
+ nla_put_in_addr(skb, LWTUNNEL_IP_DST, tun_info->key.u.ipv4.dst) ||
+ nla_put_in_addr(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) ||
nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.tos) ||
nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ttl) ||
nla_put_be16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags))
@@ -287,7 +294,7 @@ static int ip_tun_fill_encap_info(struct sk_buff *skb,
static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate)
{
- return nla_total_size(8) /* LWTUNNEL_IP_ID */
+ return nla_total_size_64bit(8) /* LWTUNNEL_IP_ID */
+ nla_total_size(4) /* LWTUNNEL_IP_DST */
+ nla_total_size(4) /* LWTUNNEL_IP_SRC */
+ nla_total_size(1) /* LWTUNNEL_IP_TOS */
@@ -369,7 +376,8 @@ static int ip6_tun_fill_encap_info(struct sk_buff *skb,
{
struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate);
- if (nla_put_be64(skb, LWTUNNEL_IP6_ID, tun_info->key.tun_id) ||
+ if (nla_put_be64(skb, LWTUNNEL_IP6_ID, tun_info->key.tun_id,
+ LWTUNNEL_IP6_PAD) ||
nla_put_in6_addr(skb, LWTUNNEL_IP6_DST, &tun_info->key.u.ipv6.dst) ||
nla_put_in6_addr(skb, LWTUNNEL_IP6_SRC, &tun_info->key.u.ipv6.src) ||
nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.tos) ||
@@ -382,7 +390,7 @@ static int ip6_tun_fill_encap_info(struct sk_buff *skb,
static int ip6_tun_encap_nlsize(struct lwtunnel_state *lwtstate)
{
- return nla_total_size(8) /* LWTUNNEL_IP6_ID */
+ return nla_total_size_64bit(8) /* LWTUNNEL_IP6_ID */
+ nla_total_size(16) /* LWTUNNEL_IP6_DST */
+ nla_total_size(16) /* LWTUNNEL_IP6_SRC */
+ nla_total_size(1) /* LWTUNNEL_IP6_HOPLIMIT */
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 2ed9dd2b5..1d71c40ea 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -127,7 +127,9 @@ __be32 ic_myaddr = NONE; /* My IP address */
static __be32 ic_netmask = NONE; /* Netmask for local subnet */
__be32 ic_gateway = NONE; /* Gateway IP address */
-__be32 ic_addrservaddr = NONE; /* IP Address of the IP addresses'server */
+#ifdef IPCONFIG_DYNAMIC
+static __be32 ic_addrservaddr = NONE; /* IP Address of the IP addresses'server */
+#endif
__be32 ic_servaddr = NONE; /* Boot server IP address */
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index ec51d0216..978370132 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -219,9 +219,8 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
if (unlikely(skb->protocol != htons(ETH_P_IP)))
goto tx_error;
- skb = iptunnel_handle_offloads(skb, SKB_GSO_IPIP);
- if (IS_ERR(skb))
- goto out;
+ if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4))
+ goto tx_error;
skb_set_inner_ipproto(skb, IPPROTO_IPIP);
@@ -230,7 +229,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
tx_error:
kfree_skb(skb);
-out:
+
dev->stats.tx_errors++;
return NETDEV_TX_OK;
}
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index a42dd8021..5ad48ec77 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -2106,7 +2106,7 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
mfcs.mfcs_packets = c->mfc_un.res.pkt;
mfcs.mfcs_bytes = c->mfc_un.res.bytes;
mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if;
- if (nla_put(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs) < 0)
+ if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) < 0)
return -EMSGSIZE;
rtm->rtm_type = RTN_MULTICAST;
@@ -2239,7 +2239,7 @@ static size_t mroute_msgsize(bool unresolved, int maxvif)
+ nla_total_size(0) /* RTA_MULTIPATH */
+ maxvif * NLA_ALIGN(sizeof(struct rtnexthop))
/* RTA_MFC_STATS */
- + nla_total_size(sizeof(struct rta_mfc_stats))
+ + nla_total_size_64bit(sizeof(struct rta_mfc_stats))
;
return len;
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 85d60c69b..2033f929a 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -34,27 +34,6 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("David S. Miller <davem@redhat.com>");
MODULE_DESCRIPTION("arptables core");
-/*#define DEBUG_ARP_TABLES*/
-/*#define DEBUG_ARP_TABLES_USER*/
-
-#ifdef DEBUG_ARP_TABLES
-#define dprintf(format, args...) pr_debug(format, ## args)
-#else
-#define dprintf(format, args...)
-#endif
-
-#ifdef DEBUG_ARP_TABLES_USER
-#define duprintf(format, args...) pr_debug(format, ## args)
-#else
-#define duprintf(format, args...)
-#endif
-
-#ifdef CONFIG_NETFILTER_DEBUG
-#define ARP_NF_ASSERT(x) WARN_ON(!(x))
-#else
-#define ARP_NF_ASSERT(x)
-#endif
-
void *arpt_alloc_initial_table(const struct xt_table *info)
{
return xt_alloc_initial_table(arpt, ARPT);
@@ -113,36 +92,20 @@ static inline int arp_packet_match(const struct arphdr *arphdr,
#define FWINV(bool, invflg) ((bool) ^ !!(arpinfo->invflags & (invflg)))
if (FWINV((arphdr->ar_op & arpinfo->arpop_mask) != arpinfo->arpop,
- ARPT_INV_ARPOP)) {
- dprintf("ARP operation field mismatch.\n");
- dprintf("ar_op: %04x info->arpop: %04x info->arpop_mask: %04x\n",
- arphdr->ar_op, arpinfo->arpop, arpinfo->arpop_mask);
+ ARPT_INV_ARPOP))
return 0;
- }
if (FWINV((arphdr->ar_hrd & arpinfo->arhrd_mask) != arpinfo->arhrd,
- ARPT_INV_ARPHRD)) {
- dprintf("ARP hardware address format mismatch.\n");
- dprintf("ar_hrd: %04x info->arhrd: %04x info->arhrd_mask: %04x\n",
- arphdr->ar_hrd, arpinfo->arhrd, arpinfo->arhrd_mask);
+ ARPT_INV_ARPHRD))
return 0;
- }
if (FWINV((arphdr->ar_pro & arpinfo->arpro_mask) != arpinfo->arpro,
- ARPT_INV_ARPPRO)) {
- dprintf("ARP protocol address format mismatch.\n");
- dprintf("ar_pro: %04x info->arpro: %04x info->arpro_mask: %04x\n",
- arphdr->ar_pro, arpinfo->arpro, arpinfo->arpro_mask);
+ ARPT_INV_ARPPRO))
return 0;
- }
if (FWINV((arphdr->ar_hln & arpinfo->arhln_mask) != arpinfo->arhln,
- ARPT_INV_ARPHLN)) {
- dprintf("ARP hardware address length mismatch.\n");
- dprintf("ar_hln: %02x info->arhln: %02x info->arhln_mask: %02x\n",
- arphdr->ar_hln, arpinfo->arhln, arpinfo->arhln_mask);
+ ARPT_INV_ARPHLN))
return 0;
- }
src_devaddr = arpptr;
arpptr += dev->addr_len;
@@ -155,49 +118,25 @@ static inline int arp_packet_match(const struct arphdr *arphdr,
if (FWINV(arp_devaddr_compare(&arpinfo->src_devaddr, src_devaddr, dev->addr_len),
ARPT_INV_SRCDEVADDR) ||
FWINV(arp_devaddr_compare(&arpinfo->tgt_devaddr, tgt_devaddr, dev->addr_len),
- ARPT_INV_TGTDEVADDR)) {
- dprintf("Source or target device address mismatch.\n");
-
+ ARPT_INV_TGTDEVADDR))
return 0;
- }
if (FWINV((src_ipaddr & arpinfo->smsk.s_addr) != arpinfo->src.s_addr,
ARPT_INV_SRCIP) ||
FWINV(((tgt_ipaddr & arpinfo->tmsk.s_addr) != arpinfo->tgt.s_addr),
- ARPT_INV_TGTIP)) {
- dprintf("Source or target IP address mismatch.\n");
-
- dprintf("SRC: %pI4. Mask: %pI4. Target: %pI4.%s\n",
- &src_ipaddr,
- &arpinfo->smsk.s_addr,
- &arpinfo->src.s_addr,
- arpinfo->invflags & ARPT_INV_SRCIP ? " (INV)" : "");
- dprintf("TGT: %pI4 Mask: %pI4 Target: %pI4.%s\n",
- &tgt_ipaddr,
- &arpinfo->tmsk.s_addr,
- &arpinfo->tgt.s_addr,
- arpinfo->invflags & ARPT_INV_TGTIP ? " (INV)" : "");
+ ARPT_INV_TGTIP))
return 0;
- }
/* Look for ifname matches. */
ret = ifname_compare(indev, arpinfo->iniface, arpinfo->iniface_mask);
- if (FWINV(ret != 0, ARPT_INV_VIA_IN)) {
- dprintf("VIA in mismatch (%s vs %s).%s\n",
- indev, arpinfo->iniface,
- arpinfo->invflags & ARPT_INV_VIA_IN ? " (INV)" : "");
+ if (FWINV(ret != 0, ARPT_INV_VIA_IN))
return 0;
- }
ret = ifname_compare(outdev, arpinfo->outiface, arpinfo->outiface_mask);
- if (FWINV(ret != 0, ARPT_INV_VIA_OUT)) {
- dprintf("VIA out mismatch (%s vs %s).%s\n",
- outdev, arpinfo->outiface,
- arpinfo->invflags & ARPT_INV_VIA_OUT ? " (INV)" : "");
+ if (FWINV(ret != 0, ARPT_INV_VIA_OUT))
return 0;
- }
return 1;
#undef FWINV
@@ -205,16 +144,10 @@ static inline int arp_packet_match(const struct arphdr *arphdr,
static inline int arp_checkentry(const struct arpt_arp *arp)
{
- if (arp->flags & ~ARPT_F_MASK) {
- duprintf("Unknown flag bits set: %08X\n",
- arp->flags & ~ARPT_F_MASK);
+ if (arp->flags & ~ARPT_F_MASK)
return 0;
- }
- if (arp->invflags & ~ARPT_INV_MASK) {
- duprintf("Unknown invflag bits set: %08X\n",
- arp->invflags & ~ARPT_INV_MASK);
+ if (arp->invflags & ~ARPT_INV_MASK)
return 0;
- }
return 1;
}
@@ -406,11 +339,9 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
= (void *)arpt_get_target_c(e);
int visited = e->comefrom & (1 << hook);
- if (e->comefrom & (1 << NF_ARP_NUMHOOKS)) {
- pr_notice("arptables: loop hook %u pos %u %08X.\n",
- hook, pos, e->comefrom);
+ if (e->comefrom & (1 << NF_ARP_NUMHOOKS))
return 0;
- }
+
e->comefrom
|= ((1 << hook) | (1 << NF_ARP_NUMHOOKS));
@@ -423,12 +354,8 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
if ((strcmp(t->target.u.user.name,
XT_STANDARD_TARGET) == 0) &&
- t->verdict < -NF_MAX_VERDICT - 1) {
- duprintf("mark_source_chains: bad "
- "negative verdict (%i)\n",
- t->verdict);
+ t->verdict < -NF_MAX_VERDICT - 1)
return 0;
- }
/* Return: backtrack through the last
* big jump.
@@ -461,17 +388,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
if (strcmp(t->target.u.user.name,
XT_STANDARD_TARGET) == 0 &&
newpos >= 0) {
- if (newpos > newinfo->size -
- sizeof(struct arpt_entry)) {
- duprintf("mark_source_chains: "
- "bad verdict (%i)\n",
- newpos);
- return 0;
- }
-
/* This a jump; chase it. */
- duprintf("Jump rule %u -> %u\n",
- pos, newpos);
e = (struct arpt_entry *)
(entry0 + newpos);
if (!find_jump_target(newinfo, e))
@@ -488,8 +405,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
pos = newpos;
}
}
-next:
- duprintf("Finished chain %u\n", hook);
+next: ;
}
return 1;
}
@@ -497,7 +413,6 @@ next:
static inline int check_target(struct arpt_entry *e, const char *name)
{
struct xt_entry_target *t = arpt_get_target(e);
- int ret;
struct xt_tgchk_param par = {
.table = name,
.entryinfo = e,
@@ -507,13 +422,7 @@ static inline int check_target(struct arpt_entry *e, const char *name)
.family = NFPROTO_ARP,
};
- ret = xt_check_target(&par, t->u.target_size - sizeof(*t), 0, false);
- if (ret < 0) {
- duprintf("arp_tables: check failed for `%s'.\n",
- t->u.kernel.target->name);
- return ret;
- }
- return 0;
+ return xt_check_target(&par, t->u.target_size - sizeof(*t), 0, false);
}
static inline int
@@ -521,17 +430,18 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
{
struct xt_entry_target *t;
struct xt_target *target;
+ unsigned long pcnt;
int ret;
- e->counters.pcnt = xt_percpu_counter_alloc();
- if (IS_ERR_VALUE(e->counters.pcnt))
+ pcnt = xt_percpu_counter_alloc();
+ if (IS_ERR_VALUE(pcnt))
return -ENOMEM;
+ e->counters.pcnt = pcnt;
t = arpt_get_target(e);
target = xt_request_find_target(NFPROTO_ARP, t->u.user.name,
t->u.user.revision);
if (IS_ERR(target)) {
- duprintf("find_check_entry: `%s' not found\n", t->u.user.name);
ret = PTR_ERR(target);
goto out;
}
@@ -577,17 +487,12 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e,
if ((unsigned long)e % __alignof__(struct arpt_entry) != 0 ||
(unsigned char *)e + sizeof(struct arpt_entry) >= limit ||
- (unsigned char *)e + e->next_offset > limit) {
- duprintf("Bad offset %p\n", e);
+ (unsigned char *)e + e->next_offset > limit)
return -EINVAL;
- }
if (e->next_offset
- < sizeof(struct arpt_entry) + sizeof(struct xt_entry_target)) {
- duprintf("checking: element %p size %u\n",
- e, e->next_offset);
+ < sizeof(struct arpt_entry) + sizeof(struct xt_entry_target))
return -EINVAL;
- }
if (!arp_checkentry(&e->arp))
return -EINVAL;
@@ -604,12 +509,9 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e,
if ((unsigned char *)e - base == hook_entries[h])
newinfo->hook_entry[h] = hook_entries[h];
if ((unsigned char *)e - base == underflows[h]) {
- if (!check_underflow(e)) {
- pr_debug("Underflows must be unconditional and "
- "use the STANDARD target with "
- "ACCEPT/DROP\n");
+ if (!check_underflow(e))
return -EINVAL;
- }
+
newinfo->underflow[h] = underflows[h];
}
}
@@ -654,7 +556,6 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0,
newinfo->underflow[i] = 0xFFFFFFFF;
}
- duprintf("translate_table: size %u\n", newinfo->size);
i = 0;
/* Walk through entries, checking offsets. */
@@ -671,31 +572,21 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0,
XT_ERROR_TARGET) == 0)
++newinfo->stacksize;
}
- duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret);
if (ret != 0)
return ret;
- if (i != repl->num_entries) {
- duprintf("translate_table: %u not %u entries\n",
- i, repl->num_entries);
+ if (i != repl->num_entries)
return -EINVAL;
- }
/* Check hooks all assigned */
for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
/* Only hooks which are valid */
if (!(repl->valid_hooks & (1 << i)))
continue;
- if (newinfo->hook_entry[i] == 0xFFFFFFFF) {
- duprintf("Invalid hook entry %u %u\n",
- i, repl->hook_entry[i]);
+ if (newinfo->hook_entry[i] == 0xFFFFFFFF)
return -EINVAL;
- }
- if (newinfo->underflow[i] == 0xFFFFFFFF) {
- duprintf("Invalid underflow %u %u\n",
- i, repl->underflow[i]);
+ if (newinfo->underflow[i] == 0xFFFFFFFF)
return -EINVAL;
- }
}
if (!mark_source_chains(newinfo, repl->valid_hooks, entry0))
@@ -903,11 +794,8 @@ static int get_info(struct net *net, void __user *user,
struct xt_table *t;
int ret;
- if (*len != sizeof(struct arpt_getinfo)) {
- duprintf("length %u != %Zu\n", *len,
- sizeof(struct arpt_getinfo));
+ if (*len != sizeof(struct arpt_getinfo))
return -EINVAL;
- }
if (copy_from_user(name, user, sizeof(name)) != 0)
return -EFAULT;
@@ -963,33 +851,25 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr,
struct arpt_get_entries get;
struct xt_table *t;
- if (*len < sizeof(get)) {
- duprintf("get_entries: %u < %Zu\n", *len, sizeof(get));
+ if (*len < sizeof(get))
return -EINVAL;
- }
if (copy_from_user(&get, uptr, sizeof(get)) != 0)
return -EFAULT;
- if (*len != sizeof(struct arpt_get_entries) + get.size) {
- duprintf("get_entries: %u != %Zu\n", *len,
- sizeof(struct arpt_get_entries) + get.size);
+ if (*len != sizeof(struct arpt_get_entries) + get.size)
return -EINVAL;
- }
+
get.name[sizeof(get.name) - 1] = '\0';
t = xt_find_table_lock(net, NFPROTO_ARP, get.name);
if (!IS_ERR_OR_NULL(t)) {
const struct xt_table_info *private = t->private;
- duprintf("t->private->number = %u\n",
- private->number);
if (get.size == private->size)
ret = copy_entries_to_user(private->size,
t, uptr->entrytable);
- else {
- duprintf("get_entries: I've got %u not %u!\n",
- private->size, get.size);
+ else
ret = -EAGAIN;
- }
+
module_put(t->me);
xt_table_unlock(t);
} else
@@ -1027,8 +907,6 @@ static int __do_replace(struct net *net, const char *name,
/* You lied! */
if (valid_hooks != t->valid_hooks) {
- duprintf("Valid hook crap: %08X vs %08X\n",
- valid_hooks, t->valid_hooks);
ret = -EINVAL;
goto put_module;
}
@@ -1038,8 +916,6 @@ static int __do_replace(struct net *net, const char *name,
goto put_module;
/* Update module usage count based on number of rules */
- duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n",
- oldinfo->number, oldinfo->initial_entries, newinfo->number);
if ((oldinfo->number > oldinfo->initial_entries) ||
(newinfo->number <= oldinfo->initial_entries))
module_put(t->me);
@@ -1109,8 +985,6 @@ static int do_replace(struct net *net, const void __user *user,
if (ret != 0)
goto free_newinfo;
- duprintf("arp_tables: Translated table\n");
-
ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo,
tmp.num_counters, tmp.counters);
if (ret)
@@ -1208,20 +1082,14 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
unsigned int entry_offset;
int ret, off;
- duprintf("check_compat_entry_size_and_hooks %p\n", e);
if ((unsigned long)e % __alignof__(struct compat_arpt_entry) != 0 ||
(unsigned char *)e + sizeof(struct compat_arpt_entry) >= limit ||
- (unsigned char *)e + e->next_offset > limit) {
- duprintf("Bad offset %p, limit = %p\n", e, limit);
+ (unsigned char *)e + e->next_offset > limit)
return -EINVAL;
- }
if (e->next_offset < sizeof(struct compat_arpt_entry) +
- sizeof(struct compat_xt_entry_target)) {
- duprintf("checking: element %p size %u\n",
- e, e->next_offset);
+ sizeof(struct compat_xt_entry_target))
return -EINVAL;
- }
if (!arp_checkentry(&e->arp))
return -EINVAL;
@@ -1238,8 +1106,6 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
target = xt_request_find_target(NFPROTO_ARP, t->u.user.name,
t->u.user.revision);
if (IS_ERR(target)) {
- duprintf("check_compat_entry_size_and_hooks: `%s' not found\n",
- t->u.user.name);
ret = PTR_ERR(target);
goto out;
}
@@ -1309,7 +1175,6 @@ static int translate_compat_table(struct xt_table_info **pinfo,
size = compatr->size;
info->number = compatr->num_entries;
- duprintf("translate_compat_table: size %u\n", info->size);
j = 0;
xt_compat_lock(NFPROTO_ARP);
xt_compat_init_offsets(NFPROTO_ARP, compatr->num_entries);
@@ -1324,11 +1189,8 @@ static int translate_compat_table(struct xt_table_info **pinfo,
}
ret = -EINVAL;
- if (j != compatr->num_entries) {
- duprintf("translate_compat_table: %u not %u entries\n",
- j, compatr->num_entries);
+ if (j != compatr->num_entries)
goto out_unlock;
- }
ret = -ENOMEM;
newinfo = xt_alloc_table_info(size);
@@ -1398,8 +1260,6 @@ static int compat_do_replace(struct net *net, void __user *user,
return -EFAULT;
/* overflow check */
- if (tmp.size >= INT_MAX / num_possible_cpus())
- return -ENOMEM;
if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
return -ENOMEM;
if (tmp.num_counters == 0)
@@ -1421,8 +1281,6 @@ static int compat_do_replace(struct net *net, void __user *user,
if (ret != 0)
goto free_newinfo;
- duprintf("compat_do_replace: Translated table\n");
-
ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo,
tmp.num_counters, compat_ptr(tmp.counters));
if (ret)
@@ -1455,7 +1313,6 @@ static int compat_do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user,
break;
default:
- duprintf("do_arpt_set_ctl: unknown request %i\n", cmd);
ret = -EINVAL;
}
@@ -1538,17 +1395,13 @@ static int compat_get_entries(struct net *net,
struct compat_arpt_get_entries get;
struct xt_table *t;
- if (*len < sizeof(get)) {
- duprintf("compat_get_entries: %u < %zu\n", *len, sizeof(get));
+ if (*len < sizeof(get))
return -EINVAL;
- }
if (copy_from_user(&get, uptr, sizeof(get)) != 0)
return -EFAULT;
- if (*len != sizeof(struct compat_arpt_get_entries) + get.size) {
- duprintf("compat_get_entries: %u != %zu\n",
- *len, sizeof(get) + get.size);
+ if (*len != sizeof(struct compat_arpt_get_entries) + get.size)
return -EINVAL;
- }
+
get.name[sizeof(get.name) - 1] = '\0';
xt_compat_lock(NFPROTO_ARP);
@@ -1557,16 +1410,13 @@ static int compat_get_entries(struct net *net,
const struct xt_table_info *private = t->private;
struct xt_table_info info;
- duprintf("t->private->number = %u\n", private->number);
ret = compat_table_info(private, &info);
if (!ret && get.size == info.size) {
ret = compat_copy_entries_to_user(private->size,
t, uptr->entrytable);
- } else if (!ret) {
- duprintf("compat_get_entries: I've got %u not %u!\n",
- private->size, get.size);
+ } else if (!ret)
ret = -EAGAIN;
- }
+
xt_compat_flush_offsets(NFPROTO_ARP);
module_put(t->me);
xt_table_unlock(t);
@@ -1618,7 +1468,6 @@ static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned
break;
default:
- duprintf("do_arpt_set_ctl: unknown request %i\n", cmd);
ret = -EINVAL;
}
@@ -1661,7 +1510,6 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len
}
default:
- duprintf("do_arpt_get_ctl: unknown request %i\n", cmd);
ret = -EINVAL;
}
@@ -1706,7 +1554,6 @@ int arpt_register_table(struct net *net,
memcpy(loc_cpu_entry, repl->entries, repl->size);
ret = translate_table(newinfo, loc_cpu_entry, repl);
- duprintf("arpt_register_table: translate table gives %d\n", ret);
if (ret != 0)
goto out_free;
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 0984ea3fc..54906e0e8 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -35,34 +35,12 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
MODULE_DESCRIPTION("IPv4 packet filter");
-/*#define DEBUG_IP_FIREWALL*/
-/*#define DEBUG_ALLOW_ALL*/ /* Useful for remote debugging */
-/*#define DEBUG_IP_FIREWALL_USER*/
-
-#ifdef DEBUG_IP_FIREWALL
-#define dprintf(format, args...) pr_info(format , ## args)
-#else
-#define dprintf(format, args...)
-#endif
-
-#ifdef DEBUG_IP_FIREWALL_USER
-#define duprintf(format, args...) pr_info(format , ## args)
-#else
-#define duprintf(format, args...)
-#endif
-
#ifdef CONFIG_NETFILTER_DEBUG
#define IP_NF_ASSERT(x) WARN_ON(!(x))
#else
#define IP_NF_ASSERT(x)
#endif
-#if 0
-/* All the better to debug you with... */
-#define static
-#define inline
-#endif
-
void *ipt_alloc_initial_table(const struct xt_table *info)
{
return xt_alloc_initial_table(ipt, IPT);
@@ -85,52 +63,28 @@ ip_packet_match(const struct iphdr *ip,
if (FWINV((ip->saddr&ipinfo->smsk.s_addr) != ipinfo->src.s_addr,
IPT_INV_SRCIP) ||
FWINV((ip->daddr&ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr,
- IPT_INV_DSTIP)) {
- dprintf("Source or dest mismatch.\n");
-
- dprintf("SRC: %pI4. Mask: %pI4. Target: %pI4.%s\n",
- &ip->saddr, &ipinfo->smsk.s_addr, &ipinfo->src.s_addr,
- ipinfo->invflags & IPT_INV_SRCIP ? " (INV)" : "");
- dprintf("DST: %pI4 Mask: %pI4 Target: %pI4.%s\n",
- &ip->daddr, &ipinfo->dmsk.s_addr, &ipinfo->dst.s_addr,
- ipinfo->invflags & IPT_INV_DSTIP ? " (INV)" : "");
+ IPT_INV_DSTIP))
return false;
- }
ret = ifname_compare_aligned(indev, ipinfo->iniface, ipinfo->iniface_mask);
- if (FWINV(ret != 0, IPT_INV_VIA_IN)) {
- dprintf("VIA in mismatch (%s vs %s).%s\n",
- indev, ipinfo->iniface,
- ipinfo->invflags & IPT_INV_VIA_IN ? " (INV)" : "");
+ if (FWINV(ret != 0, IPT_INV_VIA_IN))
return false;
- }
ret = ifname_compare_aligned(outdev, ipinfo->outiface, ipinfo->outiface_mask);
- if (FWINV(ret != 0, IPT_INV_VIA_OUT)) {
- dprintf("VIA out mismatch (%s vs %s).%s\n",
- outdev, ipinfo->outiface,
- ipinfo->invflags & IPT_INV_VIA_OUT ? " (INV)" : "");
+ if (FWINV(ret != 0, IPT_INV_VIA_OUT))
return false;
- }
/* Check specific protocol */
if (ipinfo->proto &&
- FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) {
- dprintf("Packet protocol %hi does not match %hi.%s\n",
- ip->protocol, ipinfo->proto,
- ipinfo->invflags & IPT_INV_PROTO ? " (INV)" : "");
+ FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO))
return false;
- }
/* If we have a fragment rule but the packet is not a fragment
* then we return zero */
- if (FWINV((ipinfo->flags&IPT_F_FRAG) && !isfrag, IPT_INV_FRAG)) {
- dprintf("Fragment rule but not fragment.%s\n",
- ipinfo->invflags & IPT_INV_FRAG ? " (INV)" : "");
+ if (FWINV((ipinfo->flags&IPT_F_FRAG) && !isfrag, IPT_INV_FRAG))
return false;
- }
return true;
}
@@ -138,16 +92,10 @@ ip_packet_match(const struct iphdr *ip,
static bool
ip_checkentry(const struct ipt_ip *ip)
{
- if (ip->flags & ~IPT_F_MASK) {
- duprintf("Unknown flag bits set: %08X\n",
- ip->flags & ~IPT_F_MASK);
+ if (ip->flags & ~IPT_F_MASK)
return false;
- }
- if (ip->invflags & ~IPT_INV_MASK) {
- duprintf("Unknown invflag bits set: %08X\n",
- ip->invflags & ~IPT_INV_MASK);
+ if (ip->invflags & ~IPT_INV_MASK)
return false;
- }
return true;
}
@@ -346,10 +294,6 @@ ipt_do_table(struct sk_buff *skb,
e = get_entry(table_base, private->hook_entry[hook]);
- pr_debug("Entering %s(hook %u), UF %p\n",
- table->name, hook,
- get_entry(table_base, private->underflow[hook]));
-
do {
const struct xt_entry_target *t;
const struct xt_entry_match *ematch;
@@ -396,22 +340,15 @@ ipt_do_table(struct sk_buff *skb,
if (stackidx == 0) {
e = get_entry(table_base,
private->underflow[hook]);
- pr_debug("Underflow (this is normal) "
- "to %p\n", e);
} else {
e = jumpstack[--stackidx];
- pr_debug("Pulled %p out from pos %u\n",
- e, stackidx);
e = ipt_next_entry(e);
}
continue;
}
if (table_base + v != ipt_next_entry(e) &&
- !(e->ip.flags & IPT_F_GOTO)) {
+ !(e->ip.flags & IPT_F_GOTO))
jumpstack[stackidx++] = e;
- pr_debug("Pushed %p into pos %u\n",
- e, stackidx - 1);
- }
e = get_entry(table_base, v);
continue;
@@ -429,18 +366,13 @@ ipt_do_table(struct sk_buff *skb,
/* Verdict */
break;
} while (!acpar.hotdrop);
- pr_debug("Exiting %s; sp at %u\n", __func__, stackidx);
xt_write_recseq_end(addend);
local_bh_enable();
-#ifdef DEBUG_ALLOW_ALL
- return NF_ACCEPT;
-#else
if (acpar.hotdrop)
return NF_DROP;
else return verdict;
-#endif
}
static bool find_jump_target(const struct xt_table_info *t,
@@ -480,11 +412,9 @@ mark_source_chains(const struct xt_table_info *newinfo,
= (void *)ipt_get_target_c(e);
int visited = e->comefrom & (1 << hook);
- if (e->comefrom & (1 << NF_INET_NUMHOOKS)) {
- pr_err("iptables: loop hook %u pos %u %08X.\n",
- hook, pos, e->comefrom);
+ if (e->comefrom & (1 << NF_INET_NUMHOOKS))
return 0;
- }
+
e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS));
/* Unconditional return/END. */
@@ -496,26 +426,13 @@ mark_source_chains(const struct xt_table_info *newinfo,
if ((strcmp(t->target.u.user.name,
XT_STANDARD_TARGET) == 0) &&
- t->verdict < -NF_MAX_VERDICT - 1) {
- duprintf("mark_source_chains: bad "
- "negative verdict (%i)\n",
- t->verdict);
+ t->verdict < -NF_MAX_VERDICT - 1)
return 0;
- }
/* Return: backtrack through the last
big jump. */
do {
e->comefrom ^= (1<<NF_INET_NUMHOOKS);
-#ifdef DEBUG_IP_FIREWALL_USER
- if (e->comefrom
- & (1 << NF_INET_NUMHOOKS)) {
- duprintf("Back unset "
- "on hook %u "
- "rule %u\n",
- hook, pos);
- }
-#endif
oldpos = pos;
pos = e->counters.pcnt;
e->counters.pcnt = 0;
@@ -542,16 +459,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
if (strcmp(t->target.u.user.name,
XT_STANDARD_TARGET) == 0 &&
newpos >= 0) {
- if (newpos > newinfo->size -
- sizeof(struct ipt_entry)) {
- duprintf("mark_source_chains: "
- "bad verdict (%i)\n",
- newpos);
- return 0;
- }
/* This a jump; chase it. */
- duprintf("Jump rule %u -> %u\n",
- pos, newpos);
e = (struct ipt_entry *)
(entry0 + newpos);
if (!find_jump_target(newinfo, e))
@@ -568,8 +476,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
pos = newpos;
}
}
-next:
- duprintf("Finished chain %u\n", hook);
+next: ;
}
return 1;
}
@@ -591,18 +498,12 @@ static int
check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
{
const struct ipt_ip *ip = par->entryinfo;
- int ret;
par->match = m->u.kernel.match;
par->matchinfo = m->data;
- ret = xt_check_match(par, m->u.match_size - sizeof(*m),
- ip->proto, ip->invflags & IPT_INV_PROTO);
- if (ret < 0) {
- duprintf("check failed for `%s'.\n", par->match->name);
- return ret;
- }
- return 0;
+ return xt_check_match(par, m->u.match_size - sizeof(*m),
+ ip->proto, ip->invflags & IPT_INV_PROTO);
}
static int
@@ -613,10 +514,8 @@ find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
match = xt_request_find_match(NFPROTO_IPV4, m->u.user.name,
m->u.user.revision);
- if (IS_ERR(match)) {
- duprintf("find_check_match: `%s' not found\n", m->u.user.name);
+ if (IS_ERR(match))
return PTR_ERR(match);
- }
m->u.kernel.match = match;
ret = check_match(m, par);
@@ -641,16 +540,9 @@ static int check_target(struct ipt_entry *e, struct net *net, const char *name)
.hook_mask = e->comefrom,
.family = NFPROTO_IPV4,
};
- int ret;
- ret = xt_check_target(&par, t->u.target_size - sizeof(*t),
- e->ip.proto, e->ip.invflags & IPT_INV_PROTO);
- if (ret < 0) {
- duprintf("check failed for `%s'.\n",
- t->u.kernel.target->name);
- return ret;
- }
- return 0;
+ return xt_check_target(&par, t->u.target_size - sizeof(*t),
+ e->ip.proto, e->ip.invflags & IPT_INV_PROTO);
}
static int
@@ -663,10 +555,12 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
unsigned int j;
struct xt_mtchk_param mtpar;
struct xt_entry_match *ematch;
+ unsigned long pcnt;
- e->counters.pcnt = xt_percpu_counter_alloc();
- if (IS_ERR_VALUE(e->counters.pcnt))
+ pcnt = xt_percpu_counter_alloc();
+ if (IS_ERR_VALUE(pcnt))
return -ENOMEM;
+ e->counters.pcnt = pcnt;
j = 0;
mtpar.net = net;
@@ -685,7 +579,6 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
target = xt_request_find_target(NFPROTO_IPV4, t->u.user.name,
t->u.user.revision);
if (IS_ERR(target)) {
- duprintf("find_check_entry: `%s' not found\n", t->u.user.name);
ret = PTR_ERR(target);
goto cleanup_matches;
}
@@ -739,17 +632,12 @@ check_entry_size_and_hooks(struct ipt_entry *e,
if ((unsigned long)e % __alignof__(struct ipt_entry) != 0 ||
(unsigned char *)e + sizeof(struct ipt_entry) >= limit ||
- (unsigned char *)e + e->next_offset > limit) {
- duprintf("Bad offset %p\n", e);
+ (unsigned char *)e + e->next_offset > limit)
return -EINVAL;
- }
if (e->next_offset
- < sizeof(struct ipt_entry) + sizeof(struct xt_entry_target)) {
- duprintf("checking: element %p size %u\n",
- e, e->next_offset);
+ < sizeof(struct ipt_entry) + sizeof(struct xt_entry_target))
return -EINVAL;
- }
if (!ip_checkentry(&e->ip))
return -EINVAL;
@@ -766,12 +654,9 @@ check_entry_size_and_hooks(struct ipt_entry *e,
if ((unsigned char *)e - base == hook_entries[h])
newinfo->hook_entry[h] = hook_entries[h];
if ((unsigned char *)e - base == underflows[h]) {
- if (!check_underflow(e)) {
- pr_debug("Underflows must be unconditional and "
- "use the STANDARD target with "
- "ACCEPT/DROP\n");
+ if (!check_underflow(e))
return -EINVAL;
- }
+
newinfo->underflow[h] = underflows[h];
}
}
@@ -823,7 +708,6 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
newinfo->underflow[i] = 0xFFFFFFFF;
}
- duprintf("translate_table: size %u\n", newinfo->size);
i = 0;
/* Walk through entries, checking offsets. */
xt_entry_foreach(iter, entry0, newinfo->size) {
@@ -840,27 +724,18 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
++newinfo->stacksize;
}
- if (i != repl->num_entries) {
- duprintf("translate_table: %u not %u entries\n",
- i, repl->num_entries);
+ if (i != repl->num_entries)
return -EINVAL;
- }
/* Check hooks all assigned */
for (i = 0; i < NF_INET_NUMHOOKS; i++) {
/* Only hooks which are valid */
if (!(repl->valid_hooks & (1 << i)))
continue;
- if (newinfo->hook_entry[i] == 0xFFFFFFFF) {
- duprintf("Invalid hook entry %u %u\n",
- i, repl->hook_entry[i]);
+ if (newinfo->hook_entry[i] == 0xFFFFFFFF)
return -EINVAL;
- }
- if (newinfo->underflow[i] == 0xFFFFFFFF) {
- duprintf("Invalid underflow %u %u\n",
- i, repl->underflow[i]);
+ if (newinfo->underflow[i] == 0xFFFFFFFF)
return -EINVAL;
- }
}
if (!mark_source_chains(newinfo, repl->valid_hooks, entry0))
@@ -1088,11 +963,8 @@ static int get_info(struct net *net, void __user *user,
struct xt_table *t;
int ret;
- if (*len != sizeof(struct ipt_getinfo)) {
- duprintf("length %u != %zu\n", *len,
- sizeof(struct ipt_getinfo));
+ if (*len != sizeof(struct ipt_getinfo))
return -EINVAL;
- }
if (copy_from_user(name, user, sizeof(name)) != 0)
return -EFAULT;
@@ -1150,31 +1022,23 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr,
struct ipt_get_entries get;
struct xt_table *t;
- if (*len < sizeof(get)) {
- duprintf("get_entries: %u < %zu\n", *len, sizeof(get));
+ if (*len < sizeof(get))
return -EINVAL;
- }
if (copy_from_user(&get, uptr, sizeof(get)) != 0)
return -EFAULT;
- if (*len != sizeof(struct ipt_get_entries) + get.size) {
- duprintf("get_entries: %u != %zu\n",
- *len, sizeof(get) + get.size);
+ if (*len != sizeof(struct ipt_get_entries) + get.size)
return -EINVAL;
- }
get.name[sizeof(get.name) - 1] = '\0';
t = xt_find_table_lock(net, AF_INET, get.name);
if (!IS_ERR_OR_NULL(t)) {
const struct xt_table_info *private = t->private;
- duprintf("t->private->number = %u\n", private->number);
if (get.size == private->size)
ret = copy_entries_to_user(private->size,
t, uptr->entrytable);
- else {
- duprintf("get_entries: I've got %u not %u!\n",
- private->size, get.size);
+ else
ret = -EAGAIN;
- }
+
module_put(t->me);
xt_table_unlock(t);
} else
@@ -1210,8 +1074,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
/* You lied! */
if (valid_hooks != t->valid_hooks) {
- duprintf("Valid hook crap: %08X vs %08X\n",
- valid_hooks, t->valid_hooks);
ret = -EINVAL;
goto put_module;
}
@@ -1221,8 +1083,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
goto put_module;
/* Update module usage count based on number of rules */
- duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n",
- oldinfo->number, oldinfo->initial_entries, newinfo->number);
if ((oldinfo->number > oldinfo->initial_entries) ||
(newinfo->number <= oldinfo->initial_entries))
module_put(t->me);
@@ -1291,8 +1151,6 @@ do_replace(struct net *net, const void __user *user, unsigned int len)
if (ret != 0)
goto free_newinfo;
- duprintf("Translated table\n");
-
ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo,
tmp.num_counters, tmp.counters);
if (ret)
@@ -1418,11 +1276,9 @@ compat_find_calc_match(struct xt_entry_match *m,
match = xt_request_find_match(NFPROTO_IPV4, m->u.user.name,
m->u.user.revision);
- if (IS_ERR(match)) {
- duprintf("compat_check_calc_match: `%s' not found\n",
- m->u.user.name);
+ if (IS_ERR(match))
return PTR_ERR(match);
- }
+
m->u.kernel.match = match;
*size += xt_compat_match_offset(match);
return 0;
@@ -1454,20 +1310,14 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
unsigned int j;
int ret, off;
- duprintf("check_compat_entry_size_and_hooks %p\n", e);
if ((unsigned long)e % __alignof__(struct compat_ipt_entry) != 0 ||
(unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit ||
- (unsigned char *)e + e->next_offset > limit) {
- duprintf("Bad offset %p, limit = %p\n", e, limit);
+ (unsigned char *)e + e->next_offset > limit)
return -EINVAL;
- }
if (e->next_offset < sizeof(struct compat_ipt_entry) +
- sizeof(struct compat_xt_entry_target)) {
- duprintf("checking: element %p size %u\n",
- e, e->next_offset);
+ sizeof(struct compat_xt_entry_target))
return -EINVAL;
- }
if (!ip_checkentry(&e->ip))
return -EINVAL;
@@ -1491,8 +1341,6 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
target = xt_request_find_target(NFPROTO_IPV4, t->u.user.name,
t->u.user.revision);
if (IS_ERR(target)) {
- duprintf("check_compat_entry_size_and_hooks: `%s' not found\n",
- t->u.user.name);
ret = PTR_ERR(target);
goto release_matches;
}
@@ -1574,7 +1422,6 @@ translate_compat_table(struct net *net,
size = compatr->size;
info->number = compatr->num_entries;
- duprintf("translate_compat_table: size %u\n", info->size);
j = 0;
xt_compat_lock(AF_INET);
xt_compat_init_offsets(AF_INET, compatr->num_entries);
@@ -1589,11 +1436,8 @@ translate_compat_table(struct net *net,
}
ret = -EINVAL;
- if (j != compatr->num_entries) {
- duprintf("translate_compat_table: %u not %u entries\n",
- j, compatr->num_entries);
+ if (j != compatr->num_entries)
goto out_unlock;
- }
ret = -ENOMEM;
newinfo = xt_alloc_table_info(size);
@@ -1668,8 +1512,6 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
return -EFAULT;
/* overflow check */
- if (tmp.size >= INT_MAX / num_possible_cpus())
- return -ENOMEM;
if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
return -ENOMEM;
if (tmp.num_counters == 0)
@@ -1692,8 +1534,6 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
if (ret != 0)
goto free_newinfo;
- duprintf("compat_do_replace: Translated table\n");
-
ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo,
tmp.num_counters, compat_ptr(tmp.counters));
if (ret)
@@ -1727,7 +1567,6 @@ compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user,
break;
default:
- duprintf("do_ipt_set_ctl: unknown request %i\n", cmd);
ret = -EINVAL;
}
@@ -1777,19 +1616,15 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr,
struct compat_ipt_get_entries get;
struct xt_table *t;
- if (*len < sizeof(get)) {
- duprintf("compat_get_entries: %u < %zu\n", *len, sizeof(get));
+ if (*len < sizeof(get))
return -EINVAL;
- }
if (copy_from_user(&get, uptr, sizeof(get)) != 0)
return -EFAULT;
- if (*len != sizeof(struct compat_ipt_get_entries) + get.size) {
- duprintf("compat_get_entries: %u != %zu\n",
- *len, sizeof(get) + get.size);
+ if (*len != sizeof(struct compat_ipt_get_entries) + get.size)
return -EINVAL;
- }
+
get.name[sizeof(get.name) - 1] = '\0';
xt_compat_lock(AF_INET);
@@ -1797,16 +1632,13 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr,
if (!IS_ERR_OR_NULL(t)) {
const struct xt_table_info *private = t->private;
struct xt_table_info info;
- duprintf("t->private->number = %u\n", private->number);
ret = compat_table_info(private, &info);
- if (!ret && get.size == info.size) {
+ if (!ret && get.size == info.size)
ret = compat_copy_entries_to_user(private->size,
t, uptr->entrytable);
- } else if (!ret) {
- duprintf("compat_get_entries: I've got %u not %u!\n",
- private->size, get.size);
+ else if (!ret)
ret = -EAGAIN;
- }
+
xt_compat_flush_offsets(AF_INET);
module_put(t->me);
xt_table_unlock(t);
@@ -1859,7 +1691,6 @@ do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
break;
default:
- duprintf("do_ipt_set_ctl: unknown request %i\n", cmd);
ret = -EINVAL;
}
@@ -1911,7 +1742,6 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
}
default:
- duprintf("do_ipt_get_ctl: unknown request %i\n", cmd);
ret = -EINVAL;
}
@@ -2013,7 +1843,6 @@ icmp_match(const struct sk_buff *skb, struct xt_action_param *par)
/* We've been asked to examine this packet, and we
* can't. Hence, no choice but to drop.
*/
- duprintf("Dropping evil ICMP tinygram.\n");
par->hotdrop = true;
return false;
}
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index e3c46e8e2..ae1a71a97 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -360,7 +360,7 @@ static int ipv4_init_net(struct net *net)
in->ctl_table[0].data = &nf_conntrack_max;
in->ctl_table[1].data = &net->ct.count;
- in->ctl_table[2].data = &net->ct.htable_size;
+ in->ctl_table[2].data = &nf_conntrack_htable_size;
in->ctl_table[3].data = &net->ct.sysctl_checksum;
in->ctl_table[4].data = &net->ct.sysctl_log_invalid;
#endif
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index f0dfe92a0..c6f3c406f 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -31,15 +31,14 @@ struct ct_iter_state {
static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
{
- struct net *net = seq_file_net(seq);
struct ct_iter_state *st = seq->private;
struct hlist_nulls_node *n;
for (st->bucket = 0;
- st->bucket < net->ct.htable_size;
+ st->bucket < nf_conntrack_htable_size;
st->bucket++) {
n = rcu_dereference(
- hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
+ hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket]));
if (!is_a_nulls(n))
return n;
}
@@ -49,17 +48,16 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
struct hlist_nulls_node *head)
{
- struct net *net = seq_file_net(seq);
struct ct_iter_state *st = seq->private;
head = rcu_dereference(hlist_nulls_next_rcu(head));
while (is_a_nulls(head)) {
if (likely(get_nulls_value(head) == st->bucket)) {
- if (++st->bucket >= net->ct.htable_size)
+ if (++st->bucket >= nf_conntrack_htable_size)
return NULL;
}
head = rcu_dereference(
- hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
+ hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket]));
}
return head;
}
@@ -114,6 +112,23 @@ static inline void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
}
#endif
+static bool ct_seq_should_skip(const struct nf_conn *ct,
+ const struct net *net,
+ const struct nf_conntrack_tuple_hash *hash)
+{
+ /* we only want to print DIR_ORIGINAL */
+ if (NF_CT_DIRECTION(hash))
+ return true;
+
+ if (nf_ct_l3num(ct) != AF_INET)
+ return true;
+
+ if (!net_eq(nf_ct_net(ct), net))
+ return true;
+
+ return false;
+}
+
static int ct_seq_show(struct seq_file *s, void *v)
{
struct nf_conntrack_tuple_hash *hash = v;
@@ -123,14 +138,15 @@ static int ct_seq_show(struct seq_file *s, void *v)
int ret = 0;
NF_CT_ASSERT(ct);
- if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use)))
+ if (ct_seq_should_skip(ct, seq_file_net(s), hash))
return 0;
+ if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use)))
+ return 0;
- /* we only want to print DIR_ORIGINAL */
- if (NF_CT_DIRECTION(hash))
- goto release;
- if (nf_ct_l3num(ct) != AF_INET)
+ /* check if we raced w. object reuse */
+ if (!nf_ct_is_confirmed(ct) ||
+ ct_seq_should_skip(ct, seq_file_net(s), hash))
goto release;
l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct));
@@ -220,13 +236,12 @@ struct ct_expect_iter_state {
static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
{
- struct net *net = seq_file_net(seq);
struct ct_expect_iter_state *st = seq->private;
struct hlist_node *n;
for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) {
n = rcu_dereference(
- hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
+ hlist_first_rcu(&nf_ct_expect_hash[st->bucket]));
if (n)
return n;
}
@@ -236,7 +251,6 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
struct hlist_node *head)
{
- struct net *net = seq_file_net(seq);
struct ct_expect_iter_state *st = seq->private;
head = rcu_dereference(hlist_next_rcu(head));
@@ -244,7 +258,7 @@ static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
if (++st->bucket >= nf_ct_expect_hsize)
return NULL;
head = rcu_dereference(
- hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
+ hlist_first_rcu(&nf_ct_expect_hash[st->bucket]));
}
return head;
}
@@ -285,6 +299,9 @@ static int exp_seq_show(struct seq_file *s, void *v)
exp = hlist_entry(n, struct nf_conntrack_expect, hnode);
+ if (!net_eq(nf_ct_net(exp->master), seq_file_net(s)))
+ return 0;
+
if (exp->tuple.src.l3num != AF_INET)
return 0;
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index cf9700b1a..66ddcb605 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -737,6 +737,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
/* no remote port */
}
+ ipc.sockc.tsflags = sk->sk_tsflags;
ipc.addr = inet->inet_saddr;
ipc.opt = NULL;
ipc.oif = sk->sk_bound_dev_if;
@@ -744,10 +745,8 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
ipc.ttl = 0;
ipc.tos = -1;
- sock_tx_timestamp(sk, &ipc.tx_flags);
-
if (msg->msg_controllen) {
- err = ip_cmsg_send(sock_net(sk), msg, &ipc, false);
+ err = ip_cmsg_send(sk, msg, &ipc, false);
if (unlikely(err)) {
kfree(ipc.opt);
return err;
@@ -768,6 +767,8 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
rcu_read_unlock();
}
+ sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags);
+
saddr = ipc.addr;
ipc.addr = faddr = daddr;
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 8d22de740..438f50c1a 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -339,8 +339,8 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb)
static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
struct msghdr *msg, size_t length,
- struct rtable **rtp,
- unsigned int flags)
+ struct rtable **rtp, unsigned int flags,
+ const struct sockcm_cookie *sockc)
{
struct inet_sock *inet = inet_sk(sk);
struct net *net = sock_net(sk);
@@ -379,7 +379,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
skb->ip_summed = CHECKSUM_NONE;
- sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
+ sock_tx_timestamp(sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags);
skb->transport_header = skb->network_header;
err = -EFAULT;
@@ -540,6 +540,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
daddr = inet->inet_daddr;
}
+ ipc.sockc.tsflags = sk->sk_tsflags;
ipc.addr = inet->inet_saddr;
ipc.opt = NULL;
ipc.tx_flags = 0;
@@ -548,7 +549,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
ipc.oif = sk->sk_bound_dev_if;
if (msg->msg_controllen) {
- err = ip_cmsg_send(net, msg, &ipc, false);
+ err = ip_cmsg_send(sk, msg, &ipc, false);
if (unlikely(err)) {
kfree(ipc.opt);
goto out;
@@ -638,10 +639,10 @@ back_from_confirm:
if (inet->hdrincl)
err = raw_send_hdrinc(sk, &fl4, msg, len,
- &rt, msg->msg_flags);
+ &rt, msg->msg_flags, &ipc.sockc);
else {
- sock_tx_timestamp(sk, &ipc.tx_flags);
+ sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags);
if (!ipc.addr)
ipc.addr = fl4.daddr;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 60398a937..a1f2830d8 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -915,11 +915,11 @@ static int ip_error(struct sk_buff *skb)
if (!IN_DEV_FORWARD(in_dev)) {
switch (rt->dst.error) {
case EHOSTUNREACH:
- IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
+ __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
break;
case ENETUNREACH:
- IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
+ __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
break;
}
goto out;
@@ -934,7 +934,7 @@ static int ip_error(struct sk_buff *skb)
break;
case ENETUNREACH:
code = ICMP_NET_UNREACH;
- IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
+ __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
break;
case EACCES:
code = ICMP_PKT_FILTERED;
@@ -2146,6 +2146,7 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
unsigned int flags = 0;
struct fib_result res;
struct rtable *rth;
+ int master_idx;
int orig_oif;
int err = -ENETUNREACH;
@@ -2155,6 +2156,9 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
orig_oif = fl4->flowi4_oif;
+ master_idx = l3mdev_master_ifindex_by_index(net, fl4->flowi4_oif);
+ if (master_idx)
+ fl4->flowi4_oif = master_idx;
fl4->flowi4_iif = LOOPBACK_IFINDEX;
fl4->flowi4_tos = tos & IPTOS_RT_MASK;
fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 4c04f0933..e3c4043c2 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -312,11 +312,11 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
mss = __cookie_v4_check(ip_hdr(skb), th, cookie);
if (mss == 0) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED);
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED);
goto out;
}
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESRECV);
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_SYNCOOKIESRECV);
/* check for timestamp cookie support */
memset(&tcp_opt, 0, sizeof(tcp_opt));
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 03112a310..1cb67de10 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -960,6 +960,17 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ {
+ .procname = "fib_multipath_use_neigh",
+ .data = &init_net.ipv4.sysctl_fib_multipath_use_neigh,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif
{ }
};
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 29bb8a5b9..99fbc1479 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -429,13 +429,16 @@ void tcp_init_sock(struct sock *sk)
}
EXPORT_SYMBOL(tcp_init_sock);
-static void tcp_tx_timestamp(struct sock *sk, struct sk_buff *skb)
+static void tcp_tx_timestamp(struct sock *sk, u16 tsflags, struct sk_buff *skb)
{
- if (sk->sk_tsflags) {
+ if (tsflags) {
struct skb_shared_info *shinfo = skb_shinfo(skb);
+ struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
- sock_tx_timestamp(sk, &shinfo->tx_flags);
- if (shinfo->tx_flags & SKBTX_ANY_TSTAMP)
+ sock_tx_timestamp(sk, tsflags, &shinfo->tx_flags);
+ if (tsflags & SOF_TIMESTAMPING_TX_ACK)
+ tcb->txstamp_ack = 1;
+ if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK)
shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
}
}
@@ -907,7 +910,8 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
int copy, i;
bool can_coalesce;
- if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
+ if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0 ||
+ !tcp_skb_can_collapse_to(skb)) {
new_segment:
if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf;
@@ -958,7 +962,7 @@ new_segment:
offset += copy;
size -= copy;
if (!size) {
- tcp_tx_timestamp(sk, skb);
+ tcp_tx_timestamp(sk, sk->sk_tsflags, skb);
goto out;
}
@@ -1078,8 +1082,10 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
+ struct sockcm_cookie sockc;
int flags, err, copied = 0;
int mss_now = 0, size_goal, copied_syn = 0;
+ bool process_backlog = false;
bool sg;
long timeo;
@@ -1120,14 +1126,24 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
/* 'common' sending to sendq */
}
+ sockc.tsflags = sk->sk_tsflags;
+ if (msg->msg_controllen) {
+ err = sock_cmsg_send(sk, msg, &sockc);
+ if (unlikely(err)) {
+ err = -EINVAL;
+ goto out_err;
+ }
+ }
+
/* This should be in poll */
sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
- mss_now = tcp_send_mss(sk, &size_goal, flags);
-
/* Ok commence sending. */
copied = 0;
+restart:
+ mss_now = tcp_send_mss(sk, &size_goal, flags);
+
err = -EPIPE;
if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
goto out_err;
@@ -1145,7 +1161,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
copy = max - skb->len;
}
- if (copy <= 0) {
+ if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
new_segment:
/* Allocate new segment. If the interface is SG,
* allocate skb fitting to single page.
@@ -1153,6 +1169,10 @@ new_segment:
if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf;
+ if (process_backlog && sk_flush_backlog(sk)) {
+ process_backlog = false;
+ goto restart;
+ }
skb = sk_stream_alloc_skb(sk,
select_size(sk, sg),
sk->sk_allocation,
@@ -1160,6 +1180,7 @@ new_segment:
if (!skb)
goto wait_for_memory;
+ process_backlog = true;
/*
* Check whether we can use HW checksum.
*/
@@ -1238,7 +1259,9 @@ new_segment:
copied += copy;
if (!msg_data_left(msg)) {
- tcp_tx_timestamp(sk, skb);
+ tcp_tx_timestamp(sk, sockc.tsflags, skb);
+ if (unlikely(flags & MSG_EOR))
+ TCP_SKB_CB(skb)->eor = 1;
goto out;
}
@@ -1432,14 +1455,10 @@ static void tcp_prequeue_process(struct sock *sk)
struct sk_buff *skb;
struct tcp_sock *tp = tcp_sk(sk);
- NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPPREQUEUED);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUED);
- /* RX process wants to run with disabled BHs, though it is not
- * necessary */
- local_bh_disable();
while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
sk_backlog_rcv(sk, skb);
- local_bh_enable();
/* Clear memory counter. */
tp->ucopy.memory = 0;
@@ -1766,7 +1785,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
chunk = len - tp->ucopy.len;
if (chunk != 0) {
- NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
+ NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
len -= chunk;
copied += chunk;
}
@@ -1778,7 +1797,7 @@ do_prequeue:
chunk = len - tp->ucopy.len;
if (chunk != 0) {
- NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
+ NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
len -= chunk;
copied += chunk;
}
@@ -1864,7 +1883,7 @@ skip_copy:
tcp_prequeue_process(sk);
if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
- NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
+ NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
len -= chunk;
copied += chunk;
}
@@ -2054,13 +2073,13 @@ void tcp_close(struct sock *sk, long timeout)
sk->sk_prot->disconnect(sk, 0);
} else if (data_was_unread) {
/* Unread data was tossed, zap the connection. */
- NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
tcp_set_state(sk, TCP_CLOSE);
tcp_send_active_reset(sk, sk->sk_allocation);
} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
/* Check zero linger _after_ checking for unread data. */
sk->sk_prot->disconnect(sk, 0);
- NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
} else if (tcp_close_state(sk)) {
/* We FIN if the application ate all the data before
* zapping the connection.
@@ -2137,7 +2156,7 @@ adjudge_to_death:
if (tp->linger2 < 0) {
tcp_set_state(sk, TCP_CLOSE);
tcp_send_active_reset(sk, GFP_ATOMIC);
- NET_INC_STATS_BH(sock_net(sk),
+ __NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPABORTONLINGER);
} else {
const int tmo = tcp_fin_time(sk);
@@ -2156,7 +2175,7 @@ adjudge_to_death:
if (tcp_check_oom(sk, 0)) {
tcp_set_state(sk, TCP_CLOSE);
tcp_send_active_reset(sk, GFP_ATOMIC);
- NET_INC_STATS_BH(sock_net(sk),
+ __NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPABORTONMEMORY);
}
}
@@ -3195,7 +3214,7 @@ void tcp_done(struct sock *sk)
struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
- TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
tcp_set_state(sk, TCP_CLOSE);
tcp_clear_xmit_timers(sk);
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index fd1405d37..36087bca9 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -197,15 +197,15 @@ static void bictcp_state(struct sock *sk, u8 new_state)
/* Track delayed acknowledgment ratio using sliding window
* ratio = (15*ratio + sample) / 16
*/
-static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt)
+static void bictcp_acked(struct sock *sk, const struct ack_sample *sample)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
if (icsk->icsk_ca_state == TCP_CA_Open) {
struct bictcp *ca = inet_csk_ca(sk);
- cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT;
- ca->delayed_ack += cnt;
+ ca->delayed_ack += sample->pkts_acked -
+ (ca->delayed_ack >> ACK_RATIO_SHIFT);
}
}
diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c
index 167b6a3e1..03725b294 100644
--- a/net/ipv4/tcp_cdg.c
+++ b/net/ipv4/tcp_cdg.c
@@ -155,11 +155,11 @@ static void tcp_cdg_hystart_update(struct sock *sk)
ca->last_ack = now_us;
if (after(now_us, ca->round_start + base_owd)) {
- NET_INC_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPHYSTARTTRAINDETECT);
- NET_ADD_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPHYSTARTTRAINCWND,
- tp->snd_cwnd);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTTRAINDETECT);
+ NET_ADD_STATS(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTTRAINCWND,
+ tp->snd_cwnd);
tp->snd_ssthresh = tp->snd_cwnd;
return;
}
@@ -174,11 +174,11 @@ static void tcp_cdg_hystart_update(struct sock *sk)
125U);
if (ca->rtt.min > thresh) {
- NET_INC_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPHYSTARTDELAYDETECT);
- NET_ADD_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPHYSTARTDELAYCWND,
- tp->snd_cwnd);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTDELAYDETECT);
+ NET_ADD_STATS(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTDELAYCWND,
+ tp->snd_cwnd);
tp->snd_ssthresh = tp->snd_cwnd;
}
}
@@ -294,12 +294,12 @@ static void tcp_cdg_cong_avoid(struct sock *sk, u32 ack, u32 acked)
ca->shadow_wnd = max(ca->shadow_wnd, ca->shadow_wnd + incr);
}
-static void tcp_cdg_acked(struct sock *sk, u32 num_acked, s32 rtt_us)
+static void tcp_cdg_acked(struct sock *sk, const struct ack_sample *sample)
{
struct cdg *ca = inet_csk_ca(sk);
struct tcp_sock *tp = tcp_sk(sk);
- if (rtt_us <= 0)
+ if (sample->rtt_us <= 0)
return;
/* A heuristic for filtering delayed ACKs, adapted from:
@@ -307,20 +307,20 @@ static void tcp_cdg_acked(struct sock *sk, u32 num_acked, s32 rtt_us)
* delay and rate based TCP mechanisms." TR 100219A. CAIA, 2010.
*/
if (tp->sacked_out == 0) {
- if (num_acked == 1 && ca->delack) {
+ if (sample->pkts_acked == 1 && ca->delack) {
/* A delayed ACK is only used for the minimum if it is
* provenly lower than an existing non-zero minimum.
*/
- ca->rtt.min = min(ca->rtt.min, rtt_us);
+ ca->rtt.min = min(ca->rtt.min, sample->rtt_us);
ca->delack--;
return;
- } else if (num_acked > 1 && ca->delack < 5) {
+ } else if (sample->pkts_acked > 1 && ca->delack < 5) {
ca->delack++;
}
}
- ca->rtt.min = min_not_zero(ca->rtt.min, rtt_us);
- ca->rtt.max = max(ca->rtt.max, rtt_us);
+ ca->rtt.min = min_not_zero(ca->rtt.min, sample->rtt_us);
+ ca->rtt.max = max(ca->rtt.max, sample->rtt_us);
}
static u32 tcp_cdg_ssthresh(struct sock *sk)
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 448c2615f..c99230efc 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -402,11 +402,11 @@ static void hystart_update(struct sock *sk, u32 delay)
ca->last_ack = now;
if ((s32)(now - ca->round_start) > ca->delay_min >> 4) {
ca->found |= HYSTART_ACK_TRAIN;
- NET_INC_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPHYSTARTTRAINDETECT);
- NET_ADD_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPHYSTARTTRAINCWND,
- tp->snd_cwnd);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTTRAINDETECT);
+ NET_ADD_STATS(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTTRAINCWND,
+ tp->snd_cwnd);
tp->snd_ssthresh = tp->snd_cwnd;
}
}
@@ -423,11 +423,11 @@ static void hystart_update(struct sock *sk, u32 delay)
if (ca->curr_rtt > ca->delay_min +
HYSTART_DELAY_THRESH(ca->delay_min >> 3)) {
ca->found |= HYSTART_DELAY;
- NET_INC_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPHYSTARTDELAYDETECT);
- NET_ADD_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPHYSTARTDELAYCWND,
- tp->snd_cwnd);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTDELAYDETECT);
+ NET_ADD_STATS(sock_net(sk),
+ LINUX_MIB_TCPHYSTARTDELAYCWND,
+ tp->snd_cwnd);
tp->snd_ssthresh = tp->snd_cwnd;
}
}
@@ -437,21 +437,21 @@ static void hystart_update(struct sock *sk, u32 delay)
/* Track delayed acknowledgment ratio using sliding window
* ratio = (15*ratio + sample) / 16
*/
-static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us)
+static void bictcp_acked(struct sock *sk, const struct ack_sample *sample)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);
u32 delay;
/* Some calls are for duplicates without timetamps */
- if (rtt_us < 0)
+ if (sample->rtt_us < 0)
return;
/* Discard delay samples right after fast recovery */
if (ca->epoch_start && (s32)(tcp_time_stamp - ca->epoch_start) < HZ)
return;
- delay = (rtt_us << 3) / USEC_PER_MSEC;
+ delay = (sample->rtt_us << 3) / USEC_PER_MSEC;
if (delay == 0)
delay = 1;
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index cffd8f9ed..54d9f9b01 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -255,9 +255,9 @@ static bool tcp_fastopen_queue_check(struct sock *sk)
spin_lock(&fastopenq->lock);
req1 = fastopenq->rskq_rst_head;
if (!req1 || time_after(req1->rsk_timer.expires, jiffies)) {
+ __NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
spin_unlock(&fastopenq->lock);
- NET_INC_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
return false;
}
fastopenq->rskq_rst_head = req1->dl_next;
@@ -282,7 +282,7 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
struct sock *child;
if (foc->len == 0) /* Client requests a cookie */
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD);
if (!((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) &&
(syn_data || foc->len >= 0) &&
@@ -311,13 +311,13 @@ fastopen:
child = tcp_fastopen_create_child(sk, skb, dst, req);
if (child) {
foc->len = -1;
- NET_INC_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPFASTOPENPASSIVE);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENPASSIVE);
return child;
}
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
} else if (foc->len > 0) /* Client presents an invalid cookie */
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
valid_foc.exp = foc->exp;
*foc = valid_foc;
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 82f0d9ed6..4a4d8e767 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -99,7 +99,7 @@ static inline void measure_rtt(struct sock *sk, u32 srtt)
}
static void measure_achieved_throughput(struct sock *sk,
- u32 pkts_acked, s32 rtt)
+ const struct ack_sample *sample)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcp_sock *tp = tcp_sk(sk);
@@ -107,10 +107,10 @@ static void measure_achieved_throughput(struct sock *sk,
u32 now = tcp_time_stamp;
if (icsk->icsk_ca_state == TCP_CA_Open)
- ca->pkts_acked = pkts_acked;
+ ca->pkts_acked = sample->pkts_acked;
- if (rtt > 0)
- measure_rtt(sk, usecs_to_jiffies(rtt));
+ if (sample->rtt_us > 0)
+ measure_rtt(sk, usecs_to_jiffies(sample->rtt_us));
if (!use_bandwidth_switch)
return;
@@ -122,7 +122,7 @@ static void measure_achieved_throughput(struct sock *sk,
return;
}
- ca->packetcount += pkts_acked;
+ ca->packetcount += sample->pkts_acked;
if (ca->packetcount >= tp->snd_cwnd - (ca->alpha >> 7 ? : 1) &&
now - ca->lasttime >= ca->minRTT &&
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index 2ab9bbb6f..c8e6d86be 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -82,30 +82,31 @@ static void tcp_illinois_init(struct sock *sk)
}
/* Measure RTT for each ack. */
-static void tcp_illinois_acked(struct sock *sk, u32 pkts_acked, s32 rtt)
+static void tcp_illinois_acked(struct sock *sk, const struct ack_sample *sample)
{
struct illinois *ca = inet_csk_ca(sk);
+ s32 rtt_us = sample->rtt_us;
- ca->acked = pkts_acked;
+ ca->acked = sample->pkts_acked;
/* dup ack, no rtt sample */
- if (rtt < 0)
+ if (rtt_us < 0)
return;
/* ignore bogus values, this prevents wraparound in alpha math */
- if (rtt > RTT_MAX)
- rtt = RTT_MAX;
+ if (rtt_us > RTT_MAX)
+ rtt_us = RTT_MAX;
/* keep track of minimum RTT seen so far */
- if (ca->base_rtt > rtt)
- ca->base_rtt = rtt;
+ if (ca->base_rtt > rtt_us)
+ ca->base_rtt = rtt_us;
/* and max */
- if (ca->max_rtt < rtt)
- ca->max_rtt = rtt;
+ if (ca->max_rtt < rtt_us)
+ ca->max_rtt = rtt_us;
++ca->cnt_rtt;
- ca->sum_rtt += rtt;
+ ca->sum_rtt += rtt_us;
}
/* Maximum queuing delay */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 131005f82..cd3c4cbbb 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -90,7 +90,7 @@ int sysctl_tcp_adv_win_scale __read_mostly = 1;
EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
/* rfc5961 challenge ack rate limiting */
-int sysctl_tcp_challenge_ack_limit = 100;
+int sysctl_tcp_challenge_ack_limit = 1000;
int sysctl_tcp_stdurg __read_mostly;
int sysctl_tcp_rfc1337 __read_mostly;
@@ -872,7 +872,7 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
else
mib_idx = LINUX_MIB_TCPSACKREORDER;
- NET_INC_STATS_BH(sock_net(sk), mib_idx);
+ NET_INC_STATS(sock_net(sk), mib_idx);
#if FASTRETRANS_DEBUG > 1
pr_debug("Disorder%d %d %u f%u s%u rr%d\n",
tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
@@ -1065,7 +1065,7 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
dup_sack = true;
tcp_dsack_seen(tp);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
} else if (num_sacks > 1) {
u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq);
u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq);
@@ -1074,7 +1074,7 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
!before(start_seq_0, start_seq_1)) {
dup_sack = true;
tcp_dsack_seen(tp);
- NET_INC_STATS_BH(sock_net(sk),
+ NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPDSACKOFORECV);
}
}
@@ -1292,7 +1292,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
if (skb->len > 0) {
BUG_ON(!tcp_skb_pcount(skb));
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTED);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTED);
return false;
}
@@ -1306,6 +1306,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
}
TCP_SKB_CB(prev)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
+ TCP_SKB_CB(prev)->eor = TCP_SKB_CB(skb)->eor;
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
TCP_SKB_CB(prev)->end_seq++;
@@ -1316,7 +1317,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
tcp_unlink_write_queue(skb, sk);
sk_wmem_free_skb(sk, skb);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKMERGED);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKMERGED);
return true;
}
@@ -1371,6 +1372,9 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
goto fallback;
+ if (!tcp_skb_can_collapse_to(prev))
+ goto fallback;
+
in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
!before(end_seq, TCP_SKB_CB(skb)->end_seq);
@@ -1472,7 +1476,7 @@ noop:
return skb;
fallback:
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK);
return NULL;
}
@@ -1660,7 +1664,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
mib_idx = LINUX_MIB_TCPSACKDISCARD;
}
- NET_INC_STATS_BH(sock_net(sk), mib_idx);
+ NET_INC_STATS(sock_net(sk), mib_idx);
if (i == 0)
first_sack_index = -1;
continue;
@@ -1912,7 +1916,7 @@ void tcp_enter_loss(struct sock *sk)
skb = tcp_write_queue_head(sk);
is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED);
if (is_reneg) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
tp->sacked_out = 0;
tp->fackets_out = 0;
}
@@ -2256,16 +2260,6 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
}
}
-/* CWND moderation, preventing bursts due to too big ACKs
- * in dubious situations.
- */
-static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
-{
- tp->snd_cwnd = min(tp->snd_cwnd,
- tcp_packets_in_flight(tp) + tcp_max_burst(tp));
- tp->snd_cwnd_stamp = tcp_time_stamp;
-}
-
static bool tcp_tsopt_ecr_before(const struct tcp_sock *tp, u32 when)
{
return tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
@@ -2408,13 +2402,12 @@ static bool tcp_try_undo_recovery(struct sock *sk)
else
mib_idx = LINUX_MIB_TCPFULLUNDO;
- NET_INC_STATS_BH(sock_net(sk), mib_idx);
+ NET_INC_STATS(sock_net(sk), mib_idx);
}
if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
/* Hold old state until something *above* high_seq
* is ACKed. For Reno it is MUST to prevent false
* fast retransmits (RFC2582). SACK TCP is safe. */
- tcp_moderate_cwnd(tp);
if (!tcp_any_retrans_done(sk))
tp->retrans_stamp = 0;
return true;
@@ -2431,7 +2424,7 @@ static bool tcp_try_undo_dsack(struct sock *sk)
if (tp->undo_marker && !tp->undo_retrans) {
DBGUNDO(sk, "D-SACK");
tcp_undo_cwnd_reduction(sk, false);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKUNDO);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKUNDO);
return true;
}
return false;
@@ -2446,10 +2439,10 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
tcp_undo_cwnd_reduction(sk, true);
DBGUNDO(sk, "partial loss");
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
if (frto_undo)
- NET_INC_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPSPURIOUSRTOS);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPSPURIOUSRTOS);
inet_csk(sk)->icsk_retransmits = 0;
if (frto_undo || tcp_is_sack(tp))
tcp_set_ca_state(sk, TCP_CA_Open);
@@ -2573,7 +2566,7 @@ static void tcp_mtup_probe_failed(struct sock *sk)
icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1;
icsk->icsk_mtup.probe_size = 0;
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMTUPFAIL);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPFAIL);
}
static void tcp_mtup_probe_success(struct sock *sk)
@@ -2593,7 +2586,7 @@ static void tcp_mtup_probe_success(struct sock *sk)
icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
icsk->icsk_mtup.probe_size = 0;
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS);
}
/* Do a simple retransmit without using the backoff mechanisms in
@@ -2657,7 +2650,7 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
else
mib_idx = LINUX_MIB_TCPSACKRECOVERY;
- NET_INC_STATS_BH(sock_net(sk), mib_idx);
+ NET_INC_STATS(sock_net(sk), mib_idx);
tp->prior_ssthresh = 0;
tcp_init_undo(tp);
@@ -2750,7 +2743,7 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked)
DBGUNDO(sk, "partial recovery");
tcp_undo_cwnd_reduction(sk, true);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
tcp_try_keep_open(sk);
return true;
}
@@ -3097,12 +3090,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
const struct skb_shared_info *shinfo;
/* Avoid cache line misses to get skb_shinfo() and shinfo->tx_flags */
- if (likely(!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK)))
+ if (likely(!TCP_SKB_CB(skb)->txstamp_ack))
return;
shinfo = skb_shinfo(skb);
- if ((shinfo->tx_flags & SKBTX_ACK_TSTAMP) &&
- !before(shinfo->tskey, prior_snd_una) &&
+ if (!before(shinfo->tskey, prior_snd_una) &&
before(shinfo->tskey, tcp_sk(sk)->snd_una))
__skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
}
@@ -3259,8 +3251,12 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
tcp_rearm_rto(sk);
}
- if (icsk->icsk_ca_ops->pkts_acked)
- icsk->icsk_ca_ops->pkts_acked(sk, pkts_acked, ca_rtt_us);
+ if (icsk->icsk_ca_ops->pkts_acked) {
+ struct ack_sample sample = { .pkts_acked = pkts_acked,
+ .rtt_us = ca_rtt_us };
+
+ icsk->icsk_ca_ops->pkts_acked(sk, &sample);
+ }
#if FASTRETRANS_DEBUG > 0
WARN_ON((int)tp->sacked_out < 0);
@@ -3366,9 +3362,10 @@ static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
{
u32 delta = ack - tp->snd_una;
- u64_stats_update_begin(&tp->syncp);
+ sock_owned_by_me((struct sock *)tp);
+ u64_stats_update_begin_raw(&tp->syncp);
tp->bytes_acked += delta;
- u64_stats_update_end(&tp->syncp);
+ u64_stats_update_end_raw(&tp->syncp);
tp->snd_una = ack;
}
@@ -3377,9 +3374,10 @@ static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
{
u32 delta = seq - tp->rcv_nxt;
- u64_stats_update_begin(&tp->syncp);
+ sock_owned_by_me((struct sock *)tp);
+ u64_stats_update_begin_raw(&tp->syncp);
tp->bytes_received += delta;
- u64_stats_update_end(&tp->syncp);
+ u64_stats_update_end_raw(&tp->syncp);
tp->rcv_nxt = seq;
}
@@ -3426,6 +3424,23 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
return flag;
}
+static bool __tcp_oow_rate_limited(struct net *net, int mib_idx,
+ u32 *last_oow_ack_time)
+{
+ if (*last_oow_ack_time) {
+ s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time);
+
+ if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) {
+ NET_INC_STATS(net, mib_idx);
+ return true; /* rate-limited: don't send yet! */
+ }
+ }
+
+ *last_oow_ack_time = tcp_time_stamp;
+
+ return false; /* not rate-limited: go ahead, send dupack now! */
+}
+
/* Return true if we're currently rate-limiting out-of-window ACKs and
* thus shouldn't send a dupack right now. We rate-limit dupacks in
* response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS
@@ -3439,21 +3454,9 @@ bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb,
/* Data packets without SYNs are not likely part of an ACK loop. */
if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) &&
!tcp_hdr(skb)->syn)
- goto not_rate_limited;
-
- if (*last_oow_ack_time) {
- s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time);
-
- if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) {
- NET_INC_STATS_BH(net, mib_idx);
- return true; /* rate-limited: don't send yet! */
- }
- }
-
- *last_oow_ack_time = tcp_time_stamp;
+ return false;
-not_rate_limited:
- return false; /* not rate-limited: go ahead, send dupack now! */
+ return __tcp_oow_rate_limited(net, mib_idx, last_oow_ack_time);
}
/* RFC 5961 7 [ACK Throttling] */
@@ -3463,22 +3466,27 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb)
static u32 challenge_timestamp;
static unsigned int challenge_count;
struct tcp_sock *tp = tcp_sk(sk);
- u32 now;
+ u32 count, now;
/* First check our per-socket dupack rate limit. */
- if (tcp_oow_rate_limited(sock_net(sk), skb,
- LINUX_MIB_TCPACKSKIPPEDCHALLENGE,
- &tp->last_oow_ack_time))
+ if (__tcp_oow_rate_limited(sock_net(sk),
+ LINUX_MIB_TCPACKSKIPPEDCHALLENGE,
+ &tp->last_oow_ack_time))
return;
- /* Then check the check host-wide RFC 5961 rate limit. */
+ /* Then check host-wide RFC 5961 rate limit. */
now = jiffies / HZ;
if (now != challenge_timestamp) {
+ u32 half = (sysctl_tcp_challenge_ack_limit + 1) >> 1;
+
challenge_timestamp = now;
- challenge_count = 0;
+ WRITE_ONCE(challenge_count, half +
+ prandom_u32_max(sysctl_tcp_challenge_ack_limit));
}
- if (++challenge_count <= sysctl_tcp_challenge_ack_limit) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK);
+ count = READ_ONCE(challenge_count);
+ if (count > 0) {
+ WRITE_ONCE(challenge_count, count - 1);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK);
tcp_send_ack(sk);
}
}
@@ -3527,8 +3535,8 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
tcp_set_ca_state(sk, TCP_CA_CWR);
tcp_end_cwnd_reduction(sk);
tcp_try_keep_open(sk);
- NET_INC_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPLOSSPROBERECOVERY);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPLOSSPROBERECOVERY);
} else if (!(flag & (FLAG_SND_UNA_ADVANCED |
FLAG_NOT_DUP | FLAG_DATA_SACKED))) {
/* Pure dupack: original and TLP probe arrived; no loss */
@@ -3632,14 +3640,14 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS);
} else {
u32 ack_ev_flags = CA_ACK_SLOWPATH;
if (ack_seq != TCP_SKB_CB(skb)->end_seq)
flag |= FLAG_DATA;
else
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPUREACKS);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS);
flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);
@@ -4183,7 +4191,7 @@ static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
else
mib_idx = LINUX_MIB_TCPDSACKOFOSENT;
- NET_INC_STATS_BH(sock_net(sk), mib_idx);
+ NET_INC_STATS(sock_net(sk), mib_idx);
tp->rx_opt.dsack = 1;
tp->duplicate_sack[0].start_seq = seq;
@@ -4207,7 +4215,7 @@ static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
tcp_enter_quickack_mode(sk);
if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
@@ -4357,13 +4365,19 @@ static bool tcp_try_coalesce(struct sock *sk,
atomic_add(delta, &sk->sk_rmem_alloc);
sk_mem_charge(sk, delta);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE);
TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq;
TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq;
TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags;
return true;
}
+static void tcp_drop(struct sock *sk, struct sk_buff *skb)
+{
+ sk_drops_add(sk, skb);
+ __kfree_skb(skb);
+}
+
/* This one checks to see if we can put data from the
* out_of_order queue into the receive_queue.
*/
@@ -4388,7 +4402,7 @@ static void tcp_ofo_queue(struct sock *sk)
__skb_unlink(skb, &tp->out_of_order_queue);
if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
SOCK_DEBUG(sk, "ofo packet was already received\n");
- __kfree_skb(skb);
+ tcp_drop(sk, skb);
continue;
}
SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
@@ -4439,8 +4453,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
tcp_ecn_check_ce(tp, skb);
if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP);
- __kfree_skb(skb);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP);
+ tcp_drop(sk, skb);
return;
}
@@ -4448,7 +4462,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
tp->pred_flags = 0;
inet_csk_schedule_ack(sk);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
@@ -4503,8 +4517,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
/* All the bits are present. Drop. */
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
- __kfree_skb(skb);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
+ tcp_drop(sk, skb);
skb = NULL;
tcp_dsack_set(sk, seq, end_seq);
goto add_sack;
@@ -4542,8 +4556,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
__skb_unlink(skb1, &tp->out_of_order_queue);
tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
TCP_SKB_CB(skb1)->end_seq);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
- __kfree_skb(skb1);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
+ tcp_drop(sk, skb1);
}
add_sack:
@@ -4651,11 +4665,13 @@ static int __tcp_stealth_integrity_check(struct sock *sk, struct sk_buff *skb)
static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
- int eaten = -1;
bool fragstolen = false;
+ int eaten = -1;
- if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
- goto drop;
+ if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
+ __kfree_skb(skb);
+ return;
+ }
#ifdef CONFIG_TCP_STEALTH
if (unlikely(tp->stealth.mode & TCP_STEALTH_MODE_INTEGRITY_LEN) &&
@@ -4689,14 +4705,12 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
__set_current_state(TASK_RUNNING);
- local_bh_enable();
if (!skb_copy_datagram_msg(skb, 0, tp->ucopy.msg, chunk)) {
tp->ucopy.len -= chunk;
tp->copied_seq += chunk;
eaten = (chunk == skb->len);
tcp_rcv_space_adjust(sk);
}
- local_bh_disable();
}
if (eaten <= 0) {
@@ -4739,14 +4753,14 @@ queue_and_out:
if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
/* A retransmit, 2nd most common case. Force an immediate ack. */
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
out_of_window:
tcp_enter_quickack_mode(sk);
inet_csk_schedule_ack(sk);
drop:
- __kfree_skb(skb);
+ tcp_drop(sk, skb);
return;
}
@@ -4785,7 +4799,7 @@ static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
__skb_unlink(skb, list);
__kfree_skb(skb);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
return next;
}
@@ -4944,7 +4958,7 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
bool res = false;
if (!skb_queue_empty(&tp->out_of_order_queue)) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
__skb_queue_purge(&tp->out_of_order_queue);
/* Reset SACK state. A conforming SACK implementation will
@@ -4973,7 +4987,7 @@ static int tcp_prune_queue(struct sock *sk)
SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PRUNECALLED);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED);
if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
tcp_clamp_window(sk);
@@ -5003,7 +5017,7 @@ static int tcp_prune_queue(struct sock *sk)
* drop receive data on the floor. It will get retransmitted
* and hopefully then we'll have sufficient space.
*/
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_RCVPRUNED);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_RCVPRUNED);
/* Massive buffer overcommit. */
tp->pred_flags = 0;
@@ -5212,7 +5226,6 @@ static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
int chunk = skb->len - hlen;
int err;
- local_bh_enable();
if (skb_csum_unnecessary(skb))
err = skb_copy_datagram_msg(skb, hlen, tp->ucopy.msg, chunk);
else
@@ -5224,32 +5237,9 @@ static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
tcp_rcv_space_adjust(sk);
}
- local_bh_disable();
return err;
}
-static __sum16 __tcp_checksum_complete_user(struct sock *sk,
- struct sk_buff *skb)
-{
- __sum16 result;
-
- if (sock_owned_by_user(sk)) {
- local_bh_enable();
- result = __tcp_checksum_complete(skb);
- local_bh_disable();
- } else {
- result = __tcp_checksum_complete(skb);
- }
- return result;
-}
-
-static inline bool tcp_checksum_complete_user(struct sock *sk,
- struct sk_buff *skb)
-{
- return !skb_csum_unnecessary(skb) &&
- __tcp_checksum_complete_user(sk, skb);
-}
-
/* Does PAWS and seqno based validation of an incoming segment, flags will
* play significant role here.
*/
@@ -5262,7 +5252,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
tcp_paws_discard(sk, skb)) {
if (!th->rst) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
if (!tcp_oow_rate_limited(sock_net(sk), skb,
LINUX_MIB_TCPACKSKIPPEDPAWS,
&tp->last_oow_ack_time))
@@ -5314,8 +5304,8 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
if (th->syn) {
syn_challenge:
if (syn_inerr)
- TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
tcp_send_challenge_ack(sk, skb);
goto discard;
}
@@ -5323,7 +5313,7 @@ syn_challenge:
return true;
discard:
- __kfree_skb(skb);
+ tcp_drop(sk, skb);
return false;
}
@@ -5430,7 +5420,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
tcp_data_snd_check(sk);
return;
} else { /* Header too small */
- TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
goto discard;
}
} else {
@@ -5467,12 +5457,13 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
__skb_pull(skb, tcp_header_len);
tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPHPHITSTOUSER);
eaten = 1;
}
}
if (!eaten) {
- if (tcp_checksum_complete_user(sk, skb))
+ if (tcp_checksum_complete(skb))
goto csum_error;
if ((int)skb->truesize > sk->sk_forward_alloc)
@@ -5489,7 +5480,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
tcp_rcv_rtt_measure_ts(sk, skb);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);
/* Bulk data transfer: receiver */
eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
@@ -5516,7 +5507,7 @@ no_ack:
}
slow_path:
- if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb))
+ if (len < (th->doff << 2) || tcp_checksum_complete(skb))
goto csum_error;
if (!th->ack && !th->rst && !th->syn)
@@ -5546,11 +5537,11 @@ step5:
return;
csum_error:
- TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
- TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
discard:
- __kfree_skb(skb);
+ tcp_drop(sk, skb);
}
EXPORT_SYMBOL(tcp_rcv_established);
@@ -5635,16 +5626,18 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
if (data) { /* Retransmit unacked data in SYN */
tcp_for_write_queue_from(data, sk) {
if (data == tcp_send_head(sk) ||
- __tcp_retransmit_skb(sk, data))
+ __tcp_retransmit_skb(sk, data, 1))
break;
}
tcp_rearm_rto(sk);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENACTIVEFAIL);
return true;
}
tp->syn_data_acked = tp->syn_data;
if (tp->syn_data_acked)
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENACTIVE);
tcp_fastopen_add_skb(sk, synack);
@@ -5679,7 +5672,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
!between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
tcp_time_stamp)) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSACTIVEREJECTED);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_PAWSACTIVEREJECTED);
goto reset_and_undo;
}
@@ -5781,7 +5775,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
TCP_DELACK_MAX, TCP_RTO_MAX);
discard:
- __kfree_skb(skb);
+ tcp_drop(sk, skb);
return 0;
} else {
tcp_send_ack(sk);
@@ -5888,8 +5882,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
int queued = 0;
bool acceptable;
- tp->rx_opt.saw_tstamp = 0;
-
switch (sk->sk_state) {
case TCP_CLOSE:
goto discard;
@@ -5907,29 +5899,13 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
return 1;
- /* Now we have several options: In theory there is
- * nothing else in the frame. KA9Q has an option to
- * send data with the syn, BSD accepts data with the
- * syn up to the [to be] advertised window and
- * Solaris 2.1 gives you a protocol error. For now
- * we just ignore it, that fits the spec precisely
- * and avoids incompatibilities. It would be nice in
- * future to drop through and process the data.
- *
- * Now that TTCP is starting to be used we ought to
- * queue this data.
- * But, this leaves one open to an easy denial of
- * service attack, and SYN cookies can't defend
- * against this problem. So, we drop the data
- * in the interest of security over speed unless
- * it's still in use.
- */
- kfree_skb(skb);
+ consume_skb(skb);
return 0;
}
goto discard;
case TCP_SYN_SENT:
+ tp->rx_opt.saw_tstamp = 0;
queued = tcp_rcv_synsent_state_process(sk, skb, th);
if (queued >= 0)
return queued;
@@ -5941,6 +5917,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
return 0;
}
+ tp->rx_opt.saw_tstamp = 0;
req = tp->fastopen_rsk;
if (req) {
WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
@@ -6065,7 +6042,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
(TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) {
tcp_done(sk);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
return 1;
}
@@ -6122,7 +6099,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
if (sk->sk_shutdown & RCV_SHUTDOWN) {
if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
tcp_reset(sk);
return 1;
}
@@ -6142,7 +6119,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
if (!queued) {
discard:
- __kfree_skb(skb);
+ tcp_drop(sk, skb);
}
return 0;
}
@@ -6260,10 +6237,10 @@ static bool tcp_syn_flood_action(const struct sock *sk,
if (net->ipv4.sysctl_tcp_syncookies) {
msg = "Sending cookies";
want_cookie = true;
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
} else
#endif
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
if (!queue->synflood_warned &&
net->ipv4.sysctl_tcp_syncookies != 2 &&
@@ -6324,7 +6301,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
* timeout.
*/
if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
goto drop;
}
@@ -6371,7 +6348,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
if (dst && strict &&
!tcp_peer_is_proven(req, dst, true,
tmp_opt.saw_tstamp)) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
goto drop_and_release;
}
}
@@ -6419,7 +6396,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
}
if (fastopen_sk) {
af_ops->send_synack(fastopen_sk, dst, &fl, req,
- &foc, false);
+ &foc, TCP_SYNACK_FASTOPEN);
/* Add the child socket directly into the accept queue */
inet_csk_reqsk_queue_add(sk, req, fastopen_sk);
sk->sk_data_ready(sk);
@@ -6429,10 +6406,13 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tcp_rsk(req)->tfo_listener = false;
if (!want_cookie)
inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
- af_ops->send_synack(sk, dst, &fl, req,
- &foc, !want_cookie);
- if (want_cookie)
- goto drop_and_free;
+ af_ops->send_synack(sk, dst, &fl, req, &foc,
+ !want_cookie ? TCP_SYNACK_NORMAL :
+ TCP_SYNACK_COOKIE);
+ if (want_cookie) {
+ reqsk_free(req);
+ return 0;
+ }
}
reqsk_put(req);
return 0;
@@ -6442,7 +6422,7 @@ drop_and_release:
drop_and_free:
reqsk_free(req);
drop:
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+ tcp_listendrop(sk);
return 0;
}
EXPORT_SYMBOL(tcp_conn_request);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index dfd153fbd..469942798 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -158,7 +158,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
nexthop = daddr = usin->sin_addr.s_addr;
inet_opt = rcu_dereference_protected(inet->inet_opt,
- sock_owned_by_user(sk));
+ lockdep_sock_is_held(sk));
if (inet_opt && inet_opt->opt.srr) {
if (!daddr)
return -EINVAL;
@@ -336,7 +336,7 @@ void tcp_req_err(struct sock *sk, u32 seq, bool abort)
* an established socket here.
*/
if (seq != tcp_rsk(req)->snt_isn) {
- NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
+ __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
} else if (abort) {
/*
* Still in SYN_RECV, just remove it silently.
@@ -345,7 +345,7 @@ void tcp_req_err(struct sock *sk, u32 seq, bool abort)
* errors returned from accept().
*/
inet_csk_reqsk_queue_drop(req->rsk_listener, req);
- NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
+ tcp_listendrop(req->rsk_listener);
}
reqsk_put(req);
}
@@ -388,7 +388,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
th->dest, iph->saddr, ntohs(th->source),
inet_iif(icmp_skb));
if (!sk) {
- ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
+ __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
return;
}
if (sk->sk_state == TCP_TIME_WAIT) {
@@ -412,13 +412,13 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
*/
if (sock_owned_by_user(sk)) {
if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
- NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
+ __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
}
if (sk->sk_state == TCP_CLOSE)
goto out;
if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
- NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
+ __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
goto out;
}
@@ -429,7 +429,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
if (sk->sk_state != TCP_LISTEN &&
!between(seq, snd_una, tp->snd_nxt)) {
- NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
+ __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
goto out;
}
@@ -644,6 +644,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
+ rcu_read_lock();
hash_location = tcp_parse_md5sig_option(th);
if (sk && sk_fullsock(sk)) {
key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
@@ -662,16 +663,18 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
ntohs(th->source), inet_iif(skb));
/* don't send rst if it can't find key */
if (!sk1)
- return;
- rcu_read_lock();
+ goto out;
+
key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
&ip_hdr(skb)->saddr, AF_INET);
if (!key)
- goto release_sk1;
+ goto out;
+
genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
if (genhash || memcmp(hash_location, newhash, 16) != 0)
- goto release_sk1;
+ goto out;
+
}
if (key) {
@@ -705,20 +708,19 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
offsetof(struct inet_timewait_sock, tw_bound_dev_if));
arg.tos = ip_hdr(skb)->tos;
+ local_bh_disable();
ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
skb, &TCP_SKB_CB(skb)->header.h4.opt,
ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
&arg, arg.iov[0].iov_len);
- TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
- TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
+ __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
+ __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
+ local_bh_enable();
#ifdef CONFIG_TCP_MD5SIG
-release_sk1:
- if (sk1) {
- rcu_read_unlock();
- sock_put(sk1);
- }
+out:
+ rcu_read_unlock();
#endif
}
@@ -790,12 +792,14 @@ static void tcp_v4_send_ack(struct net *net,
if (oif)
arg.bound_dev_if = oif;
arg.tos = tos;
+ local_bh_disable();
ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
skb, &TCP_SKB_CB(skb)->header.h4.opt,
ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
&arg, arg.iov[0].iov_len);
- TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
+ __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
+ local_bh_enable();
}
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
@@ -846,7 +850,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
struct flowi *fl,
struct request_sock *req,
struct tcp_fastopen_cookie *foc,
- bool attach_req)
+ enum tcp_synack_type synack_type)
{
const struct inet_request_sock *ireq = inet_rsk(req);
struct flowi4 fl4;
@@ -857,7 +861,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
return -1;
- skb = tcp_make_synack(sk, dst, req, foc, attach_req);
+ skb = tcp_make_synack(sk, dst, req, foc, synack_type);
if (skb) {
__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
@@ -898,8 +902,7 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
/* caller either holds rcu_read_lock() or socket lock */
md5sig = rcu_dereference_check(tp->md5sig_info,
- sock_owned_by_user(sk) ||
- lockdep_is_held((spinlock_t *)&sk->sk_lock.slock));
+ lockdep_sock_is_held(sk));
if (!md5sig)
return NULL;
#if IS_ENABLED(CONFIG_IPV6)
@@ -944,8 +947,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
}
md5sig = rcu_dereference_protected(tp->md5sig_info,
- sock_owned_by_user(sk) ||
- lockdep_is_held(&sk->sk_lock.slock));
+ lockdep_sock_is_held(sk));
if (!md5sig) {
md5sig = kmalloc(sizeof(*md5sig), gfp);
if (!md5sig)
@@ -1169,12 +1171,12 @@ static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
return false;
if (hash_expected && !hash_location) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
return true;
}
if (!hash_expected && hash_location) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
return true;
}
@@ -1262,7 +1264,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
&tcp_request_sock_ipv4_ops, sk, skb);
drop:
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+ tcp_listendrop(sk);
return 0;
}
EXPORT_SYMBOL(tcp_v4_conn_request);
@@ -1360,11 +1362,11 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
return newsk;
exit_overflow:
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
dst_release(dst);
exit:
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+ tcp_listendrop(sk);
return NULL;
put_and_exit:
inet_csk_prepare_forced_close(newsk);
@@ -1461,8 +1463,8 @@ discard:
return 0;
csum_err:
- TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
- TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
@@ -1535,16 +1537,16 @@ bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
__skb_queue_tail(&tp->ucopy.prequeue, skb);
tp->ucopy.memory += skb->truesize;
- if (tp->ucopy.memory > sk->sk_rcvbuf) {
+ if (skb_queue_len(&tp->ucopy.prequeue) >= 32 ||
+ tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
struct sk_buff *skb1;
BUG_ON(sock_owned_by_user(sk));
+ __NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED,
+ skb_queue_len(&tp->ucopy.prequeue));
- while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) {
+ while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
sk_backlog_rcv(sk, skb1);
- NET_INC_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPPREQUEUEDROPPED);
- }
tp->ucopy.memory = 0;
} else if (skb_queue_len(&tp->ucopy.prequeue) == 1) {
@@ -1565,24 +1567,25 @@ EXPORT_SYMBOL(tcp_prequeue);
int tcp_v4_rcv(struct sk_buff *skb)
{
+ struct net *net = dev_net(skb->dev);
const struct iphdr *iph;
const struct tcphdr *th;
+ bool refcounted;
struct sock *sk;
int ret;
- struct net *net = dev_net(skb->dev);
if (skb->pkt_type != PACKET_HOST)
goto discard_it;
/* Count it even if it's bad */
- TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
+ __TCP_INC_STATS(net, TCP_MIB_INSEGS);
if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
goto discard_it;
- th = tcp_hdr(skb);
+ th = (const struct tcphdr *)skb->data;
- if (th->doff < sizeof(struct tcphdr) / 4)
+ if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
goto bad_packet;
if (!pskb_may_pull(skb, th->doff * 4))
goto discard_it;
@@ -1595,7 +1598,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
goto csum_error;
- th = tcp_hdr(skb);
+ th = (const struct tcphdr *)skb->data;
iph = ip_hdr(skb);
/* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
* barrier() makes sure compiler wont play fool^Waliasing games.
@@ -1615,7 +1618,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
lookup:
sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
- th->dest);
+ th->dest, &refcounted);
if (!sk)
goto no_tcp_socket;
@@ -1636,7 +1639,11 @@ process:
inet_csk_reqsk_queue_drop_and_put(sk, req);
goto lookup;
}
+ /* We own a reference on the listener, increase it again
+ * as we might lose it too soon.
+ */
sock_hold(sk);
+ refcounted = true;
nsk = tcp_check_req(sk, skb, req, false);
if (!nsk) {
reqsk_put(req);
@@ -1653,7 +1660,7 @@ process:
}
}
if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
- NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
+ __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
goto discard_and_relse;
}
@@ -1686,13 +1693,14 @@ process:
} else if (unlikely(sk_add_backlog(sk, skb,
sk->sk_rcvbuf + sk->sk_sndbuf))) {
bh_unlock_sock(sk);
- NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
+ __NET_INC_STATS(net, LINUX_MIB_TCPBACKLOGDROP);
goto discard_and_relse;
}
bh_unlock_sock(sk);
put_and_return:
- sock_put(sk);
+ if (refcounted)
+ sock_put(sk);
return ret;
@@ -1702,9 +1710,9 @@ no_tcp_socket:
if (tcp_checksum_complete(skb)) {
csum_error:
- TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS);
+ __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
- TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
+ __TCP_INC_STATS(net, TCP_MIB_INERRS);
} else {
tcp_v4_send_reset(NULL, skb);
}
@@ -1715,7 +1723,9 @@ discard_it:
return 0;
discard_and_relse:
- sock_put(sk);
+ sk_drops_add(sk, skb);
+ if (refcounted)
+ sock_put(sk);
goto discard_it;
do_time_wait:
@@ -1739,6 +1749,7 @@ do_time_wait:
if (sk2) {
inet_twsk_deschedule_put(inet_twsk(sk));
sk = sk2;
+ refcounted = false;
goto process;
}
/* Fall through to ACK */
@@ -1855,7 +1866,9 @@ void tcp_v4_destroy_sock(struct sock *sk)
tcp_free_fastopen_req(tp);
tcp_saved_syn_free(tp);
+ local_bh_disable();
sk_sockets_allocated_dec(sk);
+ local_bh_enable();
if (mem_cgroup_sockets_enabled && sk->sk_memcg)
sock_release_memcg(sk);
@@ -1872,17 +1885,17 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock);
*/
static void *listening_get_next(struct seq_file *seq, void *cur)
{
- struct inet_connection_sock *icsk;
- struct hlist_nulls_node *node;
- struct sock *sk = cur;
- struct inet_listen_hashbucket *ilb;
struct tcp_iter_state *st = seq->private;
struct net *net = seq_file_net(seq);
+ struct inet_listen_hashbucket *ilb;
+ struct inet_connection_sock *icsk;
+ struct sock *sk = cur;
if (!sk) {
+get_head:
ilb = &tcp_hashinfo.listening_hash[st->bucket];
spin_lock_bh(&ilb->lock);
- sk = sk_nulls_head(&ilb->head);
+ sk = sk_head(&ilb->head);
st->offset = 0;
goto get_sk;
}
@@ -1890,28 +1903,20 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
++st->num;
++st->offset;
- sk = sk_nulls_next(sk);
+ sk = sk_next(sk);
get_sk:
- sk_nulls_for_each_from(sk, node) {
+ sk_for_each_from(sk) {
if (!net_eq(sock_net(sk), net))
continue;
- if (sk->sk_family == st->family) {
- cur = sk;
- goto out;
- }
+ if (sk->sk_family == st->family)
+ return sk;
icsk = inet_csk(sk);
}
spin_unlock_bh(&ilb->lock);
st->offset = 0;
- if (++st->bucket < INET_LHTABLE_SIZE) {
- ilb = &tcp_hashinfo.listening_hash[st->bucket];
- spin_lock_bh(&ilb->lock);
- sk = sk_nulls_head(&ilb->head);
- goto get_sk;
- }
- cur = NULL;
-out:
- return cur;
+ if (++st->bucket < INET_LHTABLE_SIZE)
+ goto get_head;
+ return NULL;
}
static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
@@ -2410,6 +2415,7 @@ static int __net_init tcp_sk_init(struct net *net)
IPPROTO_TCP, net);
if (res)
goto fail;
+ sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
}
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
index 1e70fa8fa..c67ece139 100644
--- a/net/ipv4/tcp_lp.c
+++ b/net/ipv4/tcp_lp.c
@@ -260,13 +260,13 @@ static void tcp_lp_rtt_sample(struct sock *sk, u32 rtt)
* newReno in increase case.
* We work it out by following the idea from TCP-LP's paper directly
*/
-static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked, s32 rtt_us)
+static void tcp_lp_pkts_acked(struct sock *sk, const struct ack_sample *sample)
{
struct tcp_sock *tp = tcp_sk(sk);
struct lp *lp = inet_csk_ca(sk);
- if (rtt_us > 0)
- tcp_lp_rtt_sample(sk, rtt_us);
+ if (sample->rtt_us > 0)
+ tcp_lp_rtt_sample(sk, sample->rtt_us);
/* calc inference */
if (tcp_time_stamp > tp->rx_opt.rcv_tsecr)
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 7b7eec439..b617826e2 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -800,7 +800,8 @@ static int tcp_metrics_fill_info(struct sk_buff *msg,
}
if (nla_put_msecs(msg, TCP_METRICS_ATTR_AGE,
- jiffies - tm->tcpm_stamp) < 0)
+ jiffies - tm->tcpm_stamp,
+ TCP_METRICS_ATTR_PAD) < 0)
goto nla_put_failure;
if (tm->tcpm_ts_stamp) {
if (nla_put_s32(msg, TCP_METRICS_ATTR_TW_TS_STAMP,
@@ -864,7 +865,8 @@ static int tcp_metrics_fill_info(struct sk_buff *msg,
(nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROPS,
tfom->syn_loss) < 0 ||
nla_put_msecs(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS,
- jiffies - tfom->last_syn_loss) < 0))
+ jiffies - tfom->last_syn_loss,
+ TCP_METRICS_ATTR_PAD) < 0))
goto nla_put_failure;
if (tfom->cookie.len > 0 &&
nla_put(msg, TCP_METRICS_ATTR_FOPEN_COOKIE,
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index acb366dd6..4b95ec4ed 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -235,7 +235,7 @@ kill:
}
if (paws_reject)
- NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED);
+ __NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED);
if (!th->rst) {
/* In this case we must reset the TIMEWAIT timer.
@@ -337,7 +337,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
* socket up. We've got bigger problems than
* non-graceful socket closings.
*/
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW);
}
tcp_update_metrics(sk);
@@ -545,7 +545,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->rack.mstamp.v64 = 0;
newtp->rack.advanced = 0;
- TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS);
+ __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);
}
return newsk;
}
@@ -704,10 +704,13 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rsk_rcv_wnd)) {
/* Out of window: send ACK and drop. */
- if (!(flg & TCP_FLAG_RST))
+ if (!(flg & TCP_FLAG_RST) &&
+ !tcp_oow_rate_limited(sock_net(sk), skb,
+ LINUX_MIB_TCPACKSKIPPEDSYNRECV,
+ &tcp_rsk(req)->last_oow_ack_time))
req->rsk_ops->send_ack(sk, skb, req);
if (paws_reject)
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
return NULL;
}
@@ -726,7 +729,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
* "fourth, check the SYN bit"
*/
if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) {
- TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
+ __TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
goto embryonic_reset;
}
@@ -749,7 +752,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
if (req->num_timeout < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
inet_rsk(req)->acked = 1;
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
return NULL;
}
@@ -788,7 +791,7 @@ embryonic_reset:
}
if (!fastopen) {
inet_csk_reqsk_queue_drop(sk, req);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
}
return NULL;
}
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 773083b7f..5c5964962 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -83,23 +83,6 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
/* Packet is from an untrusted source, reset gso_segs. */
- int type = skb_shinfo(skb)->gso_type;
-
- if (unlikely(type &
- ~(SKB_GSO_TCPV4 |
- SKB_GSO_DODGY |
- SKB_GSO_TCP_ECN |
- SKB_GSO_TCPV6 |
- SKB_GSO_GRE |
- SKB_GSO_GRE_CSUM |
- SKB_GSO_IPIP |
- SKB_GSO_SIT |
- SKB_GSO_UDP_TUNNEL |
- SKB_GSO_UDP_TUNNEL_CSUM |
- SKB_GSO_TUNNEL_REMCSUM |
- 0) ||
- !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))))
- goto out;
skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
@@ -107,6 +90,12 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
goto out;
}
+ /* GSO partial only requires splitting the frame into an MSS
+ * multiple and possibly a remainder. So update the mss now.
+ */
+ if (features & NETIF_F_GSO_PARTIAL)
+ mss = skb->len - (skb->len % mss);
+
copy_destructor = gso_skb->destructor == tcp_wfree;
ooo_okay = gso_skb->ooo_okay;
/* All segments but the first should have ooo_okay cleared */
@@ -131,7 +120,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
newcheck = ~csum_fold((__force __wsum)((__force u32)th->check +
(__force u32)delta));
- do {
+ while (skb->next) {
th->fin = th->psh = 0;
th->check = newcheck;
@@ -151,7 +140,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
th->seq = htonl(seq);
th->cwr = 0;
- } while (skb->next);
+ }
/* Following permits TCP Small Queues to work well with GSO :
* The callback to TCP stack will be called at the time last frag
@@ -237,7 +226,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
found:
/* Include the IP ID check below from the inner most IP hdr */
- flush = NAPI_GRO_CB(p)->flush | NAPI_GRO_CB(p)->flush_id;
+ flush = NAPI_GRO_CB(p)->flush;
flush |= (__force int)(flags & TCP_FLAG_CWR);
flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH));
@@ -246,6 +235,17 @@ found:
flush |= *(u32 *)((u8 *)th + i) ^
*(u32 *)((u8 *)th2 + i);
+ /* When we receive our second frame we can made a decision on if we
+ * continue this flow as an atomic flow with a fixed ID or if we use
+ * an incrementing ID.
+ */
+ if (NAPI_GRO_CB(p)->flush_id != 1 ||
+ NAPI_GRO_CB(p)->count != 1 ||
+ !NAPI_GRO_CB(p)->is_atomic)
+ flush |= NAPI_GRO_CB(p)->flush_id;
+ else
+ NAPI_GRO_CB(p)->is_atomic = false;
+
mss = skb_shinfo(p)->gso_size;
flush |= (len - 1) >= mss;
@@ -314,6 +314,9 @@ static int tcp4_gro_complete(struct sk_buff *skb, int thoff)
iph->daddr, 0);
skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4;
+ if (NAPI_GRO_CB(skb)->is_atomic)
+ skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_FIXEDID;
+
return tcp_gro_complete(skb);
}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index c0c1dac81..30a7dd4f6 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -236,7 +236,8 @@ void tcp_select_initial_window(int __space, __u32 mss,
/* Set window scaling on max possible window
* See RFC1323 for an explanation of the limit to 14
*/
- space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max);
+ space = max_t(u32, space, sysctl_tcp_rmem[2]);
+ space = max_t(u32, space, sysctl_rmem_max);
space = min_t(u32, space, *window_clamp);
while (space > 65535 && (*rcv_wscale) < 14) {
space >>= 1;
@@ -364,7 +365,7 @@ tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th)
* be sent.
*/
static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
- int tcp_header_len)
+ struct tcphdr *th, int tcp_header_len)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -375,7 +376,7 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
INET_ECN_xmit(sk);
if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) {
tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
- tcp_hdr(skb)->cwr = 1;
+ th->cwr = 1;
skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
}
} else if (!tcp_ca_needs_ecn(sk)) {
@@ -383,7 +384,7 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
INET_ECN_dontxmit(sk);
}
if (tp->ecn_flags & TCP_ECN_DEMAND_CWR)
- tcp_hdr(skb)->ece = 1;
+ th->ece = 1;
}
}
@@ -956,12 +957,12 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
skb_orphan(skb);
skb->sk = sk;
- skb->destructor = skb_is_tcp_pure_ack(skb) ? sock_wfree : tcp_wfree;
+ skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree;
skb_set_hash_from_sk(skb, sk);
atomic_add(skb->truesize, &sk->sk_wmem_alloc);
/* Build TCP header and checksum it. */
- th = tcp_hdr(skb);
+ th = (struct tcphdr *)skb->data;
th->source = inet->inet_sport;
th->dest = inet->inet_dport;
th->seq = htonl(tcb->seq);
@@ -969,14 +970,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
*(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) |
tcb->tcp_flags);
- if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
- /* RFC1323: The window in SYN & SYN/ACK segments
- * is never scaled.
- */
- th->window = htons(min(tp->rcv_wnd, 65535U));
- } else {
- th->window = htons(tcp_select_window(sk));
- }
th->check = 0;
th->urg_ptr = 0;
@@ -993,9 +986,15 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
tcp_options_write((__be32 *)(th + 1), tp, &opts);
skb_shinfo(skb)->gso_type = sk->sk_gso_type;
- if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0))
- tcp_ecn_send(sk, skb, tcp_header_size);
-
+ if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) {
+ th->window = htons(tcp_select_window(sk));
+ tcp_ecn_send(sk, skb, th, tcp_header_size);
+ } else {
+ /* RFC1323: The window in SYN & SYN/ACK segments
+ * is never scaled.
+ */
+ th->window = htons(min(tp->rcv_wnd, 65535U));
+ }
#ifdef CONFIG_TCP_MD5SIG
/* Calculate the MD5 hash, as we have all we need now */
if (md5) {
@@ -1118,11 +1117,17 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de
tcp_verify_left_out(tp);
}
+static bool tcp_has_tx_tstamp(const struct sk_buff *skb)
+{
+ return TCP_SKB_CB(skb)->txstamp_ack ||
+ (skb_shinfo(skb)->tx_flags & SKBTX_ANY_TSTAMP);
+}
+
static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2)
{
struct skb_shared_info *shinfo = skb_shinfo(skb);
- if (unlikely(shinfo->tx_flags & SKBTX_ANY_TSTAMP) &&
+ if (unlikely(tcp_has_tx_tstamp(skb)) &&
!before(shinfo->tskey, TCP_SKB_CB(skb2)->seq)) {
struct skb_shared_info *shinfo2 = skb_shinfo(skb2);
u8 tsflags = shinfo->tx_flags & SKBTX_ANY_TSTAMP;
@@ -1130,9 +1135,17 @@ static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2)
shinfo->tx_flags &= ~tsflags;
shinfo2->tx_flags |= tsflags;
swap(shinfo->tskey, shinfo2->tskey);
+ TCP_SKB_CB(skb2)->txstamp_ack = TCP_SKB_CB(skb)->txstamp_ack;
+ TCP_SKB_CB(skb)->txstamp_ack = 0;
}
}
+static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2)
+{
+ TCP_SKB_CB(skb2)->eor = TCP_SKB_CB(skb)->eor;
+ TCP_SKB_CB(skb)->eor = 0;
+}
+
/* Function to create two new TCP segments. Shrinks the given segment
* to the specified size and appends a new segment with the rest of the
* packet to the list. This won't be called frequently, I hope.
@@ -1178,6 +1191,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
TCP_SKB_CB(buff)->tcp_flags = flags;
TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
+ tcp_skb_fragment_eor(skb, buff);
if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) {
/* Copy and checksum data tail into the new buffer. */
@@ -1738,6 +1752,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
/* This packet was never sent out yet, so no SACK bits. */
TCP_SKB_CB(buff)->sacked = 0;
+ tcp_skb_fragment_eor(skb, buff);
+
buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL;
skb_split(skb, buff, len);
tcp_fragment_tstamp(skb, buff);
@@ -2211,14 +2227,13 @@ bool tcp_schedule_loss_probe(struct sock *sk)
/* Thanks to skb fast clones, we can detect if a prior transmit of
* a packet is still in a qdisc or driver queue.
* In this case, there is very little point doing a retransmit !
- * Note: This is called from BH context only.
*/
static bool skb_still_in_host_queue(const struct sock *sk,
const struct sk_buff *skb)
{
if (unlikely(skb_fclone_busy(sk, skb))) {
- NET_INC_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
return true;
}
return false;
@@ -2273,14 +2288,14 @@ void tcp_send_loss_probe(struct sock *sk)
if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
goto rearm_timer;
- if (__tcp_retransmit_skb(sk, skb))
+ if (__tcp_retransmit_skb(sk, skb, 1))
goto rearm_timer;
/* Record snd_nxt for loss detection. */
tp->tlp_high_seq = tp->snd_nxt;
probe_sent:
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSPROBES);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBES);
/* Reset s.t. tcp_rearm_rto will restart timer from now */
inet_csk(sk)->icsk_pending = 0;
rearm_timer:
@@ -2451,14 +2466,15 @@ u32 __tcp_select_window(struct sock *sk)
void tcp_skb_collapse_tstamp(struct sk_buff *skb,
const struct sk_buff *next_skb)
{
- const struct skb_shared_info *next_shinfo = skb_shinfo(next_skb);
- u8 tsflags = next_shinfo->tx_flags & SKBTX_ANY_TSTAMP;
-
- if (unlikely(tsflags)) {
+ if (unlikely(tcp_has_tx_tstamp(next_skb))) {
+ const struct skb_shared_info *next_shinfo =
+ skb_shinfo(next_skb);
struct skb_shared_info *shinfo = skb_shinfo(skb);
- shinfo->tx_flags |= tsflags;
+ shinfo->tx_flags |= next_shinfo->tx_flags & SKBTX_ANY_TSTAMP;
shinfo->tskey = next_shinfo->tskey;
+ TCP_SKB_CB(skb)->txstamp_ack |=
+ TCP_SKB_CB(next_skb)->txstamp_ack;
}
}
@@ -2497,6 +2513,7 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
* packet counting does not break.
*/
TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS;
+ TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor;
/* changed transmit queue under us so clear hints */
tcp_clear_retrans_hints_partial(tp);
@@ -2548,6 +2565,9 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
if (!tcp_can_collapse(sk, skb))
break;
+ if (!tcp_skb_can_collapse_to(to))
+ break;
+
space -= skb->len;
if (first) {
@@ -2574,17 +2594,17 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
* state updates are done by the caller. Returns non-zero if an
* error occurred which prevented the send.
*/
-int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
+int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
{
- struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
unsigned int cur_mss;
- int err;
+ int diff, len, err;
- /* Inconslusive MTU probe */
- if (icsk->icsk_mtup.probe_size) {
+
+ /* Inconclusive MTU probe */
+ if (icsk->icsk_mtup.probe_size)
icsk->icsk_mtup.probe_size = 0;
- }
/* Do not sent more than we queued. 1/4 is reserved for possible
* copying overhead: fragmentation, tunneling, mangling etc.
@@ -2617,30 +2637,27 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
TCP_SKB_CB(skb)->seq != tp->snd_una)
return -EAGAIN;
- if (skb->len > cur_mss) {
- if (tcp_fragment(sk, skb, cur_mss, cur_mss, GFP_ATOMIC))
+ len = cur_mss * segs;
+ if (skb->len > len) {
+ if (tcp_fragment(sk, skb, len, cur_mss, GFP_ATOMIC))
return -ENOMEM; /* We'll try again later. */
} else {
- int oldpcount = tcp_skb_pcount(skb);
+ if (skb_unclone(skb, GFP_ATOMIC))
+ return -ENOMEM;
- if (unlikely(oldpcount > 1)) {
- if (skb_unclone(skb, GFP_ATOMIC))
- return -ENOMEM;
- tcp_init_tso_segs(skb, cur_mss);
- tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb));
- }
+ diff = tcp_skb_pcount(skb);
+ tcp_set_skb_tso_segs(skb, cur_mss);
+ diff -= tcp_skb_pcount(skb);
+ if (diff)
+ tcp_adjust_pcount(sk, skb, diff);
+ if (skb->len < cur_mss)
+ tcp_retrans_try_collapse(sk, skb, cur_mss);
}
/* RFC3168, section 6.1.1.1. ECN fallback */
if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
tcp_ecn_clear_syn(sk, skb);
- tcp_retrans_try_collapse(sk, skb, cur_mss);
-
- /* Make a copy, if the first transmission SKB clone we made
- * is still in somebody's hands, else make a clone.
- */
-
/* make sure skb->data is aligned on arches that require it
* and check if ack-trimming & collapsing extended the headroom
* beyond what csum_start can cover.
@@ -2658,20 +2675,22 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
}
if (likely(!err)) {
+ segs = tcp_skb_pcount(skb);
+
TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
/* Update global TCP statistics. */
- TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
+ TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs);
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
- tp->total_retrans++;
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
+ tp->total_retrans += segs;
}
return err;
}
-int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
+int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
{
struct tcp_sock *tp = tcp_sk(sk);
- int err = __tcp_retransmit_skb(sk, skb);
+ int err = __tcp_retransmit_skb(sk, skb, segs);
if (err == 0) {
#if FASTRETRANS_DEBUG > 0
@@ -2687,7 +2706,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
tp->retrans_stamp = tcp_skb_timestamp(skb);
} else if (err != -EBUSY) {
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
}
if (tp->undo_retrans < 0)
@@ -2740,7 +2759,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
struct sk_buff *hole = NULL;
- u32 last_lost;
+ u32 max_segs, last_lost;
int mib_idx;
int fwd_rexmitting = 0;
@@ -2760,8 +2779,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
last_lost = tp->snd_una;
}
+ max_segs = tcp_tso_autosize(sk, tcp_current_mss(sk));
tcp_for_write_queue_from(skb, sk) {
__u8 sacked = TCP_SKB_CB(skb)->sacked;
+ int segs;
if (skb == tcp_send_head(sk))
break;
@@ -2769,15 +2790,13 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
if (!hole)
tp->retransmit_skb_hint = skb;
- /* Assume this retransmit will generate
- * only one packet for congestion window
- * calculation purposes. This works because
- * tcp_retransmit_skb() will chop up the
- * packet to be MSS sized and all the
- * packet counting works out.
- */
- if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
+ segs = tp->snd_cwnd - tcp_packets_in_flight(tp);
+ if (segs <= 0)
return;
+ /* In case tcp_shift_skb_data() have aggregated large skbs,
+ * we need to make sure not sending too bigs TSO packets
+ */
+ segs = min_t(int, segs, max_segs);
if (fwd_rexmitting) {
begin_fwd:
@@ -2813,10 +2832,10 @@ begin_fwd:
if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
continue;
- if (tcp_retransmit_skb(sk, skb))
+ if (tcp_retransmit_skb(sk, skb, segs))
return;
- NET_INC_STATS_BH(sock_net(sk), mib_idx);
+ NET_INC_STATS(sock_net(sk), mib_idx);
if (tcp_in_cwnd_reduction(sk))
tp->prr_out += tcp_skb_pcount(skb);
@@ -2969,7 +2988,7 @@ int tcp_send_synack(struct sock *sk)
struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
struct request_sock *req,
struct tcp_fastopen_cookie *foc,
- bool attach_req)
+ enum tcp_synack_type synack_type)
{
struct inet_request_sock *ireq = inet_rsk(req);
const struct tcp_sock *tp = tcp_sk(sk);
@@ -2989,14 +3008,22 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
/* Reserve space for headers. */
skb_reserve(skb, MAX_TCP_HEADER);
- if (attach_req) {
+ switch (synack_type) {
+ case TCP_SYNACK_NORMAL:
skb_set_owner_w(skb, req_to_sk(req));
- } else {
+ break;
+ case TCP_SYNACK_COOKIE:
+ /* Under synflood, we do not attach skb to a socket,
+ * to avoid false sharing.
+ */
+ break;
+ case TCP_SYNACK_FASTOPEN:
/* sk is a const pointer, because we want to express multiple
* cpu might call us concurrently.
* sk->sk_wmem_alloc in an atomic, we can promote to rw.
*/
skb_set_owner_w(skb, (struct sock *)sk);
+ break;
}
skb_dst_set(skb, dst);
@@ -3024,7 +3051,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
skb_push(skb, tcp_header_size);
skb_reset_transport_header(skb);
- th = tcp_hdr(skb);
+ th = (struct tcphdr *)skb->data;
memset(th, 0, sizeof(struct tcphdr));
th->syn = 1;
th->ack = 1;
@@ -3045,7 +3072,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
th->window = htons(min(req->rsk_rcv_wnd, 65535U));
tcp_options_write((__be32 *)(th + 1), NULL, &opts);
th->doff = (tcp_header_size >> 2);
- TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS);
+ __TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
#ifdef CONFIG_TCP_MD5SIG
/* Okay, we have all we need - do the md5 hash if needed */
@@ -3549,10 +3576,10 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
int res;
tcp_rsk(req)->txhash = net_tx_rndhash();
- res = af_ops->send_synack(sk, NULL, &fl, req, NULL, true);
+ res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL);
if (!res) {
- TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
+ __TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
}
return res;
}
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index 5353085fd..e36df4fcf 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -65,8 +65,8 @@ int tcp_rack_mark_lost(struct sock *sk)
if (scb->sacked & TCPCB_SACKED_RETRANS) {
scb->sacked &= ~TCPCB_SACKED_RETRANS;
tp->retrans_out -= tcp_skb_pcount(skb);
- NET_INC_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPLOSTRETRANSMIT);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPLOSTRETRANSMIT);
}
} else if (!(scb->sacked & TCPCB_RETRANS)) {
/* Original data are sent sequentially so stop early
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 49bc474f8..debdd8b33 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -30,7 +30,7 @@ static void tcp_write_err(struct sock *sk)
sk->sk_error_report(sk);
tcp_done(sk);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT);
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT);
}
/* Do not allow orphaned sockets to eat all our resources.
@@ -68,7 +68,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset)
if (do_reset)
tcp_send_active_reset(sk, GFP_ATOMIC);
tcp_done(sk);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY);
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY);
return 1;
}
return 0;
@@ -162,8 +162,8 @@ static int tcp_write_timeout(struct sock *sk)
if (tp->syn_fastopen || tp->syn_data)
tcp_fastopen_cache_set(sk, 0, NULL, true, 0);
if (tp->syn_data && icsk->icsk_retransmits == 1)
- NET_INC_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPFASTOPENACTIVEFAIL);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENACTIVEFAIL);
}
retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
syn_set = true;
@@ -178,8 +178,8 @@ static int tcp_write_timeout(struct sock *sk)
tp->bytes_acked <= tp->rx_opt.mss_clamp) {
tcp_fastopen_cache_set(sk, 0, NULL, true, 0);
if (icsk->icsk_retransmits == net->ipv4.sysctl_tcp_retries1)
- NET_INC_STATS_BH(sock_net(sk),
- LINUX_MIB_TCPFASTOPENACTIVEFAIL);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENACTIVEFAIL);
}
/* Black hole detection */
tcp_mtu_probing(icsk, sk);
@@ -209,6 +209,7 @@ static int tcp_write_timeout(struct sock *sk)
return 0;
}
+/* Called with BH disabled */
void tcp_delack_timer_handler(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -228,7 +229,7 @@ void tcp_delack_timer_handler(struct sock *sk)
if (!skb_queue_empty(&tp->ucopy.prequeue)) {
struct sk_buff *skb;
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSCHEDULERFAILED);
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSCHEDULERFAILED);
while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
sk_backlog_rcv(sk, skb);
@@ -248,7 +249,7 @@ void tcp_delack_timer_handler(struct sock *sk)
icsk->icsk_ack.ato = TCP_ATO_MIN;
}
tcp_send_ack(sk);
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKS);
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS);
}
out:
@@ -265,7 +266,7 @@ static void tcp_delack_timer(unsigned long data)
tcp_delack_timer_handler(sk);
} else {
inet_csk(sk)->icsk_ack.blocked = 1;
- NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
/* deleguate our work to tcp_release_cb() */
if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
sock_hold(sk);
@@ -404,7 +405,7 @@ void tcp_retransmit_timer(struct sock *sk)
goto out;
}
tcp_enter_loss(sk);
- tcp_retransmit_skb(sk, tcp_write_queue_head(sk));
+ tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1);
__sk_dst_reset(sk);
goto out_reset_timer;
}
@@ -431,12 +432,12 @@ void tcp_retransmit_timer(struct sock *sk)
} else {
mib_idx = LINUX_MIB_TCPTIMEOUTS;
}
- NET_INC_STATS_BH(sock_net(sk), mib_idx);
+ __NET_INC_STATS(sock_net(sk), mib_idx);
}
tcp_enter_loss(sk);
- if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) {
+ if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1) > 0) {
/* Retransmission failed because of local congestion,
* do not backoff.
*/
@@ -493,6 +494,7 @@ out_reset_timer:
out:;
}
+/* Called with BH disabled */
void tcp_write_timer_handler(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
@@ -549,7 +551,7 @@ void tcp_syn_ack_timeout(const struct request_sock *req)
{
struct net *net = read_pnet(&inet_rsk(req)->ireq_net);
- NET_INC_STATS_BH(net, LINUX_MIB_TCPTIMEOUTS);
+ __NET_INC_STATS(net, LINUX_MIB_TCPTIMEOUTS);
}
EXPORT_SYMBOL(tcp_syn_ack_timeout);
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index 13951c408..4c4bac1b5 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -107,16 +107,16 @@ EXPORT_SYMBOL_GPL(tcp_vegas_init);
* o min-filter RTT samples from a much longer window (forever for now)
* to find the propagation delay (baseRTT)
*/
-void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us)
+void tcp_vegas_pkts_acked(struct sock *sk, const struct ack_sample *sample)
{
struct vegas *vegas = inet_csk_ca(sk);
u32 vrtt;
- if (rtt_us < 0)
+ if (sample->rtt_us < 0)
return;
/* Never allow zero rtt or baseRTT */
- vrtt = rtt_us + 1;
+ vrtt = sample->rtt_us + 1;
/* Filter to find propagation delay: */
if (vrtt < vegas->baseRTT)
diff --git a/net/ipv4/tcp_vegas.h b/net/ipv4/tcp_vegas.h
index ef9da5306..248cfc0ff 100644
--- a/net/ipv4/tcp_vegas.h
+++ b/net/ipv4/tcp_vegas.h
@@ -17,7 +17,7 @@ struct vegas {
void tcp_vegas_init(struct sock *sk);
void tcp_vegas_state(struct sock *sk, u8 ca_state);
-void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us);
+void tcp_vegas_pkts_acked(struct sock *sk, const struct ack_sample *sample);
void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event);
size_t tcp_vegas_get_info(struct sock *sk, u32 ext, int *attr,
union tcp_cc_info *info);
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index 0d094b995..40171e163 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -69,16 +69,17 @@ static void tcp_veno_init(struct sock *sk)
}
/* Do rtt sampling needed for Veno. */
-static void tcp_veno_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us)
+static void tcp_veno_pkts_acked(struct sock *sk,
+ const struct ack_sample *sample)
{
struct veno *veno = inet_csk_ca(sk);
u32 vrtt;
- if (rtt_us < 0)
+ if (sample->rtt_us < 0)
return;
/* Never allow zero rtt or baseRTT */
- vrtt = rtt_us + 1;
+ vrtt = sample->rtt_us + 1;
/* Filter to find propagation delay: */
if (vrtt < veno->basertt)
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index c10732e39..4b03a2e2a 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -99,12 +99,13 @@ static void westwood_filter(struct westwood *w, u32 delta)
* Called after processing group of packets.
* but all westwood needs is the last sample of srtt.
*/
-static void tcp_westwood_pkts_acked(struct sock *sk, u32 cnt, s32 rtt)
+static void tcp_westwood_pkts_acked(struct sock *sk,
+ const struct ack_sample *sample)
{
struct westwood *w = inet_csk_ca(sk);
- if (rtt > 0)
- w->rtt = usecs_to_jiffies(rtt);
+ if (sample->rtt_us > 0)
+ w->rtt = usecs_to_jiffies(sample->rtt_us);
}
/*
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index 3e6a472e6..028eb046e 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -56,15 +56,16 @@ static void tcp_yeah_init(struct sock *sk)
tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
}
-static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, s32 rtt_us)
+static void tcp_yeah_pkts_acked(struct sock *sk,
+ const struct ack_sample *sample)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct yeah *yeah = inet_csk_ca(sk);
if (icsk->icsk_ca_state == TCP_CA_Open)
- yeah->pkts_acked = pkts_acked;
+ yeah->pkts_acked = sample->pkts_acked;
- tcp_vegas_pkts_acked(sk, pkts_acked, rtt_us);
+ tcp_vegas_pkts_acked(sk, sample);
}
static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index e9853dff7..e61f7cd65 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -143,10 +143,9 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
unsigned int log)
{
struct sock *sk2;
- struct hlist_nulls_node *node;
kuid_t uid = sock_i_uid(sk);
- sk_nulls_for_each(sk2, node, &hslot->head) {
+ sk_for_each(sk2, &hslot->head) {
if (net_eq(sock_net(sk2), net) &&
sk2 != sk &&
(bitmap || udp_sk(sk2)->udp_port_hash == num) &&
@@ -177,12 +176,11 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
bool match_wildcard))
{
struct sock *sk2;
- struct hlist_nulls_node *node;
kuid_t uid = sock_i_uid(sk);
int res = 0;
spin_lock(&hslot2->lock);
- udp_portaddr_for_each_entry(sk2, node, &hslot2->head) {
+ udp_portaddr_for_each_entry(sk2, &hslot2->head) {
if (net_eq(sock_net(sk2), net) &&
sk2 != sk &&
(udp_sk(sk2)->udp_port_hash == num) &&
@@ -207,11 +205,10 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot,
bool match_wildcard))
{
struct net *net = sock_net(sk);
- struct hlist_nulls_node *node;
kuid_t uid = sock_i_uid(sk);
struct sock *sk2;
- sk_nulls_for_each(sk2, node, &hslot->head) {
+ sk_for_each(sk2, &hslot->head) {
if (net_eq(sock_net(sk2), net) &&
sk2 != sk &&
sk2->sk_family == sk->sk_family &&
@@ -333,22 +330,23 @@ found:
goto fail_unlock;
}
- sk_nulls_add_node_rcu(sk, &hslot->head);
+ sk_add_node_rcu(sk, &hslot->head);
hslot->count++;
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
spin_lock(&hslot2->lock);
if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
- sk->sk_family == AF_INET6)
- hlist_nulls_add_tail_rcu(&udp_sk(sk)->udp_portaddr_node,
- &hslot2->head);
+ sk->sk_family == AF_INET6)
+ hlist_add_tail_rcu(&udp_sk(sk)->udp_portaddr_node,
+ &hslot2->head);
else
- hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
- &hslot2->head);
+ hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
+ &hslot2->head);
hslot2->count++;
spin_unlock(&hslot2->lock);
}
+ sock_set_flag(sk, SOCK_RCU_FREE);
error = 0;
fail_unlock:
spin_unlock_bh(&hslot->lock);
@@ -393,9 +391,9 @@ int udp_v4_get_port(struct sock *sk, unsigned short snum)
return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal, hash2_nulladdr);
}
-static inline int compute_score(struct sock *sk, struct net *net,
- __be32 saddr, unsigned short hnum, __be16 sport,
- __be32 daddr, __be16 dport, int dif)
+static int compute_score(struct sock *sk, struct net *net,
+ __be32 saddr, __be16 sport,
+ __be32 daddr, unsigned short hnum, int dif)
{
int score;
struct inet_sock *inet;
@@ -436,52 +434,6 @@ static inline int compute_score(struct sock *sk, struct net *net,
return score;
}
-/*
- * In this second variant, we check (daddr, dport) matches (inet_rcv_sadd, inet_num)
- */
-static inline int compute_score2(struct sock *sk, struct net *net,
- __be32 saddr, __be16 sport,
- __be32 daddr, unsigned int hnum, int dif)
-{
- int score;
- struct inet_sock *inet;
-
- if (!net_eq(sock_net(sk), net) ||
- ipv6_only_sock(sk))
- return -1;
-
- inet = inet_sk(sk);
-
- if (inet->inet_rcv_saddr != daddr ||
- inet->inet_num != hnum)
- return -1;
-
- score = (sk->sk_family == PF_INET) ? 2 : 1;
-
- if (inet->inet_daddr) {
- if (inet->inet_daddr != saddr)
- return -1;
- score += 4;
- }
-
- if (inet->inet_dport) {
- if (inet->inet_dport != sport)
- return -1;
- score += 4;
- }
-
- if (sk->sk_bound_dev_if) {
- if (sk->sk_bound_dev_if != dif)
- return -1;
- score += 4;
- }
-
- if (sk->sk_incoming_cpu == raw_smp_processor_id())
- score++;
-
- return score;
-}
-
static u32 udp_ehashfn(const struct net *net, const __be32 laddr,
const __u16 lport, const __be32 faddr,
const __be16 fport)
@@ -494,45 +446,35 @@ static u32 udp_ehashfn(const struct net *net, const __be32 laddr,
udp_ehash_secret + net_hash_mix(net));
}
-/* called with read_rcu_lock() */
+/* called with rcu_read_lock() */
static struct sock *udp4_lib_lookup2(struct net *net,
__be32 saddr, __be16 sport,
__be32 daddr, unsigned int hnum, int dif,
- struct udp_hslot *hslot2, unsigned int slot2,
+ struct udp_hslot *hslot2,
struct sk_buff *skb)
{
struct sock *sk, *result;
- struct hlist_nulls_node *node;
int score, badness, matches = 0, reuseport = 0;
- bool select_ok = true;
u32 hash = 0;
-begin:
result = NULL;
badness = 0;
- udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) {
- score = compute_score2(sk, net, saddr, sport,
+ udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
+ score = compute_score(sk, net, saddr, sport,
daddr, hnum, dif);
if (score > badness) {
- result = sk;
- badness = score;
reuseport = sk->sk_reuseport;
if (reuseport) {
hash = udp_ehashfn(net, daddr, hnum,
saddr, sport);
- if (select_ok) {
- struct sock *sk2;
-
- sk2 = reuseport_select_sock(sk, hash, skb,
+ result = reuseport_select_sock(sk, hash, skb,
sizeof(struct udphdr));
- if (sk2) {
- result = sk2;
- select_ok = false;
- goto found;
- }
- }
+ if (result)
+ return result;
matches = 1;
}
+ badness = score;
+ result = sk;
} else if (score == badness && reuseport) {
matches++;
if (reciprocal_scale(hash, matches) == 0)
@@ -540,23 +482,6 @@ begin:
hash = next_pseudo_random32(hash);
}
}
- /*
- * if the nulls value we got at the end of this lookup is
- * not the expected one, we must restart lookup.
- * We probably met an item that was moved to another chain.
- */
- if (get_nulls_value(node) != slot2)
- goto begin;
- if (result) {
-found:
- if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
- result = NULL;
- else if (unlikely(compute_score2(result, net, saddr, sport,
- daddr, hnum, dif) < badness)) {
- sock_put(result);
- goto begin;
- }
- }
return result;
}
@@ -568,15 +493,12 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
int dif, struct udp_table *udptable, struct sk_buff *skb)
{
struct sock *sk, *result;
- struct hlist_nulls_node *node;
unsigned short hnum = ntohs(dport);
unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
int score, badness, matches = 0, reuseport = 0;
- bool select_ok = true;
u32 hash = 0;
- rcu_read_lock();
if (hslot->count > 10) {
hash2 = udp4_portaddr_hash(net, daddr, hnum);
slot2 = hash2 & udptable->mask;
@@ -586,47 +508,44 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
result = udp4_lib_lookup2(net, saddr, sport,
daddr, hnum, dif,
- hslot2, slot2, skb);
+ hslot2, skb);
if (!result) {
+ unsigned int old_slot2 = slot2;
hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
slot2 = hash2 & udptable->mask;
+ /* avoid searching the same slot again. */
+ if (unlikely(slot2 == old_slot2))
+ return result;
+
hslot2 = &udptable->hash2[slot2];
if (hslot->count < hslot2->count)
goto begin;
result = udp4_lib_lookup2(net, saddr, sport,
- htonl(INADDR_ANY), hnum, dif,
- hslot2, slot2, skb);
+ daddr, hnum, dif,
+ hslot2, skb);
}
- rcu_read_unlock();
return result;
}
begin:
result = NULL;
badness = 0;
- sk_nulls_for_each_rcu(sk, node, &hslot->head) {
- score = compute_score(sk, net, saddr, hnum, sport,
- daddr, dport, dif);
+ sk_for_each_rcu(sk, &hslot->head) {
+ score = compute_score(sk, net, saddr, sport,
+ daddr, hnum, dif);
if (score > badness) {
- result = sk;
- badness = score;
reuseport = sk->sk_reuseport;
if (reuseport) {
hash = udp_ehashfn(net, daddr, hnum,
saddr, sport);
- if (select_ok) {
- struct sock *sk2;
-
- sk2 = reuseport_select_sock(sk, hash, skb,
+ result = reuseport_select_sock(sk, hash, skb,
sizeof(struct udphdr));
- if (sk2) {
- result = sk2;
- select_ok = false;
- goto found;
- }
- }
+ if (result)
+ return result;
matches = 1;
}
+ result = sk;
+ badness = score;
} else if (score == badness && reuseport) {
matches++;
if (reciprocal_scale(hash, matches) == 0)
@@ -634,25 +553,6 @@ begin:
hash = next_pseudo_random32(hash);
}
}
- /*
- * if the nulls value we got at the end of this lookup is
- * not the expected one, we must restart lookup.
- * We probably met an item that was moved to another chain.
- */
- if (get_nulls_value(node) != slot)
- goto begin;
-
- if (result) {
-found:
- if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
- result = NULL;
- else if (unlikely(compute_score(result, net, saddr, hnum, sport,
- daddr, dport, dif) < badness)) {
- sock_put(result);
- goto begin;
- }
- }
- rcu_read_unlock();
return result;
}
EXPORT_SYMBOL_GPL(__udp4_lib_lookup);
@@ -663,18 +563,36 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
{
const struct iphdr *iph = ip_hdr(skb);
- return __udp4_lib_lookup(dev_net(skb_dst(skb)->dev), iph->saddr, sport,
+ return __udp4_lib_lookup(dev_net(skb->dev), iph->saddr, sport,
iph->daddr, dport, inet_iif(skb),
udptable, skb);
}
+struct sock *udp4_lib_lookup_skb(struct sk_buff *skb,
+ __be16 sport, __be16 dport)
+{
+ return __udp4_lib_lookup_skb(skb, sport, dport, &udp_table);
+}
+EXPORT_SYMBOL_GPL(udp4_lib_lookup_skb);
+
+/* Must be called under rcu_read_lock().
+ * Does increment socket refcount.
+ */
+#if IS_ENABLED(CONFIG_NETFILTER_XT_MATCH_SOCKET) || \
+ IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY)
struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
__be32 daddr, __be16 dport, int dif)
{
- return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif,
- &udp_table, NULL);
+ struct sock *sk;
+
+ sk = __udp4_lib_lookup(net, saddr, sport, daddr, dport,
+ dif, &udp_table, NULL);
+ if (sk && !atomic_inc_not_zero(&sk->sk_refcnt))
+ sk = NULL;
+ return sk;
}
EXPORT_SYMBOL_GPL(udp4_lib_lookup);
+#endif
static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
__be16 loc_port, __be32 loc_addr,
@@ -723,7 +641,7 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
iph->saddr, uh->source, skb->dev->ifindex, udptable,
NULL);
if (!sk) {
- ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
+ __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
return; /* No socket for error */
}
@@ -776,7 +694,7 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
sk->sk_err = err;
sk->sk_error_report(sk);
out:
- sock_put(sk);
+ return;
}
void udp_err(struct sk_buff *skb, u32 info)
@@ -917,13 +835,13 @@ send:
err = ip_send_skb(sock_net(sk), skb);
if (err) {
if (err == -ENOBUFS && !inet->recverr) {
- UDP_INC_STATS_USER(sock_net(sk),
- UDP_MIB_SNDBUFERRORS, is_udplite);
+ UDP_INC_STATS(sock_net(sk),
+ UDP_MIB_SNDBUFERRORS, is_udplite);
err = 0;
}
} else
- UDP_INC_STATS_USER(sock_net(sk),
- UDP_MIB_OUTDATAGRAMS, is_udplite);
+ UDP_INC_STATS(sock_net(sk),
+ UDP_MIB_OUTDATAGRAMS, is_udplite);
return err;
}
@@ -1032,15 +950,13 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
*/
connected = 1;
}
- ipc.addr = inet->inet_saddr;
+ ipc.sockc.tsflags = sk->sk_tsflags;
+ ipc.addr = inet->inet_saddr;
ipc.oif = sk->sk_bound_dev_if;
- sock_tx_timestamp(sk, &ipc.tx_flags);
-
if (msg->msg_controllen) {
- err = ip_cmsg_send(sock_net(sk), msg, &ipc,
- sk->sk_family == AF_INET6);
+ err = ip_cmsg_send(sk, msg, &ipc, sk->sk_family == AF_INET6);
if (unlikely(err)) {
kfree(ipc.opt);
return err;
@@ -1065,6 +981,8 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
saddr = ipc.addr;
ipc.addr = faddr = daddr;
+ sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags);
+
if (ipc.opt && ipc.opt->opt.srr) {
if (!daddr)
return -EINVAL;
@@ -1192,8 +1110,8 @@ out:
* seems like overkill.
*/
if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
- UDP_INC_STATS_USER(sock_net(sk),
- UDP_MIB_SNDBUFERRORS, is_udplite);
+ UDP_INC_STATS(sock_net(sk),
+ UDP_MIB_SNDBUFERRORS, is_udplite);
}
return err;
@@ -1277,10 +1195,10 @@ static unsigned int first_packet_length(struct sock *sk)
spin_lock_bh(&rcvq->lock);
while ((skb = skb_peek(rcvq)) != NULL &&
udp_lib_checksum_complete(skb)) {
- UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_CSUMERRORS,
- IS_UDPLITE(sk));
- UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS,
- IS_UDPLITE(sk));
+ __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS,
+ IS_UDPLITE(sk));
+ __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS,
+ IS_UDPLITE(sk));
atomic_inc(&sk->sk_drops);
__skb_unlink(skb, rcvq);
__skb_queue_tail(&list_kill, skb);
@@ -1316,14 +1234,6 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
unsigned int amount = first_packet_length(sk);
- if (amount)
- /*
- * We will only return the amount
- * of this packet since that is all
- * that will be read.
- */
- amount -= sizeof(struct udphdr);
-
return put_user(amount, (int __user *)arg);
}
@@ -1347,7 +1257,7 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
struct sk_buff *skb;
unsigned int ulen, copied;
- int peeked, off = 0;
+ int peeked, peeking, off;
int err;
int is_udplite = IS_UDPLITE(sk);
bool checksum_valid = false;
@@ -1357,15 +1267,16 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
return ip_recv_error(sk, msg, len, addr_len);
try_again:
+ peeking = off = sk_peek_offset(sk, flags);
skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
&peeked, &off, &err);
if (!skb)
- goto out;
+ return err;
- ulen = skb->len - sizeof(struct udphdr);
+ ulen = skb->len;
copied = len;
- if (copied > ulen)
- copied = ulen;
+ if (copied > ulen - off)
+ copied = ulen - off;
else if (copied < ulen)
msg->msg_flags |= MSG_TRUNC;
@@ -1375,18 +1286,16 @@ try_again:
* coverage checksum (UDP-Lite), do it before the copy.
*/
- if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) {
+ if (copied < ulen || UDP_SKB_CB(skb)->partial_cov || peeking) {
checksum_valid = !udp_lib_checksum_complete(skb);
if (!checksum_valid)
goto csum_copy_err;
}
if (checksum_valid || skb_csum_unnecessary(skb))
- err = skb_copy_datagram_msg(skb, sizeof(struct udphdr),
- msg, copied);
+ err = skb_copy_datagram_msg(skb, off, msg, copied);
else {
- err = skb_copy_and_csum_datagram_msg(skb, sizeof(struct udphdr),
- msg);
+ err = skb_copy_and_csum_datagram_msg(skb, off, msg);
if (err == -EINVAL)
goto csum_copy_err;
@@ -1396,15 +1305,16 @@ try_again:
trace_kfree_skb(skb, udp_recvmsg);
if (!peeked) {
atomic_inc(&sk->sk_drops);
- UDP_INC_STATS_USER(sock_net(sk),
- UDP_MIB_INERRORS, is_udplite);
+ UDP_INC_STATS(sock_net(sk),
+ UDP_MIB_INERRORS, is_udplite);
}
- goto out_free;
+ skb_free_datagram_locked(sk, skb);
+ return err;
}
if (!peeked)
- UDP_INC_STATS_USER(sock_net(sk),
- UDP_MIB_INDATAGRAMS, is_udplite);
+ UDP_INC_STATS(sock_net(sk),
+ UDP_MIB_INDATAGRAMS, is_udplite);
sock_recv_ts_and_drops(msg, sk, skb);
@@ -1417,22 +1327,20 @@ try_again:
*addr_len = sizeof(*sin);
}
if (inet->cmsg_flags)
- ip_cmsg_recv_offset(msg, skb, sizeof(struct udphdr));
+ ip_cmsg_recv_offset(msg, skb, sizeof(struct udphdr) + off);
err = copied;
if (flags & MSG_TRUNC)
err = ulen;
-out_free:
- skb_free_datagram_locked(sk, skb);
-out:
+ __skb_free_datagram_locked(sk, skb, peeking ? -err : err);
return err;
csum_copy_err:
slow = lock_sock_fast(sk);
if (!skb_kill_datagram(sk, skb, flags)) {
- UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
- UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
+ UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
+ UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
}
unlock_sock_fast(sk, slow);
@@ -1479,13 +1387,13 @@ void udp_lib_unhash(struct sock *sk)
spin_lock_bh(&hslot->lock);
if (rcu_access_pointer(sk->sk_reuseport_cb))
reuseport_detach_sock(sk);
- if (sk_nulls_del_node_init_rcu(sk)) {
+ if (sk_del_node_init_rcu(sk)) {
hslot->count--;
inet_sk(sk)->inet_num = 0;
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
spin_lock(&hslot2->lock);
- hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
+ hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
hslot2->count--;
spin_unlock(&hslot2->lock);
}
@@ -1518,12 +1426,12 @@ void udp_lib_rehash(struct sock *sk, u16 newhash)
if (hslot2 != nhslot2) {
spin_lock(&hslot2->lock);
- hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
+ hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
hslot2->count--;
spin_unlock(&hslot2->lock);
spin_lock(&nhslot2->lock);
- hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
+ hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
&nhslot2->head);
nhslot2->count++;
spin_unlock(&nhslot2->lock);
@@ -1553,15 +1461,15 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
sk_incoming_cpu_update(sk);
}
- rc = sock_queue_rcv_skb(sk, skb);
+ rc = __sock_queue_rcv_skb(sk, skb);
if (rc < 0) {
int is_udplite = IS_UDPLITE(sk);
/* Note that an ENOMEM error is charged twice */
if (rc == -ENOMEM)
- UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
- is_udplite);
- UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
+ UDP_INC_STATS(sock_net(sk), UDP_MIB_RCVBUFERRORS,
+ is_udplite);
+ UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
kfree_skb(skb);
trace_udp_fail_queue_rcv_skb(rc, sk);
return -1;
@@ -1625,9 +1533,9 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
ret = encap_rcv(sk, skb);
if (ret <= 0) {
- UDP_INC_STATS_BH(sock_net(sk),
- UDP_MIB_INDATAGRAMS,
- is_udplite);
+ __UDP_INC_STATS(sock_net(sk),
+ UDP_MIB_INDATAGRAMS,
+ is_udplite);
return -ret;
}
}
@@ -1671,11 +1579,15 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
if (rcu_access_pointer(sk->sk_filter) &&
udp_lib_checksum_complete(skb))
- goto csum_error;
+ goto csum_error;
+ if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr)))
+ goto drop;
+
+ udp_csum_pull_header(skb);
if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
- UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
- is_udplite);
+ __UDP_INC_STATS(sock_net(sk), UDP_MIB_RCVBUFERRORS,
+ is_udplite);
goto drop;
}
@@ -1694,43 +1606,14 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
return rc;
csum_error:
- UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
+ __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
drop:
- UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
+ __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
atomic_inc(&sk->sk_drops);
kfree_skb(skb);
return -1;
}
-static void flush_stack(struct sock **stack, unsigned int count,
- struct sk_buff *skb, unsigned int final)
-{
- unsigned int i;
- struct sk_buff *skb1 = NULL;
- struct sock *sk;
-
- for (i = 0; i < count; i++) {
- sk = stack[i];
- if (likely(!skb1))
- skb1 = (i == final) ? skb : skb_clone(skb, GFP_ATOMIC);
-
- if (!skb1) {
- atomic_inc(&sk->sk_drops);
- UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
- IS_UDPLITE(sk));
- UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS,
- IS_UDPLITE(sk));
- }
-
- if (skb1 && udp_queue_rcv_skb(sk, skb1) <= 0)
- skb1 = NULL;
-
- sock_put(sk);
- }
- if (unlikely(skb1))
- kfree_skb(skb1);
-}
-
/* For TCP sockets, sk_rx_dst is protected by socket lock
* For UDP, we use xchg() to guard against concurrent changes.
*/
@@ -1754,14 +1637,14 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
struct udp_table *udptable,
int proto)
{
- struct sock *sk, *stack[256 / sizeof(struct sock *)];
- struct hlist_nulls_node *node;
+ struct sock *sk, *first = NULL;
unsigned short hnum = ntohs(uh->dest);
struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
- int dif = skb->dev->ifindex;
- unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node);
unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
- bool inner_flushed = false;
+ unsigned int offset = offsetof(typeof(*sk), sk_node);
+ int dif = skb->dev->ifindex;
+ struct hlist_node *node;
+ struct sk_buff *nskb;
if (use_hash2) {
hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) &
@@ -1772,23 +1655,28 @@ start_lookup:
offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
}
- spin_lock(&hslot->lock);
- sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) {
- if (__udp_is_mcast_sock(net, sk,
- uh->dest, daddr,
- uh->source, saddr,
- dif, hnum)) {
- if (unlikely(count == ARRAY_SIZE(stack))) {
- flush_stack(stack, count, skb, ~0);
- inner_flushed = true;
- count = 0;
- }
- stack[count++] = sk;
- sock_hold(sk);
+ sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) {
+ if (!__udp_is_mcast_sock(net, sk, uh->dest, daddr,
+ uh->source, saddr, dif, hnum))
+ continue;
+
+ if (!first) {
+ first = sk;
+ continue;
}
- }
+ nskb = skb_clone(skb, GFP_ATOMIC);
- spin_unlock(&hslot->lock);
+ if (unlikely(!nskb)) {
+ atomic_inc(&sk->sk_drops);
+ __UDP_INC_STATS(net, UDP_MIB_RCVBUFERRORS,
+ IS_UDPLITE(sk));
+ __UDP_INC_STATS(net, UDP_MIB_INERRORS,
+ IS_UDPLITE(sk));
+ continue;
+ }
+ if (udp_queue_rcv_skb(sk, nskb) > 0)
+ consume_skb(nskb);
+ }
/* Also lookup *:port if we are using hash2 and haven't done so yet. */
if (use_hash2 && hash2 != hash2_any) {
@@ -1796,16 +1684,13 @@ start_lookup:
goto start_lookup;
}
- /*
- * do the slow work with no lock held
- */
- if (count) {
- flush_stack(stack, count, skb, count - 1);
+ if (first) {
+ if (udp_queue_rcv_skb(first, skb) > 0)
+ consume_skb(skb);
} else {
- if (!inner_flushed)
- UDP_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI,
- proto == IPPROTO_UDPLITE);
- consume_skb(skb);
+ kfree_skb(skb);
+ __UDP_INC_STATS(net, UDP_MIB_IGNOREDMULTI,
+ proto == IPPROTO_UDPLITE);
}
return 0;
}
@@ -1829,8 +1714,11 @@ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,
return err;
}
- return skb_checksum_init_zero_check(skb, proto, uh->check,
- inet_compute_pseudo);
+ /* Note, we are only interested in != 0 or == 0, thus the
+ * force to int.
+ */
+ return (__force int)skb_checksum_init_zero_check(skb, proto, uh->check,
+ inet_compute_pseudo);
}
/*
@@ -1902,7 +1790,6 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
inet_compute_pseudo);
ret = udp_queue_rcv_skb(sk, skb);
- sock_put(sk);
/* a return value > 0 means to resubmit the input, but
* it wants the return to be -protocol, or 0
@@ -1920,7 +1807,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
if (udp_lib_checksum_complete(skb))
goto csum_error;
- UDP_INC_STATS_BH(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
+ __UDP_INC_STATS(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
/*
@@ -1947,9 +1834,9 @@ csum_error:
proto == IPPROTO_UDPLITE ? "Lite" : "",
&saddr, ntohs(uh->source), &daddr, ntohs(uh->dest),
ulen);
- UDP_INC_STATS_BH(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE);
+ __UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE);
drop:
- UDP_INC_STATS_BH(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);
+ __UDP_INC_STATS(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);
kfree_skb(skb);
return 0;
}
@@ -1963,49 +1850,24 @@ static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net,
int dif)
{
struct sock *sk, *result;
- struct hlist_nulls_node *node;
unsigned short hnum = ntohs(loc_port);
- unsigned int count, slot = udp_hashfn(net, hnum, udp_table.mask);
+ unsigned int slot = udp_hashfn(net, hnum, udp_table.mask);
struct udp_hslot *hslot = &udp_table.hash[slot];
/* Do not bother scanning a too big list */
if (hslot->count > 10)
return NULL;
- rcu_read_lock();
-begin:
- count = 0;
result = NULL;
- sk_nulls_for_each_rcu(sk, node, &hslot->head) {
- if (__udp_is_mcast_sock(net, sk,
- loc_port, loc_addr,
- rmt_port, rmt_addr,
- dif, hnum)) {
+ sk_for_each_rcu(sk, &hslot->head) {
+ if (__udp_is_mcast_sock(net, sk, loc_port, loc_addr,
+ rmt_port, rmt_addr, dif, hnum)) {
+ if (result)
+ return NULL;
result = sk;
- ++count;
}
}
- /*
- * if the nulls value we got at the end of this lookup is
- * not the expected one, we must restart lookup.
- * We probably met an item that was moved to another chain.
- */
- if (get_nulls_value(node) != slot)
- goto begin;
-
- if (result) {
- if (count != 1 ||
- unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
- result = NULL;
- else if (unlikely(!__udp_is_mcast_sock(net, result,
- loc_port, loc_addr,
- rmt_port, rmt_addr,
- dif, hnum))) {
- sock_put(result);
- result = NULL;
- }
- }
- rcu_read_unlock();
+
return result;
}
@@ -2018,37 +1880,22 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net,
__be16 rmt_port, __be32 rmt_addr,
int dif)
{
- struct sock *sk, *result;
- struct hlist_nulls_node *node;
unsigned short hnum = ntohs(loc_port);
unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum);
unsigned int slot2 = hash2 & udp_table.mask;
struct udp_hslot *hslot2 = &udp_table.hash2[slot2];
INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr);
const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum);
+ struct sock *sk;
- rcu_read_lock();
- result = NULL;
- udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) {
- if (INET_MATCH(sk, net, acookie,
- rmt_addr, loc_addr, ports, dif))
- result = sk;
+ udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
+ if (INET_MATCH(sk, net, acookie, rmt_addr,
+ loc_addr, ports, dif))
+ return sk;
/* Only check first socket in chain */
break;
}
-
- if (result) {
- if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
- result = NULL;
- else if (unlikely(!INET_MATCH(sk, net, acookie,
- rmt_addr, loc_addr,
- ports, dif))) {
- sock_put(result);
- result = NULL;
- }
- }
- rcu_read_unlock();
- return result;
+ return NULL;
}
void udp_v4_early_demux(struct sk_buff *skb)
@@ -2056,7 +1903,7 @@ void udp_v4_early_demux(struct sk_buff *skb)
struct net *net = dev_net(skb->dev);
const struct iphdr *iph;
const struct udphdr *uh;
- struct sock *sk;
+ struct sock *sk = NULL;
struct dst_entry *dst;
int dif = skb->dev->ifindex;
int ours;
@@ -2088,11 +1935,9 @@ void udp_v4_early_demux(struct sk_buff *skb)
} else if (skb->pkt_type == PACKET_HOST) {
sk = __udp4_lib_demux_lookup(net, uh->dest, iph->daddr,
uh->source, iph->saddr, dif);
- } else {
- return;
}
- if (!sk)
+ if (!sk || !atomic_inc_not_zero_hint(&sk->sk_refcnt, 2))
return;
skb->sk = sk;
@@ -2392,14 +2237,13 @@ static struct sock *udp_get_first(struct seq_file *seq, int start)
for (state->bucket = start; state->bucket <= state->udp_table->mask;
++state->bucket) {
- struct hlist_nulls_node *node;
struct udp_hslot *hslot = &state->udp_table->hash[state->bucket];
- if (hlist_nulls_empty(&hslot->head))
+ if (hlist_empty(&hslot->head))
continue;
spin_lock_bh(&hslot->lock);
- sk_nulls_for_each(sk, node, &hslot->head) {
+ sk_for_each(sk, &hslot->head) {
if (!net_eq(sock_net(sk), net))
continue;
if (sk->sk_family == state->family)
@@ -2418,7 +2262,7 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
struct net *net = seq_file_net(seq);
do {
- sk = sk_nulls_next(sk);
+ sk = sk_next(sk);
} while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family));
if (!sk) {
@@ -2627,12 +2471,12 @@ void __init udp_table_init(struct udp_table *table, const char *name)
table->hash2 = table->hash + (table->mask + 1);
for (i = 0; i <= table->mask; i++) {
- INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i);
+ INIT_HLIST_HEAD(&table->hash[i].head);
table->hash[i].count = 0;
spin_lock_init(&table->hash[i].lock);
}
for (i = 0; i <= table->mask; i++) {
- INIT_HLIST_NULLS_HEAD(&table->hash2[i].head, i);
+ INIT_HLIST_HEAD(&table->hash2[i].head);
table->hash2[i].count = 0;
spin_lock_init(&table->hash2[i].lock);
}
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index df1966f3b..3d5ccf4b1 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -36,10 +36,11 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
const struct inet_diag_req_v2 *req)
{
int err = -EINVAL;
- struct sock *sk;
+ struct sock *sk = NULL;
struct sk_buff *rep;
struct net *net = sock_net(in_skb->sk);
+ rcu_read_lock();
if (req->sdiag_family == AF_INET)
sk = __udp4_lib_lookup(net,
req->id.idiag_src[0], req->id.idiag_sport,
@@ -54,9 +55,9 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
req->id.idiag_dport,
req->id.idiag_if, tbl, NULL);
#endif
- else
- goto out_nosk;
-
+ if (sk && !atomic_inc_not_zero(&sk->sk_refcnt))
+ sk = NULL;
+ rcu_read_unlock();
err = -ENOENT;
if (!sk)
goto out_nosk;
@@ -96,24 +97,23 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb,
struct netlink_callback *cb,
const struct inet_diag_req_v2 *r, struct nlattr *bc)
{
- int num, s_num, slot, s_slot;
struct net *net = sock_net(skb->sk);
+ int num, s_num, slot, s_slot;
s_slot = cb->args[0];
num = s_num = cb->args[1];
for (slot = s_slot; slot <= table->mask; s_num = 0, slot++) {
- struct sock *sk;
- struct hlist_nulls_node *node;
struct udp_hslot *hslot = &table->hash[slot];
+ struct sock *sk;
num = 0;
- if (hlist_nulls_empty(&hslot->head))
+ if (hlist_empty(&hslot->head))
continue;
spin_lock_bh(&hslot->lock);
- sk_nulls_for_each(sk, node, &hslot->head) {
+ sk_for_each(sk, &hslot->head) {
struct inet_sock *inet = inet_sk(sk);
if (!net_eq(sock_net(sk), net))
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index e330c0e56..81f253b6f 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -14,18 +14,6 @@
#include <net/udp.h>
#include <net/protocol.h>
-static DEFINE_SPINLOCK(udp_offload_lock);
-static struct udp_offload_priv __rcu *udp_offload_base __read_mostly;
-
-#define udp_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&udp_offload_lock))
-
-struct udp_offload_priv {
- struct udp_offload *offload;
- possible_net_t net;
- struct rcu_head rcu;
- struct udp_offload_priv __rcu *next;
-};
-
static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
netdev_features_t features,
struct sk_buff *(*gso_inner_segment)(struct sk_buff *skb,
@@ -51,8 +39,11 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
* 16 bit length field due to the header being added outside of an
* IP or IPv6 frame that was already limited to 64K - 1.
*/
- partial = csum_sub(csum_unfold(uh->check),
- (__force __wsum)htonl(skb->len));
+ if (skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL)
+ partial = (__force __wsum)uh->len;
+ else
+ partial = (__force __wsum)htonl(skb->len);
+ partial = csum_sub(csum_unfold(uh->check), partial);
/* setup inner skb. */
skb->encapsulation = 0;
@@ -101,7 +92,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
udp_offset = outer_hlen - tnl_hlen;
skb = segs;
do {
- __be16 len;
+ unsigned int len;
if (remcsum)
skb->ip_summed = CHECKSUM_NONE;
@@ -119,14 +110,26 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
skb_reset_mac_header(skb);
skb_set_network_header(skb, mac_len);
skb_set_transport_header(skb, udp_offset);
- len = htons(skb->len - udp_offset);
+ len = skb->len - udp_offset;
uh = udp_hdr(skb);
- uh->len = len;
+
+ /* If we are only performing partial GSO the inner header
+ * will be using a length value equal to only one MSS sized
+ * segment instead of the entire frame.
+ */
+ if (skb_is_gso(skb)) {
+ uh->len = htons(skb_shinfo(skb)->gso_size +
+ SKB_GSO_CB(skb)->data_offset +
+ skb->head - (unsigned char *)uh);
+ } else {
+ uh->len = htons(len);
+ }
if (!need_csum)
continue;
- uh->check = ~csum_fold(csum_add(partial, (__force __wsum)len));
+ uh->check = ~csum_fold(csum_add(partial,
+ (__force __wsum)htonl(len)));
if (skb->encapsulation || !offload_csum) {
uh->check = gso_make_checksum(skb, ~uh->check);
@@ -179,6 +182,7 @@ out_unlock:
return segs;
}
+EXPORT_SYMBOL(skb_udp_tunnel_segment);
static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
netdev_features_t features)
@@ -205,16 +209,6 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
/* Packet is from an untrusted source, reset gso_segs. */
- int type = skb_shinfo(skb)->gso_type;
-
- if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY |
- SKB_GSO_UDP_TUNNEL |
- SKB_GSO_UDP_TUNNEL_CSUM |
- SKB_GSO_TUNNEL_REMCSUM |
- SKB_GSO_IPIP |
- SKB_GSO_GRE | SKB_GSO_GRE_CSUM) ||
- !(type & (SKB_GSO_UDP))))
- goto out;
skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
@@ -253,64 +247,14 @@ out:
return segs;
}
-int udp_add_offload(struct net *net, struct udp_offload *uo)
-{
- struct udp_offload_priv *new_offload = kzalloc(sizeof(*new_offload), GFP_ATOMIC);
-
- if (!new_offload)
- return -ENOMEM;
-
- write_pnet(&new_offload->net, net);
- new_offload->offload = uo;
-
- spin_lock(&udp_offload_lock);
- new_offload->next = udp_offload_base;
- rcu_assign_pointer(udp_offload_base, new_offload);
- spin_unlock(&udp_offload_lock);
-
- return 0;
-}
-EXPORT_SYMBOL(udp_add_offload);
-
-static void udp_offload_free_routine(struct rcu_head *head)
-{
- struct udp_offload_priv *ou_priv = container_of(head, struct udp_offload_priv, rcu);
- kfree(ou_priv);
-}
-
-void udp_del_offload(struct udp_offload *uo)
-{
- struct udp_offload_priv __rcu **head = &udp_offload_base;
- struct udp_offload_priv *uo_priv;
-
- spin_lock(&udp_offload_lock);
-
- uo_priv = udp_deref_protected(*head);
- for (; uo_priv != NULL;
- uo_priv = udp_deref_protected(*head)) {
- if (uo_priv->offload == uo) {
- rcu_assign_pointer(*head,
- udp_deref_protected(uo_priv->next));
- goto unlock;
- }
- head = &uo_priv->next;
- }
- pr_warn("udp_del_offload: didn't find offload for port %d\n", ntohs(uo->port));
-unlock:
- spin_unlock(&udp_offload_lock);
- if (uo_priv)
- call_rcu(&uo_priv->rcu, udp_offload_free_routine);
-}
-EXPORT_SYMBOL(udp_del_offload);
-
struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
- struct udphdr *uh)
+ struct udphdr *uh, udp_lookup_t lookup)
{
- struct udp_offload_priv *uo_priv;
struct sk_buff *p, **pp = NULL;
struct udphdr *uh2;
unsigned int off = skb_gro_offset(skb);
int flush = 1;
+ struct sock *sk;
if (NAPI_GRO_CB(skb)->encap_mark ||
(skb->ip_summed != CHECKSUM_PARTIAL &&
@@ -322,13 +266,10 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
NAPI_GRO_CB(skb)->encap_mark = 1;
rcu_read_lock();
- uo_priv = rcu_dereference(udp_offload_base);
- for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) {
- if (net_eq(read_pnet(&uo_priv->net), dev_net(skb->dev)) &&
- uo_priv->offload->port == uh->dest &&
- uo_priv->offload->callbacks.gro_receive)
- goto unflush;
- }
+ sk = (*lookup)(skb, uh->source, uh->dest);
+
+ if (sk && udp_sk(sk)->gro_receive)
+ goto unflush;
goto out_unlock;
unflush:
@@ -352,9 +293,7 @@ unflush:
skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */
skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr));
- NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto;
- pp = uo_priv->offload->callbacks.gro_receive(head, skb,
- uo_priv->offload);
+ pp = udp_sk(sk)->gro_receive(sk, head, skb);
out_unlock:
rcu_read_unlock();
@@ -362,6 +301,7 @@ out:
NAPI_GRO_CB(skb)->flush |= flush;
return pp;
}
+EXPORT_SYMBOL(udp_gro_receive);
static struct sk_buff **udp4_gro_receive(struct sk_buff **head,
struct sk_buff *skb)
@@ -383,19 +323,20 @@ static struct sk_buff **udp4_gro_receive(struct sk_buff **head,
inet_gro_compute_pseudo);
skip:
NAPI_GRO_CB(skb)->is_ipv6 = 0;
- return udp_gro_receive(head, skb, uh);
+ return udp_gro_receive(head, skb, uh, udp4_lib_lookup_skb);
flush:
NAPI_GRO_CB(skb)->flush = 1;
return NULL;
}
-int udp_gro_complete(struct sk_buff *skb, int nhoff)
+int udp_gro_complete(struct sk_buff *skb, int nhoff,
+ udp_lookup_t lookup)
{
- struct udp_offload_priv *uo_priv;
__be16 newlen = htons(skb->len - nhoff);
struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
int err = -ENOSYS;
+ struct sock *sk;
uh->len = newlen;
@@ -405,22 +346,10 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff)
skb->encapsulation = 1;
rcu_read_lock();
-
- uo_priv = rcu_dereference(udp_offload_base);
- for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) {
- if (net_eq(read_pnet(&uo_priv->net), dev_net(skb->dev)) &&
- uo_priv->offload->port == uh->dest &&
- uo_priv->offload->callbacks.gro_complete)
- break;
- }
-
- if (uo_priv) {
- NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto;
- err = uo_priv->offload->callbacks.gro_complete(skb,
- nhoff + sizeof(struct udphdr),
- uo_priv->offload);
- }
-
+ sk = (*lookup)(skb, uh->source, uh->dest);
+ if (sk && udp_sk(sk)->gro_complete)
+ err = udp_sk(sk)->gro_complete(sk, skb,
+ nhoff + sizeof(struct udphdr));
rcu_read_unlock();
if (skb->remcsum_offload)
@@ -428,6 +357,7 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff)
return err;
}
+EXPORT_SYMBOL(udp_gro_complete);
static int udp4_gro_complete(struct sk_buff *skb, int nhoff)
{
@@ -442,7 +372,7 @@ static int udp4_gro_complete(struct sk_buff *skb, int nhoff)
skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL;
}
- return udp_gro_complete(skb, nhoff);
+ return udp_gro_complete(skb, nhoff, udp4_lib_lookup_skb);
}
static const struct net_offload udpv4_offload = {
diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c
index 96599d1a1..47f12c73d 100644
--- a/net/ipv4/udp_tunnel.c
+++ b/net/ipv4/udp_tunnel.c
@@ -69,6 +69,8 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
udp_sk(sk)->encap_type = cfg->encap_type;
udp_sk(sk)->encap_rcv = cfg->encap_rcv;
udp_sk(sk)->encap_destroy = cfg->encap_destroy;
+ udp_sk(sk)->gro_receive = cfg->gro_receive;
+ udp_sk(sk)->gro_complete = cfg->gro_complete;
udp_tunnel_encap_enable(sock);
}