diff options
Diffstat (limited to 'net/ipv4')
56 files changed, 1594 insertions, 1524 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 7ccd693db..eb51c43c2 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -186,6 +186,7 @@ config NET_IPGRE_DEMUX config NET_IP_TUNNEL tristate + select DST_CACHE default n config NET_IPGRE @@ -405,14 +406,6 @@ config INET_XFRM_MODE_BEET If unsure, say Y. -config INET_LRO - tristate "Large Receive Offload (ipv4/tcp)" - default y - ---help--- - Support for Large Receive Offload (ipv4/tcp). - - If unsure, say Y. - config INET_DIAG tristate "INET: socket monitoring interface" default y diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 62c049b64..bfa133691 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -32,7 +32,6 @@ obj-$(CONFIG_INET_ESP) += esp4.o obj-$(CONFIG_INET_IPCOMP) += ipcomp.o obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o obj-$(CONFIG_INET_XFRM_MODE_BEET) += xfrm4_mode_beet.o -obj-$(CONFIG_INET_LRO) += inet_lro.o obj-$(CONFIG_INET_TUNNEL) += tunnel4.o obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 5c5db6636..9e481992d 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -370,7 +370,11 @@ lookup_protocol: */ inet->inet_sport = htons(inet->inet_num); /* Add to protocol hash chains. */ - sk->sk_prot->hash(sk); + err = sk->sk_prot->hash(sk); + if (err) { + sk_common_release(sk); + goto out; + } } if (sk->sk_prot->init) { @@ -1091,12 +1095,6 @@ void inet_unregister_protosw(struct inet_protosw *p) } EXPORT_SYMBOL(inet_unregister_protosw); -/* - * Shall we try to damage output packets if routing dev changes? - */ - -int sysctl_ip_dynaddr __read_mostly; - static int inet_sk_reselect_saddr(struct sock *sk) { struct inet_sock *inet = inet_sk(sk); @@ -1127,7 +1125,7 @@ static int inet_sk_reselect_saddr(struct sock *sk) if (new_saddr == old_saddr) return 0; - if (sysctl_ip_dynaddr > 1) { + if (sock_net(sk)->ipv4.sysctl_ip_dynaddr > 1) { pr_info("%s(): shifting inet->saddr from %pI4 to %pI4\n", __func__, &old_saddr, &new_saddr); } @@ -1142,8 +1140,7 @@ static int inet_sk_reselect_saddr(struct sock *sk) * Besides that, it does not check for connection * uniqueness. Wait for troubles. */ - __sk_prot_rehash(sk); - return 0; + return __sk_prot_rehash(sk); } int inet_sk_rebuild_header(struct sock *sk) @@ -1183,7 +1180,7 @@ int inet_sk_rebuild_header(struct sock *sk) * Other protocols have to map its equivalent state to TCP_SYN_SENT. * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme */ - if (!sysctl_ip_dynaddr || + if (!sock_net(sk)->ipv4.sysctl_ip_dynaddr || sk->sk_state != TCP_SYN_SENT || (sk->sk_userlocks & SOCK_BINDADDR_LOCK) || (err = inet_sk_reselect_saddr(sk)) != 0) @@ -1383,6 +1380,45 @@ out: return pp; } +static struct sk_buff **ipip_gro_receive(struct sk_buff **head, + struct sk_buff *skb) +{ + if (NAPI_GRO_CB(skb)->encap_mark) { + NAPI_GRO_CB(skb)->flush = 1; + return NULL; + } + + NAPI_GRO_CB(skb)->encap_mark = 1; + + return inet_gro_receive(head, skb); +} + +#define SECONDS_PER_DAY 86400 + +/* inet_current_timestamp - Return IP network timestamp + * + * Return milliseconds since midnight in network byte order. + */ +__be32 inet_current_timestamp(void) +{ + u32 secs; + u32 msecs; + struct timespec64 ts; + + ktime_get_real_ts64(&ts); + + /* Get secs since midnight. */ + (void)div_u64_rem(ts.tv_sec, SECONDS_PER_DAY, &secs); + /* Convert to msecs. */ + msecs = secs * MSEC_PER_SEC; + /* Convert nsec to msec. */ + msecs += (u32)ts.tv_nsec / NSEC_PER_MSEC; + + /* Convert to network byte order. */ + return htonl(msecs); +} +EXPORT_SYMBOL(inet_current_timestamp); + int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) { if (sk->sk_family == AF_INET) @@ -1425,6 +1461,13 @@ out_unlock: return err; } +static int ipip_gro_complete(struct sk_buff *skb, int nhoff) +{ + skb->encapsulation = 1; + skb_shinfo(skb)->gso_type |= SKB_GSO_IPIP; + return inet_gro_complete(skb, nhoff); +} + int inet_ctl_sock_create(struct sock **sk, unsigned short family, unsigned short type, unsigned char protocol, struct net *net) @@ -1652,8 +1695,8 @@ static struct packet_offload ip_packet_offload __read_mostly = { static const struct net_offload ipip_offload = { .callbacks = { .gso_segment = inet_gso_segment, - .gro_receive = inet_gro_receive, - .gro_complete = inet_gro_complete, + .gro_receive = ipip_gro_receive, + .gro_complete = ipip_gro_complete, }, }; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 59b3e0e8f..c34c7544d 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -665,7 +665,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) */ if (!in_dev) - goto out; + goto out_free_skb; arp = arp_hdr(skb); @@ -673,7 +673,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) default: if (arp->ar_pro != htons(ETH_P_IP) || htons(dev_type) != arp->ar_hrd) - goto out; + goto out_free_skb; break; case ARPHRD_ETHER: case ARPHRD_FDDI: @@ -690,17 +690,17 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) if ((arp->ar_hrd != htons(ARPHRD_ETHER) && arp->ar_hrd != htons(ARPHRD_IEEE802)) || arp->ar_pro != htons(ETH_P_IP)) - goto out; + goto out_free_skb; break; case ARPHRD_AX25: if (arp->ar_pro != htons(AX25_P_IP) || arp->ar_hrd != htons(ARPHRD_AX25)) - goto out; + goto out_free_skb; break; case ARPHRD_NETROM: if (arp->ar_pro != htons(AX25_P_IP) || arp->ar_hrd != htons(ARPHRD_NETROM)) - goto out; + goto out_free_skb; break; } @@ -708,7 +708,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) if (arp->ar_op != htons(ARPOP_REPLY) && arp->ar_op != htons(ARPOP_REQUEST)) - goto out; + goto out_free_skb; /* * Extract fields @@ -733,7 +733,15 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) */ if (ipv4_is_multicast(tip) || (!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip))) - goto out; + goto out_free_skb; + + /* + * For some 802.11 wireless deployments (and possibly other networks), + * there will be an ARP proxy and gratuitous ARP frames are attacks + * and thus should not be accepted. + */ + if (sip == tip && IN_DEV_ORCONF(in_dev, DROP_GRATUITOUS_ARP)) + goto out_free_skb; /* * Special case: We must set Frame Relay source Q.922 address @@ -770,7 +778,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) !arp_ignore(in_dev, sip, tip)) arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, dev->dev_addr, sha, reply_dst); - goto out; + goto out_consume_skb; } if (arp->ar_op == htons(ARPOP_REQUEST) && @@ -795,7 +803,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) neigh_release(n); } } - goto out; + goto out_consume_skb; } else if (IN_DEV_FORWARD(in_dev)) { if (addr_type == RTN_UNICAST && (arp_fwd_proxy(in_dev, dev, rt) || @@ -818,7 +826,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) in_dev->arp_parms, skb); goto out_free_dst; } - goto out; + goto out_consume_skb; } } } @@ -868,11 +876,16 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) neigh_release(n); } -out: +out_consume_skb: consume_skb(skb); + out_free_dst: dst_release(reply_dst); - return 0; + return NET_RX_SUCCESS; + +out_free_skb: + kfree_skb(skb); + return NET_RX_DROP; } static void parp_redo(struct sk_buff *skb) @@ -916,11 +929,11 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev, consumeskb: consume_skb(skb); - return 0; + return NET_RX_SUCCESS; freeskb: kfree_skb(skb); out_of_mem: - return 0; + return NET_RX_DROP; } /* diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 0212591b0..e333bc86b 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1198,6 +1198,7 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope) __be32 addr = 0; struct in_device *in_dev; struct net *net = dev_net(dev); + int master_idx; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); @@ -1218,12 +1219,33 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope) if (addr) goto out_unlock; no_in_dev: + master_idx = l3mdev_master_ifindex_rcu(dev); + + /* For VRFs, the VRF device takes the place of the loopback device, + * with addresses on it being preferred. Note in such cases the + * loopback device will be among the devices that fail the master_idx + * equality check in the loop below. + */ + if (master_idx && + (dev = dev_get_by_index_rcu(net, master_idx)) && + (in_dev = __in_dev_get_rcu(dev))) { + for_primary_ifa(in_dev) { + if (ifa->ifa_scope != RT_SCOPE_LINK && + ifa->ifa_scope <= scope) { + addr = ifa->ifa_local; + goto out_unlock; + } + } endfor_ifa(in_dev); + } /* Not loopback addresses on loopback should be preferred in this case. It is important that lo is the first interface in dev_base list. */ for_each_netdev_rcu(net, dev) { + if (l3mdev_master_ifindex_rcu(dev) != master_idx) + continue; + in_dev = __in_dev_get_rcu(dev); if (!in_dev) continue; @@ -1735,17 +1757,20 @@ static int inet_netconf_msgsize_devconf(int type) { int size = NLMSG_ALIGN(sizeof(struct netconfmsg)) + nla_total_size(4); /* NETCONFA_IFINDEX */ + bool all = false; + + if (type == NETCONFA_ALL) + all = true; - /* type -1 is used for ALL */ - if (type == -1 || type == NETCONFA_FORWARDING) + if (all || type == NETCONFA_FORWARDING) size += nla_total_size(4); - if (type == -1 || type == NETCONFA_RP_FILTER) + if (all || type == NETCONFA_RP_FILTER) size += nla_total_size(4); - if (type == -1 || type == NETCONFA_MC_FORWARDING) + if (all || type == NETCONFA_MC_FORWARDING) size += nla_total_size(4); - if (type == -1 || type == NETCONFA_PROXY_NEIGH) + if (all || type == NETCONFA_PROXY_NEIGH) size += nla_total_size(4); - if (type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) + if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) size += nla_total_size(4); return size; @@ -1758,36 +1783,39 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex, { struct nlmsghdr *nlh; struct netconfmsg *ncm; + bool all = false; nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg), flags); if (!nlh) return -EMSGSIZE; + if (type == NETCONFA_ALL) + all = true; + ncm = nlmsg_data(nlh); ncm->ncm_family = AF_INET; if (nla_put_s32(skb, NETCONFA_IFINDEX, ifindex) < 0) goto nla_put_failure; - /* type -1 is used for ALL */ - if ((type == -1 || type == NETCONFA_FORWARDING) && + if ((all || type == NETCONFA_FORWARDING) && nla_put_s32(skb, NETCONFA_FORWARDING, IPV4_DEVCONF(*devconf, FORWARDING)) < 0) goto nla_put_failure; - if ((type == -1 || type == NETCONFA_RP_FILTER) && + if ((all || type == NETCONFA_RP_FILTER) && nla_put_s32(skb, NETCONFA_RP_FILTER, IPV4_DEVCONF(*devconf, RP_FILTER)) < 0) goto nla_put_failure; - if ((type == -1 || type == NETCONFA_MC_FORWARDING) && + if ((all || type == NETCONFA_MC_FORWARDING) && nla_put_s32(skb, NETCONFA_MC_FORWARDING, IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0) goto nla_put_failure; - if ((type == -1 || type == NETCONFA_PROXY_NEIGH) && + if ((all || type == NETCONFA_PROXY_NEIGH) && nla_put_s32(skb, NETCONFA_PROXY_NEIGH, IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0) goto nla_put_failure; - if ((type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) && + if ((all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) && nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, IPV4_DEVCONF(*devconf, IGNORE_ROUTES_WITH_LINKDOWN)) < 0) goto nla_put_failure; @@ -1875,14 +1903,14 @@ static int inet_netconf_get_devconf(struct sk_buff *in_skb, } err = -ENOBUFS; - skb = nlmsg_new(inet_netconf_msgsize_devconf(-1), GFP_ATOMIC); + skb = nlmsg_new(inet_netconf_msgsize_devconf(NETCONFA_ALL), GFP_ATOMIC); if (!skb) goto errout; err = inet_netconf_fill_devconf(skb, ifindex, devconf, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, RTM_NEWNETCONF, 0, - -1); + NETCONFA_ALL); if (err < 0) { /* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */ WARN_ON(err == -EMSGSIZE); @@ -1926,7 +1954,7 @@ static int inet_netconf_dump_devconf(struct sk_buff *skb, cb->nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, - -1) < 0) { + NETCONFA_ALL) < 0) { rcu_read_unlock(); goto done; } @@ -1942,7 +1970,7 @@ cont: NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, - -1) < 0) + NETCONFA_ALL) < 0) goto done; else h++; @@ -1953,7 +1981,7 @@ cont: NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNETCONF, NLM_F_MULTI, - -1) < 0) + NETCONFA_ALL) < 0) goto done; else h++; @@ -2189,6 +2217,8 @@ static struct devinet_sysctl_table { "igmpv3_unsolicited_report_interval"), DEVINET_SYSCTL_RW_ENTRY(IGNORE_ROUTES_WITH_LINKDOWN, "ignore_routes_with_linkdown"), + DEVINET_SYSCTL_RW_ENTRY(DROP_GRATUITOUS_ARP, + "drop_gratuitous_arp"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"), @@ -2196,6 +2226,8 @@ static struct devinet_sysctl_table { "promote_secondaries"), DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET, "route_localnet"), + DEVINET_SYSCTL_FLUSHING_ENTRY(DROP_UNICAST_IN_L2_MULTICAST, + "drop_unicast_in_l2_multicast"), }, }; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 8a9246dec..63566ec54 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -904,7 +904,11 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim) if (ifa->ifa_flags & IFA_F_SECONDARY) { prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask); if (!prim) { - pr_warn("%s: bug: prim == NULL\n", __func__); + /* if the device has been deleted, we don't perform + * address promotion + */ + if (!in_dev->dead) + pr_warn("%s: bug: prim == NULL\n", __func__); return; } if (iprim && iprim != prim) { diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index d97268e8f..2b68418c7 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -975,6 +975,8 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) val = 65535 - 40; if (type == RTAX_MTU && val > 65535 - 15) val = 65535 - 15; + if (type == RTAX_HOPLIMIT && val > 255) + val = 255; if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) return -EINVAL; fi->fib_metrics[type - 1] = val; diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 976f0dcf6..a6962ccad 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -48,7 +48,7 @@ static inline struct fou *fou_from_sock(struct sock *sk) return sk->sk_user_data; } -static void fou_recv_pull(struct sk_buff *skb, size_t len) +static int fou_recv_pull(struct sk_buff *skb, size_t len) { struct iphdr *iph = ip_hdr(skb); @@ -59,6 +59,7 @@ static void fou_recv_pull(struct sk_buff *skb, size_t len) __skb_pull(skb, len); skb_postpull_rcsum(skb, udp_hdr(skb), len); skb_reset_transport_header(skb); + return iptunnel_pull_offloads(skb); } static int fou_udp_recv(struct sock *sk, struct sk_buff *skb) @@ -68,9 +69,14 @@ static int fou_udp_recv(struct sock *sk, struct sk_buff *skb) if (!fou) return 1; - fou_recv_pull(skb, sizeof(struct udphdr)); + if (fou_recv_pull(skb, sizeof(struct udphdr))) + goto drop; return -fou->protocol; + +drop: + kfree_skb(skb); + return 0; } static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr, @@ -170,6 +176,9 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb) __skb_pull(skb, sizeof(struct udphdr) + hdrlen); skb_reset_transport_header(skb); + if (iptunnel_pull_offloads(skb)) + goto drop; + return -guehdr->proto_ctype; drop: @@ -186,6 +195,17 @@ static struct sk_buff **fou_gro_receive(struct sk_buff **head, u8 proto = NAPI_GRO_CB(skb)->proto; const struct net_offload **offloads; + /* We can clear the encap_mark for FOU as we are essentially doing + * one of two possible things. We are either adding an L4 tunnel + * header to the outer L3 tunnel header, or we are are simply + * treating the GRE tunnel header as though it is a UDP protocol + * specific header such as VXLAN or GENEVE. + */ + NAPI_GRO_CB(skb)->encap_mark = 0; + + /* Flag this frame as already having an outer encap header */ + NAPI_GRO_CB(skb)->is_fou = 1; + rcu_read_lock(); offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; ops = rcu_dereference(offloads[proto]); @@ -208,8 +228,6 @@ static int fou_gro_complete(struct sk_buff *skb, int nhoff, int err = -ENOSYS; const struct net_offload **offloads; - udp_tunnel_gro_complete(skb, nhoff); - rcu_read_lock(); offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; ops = rcu_dereference(offloads[proto]); @@ -218,6 +236,8 @@ static int fou_gro_complete(struct sk_buff *skb, int nhoff, err = ops->callbacks.gro_complete(skb, nhoff); + skb_set_inner_mac_header(skb, nhoff); + out_unlock: rcu_read_unlock(); @@ -319,8 +339,6 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head, skb_gro_pull(skb, hdrlen); - flush = 0; - for (p = *head; p; p = p->next) { const struct guehdr *guehdr2; @@ -345,6 +363,17 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head, } } + /* We can clear the encap_mark for GUE as we are essentially doing + * one of two possible things. We are either adding an L4 tunnel + * header to the outer L3 tunnel header, or we are are simply + * treating the GRE tunnel header as though it is a UDP protocol + * specific header such as VXLAN or GENEVE. + */ + NAPI_GRO_CB(skb)->encap_mark = 0; + + /* Flag this frame as already having an outer encap header */ + NAPI_GRO_CB(skb)->is_fou = 1; + rcu_read_lock(); offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; ops = rcu_dereference(offloads[guehdr->proto_ctype]); @@ -352,6 +381,7 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head, goto out_unlock; pp = ops->callbacks.gro_receive(head, skb); + flush = 0; out_unlock: rcu_read_unlock(); @@ -384,6 +414,8 @@ static int gue_gro_complete(struct sk_buff *skb, int nhoff, err = ops->callbacks.gro_complete(skb, nhoff + guehlen); + skb_set_inner_mac_header(skb, nhoff + guehlen); + out_unlock: rcu_read_unlock(); return err; @@ -774,7 +806,6 @@ static void fou_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e, uh->dest = e->dport; uh->source = sport; uh->len = htons(skb->len); - uh->check = 0; udp_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM), skb, fl4->saddr, fl4->daddr, skb->len); @@ -784,11 +815,11 @@ static void fou_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e, int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, struct flowi4 *fl4) { - bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM); - int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; + int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? SKB_GSO_UDP_TUNNEL_CSUM : + SKB_GSO_UDP_TUNNEL; __be16 sport; - skb = iptunnel_handle_offloads(skb, csum, type); + skb = iptunnel_handle_offloads(skb, type); if (IS_ERR(skb)) return PTR_ERR(skb); @@ -804,8 +835,8 @@ EXPORT_SYMBOL(fou_build_header); int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, struct flowi4 *fl4) { - bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM); - int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; + int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? SKB_GSO_UDP_TUNNEL_CSUM : + SKB_GSO_UDP_TUNNEL; struct guehdr *guehdr; size_t hdrlen, optlen = 0; __be16 sport; @@ -814,7 +845,6 @@ int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, if ((e->flags & TUNNEL_ENCAP_FLAG_REMCSUM) && skb->ip_summed == CHECKSUM_PARTIAL) { - csum = false; optlen += GUE_PLEN_REMCSUM; type |= SKB_GSO_TUNNEL_REMCSUM; need_priv = true; @@ -822,7 +852,7 @@ int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, optlen += need_priv ? GUE_LEN_PRIV : 0; - skb = iptunnel_handle_offloads(skb, csum, type); + skb = iptunnel_handle_offloads(skb, type); if (IS_ERR(skb)) return PTR_ERR(skb); diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 5a8ee3282..6a5bd4317 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -18,15 +18,13 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, netdev_features_t features) { + int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); struct sk_buff *segs = ERR_PTR(-EINVAL); - netdev_features_t enc_features; - int ghl; - struct gre_base_hdr *greh; u16 mac_offset = skb->mac_header; - int mac_len = skb->mac_len; __be16 protocol = skb->protocol; - int tnl_hlen; - bool csum; + u16 mac_len = skb->mac_len; + int gre_offset, outer_hlen; + bool need_csum, ufo; if (unlikely(skb_shinfo(skb)->gso_type & ~(SKB_GSO_TCPV4 | @@ -43,74 +41,75 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, if (!skb->encapsulation) goto out; - if (unlikely(!pskb_may_pull(skb, sizeof(*greh)))) + if (unlikely(tnl_hlen < sizeof(struct gre_base_hdr))) goto out; - greh = (struct gre_base_hdr *)skb_transport_header(skb); - - ghl = skb_inner_mac_header(skb) - skb_transport_header(skb); - if (unlikely(ghl < sizeof(*greh))) + if (unlikely(!pskb_may_pull(skb, tnl_hlen))) goto out; - csum = !!(greh->flags & GRE_CSUM); - if (csum) - skb->encap_hdr_csum = 1; - /* setup inner skb. */ - skb->protocol = greh->protocol; skb->encapsulation = 0; - - if (unlikely(!pskb_may_pull(skb, ghl))) - goto out; - - __skb_pull(skb, ghl); + SKB_GSO_CB(skb)->encap_level = 0; + __skb_pull(skb, tnl_hlen); skb_reset_mac_header(skb); skb_set_network_header(skb, skb_inner_network_offset(skb)); skb->mac_len = skb_inner_network_offset(skb); + skb->protocol = skb->inner_protocol; + + need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_GRE_CSUM); + skb->encap_hdr_csum = need_csum; + + ufo = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); + + features &= skb->dev->hw_enc_features; + + /* The only checksum offload we care about from here on out is the + * outer one so strip the existing checksum feature flags based + * on the fact that we will be computing our checksum in software. + */ + if (ufo) { + features &= ~NETIF_F_CSUM_MASK; + if (!need_csum) + features |= NETIF_F_HW_CSUM; + } /* segment inner packet. */ - enc_features = skb->dev->hw_enc_features & features; - segs = skb_mac_gso_segment(skb, enc_features); + segs = skb_mac_gso_segment(skb, features); if (IS_ERR_OR_NULL(segs)) { - skb_gso_error_unwind(skb, protocol, ghl, mac_offset, mac_len); + skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset, + mac_len); goto out; } + outer_hlen = skb_tnl_header_len(skb); + gre_offset = outer_hlen - tnl_hlen; skb = segs; - tnl_hlen = skb_tnl_header_len(skb); do { - __skb_push(skb, ghl); - if (csum) { - __be32 *pcsum; - - if (skb_has_shared_frag(skb)) { - int err; - - err = __skb_linearize(skb); - if (err) { - kfree_skb_list(segs); - segs = ERR_PTR(err); - goto out; - } - } + struct gre_base_hdr *greh; + __be32 *pcsum; - skb_reset_transport_header(skb); - - greh = (struct gre_base_hdr *) - skb_transport_header(skb); - pcsum = (__be32 *)(greh + 1); - *pcsum = 0; - *(__sum16 *)pcsum = gso_make_checksum(skb, 0); + /* Set up inner headers if we are offloading inner checksum */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + skb_reset_inner_headers(skb); + skb->encapsulation = 1; } - __skb_push(skb, tnl_hlen - ghl); - skb_reset_inner_headers(skb); - skb->encapsulation = 1; + skb->mac_len = mac_len; + skb->protocol = protocol; + __skb_push(skb, outer_hlen); skb_reset_mac_header(skb); skb_set_network_header(skb, mac_len); - skb->mac_len = mac_len; - skb->protocol = protocol; + skb_set_transport_header(skb, gre_offset); + + if (!need_csum) + continue; + + greh = (struct gre_base_hdr *)skb_transport_header(skb); + pcsum = (__be32 *)(greh + 1); + + *pcsum = 0; + *(__sum16 *)pcsum = gso_make_checksum(skb, 0); } while ((skb = skb->next)); out: return segs; @@ -128,6 +127,11 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head, struct packet_offload *ptype; __be16 type; + if (NAPI_GRO_CB(skb)->encap_mark) + goto out; + + NAPI_GRO_CB(skb)->encap_mark = 1; + off = skb_gro_offset(skb); hlen = off + sizeof(*greh); greh = skb_gro_header_fast(skb, off); @@ -146,6 +150,14 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head, if ((greh->flags & ~(GRE_KEY|GRE_CSUM)) != 0) goto out; + /* We can only support GRE_CSUM if we can track the location of + * the GRE header. In the case of FOU/GUE we cannot because the + * outer UDP header displaces the GRE header leaving us in a state + * of limbo. + */ + if ((greh->flags & GRE_CSUM) && NAPI_GRO_CB(skb)->is_fou) + goto out; + type = greh->protocol; rcu_read_lock(); @@ -177,8 +189,6 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head, null_compute_pseudo); } - flush = 0; - for (p = *head; p; p = p->next) { const struct gre_base_hdr *greh2; @@ -215,6 +225,7 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head, skb_gro_postpull_rcsum(skb, greh, grehlen); pp = ptype->callbacks.gro_receive(head, skb); + flush = 0; out_unlock: rcu_read_unlock(); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 36e26977c..633348977 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -931,7 +931,6 @@ static bool icmp_echo(struct sk_buff *skb) */ static bool icmp_timestamp(struct sk_buff *skb) { - struct timespec tv; struct icmp_bxm icmp_param; /* * Too short. @@ -942,9 +941,7 @@ static bool icmp_timestamp(struct sk_buff *skb) /* * Fill in the current time as ms since midnight UT: */ - getnstimeofday(&tv); - icmp_param.data.times[1] = htonl((tv.tv_sec % 86400) * MSEC_PER_SEC + - tv.tv_nsec / NSEC_PER_MSEC); + icmp_param.data.times[1] = inet_current_timestamp(); icmp_param.data.times[2] = icmp_param.data.times[1]; if (skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4)) BUG(); diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index b3086cf27..9b4ca87f7 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -107,12 +107,6 @@ #include <linux/seq_file.h> #endif -#define IP_MAX_MEMBERSHIPS 20 -#define IP_MAX_MSF 10 - -/* IGMP reports for link-local multicast groups are enabled by default */ -int sysctl_igmp_llm_reports __read_mostly = 1; - #ifdef CONFIG_IP_MULTICAST /* Parameter names and values are taken from igmp-v2-06 draft */ @@ -432,6 +426,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted) { struct net_device *dev = pmc->interface->dev; + struct net *net = dev_net(dev); struct igmpv3_report *pih; struct igmpv3_grec *pgr = NULL; struct ip_sf_list *psf, *psf_next, *psf_prev, **psf_list; @@ -439,7 +434,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, if (pmc->multiaddr == IGMP_ALL_HOSTS) return skb; - if (ipv4_is_local_multicast(pmc->multiaddr) && !sysctl_igmp_llm_reports) + if (ipv4_is_local_multicast(pmc->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports) return skb; isquery = type == IGMPV3_MODE_IS_INCLUDE || @@ -542,6 +537,7 @@ empty_source: static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc) { struct sk_buff *skb = NULL; + struct net *net = dev_net(in_dev->dev); int type; if (!pmc) { @@ -550,7 +546,7 @@ static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc) if (pmc->multiaddr == IGMP_ALL_HOSTS) continue; if (ipv4_is_local_multicast(pmc->multiaddr) && - !sysctl_igmp_llm_reports) + !net->ipv4.sysctl_igmp_llm_reports) continue; spin_lock_bh(&pmc->lock); if (pmc->sfcount[MCAST_EXCLUDE]) @@ -686,7 +682,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, if (type == IGMPV3_HOST_MEMBERSHIP_REPORT) return igmpv3_send_report(in_dev, pmc); - if (ipv4_is_local_multicast(group) && !sysctl_igmp_llm_reports) + if (ipv4_is_local_multicast(group) && !net->ipv4.sysctl_igmp_llm_reports) return 0; if (type == IGMP_HOST_LEAVE_MESSAGE) @@ -765,9 +761,10 @@ static void igmp_ifc_timer_expire(unsigned long data) static void igmp_ifc_event(struct in_device *in_dev) { + struct net *net = dev_net(in_dev->dev); if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) return; - in_dev->mr_ifc_count = in_dev->mr_qrv ?: sysctl_igmp_qrv; + in_dev->mr_ifc_count = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; igmp_ifc_start_timer(in_dev, 1); } @@ -857,12 +854,13 @@ static int igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs) static bool igmp_heard_report(struct in_device *in_dev, __be32 group) { struct ip_mc_list *im; + struct net *net = dev_net(in_dev->dev); /* Timers are only set for non-local groups */ if (group == IGMP_ALL_HOSTS) return false; - if (ipv4_is_local_multicast(group) && !sysctl_igmp_llm_reports) + if (ipv4_is_local_multicast(group) && !net->ipv4.sysctl_igmp_llm_reports) return false; rcu_read_lock(); @@ -886,6 +884,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, __be32 group = ih->group; int max_delay; int mark = 0; + struct net *net = dev_net(in_dev->dev); if (len == 8) { @@ -971,7 +970,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, if (im->multiaddr == IGMP_ALL_HOSTS) continue; if (ipv4_is_local_multicast(im->multiaddr) && - !sysctl_igmp_llm_reports) + !net->ipv4.sysctl_igmp_llm_reports) continue; spin_lock_bh(&im->lock); if (im->tm_running) @@ -1087,6 +1086,7 @@ static void ip_mc_filter_del(struct in_device *in_dev, __be32 addr) static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im) { struct ip_mc_list *pmc; + struct net *net = dev_net(in_dev->dev); /* this is an "ip_mc_list" for convenience; only the fields below * are actually used. In particular, the refcnt and users are not @@ -1101,7 +1101,7 @@ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im) pmc->interface = im->interface; in_dev_hold(in_dev); pmc->multiaddr = im->multiaddr; - pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; + pmc->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; pmc->sfmode = im->sfmode; if (pmc->sfmode == MCAST_INCLUDE) { struct ip_sf_list *psf; @@ -1186,6 +1186,7 @@ static void igmp_group_dropped(struct ip_mc_list *im) { struct in_device *in_dev = im->interface; #ifdef CONFIG_IP_MULTICAST + struct net *net = dev_net(in_dev->dev); int reporter; #endif @@ -1197,7 +1198,7 @@ static void igmp_group_dropped(struct ip_mc_list *im) #ifdef CONFIG_IP_MULTICAST if (im->multiaddr == IGMP_ALL_HOSTS) return; - if (ipv4_is_local_multicast(im->multiaddr) && !sysctl_igmp_llm_reports) + if (ipv4_is_local_multicast(im->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports) return; reporter = im->reporter; @@ -1222,6 +1223,9 @@ static void igmp_group_dropped(struct ip_mc_list *im) static void igmp_group_added(struct ip_mc_list *im) { struct in_device *in_dev = im->interface; +#ifdef CONFIG_IP_MULTICAST + struct net *net = dev_net(in_dev->dev); +#endif if (im->loaded == 0) { im->loaded = 1; @@ -1231,7 +1235,7 @@ static void igmp_group_added(struct ip_mc_list *im) #ifdef CONFIG_IP_MULTICAST if (im->multiaddr == IGMP_ALL_HOSTS) return; - if (ipv4_is_local_multicast(im->multiaddr) && !sysctl_igmp_llm_reports) + if (ipv4_is_local_multicast(im->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports) return; if (in_dev->dead) @@ -1244,7 +1248,7 @@ static void igmp_group_added(struct ip_mc_list *im) } /* else, v3 */ - im->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; + im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; igmp_ifc_event(in_dev); #endif } @@ -1313,6 +1317,9 @@ static void ip_mc_hash_remove(struct in_device *in_dev, void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) { struct ip_mc_list *im; +#ifdef CONFIG_IP_MULTICAST + struct net *net = dev_net(in_dev->dev); +#endif ASSERT_RTNL(); @@ -1339,7 +1346,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr) spin_lock_init(&im->lock); #ifdef CONFIG_IP_MULTICAST setup_timer(&im->timer, igmp_timer_expire, (unsigned long)im); - im->unsolicit_count = sysctl_igmp_qrv; + im->unsolicit_count = net->ipv4.sysctl_igmp_qrv; #endif im->next_rcu = in_dev->mc_list; @@ -1532,6 +1539,7 @@ static void ip_mc_rejoin_groups(struct in_device *in_dev) #ifdef CONFIG_IP_MULTICAST struct ip_mc_list *im; int type; + struct net *net = dev_net(in_dev->dev); ASSERT_RTNL(); @@ -1539,7 +1547,7 @@ static void ip_mc_rejoin_groups(struct in_device *in_dev) if (im->multiaddr == IGMP_ALL_HOSTS) continue; if (ipv4_is_local_multicast(im->multiaddr) && - !sysctl_igmp_llm_reports) + !net->ipv4.sysctl_igmp_llm_reports) continue; /* a failover is happening and switches @@ -1638,6 +1646,9 @@ void ip_mc_down(struct in_device *in_dev) void ip_mc_init_dev(struct in_device *in_dev) { +#ifdef CONFIG_IP_MULTICAST + struct net *net = dev_net(in_dev->dev); +#endif ASSERT_RTNL(); #ifdef CONFIG_IP_MULTICAST @@ -1645,7 +1656,7 @@ void ip_mc_init_dev(struct in_device *in_dev) (unsigned long)in_dev); setup_timer(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire, (unsigned long)in_dev); - in_dev->mr_qrv = sysctl_igmp_qrv; + in_dev->mr_qrv = net->ipv4.sysctl_igmp_qrv; #endif spin_lock_init(&in_dev->mc_tomb_lock); @@ -1656,11 +1667,14 @@ void ip_mc_init_dev(struct in_device *in_dev) void ip_mc_up(struct in_device *in_dev) { struct ip_mc_list *pmc; +#ifdef CONFIG_IP_MULTICAST + struct net *net = dev_net(in_dev->dev); +#endif ASSERT_RTNL(); #ifdef CONFIG_IP_MULTICAST - in_dev->mr_qrv = sysctl_igmp_qrv; + in_dev->mr_qrv = net->ipv4.sysctl_igmp_qrv; #endif ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS); @@ -1726,11 +1740,6 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr) /* * Join a socket to a group */ -int sysctl_igmp_max_memberships __read_mostly = IP_MAX_MEMBERSHIPS; -int sysctl_igmp_max_msf __read_mostly = IP_MAX_MSF; -#ifdef CONFIG_IP_MULTICAST -int sysctl_igmp_qrv __read_mostly = IGMP_QUERY_ROBUSTNESS_VARIABLE; -#endif static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode, __be32 *psfsrc) @@ -1755,6 +1764,7 @@ static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode, if (!psf->sf_count[MCAST_INCLUDE] && !psf->sf_count[MCAST_EXCLUDE]) { #ifdef CONFIG_IP_MULTICAST struct in_device *in_dev = pmc->interface; + struct net *net = dev_net(in_dev->dev); #endif /* no more filters for this source */ @@ -1765,7 +1775,7 @@ static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode, #ifdef CONFIG_IP_MULTICAST if (psf->sf_oldin && !IGMP_V1_SEEN(in_dev) && !IGMP_V2_SEEN(in_dev)) { - psf->sf_crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; + psf->sf_crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; psf->sf_next = pmc->tomb; pmc->tomb = psf; rv = 1; @@ -1823,12 +1833,13 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode, pmc->sfcount[MCAST_INCLUDE]) { #ifdef CONFIG_IP_MULTICAST struct ip_sf_list *psf; + struct net *net = dev_net(in_dev->dev); #endif /* filter mode change */ pmc->sfmode = MCAST_INCLUDE; #ifdef CONFIG_IP_MULTICAST - pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; + pmc->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; in_dev->mr_ifc_count = pmc->crcount; for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; @@ -1995,6 +2006,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, } else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) { #ifdef CONFIG_IP_MULTICAST struct ip_sf_list *psf; + struct net *net = dev_net(pmc->interface->dev); in_dev = pmc->interface; #endif @@ -2006,7 +2018,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, #ifdef CONFIG_IP_MULTICAST /* else no filters; keep old mode for reports */ - pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv; + pmc->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; in_dev->mr_ifc_count = pmc->crcount; for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; @@ -2073,7 +2085,7 @@ int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr) count++; } err = -ENOBUFS; - if (count >= sysctl_igmp_max_memberships) + if (count >= net->ipv4.sysctl_igmp_max_memberships) goto done; iml = sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL); if (!iml) @@ -2245,7 +2257,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct } /* else, add a new source to the filter */ - if (psl && psl->sl_count >= sysctl_igmp_max_msf) { + if (psl && psl->sl_count >= net->ipv4.sysctl_igmp_max_msf) { err = -ENOBUFS; goto done; } @@ -2918,6 +2930,12 @@ static int __net_init igmp_net_init(struct net *net) goto out_sock; } + /* Sysctl initialization */ + net->ipv4.sysctl_igmp_max_memberships = 20; + net->ipv4.sysctl_igmp_max_msf = 10; + /* IGMP reports for link-local multicast groups are enabled by default */ + net->ipv4.sysctl_igmp_llm_reports = 1; + net->ipv4.sysctl_igmp_qrv = 2; return 0; out_sock: diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 641489148..bc5196ea1 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -24,6 +24,7 @@ #include <net/tcp_states.h> #include <net/xfrm.h> #include <net/tcp.h> +#include <net/sock_reuseport.h> #ifdef INET_CSK_DEBUG const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; @@ -67,7 +68,8 @@ int inet_csk_bind_conflict(const struct sock *sk, if ((!reuse || !sk2->sk_reuse || sk2->sk_state == TCP_LISTEN) && (!reuseport || !sk2->sk_reuseport || - (sk2->sk_state != TCP_TIME_WAIT && + rcu_access_pointer(sk->sk_reuseport_cb) || + (sk2->sk_state != TCP_TIME_WAIT && !uid_eq(uid, sock_i_uid(sk2))))) { if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr || @@ -89,161 +91,154 @@ EXPORT_SYMBOL_GPL(inet_csk_bind_conflict); /* Obtain a reference to a local port for the given sock, * if snum is zero it means select any available local port. + * We try to allocate an odd port (and leave even ports for connect()) */ int inet_csk_get_port(struct sock *sk, unsigned short snum) { - struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; + bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; + struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo; + int ret = 1, attempts = 5, port = snum; + int smallest_size = -1, smallest_port; struct inet_bind_hashbucket *head; - struct inet_bind_bucket *tb; - int ret, attempts = 5; struct net *net = sock_net(sk); - int smallest_size = -1, smallest_rover; + int i, low, high, attempt_half; + struct inet_bind_bucket *tb; kuid_t uid = sock_i_uid(sk); - int attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0; + u32 remaining, offset; - local_bh_disable(); - if (!snum) { - int remaining, rover, low, high; + if (port) { +have_port: + head = &hinfo->bhash[inet_bhashfn(net, port, + hinfo->bhash_size)]; + spin_lock_bh(&head->lock); + inet_bind_bucket_for_each(tb, &head->chain) + if (net_eq(ib_net(tb), net) && tb->port == port) + goto tb_found; + goto tb_not_found; + } again: - inet_get_local_port_range(net, &low, &high); - if (attempt_half) { - int half = low + ((high - low) >> 1); - - if (attempt_half == 1) - high = half; - else - low = half; - } - remaining = (high - low) + 1; - smallest_rover = rover = prandom_u32() % remaining + low; - - smallest_size = -1; - do { - if (inet_is_local_reserved_port(net, rover)) - goto next_nolock; - head = &hashinfo->bhash[inet_bhashfn(net, rover, - hashinfo->bhash_size)]; - spin_lock(&head->lock); - inet_bind_bucket_for_each(tb, &head->chain) - if (net_eq(ib_net(tb), net) && tb->port == rover) { - if (((tb->fastreuse > 0 && - sk->sk_reuse && - sk->sk_state != TCP_LISTEN) || - (tb->fastreuseport > 0 && - sk->sk_reuseport && - uid_eq(tb->fastuid, uid))) && - (tb->num_owners < smallest_size || smallest_size == -1)) { - smallest_size = tb->num_owners; - smallest_rover = rover; - } - if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) { - snum = rover; - goto tb_found; - } - goto next; + attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0; +other_half_scan: + inet_get_local_port_range(net, &low, &high); + high++; /* [32768, 60999] -> [32768, 61000[ */ + if (high - low < 4) + attempt_half = 0; + if (attempt_half) { + int half = low + (((high - low) >> 2) << 1); + + if (attempt_half == 1) + high = half; + else + low = half; + } + remaining = high - low; + if (likely(remaining > 1)) + remaining &= ~1U; + + offset = prandom_u32() % remaining; + /* __inet_hash_connect() favors ports having @low parity + * We do the opposite to not pollute connect() users. + */ + offset |= 1U; + smallest_size = -1; + smallest_port = low; /* avoid compiler warning */ + +other_parity_scan: + port = low + offset; + for (i = 0; i < remaining; i += 2, port += 2) { + if (unlikely(port >= high)) + port -= remaining; + if (inet_is_local_reserved_port(net, port)) + continue; + head = &hinfo->bhash[inet_bhashfn(net, port, + hinfo->bhash_size)]; + spin_lock_bh(&head->lock); + inet_bind_bucket_for_each(tb, &head->chain) + if (net_eq(ib_net(tb), net) && tb->port == port) { + if (((tb->fastreuse > 0 && reuse) || + (tb->fastreuseport > 0 && + sk->sk_reuseport && + !rcu_access_pointer(sk->sk_reuseport_cb) && + uid_eq(tb->fastuid, uid))) && + (tb->num_owners < smallest_size || smallest_size == -1)) { + smallest_size = tb->num_owners; + smallest_port = port; } - break; - next: - spin_unlock(&head->lock); - next_nolock: - if (++rover > high) - rover = low; - } while (--remaining > 0); - - /* Exhausted local port range during search? It is not - * possible for us to be holding one of the bind hash - * locks if this test triggers, because if 'remaining' - * drops to zero, we broke out of the do/while loop at - * the top level, not from the 'break;' statement. - */ - ret = 1; - if (remaining <= 0) { - if (smallest_size != -1) { - snum = smallest_rover; - goto have_snum; - } - if (attempt_half == 1) { - /* OK we now try the upper half of the range */ - attempt_half = 2; - goto again; + if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) + goto tb_found; + goto next_port; } - goto fail; - } - /* OK, here is the one we will use. HEAD is - * non-NULL and we hold it's mutex. - */ - snum = rover; - } else { -have_snum: - head = &hashinfo->bhash[inet_bhashfn(net, snum, - hashinfo->bhash_size)]; - spin_lock(&head->lock); - inet_bind_bucket_for_each(tb, &head->chain) - if (net_eq(ib_net(tb), net) && tb->port == snum) - goto tb_found; + goto tb_not_found; +next_port: + spin_unlock_bh(&head->lock); + cond_resched(); + } + + if (smallest_size != -1) { + port = smallest_port; + goto have_port; } - tb = NULL; - goto tb_not_found; + offset--; + if (!(offset & 1)) + goto other_parity_scan; + + if (attempt_half == 1) { + /* OK we now try the upper half of the range */ + attempt_half = 2; + goto other_half_scan; + } + return ret; + +tb_not_found: + tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, + net, head, port); + if (!tb) + goto fail_unlock; tb_found: if (!hlist_empty(&tb->owners)) { if (sk->sk_reuse == SK_FORCE_REUSE) goto success; - if (((tb->fastreuse > 0 && - sk->sk_reuse && sk->sk_state != TCP_LISTEN) || + if (((tb->fastreuse > 0 && reuse) || (tb->fastreuseport > 0 && + !rcu_access_pointer(sk->sk_reuseport_cb) && sk->sk_reuseport && uid_eq(tb->fastuid, uid))) && - smallest_size == -1) { + smallest_size == -1) goto success; - } else { - ret = 1; - if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) { - if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) || - (tb->fastreuseport > 0 && - sk->sk_reuseport && uid_eq(tb->fastuid, uid))) && - smallest_size != -1 && --attempts >= 0) { - spin_unlock(&head->lock); - goto again; - } - - goto fail_unlock; + if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) { + if ((reuse || + (tb->fastreuseport > 0 && + sk->sk_reuseport && + !rcu_access_pointer(sk->sk_reuseport_cb) && + uid_eq(tb->fastuid, uid))) && + smallest_size != -1 && --attempts >= 0) { + spin_unlock_bh(&head->lock); + goto again; } + goto fail_unlock; } - } -tb_not_found: - ret = 1; - if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, - net, head, snum)) == NULL) - goto fail_unlock; - if (hlist_empty(&tb->owners)) { - if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) - tb->fastreuse = 1; - else + if (!reuse) tb->fastreuse = 0; + if (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid)) + tb->fastreuseport = 0; + } else { + tb->fastreuse = reuse; if (sk->sk_reuseport) { tb->fastreuseport = 1; tb->fastuid = uid; - } else - tb->fastreuseport = 0; - } else { - if (tb->fastreuse && - (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) - tb->fastreuse = 0; - if (tb->fastreuseport && - (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))) + } else { tb->fastreuseport = 0; + } } success: if (!inet_csk(sk)->icsk_bind_hash) - inet_bind_hash(sk, tb, snum); + inet_bind_hash(sk, tb, port); WARN_ON(inet_csk(sk)->icsk_bind_hash != tb); ret = 0; fail_unlock: - spin_unlock(&head->lock); -fail: - local_bh_enable(); + spin_unlock_bh(&head->lock); return ret; } EXPORT_SYMBOL_GPL(inet_csk_get_port); @@ -482,10 +477,6 @@ EXPORT_SYMBOL_GPL(inet_csk_route_child_sock); #define AF_INET_FAMILY(fam) true #endif -/* Only thing we need from tcp.h */ -extern int sysctl_tcp_synack_retries; - - /* Decide when to expire the request and when to resend SYN-ACK */ static inline void syn_ack_recalc(struct request_sock *req, const int thresh, const int max_retries, @@ -557,6 +548,7 @@ static void reqsk_timer_handler(unsigned long data) { struct request_sock *req = (struct request_sock *)data; struct sock *sk_listener = req->rsk_listener; + struct net *net = sock_net(sk_listener); struct inet_connection_sock *icsk = inet_csk(sk_listener); struct request_sock_queue *queue = &icsk->icsk_accept_queue; int qlen, expire = 0, resend = 0; @@ -566,7 +558,7 @@ static void reqsk_timer_handler(unsigned long data) if (sk_state_load(sk_listener) != TCP_LISTEN) goto drop; - max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; + max_retries = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_synack_retries; thresh = max_retries; /* Normally all the openreqs are young and become mature * (i.e. converted to established socket) for first timeout. @@ -737,6 +729,7 @@ int inet_csk_listen_start(struct sock *sk, int backlog) { struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet = inet_sk(sk); + int err = -EADDRINUSE; reqsk_queue_alloc(&icsk->icsk_accept_queue); @@ -754,13 +747,14 @@ int inet_csk_listen_start(struct sock *sk, int backlog) inet->inet_sport = htons(inet->inet_num); sk_dst_reset(sk); - sk->sk_prot->hash(sk); + err = sk->sk_prot->hash(sk); - return 0; + if (likely(!err)) + return 0; } sk->sk_state = TCP_CLOSE; - return -EADDRINUSE; + return err; } EXPORT_SYMBOL_GPL(inet_csk_listen_start); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 6029157a1..5fdb02f55 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -357,18 +357,18 @@ struct sock *inet_diag_find_one_icsk(struct net *net, struct sock *sk; if (req->sdiag_family == AF_INET) - sk = inet_lookup(net, hashinfo, req->id.idiag_dst[0], + sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[0], req->id.idiag_dport, req->id.idiag_src[0], req->id.idiag_sport, req->id.idiag_if); #if IS_ENABLED(CONFIG_IPV6) else if (req->sdiag_family == AF_INET6) { if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) && ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src)) - sk = inet_lookup(net, hashinfo, req->id.idiag_dst[3], + sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[3], req->id.idiag_dport, req->id.idiag_src[3], req->id.idiag_sport, req->id.idiag_if); else - sk = inet6_lookup(net, hashinfo, + sk = inet6_lookup(net, hashinfo, NULL, 0, (struct in6_addr *)req->id.idiag_dst, req->id.idiag_dport, (struct in6_addr *)req->id.idiag_src, @@ -879,6 +879,7 @@ next_normal: } spin_unlock_bh(lock); + cond_resched(); } done: diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index ccc598079..0d9e9d7bb 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -20,10 +20,12 @@ #include <linux/wait.h> #include <linux/vmalloc.h> +#include <net/addrconf.h> #include <net/inet_connection_sock.h> #include <net/inet_hashtables.h> #include <net/secure_seq.h> #include <net/ip.h> +#include <net/sock_reuseport.h> static u32 inet_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, const __be32 faddr, @@ -205,6 +207,7 @@ static inline int compute_score(struct sock *sk, struct net *net, struct sock *__inet_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, + struct sk_buff *skb, int doff, const __be32 saddr, __be16 sport, const __be32 daddr, const unsigned short hnum, const int dif) @@ -214,6 +217,7 @@ struct sock *__inet_lookup_listener(struct net *net, unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; int score, hiscore, matches = 0, reuseport = 0; + bool select_ok = true; u32 phash = 0; rcu_read_lock(); @@ -229,6 +233,15 @@ begin: if (reuseport) { phash = inet_ehashfn(net, daddr, hnum, saddr, sport); + if (select_ok) { + struct sock *sk2; + sk2 = reuseport_select_sock(sk, phash, + skb, doff); + if (sk2) { + result = sk2; + goto found; + } + } matches = 1; } } else if (score == hiscore && reuseport) { @@ -246,11 +259,13 @@ begin: if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE) goto begin; if (result) { +found: if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) result = NULL; else if (unlikely(compute_score(result, net, hnum, daddr, dif) < hiscore)) { sock_put(result); + select_ok = false; goto begin; } } @@ -449,32 +464,76 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk) } EXPORT_SYMBOL_GPL(inet_ehash_nolisten); -void __inet_hash(struct sock *sk, struct sock *osk) +static int inet_reuseport_add_sock(struct sock *sk, + struct inet_listen_hashbucket *ilb, + int (*saddr_same)(const struct sock *sk1, + const struct sock *sk2, + bool match_wildcard)) +{ + struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash; + struct sock *sk2; + struct hlist_nulls_node *node; + kuid_t uid = sock_i_uid(sk); + + sk_nulls_for_each_rcu(sk2, node, &ilb->head) { + if (sk2 != sk && + sk2->sk_family == sk->sk_family && + ipv6_only_sock(sk2) == ipv6_only_sock(sk) && + sk2->sk_bound_dev_if == sk->sk_bound_dev_if && + inet_csk(sk2)->icsk_bind_hash == tb && + sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && + saddr_same(sk, sk2, false)) + return reuseport_add_sock(sk, sk2); + } + + /* Initial allocation may have already happened via setsockopt */ + if (!rcu_access_pointer(sk->sk_reuseport_cb)) + return reuseport_alloc(sk); + return 0; +} + +int __inet_hash(struct sock *sk, struct sock *osk, + int (*saddr_same)(const struct sock *sk1, + const struct sock *sk2, + bool match_wildcard)) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct inet_listen_hashbucket *ilb; + int err = 0; if (sk->sk_state != TCP_LISTEN) { inet_ehash_nolisten(sk, osk); - return; + return 0; } WARN_ON(!sk_unhashed(sk)); ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; spin_lock(&ilb->lock); + if (sk->sk_reuseport) { + err = inet_reuseport_add_sock(sk, ilb, saddr_same); + if (err) + goto unlock; + } __sk_nulls_add_node_rcu(sk, &ilb->head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); +unlock: spin_unlock(&ilb->lock); + + return err; } EXPORT_SYMBOL(__inet_hash); -void inet_hash(struct sock *sk) +int inet_hash(struct sock *sk) { + int err = 0; + if (sk->sk_state != TCP_CLOSE) { local_bh_disable(); - __inet_hash(sk, NULL); + err = __inet_hash(sk, NULL, ipv4_rcv_saddr_equal); local_bh_enable(); } + + return err; } EXPORT_SYMBOL_GPL(inet_hash); @@ -493,6 +552,8 @@ void inet_unhash(struct sock *sk) lock = inet_ehash_lockp(hashinfo, sk->sk_hash); spin_lock_bh(lock); + if (rcu_access_pointer(sk->sk_reuseport_cb)) + reuseport_detach_sock(sk); done = __sk_nulls_del_node_init_rcu(sk); if (done) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); @@ -506,106 +567,106 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *, __u16, struct inet_timewait_sock **)) { struct inet_hashinfo *hinfo = death_row->hashinfo; - const unsigned short snum = inet_sk(sk)->inet_num; + struct inet_timewait_sock *tw = NULL; struct inet_bind_hashbucket *head; - struct inet_bind_bucket *tb; - int ret; + int port = inet_sk(sk)->inet_num; struct net *net = sock_net(sk); + struct inet_bind_bucket *tb; + u32 remaining, offset; + int ret, i, low, high; + static u32 hint; + + if (port) { + head = &hinfo->bhash[inet_bhashfn(net, port, + hinfo->bhash_size)]; + tb = inet_csk(sk)->icsk_bind_hash; + spin_lock_bh(&head->lock); + if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { + inet_ehash_nolisten(sk, NULL); + spin_unlock_bh(&head->lock); + return 0; + } + spin_unlock(&head->lock); + /* No definite answer... Walk to established hash table */ + ret = check_established(death_row, sk, port, NULL); + local_bh_enable(); + return ret; + } - if (!snum) { - int i, remaining, low, high, port; - static u32 hint; - u32 offset = hint + port_offset; - struct inet_timewait_sock *tw = NULL; + inet_get_local_port_range(net, &low, &high); + high++; /* [32768, 60999] -> [32768, 61000[ */ + remaining = high - low; + if (likely(remaining > 1)) + remaining &= ~1U; - inet_get_local_port_range(net, &low, &high); - remaining = (high - low) + 1; + offset = (hint + port_offset) % remaining; + /* In first pass we try ports of @low parity. + * inet_csk_get_port() does the opposite choice. + */ + offset &= ~1U; +other_parity_scan: + port = low + offset; + for (i = 0; i < remaining; i += 2, port += 2) { + if (unlikely(port >= high)) + port -= remaining; + if (inet_is_local_reserved_port(net, port)) + continue; + head = &hinfo->bhash[inet_bhashfn(net, port, + hinfo->bhash_size)]; + spin_lock_bh(&head->lock); - /* By starting with offset being an even number, - * we tend to leave about 50% of ports for other uses, - * like bind(0). + /* Does not bother with rcv_saddr checks, because + * the established check is already unique enough. */ - offset &= ~1; - - local_bh_disable(); - for (i = 0; i < remaining; i++) { - port = low + (i + offset) % remaining; - if (inet_is_local_reserved_port(net, port)) - continue; - head = &hinfo->bhash[inet_bhashfn(net, port, - hinfo->bhash_size)]; - spin_lock(&head->lock); - - /* Does not bother with rcv_saddr checks, - * because the established check is already - * unique enough. - */ - inet_bind_bucket_for_each(tb, &head->chain) { - if (net_eq(ib_net(tb), net) && - tb->port == port) { - if (tb->fastreuse >= 0 || - tb->fastreuseport >= 0) - goto next_port; - WARN_ON(hlist_empty(&tb->owners)); - if (!check_established(death_row, sk, - port, &tw)) - goto ok; + inet_bind_bucket_for_each(tb, &head->chain) { + if (net_eq(ib_net(tb), net) && tb->port == port) { + if (tb->fastreuse >= 0 || + tb->fastreuseport >= 0) goto next_port; - } + WARN_ON(hlist_empty(&tb->owners)); + if (!check_established(death_row, sk, + port, &tw)) + goto ok; + goto next_port; } - - tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, - net, head, port); - if (!tb) { - spin_unlock(&head->lock); - break; - } - tb->fastreuse = -1; - tb->fastreuseport = -1; - goto ok; - - next_port: - spin_unlock(&head->lock); } - local_bh_enable(); - - return -EADDRNOTAVAIL; -ok: - hint += (i + 2) & ~1; - - /* Head lock still held and bh's disabled */ - inet_bind_hash(sk, tb, port); - if (sk_unhashed(sk)) { - inet_sk(sk)->inet_sport = htons(port); - inet_ehash_nolisten(sk, (struct sock *)tw); + tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, + net, head, port); + if (!tb) { + spin_unlock_bh(&head->lock); + return -ENOMEM; } - if (tw) - inet_twsk_bind_unhash(tw, hinfo); - spin_unlock(&head->lock); + tb->fastreuse = -1; + tb->fastreuseport = -1; + goto ok; +next_port: + spin_unlock_bh(&head->lock); + cond_resched(); + } - if (tw) - inet_twsk_deschedule_put(tw); + offset++; + if ((offset & 1) && remaining > 1) + goto other_parity_scan; - ret = 0; - goto out; - } + return -EADDRNOTAVAIL; - head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)]; - tb = inet_csk(sk)->icsk_bind_hash; - spin_lock_bh(&head->lock); - if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { - inet_ehash_nolisten(sk, NULL); - spin_unlock_bh(&head->lock); - return 0; - } else { - spin_unlock(&head->lock); - /* No definite answer... Walk to established hash table */ - ret = check_established(death_row, sk, snum, NULL); -out: - local_bh_enable(); - return ret; +ok: + hint += i + 2; + + /* Head lock still held and bh's disabled */ + inet_bind_hash(sk, tb, port); + if (sk_unhashed(sk)) { + inet_sk(sk)->inet_sport = htons(port); + inet_ehash_nolisten(sk, (struct sock *)tw); } + if (tw) + inet_twsk_bind_unhash(tw, hinfo); + spin_unlock(&head->lock); + if (tw) + inet_twsk_deschedule_put(tw); + local_bh_enable(); + return 0; } /* diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c deleted file mode 100644 index f17ea49b2..000000000 --- a/net/ipv4/inet_lro.c +++ /dev/null @@ -1,374 +0,0 @@ -/* - * linux/net/ipv4/inet_lro.c - * - * Large Receive Offload (ipv4 / tcp) - * - * (C) Copyright IBM Corp. 2007 - * - * Authors: - * Jan-Bernd Themann <themann@de.ibm.com> - * Christoph Raisch <raisch@de.ibm.com> - * - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - - -#include <linux/module.h> -#include <linux/if_vlan.h> -#include <linux/inet_lro.h> -#include <net/checksum.h> - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jan-Bernd Themann <themann@de.ibm.com>"); -MODULE_DESCRIPTION("Large Receive Offload (ipv4 / tcp)"); - -#define TCP_HDR_LEN(tcph) (tcph->doff << 2) -#define IP_HDR_LEN(iph) (iph->ihl << 2) -#define TCP_PAYLOAD_LENGTH(iph, tcph) \ - (ntohs(iph->tot_len) - IP_HDR_LEN(iph) - TCP_HDR_LEN(tcph)) - -#define IPH_LEN_WO_OPTIONS 5 -#define TCPH_LEN_WO_OPTIONS 5 -#define TCPH_LEN_W_TIMESTAMP 8 - -#define LRO_MAX_PG_HLEN 64 - -#define LRO_INC_STATS(lro_mgr, attr) { lro_mgr->stats.attr++; } - -/* - * Basic tcp checks whether packet is suitable for LRO - */ - -static int lro_tcp_ip_check(const struct iphdr *iph, const struct tcphdr *tcph, - int len, const struct net_lro_desc *lro_desc) -{ - /* check ip header: don't aggregate padded frames */ - if (ntohs(iph->tot_len) != len) - return -1; - - if (TCP_PAYLOAD_LENGTH(iph, tcph) == 0) - return -1; - - if (iph->ihl != IPH_LEN_WO_OPTIONS) - return -1; - - if (tcph->cwr || tcph->ece || tcph->urg || !tcph->ack || - tcph->rst || tcph->syn || tcph->fin) - return -1; - - if (INET_ECN_is_ce(ipv4_get_dsfield(iph))) - return -1; - - if (tcph->doff != TCPH_LEN_WO_OPTIONS && - tcph->doff != TCPH_LEN_W_TIMESTAMP) - return -1; - - /* check tcp options (only timestamp allowed) */ - if (tcph->doff == TCPH_LEN_W_TIMESTAMP) { - __be32 *topt = (__be32 *)(tcph + 1); - - if (*topt != htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) - | (TCPOPT_TIMESTAMP << 8) - | TCPOLEN_TIMESTAMP)) - return -1; - - /* timestamp should be in right order */ - topt++; - if (lro_desc && after(ntohl(lro_desc->tcp_rcv_tsval), - ntohl(*topt))) - return -1; - - /* timestamp reply should not be zero */ - topt++; - if (*topt == 0) - return -1; - } - - return 0; -} - -static void lro_update_tcp_ip_header(struct net_lro_desc *lro_desc) -{ - struct iphdr *iph = lro_desc->iph; - struct tcphdr *tcph = lro_desc->tcph; - __be32 *p; - __wsum tcp_hdr_csum; - - tcph->ack_seq = lro_desc->tcp_ack; - tcph->window = lro_desc->tcp_window; - - if (lro_desc->tcp_saw_tstamp) { - p = (__be32 *)(tcph + 1); - *(p+2) = lro_desc->tcp_rcv_tsecr; - } - - csum_replace2(&iph->check, iph->tot_len, htons(lro_desc->ip_tot_len)); - iph->tot_len = htons(lro_desc->ip_tot_len); - - tcph->check = 0; - tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), 0); - lro_desc->data_csum = csum_add(lro_desc->data_csum, tcp_hdr_csum); - tcph->check = csum_tcpudp_magic(iph->saddr, iph->daddr, - lro_desc->ip_tot_len - - IP_HDR_LEN(iph), IPPROTO_TCP, - lro_desc->data_csum); -} - -static __wsum lro_tcp_data_csum(struct iphdr *iph, struct tcphdr *tcph, int len) -{ - __wsum tcp_csum; - __wsum tcp_hdr_csum; - __wsum tcp_ps_hdr_csum; - - tcp_csum = ~csum_unfold(tcph->check); - tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), tcp_csum); - - tcp_ps_hdr_csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, - len + TCP_HDR_LEN(tcph), - IPPROTO_TCP, 0); - - return csum_sub(csum_sub(tcp_csum, tcp_hdr_csum), - tcp_ps_hdr_csum); -} - -static void lro_init_desc(struct net_lro_desc *lro_desc, struct sk_buff *skb, - struct iphdr *iph, struct tcphdr *tcph) -{ - int nr_frags; - __be32 *ptr; - u32 tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph); - - nr_frags = skb_shinfo(skb)->nr_frags; - lro_desc->parent = skb; - lro_desc->next_frag = &(skb_shinfo(skb)->frags[nr_frags]); - lro_desc->iph = iph; - lro_desc->tcph = tcph; - lro_desc->tcp_next_seq = ntohl(tcph->seq) + tcp_data_len; - lro_desc->tcp_ack = tcph->ack_seq; - lro_desc->tcp_window = tcph->window; - - lro_desc->pkt_aggr_cnt = 1; - lro_desc->ip_tot_len = ntohs(iph->tot_len); - - if (tcph->doff == 8) { - ptr = (__be32 *)(tcph+1); - lro_desc->tcp_saw_tstamp = 1; - lro_desc->tcp_rcv_tsval = *(ptr+1); - lro_desc->tcp_rcv_tsecr = *(ptr+2); - } - - lro_desc->mss = tcp_data_len; - lro_desc->active = 1; - - lro_desc->data_csum = lro_tcp_data_csum(iph, tcph, - tcp_data_len); -} - -static inline void lro_clear_desc(struct net_lro_desc *lro_desc) -{ - memset(lro_desc, 0, sizeof(struct net_lro_desc)); -} - -static void lro_add_common(struct net_lro_desc *lro_desc, struct iphdr *iph, - struct tcphdr *tcph, int tcp_data_len) -{ - struct sk_buff *parent = lro_desc->parent; - __be32 *topt; - - lro_desc->pkt_aggr_cnt++; - lro_desc->ip_tot_len += tcp_data_len; - lro_desc->tcp_next_seq += tcp_data_len; - lro_desc->tcp_window = tcph->window; - lro_desc->tcp_ack = tcph->ack_seq; - - /* don't update tcp_rcv_tsval, would not work with PAWS */ - if (lro_desc->tcp_saw_tstamp) { - topt = (__be32 *) (tcph + 1); - lro_desc->tcp_rcv_tsecr = *(topt + 2); - } - - lro_desc->data_csum = csum_block_add(lro_desc->data_csum, - lro_tcp_data_csum(iph, tcph, - tcp_data_len), - parent->len); - - parent->len += tcp_data_len; - parent->data_len += tcp_data_len; - if (tcp_data_len > lro_desc->mss) - lro_desc->mss = tcp_data_len; -} - -static void lro_add_packet(struct net_lro_desc *lro_desc, struct sk_buff *skb, - struct iphdr *iph, struct tcphdr *tcph) -{ - struct sk_buff *parent = lro_desc->parent; - int tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph); - - lro_add_common(lro_desc, iph, tcph, tcp_data_len); - - skb_pull(skb, (skb->len - tcp_data_len)); - parent->truesize += skb->truesize; - - if (lro_desc->last_skb) - lro_desc->last_skb->next = skb; - else - skb_shinfo(parent)->frag_list = skb; - - lro_desc->last_skb = skb; -} - - -static int lro_check_tcp_conn(struct net_lro_desc *lro_desc, - struct iphdr *iph, - struct tcphdr *tcph) -{ - if ((lro_desc->iph->saddr != iph->saddr) || - (lro_desc->iph->daddr != iph->daddr) || - (lro_desc->tcph->source != tcph->source) || - (lro_desc->tcph->dest != tcph->dest)) - return -1; - return 0; -} - -static struct net_lro_desc *lro_get_desc(struct net_lro_mgr *lro_mgr, - struct net_lro_desc *lro_arr, - struct iphdr *iph, - struct tcphdr *tcph) -{ - struct net_lro_desc *lro_desc = NULL; - struct net_lro_desc *tmp; - int max_desc = lro_mgr->max_desc; - int i; - - for (i = 0; i < max_desc; i++) { - tmp = &lro_arr[i]; - if (tmp->active) - if (!lro_check_tcp_conn(tmp, iph, tcph)) { - lro_desc = tmp; - goto out; - } - } - - for (i = 0; i < max_desc; i++) { - if (!lro_arr[i].active) { - lro_desc = &lro_arr[i]; - goto out; - } - } - - LRO_INC_STATS(lro_mgr, no_desc); -out: - return lro_desc; -} - -static void lro_flush(struct net_lro_mgr *lro_mgr, - struct net_lro_desc *lro_desc) -{ - if (lro_desc->pkt_aggr_cnt > 1) - lro_update_tcp_ip_header(lro_desc); - - skb_shinfo(lro_desc->parent)->gso_size = lro_desc->mss; - - if (lro_mgr->features & LRO_F_NAPI) - netif_receive_skb(lro_desc->parent); - else - netif_rx(lro_desc->parent); - - LRO_INC_STATS(lro_mgr, flushed); - lro_clear_desc(lro_desc); -} - -static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb, - void *priv) -{ - struct net_lro_desc *lro_desc; - struct iphdr *iph; - struct tcphdr *tcph; - u64 flags; - int vlan_hdr_len = 0; - - if (!lro_mgr->get_skb_header || - lro_mgr->get_skb_header(skb, (void *)&iph, (void *)&tcph, - &flags, priv)) - goto out; - - if (!(flags & LRO_IPV4) || !(flags & LRO_TCP)) - goto out; - - lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph); - if (!lro_desc) - goto out; - - if ((skb->protocol == htons(ETH_P_8021Q)) && - !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID)) - vlan_hdr_len = VLAN_HLEN; - - if (!lro_desc->active) { /* start new lro session */ - if (lro_tcp_ip_check(iph, tcph, skb->len - vlan_hdr_len, NULL)) - goto out; - - skb->ip_summed = lro_mgr->ip_summed_aggr; - lro_init_desc(lro_desc, skb, iph, tcph); - LRO_INC_STATS(lro_mgr, aggregated); - return 0; - } - - if (lro_desc->tcp_next_seq != ntohl(tcph->seq)) - goto out2; - - if (lro_tcp_ip_check(iph, tcph, skb->len, lro_desc)) - goto out2; - - lro_add_packet(lro_desc, skb, iph, tcph); - LRO_INC_STATS(lro_mgr, aggregated); - - if ((lro_desc->pkt_aggr_cnt >= lro_mgr->max_aggr) || - lro_desc->parent->len > (0xFFFF - lro_mgr->dev->mtu)) - lro_flush(lro_mgr, lro_desc); - - return 0; - -out2: /* send aggregated SKBs to stack */ - lro_flush(lro_mgr, lro_desc); - -out: - return 1; -} - -void lro_receive_skb(struct net_lro_mgr *lro_mgr, - struct sk_buff *skb, - void *priv) -{ - if (__lro_proc_skb(lro_mgr, skb, priv)) { - if (lro_mgr->features & LRO_F_NAPI) - netif_receive_skb(skb); - else - netif_rx(skb); - } -} -EXPORT_SYMBOL(lro_receive_skb); - -void lro_flush_all(struct net_lro_mgr *lro_mgr) -{ - int i; - struct net_lro_desc *lro_desc = lro_mgr->lro_arr; - - for (i = 0; i < lro_mgr->max_desc; i++) { - if (lro_desc[i].active) - lro_flush(lro_mgr, &lro_desc[i]); - } -} -EXPORT_SYMBOL(lro_flush_all); diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index da0d7ce85..af18f1e48 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -71,7 +71,6 @@ static int ip_forward_finish(struct net *net, struct sock *sk, struct sk_buff *s if (unlikely(opt->optlen)) ip_forward_options(skb); - skb_sender_cpu_clear(skb); return dst_output(net, sk, skb); } diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 187c6fcc3..efbd47d1a 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -54,8 +54,6 @@ * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c * as well. Or notify me, at least. --ANK */ - -static int sysctl_ipfrag_max_dist __read_mostly = 64; static const char ip_frag_cache_name[] = "ip4-frags"; struct ipfrag_skb_cb @@ -150,7 +148,7 @@ static void ip4_frag_init(struct inet_frag_queue *q, const void *a) qp->daddr = arg->iph->daddr; qp->vif = arg->vif; qp->user = arg->user; - qp->peer = sysctl_ipfrag_max_dist ? + qp->peer = q->net->max_dist ? inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, arg->vif, 1) : NULL; } @@ -275,7 +273,7 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph, static int ip_frag_too_far(struct ipq *qp) { struct inet_peer *peer = qp->peer; - unsigned int max = sysctl_ipfrag_max_dist; + unsigned int max = qp->q.net->max_dist; unsigned int start, end; int rc; @@ -749,6 +747,14 @@ static struct ctl_table ip4_frags_ns_ctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, + { + .procname = "ipfrag_max_dist", + .data = &init_net.ipv4.frags.max_dist, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero + }, { } }; @@ -762,14 +768,6 @@ static struct ctl_table ip4_frags_ctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { - .procname = "ipfrag_max_dist", - .data = &sysctl_ipfrag_max_dist, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero - }, { } }; @@ -790,10 +788,7 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net) table[1].data = &net->ipv4.frags.low_thresh; table[1].extra2 = &net->ipv4.frags.high_thresh; table[2].data = &net->ipv4.frags.timeout; - - /* Don't export sysctls to unprivileged users */ - if (net->user_ns != &init_user_ns) - table[0].procname = NULL; + table[3].data = &net->ipv4.frags.max_dist; } hdr = register_net_sysctl(net, "net/ipv4", table); @@ -865,6 +860,8 @@ static int __net_init ipv4_frags_init_net(struct net *net) */ net->ipv4.frags.timeout = IP_FRAG_TIME; + net->ipv4.frags.max_dist = 64; + res = inet_frags_init_net(&net->ipv4.frags); if (res) return res; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 41ba68de4..4cc84212c 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -179,6 +179,7 @@ static __be16 tnl_flags_to_gre_flags(__be16 tflags) return flags; } +/* Fills in tpi and returns header length to be pulled. */ static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, bool *csum_err) { @@ -238,7 +239,7 @@ static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, return -EINVAL; } } - return iptunnel_pull_header(skb, hdr_len, tpi->proto); + return hdr_len; } static void ipgre_err(struct sk_buff *skb, u32 info, @@ -341,7 +342,7 @@ static void gre_err(struct sk_buff *skb, u32 info) struct tnl_ptk_info tpi; bool csum_err = false; - if (parse_gre_header(skb, &tpi, &csum_err)) { + if (parse_gre_header(skb, &tpi, &csum_err) < 0) { if (!csum_err) /* ignore csum errors. */ return; } @@ -397,7 +398,10 @@ static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) iph->saddr, iph->daddr, tpi->key); if (tunnel) { - skb_pop_mac_header(skb); + if (tunnel->dev->type != ARPHRD_NONE) + skb_pop_mac_header(skb); + else + skb_reset_mac_header(skb); if (tunnel->collect_md) { __be16 flags; __be64 tun_id; @@ -419,6 +423,7 @@ static int gre_rcv(struct sk_buff *skb) { struct tnl_ptk_info tpi; bool csum_err = false; + int hdr_len; #ifdef CONFIG_NET_IPGRE_BROADCAST if (ipv4_is_multicast(ip_hdr(skb)->daddr)) { @@ -428,7 +433,10 @@ static int gre_rcv(struct sk_buff *skb) } #endif - if (parse_gre_header(skb, &tpi, &csum_err) < 0) + hdr_len = parse_gre_header(skb, &tpi, &csum_err); + if (hdr_len < 0) + goto drop; + if (iptunnel_pull_header(skb, hdr_len, tpi.proto, false) < 0) goto drop; if (ipgre_rcv(skb, &tpi) == PACKET_RCVD) @@ -440,6 +448,17 @@ drop: return 0; } +static __sum16 gre_checksum(struct sk_buff *skb) +{ + __wsum csum; + + if (skb->ip_summed == CHECKSUM_PARTIAL) + csum = lco_csum(skb); + else + csum = skb_checksum(skb, 0, skb->len, 0); + return csum_fold(csum); +} + static void build_header(struct sk_buff *skb, int hdr_len, __be16 flags, __be16 proto, __be32 key, __be32 seq) { @@ -467,8 +486,7 @@ static void build_header(struct sk_buff *skb, int hdr_len, __be16 flags, !(skb_shinfo(skb)->gso_type & (SKB_GSO_GRE | SKB_GSO_GRE_CSUM))) { *ptr = 0; - *(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0, - skb->len, 0)); + *(__sum16 *)ptr = gre_checksum(skb); } } } @@ -493,8 +511,7 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, static struct sk_buff *gre_handle_offloads(struct sk_buff *skb, bool csum) { - return iptunnel_handle_offloads(skb, csum, - csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE); + return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE); } static struct rtable *gre_get_rt(struct sk_buff *skb, @@ -514,15 +531,17 @@ static struct rtable *gre_get_rt(struct sk_buff *skb, return ip_route_output_key(net, fl); } -static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev) +static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, + __be16 proto) { struct ip_tunnel_info *tun_info; const struct ip_tunnel_key *key; + struct rtable *rt = NULL; struct flowi4 fl; - struct rtable *rt; int min_headroom; int tunnel_hlen; __be16 df, flags; + bool use_cache; int err; tun_info = skb_tunnel_info(skb); @@ -531,9 +550,17 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev) goto err_free_skb; key = &tun_info->key; - rt = gre_get_rt(skb, dev, &fl, key); - if (IS_ERR(rt)) - goto err_free_skb; + use_cache = ip_tunnel_dst_cache_usable(skb, tun_info); + if (use_cache) + rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl.saddr); + if (!rt) { + rt = gre_get_rt(skb, dev, &fl, key); + if (IS_ERR(rt)) + goto err_free_skb; + if (use_cache) + dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst, + fl.saddr); + } tunnel_hlen = ip_gre_calc_hlen(key->tun_flags); @@ -557,7 +584,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev) } flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY); - build_header(skb, tunnel_hlen, flags, htons(ETH_P_TEB), + build_header(skb, tunnel_hlen, flags, proto, tunnel_id_to_key(tun_info->key.tun_id), 0); df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; @@ -598,7 +625,7 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb, const struct iphdr *tnl_params; if (tunnel->collect_md) { - gre_fb_xmit(skb, dev); + gre_fb_xmit(skb, dev, skb->protocol); return NETDEV_TX_OK; } @@ -642,7 +669,7 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb, struct ip_tunnel *tunnel = netdev_priv(dev); if (tunnel->collect_md) { - gre_fb_xmit(skb, dev); + gre_fb_xmit(skb, dev, htons(ETH_P_TEB)); return NETDEV_TX_OK; } @@ -844,9 +871,16 @@ static void __gre_tunnel_init(struct net_device *dev) dev->hw_features |= GRE_FEATURES; if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) { - /* TCP offload with GRE SEQ is not supported. */ - dev->features |= NETIF_F_GSO_SOFTWARE; - dev->hw_features |= NETIF_F_GSO_SOFTWARE; + /* TCP offload with GRE SEQ is not supported, nor + * can we support 2 levels of outer headers requiring + * an update. + */ + if (!(tunnel->parms.o_flags & TUNNEL_CSUM) || + (tunnel->encap.type == TUNNEL_ENCAP_NONE)) { + dev->features |= NETIF_F_GSO_SOFTWARE; + dev->hw_features |= NETIF_F_GSO_SOFTWARE; + } + /* Can use a lockless transmit, unless we generate * output sequences */ @@ -868,7 +902,7 @@ static int ipgre_tunnel_init(struct net_device *dev) netif_keep_dst(dev); dev->addr_len = 4; - if (iph->daddr) { + if (iph->daddr && !tunnel->collect_md) { #ifdef CONFIG_NET_IPGRE_BROADCAST if (ipv4_is_multicast(iph->daddr)) { if (!iph->saddr) @@ -877,8 +911,9 @@ static int ipgre_tunnel_init(struct net_device *dev) dev->header_ops = &ipgre_header_ops; } #endif - } else + } else if (!tunnel->collect_md) { dev->header_ops = &ipgre_header_ops; + } return ip_tunnel_init(dev); } @@ -921,6 +956,11 @@ static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) if (flags & (GRE_VERSION|GRE_ROUTING)) return -EINVAL; + if (data[IFLA_GRE_COLLECT_METADATA] && + data[IFLA_GRE_ENCAP_TYPE] && + nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]) != TUNNEL_ENCAP_NONE) + return -EINVAL; + return 0; } @@ -994,6 +1034,8 @@ static void ipgre_netlink_parms(struct net_device *dev, struct ip_tunnel *t = netdev_priv(dev); t->collect_md = true; + if (dev->type == ARPHRD_IPGRE) + dev->type = ARPHRD_NONE; } } diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index d77eb0c3b..e3d782746 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -308,15 +308,12 @@ drop: return true; } -int sysctl_ip_early_demux __read_mostly = 1; -EXPORT_SYMBOL(sysctl_ip_early_demux); - static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; - if (sysctl_ip_early_demux && + if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && !skb->sk && !ip_is_fragment(iph)) { @@ -362,8 +359,31 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) rt = skb_rtable(skb); if (rt->rt_type == RTN_MULTICAST) { IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INMCAST, skb->len); - } else if (rt->rt_type == RTN_BROADCAST) + } else if (rt->rt_type == RTN_BROADCAST) { IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INBCAST, skb->len); + } else if (skb->pkt_type == PACKET_BROADCAST || + skb->pkt_type == PACKET_MULTICAST) { + struct in_device *in_dev = __in_dev_get_rcu(skb->dev); + + /* RFC 1122 3.3.6: + * + * When a host sends a datagram to a link-layer broadcast + * address, the IP destination address MUST be a legal IP + * broadcast or IP multicast address. + * + * A host SHOULD silently discard a datagram that is received + * via a link-layer broadcast (see Section 2.4) but does not + * specify an IP multicast or broadcast destination address. + * + * This doesn't explicitly say L2 *broadcast*, but broadcast is + * in a way a form of multicast and the most common use case for + * this is 802.11 protecting against cross-station spoofing (the + * so-called "hole-196" attack) so do it for both. + */ + if (in_dev && + IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST)) + goto drop; + } return dst_input(skb); diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index bd2467923..4d158ff1d 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -58,10 +58,9 @@ void ip_options_build(struct sk_buff *skb, struct ip_options *opt, if (opt->ts_needaddr) ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, skb, rt); if (opt->ts_needtime) { - struct timespec tv; __be32 midtime; - getnstimeofday(&tv); - midtime = htonl((tv.tv_sec % 86400) * MSEC_PER_SEC + tv.tv_nsec / NSEC_PER_MSEC); + + midtime = inet_current_timestamp(); memcpy(iph+opt->ts+iph[opt->ts+2]-5, &midtime, 4); } return; @@ -415,11 +414,10 @@ int ip_options_compile(struct net *net, break; } if (timeptr) { - struct timespec tv; - u32 midtime; - getnstimeofday(&tv); - midtime = (tv.tv_sec % 86400) * MSEC_PER_SEC + tv.tv_nsec / NSEC_PER_MSEC; - put_unaligned_be32(midtime, timeptr); + __be32 midtime; + + midtime = inet_current_timestamp(); + memcpy(timeptr, &midtime, 4); opt->is_changed = 1; } } else if ((optptr[3]&0xF) != IPOPT_TS_PRESPEC) { diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 565bf64b2..124bf0a66 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -79,9 +79,6 @@ #include <linux/netlink.h> #include <linux/tcp.h> -int sysctl_ip_default_ttl __read_mostly = IPDEFTTL; -EXPORT_SYMBOL(sysctl_ip_default_ttl); - static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned int mtu, diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index a50124260..035ad645a 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -573,6 +573,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen) { struct inet_sock *inet = inet_sk(sk); + struct net *net = sock_net(sk); int val = 0, err; bool needs_rtnl = setsockopt_needs_rtnl(optname); @@ -912,7 +913,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, } /* numsrc >= (1G-4) overflow in 32 bits */ if (msf->imsf_numsrc >= 0x3ffffffcU || - msf->imsf_numsrc > sysctl_igmp_max_msf) { + msf->imsf_numsrc > net->ipv4.sysctl_igmp_max_msf) { kfree(msf); err = -ENOBUFS; break; @@ -1067,7 +1068,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, /* numsrc >= (4G-140)/128 overflow in 32 bits */ if (gsf->gf_numsrc >= 0x1ffffff || - gsf->gf_numsrc > sysctl_igmp_max_msf) { + gsf->gf_numsrc > net->ipv4.sysctl_igmp_max_msf) { err = -ENOBUFS; goto mc_msf_out; } @@ -1342,10 +1343,13 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, val = inet->tos; break; case IP_TTL: + { + struct net *net = sock_net(sk); val = (inet->uc_ttl == -1 ? - sysctl_ip_default_ttl : + net->ipv4.sysctl_ip_default_ttl : inet->uc_ttl); break; + } case IP_HDRINCL: val = inet->hdrincl; break; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 336e6892a..a69ed94bd 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -68,61 +68,6 @@ static unsigned int ip_tunnel_hash(__be32 key, __be32 remote) IP_TNL_HASH_BITS); } -static void __tunnel_dst_set(struct ip_tunnel_dst *idst, - struct dst_entry *dst, __be32 saddr) -{ - struct dst_entry *old_dst; - - dst_clone(dst); - old_dst = xchg((__force struct dst_entry **)&idst->dst, dst); - dst_release(old_dst); - idst->saddr = saddr; -} - -static noinline void tunnel_dst_set(struct ip_tunnel *t, - struct dst_entry *dst, __be32 saddr) -{ - __tunnel_dst_set(raw_cpu_ptr(t->dst_cache), dst, saddr); -} - -static void tunnel_dst_reset(struct ip_tunnel *t) -{ - tunnel_dst_set(t, NULL, 0); -} - -void ip_tunnel_dst_reset_all(struct ip_tunnel *t) -{ - int i; - - for_each_possible_cpu(i) - __tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL, 0); -} -EXPORT_SYMBOL(ip_tunnel_dst_reset_all); - -static struct rtable *tunnel_rtable_get(struct ip_tunnel *t, - u32 cookie, __be32 *saddr) -{ - struct ip_tunnel_dst *idst; - struct dst_entry *dst; - - rcu_read_lock(); - idst = raw_cpu_ptr(t->dst_cache); - dst = rcu_dereference(idst->dst); - if (dst && !atomic_inc_not_zero(&dst->__refcnt)) - dst = NULL; - if (dst) { - if (!dst->obsolete || dst->ops->check(dst, cookie)) { - *saddr = idst->saddr; - } else { - tunnel_dst_reset(t); - dst_release(dst); - dst = NULL; - } - } - rcu_read_unlock(); - return (struct rtable *)dst; -} - static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p, __be16 flags, __be32 key) { @@ -381,11 +326,12 @@ static int ip_tunnel_bind_dev(struct net_device *dev) if (!IS_ERR(rt)) { tdev = rt->dst.dev; - tunnel_dst_set(tunnel, &rt->dst, fl4.saddr); ip_rt_put(rt); } if (dev->type != ARPHRD_ETHER) dev->flags |= IFF_POINTOPOINT; + + dst_cache_reset(&tunnel->dst_cache); } if (!tdev && tunnel->parms.link) @@ -731,7 +677,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0) goto tx_error; - rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL; + rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) : + NULL; if (!rt) { rt = ip_route_output_key(tunnel->net, &fl4); @@ -741,7 +688,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, goto tx_error; } if (connected) - tunnel_dst_set(tunnel, &rt->dst, fl4.saddr); + dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst, + fl4.saddr); } if (rt->dst.dev == dev) { @@ -837,7 +785,7 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn, if (set_mtu) dev->mtu = mtu; } - ip_tunnel_dst_reset_all(t); + dst_cache_reset(&t->dst_cache); netdev_state_change(dev); } @@ -976,7 +924,7 @@ static void ip_tunnel_dev_free(struct net_device *dev) struct ip_tunnel *tunnel = netdev_priv(dev); gro_cells_destroy(&tunnel->gro_cells); - free_percpu(tunnel->dst_cache); + dst_cache_destroy(&tunnel->dst_cache); free_percpu(dev->tstats); free_netdev(dev); } @@ -1170,15 +1118,15 @@ int ip_tunnel_init(struct net_device *dev) if (!dev->tstats) return -ENOMEM; - tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst); - if (!tunnel->dst_cache) { + err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); + if (err) { free_percpu(dev->tstats); - return -ENOMEM; + return err; } err = gro_cells_init(&tunnel->gro_cells, dev); if (err) { - free_percpu(tunnel->dst_cache); + dst_cache_destroy(&tunnel->dst_cache); free_percpu(dev->tstats); return err; } @@ -1208,7 +1156,7 @@ void ip_tunnel_uninit(struct net_device *dev) if (itn->fb_tunnel_dev != dev) ip_tunnel_del(itn, netdev_priv(dev)); - ip_tunnel_dst_reset_all(tunnel); + dst_cache_reset(&tunnel->dst_cache); } EXPORT_SYMBOL_GPL(ip_tunnel_uninit); diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 859d415c0..6165f30c4 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -86,7 +86,8 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, } EXPORT_SYMBOL_GPL(iptunnel_xmit); -int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto) +int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto, + bool xnet) { if (unlikely(!pskb_may_pull(skb, hdr_len))) return -ENOMEM; @@ -109,14 +110,12 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto) skb->protocol = inner_proto; } - nf_reset(skb); - secpath_reset(skb); skb_clear_hash_if_not_l4(skb); - skb_dst_drop(skb); skb->vlan_tci = 0; skb_set_queue_mapping(skb, 0); - skb->pkt_type = PACKET_HOST; - return 0; + skb_scrub_packet(skb, xnet); + + return iptunnel_pull_offloads(skb); } EXPORT_SYMBOL_GPL(iptunnel_pull_header); @@ -148,7 +147,6 @@ struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, EXPORT_SYMBOL_GPL(iptunnel_metadata_reply); struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, - bool csum_help, int gso_type_mask) { int err; @@ -166,20 +164,15 @@ struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, return skb; } - /* If packet is not gso and we are resolving any partial checksum, - * clear encapsulation flag. This allows setting CHECKSUM_PARTIAL - * on the outer header without confusing devices that implement - * NETIF_F_IP_CSUM with encapsulation. - */ - if (csum_help) - skb->encapsulation = 0; - - if (skb->ip_summed == CHECKSUM_PARTIAL && csum_help) { - err = skb_checksum_help(skb); - if (unlikely(err)) - goto error; - } else if (skb->ip_summed != CHECKSUM_PARTIAL) + if (skb->ip_summed != CHECKSUM_PARTIAL) { skb->ip_summed = CHECKSUM_NONE; + /* We clear encapsulation here to prevent badly-written + * drivers potentially deciding to offload an inner checksum + * if we set CHECKSUM_PARTIAL on the outer header. + * This should go away when the drivers are all fixed. + */ + skb->encapsulation = 0; + } return skb; error: @@ -379,8 +372,8 @@ static int ip6_tun_fill_encap_info(struct sk_buff *skb, if (nla_put_be64(skb, LWTUNNEL_IP6_ID, tun_info->key.tun_id) || nla_put_in6_addr(skb, LWTUNNEL_IP6_DST, &tun_info->key.u.ipv6.dst) || nla_put_in6_addr(skb, LWTUNNEL_IP6_SRC, &tun_info->key.u.ipv6.src) || - nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.tos) || - nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.ttl) || + nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.tos) || + nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.ttl) || nla_put_be16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags)) return -ENOMEM; @@ -406,6 +399,12 @@ static const struct lwtunnel_encap_ops ip6_tun_lwt_ops = { void __init ip_tunnel_core_init(void) { + /* If you land here, make sure whether increasing ip_tunnel_info's + * options_len is a reasonable choice with its usage in front ends + * (f.e., it's part of flow keys, etc). + */ + BUILD_BUG_ON(IP_TUNNEL_OPTS_MAX != 255); + lwtunnel_encap_add_ops(&ip_tun_lwt_ops, LWTUNNEL_ENCAP_IP); lwtunnel_encap_add_ops(&ip6_tun_lwt_ops, LWTUNNEL_ENCAP_IP6); } diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 5cf10b777..a917903d5 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -156,6 +156,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, struct dst_entry *dst = skb_dst(skb); struct net_device *tdev; /* Device to other host */ int err; + int mtu; if (!dst) { dev->stats.tx_carrier_errors++; @@ -192,6 +193,23 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, tunnel->err_count = 0; } + mtu = dst_mtu(dst); + if (skb->len > mtu) { + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); + if (skb->protocol == htons(ETH_P_IP)) { + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(mtu)); + } else { + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + } + + dst_release(dst); + goto tx_error; + } + skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(dev))); skb_dst_set(skb, dst); skb->dev = skb_dst(skb)->dev; diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 4044da61e..ec51d0216 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -195,7 +195,7 @@ static int ipip_rcv(struct sk_buff *skb) if (tunnel) { if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; - if (iptunnel_pull_header(skb, 0, tpi.proto)) + if (iptunnel_pull_header(skb, 0, tpi.proto, false)) goto drop; return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, log_ecn_error); } @@ -219,7 +219,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) if (unlikely(skb->protocol != htons(ETH_P_IP))) goto tx_error; - skb = iptunnel_handle_offloads(skb, false, SKB_GSO_IPIP); + skb = iptunnel_handle_offloads(skb, SKB_GSO_IPIP); if (IS_ERR(skb)) goto out; diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index b488cac9c..4133b0f51 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -359,11 +359,12 @@ unsigned int arpt_do_table(struct sk_buff *skb, } /* All zeroes == unconditional rule. */ -static inline bool unconditional(const struct arpt_arp *arp) +static inline bool unconditional(const struct arpt_entry *e) { static const struct arpt_arp uncond; - return memcmp(arp, &uncond, sizeof(uncond)) == 0; + return e->target_offset == sizeof(struct arpt_entry) && + memcmp(&e->arp, &uncond, sizeof(uncond)) == 0; } /* Figures out from what hook each rule can be called: returns 0 if @@ -402,11 +403,10 @@ static int mark_source_chains(const struct xt_table_info *newinfo, |= ((1 << hook) | (1 << NF_ARP_NUMHOOKS)); /* Unconditional return/END. */ - if ((e->target_offset == sizeof(struct arpt_entry) && + if ((unconditional(e) && (strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0) && - t->verdict < 0 && unconditional(&e->arp)) || - visited) { + t->verdict < 0) || visited) { unsigned int oldpos, size; if ((strcmp(t->target.u.user.name, @@ -474,14 +474,12 @@ next: return 1; } -static inline int check_entry(const struct arpt_entry *e, const char *name) +static inline int check_entry(const struct arpt_entry *e) { const struct xt_entry_target *t; - if (!arp_checkentry(&e->arp)) { - duprintf("arp_tables: arp check failed %p %s.\n", e, name); + if (!arp_checkentry(&e->arp)) return -EINVAL; - } if (e->target_offset + sizeof(struct xt_entry_target) > e->next_offset) return -EINVAL; @@ -522,10 +520,6 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size) struct xt_target *target; int ret; - ret = check_entry(e, name); - if (ret) - return ret; - e->counters.pcnt = xt_percpu_counter_alloc(); if (IS_ERR_VALUE(e->counters.pcnt)) return -ENOMEM; @@ -557,7 +551,7 @@ static bool check_underflow(const struct arpt_entry *e) const struct xt_entry_target *t; unsigned int verdict; - if (!unconditional(&e->arp)) + if (!unconditional(e)) return false; t = arpt_get_target_c(e); if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) @@ -576,9 +570,11 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, unsigned int valid_hooks) { unsigned int h; + int err; if ((unsigned long)e % __alignof__(struct arpt_entry) != 0 || - (unsigned char *)e + sizeof(struct arpt_entry) >= limit) { + (unsigned char *)e + sizeof(struct arpt_entry) >= limit || + (unsigned char *)e + e->next_offset > limit) { duprintf("Bad offset %p\n", e); return -EINVAL; } @@ -590,6 +586,10 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, return -EINVAL; } + err = check_entry(e); + if (err) + return err; + /* Check hooks & underflows */ for (h = 0; h < NF_ARP_NUMHOOKS; h++) { if (!(valid_hooks & (1 << h))) @@ -598,9 +598,9 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, newinfo->hook_entry[h] = hook_entries[h]; if ((unsigned char *)e - base == underflows[h]) { if (!check_underflow(e)) { - pr_err("Underflows must be unconditional and " - "use the STANDARD target with " - "ACCEPT/DROP\n"); + pr_debug("Underflows must be unconditional and " + "use the STANDARD target with " + "ACCEPT/DROP\n"); return -EINVAL; } newinfo->underflow[h] = underflows[h]; @@ -969,6 +969,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr, sizeof(struct arpt_get_entries) + get.size); return -EINVAL; } + get.name[sizeof(get.name) - 1] = '\0'; t = xt_find_table_lock(net, NFPROTO_ARP, get.name); if (!IS_ERR_OR_NULL(t)) { @@ -1233,7 +1234,8 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e, duprintf("check_compat_entry_size_and_hooks %p\n", e); if ((unsigned long)e % __alignof__(struct compat_arpt_entry) != 0 || - (unsigned char *)e + sizeof(struct compat_arpt_entry) >= limit) { + (unsigned char *)e + sizeof(struct compat_arpt_entry) >= limit || + (unsigned char *)e + e->next_offset > limit) { duprintf("Bad offset %p, limit = %p\n", e, limit); return -EINVAL; } @@ -1246,7 +1248,7 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e, } /* For purposes of check_entry casting the compat entry is fine */ - ret = check_entry((struct arpt_entry *)e, name); + ret = check_entry((struct arpt_entry *)e); if (ret) return ret; @@ -1662,6 +1664,7 @@ static int compat_get_entries(struct net *net, *len, sizeof(get) + get.size); return -EINVAL; } + get.name[sizeof(get.name) - 1] = '\0'; xt_compat_lock(NFPROTO_ARP); t = xt_find_table_lock(net, NFPROTO_ARP, get.name); @@ -1780,9 +1783,29 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len return ret; } -struct xt_table *arpt_register_table(struct net *net, - const struct xt_table *table, - const struct arpt_replace *repl) +static void __arpt_unregister_table(struct xt_table *table) +{ + struct xt_table_info *private; + void *loc_cpu_entry; + struct module *table_owner = table->me; + struct arpt_entry *iter; + + private = xt_unregister_table(table); + + /* Decrease module usage counts and free resources */ + loc_cpu_entry = private->entries; + xt_entry_foreach(iter, loc_cpu_entry, private->size) + cleanup_entry(iter); + if (private->number > private->initial_entries) + module_put(table_owner); + xt_free_table_info(private); +} + +int arpt_register_table(struct net *net, + const struct xt_table *table, + const struct arpt_replace *repl, + const struct nf_hook_ops *ops, + struct xt_table **res) { int ret; struct xt_table_info *newinfo; @@ -1791,10 +1814,8 @@ struct xt_table *arpt_register_table(struct net *net, struct xt_table *new_table; newinfo = xt_alloc_table_info(repl->size); - if (!newinfo) { - ret = -ENOMEM; - goto out; - } + if (!newinfo) + return -ENOMEM; loc_cpu_entry = newinfo->entries; memcpy(loc_cpu_entry, repl->entries, repl->size); @@ -1809,30 +1830,28 @@ struct xt_table *arpt_register_table(struct net *net, ret = PTR_ERR(new_table); goto out_free; } - return new_table; + + /* set res now, will see skbs right after nf_register_net_hooks */ + WRITE_ONCE(*res, new_table); + + ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks)); + if (ret != 0) { + __arpt_unregister_table(new_table); + *res = NULL; + } + + return ret; out_free: xt_free_table_info(newinfo); -out: - return ERR_PTR(ret); + return ret; } -void arpt_unregister_table(struct xt_table *table) +void arpt_unregister_table(struct net *net, struct xt_table *table, + const struct nf_hook_ops *ops) { - struct xt_table_info *private; - void *loc_cpu_entry; - struct module *table_owner = table->me; - struct arpt_entry *iter; - - private = xt_unregister_table(table); - - /* Decrease module usage counts and free resources */ - loc_cpu_entry = private->entries; - xt_entry_foreach(iter, loc_cpu_entry, private->size) - cleanup_entry(iter); - if (private->number > private->initial_entries) - module_put(table_owner); - xt_free_table_info(private); + nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); + __arpt_unregister_table(table); } /* The built-in targets: standard (NULL) and error. */ diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index 1897ee160..8f8713b43 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c @@ -17,12 +17,15 @@ MODULE_DESCRIPTION("arptables filter table"); #define FILTER_VALID_HOOKS ((1 << NF_ARP_IN) | (1 << NF_ARP_OUT) | \ (1 << NF_ARP_FORWARD)) +static int __net_init arptable_filter_table_init(struct net *net); + static const struct xt_table packet_filter = { .name = "filter", .valid_hooks = FILTER_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_ARP, .priority = NF_IP_PRI_FILTER, + .table_init = arptable_filter_table_init, }; /* The work comes in here from netfilter.c */ @@ -35,26 +38,32 @@ arptable_filter_hook(void *priv, struct sk_buff *skb, static struct nf_hook_ops *arpfilter_ops __read_mostly; -static int __net_init arptable_filter_net_init(struct net *net) +static int __net_init arptable_filter_table_init(struct net *net) { struct arpt_replace *repl; - + int err; + + if (net->ipv4.arptable_filter) + return 0; + repl = arpt_alloc_initial_table(&packet_filter); if (repl == NULL) return -ENOMEM; - net->ipv4.arptable_filter = - arpt_register_table(net, &packet_filter, repl); + err = arpt_register_table(net, &packet_filter, repl, arpfilter_ops, + &net->ipv4.arptable_filter); kfree(repl); - return PTR_ERR_OR_ZERO(net->ipv4.arptable_filter); + return err; } static void __net_exit arptable_filter_net_exit(struct net *net) { - arpt_unregister_table(net->ipv4.arptable_filter); + if (!net->ipv4.arptable_filter) + return; + arpt_unregister_table(net, net->ipv4.arptable_filter, arpfilter_ops); + net->ipv4.arptable_filter = NULL; } static struct pernet_operations arptable_filter_net_ops = { - .init = arptable_filter_net_init, .exit = arptable_filter_net_exit, }; @@ -62,26 +71,29 @@ static int __init arptable_filter_init(void) { int ret; + arpfilter_ops = xt_hook_ops_alloc(&packet_filter, arptable_filter_hook); + if (IS_ERR(arpfilter_ops)) + return PTR_ERR(arpfilter_ops); + ret = register_pernet_subsys(&arptable_filter_net_ops); - if (ret < 0) + if (ret < 0) { + kfree(arpfilter_ops); return ret; + } - arpfilter_ops = xt_hook_link(&packet_filter, arptable_filter_hook); - if (IS_ERR(arpfilter_ops)) { - ret = PTR_ERR(arpfilter_ops); - goto cleanup_table; + ret = arptable_filter_table_init(&init_net); + if (ret) { + unregister_pernet_subsys(&arptable_filter_net_ops); + kfree(arpfilter_ops); } - return ret; -cleanup_table: - unregister_pernet_subsys(&arptable_filter_net_ops); return ret; } static void __exit arptable_filter_fini(void) { - xt_hook_unlink(&packet_filter, arpfilter_ops); unregister_pernet_subsys(&arptable_filter_net_ops); + kfree(arpfilter_ops); } module_init(arptable_filter_init); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index b99affad6..631c100a1 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -168,11 +168,12 @@ get_entry(const void *base, unsigned int offset) /* All zeroes == unconditional rule. */ /* Mildly perf critical (only if packet tracing is on) */ -static inline bool unconditional(const struct ipt_ip *ip) +static inline bool unconditional(const struct ipt_entry *e) { static const struct ipt_ip uncond; - return memcmp(ip, &uncond, sizeof(uncond)) == 0; + return e->target_offset == sizeof(struct ipt_entry) && + memcmp(&e->ip, &uncond, sizeof(uncond)) == 0; #undef FWINV } @@ -229,11 +230,10 @@ get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e, } else if (s == e) { (*rulenum)++; - if (s->target_offset == sizeof(struct ipt_entry) && + if (unconditional(s) && strcmp(t->target.u.kernel.target->name, XT_STANDARD_TARGET) == 0 && - t->verdict < 0 && - unconditional(&s->ip)) { + t->verdict < 0) { /* Tail of chains: STANDARD target (return/policy) */ *comment = *chainname == hookname ? comments[NF_IP_TRACE_COMMENT_POLICY] @@ -476,11 +476,10 @@ mark_source_chains(const struct xt_table_info *newinfo, e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS)); /* Unconditional return/END. */ - if ((e->target_offset == sizeof(struct ipt_entry) && + if ((unconditional(e) && (strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0) && - t->verdict < 0 && unconditional(&e->ip)) || - visited) { + t->verdict < 0) || visited) { unsigned int oldpos, size; if ((strcmp(t->target.u.user.name, @@ -569,14 +568,12 @@ static void cleanup_match(struct xt_entry_match *m, struct net *net) } static int -check_entry(const struct ipt_entry *e, const char *name) +check_entry(const struct ipt_entry *e) { const struct xt_entry_target *t; - if (!ip_checkentry(&e->ip)) { - duprintf("ip check failed %p %s.\n", e, name); + if (!ip_checkentry(&e->ip)) return -EINVAL; - } if (e->target_offset + sizeof(struct xt_entry_target) > e->next_offset) @@ -666,10 +663,6 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name, struct xt_mtchk_param mtpar; struct xt_entry_match *ematch; - ret = check_entry(e, name); - if (ret) - return ret; - e->counters.pcnt = xt_percpu_counter_alloc(); if (IS_ERR_VALUE(e->counters.pcnt)) return -ENOMEM; @@ -721,7 +714,7 @@ static bool check_underflow(const struct ipt_entry *e) const struct xt_entry_target *t; unsigned int verdict; - if (!unconditional(&e->ip)) + if (!unconditional(e)) return false; t = ipt_get_target_c(e); if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) @@ -741,9 +734,11 @@ check_entry_size_and_hooks(struct ipt_entry *e, unsigned int valid_hooks) { unsigned int h; + int err; if ((unsigned long)e % __alignof__(struct ipt_entry) != 0 || - (unsigned char *)e + sizeof(struct ipt_entry) >= limit) { + (unsigned char *)e + sizeof(struct ipt_entry) >= limit || + (unsigned char *)e + e->next_offset > limit) { duprintf("Bad offset %p\n", e); return -EINVAL; } @@ -755,6 +750,10 @@ check_entry_size_and_hooks(struct ipt_entry *e, return -EINVAL; } + err = check_entry(e); + if (err) + return err; + /* Check hooks & underflows */ for (h = 0; h < NF_INET_NUMHOOKS; h++) { if (!(valid_hooks & (1 << h))) @@ -763,9 +762,9 @@ check_entry_size_and_hooks(struct ipt_entry *e, newinfo->hook_entry[h] = hook_entries[h]; if ((unsigned char *)e - base == underflows[h]) { if (!check_underflow(e)) { - pr_err("Underflows must be unconditional and " - "use the STANDARD target with " - "ACCEPT/DROP\n"); + pr_debug("Underflows must be unconditional and " + "use the STANDARD target with " + "ACCEPT/DROP\n"); return -EINVAL; } newinfo->underflow[h] = underflows[h]; @@ -1157,6 +1156,7 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr, *len, sizeof(get) + get.size); return -EINVAL; } + get.name[sizeof(get.name) - 1] = '\0'; t = xt_find_table_lock(net, AF_INET, get.name); if (!IS_ERR_OR_NULL(t)) { @@ -1493,7 +1493,8 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e, duprintf("check_compat_entry_size_and_hooks %p\n", e); if ((unsigned long)e % __alignof__(struct compat_ipt_entry) != 0 || - (unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit) { + (unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit || + (unsigned char *)e + e->next_offset > limit) { duprintf("Bad offset %p, limit = %p\n", e, limit); return -EINVAL; } @@ -1506,7 +1507,7 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e, } /* For purposes of check_entry casting the compat entry is fine */ - ret = check_entry((struct ipt_entry *)e, name); + ret = check_entry((struct ipt_entry *)e); if (ret) return ret; @@ -1935,6 +1936,7 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr, *len, sizeof(get) + get.size); return -EINVAL; } + get.name[sizeof(get.name) - 1] = '\0'; xt_compat_lock(AF_INET); t = xt_find_table_lock(net, AF_INET, get.name); @@ -2062,9 +2064,27 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) return ret; } -struct xt_table *ipt_register_table(struct net *net, - const struct xt_table *table, - const struct ipt_replace *repl) +static void __ipt_unregister_table(struct net *net, struct xt_table *table) +{ + struct xt_table_info *private; + void *loc_cpu_entry; + struct module *table_owner = table->me; + struct ipt_entry *iter; + + private = xt_unregister_table(table); + + /* Decrease module usage counts and free resources */ + loc_cpu_entry = private->entries; + xt_entry_foreach(iter, loc_cpu_entry, private->size) + cleanup_entry(iter, net); + if (private->number > private->initial_entries) + module_put(table_owner); + xt_free_table_info(private); +} + +int ipt_register_table(struct net *net, const struct xt_table *table, + const struct ipt_replace *repl, + const struct nf_hook_ops *ops, struct xt_table **res) { int ret; struct xt_table_info *newinfo; @@ -2073,10 +2093,8 @@ struct xt_table *ipt_register_table(struct net *net, struct xt_table *new_table; newinfo = xt_alloc_table_info(repl->size); - if (!newinfo) { - ret = -ENOMEM; - goto out; - } + if (!newinfo) + return -ENOMEM; loc_cpu_entry = newinfo->entries; memcpy(loc_cpu_entry, repl->entries, repl->size); @@ -2091,30 +2109,27 @@ struct xt_table *ipt_register_table(struct net *net, goto out_free; } - return new_table; + /* set res now, will see skbs right after nf_register_net_hooks */ + WRITE_ONCE(*res, new_table); + + ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks)); + if (ret != 0) { + __ipt_unregister_table(net, new_table); + *res = NULL; + } + + return ret; out_free: xt_free_table_info(newinfo); -out: - return ERR_PTR(ret); + return ret; } -void ipt_unregister_table(struct net *net, struct xt_table *table) +void ipt_unregister_table(struct net *net, struct xt_table *table, + const struct nf_hook_ops *ops) { - struct xt_table_info *private; - void *loc_cpu_entry; - struct module *table_owner = table->me; - struct ipt_entry *iter; - - private = xt_unregister_table(table); - - /* Decrease module usage counts and free resources */ - loc_cpu_entry = private->entries; - xt_entry_foreach(iter, loc_cpu_entry, private->size) - cleanup_entry(iter, net); - if (private->number > private->initial_entries) - module_put(table_owner); - xt_free_table_info(private); + nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); + __ipt_unregister_table(net, table); } /* Returns 1 if the type and code is matched by the range, 0 otherwise */ diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index 5fdc55651..db5b87509 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -18,7 +18,8 @@ #include <net/netfilter/nf_conntrack_synproxy.h> static struct iphdr * -synproxy_build_ip(struct sk_buff *skb, __be32 saddr, __be32 daddr) +synproxy_build_ip(struct net *net, struct sk_buff *skb, __be32 saddr, + __be32 daddr) { struct iphdr *iph; @@ -29,7 +30,7 @@ synproxy_build_ip(struct sk_buff *skb, __be32 saddr, __be32 daddr) iph->tos = 0; iph->id = 0; iph->frag_off = htons(IP_DF); - iph->ttl = sysctl_ip_default_ttl; + iph->ttl = net->ipv4.sysctl_ip_default_ttl; iph->protocol = IPPROTO_TCP; iph->check = 0; iph->saddr = saddr; @@ -39,14 +40,12 @@ synproxy_build_ip(struct sk_buff *skb, __be32 saddr, __be32 daddr) } static void -synproxy_send_tcp(const struct synproxy_net *snet, +synproxy_send_tcp(struct net *net, const struct sk_buff *skb, struct sk_buff *nskb, struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo, struct iphdr *niph, struct tcphdr *nth, unsigned int tcp_hdr_size) { - struct net *net = nf_ct_net(snet->tmpl); - nth->check = ~tcp_v4_check(tcp_hdr_size, niph->saddr, niph->daddr, 0); nskb->ip_summed = CHECKSUM_PARTIAL; nskb->csum_start = (unsigned char *)nth - nskb->head; @@ -71,7 +70,7 @@ free_nskb: } static void -synproxy_send_client_synack(const struct synproxy_net *snet, +synproxy_send_client_synack(struct net *net, const struct sk_buff *skb, const struct tcphdr *th, const struct synproxy_options *opts) { @@ -90,7 +89,7 @@ synproxy_send_client_synack(const struct synproxy_net *snet, return; skb_reserve(nskb, MAX_TCP_HEADER); - niph = synproxy_build_ip(nskb, iph->daddr, iph->saddr); + niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr); skb_reset_transport_header(nskb); nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); @@ -108,15 +107,16 @@ synproxy_send_client_synack(const struct synproxy_net *snet, synproxy_build_options(nth, opts); - synproxy_send_tcp(snet, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, + synproxy_send_tcp(net, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); } static void -synproxy_send_server_syn(const struct synproxy_net *snet, +synproxy_send_server_syn(struct net *net, const struct sk_buff *skb, const struct tcphdr *th, const struct synproxy_options *opts, u32 recv_seq) { + struct synproxy_net *snet = synproxy_pernet(net); struct sk_buff *nskb; struct iphdr *iph, *niph; struct tcphdr *nth; @@ -131,7 +131,7 @@ synproxy_send_server_syn(const struct synproxy_net *snet, return; skb_reserve(nskb, MAX_TCP_HEADER); - niph = synproxy_build_ip(nskb, iph->saddr, iph->daddr); + niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr); skb_reset_transport_header(nskb); nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); @@ -152,12 +152,12 @@ synproxy_send_server_syn(const struct synproxy_net *snet, synproxy_build_options(nth, opts); - synproxy_send_tcp(snet, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, + synproxy_send_tcp(net, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, niph, nth, tcp_hdr_size); } static void -synproxy_send_server_ack(const struct synproxy_net *snet, +synproxy_send_server_ack(struct net *net, const struct ip_ct_tcp *state, const struct sk_buff *skb, const struct tcphdr *th, const struct synproxy_options *opts) @@ -176,7 +176,7 @@ synproxy_send_server_ack(const struct synproxy_net *snet, return; skb_reserve(nskb, MAX_TCP_HEADER); - niph = synproxy_build_ip(nskb, iph->daddr, iph->saddr); + niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr); skb_reset_transport_header(nskb); nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); @@ -192,11 +192,11 @@ synproxy_send_server_ack(const struct synproxy_net *snet, synproxy_build_options(nth, opts); - synproxy_send_tcp(snet, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); + synproxy_send_tcp(net, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); } static void -synproxy_send_client_ack(const struct synproxy_net *snet, +synproxy_send_client_ack(struct net *net, const struct sk_buff *skb, const struct tcphdr *th, const struct synproxy_options *opts) { @@ -214,7 +214,7 @@ synproxy_send_client_ack(const struct synproxy_net *snet, return; skb_reserve(nskb, MAX_TCP_HEADER); - niph = synproxy_build_ip(nskb, iph->saddr, iph->daddr); + niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr); skb_reset_transport_header(nskb); nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); @@ -230,15 +230,16 @@ synproxy_send_client_ack(const struct synproxy_net *snet, synproxy_build_options(nth, opts); - synproxy_send_tcp(snet, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, + synproxy_send_tcp(net, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); } static bool -synproxy_recv_client_ack(const struct synproxy_net *snet, +synproxy_recv_client_ack(struct net *net, const struct sk_buff *skb, const struct tcphdr *th, struct synproxy_options *opts, u32 recv_seq) { + struct synproxy_net *snet = synproxy_pernet(net); int mss; mss = __cookie_v4_check(ip_hdr(skb), th, ntohl(th->ack_seq) - 1); @@ -254,7 +255,7 @@ synproxy_recv_client_ack(const struct synproxy_net *snet, if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP) synproxy_check_timestamp_cookie(opts); - synproxy_send_server_syn(snet, skb, th, opts, recv_seq); + synproxy_send_server_syn(net, skb, th, opts, recv_seq); return true; } @@ -262,7 +263,8 @@ static unsigned int synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_synproxy_info *info = par->targinfo; - struct synproxy_net *snet = synproxy_pernet(par->net); + struct net *net = par->net; + struct synproxy_net *snet = synproxy_pernet(net); struct synproxy_options opts = {}; struct tcphdr *th, _th; @@ -291,12 +293,12 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) XT_SYNPROXY_OPT_SACK_PERM | XT_SYNPROXY_OPT_ECN); - synproxy_send_client_synack(snet, skb, th, &opts); + synproxy_send_client_synack(net, skb, th, &opts); return NF_DROP; } else if (th->ack && !(th->fin || th->rst || th->syn)) { /* ACK from client */ - synproxy_recv_client_ack(snet, skb, th, &opts, ntohl(th->seq)); + synproxy_recv_client_ack(net, skb, th, &opts, ntohl(th->seq)); return NF_DROP; } @@ -307,7 +309,8 @@ static unsigned int ipv4_synproxy_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *nhs) { - struct synproxy_net *snet = synproxy_pernet(nhs->net); + struct net *net = nhs->net; + struct synproxy_net *snet = synproxy_pernet(net); enum ip_conntrack_info ctinfo; struct nf_conn *ct; struct nf_conn_synproxy *synproxy; @@ -364,7 +367,7 @@ static unsigned int ipv4_synproxy_hook(void *priv, * therefore we need to add 1 to make the SYN sequence * number match the one of first SYN. */ - if (synproxy_recv_client_ack(snet, skb, th, &opts, + if (synproxy_recv_client_ack(net, skb, th, &opts, ntohl(th->seq) + 1)) this_cpu_inc(snet->stats->cookie_retrans); @@ -390,12 +393,12 @@ static unsigned int ipv4_synproxy_hook(void *priv, XT_SYNPROXY_OPT_SACK_PERM); swap(opts.tsval, opts.tsecr); - synproxy_send_server_ack(snet, state, skb, th, &opts); + synproxy_send_server_ack(net, state, skb, th, &opts); nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq)); swap(opts.tsval, opts.tsecr); - synproxy_send_client_ack(snet, skb, th, &opts); + synproxy_send_client_ack(net, skb, th, &opts); consume_skb(skb); return NF_STOLEN; diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index 397ef2dd1..7667f223d 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -23,6 +23,7 @@ MODULE_DESCRIPTION("iptables filter table"); #define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \ (1 << NF_INET_FORWARD) | \ (1 << NF_INET_LOCAL_OUT)) +static int __net_init iptable_filter_table_init(struct net *net); static const struct xt_table packet_filter = { .name = "filter", @@ -30,6 +31,7 @@ static const struct xt_table packet_filter = { .me = THIS_MODULE, .af = NFPROTO_IPV4, .priority = NF_IP_PRI_FILTER, + .table_init = iptable_filter_table_init, }; static unsigned int @@ -48,12 +50,16 @@ iptable_filter_hook(void *priv, struct sk_buff *skb, static struct nf_hook_ops *filter_ops __read_mostly; /* Default to forward because I got too much mail already. */ -static bool forward = true; +static bool forward __read_mostly = true; module_param(forward, bool, 0000); -static int __net_init iptable_filter_net_init(struct net *net) +static int __net_init iptable_filter_table_init(struct net *net) { struct ipt_replace *repl; + int err; + + if (net->ipv4.iptable_filter) + return 0; repl = ipt_alloc_initial_table(&packet_filter); if (repl == NULL) @@ -62,15 +68,26 @@ static int __net_init iptable_filter_net_init(struct net *net) ((struct ipt_standard *)repl->entries)[1].target.verdict = forward ? -NF_ACCEPT - 1 : -NF_DROP - 1; - net->ipv4.iptable_filter = - ipt_register_table(net, &packet_filter, repl); + err = ipt_register_table(net, &packet_filter, repl, filter_ops, + &net->ipv4.iptable_filter); kfree(repl); - return PTR_ERR_OR_ZERO(net->ipv4.iptable_filter); + return err; +} + +static int __net_init iptable_filter_net_init(struct net *net) +{ + if (net == &init_net || !forward) + return iptable_filter_table_init(net); + + return 0; } static void __net_exit iptable_filter_net_exit(struct net *net) { - ipt_unregister_table(net, net->ipv4.iptable_filter); + if (!net->ipv4.iptable_filter) + return; + ipt_unregister_table(net, net->ipv4.iptable_filter, filter_ops); + net->ipv4.iptable_filter = NULL; } static struct pernet_operations iptable_filter_net_ops = { @@ -82,24 +99,21 @@ static int __init iptable_filter_init(void) { int ret; + filter_ops = xt_hook_ops_alloc(&packet_filter, iptable_filter_hook); + if (IS_ERR(filter_ops)) + return PTR_ERR(filter_ops); + ret = register_pernet_subsys(&iptable_filter_net_ops); if (ret < 0) - return ret; - - /* Register hooks */ - filter_ops = xt_hook_link(&packet_filter, iptable_filter_hook); - if (IS_ERR(filter_ops)) { - ret = PTR_ERR(filter_ops); - unregister_pernet_subsys(&iptable_filter_net_ops); - } + kfree(filter_ops); return ret; } static void __exit iptable_filter_fini(void) { - xt_hook_unlink(&packet_filter, filter_ops); unregister_pernet_subsys(&iptable_filter_net_ops); + kfree(filter_ops); } module_init(iptable_filter_init); diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index ba5d392a1..57fc97cda 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -28,12 +28,15 @@ MODULE_DESCRIPTION("iptables mangle table"); (1 << NF_INET_LOCAL_OUT) | \ (1 << NF_INET_POST_ROUTING)) +static int __net_init iptable_mangle_table_init(struct net *net); + static const struct xt_table packet_mangler = { .name = "mangle", .valid_hooks = MANGLE_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV4, .priority = NF_IP_PRI_MANGLE, + .table_init = iptable_mangle_table_init, }; static unsigned int @@ -92,27 +95,32 @@ iptable_mangle_hook(void *priv, } static struct nf_hook_ops *mangle_ops __read_mostly; - -static int __net_init iptable_mangle_net_init(struct net *net) +static int __net_init iptable_mangle_table_init(struct net *net) { struct ipt_replace *repl; + int ret; + + if (net->ipv4.iptable_mangle) + return 0; repl = ipt_alloc_initial_table(&packet_mangler); if (repl == NULL) return -ENOMEM; - net->ipv4.iptable_mangle = - ipt_register_table(net, &packet_mangler, repl); + ret = ipt_register_table(net, &packet_mangler, repl, mangle_ops, + &net->ipv4.iptable_mangle); kfree(repl); - return PTR_ERR_OR_ZERO(net->ipv4.iptable_mangle); + return ret; } static void __net_exit iptable_mangle_net_exit(struct net *net) { - ipt_unregister_table(net, net->ipv4.iptable_mangle); + if (!net->ipv4.iptable_mangle) + return; + ipt_unregister_table(net, net->ipv4.iptable_mangle, mangle_ops); + net->ipv4.iptable_mangle = NULL; } static struct pernet_operations iptable_mangle_net_ops = { - .init = iptable_mangle_net_init, .exit = iptable_mangle_net_exit, }; @@ -120,15 +128,22 @@ static int __init iptable_mangle_init(void) { int ret; + mangle_ops = xt_hook_ops_alloc(&packet_mangler, iptable_mangle_hook); + if (IS_ERR(mangle_ops)) { + ret = PTR_ERR(mangle_ops); + return ret; + } + ret = register_pernet_subsys(&iptable_mangle_net_ops); - if (ret < 0) + if (ret < 0) { + kfree(mangle_ops); return ret; + } - /* Register hooks */ - mangle_ops = xt_hook_link(&packet_mangler, iptable_mangle_hook); - if (IS_ERR(mangle_ops)) { - ret = PTR_ERR(mangle_ops); + ret = iptable_mangle_table_init(&init_net); + if (ret) { unregister_pernet_subsys(&iptable_mangle_net_ops); + kfree(mangle_ops); } return ret; @@ -136,8 +151,8 @@ static int __init iptable_mangle_init(void) static void __exit iptable_mangle_fini(void) { - xt_hook_unlink(&packet_mangler, mangle_ops); unregister_pernet_subsys(&iptable_mangle_net_ops); + kfree(mangle_ops); } module_init(iptable_mangle_init); diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index ae2cd2752..138a24bc7 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -18,6 +18,8 @@ #include <net/netfilter/nf_nat_core.h> #include <net/netfilter/nf_nat_l3proto.h> +static int __net_init iptable_nat_table_init(struct net *net); + static const struct xt_table nf_nat_ipv4_table = { .name = "nat", .valid_hooks = (1 << NF_INET_PRE_ROUTING) | @@ -26,6 +28,7 @@ static const struct xt_table nf_nat_ipv4_table = { (1 << NF_INET_LOCAL_IN), .me = THIS_MODULE, .af = NFPROTO_IPV4, + .table_init = iptable_nat_table_init, }; static unsigned int iptable_nat_do_chain(void *priv, @@ -95,50 +98,50 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = { }, }; -static int __net_init iptable_nat_net_init(struct net *net) +static int __net_init iptable_nat_table_init(struct net *net) { struct ipt_replace *repl; + int ret; + + if (net->ipv4.nat_table) + return 0; repl = ipt_alloc_initial_table(&nf_nat_ipv4_table); if (repl == NULL) return -ENOMEM; - net->ipv4.nat_table = ipt_register_table(net, &nf_nat_ipv4_table, repl); + ret = ipt_register_table(net, &nf_nat_ipv4_table, repl, + nf_nat_ipv4_ops, &net->ipv4.nat_table); kfree(repl); - return PTR_ERR_OR_ZERO(net->ipv4.nat_table); + return ret; } static void __net_exit iptable_nat_net_exit(struct net *net) { - ipt_unregister_table(net, net->ipv4.nat_table); + if (!net->ipv4.nat_table) + return; + ipt_unregister_table(net, net->ipv4.nat_table, nf_nat_ipv4_ops); + net->ipv4.nat_table = NULL; } static struct pernet_operations iptable_nat_net_ops = { - .init = iptable_nat_net_init, .exit = iptable_nat_net_exit, }; static int __init iptable_nat_init(void) { - int err; + int ret = register_pernet_subsys(&iptable_nat_net_ops); - err = register_pernet_subsys(&iptable_nat_net_ops); - if (err < 0) - goto err1; + if (ret) + return ret; - err = nf_register_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops)); - if (err < 0) - goto err2; - return 0; - -err2: - unregister_pernet_subsys(&iptable_nat_net_ops); -err1: - return err; + ret = iptable_nat_table_init(&init_net); + if (ret) + unregister_pernet_subsys(&iptable_nat_net_ops); + return ret; } static void __exit iptable_nat_exit(void) { - nf_unregister_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops)); unregister_pernet_subsys(&iptable_nat_net_ops); } diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index 1ba02811a..2642ecd26 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -10,12 +10,15 @@ #define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT)) +static int __net_init iptable_raw_table_init(struct net *net); + static const struct xt_table packet_raw = { .name = "raw", .valid_hooks = RAW_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV4, .priority = NF_IP_PRI_RAW, + .table_init = iptable_raw_table_init, }; /* The work comes in here from netfilter.c. */ @@ -34,26 +37,32 @@ iptable_raw_hook(void *priv, struct sk_buff *skb, static struct nf_hook_ops *rawtable_ops __read_mostly; -static int __net_init iptable_raw_net_init(struct net *net) +static int __net_init iptable_raw_table_init(struct net *net) { struct ipt_replace *repl; + int ret; + + if (net->ipv4.iptable_raw) + return 0; repl = ipt_alloc_initial_table(&packet_raw); if (repl == NULL) return -ENOMEM; - net->ipv4.iptable_raw = - ipt_register_table(net, &packet_raw, repl); + ret = ipt_register_table(net, &packet_raw, repl, rawtable_ops, + &net->ipv4.iptable_raw); kfree(repl); - return PTR_ERR_OR_ZERO(net->ipv4.iptable_raw); + return ret; } static void __net_exit iptable_raw_net_exit(struct net *net) { - ipt_unregister_table(net, net->ipv4.iptable_raw); + if (!net->ipv4.iptable_raw) + return; + ipt_unregister_table(net, net->ipv4.iptable_raw, rawtable_ops); + net->ipv4.iptable_raw = NULL; } static struct pernet_operations iptable_raw_net_ops = { - .init = iptable_raw_net_init, .exit = iptable_raw_net_exit, }; @@ -61,15 +70,20 @@ static int __init iptable_raw_init(void) { int ret; + rawtable_ops = xt_hook_ops_alloc(&packet_raw, iptable_raw_hook); + if (IS_ERR(rawtable_ops)) + return PTR_ERR(rawtable_ops); + ret = register_pernet_subsys(&iptable_raw_net_ops); - if (ret < 0) + if (ret < 0) { + kfree(rawtable_ops); return ret; + } - /* Register hooks */ - rawtable_ops = xt_hook_link(&packet_raw, iptable_raw_hook); - if (IS_ERR(rawtable_ops)) { - ret = PTR_ERR(rawtable_ops); + ret = iptable_raw_table_init(&init_net); + if (ret) { unregister_pernet_subsys(&iptable_raw_net_ops); + kfree(rawtable_ops); } return ret; @@ -77,8 +91,8 @@ static int __init iptable_raw_init(void) static void __exit iptable_raw_fini(void) { - xt_hook_unlink(&packet_raw, rawtable_ops); unregister_pernet_subsys(&iptable_raw_net_ops); + kfree(rawtable_ops); } module_init(iptable_raw_init); diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c index c2e23d5e9..ff226596e 100644 --- a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c @@ -28,12 +28,15 @@ MODULE_DESCRIPTION("iptables security table, for MAC rules"); (1 << NF_INET_FORWARD) | \ (1 << NF_INET_LOCAL_OUT) +static int __net_init iptable_security_table_init(struct net *net); + static const struct xt_table security_table = { .name = "security", .valid_hooks = SECURITY_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV4, .priority = NF_IP_PRI_SECURITY, + .table_init = iptable_security_table_init, }; static unsigned int @@ -51,26 +54,33 @@ iptable_security_hook(void *priv, struct sk_buff *skb, static struct nf_hook_ops *sectbl_ops __read_mostly; -static int __net_init iptable_security_net_init(struct net *net) +static int __net_init iptable_security_table_init(struct net *net) { struct ipt_replace *repl; + int ret; + + if (net->ipv4.iptable_security) + return 0; repl = ipt_alloc_initial_table(&security_table); if (repl == NULL) return -ENOMEM; - net->ipv4.iptable_security = - ipt_register_table(net, &security_table, repl); + ret = ipt_register_table(net, &security_table, repl, sectbl_ops, + &net->ipv4.iptable_security); kfree(repl); - return PTR_ERR_OR_ZERO(net->ipv4.iptable_security); + return ret; } static void __net_exit iptable_security_net_exit(struct net *net) { - ipt_unregister_table(net, net->ipv4.iptable_security); + if (!net->ipv4.iptable_security) + return; + + ipt_unregister_table(net, net->ipv4.iptable_security, sectbl_ops); + net->ipv4.iptable_security = NULL; } static struct pernet_operations iptable_security_net_ops = { - .init = iptable_security_net_init, .exit = iptable_security_net_exit, }; @@ -78,27 +88,29 @@ static int __init iptable_security_init(void) { int ret; + sectbl_ops = xt_hook_ops_alloc(&security_table, iptable_security_hook); + if (IS_ERR(sectbl_ops)) + return PTR_ERR(sectbl_ops); + ret = register_pernet_subsys(&iptable_security_net_ops); - if (ret < 0) + if (ret < 0) { + kfree(sectbl_ops); return ret; - - sectbl_ops = xt_hook_link(&security_table, iptable_security_hook); - if (IS_ERR(sectbl_ops)) { - ret = PTR_ERR(sectbl_ops); - goto cleanup_table; } - return ret; + ret = iptable_security_table_init(&init_net); + if (ret) { + unregister_pernet_subsys(&iptable_security_net_ops); + kfree(sectbl_ops); + } -cleanup_table: - unregister_pernet_subsys(&iptable_security_net_ops); return ret; } static void __exit iptable_security_fini(void) { - xt_hook_unlink(&security_table, sectbl_ops); unregister_pernet_subsys(&iptable_security_net_ops); + kfree(sectbl_ops); } module_init(iptable_security_init); diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index a04dee536..d88da36b3 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -31,10 +31,8 @@ static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb, err = ip_defrag(net, skb, user); local_bh_enable(); - if (!err) { - ip_send_check(ip_hdr(skb)); + if (!err) skb->ignore_df = 1; - } return err; } diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index 61c7cc22e..f8aad03d6 100644 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -127,29 +127,15 @@ static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb, u8 proto, void *data, __sum16 *check, int datalen, int oldlen) { - const struct iphdr *iph = ip_hdr(skb); - struct rtable *rt = skb_rtable(skb); - if (skb->ip_summed != CHECKSUM_PARTIAL) { - if (!(rt->rt_flags & RTCF_LOCAL) && - (!skb->dev || skb->dev->features & - (NETIF_F_IP_CSUM | NETIF_F_HW_CSUM))) { - skb->ip_summed = CHECKSUM_PARTIAL; - skb->csum_start = skb_headroom(skb) + - skb_network_offset(skb) + - ip_hdrlen(skb); - skb->csum_offset = (void *)check - data; - *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, - datalen, proto, 0); - } else { - *check = 0; - *check = csum_tcpudp_magic(iph->saddr, iph->daddr, - datalen, proto, - csum_partial(data, datalen, - 0)); - if (proto == IPPROTO_UDP && !*check) - *check = CSUM_MANGLED_0; - } + const struct iphdr *iph = ip_hdr(skb); + + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_headroom(skb) + skb_network_offset(skb) + + ip_hdrlen(skb); + skb->csum_offset = (void *)check - data; + *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, datalen, + proto, 0); } else inet_proto_csum_replace2(check, skb, htons(oldlen), htons(datalen), true); diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c index b72ffc58e..51ced81b6 100644 --- a/net/ipv4/netfilter/nft_masq_ipv4.c +++ b/net/ipv4/netfilter/nft_masq_ipv4.c @@ -25,7 +25,12 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr, memset(&range, 0, sizeof(range)); range.flags = priv->flags; - + if (priv->sreg_proto_min) { + range.min_proto.all = + *(__be16 *)®s->data[priv->sreg_proto_min]; + range.max_proto.all = + *(__be16 *)®s->data[priv->sreg_proto_max]; + } regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, pkt->hook, &range, pkt->out); } diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index d3a27165f..cf9700b1a 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -145,10 +145,12 @@ fail: } EXPORT_SYMBOL_GPL(ping_get_port); -void ping_hash(struct sock *sk) +int ping_hash(struct sock *sk) { pr_debug("ping_hash(sk->port=%u)\n", inet_sk(sk)->inet_num); BUG(); /* "Please do not press this button again." */ + + return 0; } void ping_unhash(struct sock *sk) @@ -1140,13 +1142,6 @@ static int ping_v4_seq_show(struct seq_file *seq, void *v) return 0; } -static const struct seq_operations ping_v4_seq_ops = { - .show = ping_v4_seq_show, - .start = ping_v4_seq_start, - .next = ping_seq_next, - .stop = ping_seq_stop, -}; - static int ping_seq_open(struct inode *inode, struct file *file) { struct ping_seq_afinfo *afinfo = PDE_DATA(inode); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 3abd9d7a3..9f665b63a 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -390,7 +390,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "\nIp: %d %d", IPV4_DEVCONF_ALL(net, FORWARDING) ? 1 : 2, - sysctl_ip_default_ttl); + net->ipv4.sysctl_ip_default_ttl); BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0); for (i = 0; snmp4_ipstats_list[i].name != NULL; i++) diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 7113bae4e..8d22de740 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -93,7 +93,7 @@ static struct raw_hashinfo raw_v4_hashinfo = { .lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock), }; -void raw_hash_sk(struct sock *sk) +int raw_hash_sk(struct sock *sk) { struct raw_hashinfo *h = sk->sk_prot->h.raw_hash; struct hlist_head *head; @@ -104,6 +104,8 @@ void raw_hash_sk(struct sock *sk) sk_add_node(sk, head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); write_unlock_bh(&h->lock); + + return 0; } EXPORT_SYMBOL_GPL(raw_hash_sk); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 02c62299d..60398a937 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1438,9 +1438,9 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, #endif } -static struct rtable *rt_dst_alloc(struct net_device *dev, - unsigned int flags, u16 type, - bool nopolicy, bool noxfrm, bool will_cache) +struct rtable *rt_dst_alloc(struct net_device *dev, + unsigned int flags, u16 type, + bool nopolicy, bool noxfrm, bool will_cache) { struct rtable *rt; @@ -1468,6 +1468,7 @@ static struct rtable *rt_dst_alloc(struct net_device *dev, return rt; } +EXPORT_SYMBOL(rt_dst_alloc); /* called in rcu_read_lock() section */ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, @@ -2045,6 +2046,18 @@ static struct rtable *__mkroute_output(const struct fib_result *res, */ if (fi && res->prefixlen < 4) fi = NULL; + } else if ((type == RTN_LOCAL) && (orig_oif != 0) && + (orig_oif != dev_out->ifindex)) { + /* For local routes that require a particular output interface + * we do not want to cache the result. Caching the result + * causes incorrect behaviour when there are multiple source + * addresses on the interface, the end result being that if the + * intended recipient is waiting on that interface for the + * packet he won't receive it because it will be delivered on + * the loopback interface and the IP_PKTINFO ipi_ifindex will + * be set to the loopback interface as well. + */ + fi = NULL; } fnhe = NULL; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 643a86c49..4c04f0933 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -19,8 +19,6 @@ #include <net/tcp.h> #include <net/route.h> -extern int sysctl_tcp_syncookies; - static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly; #define COOKIEBITS 24 /* Upper bits store count */ @@ -50,8 +48,7 @@ static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly; #define TSBITS 6 #define TSMASK (((__u32)1 << TSBITS) - 1) -static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], - ipv4_cookie_scratch); +static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], ipv4_cookie_scratch); static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, u32 count, int c) @@ -307,7 +304,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) __u8 rcv_wscale; struct flowi4 fl4; - if (!sysctl_tcp_syncookies || !th->ack || th->rst) + if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies || !th->ack || th->rst) goto out; if (tcp_synq_no_recent_overflow(sk)) diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 4d367b413..1e1fe6086 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -283,31 +283,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .procname = "ip_default_ttl", - .data = &sysctl_ip_default_ttl, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &ip_ttl_min, - .extra2 = &ip_ttl_max, - }, - { - .procname = "tcp_syn_retries", - .data = &sysctl_tcp_syn_retries, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &tcp_syn_retries_min, - .extra2 = &tcp_syn_retries_max - }, - { - .procname = "tcp_synack_retries", - .data = &sysctl_tcp_synack_retries, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "tcp_max_orphans", .data = &sysctl_tcp_max_orphans, .maxlen = sizeof(int), @@ -322,51 +297,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .procname = "ip_early_demux", - .data = &sysctl_ip_early_demux, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "ip_dynaddr", - .data = &sysctl_ip_dynaddr, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "tcp_retries1", - .data = &sysctl_tcp_retries1, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra2 = &tcp_retr1_max - }, - { - .procname = "tcp_retries2", - .data = &sysctl_tcp_retries2, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "tcp_fin_timeout", - .data = &sysctl_tcp_fin_timeout, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, -#ifdef CONFIG_SYN_COOKIES - { - .procname = "tcp_syncookies", - .data = &sysctl_tcp_syncookies, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, -#endif - { .procname = "tcp_fastopen", .data = &sysctl_tcp_fastopen, .maxlen = sizeof(int), @@ -415,30 +345,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .procname = "igmp_max_memberships", - .data = &sysctl_igmp_max_memberships, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "igmp_max_msf", - .data = &sysctl_igmp_max_msf, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, -#ifdef CONFIG_IP_MULTICAST - { - .procname = "igmp_qrv", - .data = &sysctl_igmp_qrv, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &one - }, -#endif - { .procname = "inet_peer_threshold", .data = &inet_peer_threshold, .maxlen = sizeof(int), @@ -460,13 +366,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec_jiffies, }, { - .procname = "tcp_orphan_retries", - .data = &sysctl_tcp_orphan_retries, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "tcp_fack", .data = &sysctl_tcp_fack, .maxlen = sizeof(int), @@ -481,13 +380,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec, }, { - .procname = "tcp_reordering", - .data = &sysctl_tcp_reordering, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { .procname = "tcp_max_reordering", .data = &sysctl_tcp_max_reordering, .maxlen = sizeof(int), @@ -517,13 +409,6 @@ static struct ctl_table ipv4_table[] = { .extra1 = &one, }, { - .procname = "tcp_notsent_lowat", - .data = &sysctl_tcp_notsent_lowat, - .maxlen = sizeof(sysctl_tcp_notsent_lowat), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { .procname = "tcp_rmem", .data = &sysctl_tcp_rmem, .maxlen = sizeof(sysctl_tcp_rmem), @@ -845,6 +730,29 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec }, { + .procname = "ip_dynaddr", + .data = &init_net.ipv4.sysctl_ip_dynaddr, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "ip_early_demux", + .data = &init_net.ipv4.sysctl_ip_early_demux, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "ip_default_ttl", + .data = &init_net.ipv4.sysctl_ip_default_ttl, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &ip_ttl_min, + .extra2 = &ip_ttl_max, + }, + { .procname = "ip_local_port_range", .maxlen = sizeof(init_net.ipv4.ip_local_ports.range), .data = &init_net.ipv4.ip_local_ports.range, @@ -934,12 +842,36 @@ static struct ctl_table ipv4_net_table[] = { }, { .procname = "igmp_link_local_mcast_reports", - .data = &sysctl_igmp_llm_reports, + .data = &init_net.ipv4.sysctl_igmp_llm_reports, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "igmp_max_memberships", + .data = &init_net.ipv4.sysctl_igmp_max_memberships, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { + .procname = "igmp_max_msf", + .data = &init_net.ipv4.sysctl_igmp_max_msf, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, +#ifdef CONFIG_IP_MULTICAST + { + .procname = "igmp_qrv", + .data = &init_net.ipv4.sysctl_igmp_qrv, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one + }, +#endif + { .procname = "tcp_keepalive_time", .data = &init_net.ipv4.sysctl_tcp_keepalive_time, .maxlen = sizeof(int), @@ -960,6 +892,74 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, + { + .procname = "tcp_syn_retries", + .data = &init_net.ipv4.sysctl_tcp_syn_retries, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &tcp_syn_retries_min, + .extra2 = &tcp_syn_retries_max + }, + { + .procname = "tcp_synack_retries", + .data = &init_net.ipv4.sysctl_tcp_synack_retries, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, +#ifdef CONFIG_SYN_COOKIES + { + .procname = "tcp_syncookies", + .data = &init_net.ipv4.sysctl_tcp_syncookies, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, +#endif + { + .procname = "tcp_reordering", + .data = &init_net.ipv4.sysctl_tcp_reordering, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_retries1", + .data = &init_net.ipv4.sysctl_tcp_retries1, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra2 = &tcp_retr1_max + }, + { + .procname = "tcp_retries2", + .data = &init_net.ipv4.sysctl_tcp_retries2, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_orphan_retries", + .data = &init_net.ipv4.sysctl_tcp_orphan_retries, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_fin_timeout", + .data = &init_net.ipv4.sysctl_tcp_fin_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "tcp_notsent_lowat", + .data = &init_net.ipv4.sysctl_tcp_notsent_lowat, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { } }; @@ -988,6 +988,10 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) if (!net->ipv4.sysctl_local_reserved_ports) goto err_ports; + net->ipv4.sysctl_ip_default_ttl = IPDEFTTL; + net->ipv4.sysctl_ip_dynaddr = 0; + net->ipv4.sysctl_ip_early_demux = 1; + return 0; err_ports: diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0da50fe0a..32c9d8c25 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -247,6 +247,7 @@ #define pr_fmt(fmt) "TCP: " fmt +#include <crypto/hash.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/types.h> @@ -266,7 +267,6 @@ #include <linux/swap.h> #include <linux/cache.h> #include <linux/err.h> -#include <linux/crypto.h> #include <linux/time.h> #include <linux/slab.h> #include <linux/vmalloc.h> @@ -283,8 +283,6 @@ #include <asm/unaligned.h> #include <net/busy_poll.h> -int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; - int sysctl_tcp_min_tso_segs __read_mostly = 2; int sysctl_tcp_autocorking __read_mostly = 1; @@ -407,7 +405,7 @@ void tcp_init_sock(struct sock *sk) tp->mss_cache = TCP_MSS_DEFAULT; u64_stats_init(&tp->syncp); - tp->reordering = sysctl_tcp_reordering; + tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering; tcp_enable_early_retrans(tp); tcp_assign_congestion_control(sk); @@ -559,20 +557,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) return -EINVAL; slow = lock_sock_fast(sk); - if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) - answ = 0; - else if (sock_flag(sk, SOCK_URGINLINE) || - !tp->urg_data || - before(tp->urg_seq, tp->copied_seq) || - !before(tp->urg_seq, tp->rcv_nxt)) { - - answ = tp->rcv_nxt - tp->copied_seq; - - /* Subtract 1, if FIN was received */ - if (answ && sock_flag(sk, SOCK_DONE)) - answ--; - } else - answ = tp->urg_seq - tp->copied_seq; + answ = tcp_inq(sk); unlock_sock_fast(sk, slow); break; case SIOCATMARK: @@ -1467,8 +1452,10 @@ static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { offset = seq - TCP_SKB_CB(skb)->seq; - if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) + if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { + pr_err_once("%s: found a SYN, please report !\n", __func__); offset--; + } if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) { *off = offset; return skb; @@ -1658,8 +1645,10 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, break; offset = *seq - TCP_SKB_CB(skb)->seq; - if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) + if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { + pr_err_once("%s: found a SYN, please report !\n", __func__); offset--; + } if (offset < skb->len) goto found_ok_skb; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) @@ -2364,6 +2353,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); + struct net *net = sock_net(sk); int val; int err = 0; @@ -2620,7 +2610,7 @@ stealth_integrity_out_1: case TCP_LINGER2: if (val < 0) tp->linger2 = -1; - else if (val > sysctl_tcp_fin_timeout / HZ) + else if (val > net->ipv4.sysctl_tcp_fin_timeout / HZ) tp->linger2 = 0; else tp->linger2 = val * HZ; @@ -2749,6 +2739,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) const struct inet_connection_sock *icsk = inet_csk(sk); u32 now = tcp_time_stamp; unsigned int start; + int notsent_bytes; u64 rate64; u32 rate; @@ -2829,6 +2820,13 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) } while (u64_stats_fetch_retry_irq(&tp->syncp, start)); info->tcpi_segs_out = tp->segs_out; info->tcpi_segs_in = tp->segs_in; + + notsent_bytes = READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt); + info->tcpi_notsent_bytes = max(0, notsent_bytes); + + info->tcpi_min_rtt = tcp_min_rtt(tp); + info->tcpi_data_segs_in = tp->data_segs_in; + info->tcpi_data_segs_out = tp->data_segs_out; } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -2837,6 +2835,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); int val, len; if (get_user(len, optlen)) @@ -2871,12 +2870,12 @@ static int do_tcp_getsockopt(struct sock *sk, int level, val = keepalive_probes(tp); break; case TCP_SYNCNT: - val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; + val = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries; break; case TCP_LINGER2: val = tp->linger2; if (val >= 0) - val = (val ? : sysctl_tcp_fin_timeout) / HZ; + val = (val ? : net->ipv4.sysctl_tcp_fin_timeout) / HZ; break; case TCP_DEFER_ACCEPT: val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept, @@ -3053,17 +3052,26 @@ static bool tcp_md5sig_pool_populated = false; static void __tcp_alloc_md5sig_pool(void) { + struct crypto_ahash *hash; int cpu; + hash = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(hash)) + return; + for_each_possible_cpu(cpu) { - if (!per_cpu(tcp_md5sig_pool, cpu).md5_desc.tfm) { - struct crypto_hash *hash; + struct ahash_request *req; - hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(hash)) - return; - per_cpu(tcp_md5sig_pool, cpu).md5_desc.tfm = hash; - } + if (per_cpu(tcp_md5sig_pool, cpu).md5_req) + continue; + + req = ahash_request_alloc(hash, GFP_KERNEL); + if (!req) + return; + + ahash_request_set_callback(req, 0, NULL, NULL); + + per_cpu(tcp_md5sig_pool, cpu).md5_req = req; } /* before setting tcp_md5sig_pool_populated, we must commit all writes * to memory. See smp_rmb() in tcp_get_md5sig_pool() @@ -3113,7 +3121,6 @@ int tcp_md5_hash_header(struct tcp_md5sig_pool *hp, { struct scatterlist sg; struct tcphdr hdr; - int err; /* We are not allowed to change tcphdr, make a local copy */ memcpy(&hdr, th, sizeof(hdr)); @@ -3121,8 +3128,8 @@ int tcp_md5_hash_header(struct tcp_md5sig_pool *hp, /* options aren't included in the hash */ sg_init_one(&sg, &hdr, sizeof(hdr)); - err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(hdr)); - return err; + ahash_request_set_crypt(hp->md5_req, &sg, NULL, sizeof(hdr)); + return crypto_ahash_update(hp->md5_req); } EXPORT_SYMBOL(tcp_md5_hash_header); @@ -3131,7 +3138,7 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, { struct scatterlist sg; const struct tcphdr *tp = tcp_hdr(skb); - struct hash_desc *desc = &hp->md5_desc; + struct ahash_request *req = hp->md5_req; unsigned int i; const unsigned int head_data_len = skb_headlen(skb) > header_len ? skb_headlen(skb) - header_len : 0; @@ -3141,7 +3148,8 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, sg_init_table(&sg, 1); sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len); - if (crypto_hash_update(desc, &sg, head_data_len)) + ahash_request_set_crypt(req, &sg, NULL, head_data_len); + if (crypto_ahash_update(req)) return 1; for (i = 0; i < shi->nr_frags; ++i) { @@ -3151,7 +3159,8 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, sg_set_page(&sg, page, skb_frag_size(f), offset_in_page(offset)); - if (crypto_hash_update(desc, &sg, skb_frag_size(f))) + ahash_request_set_crypt(req, &sg, NULL, skb_frag_size(f)); + if (crypto_ahash_update(req)) return 1; } @@ -3168,7 +3177,8 @@ int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *ke struct scatterlist sg; sg_init_one(&sg, key->key, key->keylen); - return crypto_hash_update(&hp->md5_desc, &sg, key->keylen); + ahash_request_set_crypt(hp->md5_req, &sg, NULL, key->keylen); + return crypto_ahash_update(hp->md5_req); } EXPORT_SYMBOL(tcp_md5_hash_key); diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 55be6ac70..cffd8f9ed 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -1,3 +1,4 @@ +#include <linux/crypto.h> #include <linux/err.h> #include <linux/init.h> #include <linux/kernel.h> @@ -124,6 +125,49 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req, return false; } + +/* If an incoming SYN or SYNACK frame contains a payload and/or FIN, + * queue this additional data / FIN. + */ +void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (TCP_SKB_CB(skb)->end_seq == tp->rcv_nxt) + return; + + skb = skb_clone(skb, GFP_ATOMIC); + if (!skb) + return; + + skb_dst_drop(skb); + /* segs_in has been initialized to 1 in tcp_create_openreq_child(). + * Hence, reset segs_in to 0 before calling tcp_segs_in() + * to avoid double counting. Also, tcp_segs_in() expects + * skb->len to include the tcp_hdrlen. Hence, it should + * be called before __skb_pull(). + */ + tp->segs_in = 0; + tcp_segs_in(tp, skb); + __skb_pull(skb, tcp_hdrlen(skb)); + skb_set_owner_r(skb, sk); + + TCP_SKB_CB(skb)->seq++; + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_SYN; + + tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + __skb_queue_tail(&sk->sk_receive_queue, skb); + tp->syn_data_acked = 1; + + /* u64_stats_update_begin(&tp->syncp) not needed here, + * as we certainly are not changing upper 32bit value (0) + */ + tp->bytes_received = skb->len; + + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) + tcp_fin(sk); +} + static struct sock *tcp_fastopen_create_child(struct sock *sk, struct sk_buff *skb, struct dst_entry *dst, @@ -132,7 +176,6 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, struct tcp_sock *tp; struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; struct sock *child; - u32 end_seq; bool own_req; req->num_retrans = 0; @@ -178,35 +221,11 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, tcp_init_metrics(child); tcp_init_buffer_space(child); - /* Queue the data carried in the SYN packet. - * We used to play tricky games with skb_get(). - * With lockless listener, it is a dead end. - * Do not think about it. - * - * XXX (TFO) - we honor a zero-payload TFO request for now, - * (any reason not to?) but no need to queue the skb since - * there is no data. How about SYN+FIN? - */ - end_seq = TCP_SKB_CB(skb)->end_seq; - if (end_seq != TCP_SKB_CB(skb)->seq + 1) { - struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); - - if (likely(skb2)) { - skb_dst_drop(skb2); - __skb_pull(skb2, tcp_hdrlen(skb)); - skb_set_owner_r(skb2, child); - __skb_queue_tail(&child->sk_receive_queue, skb2); - tp->syn_data_acked = 1; - - /* u64_stats_update_begin(&tp->syncp) not needed here, - * as we certainly are not changing upper 32bit value (0) - */ - tp->bytes_received = end_seq - TCP_SKB_CB(skb)->seq - 1; - } else { - end_seq = TCP_SKB_CB(skb)->seq + 1; - } - } - tcp_rsk(req)->rcv_nxt = tp->rcv_nxt = end_seq; + tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; + + tcp_fastopen_add_skb(child, skb); + + tcp_rsk(req)->rcv_nxt = tp->rcv_nxt; /* tcp_conn_request() is sending the SYNACK, * and queues the child into listener accept queue. */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b3a47bc08..131005f82 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -83,9 +83,7 @@ EXPORT_SYMBOL(sysctl_tcp_timestamps); int sysctl_tcp_window_scaling __read_mostly = 1; int sysctl_tcp_sack __read_mostly = 1; int sysctl_tcp_fack __read_mostly = 1; -int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH; int sysctl_tcp_max_reordering __read_mostly = 300; -EXPORT_SYMBOL(sysctl_tcp_reordering); int sysctl_tcp_dsack __read_mostly = 1; int sysctl_tcp_app_win __read_mostly = 31; int sysctl_tcp_adv_win_scale __read_mostly = 1; @@ -129,6 +127,10 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) #define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) +#define REXMIT_NONE 0 /* no loss recovery to do */ +#define REXMIT_LOST 1 /* retransmit packets marked lost */ +#define REXMIT_NEW 2 /* FRTO-style transmit of unsent/new packets */ + /* Adapt the MSS value used to make delayed ack decision to the * real world. */ @@ -1213,6 +1215,7 @@ static u8 tcp_sacktag_one(struct sock *sk, sacked |= TCPCB_SACKED_ACKED; state->flag |= FLAG_DATA_SACKED; tp->sacked_out += pcount; + tp->delivered += pcount; /* Out-of-order packets delivered */ fack_count += pcount; @@ -1309,6 +1312,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, if (skb == tcp_highest_sack(sk)) tcp_advance_highest_sack(sk, skb); + tcp_skb_collapse_tstamp(prev, skb); tcp_unlink_write_queue(skb, sk); sk_wmem_free_skb(sk, skb); @@ -1824,8 +1828,12 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend) static void tcp_add_reno_sack(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + u32 prior_sacked = tp->sacked_out; + tp->sacked_out++; tcp_check_reno_reordering(sk, 0); + if (tp->sacked_out > prior_sacked) + tp->delivered++; /* Some out-of-order packet is delivered */ tcp_verify_left_out(tp); } @@ -1837,6 +1845,7 @@ static void tcp_remove_reno_sacks(struct sock *sk, int acked) if (acked > 0) { /* One ACK acked hole. The rest eat duplicate ACKs. */ + tp->delivered += max_t(int, acked - tp->sacked_out, 1); if (acked - 1 >= tp->sacked_out) tp->sacked_out = 0; else @@ -1876,6 +1885,7 @@ void tcp_enter_loss(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); struct sk_buff *skb; bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery; bool is_reneg; /* is receiver reneging on SACKs? */ @@ -1926,9 +1936,9 @@ void tcp_enter_loss(struct sock *sk) * suggests that the degree of reordering is over-estimated. */ if (icsk->icsk_ca_state <= TCP_CA_Disorder && - tp->sacked_out >= sysctl_tcp_reordering) + tp->sacked_out >= net->ipv4.sysctl_tcp_reordering) tp->reordering = min_t(unsigned int, tp->reordering, - sysctl_tcp_reordering); + net->ipv4.sysctl_tcp_reordering); tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->snd_nxt; tcp_ecn_queue_cwr(tp); @@ -2112,6 +2122,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) { struct tcp_sock *tp = tcp_sk(sk); __u32 packets_out; + int tcp_reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering; /* Trick#1: The loss is proven. */ if (tp->lost_out) @@ -2126,7 +2137,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) */ packets_out = tp->packets_out; if (packets_out <= tp->reordering && - tp->sacked_out >= max_t(__u32, packets_out/2, sysctl_tcp_reordering) && + tp->sacked_out >= max_t(__u32, packets_out/2, tcp_reordering) && !tcp_may_send_now(sk)) { /* We have nothing to send. This connection is limited * either by receiver window or by application. @@ -2470,14 +2481,12 @@ static void tcp_init_cwnd_reduction(struct sock *sk) tcp_ecn_queue_cwr(tp); } -static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked, - int fast_rexmit, int flag) +static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, + int flag) { struct tcp_sock *tp = tcp_sk(sk); int sndcnt = 0; int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp); - int newly_acked_sacked = prior_unsacked - - (tp->packets_out - tp->sacked_out); if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd)) return; @@ -2495,7 +2504,8 @@ static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked, } else { sndcnt = min(delta, newly_acked_sacked); } - sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0)); + /* Force a fast retransmit upon entering fast recovery */ + sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1)); tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt; } @@ -2540,7 +2550,7 @@ static void tcp_try_keep_open(struct sock *sk) } } -static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked) +static void tcp_try_to_open(struct sock *sk, int flag) { struct tcp_sock *tp = tcp_sk(sk); @@ -2554,8 +2564,6 @@ static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked) if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { tcp_try_keep_open(sk); - } else { - tcp_cwnd_reduction(sk, prior_unsacked, 0, flag); } } @@ -2665,7 +2673,8 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack) /* Process an ACK in CA_Loss state. Move to CA_Open if lost data are * recovered or spurious. Otherwise retransmits more on partial ACKs. */ -static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) +static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack, + int *rexmit) { struct tcp_sock *tp = tcp_sk(sk); bool recovered = !before(tp->snd_una, tp->high_seq); @@ -2687,10 +2696,15 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) tp->frto = 0; /* Step 3.a. loss was real */ } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) { tp->high_seq = tp->snd_nxt; - __tcp_push_pending_frames(sk, tcp_current_mss(sk), - TCP_NAGLE_OFF); - if (after(tp->snd_nxt, tp->high_seq)) - return; /* Step 2.b */ + /* Step 2.b. Try send new data (but deferred until cwnd + * is updated in tcp_ack()). Otherwise fall back to + * the conventional recovery. + */ + if (tcp_send_head(sk) && + after(tcp_wnd_end(tp), tp->snd_nxt)) { + *rexmit = REXMIT_NEW; + return; + } tp->frto = 0; } } @@ -2709,12 +2723,11 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) else if (flag & FLAG_SND_UNA_ADVANCED) tcp_reset_reno_sack(tp); } - tcp_xmit_retransmit_queue(sk); + *rexmit = REXMIT_LOST; } /* Undo during fast recovery after partial ACK. */ -static bool tcp_try_undo_partial(struct sock *sk, const int acked, - const int prior_unsacked, int flag) +static bool tcp_try_undo_partial(struct sock *sk, const int acked) { struct tcp_sock *tp = tcp_sk(sk); @@ -2729,10 +2742,8 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked, * can undo. Otherwise we clock out new packets but do not * mark more packets lost or retransmit more. */ - if (tp->retrans_out) { - tcp_cwnd_reduction(sk, prior_unsacked, 0, flag); + if (tp->retrans_out) return true; - } if (!tcp_any_retrans_done(sk)) tp->retrans_stamp = 0; @@ -2751,21 +2762,21 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked, * taking into account both packets sitting in receiver's buffer and * packets lost by network. * - * Besides that it does CWND reduction, when packet loss is detected - * and changes state of machine. + * Besides that it updates the congestion state when packet loss or ECN + * is detected. But it does not reduce the cwnd, it is done by the + * congestion control later. * * It does _not_ decide what to send, it is made in function * tcp_xmit_retransmit_queue(). */ static void tcp_fastretrans_alert(struct sock *sk, const int acked, - const int prior_unsacked, - bool is_dupack, int flag) + bool is_dupack, int *ack_flag, int *rexmit) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); + int fast_rexmit = 0, flag = *ack_flag; bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && (tcp_fackets_out(tp) > tp->reordering)); - int fast_rexmit = 0; if (WARN_ON(!tp->packets_out && tp->sacked_out)) tp->sacked_out = 0; @@ -2812,8 +2823,10 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, /* Use RACK to detect loss */ if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS && - tcp_rack_mark_lost(sk)) + tcp_rack_mark_lost(sk)) { flag |= FLAG_LOST_RETRANS; + *ack_flag |= FLAG_LOST_RETRANS; + } /* E. Process state. */ switch (icsk->icsk_ca_state) { @@ -2822,7 +2835,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, if (tcp_is_reno(tp) && is_dupack) tcp_add_reno_sack(sk); } else { - if (tcp_try_undo_partial(sk, acked, prior_unsacked, flag)) + if (tcp_try_undo_partial(sk, acked)) return; /* Partial ACK arrived. Force fast retransmit. */ do_lost = tcp_is_reno(tp) || @@ -2834,7 +2847,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, } break; case TCP_CA_Loss: - tcp_process_loss(sk, flag, is_dupack); + tcp_process_loss(sk, flag, is_dupack, rexmit); if (icsk->icsk_ca_state != TCP_CA_Open && !(flag & FLAG_LOST_RETRANS)) return; @@ -2851,7 +2864,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, tcp_try_undo_dsack(sk); if (!tcp_time_to_recover(sk, flag)) { - tcp_try_to_open(sk, flag, prior_unsacked); + tcp_try_to_open(sk, flag); return; } @@ -2873,8 +2886,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, if (do_lost) tcp_update_scoreboard(sk, fast_rexmit); - tcp_cwnd_reduction(sk, prior_unsacked, fast_rexmit, flag); - tcp_xmit_retransmit_queue(sk); + *rexmit = REXMIT_LOST; } /* Kathleen Nichols' algorithm for tracking the minimum value of @@ -3090,7 +3102,8 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, shinfo = skb_shinfo(skb); if ((shinfo->tx_flags & SKBTX_ACK_TSTAMP) && - between(shinfo->tskey, prior_snd_una, tcp_sk(sk)->snd_una - 1)) + !before(shinfo->tskey, prior_snd_una) && + before(shinfo->tskey, tcp_sk(sk)->snd_una)) __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); } @@ -3099,7 +3112,7 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, * arrived at the other end. */ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, - u32 prior_snd_una, + u32 prior_snd_una, int *acked, struct tcp_sacktag_state *sack) { const struct inet_connection_sock *icsk = inet_csk(sk); @@ -3157,10 +3170,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, flag |= FLAG_ORIG_SACK_ACKED; } - if (sacked & TCPCB_SACKED_ACKED) + if (sacked & TCPCB_SACKED_ACKED) { tp->sacked_out -= acked_pcount; - else if (tcp_is_sack(tp) && !tcp_skb_spurious_retrans(tp, skb)) - tcp_rack_advance(tp, &skb->skb_mstamp, sacked); + } else if (tcp_is_sack(tp)) { + tp->delivered += acked_pcount; + if (!tcp_skb_spurious_retrans(tp, skb)) + tcp_rack_advance(tp, &skb->skb_mstamp, sacked); + } if (sacked & TCPCB_LOST) tp->lost_out -= acked_pcount; @@ -3269,6 +3285,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, } } #endif + *acked = pkts_acked; return flag; } @@ -3302,21 +3319,36 @@ static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag) /* Decide wheather to run the increase function of congestion control. */ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) { - if (tcp_in_cwnd_reduction(sk)) - return false; - /* If reordering is high then always grow cwnd whenever data is * delivered regardless of its ordering. Otherwise stay conservative * and only grow cwnd on in-order delivery (RFC5681). A stretched ACK w/ * new SACK or ECE mark may first advance cwnd here and later reduce * cwnd in tcp_fastretrans_alert() based on more states. */ - if (tcp_sk(sk)->reordering > sysctl_tcp_reordering) + if (tcp_sk(sk)->reordering > sock_net(sk)->ipv4.sysctl_tcp_reordering) return flag & FLAG_FORWARD_PROGRESS; return flag & FLAG_DATA_ACKED; } +/* The "ultimate" congestion control function that aims to replace the rigid + * cwnd increase and decrease control (tcp_cong_avoid,tcp_*cwnd_reduction). + * It's called toward the end of processing an ACK with precise rate + * information. All transmission or retransmission are delayed afterwards. + */ +static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked, + int flag) +{ + if (tcp_in_cwnd_reduction(sk)) { + /* Reduce cwnd if state mandates */ + tcp_cwnd_reduction(sk, acked_sacked, flag); + } else if (tcp_may_raise_cwnd(sk, flag)) { + /* Advance cwnd if state allows */ + tcp_cong_avoid(sk, ack, acked_sacked); + } + tcp_update_pacing_rate(sk); +} + /* Check that window update is acceptable. * The function assumes that snd_una<=ack<=snd_next. */ @@ -3512,6 +3544,27 @@ static inline void tcp_in_ack_event(struct sock *sk, u32 flags) icsk->icsk_ca_ops->in_ack_event(sk, flags); } +/* Congestion control has updated the cwnd already. So if we're in + * loss recovery then now we do any new sends (for FRTO) or + * retransmits (for CA_Loss or CA_recovery) that make sense. + */ +static void tcp_xmit_recovery(struct sock *sk, int rexmit) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (rexmit == REXMIT_NONE) + return; + + if (unlikely(rexmit == 2)) { + __tcp_push_pending_frames(sk, tcp_current_mss(sk), + TCP_NAGLE_OFF); + if (after(tp->snd_nxt, tp->high_seq)) + return; + tp->frto = 0; + } + tcp_xmit_retransmit_queue(sk); +} + /* This routine deals with incoming acks, but not outgoing ones. */ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) { @@ -3524,8 +3577,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) bool is_dupack = false; u32 prior_fackets; int prior_packets = tp->packets_out; - const int prior_unsacked = tp->packets_out - tp->sacked_out; + u32 prior_delivered = tp->delivered; int acked = 0; /* Number of packets newly acked */ + int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ sack_state.first_sackt.v64 = 0; @@ -3614,23 +3668,16 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) goto no_queue; /* See if we can take anything off of the retransmit queue. */ - acked = tp->packets_out; - flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, + flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked, &sack_state); - acked -= tp->packets_out; if (tcp_ack_is_dubious(sk, flag)) { is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); - tcp_fastretrans_alert(sk, acked, prior_unsacked, - is_dupack, flag); + tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); } if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); - /* Advance cwnd if state allows */ - if (tcp_may_raise_cwnd(sk, flag)) - tcp_cong_avoid(sk, ack, acked); - if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) { struct dst_entry *dst = __sk_dst_get(sk); if (dst) @@ -3639,14 +3686,14 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (icsk->icsk_pending == ICSK_TIME_RETRANS) tcp_schedule_loss_probe(sk); - tcp_update_pacing_rate(sk); + tcp_cong_control(sk, ack, tp->delivered - prior_delivered, flag); + tcp_xmit_recovery(sk, rexmit); return 1; no_queue: /* If data was DSACKed, see if we can undo a cwnd reduction. */ if (flag & FLAG_DSACKING_ACK) - tcp_fastretrans_alert(sk, acked, prior_unsacked, - is_dupack, flag); + tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than * it needs to be for normal retransmission. @@ -3669,8 +3716,8 @@ old_ack: if (TCP_SKB_CB(skb)->sacked) { flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, &sack_state); - tcp_fastretrans_alert(sk, acked, prior_unsacked, - is_dupack, flag); + tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); + tcp_xmit_recovery(sk, rexmit); } SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt); @@ -4042,7 +4089,7 @@ void tcp_reset(struct sock *sk) * * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT. */ -static void tcp_fin(struct sock *sk) +void tcp_fin(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -5598,6 +5645,9 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, tp->syn_data_acked = tp->syn_data; if (tp->syn_data_acked) NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE); + + tcp_fastopen_add_skb(sk, synack); + return false; } @@ -6204,9 +6254,10 @@ static bool tcp_syn_flood_action(const struct sock *sk, struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; const char *msg = "Dropping request"; bool want_cookie = false; + struct net *net = sock_net(sk); #ifdef CONFIG_SYN_COOKIES - if (sysctl_tcp_syncookies) { + if (net->ipv4.sysctl_tcp_syncookies) { msg = "Sending cookies"; want_cookie = true; NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES); @@ -6215,7 +6266,7 @@ static bool tcp_syn_flood_action(const struct sock *sk, NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); if (!queue->synflood_warned && - sysctl_tcp_syncookies != 2 && + net->ipv4.sysctl_tcp_syncookies != 2 && xchg(&queue->synflood_warned, 1) == 0) pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n", proto, ntohs(tcp_hdr(skb)->dest), msg); @@ -6248,6 +6299,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn; struct tcp_options_received tmp_opt; struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); struct sock *fastopen_sk = NULL; struct dst_entry *dst = NULL; struct request_sock *req; @@ -6258,7 +6310,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, * limitations, they conserve resources and peer is * evidently real one. */ - if ((sysctl_tcp_syncookies == 2 || + if ((net->ipv4.sysctl_tcp_syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) && !isn) { want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name); if (!want_cookie) @@ -6324,7 +6376,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, } } /* Kill the following clause, if you dislike this way. */ - else if (!sysctl_tcp_syncookies && + else if (!net->ipv4.sysctl_tcp_syncookies && (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < (sysctl_max_syn_backlog >> 2)) && !tcp_peer_is_proven(req, dst, false, diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index b8f3908dd..dfd153fbd 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -82,7 +82,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> -#include <linux/crypto.h> +#include <crypto/hash.h> #include <linux/scatterlist.h> int sysctl_tcp_tw_reuse __read_mostly; @@ -656,8 +656,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) * Incoming packet is checked with md5 hash with finding key, * no RST generated if md5 hash doesn't match. */ - sk1 = __inet_lookup_listener(net, - &tcp_hashinfo, ip_hdr(skb)->saddr, + sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0, + ip_hdr(skb)->saddr, th->source, ip_hdr(skb)->daddr, ntohs(th->source), inet_iif(skb)); /* don't send rst if it can't find key */ @@ -879,7 +879,6 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req) kfree(inet_rsk(req)->opt); } - #ifdef CONFIG_TCP_MD5SIG /* * RFC2385 MD5 checksumming requires a mapping of @@ -1053,21 +1052,22 @@ static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp, bp->len = cpu_to_be16(nbytes); sg_init_one(&sg, bp, sizeof(*bp)); - return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp)); + ahash_request_set_crypt(hp->md5_req, &sg, NULL, sizeof(*bp)); + return crypto_ahash_update(hp->md5_req); } static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, __be32 daddr, __be32 saddr, const struct tcphdr *th) { struct tcp_md5sig_pool *hp; - struct hash_desc *desc; + struct ahash_request *req; hp = tcp_get_md5sig_pool(); if (!hp) goto clear_hash_noput; - desc = &hp->md5_desc; + req = hp->md5_req; - if (crypto_hash_init(desc)) + if (crypto_ahash_init(req)) goto clear_hash; if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2)) goto clear_hash; @@ -1075,7 +1075,8 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, goto clear_hash; if (tcp_md5_hash_key(hp, key)) goto clear_hash; - if (crypto_hash_final(desc, md5_hash)) + ahash_request_set_crypt(req, NULL, md5_hash, 0); + if (crypto_ahash_final(req)) goto clear_hash; tcp_put_md5sig_pool(); @@ -1093,7 +1094,7 @@ int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, const struct sk_buff *skb) { struct tcp_md5sig_pool *hp; - struct hash_desc *desc; + struct ahash_request *req; const struct tcphdr *th = tcp_hdr(skb); __be32 saddr, daddr; @@ -1109,9 +1110,9 @@ int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, hp = tcp_get_md5sig_pool(); if (!hp) goto clear_hash_noput; - desc = &hp->md5_desc; + req = hp->md5_req; - if (crypto_hash_init(desc)) + if (crypto_ahash_init(req)) goto clear_hash; if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len)) @@ -1122,7 +1123,8 @@ int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, goto clear_hash; if (tcp_md5_hash_key(hp, key)) goto clear_hash; - if (crypto_hash_final(desc, md5_hash)) + ahash_request_set_crypt(req, NULL, md5_hash, 0); + if (crypto_ahash_final(req)) goto clear_hash; tcp_put_md5sig_pool(); @@ -1612,7 +1614,8 @@ int tcp_v4_rcv(struct sk_buff *skb) TCP_SKB_CB(skb)->sacked = 0; lookup: - sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest); + sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source, + th->dest); if (!sk) goto no_tcp_socket; @@ -1675,7 +1678,7 @@ process: sk_incoming_cpu_update(sk); bh_lock_sock_nested(sk); - tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs); + tcp_segs_in(tcp_sk(sk), skb); ret = 0; if (!sock_owned_by_user(sk)) { if (!tcp_prequeue(sk, skb)) @@ -1728,7 +1731,8 @@ do_time_wait: switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { case TCP_TW_SYN: { struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev), - &tcp_hashinfo, + &tcp_hashinfo, skb, + __tcp_hdrlen(th), iph->saddr, th->source, iph->daddr, th->dest, inet_iif(skb)); @@ -2420,6 +2424,16 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; + net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES; + net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; + net->ipv4.sysctl_tcp_syncookies = 1; + net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; + net->ipv4.sysctl_tcp_retries1 = TCP_RETR1; + net->ipv4.sysctl_tcp_retries2 = TCP_RETR2; + net->ipv4.sysctl_tcp_orphan_retries = 0; + net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; + net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; + return 0; fail: tcp_sk_exit(net); diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index a726d7853..7b7eec439 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -369,6 +369,7 @@ void tcp_update_metrics(struct sock *sk) const struct inet_connection_sock *icsk = inet_csk(sk); struct dst_entry *dst = __sk_dst_get(sk); struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); struct tcp_metrics_block *tm; unsigned long rtt; u32 val; @@ -473,7 +474,7 @@ void tcp_update_metrics(struct sock *sk) if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) { val = tcp_metric_get(tm, TCP_METRIC_REORDERING); if (val < tp->reordering && - tp->reordering != sysctl_tcp_reordering) + tp->reordering != net->ipv4.sysctl_tcp_reordering) tcp_metric_set(tm, TCP_METRIC_REORDERING, tp->reordering); } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 9b02af213..acb366dd6 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -27,9 +27,6 @@ #include <net/inet_common.h> #include <net/xfrm.h> -int sysctl_tcp_syncookies __read_mostly = 1; -EXPORT_SYMBOL(sysctl_tcp_syncookies); - int sysctl_tcp_abort_on_overflow __read_mostly; struct inet_timewait_death_row tcp_death_row = { @@ -815,7 +812,7 @@ int tcp_child_process(struct sock *parent, struct sock *child, int ret = 0; int state = child->sk_state; - tcp_sk(child)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs); + tcp_segs_in(tcp_sk(child), skb); if (!sock_owned_by_user(child)) { ret = tcp_rcv_state_process(child, skb); /* Wakeup parent, send SIGIO */ diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 9864a2dba..773083b7f 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -135,7 +135,9 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, th->fin = th->psh = 0; th->check = newcheck; - if (skb->ip_summed != CHECKSUM_PARTIAL) + if (skb->ip_summed == CHECKSUM_PARTIAL) + gso_reset_checksum(skb, ~th->check); + else th->check = gso_make_checksum(skb, ~th->check); seq += mss; @@ -169,7 +171,9 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, skb->data_len); th->check = ~csum_fold((__force __wsum)((__force u32)th->check + (__force u32)delta)); - if (skb->ip_summed != CHECKSUM_PARTIAL) + if (skb->ip_summed == CHECKSUM_PARTIAL) + gso_reset_checksum(skb, ~th->check); + else th->check = gso_make_checksum(skb, ~th->check); out: return segs; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 20f2812a9..c0c1dac81 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -62,9 +62,6 @@ int sysctl_tcp_tso_win_divisor __read_mostly = 3; /* By default, RFC2861 behavior. */ int sysctl_tcp_slow_start_after_idle __read_mostly = 1; -unsigned int sysctl_tcp_notsent_lowat __read_mostly = UINT_MAX; -EXPORT_SYMBOL(sysctl_tcp_notsent_lowat); - static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp); @@ -1013,8 +1010,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, if (likely(tcb->tcp_flags & TCPHDR_ACK)) tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); - if (skb->len != tcp_header_size) + if (skb->len != tcp_header_size) { tcp_event_data_sent(tp, sk); + tp->data_segs_out += tcp_skb_pcount(skb); + } if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq) TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, @@ -2449,6 +2448,20 @@ u32 __tcp_select_window(struct sock *sk) return window; } +void tcp_skb_collapse_tstamp(struct sk_buff *skb, + const struct sk_buff *next_skb) +{ + const struct skb_shared_info *next_shinfo = skb_shinfo(next_skb); + u8 tsflags = next_shinfo->tx_flags & SKBTX_ANY_TSTAMP; + + if (unlikely(tsflags)) { + struct skb_shared_info *shinfo = skb_shinfo(skb); + + shinfo->tx_flags |= tsflags; + shinfo->tskey = next_shinfo->tskey; + } +} + /* Collapses two adjacent SKB's during retransmission. */ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) { @@ -2492,6 +2505,8 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) tcp_adjust_pcount(sk, next_skb, tcp_skb_pcount(next_skb)); + tcp_skb_collapse_tstamp(skb, next_skb); + sk_wmem_free_skb(sk, next_skb); } @@ -2632,8 +2647,10 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) */ if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) || skb_headroom(skb) >= 0xFFFF)) { - struct sk_buff *nskb = __pskb_copy(skb, MAX_TCP_HEADER, - GFP_ATOMIC); + struct sk_buff *nskb; + + skb_mstamp_get(&skb->skb_mstamp); + nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : -ENOBUFS; } else { @@ -3491,6 +3508,7 @@ void tcp_send_probe0(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); unsigned long probe_max; int err; @@ -3504,7 +3522,7 @@ void tcp_send_probe0(struct sock *sk) } if (err <= 0) { - if (icsk->icsk_backoff < sysctl_tcp_retries2) + if (icsk->icsk_backoff < net->ipv4.sysctl_tcp_retries2) icsk->icsk_backoff++; icsk->icsk_probes_out++; probe_max = TCP_RTO_MAX; diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index ebf5ff575..f6c50af24 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c @@ -187,13 +187,13 @@ static int tcpprobe_sprint(char *tbuf, int n) { const struct tcp_log *p = tcp_probe.log + tcp_probe.tail; - struct timespec tv - = ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start)); + struct timespec64 ts + = ktime_to_timespec64(ktime_sub(p->tstamp, tcp_probe.start)); return scnprintf(tbuf, n, "%lu.%09lu %pISpc %pISpc %d %#x %#x %u %u %u %u %u\n", - (unsigned long)tv.tv_sec, - (unsigned long)tv.tv_nsec, + (unsigned long)ts.tv_sec, + (unsigned long)ts.tv_nsec, &p->src, &p->dst, p->length, p->snd_nxt, p->snd_una, p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt, p->rcv_wnd); } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index a4730a28b..49bc474f8 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -22,11 +22,6 @@ #include <linux/gfp.h> #include <net/tcp.h> -int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES; -int sysctl_tcp_synack_retries __read_mostly = TCP_SYNACK_RETRIES; -int sysctl_tcp_retries1 __read_mostly = TCP_RETR1; -int sysctl_tcp_retries2 __read_mostly = TCP_RETR2; -int sysctl_tcp_orphan_retries __read_mostly; int sysctl_tcp_thin_linear_timeouts __read_mostly; static void tcp_write_err(struct sock *sk) @@ -82,7 +77,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) /* Calculate maximal number or retries on an orphaned socket. */ static int tcp_orphan_retries(struct sock *sk, bool alive) { - int retries = sysctl_tcp_orphan_retries; /* May be zero. */ + int retries = sock_net(sk)->ipv4.sysctl_tcp_orphan_retries; /* May be zero. */ /* We know from an ICMP that something is wrong. */ if (sk->sk_err_soft && !alive) @@ -157,6 +152,7 @@ static int tcp_write_timeout(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); int retry_until; bool do_reset, syn_set = false; @@ -169,10 +165,10 @@ static int tcp_write_timeout(struct sock *sk) NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL); } - retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; + retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries; syn_set = true; } else { - if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) { + if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0, 0)) { /* Some middle-boxes may black-hole Fast Open _after_ * the handshake. Therefore we conservatively disable * Fast Open on this path on recurring timeouts with @@ -181,7 +177,7 @@ static int tcp_write_timeout(struct sock *sk) if (tp->syn_data_acked && tp->bytes_acked <= tp->rx_opt.mss_clamp) { tcp_fastopen_cache_set(sk, 0, NULL, true, 0); - if (icsk->icsk_retransmits == sysctl_tcp_retries1) + if (icsk->icsk_retransmits == net->ipv4.sysctl_tcp_retries1) NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL); } @@ -191,7 +187,7 @@ static int tcp_write_timeout(struct sock *sk) dst_negative_advice(sk); } - retry_until = sysctl_tcp_retries2; + retry_until = net->ipv4.sysctl_tcp_retries2; if (sock_flag(sk, SOCK_DEAD)) { const bool alive = icsk->icsk_rto < TCP_RTO_MAX; @@ -305,7 +301,7 @@ static void tcp_probe_timer(struct sock *sk) (s32)(tcp_time_stamp - start_ts) > icsk->icsk_user_timeout) goto abort; - max_probes = sysctl_tcp_retries2; + max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2; if (sock_flag(sk, SOCK_DEAD)) { const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX; @@ -332,7 +328,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); int max_retries = icsk->icsk_syn_retries ? : - sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */ + sock_net(sk)->ipv4.sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */ struct request_sock *req; req = tcp_sk(sk)->fastopen_rsk; @@ -360,6 +356,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk) void tcp_retransmit_timer(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); struct inet_connection_sock *icsk = inet_csk(sk); if (tp->fastopen_rsk) { @@ -490,7 +487,7 @@ out_reset_timer: icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); } inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); - if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0)) + if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1 + 1, 0, 0)) __sk_dst_reset(sk); out:; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index eb8933bc0..a2e7f55a1 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -339,8 +339,13 @@ found: hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); spin_lock(&hslot2->lock); - hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, - &hslot2->head); + if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && + sk->sk_family == AF_INET6) + hlist_nulls_add_tail_rcu(&udp_sk(sk)->udp_portaddr_node, + &hslot2->head); + else + hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, + &hslot2->head); hslot2->count++; spin_unlock(&hslot2->lock); } @@ -356,8 +361,8 @@ EXPORT_SYMBOL(udp_lib_get_port); * match_wildcard == false: addresses must be exactly the same, i.e. * 0.0.0.0 only equals to 0.0.0.0 */ -static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2, - bool match_wildcard) +int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2, + bool match_wildcard) { struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2); @@ -848,32 +853,20 @@ void udp_set_csum(bool nocheck, struct sk_buff *skb, { struct udphdr *uh = udp_hdr(skb); - if (nocheck) + if (nocheck) { uh->check = 0; - else if (skb_is_gso(skb)) + } else if (skb_is_gso(skb)) { uh->check = ~udp_v4_check(len, saddr, daddr, 0); - else if (skb_dst(skb) && skb_dst(skb)->dev && - (skb_dst(skb)->dev->features & - (NETIF_F_IP_CSUM | NETIF_F_HW_CSUM))) { - - BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); - + } else if (skb->ip_summed == CHECKSUM_PARTIAL) { + uh->check = 0; + uh->check = udp_v4_check(len, saddr, daddr, lco_csum(skb)); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + } else { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct udphdr, check); uh->check = ~udp_v4_check(len, saddr, daddr, 0); - } else { - __wsum csum; - - BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL); - - uh->check = 0; - csum = skb_checksum(skb, 0, len, 0); - uh->check = udp_v4_check(len, saddr, daddr, csum); - if (uh->check == 0) - uh->check = CSUM_MANGLED_0; - - skb->ip_summed = CHECKSUM_UNNECESSARY; } } EXPORT_SYMBOL(udp_set_csum); diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 4c519c1dc..e330c0e56 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -32,42 +32,65 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, netdev_features_t features), __be16 new_protocol, bool is_ipv6) { + int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); + bool remcsum, need_csum, offload_csum, ufo; struct sk_buff *segs = ERR_PTR(-EINVAL); + struct udphdr *uh = udp_hdr(skb); u16 mac_offset = skb->mac_header; - int mac_len = skb->mac_len; - int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); __be16 protocol = skb->protocol; - netdev_features_t enc_features; + u16 mac_len = skb->mac_len; int udp_offset, outer_hlen; - unsigned int oldlen; - bool need_csum = !!(skb_shinfo(skb)->gso_type & - SKB_GSO_UDP_TUNNEL_CSUM); - bool remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM); - bool offload_csum = false, dont_encap = (need_csum || remcsum); - - oldlen = (u16)~skb->len; + __wsum partial; if (unlikely(!pskb_may_pull(skb, tnl_hlen))) goto out; + /* Adjust partial header checksum to negate old length. + * We cannot rely on the value contained in uh->len as it is + * possible that the actual value exceeds the boundaries of the + * 16 bit length field due to the header being added outside of an + * IP or IPv6 frame that was already limited to 64K - 1. + */ + partial = csum_sub(csum_unfold(uh->check), + (__force __wsum)htonl(skb->len)); + + /* setup inner skb. */ skb->encapsulation = 0; + SKB_GSO_CB(skb)->encap_level = 0; __skb_pull(skb, tnl_hlen); skb_reset_mac_header(skb); skb_set_network_header(skb, skb_inner_network_offset(skb)); skb->mac_len = skb_inner_network_offset(skb); skb->protocol = new_protocol; + + need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM); skb->encap_hdr_csum = need_csum; + + remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM); skb->remcsum_offload = remcsum; + ufo = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); + /* Try to offload checksum if possible */ offload_csum = !!(need_csum && - ((skb->dev->features & NETIF_F_HW_CSUM) || - (skb->dev->features & (is_ipv6 ? - NETIF_F_IPV6_CSUM : NETIF_F_IP_CSUM)))); + (skb->dev->features & + (is_ipv6 ? (NETIF_F_HW_CSUM | NETIF_F_IPV6_CSUM) : + (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM)))); + + features &= skb->dev->hw_enc_features; + + /* The only checksum offload we care about from here on out is the + * outer one so strip the existing checksum feature flags and + * instead set the flag based on our outer checksum offload value. + */ + if (remcsum || ufo) { + features &= ~NETIF_F_CSUM_MASK; + if (!need_csum || offload_csum) + features |= NETIF_F_HW_CSUM; + } /* segment inner packet. */ - enc_features = skb->dev->hw_enc_features & features; - segs = gso_inner_segment(skb, enc_features); + segs = gso_inner_segment(skb, features); if (IS_ERR_OR_NULL(segs)) { skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset, mac_len); @@ -78,17 +101,13 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, udp_offset = outer_hlen - tnl_hlen; skb = segs; do { - struct udphdr *uh; - int len; - __be32 delta; + __be16 len; - if (dont_encap) { - skb->encapsulation = 0; + if (remcsum) skb->ip_summed = CHECKSUM_NONE; - } else { - /* Only set up inner headers if we might be offloading - * inner checksum. - */ + + /* Set up inner headers if we are offloading inner checksum */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { skb_reset_inner_headers(skb); skb->encapsulation = 1; } @@ -96,43 +115,27 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, skb->mac_len = mac_len; skb->protocol = protocol; - skb_push(skb, outer_hlen); + __skb_push(skb, outer_hlen); skb_reset_mac_header(skb); skb_set_network_header(skb, mac_len); skb_set_transport_header(skb, udp_offset); - len = skb->len - udp_offset; + len = htons(skb->len - udp_offset); uh = udp_hdr(skb); - uh->len = htons(len); + uh->len = len; if (!need_csum) continue; - delta = htonl(oldlen + len); + uh->check = ~csum_fold(csum_add(partial, (__force __wsum)len)); - uh->check = ~csum_fold((__force __wsum) - ((__force u32)uh->check + - (__force u32)delta)); - if (offload_csum) { - skb->ip_summed = CHECKSUM_PARTIAL; - skb->csum_start = skb_transport_header(skb) - skb->head; - skb->csum_offset = offsetof(struct udphdr, check); - } else if (remcsum) { - /* Need to calculate checksum from scratch, - * inner checksums are never when doing - * remote_checksum_offload. - */ - - skb->csum = skb_checksum(skb, udp_offset, - skb->len - udp_offset, - 0); - uh->check = csum_fold(skb->csum); - if (uh->check == 0) - uh->check = CSUM_MANGLED_0; - } else { + if (skb->encapsulation || !offload_csum) { uh->check = gso_make_checksum(skb, ~uh->check); - if (uh->check == 0) uh->check = CSUM_MANGLED_0; + } else { + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); } } while ((skb = skb->next)); out: @@ -235,6 +238,13 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, skb->ip_summed = CHECKSUM_NONE; + /* If there is no outer header we can fake a checksum offload + * due to the fact that we have already done the checksum in + * software prior to segmenting the frame. + */ + if (!skb->encap_hdr_csum) + features |= NETIF_F_HW_CSUM; + /* Fragment the skb. IP headers of the fragments are updated in * inet_gso_segment() */ @@ -302,14 +312,14 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb, unsigned int off = skb_gro_offset(skb); int flush = 1; - if (NAPI_GRO_CB(skb)->udp_mark || + if (NAPI_GRO_CB(skb)->encap_mark || (skb->ip_summed != CHECKSUM_PARTIAL && NAPI_GRO_CB(skb)->csum_cnt == 0 && !NAPI_GRO_CB(skb)->csum_valid)) goto out; - /* mark that this skb passed once through the udp gro layer */ - NAPI_GRO_CB(skb)->udp_mark = 1; + /* mark that this skb passed once through the tunnel gro layer */ + NAPI_GRO_CB(skb)->encap_mark = 1; rcu_read_lock(); uo_priv = rcu_dereference(udp_offload_base); @@ -389,6 +399,11 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff) uh->len = newlen; + /* Set encapsulation before calling into inner gro_complete() functions + * to make them set up the inner offsets. + */ + skb->encapsulation = 1; + rcu_read_lock(); uo_priv = rcu_dereference(udp_offload_base); @@ -411,9 +426,6 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff) if (skb->remcsum_offload) skb_shinfo(skb)->gso_type |= SKB_GSO_TUNNEL_REMCSUM; - skb->encapsulation = 1; - skb_set_inner_mac_header(skb, nhoff + sizeof(struct udphdr)); - return err; } |