diff options
Diffstat (limited to 'net/ipv4')
68 files changed, 1815 insertions, 1240 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 23431321c..a3c217b0d 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -331,20 +331,6 @@ config NET_FOU_IP_TUNNELS When this option is enabled IP tunnels can be configured to use FOU or GUE encapsulation. -config GENEVE_CORE - tristate "Generic Network Virtualization Encapsulation library" - depends on INET - select NET_UDP_TUNNEL - ---help--- - This allows one to create Geneve virtual interfaces that provide - Layer 2 Networks over Layer 3 Networks. Geneve is often used - to tunnel virtual network infrastructure in virtualized environments. - For more information see: - http://tools.ietf.org/html/draft-gross-geneve-01 - - To compile this driver as a module, choose M here: the module - - config INET_AH tristate "IP: AH transformation" select XFRM_ALGO diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index efc43f300..89aacb630 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -57,7 +57,6 @@ obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o obj-$(CONFIG_NETLABEL) += cipso_ipv4.o -obj-$(CONFIG_GENEVE_CORE) += geneve_core.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ xfrm4_output.o xfrm4_protocol.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 9532ee871..1d0c3adb6 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -112,12 +112,14 @@ #include <net/raw.h> #include <net/icmp.h> #include <net/inet_common.h> +#include <net/ip_tunnels.h> #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/secure_seq.h> #ifdef CONFIG_IP_MROUTE #include <linux/mroute.h> #endif +#include <net/vrf.h> /* The inetsw table contains everything that inet_create needs to @@ -426,6 +428,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) struct net *net = sock_net(sk); unsigned short snum; int chk_addr_ret; + u32 tb_id = RT_TABLE_LOCAL; int err; /* If the socket has its own bind function then use it. (RAW) */ @@ -447,7 +450,8 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto out; } - chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr); + tb_id = vrf_dev_table_ifindex(net, sk->sk_bound_dev_if) ? : tb_id; + chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id); /* Not specified by any standard per-se, however it breaks too * many applications when removed. It is unfortunate since @@ -1448,38 +1452,51 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family, } EXPORT_SYMBOL_GPL(inet_ctl_sock_create); +u64 snmp_get_cpu_field(void __percpu *mib, int cpu, int offt) +{ + return *(((unsigned long *)per_cpu_ptr(mib, cpu)) + offt); +} +EXPORT_SYMBOL_GPL(snmp_get_cpu_field); + unsigned long snmp_fold_field(void __percpu *mib, int offt) { unsigned long res = 0; int i; for_each_possible_cpu(i) - res += *(((unsigned long *) per_cpu_ptr(mib, i)) + offt); + res += snmp_get_cpu_field(mib, i, offt); return res; } EXPORT_SYMBOL_GPL(snmp_fold_field); #if BITS_PER_LONG==32 +u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offt, + size_t syncp_offset) +{ + void *bhptr; + struct u64_stats_sync *syncp; + u64 v; + unsigned int start; + + bhptr = per_cpu_ptr(mib, cpu); + syncp = (struct u64_stats_sync *)(bhptr + syncp_offset); + do { + start = u64_stats_fetch_begin_irq(syncp); + v = *(((u64 *)bhptr) + offt); + } while (u64_stats_fetch_retry_irq(syncp, start)); + + return v; +} +EXPORT_SYMBOL_GPL(snmp_get_cpu_field64); + u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset) { u64 res = 0; int cpu; for_each_possible_cpu(cpu) { - void *bhptr; - struct u64_stats_sync *syncp; - u64 v; - unsigned int start; - - bhptr = per_cpu_ptr(mib, cpu); - syncp = (struct u64_stats_sync *)(bhptr + syncp_offset); - do { - start = u64_stats_fetch_begin_irq(syncp); - v = *(((u64 *) bhptr) + offt); - } while (u64_stats_fetch_retry_irq(syncp, start)); - - res += v; + res += snmp_get_cpu_field64(mib, cpu, offt, syncp_offset); } return res; } @@ -1780,6 +1797,8 @@ static int __init inet_init(void) dev_add_pack(&ip_packet_type); + ip_tunnel_core_init(); + rc = 0; out: return rc; diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index ac9a32ec3..f2a71025a 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -360,8 +360,10 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) work_iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl + ahp->icv_trunc_len + seqhi_len); - if (!work_iph) + if (!work_iph) { + err = -ENOMEM; goto out; + } seqhi = (__be32 *)((char *)work_iph + ihl); auth_data = ah_tmp_auth(seqhi, seqhi_len); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 6c8b1fbaf..0c9c3482e 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -113,6 +113,8 @@ #include <net/arp.h> #include <net/ax25.h> #include <net/netrom.h> +#include <net/dst_metadata.h> +#include <net/ip_tunnels.h> #include <linux/uaccess.h> @@ -233,7 +235,7 @@ static int arp_constructor(struct neighbour *neigh) return -EINVAL; } - neigh->type = inet_addr_type(dev_net(dev), addr); + neigh->type = inet_addr_type_dev_table(dev_net(dev), dev, addr); parms = in_dev->arp_parms; __neigh_parms_put(neigh->parms); @@ -291,6 +293,39 @@ static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb) kfree_skb(skb); } +/* Create and send an arp packet. */ +static void arp_send_dst(int type, int ptype, __be32 dest_ip, + struct net_device *dev, __be32 src_ip, + const unsigned char *dest_hw, + const unsigned char *src_hw, + const unsigned char *target_hw, + struct dst_entry *dst) +{ + struct sk_buff *skb; + + /* arp on this interface. */ + if (dev->flags & IFF_NOARP) + return; + + skb = arp_create(type, ptype, dest_ip, dev, src_ip, + dest_hw, src_hw, target_hw); + if (!skb) + return; + + skb_dst_set(skb, dst_clone(dst)); + arp_xmit(skb); +} + +void arp_send(int type, int ptype, __be32 dest_ip, + struct net_device *dev, __be32 src_ip, + const unsigned char *dest_hw, const unsigned char *src_hw, + const unsigned char *target_hw) +{ + arp_send_dst(type, ptype, dest_ip, dev, src_ip, dest_hw, src_hw, + target_hw, NULL); +} +EXPORT_SYMBOL(arp_send); + static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) { __be32 saddr = 0; @@ -299,6 +334,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) __be32 target = *(__be32 *)neigh->primary_key; int probes = atomic_read(&neigh->probes); struct in_device *in_dev; + struct dst_entry *dst = NULL; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); @@ -309,7 +345,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) switch (IN_DEV_ARP_ANNOUNCE(in_dev)) { default: case 0: /* By default announce any local IP */ - if (skb && inet_addr_type(dev_net(dev), + if (skb && inet_addr_type_dev_table(dev_net(dev), dev, ip_hdr(skb)->saddr) == RTN_LOCAL) saddr = ip_hdr(skb)->saddr; break; @@ -317,7 +353,8 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) if (!skb) break; saddr = ip_hdr(skb)->saddr; - if (inet_addr_type(dev_net(dev), saddr) == RTN_LOCAL) { + if (inet_addr_type_dev_table(dev_net(dev), dev, + saddr) == RTN_LOCAL) { /* saddr should be known to target */ if (inet_addr_onlink(in_dev, target, saddr)) break; @@ -346,8 +383,10 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) } } - arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, - dst_hw, dev->dev_addr, NULL); + if (skb && !(dev->priv_flags & IFF_XMIT_DST_RELEASE)) + dst = skb_dst(skb); + arp_send_dst(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, + dst_hw, dev->dev_addr, NULL, dst); } static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip) @@ -597,32 +636,6 @@ void arp_xmit(struct sk_buff *skb) EXPORT_SYMBOL(arp_xmit); /* - * Create and send an arp packet. - */ -void arp_send(int type, int ptype, __be32 dest_ip, - struct net_device *dev, __be32 src_ip, - const unsigned char *dest_hw, const unsigned char *src_hw, - const unsigned char *target_hw) -{ - struct sk_buff *skb; - - /* - * No arp on this interface. - */ - - if (dev->flags&IFF_NOARP) - return; - - skb = arp_create(type, ptype, dest_ip, dev, src_ip, - dest_hw, src_hw, target_hw); - if (!skb) - return; - - arp_xmit(skb); -} -EXPORT_SYMBOL(arp_send); - -/* * Process an arp request. */ @@ -639,6 +652,7 @@ static int arp_process(struct sock *sk, struct sk_buff *skb) int addr_type; struct neighbour *n; struct net *net = dev_net(dev); + struct dst_entry *reply_dst = NULL; bool is_garp = false; /* arp_rcv below verifies the ARP header and verifies the device @@ -739,13 +753,18 @@ static int arp_process(struct sock *sk, struct sk_buff *skb) * cache. */ + if (arp->ar_op == htons(ARPOP_REQUEST) && skb_metadata_dst(skb)) + reply_dst = (struct dst_entry *) + iptunnel_metadata_reply(skb_metadata_dst(skb), + GFP_ATOMIC); + /* Special case: IPv4 duplicate address detection packet (RFC2131) */ if (sip == 0) { if (arp->ar_op == htons(ARPOP_REQUEST) && - inet_addr_type(net, tip) == RTN_LOCAL && + inet_addr_type_dev_table(net, dev, tip) == RTN_LOCAL && !arp_ignore(in_dev, sip, tip)) - arp_send(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, - dev->dev_addr, sha); + arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, + sha, dev->dev_addr, sha, reply_dst); goto out; } @@ -764,9 +783,10 @@ static int arp_process(struct sock *sk, struct sk_buff *skb) if (!dont_send) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) { - arp_send(ARPOP_REPLY, ETH_P_ARP, sip, - dev, tip, sha, dev->dev_addr, - sha); + arp_send_dst(ARPOP_REPLY, ETH_P_ARP, + sip, dev, tip, sha, + dev->dev_addr, sha, + reply_dst); neigh_release(n); } } @@ -784,13 +804,14 @@ static int arp_process(struct sock *sk, struct sk_buff *skb) if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED || skb->pkt_type == PACKET_HOST || NEIGH_VAR(in_dev->arp_parms, PROXY_DELAY) == 0) { - arp_send(ARPOP_REPLY, ETH_P_ARP, sip, - dev, tip, sha, dev->dev_addr, - sha); + arp_send_dst(ARPOP_REPLY, ETH_P_ARP, + sip, dev, tip, sha, + dev->dev_addr, sha, + reply_dst); } else { pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb); - return 0; + goto out_free_dst; } goto out; } @@ -802,16 +823,18 @@ static int arp_process(struct sock *sk, struct sk_buff *skb) n = __neigh_lookup(&arp_tbl, &sip, dev, 0); if (IN_DEV_ARP_ACCEPT(in_dev)) { + unsigned int addr_type = inet_addr_type_dev_table(net, dev, sip); + /* Unsolicited ARP is not accepted by default. It is possible, that this option should be enabled for some devices (strip is candidate) */ is_garp = arp->ar_op == htons(ARPOP_REQUEST) && tip == sip && - inet_addr_type(net, sip) == RTN_UNICAST; + addr_type == RTN_UNICAST; if (!n && ((arp->ar_op == htons(ARPOP_REPLY) && - inet_addr_type(net, sip) == RTN_UNICAST) || is_garp)) + addr_type == RTN_UNICAST) || is_garp)) n = __neigh_lookup(&arp_tbl, &sip, dev, 1); } @@ -842,6 +865,8 @@ static int arp_process(struct sock *sk, struct sk_buff *skb) out: consume_skb(skb); +out_free_dst: + dst_release(reply_dst); return 0; } diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 574fad9cc..f915abff1 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -74,7 +74,7 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len inet->inet_daddr = fl4->daddr; inet->inet_dport = usin->sin_port; sk->sk_state = TCP_ESTABLISHED; - inet_set_txhash(sk); + sk_set_txhash(sk); inet->inet_id = jiffies; sk_dst_set(sk, &rt->dst); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 6bbc54940..457b2cd75 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -45,6 +45,8 @@ #include <net/ip_fib.h> #include <net/rtnetlink.h> #include <net/xfrm.h> +#include <net/vrf.h> +#include <trace/events/fib.h> #ifndef CONFIG_IP_MULTIPLE_TABLES @@ -211,12 +213,12 @@ void fib_flush_external(struct net *net) */ static inline unsigned int __inet_dev_addr_type(struct net *net, const struct net_device *dev, - __be32 addr) + __be32 addr, u32 tb_id) { struct flowi4 fl4 = { .daddr = addr }; struct fib_result res; unsigned int ret = RTN_BROADCAST; - struct fib_table *local_table; + struct fib_table *table; if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr)) return RTN_BROADCAST; @@ -225,10 +227,10 @@ static inline unsigned int __inet_dev_addr_type(struct net *net, rcu_read_lock(); - local_table = fib_get_table(net, RT_TABLE_LOCAL); - if (local_table) { + table = fib_get_table(net, tb_id); + if (table) { ret = RTN_UNICAST; - if (!fib_table_lookup(local_table, &fl4, &res, FIB_LOOKUP_NOREF)) { + if (!fib_table_lookup(table, &fl4, &res, FIB_LOOKUP_NOREF)) { if (!dev || dev == res.fi->fib_dev) ret = res.type; } @@ -238,19 +240,40 @@ static inline unsigned int __inet_dev_addr_type(struct net *net, return ret; } +unsigned int inet_addr_type_table(struct net *net, __be32 addr, u32 tb_id) +{ + return __inet_dev_addr_type(net, NULL, addr, tb_id); +} +EXPORT_SYMBOL(inet_addr_type_table); + unsigned int inet_addr_type(struct net *net, __be32 addr) { - return __inet_dev_addr_type(net, NULL, addr); + return __inet_dev_addr_type(net, NULL, addr, RT_TABLE_LOCAL); } EXPORT_SYMBOL(inet_addr_type); unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, __be32 addr) { - return __inet_dev_addr_type(net, dev, addr); + u32 rt_table = vrf_dev_table(dev) ? : RT_TABLE_LOCAL; + + return __inet_dev_addr_type(net, dev, addr, rt_table); } EXPORT_SYMBOL(inet_dev_addr_type); +/* inet_addr_type with dev == NULL but using the table from a dev + * if one is associated + */ +unsigned int inet_addr_type_dev_table(struct net *net, + const struct net_device *dev, + __be32 addr) +{ + u32 rt_table = vrf_dev_table(dev) ? : RT_TABLE_LOCAL; + + return __inet_dev_addr_type(net, NULL, addr, rt_table); +} +EXPORT_SYMBOL(inet_addr_type_dev_table); + __be32 fib_compute_spec_dst(struct sk_buff *skb) { struct net_device *dev = skb->dev; @@ -280,6 +303,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); fl4.flowi4_scope = scope; fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0; + fl4.flowi4_tun_key.tun_id = 0; if (!fib_lookup(net, &fl4, &res, 0)) return FIB_RES_PREFSRC(net, res); } else { @@ -308,16 +332,22 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, bool dev_match; fl4.flowi4_oif = 0; - fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX; + fl4.flowi4_iif = vrf_master_ifindex_rcu(dev); + if (!fl4.flowi4_iif) + fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX; fl4.daddr = src; fl4.saddr = dst; fl4.flowi4_tos = tos; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; + fl4.flowi4_tun_key.tun_id = 0; + fl4.flowi4_flags = 0; no_addr = idev->ifa_list == NULL; fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0; + trace_fib_validate_source(dev, &fl4); + net = dev_net(dev); if (fib_lookup(net, &fl4, &res, 0)) goto last_resort; @@ -337,6 +367,9 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, if (nh->nh_dev == dev) { dev_match = true; break; + } else if (vrf_master_ifindex_rcu(nh->nh_dev) == dev->ifindex) { + dev_match = true; + break; } } #else @@ -494,9 +527,12 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, addr = sk_extract_addr(&rt->rt_gateway); if (rt->rt_gateway.sa_family == AF_INET && addr) { + unsigned int addr_type; + cfg->fc_gw = addr; + addr_type = inet_addr_type_table(net, addr, cfg->fc_table); if (rt->rt_flags & RTF_GATEWAY && - inet_addr_type(net, addr) == RTN_UNICAST) + addr_type == RTN_UNICAST) cfg->fc_scope = RT_SCOPE_UNIVERSE; } @@ -591,6 +627,8 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = { [RTA_METRICS] = { .type = NLA_NESTED }, [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, [RTA_FLOW] = { .type = NLA_U32 }, + [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, + [RTA_ENCAP] = { .type = NLA_NESTED }, }; static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, @@ -656,6 +694,12 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, case RTA_TABLE: cfg->fc_table = nla_get_u32(attr); break; + case RTA_ENCAP: + cfg->fc_encap = attr; + break; + case RTA_ENCAP_TYPE: + cfg->fc_encap_type = nla_get_u16(attr); + break; } } @@ -760,6 +804,7 @@ out: static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa) { struct net *net = dev_net(ifa->ifa_dev->dev); + u32 tb_id = vrf_dev_table_rtnl(ifa->ifa_dev->dev); struct fib_table *tb; struct fib_config cfg = { .fc_protocol = RTPROT_KERNEL, @@ -774,11 +819,10 @@ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifad }, }; - if (type == RTN_UNICAST) - tb = fib_new_table(net, RT_TABLE_MAIN); - else - tb = fib_new_table(net, RT_TABLE_LOCAL); + if (!tb_id) + tb_id = (type == RTN_UNICAST) ? RT_TABLE_MAIN : RT_TABLE_LOCAL; + tb = fib_new_table(net, tb_id); if (!tb) return; @@ -960,11 +1004,14 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim) fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim); } if (!(ok & LOCAL_OK)) { + unsigned int addr_type; + fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim); /* Check, that this local address finally disappeared. */ - if (gone && - inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) { + addr_type = inet_addr_type_dev_table(dev_net(dev), dev, + ifa->ifa_local); + if (gone && addr_type != RTN_LOCAL) { /* And the last, but not the least thing. * We must flush stray FIB entries. * @@ -1063,9 +1110,10 @@ static void nl_fib_lookup_exit(struct net *net) net->ipv4.fibnl = NULL; } -static void fib_disable_ip(struct net_device *dev, unsigned long event) +static void fib_disable_ip(struct net_device *dev, unsigned long event, + bool force) { - if (fib_sync_down_dev(dev, event)) + if (fib_sync_down_dev(dev, event, force)) fib_flush(dev_net(dev)); rt_cache_flush(dev_net(dev)); arp_ifdown(dev); @@ -1093,7 +1141,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, /* Last address was deleted from this interface. * Disable IP. */ - fib_disable_ip(dev, event); + fib_disable_ip(dev, event, true); } else { rt_cache_flush(dev_net(dev)); } @@ -1110,7 +1158,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo unsigned int flags; if (event == NETDEV_UNREGISTER) { - fib_disable_ip(dev, event); + fib_disable_ip(dev, event, true); rt_flush_dev(dev); return NOTIFY_DONE; } @@ -1131,14 +1179,14 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo rt_cache_flush(net); break; case NETDEV_DOWN: - fib_disable_ip(dev, event); + fib_disable_ip(dev, event, false); break; case NETDEV_CHANGE: flags = dev_get_flags(dev); if (flags & (IFF_RUNNING | IFF_LOWER_UP)) fib_sync_up(dev, RTNH_F_LINKDOWN); else - fib_sync_down_dev(dev, event); + fib_sync_down_dev(dev, event, false); /* fall through */ case NETDEV_CHANGEMTU: rt_cache_flush(net); diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 18123d50f..f2bda9e89 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -318,7 +318,6 @@ static const struct fib_rules_ops __net_initconst fib4_rules_ops_template = { .delete = fib4_rule_delete, .compare = fib4_rule_compare, .fill = fib4_rule_fill, - .default_pref = fib_default_rule_pref, .nlmsg_payload = fib4_rule_nlmsg_payload, .flush_cache = fib4_rule_flush_cache, .nlgroup = RTNLGRP_IPV4_RULE, diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 3a06586b1..ef5892f5e 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -42,6 +42,7 @@ #include <net/ip_fib.h> #include <net/netlink.h> #include <net/nexthop.h> +#include <net/lwtunnel.h> #include "fib_lookup.h" @@ -208,6 +209,7 @@ static void free_fib_info_rcu(struct rcu_head *head) change_nexthops(fi) { if (nexthop_nh->nh_dev) dev_put(nexthop_nh->nh_dev); + lwtstate_put(nexthop_nh->nh_lwtstate); free_nh_exceptions(nexthop_nh); rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output); rt_fibinfo_free(&nexthop_nh->nh_rth_input); @@ -266,6 +268,7 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) #ifdef CONFIG_IP_ROUTE_CLASSID nh->nh_tclassid != onh->nh_tclassid || #endif + lwtunnel_cmp_encap(nh->nh_lwtstate, onh->nh_lwtstate) || ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_COMPARE_MASK)) return -1; onh++; @@ -366,6 +369,7 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi) payload += nla_total_size((RTAX_MAX * nla_total_size(4))); if (fi->fib_nhs) { + size_t nh_encapsize = 0; /* Also handles the special case fib_nhs == 1 */ /* each nexthop is packed in an attribute */ @@ -374,8 +378,21 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi) /* may contain flow and gateway attribute */ nhsize += 2 * nla_total_size(4); + /* grab encap info */ + for_nexthops(fi) { + if (nh->nh_lwtstate) { + /* RTA_ENCAP_TYPE */ + nh_encapsize += lwtunnel_get_encap_size( + nh->nh_lwtstate); + /* RTA_ENCAP */ + nh_encapsize += nla_total_size(2); + } + } endfor_nexthops(fi); + /* all nexthops are packed in a nested attribute */ - payload += nla_total_size(fi->fib_nhs * nhsize); + payload += nla_total_size((fi->fib_nhs * nhsize) + + nh_encapsize); + } return payload; @@ -421,13 +438,15 @@ static int fib_detect_death(struct fib_info *fi, int order, if (n) { state = n->nud_state; neigh_release(n); + } else { + return 0; } if (state == NUD_REACHABLE) return 0; if ((state & NUD_VALID) && order != dflt) return 0; if ((state & NUD_VALID) || - (*last_idx < 0 && order > dflt)) { + (*last_idx < 0 && order > dflt && state != NUD_INCOMPLETE)) { *last_resort = fi; *last_idx = order; } @@ -452,6 +471,9 @@ static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining) static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, int remaining, struct fib_config *cfg) { + struct net *net = cfg->fc_nlinfo.nl_net; + int ret; + change_nexthops(fi) { int attrlen; @@ -475,18 +497,70 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, if (nexthop_nh->nh_tclassid) fi->fib_net->ipv4.fib_num_tclassid_users++; #endif + nla = nla_find(attrs, attrlen, RTA_ENCAP); + if (nla) { + struct lwtunnel_state *lwtstate; + struct net_device *dev = NULL; + struct nlattr *nla_entype; + + nla_entype = nla_find(attrs, attrlen, + RTA_ENCAP_TYPE); + if (!nla_entype) + goto err_inval; + if (cfg->fc_oif) + dev = __dev_get_by_index(net, cfg->fc_oif); + ret = lwtunnel_build_state(dev, nla_get_u16( + nla_entype), + nla, AF_INET, cfg, + &lwtstate); + if (ret) + goto errout; + nexthop_nh->nh_lwtstate = + lwtstate_get(lwtstate); + } } rtnh = rtnh_next(rtnh, &remaining); } endfor_nexthops(fi); return 0; + +err_inval: + ret = -EINVAL; + +errout: + return ret; } #endif +static int fib_encap_match(struct net *net, u16 encap_type, + struct nlattr *encap, + int oif, const struct fib_nh *nh, + const struct fib_config *cfg) +{ + struct lwtunnel_state *lwtstate; + struct net_device *dev = NULL; + int ret, result = 0; + + if (encap_type == LWTUNNEL_ENCAP_NONE) + return 0; + + if (oif) + dev = __dev_get_by_index(net, oif); + ret = lwtunnel_build_state(dev, encap_type, encap, + AF_INET, cfg, &lwtstate); + if (!ret) { + result = lwtunnel_cmp_encap(lwtstate, nh->nh_lwtstate); + lwtstate_free(lwtstate); + } + + return result; +} + int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) { + struct net *net = cfg->fc_nlinfo.nl_net; #ifdef CONFIG_IP_ROUTE_MULTIPATH struct rtnexthop *rtnh; int remaining; @@ -496,6 +570,12 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) return 1; if (cfg->fc_oif || cfg->fc_gw) { + if (cfg->fc_encap) { + if (fib_encap_match(net, cfg->fc_encap_type, + cfg->fc_encap, cfg->fc_oif, + fi->fib_nh, cfg)) + return 1; + } if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) && (!cfg->fc_gw || cfg->fc_gw == fi->fib_nh->nh_gw)) return 0; @@ -585,7 +665,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, struct fib_nh *nh) { - int err; + int err = 0; struct net *net; struct net_device *dev; @@ -594,16 +674,18 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, struct fib_result res; if (nh->nh_flags & RTNH_F_ONLINK) { + unsigned int addr_type; if (cfg->fc_scope >= RT_SCOPE_LINK) return -EINVAL; - if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST) - return -EINVAL; dev = __dev_get_by_index(net, nh->nh_oif); if (!dev) return -ENODEV; if (!(dev->flags & IFF_UP)) return -ENETDOWN; + addr_type = inet_addr_type_dev_table(net, dev, nh->nh_gw); + if (addr_type != RTN_UNICAST) + return -EINVAL; if (!netif_carrier_ok(dev)) nh->nh_flags |= RTNH_F_LINKDOWN; nh->nh_dev = dev; @@ -613,6 +695,7 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, } rcu_read_lock(); { + struct fib_table *tbl = NULL; struct flowi4 fl4 = { .daddr = nh->nh_gw, .flowi4_scope = cfg->fc_scope + 1, @@ -623,8 +706,24 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, /* It is not necessary, but requires a bit of thinking */ if (fl4.flowi4_scope < RT_SCOPE_LINK) fl4.flowi4_scope = RT_SCOPE_LINK; - err = fib_lookup(net, &fl4, &res, - FIB_LOOKUP_IGNORE_LINKSTATE); + + if (cfg->fc_table) + tbl = fib_get_table(net, cfg->fc_table); + + if (tbl) + err = fib_table_lookup(tbl, &fl4, &res, + FIB_LOOKUP_IGNORE_LINKSTATE | + FIB_LOOKUP_NOREF); + + /* on error or if no table given do full lookup. This + * is needed for example when nexthops are in the local + * table rather than the given table + */ + if (!tbl || err) { + err = fib_lookup(net, &fl4, &res, + FIB_LOOKUP_IGNORE_LINKSTATE); + } + if (err) { rcu_read_unlock(); return err; @@ -760,6 +859,74 @@ __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh) return nh->nh_saddr; } +static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc) +{ + if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst || + fib_prefsrc != cfg->fc_dst) { + u32 tb_id = cfg->fc_table; + int rc; + + if (tb_id == RT_TABLE_MAIN) + tb_id = RT_TABLE_LOCAL; + + rc = inet_addr_type_table(cfg->fc_nlinfo.nl_net, + fib_prefsrc, tb_id); + + if (rc != RTN_LOCAL && tb_id != RT_TABLE_LOCAL) { + rc = inet_addr_type_table(cfg->fc_nlinfo.nl_net, + fib_prefsrc, RT_TABLE_LOCAL); + } + + if (rc != RTN_LOCAL) + return false; + } + return true; +} + +static int +fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) +{ + bool ecn_ca = false; + struct nlattr *nla; + int remaining; + + if (!cfg->fc_mx) + return 0; + + nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { + int type = nla_type(nla); + u32 val; + + if (!type) + continue; + if (type > RTAX_MAX) + return -EINVAL; + + if (type == RTAX_CC_ALGO) { + char tmp[TCP_CA_NAME_MAX]; + + nla_strlcpy(tmp, nla, sizeof(tmp)); + val = tcp_ca_get_key_by_name(tmp, &ecn_ca); + if (val == TCP_CA_UNSPEC) + return -EINVAL; + } else { + val = nla_get_u32(nla); + } + if (type == RTAX_ADVMSS && val > 65535 - 40) + val = 65535 - 40; + if (type == RTAX_MTU && val > 65535 - 15) + val = 65535 - 15; + if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) + return -EINVAL; + fi->fib_metrics[type - 1] = val; + } + + if (ecn_ca) + fi->fib_metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; + + return 0; +} + struct fib_info *fib_create_info(struct fib_config *cfg) { int err; @@ -832,36 +999,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg) goto failure; } endfor_nexthops(fi) - if (cfg->fc_mx) { - struct nlattr *nla; - int remaining; - - nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { - int type = nla_type(nla); - - if (type) { - u32 val; - - if (type > RTAX_MAX) - goto err_inval; - if (type == RTAX_CC_ALGO) { - char tmp[TCP_CA_NAME_MAX]; - - nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(tmp); - if (val == TCP_CA_UNSPEC) - goto err_inval; - } else { - val = nla_get_u32(nla); - } - if (type == RTAX_ADVMSS && val > 65535 - 40) - val = 65535 - 40; - if (type == RTAX_MTU && val > 65535 - 15) - val = 65535 - 15; - fi->fib_metrics[type - 1] = val; - } - } - } + err = fib_convert_metrics(fi, cfg); + if (err) + goto failure; if (cfg->fc_mp) { #ifdef CONFIG_IP_ROUTE_MULTIPATH @@ -882,6 +1022,22 @@ struct fib_info *fib_create_info(struct fib_config *cfg) } else { struct fib_nh *nh = fi->fib_nh; + if (cfg->fc_encap) { + struct lwtunnel_state *lwtstate; + struct net_device *dev = NULL; + + if (cfg->fc_encap_type == LWTUNNEL_ENCAP_NONE) + goto err_inval; + if (cfg->fc_oif) + dev = __dev_get_by_index(net, cfg->fc_oif); + err = lwtunnel_build_state(dev, cfg->fc_encap_type, + cfg->fc_encap, AF_INET, cfg, + &lwtstate); + if (err) + goto failure; + + nh->nh_lwtstate = lwtstate_get(lwtstate); + } nh->nh_oif = cfg->fc_oif; nh->nh_gw = cfg->fc_gw; nh->nh_flags = cfg->fc_flags; @@ -940,12 +1096,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg) fi->fib_flags |= RTNH_F_LINKDOWN; } - if (fi->fib_prefsrc) { - if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst || - fi->fib_prefsrc != cfg->fc_dst) - if (inet_addr_type(net, fi->fib_prefsrc) != RTN_LOCAL) - goto err_inval; - } + if (fi->fib_prefsrc && !fib_valid_prefsrc(cfg, fi->fib_prefsrc)) + goto err_inval; change_nexthops(fi) { fib_info_update_nh_saddr(net, nexthop_nh); @@ -1055,6 +1207,8 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid)) goto nla_put_failure; #endif + if (fi->fib_nh->nh_lwtstate) + lwtunnel_fill_encap(skb, fi->fib_nh->nh_lwtstate); } #ifdef CONFIG_IP_ROUTE_MULTIPATH if (fi->fib_nhs > 1) { @@ -1090,6 +1244,8 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid)) goto nla_put_failure; #endif + if (nh->nh_lwtstate) + lwtunnel_fill_encap(skb, nh->nh_lwtstate); /* length of rtnetlink header + attributes */ rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh; } endfor_nexthops(fi); @@ -1132,7 +1288,13 @@ int fib_sync_down_addr(struct net *net, __be32 local) return ret; } -int fib_sync_down_dev(struct net_device *dev, unsigned long event) +/* Event force Flags Description + * NETDEV_CHANGE 0 LINKDOWN Carrier OFF, not for scope host + * NETDEV_DOWN 0 LINKDOWN|DEAD Link down, not for scope host + * NETDEV_DOWN 1 LINKDOWN|DEAD Last address removed + * NETDEV_UNREGISTER 1 LINKDOWN|DEAD Device removed + */ +int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force) { int ret = 0; int scope = RT_SCOPE_NOWHERE; @@ -1141,8 +1303,7 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event) struct hlist_head *head = &fib_info_devhash[hash]; struct fib_nh *nh; - if (event == NETDEV_UNREGISTER || - event == NETDEV_DOWN) + if (force) scope = -1; hlist_for_each_entry(nh, head, nh_hash) { @@ -1291,6 +1452,13 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags) if (!(dev->flags & IFF_UP)) return 0; + if (nh_flags & RTNH_F_DEAD) { + unsigned int flags = dev_get_flags(dev); + + if (flags & (IFF_RUNNING | IFF_LOWER_UP)) + nh_flags |= RTNH_F_LINKDOWN; + } + prev_fi = NULL; hash = fib_devindex_hashfn(dev->ifindex); head = &fib_info_devhash[hash]; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index b0c6258ff..744e5936c 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -81,6 +81,7 @@ #include <net/sock.h> #include <net/ip_fib.h> #include <net/switchdev.h> +#include <trace/events/fib.h> #include "fib_lookup.h" #define MAX_STAT_DEPTH 32 @@ -1278,6 +1279,8 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, unsigned long index; t_key cindex; + trace_fib_table_lookup(tb->tb_id, flp); + pn = t->kv; cindex = 0; @@ -1423,8 +1426,11 @@ found: nh->nh_flags & RTNH_F_LINKDOWN && !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE)) continue; - if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif) - continue; + if (!(flp->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF)) { + if (flp->flowi4_oif && + flp->flowi4_oif != nh->nh_oif) + continue; + } if (!(fib_flags & FIB_LOOKUP_NOREF)) atomic_inc(&fi->fib_clntref); @@ -1439,6 +1445,8 @@ found: #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->semantic_match_passed); #endif + trace_fib_table_lookup_nh(nh); + return err; } } @@ -1561,7 +1569,7 @@ static struct key_vector *leaf_walk_rcu(struct key_vector **tn, t_key key) do { /* record parent and next child index */ pn = n; - cindex = key ? get_index(key, pn) : 0; + cindex = (key > pn->key) ? get_index(key, pn) : 0; if (cindex >> pn->bits) break; diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 34968cd5c..e0fcbbbcf 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -79,7 +79,11 @@ static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr, __be16 *pd = data; size_t start = ntohs(pd[0]); size_t offset = ntohs(pd[1]); - size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start); + size_t plen = sizeof(struct udphdr) + hdrlen + + max_t(size_t, offset + sizeof(u16), start); + + if (skb->remcsum_offload) + return guehdr; if (!pskb_may_pull(skb, plen)) return NULL; @@ -221,29 +225,21 @@ out_unlock: static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off, struct guehdr *guehdr, void *data, - size_t hdrlen, u8 ipproto, - struct gro_remcsum *grc, bool nopartial) + size_t hdrlen, struct gro_remcsum *grc, + bool nopartial) { __be16 *pd = data; size_t start = ntohs(pd[0]); size_t offset = ntohs(pd[1]); - size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start); if (skb->remcsum_offload) - return NULL; + return guehdr; if (!NAPI_GRO_CB(skb)->csum_valid) return NULL; - /* Pull checksum that will be written */ - if (skb_gro_header_hard(skb, off + plen)) { - guehdr = skb_gro_header_slow(skb, off + plen, off); - if (!guehdr) - return NULL; - } - - skb_gro_remcsum_process(skb, (void *)guehdr + hdrlen, - start, offset, grc, nopartial); + guehdr = skb_gro_remcsum_process(skb, (void *)guehdr, off, hdrlen, + start, offset, grc, nopartial); skb->remcsum_offload = 1; @@ -307,10 +303,10 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head, if (flags & GUE_PFLAG_REMCSUM) { guehdr = gue_gro_remcsum(skb, off, guehdr, - data + doffset, hdrlen, - guehdr->proto_ctype, &grc, + data + doffset, hdrlen, &grc, !!(fou->flags & FOU_F_REMCSUM_NOPARTIAL)); + if (!guehdr) goto out; @@ -351,7 +347,7 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head, rcu_read_lock(); offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; ops = rcu_dereference(offloads[guehdr->proto_ctype]); - if (WARN_ON(!ops || !ops->callbacks.gro_receive)) + if (WARN_ON_ONCE(!ops || !ops->callbacks.gro_receive)) goto out_unlock; pp = ops->callbacks.gro_receive(head, skb); @@ -570,7 +566,7 @@ static int parse_nl_config(struct genl_info *info, if (info->attrs[FOU_ATTR_AF]) { u8 family = nla_get_u8(info->attrs[FOU_ATTR_AF]); - if (family != AF_INET && family != AF_INET6) + if (family != AF_INET) return -EINVAL; cfg->udp_config.family = family; diff --git a/net/ipv4/geneve_core.c b/net/ipv4/geneve_core.c deleted file mode 100644 index 311a4ba69..000000000 --- a/net/ipv4/geneve_core.c +++ /dev/null @@ -1,447 +0,0 @@ -/* - * Geneve: Generic Network Virtualization Encapsulation - * - * Copyright (c) 2014 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/module.h> -#include <linux/errno.h> -#include <linux/slab.h> -#include <linux/skbuff.h> -#include <linux/list.h> -#include <linux/netdevice.h> -#include <linux/in.h> -#include <linux/ip.h> -#include <linux/udp.h> -#include <linux/igmp.h> -#include <linux/etherdevice.h> -#include <linux/if_ether.h> -#include <linux/if_vlan.h> -#include <linux/ethtool.h> -#include <linux/mutex.h> -#include <net/arp.h> -#include <net/ndisc.h> -#include <net/ip.h> -#include <net/ip_tunnels.h> -#include <net/icmp.h> -#include <net/udp.h> -#include <net/rtnetlink.h> -#include <net/route.h> -#include <net/dsfield.h> -#include <net/inet_ecn.h> -#include <net/net_namespace.h> -#include <net/netns/generic.h> -#include <net/geneve.h> -#include <net/protocol.h> -#include <net/udp_tunnel.h> -#if IS_ENABLED(CONFIG_IPV6) -#include <net/ipv6.h> -#include <net/addrconf.h> -#include <net/ip6_tunnel.h> -#include <net/ip6_checksum.h> -#endif - -/* Protects sock_list and refcounts. */ -static DEFINE_MUTEX(geneve_mutex); - -/* per-network namespace private data for this module */ -struct geneve_net { - struct list_head sock_list; -}; - -static int geneve_net_id; - -static struct geneve_sock *geneve_find_sock(struct net *net, - sa_family_t family, __be16 port) -{ - struct geneve_net *gn = net_generic(net, geneve_net_id); - struct geneve_sock *gs; - - list_for_each_entry(gs, &gn->sock_list, list) { - if (inet_sk(gs->sock->sk)->inet_sport == port && - inet_sk(gs->sock->sk)->sk.sk_family == family) - return gs; - } - - return NULL; -} - -static void geneve_build_header(struct genevehdr *geneveh, - __be16 tun_flags, u8 vni[3], - u8 options_len, u8 *options) -{ - geneveh->ver = GENEVE_VER; - geneveh->opt_len = options_len / 4; - geneveh->oam = !!(tun_flags & TUNNEL_OAM); - geneveh->critical = !!(tun_flags & TUNNEL_CRIT_OPT); - geneveh->rsvd1 = 0; - memcpy(geneveh->vni, vni, 3); - geneveh->proto_type = htons(ETH_P_TEB); - geneveh->rsvd2 = 0; - - memcpy(geneveh->options, options, options_len); -} - -/* Transmit a fully formatted Geneve frame. - * - * When calling this function. The skb->data should point - * to the geneve header which is fully formed. - * - * This function will add other UDP tunnel headers. - */ -int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt, - struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, - __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, - __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt, - bool csum, bool xnet) -{ - struct genevehdr *gnvh; - int min_headroom; - int err; - - min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len - + GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr) - + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); - - err = skb_cow_head(skb, min_headroom); - if (unlikely(err)) { - kfree_skb(skb); - return err; - } - - skb = vlan_hwaccel_push_inside(skb); - if (unlikely(!skb)) - return -ENOMEM; - - skb = udp_tunnel_handle_offloads(skb, csum); - if (IS_ERR(skb)) - return PTR_ERR(skb); - - gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len); - geneve_build_header(gnvh, tun_flags, vni, opt_len, opt); - - skb_set_inner_protocol(skb, htons(ETH_P_TEB)); - - return udp_tunnel_xmit_skb(rt, gs->sock->sk, skb, src, dst, - tos, ttl, df, src_port, dst_port, xnet, - !csum); -} -EXPORT_SYMBOL_GPL(geneve_xmit_skb); - -static int geneve_hlen(struct genevehdr *gh) -{ - return sizeof(*gh) + gh->opt_len * 4; -} - -static struct sk_buff **geneve_gro_receive(struct sk_buff **head, - struct sk_buff *skb, - struct udp_offload *uoff) -{ - struct sk_buff *p, **pp = NULL; - struct genevehdr *gh, *gh2; - unsigned int hlen, gh_len, off_gnv; - const struct packet_offload *ptype; - __be16 type; - int flush = 1; - - off_gnv = skb_gro_offset(skb); - hlen = off_gnv + sizeof(*gh); - gh = skb_gro_header_fast(skb, off_gnv); - if (skb_gro_header_hard(skb, hlen)) { - gh = skb_gro_header_slow(skb, hlen, off_gnv); - if (unlikely(!gh)) - goto out; - } - - if (gh->ver != GENEVE_VER || gh->oam) - goto out; - gh_len = geneve_hlen(gh); - - hlen = off_gnv + gh_len; - if (skb_gro_header_hard(skb, hlen)) { - gh = skb_gro_header_slow(skb, hlen, off_gnv); - if (unlikely(!gh)) - goto out; - } - - flush = 0; - - for (p = *head; p; p = p->next) { - if (!NAPI_GRO_CB(p)->same_flow) - continue; - - gh2 = (struct genevehdr *)(p->data + off_gnv); - if (gh->opt_len != gh2->opt_len || - memcmp(gh, gh2, gh_len)) { - NAPI_GRO_CB(p)->same_flow = 0; - continue; - } - } - - type = gh->proto_type; - - rcu_read_lock(); - ptype = gro_find_receive_by_type(type); - if (!ptype) { - flush = 1; - goto out_unlock; - } - - skb_gro_pull(skb, gh_len); - skb_gro_postpull_rcsum(skb, gh, gh_len); - pp = ptype->callbacks.gro_receive(head, skb); - -out_unlock: - rcu_read_unlock(); -out: - NAPI_GRO_CB(skb)->flush |= flush; - - return pp; -} - -static int geneve_gro_complete(struct sk_buff *skb, int nhoff, - struct udp_offload *uoff) -{ - struct genevehdr *gh; - struct packet_offload *ptype; - __be16 type; - int gh_len; - int err = -ENOSYS; - - udp_tunnel_gro_complete(skb, nhoff); - - gh = (struct genevehdr *)(skb->data + nhoff); - gh_len = geneve_hlen(gh); - type = gh->proto_type; - - rcu_read_lock(); - ptype = gro_find_complete_by_type(type); - if (ptype) - err = ptype->callbacks.gro_complete(skb, nhoff + gh_len); - - rcu_read_unlock(); - return err; -} - -static void geneve_notify_add_rx_port(struct geneve_sock *gs) -{ - struct sock *sk = gs->sock->sk; - sa_family_t sa_family = sk->sk_family; - int err; - - if (sa_family == AF_INET) { - err = udp_add_offload(&gs->udp_offloads); - if (err) - pr_warn("geneve: udp_add_offload failed with status %d\n", - err); - } -} - -static void geneve_notify_del_rx_port(struct geneve_sock *gs) -{ - struct sock *sk = gs->sock->sk; - sa_family_t sa_family = sk->sk_family; - - if (sa_family == AF_INET) - udp_del_offload(&gs->udp_offloads); -} - -/* Callback from net/ipv4/udp.c to receive packets */ -static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) -{ - struct genevehdr *geneveh; - struct geneve_sock *gs; - int opts_len; - - /* Need Geneve and inner Ethernet header to be present */ - if (unlikely(!pskb_may_pull(skb, GENEVE_BASE_HLEN))) - goto error; - - /* Return packets with reserved bits set */ - geneveh = geneve_hdr(skb); - - if (unlikely(geneveh->ver != GENEVE_VER)) - goto error; - - if (unlikely(geneveh->proto_type != htons(ETH_P_TEB))) - goto error; - - opts_len = geneveh->opt_len * 4; - if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, - htons(ETH_P_TEB))) - goto drop; - - gs = rcu_dereference_sk_user_data(sk); - if (!gs) - goto drop; - - gs->rcv(gs, skb); - return 0; - -drop: - /* Consume bad packet */ - kfree_skb(skb); - return 0; - -error: - /* Let the UDP layer deal with the skb */ - return 1; -} - -static struct socket *geneve_create_sock(struct net *net, bool ipv6, - __be16 port) -{ - struct socket *sock; - struct udp_port_cfg udp_conf; - int err; - - memset(&udp_conf, 0, sizeof(udp_conf)); - - if (ipv6) { - udp_conf.family = AF_INET6; - } else { - udp_conf.family = AF_INET; - udp_conf.local_ip.s_addr = htonl(INADDR_ANY); - } - - udp_conf.local_udp_port = port; - - /* Open UDP socket */ - err = udp_sock_create(net, &udp_conf, &sock); - if (err < 0) - return ERR_PTR(err); - - return sock; -} - -/* Create new listen socket if needed */ -static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port, - geneve_rcv_t *rcv, void *data, - bool ipv6) -{ - struct geneve_net *gn = net_generic(net, geneve_net_id); - struct geneve_sock *gs; - struct socket *sock; - struct udp_tunnel_sock_cfg tunnel_cfg; - - gs = kzalloc(sizeof(*gs), GFP_KERNEL); - if (!gs) - return ERR_PTR(-ENOMEM); - - sock = geneve_create_sock(net, ipv6, port); - if (IS_ERR(sock)) { - kfree(gs); - return ERR_CAST(sock); - } - - gs->sock = sock; - gs->refcnt = 1; - gs->rcv = rcv; - gs->rcv_data = data; - - /* Initialize the geneve udp offloads structure */ - gs->udp_offloads.port = port; - gs->udp_offloads.callbacks.gro_receive = geneve_gro_receive; - gs->udp_offloads.callbacks.gro_complete = geneve_gro_complete; - geneve_notify_add_rx_port(gs); - - /* Mark socket as an encapsulation socket */ - tunnel_cfg.sk_user_data = gs; - tunnel_cfg.encap_type = 1; - tunnel_cfg.encap_rcv = geneve_udp_encap_recv; - tunnel_cfg.encap_destroy = NULL; - setup_udp_tunnel_sock(net, sock, &tunnel_cfg); - - list_add(&gs->list, &gn->sock_list); - - return gs; -} - -struct geneve_sock *geneve_sock_add(struct net *net, __be16 port, - geneve_rcv_t *rcv, void *data, - bool no_share, bool ipv6) -{ - struct geneve_sock *gs; - - mutex_lock(&geneve_mutex); - - gs = geneve_find_sock(net, ipv6 ? AF_INET6 : AF_INET, port); - if (gs) { - if (!no_share && gs->rcv == rcv) - gs->refcnt++; - else - gs = ERR_PTR(-EBUSY); - } else { - gs = geneve_socket_create(net, port, rcv, data, ipv6); - } - - mutex_unlock(&geneve_mutex); - - return gs; -} -EXPORT_SYMBOL_GPL(geneve_sock_add); - -void geneve_sock_release(struct geneve_sock *gs) -{ - mutex_lock(&geneve_mutex); - - if (--gs->refcnt) - goto unlock; - - list_del(&gs->list); - geneve_notify_del_rx_port(gs); - udp_tunnel_sock_release(gs->sock); - kfree_rcu(gs, rcu); - -unlock: - mutex_unlock(&geneve_mutex); -} -EXPORT_SYMBOL_GPL(geneve_sock_release); - -static __net_init int geneve_init_net(struct net *net) -{ - struct geneve_net *gn = net_generic(net, geneve_net_id); - - INIT_LIST_HEAD(&gn->sock_list); - - return 0; -} - -static struct pernet_operations geneve_net_ops = { - .init = geneve_init_net, - .id = &geneve_net_id, - .size = sizeof(struct geneve_net), -}; - -static int __init geneve_init_module(void) -{ - int rc; - - rc = register_pernet_subsys(&geneve_net_ops); - if (rc) - return rc; - - pr_info("Geneve core logic\n"); - - return 0; -} -module_init(geneve_init_module); - -static void __exit geneve_cleanup_module(void) -{ - unregister_pernet_subsys(&geneve_net_ops); -} -module_exit(geneve_cleanup_module); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jesse Gross <jesse@nicira.com>"); -MODULE_DESCRIPTION("Driver library for GENEVE encapsulated traffic"); diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index 4a7b5b2a1..d9c552a72 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -31,7 +31,6 @@ #include <net/xfrm.h> static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly; -static struct gre_cisco_protocol __rcu *gre_cisco_proto_list[GRE_IP_PROTO_MAX]; int gre_add_protocol(const struct gre_protocol *proto, u8 version) { @@ -61,197 +60,6 @@ int gre_del_protocol(const struct gre_protocol *proto, u8 version) } EXPORT_SYMBOL_GPL(gre_del_protocol); -void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi, - int hdr_len) -{ - struct gre_base_hdr *greh; - - skb_push(skb, hdr_len); - - skb_reset_transport_header(skb); - greh = (struct gre_base_hdr *)skb->data; - greh->flags = tnl_flags_to_gre_flags(tpi->flags); - greh->protocol = tpi->proto; - - if (tpi->flags&(TUNNEL_KEY|TUNNEL_CSUM|TUNNEL_SEQ)) { - __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4); - - if (tpi->flags&TUNNEL_SEQ) { - *ptr = tpi->seq; - ptr--; - } - if (tpi->flags&TUNNEL_KEY) { - *ptr = tpi->key; - ptr--; - } - if (tpi->flags&TUNNEL_CSUM && - !(skb_shinfo(skb)->gso_type & - (SKB_GSO_GRE|SKB_GSO_GRE_CSUM))) { - *ptr = 0; - *(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0, - skb->len, 0)); - } - } -} -EXPORT_SYMBOL_GPL(gre_build_header); - -static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, - bool *csum_err) -{ - const struct gre_base_hdr *greh; - __be32 *options; - int hdr_len; - - if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr)))) - return -EINVAL; - - greh = (struct gre_base_hdr *)skb_transport_header(skb); - if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING))) - return -EINVAL; - - tpi->flags = gre_flags_to_tnl_flags(greh->flags); - hdr_len = ip_gre_calc_hlen(tpi->flags); - - if (!pskb_may_pull(skb, hdr_len)) - return -EINVAL; - - greh = (struct gre_base_hdr *)skb_transport_header(skb); - tpi->proto = greh->protocol; - - options = (__be32 *)(greh + 1); - if (greh->flags & GRE_CSUM) { - if (skb_checksum_simple_validate(skb)) { - *csum_err = true; - return -EINVAL; - } - - skb_checksum_try_convert(skb, IPPROTO_GRE, 0, - null_compute_pseudo); - - options++; - } - - if (greh->flags & GRE_KEY) { - tpi->key = *options; - options++; - } else - tpi->key = 0; - - if (unlikely(greh->flags & GRE_SEQ)) { - tpi->seq = *options; - options++; - } else - tpi->seq = 0; - - /* WCCP version 1 and 2 protocol decoding. - * - Change protocol to IP - * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header - */ - if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) { - tpi->proto = htons(ETH_P_IP); - if ((*(u8 *)options & 0xF0) != 0x40) { - hdr_len += 4; - if (!pskb_may_pull(skb, hdr_len)) - return -EINVAL; - } - } - - return iptunnel_pull_header(skb, hdr_len, tpi->proto); -} - -static int gre_cisco_rcv(struct sk_buff *skb) -{ - struct tnl_ptk_info tpi; - int i; - bool csum_err = false; - -#ifdef CONFIG_NET_IPGRE_BROADCAST - if (ipv4_is_multicast(ip_hdr(skb)->daddr)) { - /* Looped back packet, drop it! */ - if (rt_is_output_route(skb_rtable(skb))) - goto drop; - } -#endif - - if (parse_gre_header(skb, &tpi, &csum_err) < 0) - goto drop; - - rcu_read_lock(); - for (i = 0; i < GRE_IP_PROTO_MAX; i++) { - struct gre_cisco_protocol *proto; - int ret; - - proto = rcu_dereference(gre_cisco_proto_list[i]); - if (!proto) - continue; - ret = proto->handler(skb, &tpi); - if (ret == PACKET_RCVD) { - rcu_read_unlock(); - return 0; - } - } - rcu_read_unlock(); - - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); -drop: - kfree_skb(skb); - return 0; -} - -static void gre_cisco_err(struct sk_buff *skb, u32 info) -{ - /* All the routers (except for Linux) return only - * 8 bytes of packet payload. It means, that precise relaying of - * ICMP in the real Internet is absolutely infeasible. - * - * Moreover, Cisco "wise men" put GRE key to the third word - * in GRE header. It makes impossible maintaining even soft - * state for keyed - * GRE tunnels with enabled checksum. Tell them "thank you". - * - * Well, I wonder, rfc1812 was written by Cisco employee, - * what the hell these idiots break standards established - * by themselves??? - */ - - const int type = icmp_hdr(skb)->type; - const int code = icmp_hdr(skb)->code; - struct tnl_ptk_info tpi; - bool csum_err = false; - int i; - - if (parse_gre_header(skb, &tpi, &csum_err)) { - if (!csum_err) /* ignore csum errors. */ - return; - } - - if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { - ipv4_update_pmtu(skb, dev_net(skb->dev), info, - skb->dev->ifindex, 0, IPPROTO_GRE, 0); - return; - } - if (type == ICMP_REDIRECT) { - ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, 0, - IPPROTO_GRE, 0); - return; - } - - rcu_read_lock(); - for (i = 0; i < GRE_IP_PROTO_MAX; i++) { - struct gre_cisco_protocol *proto; - - proto = rcu_dereference(gre_cisco_proto_list[i]); - if (!proto) - continue; - - if (proto->err_handler(skb, info, &tpi) == PACKET_RCVD) - goto out; - - } -out: - rcu_read_unlock(); -} - static int gre_rcv(struct sk_buff *skb) { const struct gre_protocol *proto; @@ -302,60 +110,19 @@ static const struct net_protocol net_gre_protocol = { .netns_ok = 1, }; -static const struct gre_protocol ipgre_protocol = { - .handler = gre_cisco_rcv, - .err_handler = gre_cisco_err, -}; - -int gre_cisco_register(struct gre_cisco_protocol *newp) -{ - struct gre_cisco_protocol **proto = (struct gre_cisco_protocol **) - &gre_cisco_proto_list[newp->priority]; - - return (cmpxchg(proto, NULL, newp) == NULL) ? 0 : -EBUSY; -} -EXPORT_SYMBOL_GPL(gre_cisco_register); - -int gre_cisco_unregister(struct gre_cisco_protocol *del_proto) -{ - struct gre_cisco_protocol **proto = (struct gre_cisco_protocol **) - &gre_cisco_proto_list[del_proto->priority]; - int ret; - - ret = (cmpxchg(proto, del_proto, NULL) == del_proto) ? 0 : -EINVAL; - - if (ret) - return ret; - - synchronize_net(); - return 0; -} -EXPORT_SYMBOL_GPL(gre_cisco_unregister); - static int __init gre_init(void) { pr_info("GRE over IPv4 demultiplexor driver\n"); if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) { pr_err("can't add protocol\n"); - goto err; - } - - if (gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0) { - pr_info("%s: can't add ipgre handler\n", __func__); - goto err_gre; + return -EAGAIN; } - return 0; -err_gre: - inet_del_protocol(&net_gre_protocol, IPPROTO_GRE); -err: - return -EAGAIN; } static void __exit gre_exit(void) { - gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); inet_del_protocol(&net_gre_protocol, IPPROTO_GRE); } diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 5aa46d4b4..5a8ee3282 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -36,7 +36,8 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, SKB_GSO_TCP_ECN | SKB_GSO_GRE | SKB_GSO_GRE_CSUM | - SKB_GSO_IPIP))) + SKB_GSO_IPIP | + SKB_GSO_SIT))) goto out; if (!skb->encapsulation) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index f5203fba6..e5eb8ac40 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -96,6 +96,7 @@ #include <net/xfrm.h> #include <net/inet_common.h> #include <net/ip_fib.h> +#include <net/vrf.h> /* * Build xmit assembly blocks @@ -308,9 +309,10 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, rc = false; if (icmp_global_allow()) { + int vif = vrf_master_ifindex(dst->dev); struct inet_peer *peer; - peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1); + peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1); rc = inet_peer_xrlim_allow(peer, net->ipv4.sysctl_icmp_ratelimit); if (peer) @@ -425,6 +427,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) fl4.flowi4_mark = mark; fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); fl4.flowi4_proto = IPPROTO_ICMP; + fl4.flowi4_oif = vrf_master_ifindex(skb->dev); security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) @@ -458,6 +461,8 @@ static struct rtable *icmp_route_lookup(struct net *net, fl4->flowi4_proto = IPPROTO_ICMP; fl4->fl4_icmp_type = type; fl4->fl4_icmp_code = code; + fl4->flowi4_oif = vrf_master_ifindex(skb_in->dev); + security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); rt = __ip_route_output_key(net, fl4); if (IS_ERR(rt)) @@ -480,7 +485,8 @@ static struct rtable *icmp_route_lookup(struct net *net, if (err) goto relookup_failed; - if (inet_addr_type(net, fl4_dec.saddr) == RTN_LOCAL) { + if (inet_addr_type_dev_table(net, skb_in->dev, + fl4_dec.saddr) == RTN_LOCAL) { rt2 = __ip_route_output_key(net, &fl4_dec); if (IS_ERR(rt2)) err = PTR_ERR(rt2); @@ -496,6 +502,7 @@ static struct rtable *icmp_route_lookup(struct net *net, } /* Ugh! */ orefdst = skb_in->_skb_refdst; /* save old refdst */ + skb_dst_set(skb_in, NULL); err = ip_route_input(skb_in, fl4_dec.daddr, fl4_dec.saddr, RT_TOS(tos), rt2->dst.dev); @@ -828,7 +835,7 @@ static bool icmp_unreach(struct sk_buff *skb) */ if (!net->ipv4.sysctl_icmp_ignore_bogus_error_responses && - inet_addr_type(net, iph->daddr) == RTN_BROADCAST) { + inet_addr_type_dev_table(net, skb->dev, iph->daddr) == RTN_BROADCAST) { net_warn_ratelimited("%pI4 sent an invalid ICMP type %u, code %u error to a broadcast: %pI4 on %s\n", &ip_hdr(skb)->saddr, icmph->type, icmph->code, diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 9fdfd9dea..d38b8b61e 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -110,6 +110,9 @@ #define IP_MAX_MEMBERSHIPS 20 #define IP_MAX_MSF 10 +/* IGMP reports for link-local multicast groups are enabled by default */ +int sysctl_igmp_llm_reports __read_mostly = 1; + #ifdef CONFIG_IP_MULTICAST /* Parameter names and values are taken from igmp-v2-06 draft */ @@ -437,6 +440,8 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, if (pmc->multiaddr == IGMP_ALL_HOSTS) return skb; + if (ipv4_is_local_multicast(pmc->multiaddr) && !sysctl_igmp_llm_reports) + return skb; isquery = type == IGMPV3_MODE_IS_INCLUDE || type == IGMPV3_MODE_IS_EXCLUDE; @@ -545,6 +550,9 @@ static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc) for_each_pmc_rcu(in_dev, pmc) { if (pmc->multiaddr == IGMP_ALL_HOSTS) continue; + if (ipv4_is_local_multicast(pmc->multiaddr) && + !sysctl_igmp_llm_reports) + continue; spin_lock_bh(&pmc->lock); if (pmc->sfcount[MCAST_EXCLUDE]) type = IGMPV3_MODE_IS_EXCLUDE; @@ -678,7 +686,11 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, if (type == IGMPV3_HOST_MEMBERSHIP_REPORT) return igmpv3_send_report(in_dev, pmc); - else if (type == IGMP_HOST_LEAVE_MESSAGE) + + if (ipv4_is_local_multicast(group) && !sysctl_igmp_llm_reports) + return 0; + + if (type == IGMP_HOST_LEAVE_MESSAGE) dst = IGMP_ALL_ROUTER; else dst = group; @@ -851,6 +863,8 @@ static bool igmp_heard_report(struct in_device *in_dev, __be32 group) if (group == IGMP_ALL_HOSTS) return false; + if (ipv4_is_local_multicast(group) && !sysctl_igmp_llm_reports) + return false; rcu_read_lock(); for_each_pmc_rcu(in_dev, im) { @@ -957,6 +971,9 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, continue; if (im->multiaddr == IGMP_ALL_HOSTS) continue; + if (ipv4_is_local_multicast(im->multiaddr) && + !sysctl_igmp_llm_reports) + continue; spin_lock_bh(&im->lock); if (im->tm_running) im->gsquery = im->gsquery && mark; @@ -1181,6 +1198,8 @@ static void igmp_group_dropped(struct ip_mc_list *im) #ifdef CONFIG_IP_MULTICAST if (im->multiaddr == IGMP_ALL_HOSTS) return; + if (ipv4_is_local_multicast(im->multiaddr) && !sysctl_igmp_llm_reports) + return; reporter = im->reporter; igmp_stop_timer(im); @@ -1213,6 +1232,8 @@ static void igmp_group_added(struct ip_mc_list *im) #ifdef CONFIG_IP_MULTICAST if (im->multiaddr == IGMP_ALL_HOSTS) return; + if (ipv4_is_local_multicast(im->multiaddr) && !sysctl_igmp_llm_reports) + return; if (in_dev->dead) return; @@ -1518,6 +1539,9 @@ static void ip_mc_rejoin_groups(struct in_device *in_dev) for_each_pmc_rtnl(in_dev, im) { if (im->multiaddr == IGMP_ALL_HOSTS) continue; + if (ipv4_is_local_multicast(im->multiaddr) && + !sysctl_igmp_llm_reports) + continue; /* a failover is happening and switches * must be notified immediately diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 0cb916542..89120196a 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -343,7 +343,6 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, struct sock *sk2; const struct hlist_nulls_node *node; struct inet_timewait_sock *tw = NULL; - int twrefcnt = 0; spin_lock(lock); @@ -371,21 +370,17 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, WARN_ON(!sk_unhashed(sk)); __sk_nulls_add_node_rcu(sk, &head->chain); if (tw) { - twrefcnt = inet_twsk_unhash(tw); + sk_nulls_del_node_init_rcu((struct sock *)tw); NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED); } spin_unlock(lock); - if (twrefcnt) - inet_twsk_put(tw); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); if (twp) { *twp = tw; } else if (tw) { /* Silly. Should hash-dance instead... */ - inet_twsk_deschedule(tw); - - inet_twsk_put(tw); + inet_twsk_deschedule_put(tw); } return 0; @@ -403,13 +398,12 @@ static u32 inet_sk_port_offset(const struct sock *sk) inet->inet_dport); } -int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw) +void __inet_hash_nolisten(struct sock *sk, struct sock *osk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct hlist_nulls_head *list; struct inet_ehash_bucket *head; spinlock_t *lock; - int twrefcnt = 0; WARN_ON(!sk_unhashed(sk)); @@ -420,23 +414,22 @@ int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw) spin_lock(lock); __sk_nulls_add_node_rcu(sk, list); - if (tw) { - WARN_ON(sk->sk_hash != tw->tw_hash); - twrefcnt = inet_twsk_unhash(tw); + if (osk) { + WARN_ON(sk->sk_hash != osk->sk_hash); + sk_nulls_del_node_init_rcu(osk); } spin_unlock(lock); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); - return twrefcnt; } EXPORT_SYMBOL_GPL(__inet_hash_nolisten); -int __inet_hash(struct sock *sk, struct inet_timewait_sock *tw) +void __inet_hash(struct sock *sk, struct sock *osk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct inet_listen_hashbucket *ilb; if (sk->sk_state != TCP_LISTEN) - return __inet_hash_nolisten(sk, tw); + return __inet_hash_nolisten(sk, osk); WARN_ON(!sk_unhashed(sk)); ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; @@ -445,7 +438,6 @@ int __inet_hash(struct sock *sk, struct inet_timewait_sock *tw) __sk_nulls_add_node_rcu(sk, &ilb->head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); spin_unlock(&ilb->lock); - return 0; } EXPORT_SYMBOL(__inet_hash); @@ -492,7 +484,6 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct inet_bind_bucket *tb; int ret; struct net *net = sock_net(sk); - int twrefcnt = 1; if (!snum) { int i, remaining, low, high, port; @@ -560,19 +551,14 @@ ok: inet_bind_hash(sk, tb, port); if (sk_unhashed(sk)) { inet_sk(sk)->inet_sport = htons(port); - twrefcnt += __inet_hash_nolisten(sk, tw); + __inet_hash_nolisten(sk, (struct sock *)tw); } if (tw) - twrefcnt += inet_twsk_bind_unhash(tw, hinfo); + inet_twsk_bind_unhash(tw, hinfo); spin_unlock(&head->lock); - if (tw) { - inet_twsk_deschedule(tw); - while (twrefcnt) { - twrefcnt--; - inet_twsk_put(tw); - } - } + if (tw) + inet_twsk_deschedule_put(tw); ret = 0; goto out; diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 2ffbd16b7..c67f9bd76 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -18,28 +18,6 @@ /** - * inet_twsk_unhash - unhash a timewait socket from established hash - * @tw: timewait socket - * - * unhash a timewait socket from established hash, if hashed. - * ehash lock must be held by caller. - * Returns 1 if caller should call inet_twsk_put() after lock release. - */ -int inet_twsk_unhash(struct inet_timewait_sock *tw) -{ - if (hlist_nulls_unhashed(&tw->tw_node)) - return 0; - - hlist_nulls_del_rcu(&tw->tw_node); - sk_nulls_node_init(&tw->tw_node); - /* - * We cannot call inet_twsk_put() ourself under lock, - * caller must call it for us. - */ - return 1; -} - -/** * inet_twsk_bind_unhash - unhash a timewait socket from bind hash * @tw: timewait socket * @hashinfo: hashinfo pointer @@ -48,35 +26,29 @@ int inet_twsk_unhash(struct inet_timewait_sock *tw) * bind hash lock must be held by caller. * Returns 1 if caller should call inet_twsk_put() after lock release. */ -int inet_twsk_bind_unhash(struct inet_timewait_sock *tw, +void inet_twsk_bind_unhash(struct inet_timewait_sock *tw, struct inet_hashinfo *hashinfo) { struct inet_bind_bucket *tb = tw->tw_tb; if (!tb) - return 0; + return; __hlist_del(&tw->tw_bind_node); tw->tw_tb = NULL; inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); - /* - * We cannot call inet_twsk_put() ourself under lock, - * caller must call it for us. - */ - return 1; + __sock_put((struct sock *)tw); } /* Must be called with locally disabled BHs. */ static void inet_twsk_kill(struct inet_timewait_sock *tw) { struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo; - struct inet_bind_hashbucket *bhead; - int refcnt; - /* Unlink from established hashes. */ spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash); + struct inet_bind_hashbucket *bhead; spin_lock(lock); - refcnt = inet_twsk_unhash(tw); + sk_nulls_del_node_init_rcu((struct sock *)tw); spin_unlock(lock); /* Disassociate with bind bucket. */ @@ -84,11 +56,9 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw) hashinfo->bhash_size)]; spin_lock(&bhead->lock); - refcnt += inet_twsk_bind_unhash(tw, hashinfo); + inet_twsk_bind_unhash(tw, hashinfo); spin_unlock(&bhead->lock); - BUG_ON(refcnt >= atomic_read(&tw->tw_refcnt)); - atomic_sub(refcnt, &tw->tw_refcnt); atomic_dec(&tw->tw_dr->tw_count); inet_twsk_put(tw); } @@ -153,13 +123,15 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, /* * Step 2: Hash TW into tcp ehash chain. * Notes : - * - tw_refcnt is set to 3 because : + * - tw_refcnt is set to 4 because : * - We have one reference from bhash chain. * - We have one reference from ehash chain. + * - We have one reference from timer. + * - One reference for ourself (our caller will release it). * We can use atomic_set() because prior spin_lock()/spin_unlock() * committed into memory all tw fields. */ - atomic_set(&tw->tw_refcnt, 1 + 1 + 1); + atomic_set(&tw->tw_refcnt, 4); inet_twsk_add_node_rcu(tw, &ehead->chain); /* Step 3: Remove SK from hash chain */ @@ -235,15 +207,19 @@ EXPORT_SYMBOL_GPL(inet_twsk_alloc); * tcp_input.c to verify this. */ -/* This is for handling early-kills of TIME_WAIT sockets. */ -void inet_twsk_deschedule(struct inet_timewait_sock *tw) +/* This is for handling early-kills of TIME_WAIT sockets. + * Warning : consume reference. + * Caller should not access tw anymore. + */ +void inet_twsk_deschedule_put(struct inet_timewait_sock *tw) { if (del_timer_sync(&tw->tw_timer)) inet_twsk_kill(tw); + inet_twsk_put(tw); } -EXPORT_SYMBOL(inet_twsk_deschedule); +EXPORT_SYMBOL(inet_twsk_deschedule_put); -void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo) +void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm) { /* timeout := RTO * 3.5 * @@ -271,12 +247,14 @@ void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo) */ tw->tw_kill = timeo <= 4*HZ; - if (!mod_timer_pinned(&tw->tw_timer, jiffies + timeo)) { - atomic_inc(&tw->tw_refcnt); + if (!rearm) { + BUG_ON(mod_timer_pinned(&tw->tw_timer, jiffies + timeo)); atomic_inc(&tw->tw_dr->tw_count); + } else { + mod_timer_pending(&tw->tw_timer, jiffies + timeo); } } -EXPORT_SYMBOL_GPL(inet_twsk_schedule); +EXPORT_SYMBOL_GPL(__inet_twsk_schedule); void inet_twsk_purge(struct inet_hashinfo *hashinfo, struct inet_timewait_death_row *twdr, int family) @@ -311,9 +289,8 @@ restart: rcu_read_unlock(); local_bh_disable(); - inet_twsk_deschedule(tw); + inet_twsk_deschedule_put(tw); local_bh_enable(); - inet_twsk_put(tw); goto restart_rcu; } /* If the nulls value we got at the end of this lookup is diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 241afd743..86fa45809 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -157,22 +157,6 @@ void __init inet_initpeers(void) INIT_DEFERRABLE_WORK(&gc_work, inetpeer_gc_worker); } -static int addr_compare(const struct inetpeer_addr *a, - const struct inetpeer_addr *b) -{ - int i, n = (a->family == AF_INET ? 1 : 4); - - for (i = 0; i < n; i++) { - if (a->addr.a6[i] == b->addr.a6[i]) - continue; - if ((__force u32)a->addr.a6[i] < (__force u32)b->addr.a6[i]) - return -1; - return 1; - } - - return 0; -} - #define rcu_deref_locked(X, BASE) \ rcu_dereference_protected(X, lockdep_is_held(&(BASE)->lock.lock)) @@ -188,7 +172,7 @@ static int addr_compare(const struct inetpeer_addr *a, *stackptr++ = &_base->root; \ for (u = rcu_deref_locked(_base->root, _base); \ u != peer_avl_empty;) { \ - int cmp = addr_compare(_daddr, &u->daddr); \ + int cmp = inetpeer_addr_cmp(_daddr, &u->daddr); \ if (cmp == 0) \ break; \ if (cmp == -1) \ @@ -215,7 +199,7 @@ static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr, int count = 0; while (u != peer_avl_empty) { - int cmp = addr_compare(daddr, &u->daddr); + int cmp = inetpeer_addr_cmp(daddr, &u->daddr); if (cmp == 0) { /* Before taking a reference, check if this entry was * deleted (refcnt=-1) diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 921138f6c..fa7f15305 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -48,6 +48,7 @@ #include <linux/inet.h> #include <linux/netfilter_ipv4.h> #include <net/inet_ecn.h> +#include <net/vrf.h> /* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6 * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c @@ -77,6 +78,7 @@ struct ipq { u8 ecn; /* RFC3168 support */ u16 max_df_size; /* largest frag with DF set seen */ int iif; + int vif; /* VRF device index */ unsigned int rid; struct inet_peer *peer; }; @@ -99,6 +101,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, struct ip4_create_arg { struct iphdr *iph; u32 user; + int vif; }; static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot) @@ -127,7 +130,8 @@ static bool ip4_frag_match(const struct inet_frag_queue *q, const void *a) qp->saddr == arg->iph->saddr && qp->daddr == arg->iph->daddr && qp->protocol == arg->iph->protocol && - qp->user == arg->user; + qp->user == arg->user && + qp->vif == arg->vif; } static void ip4_frag_init(struct inet_frag_queue *q, const void *a) @@ -144,9 +148,11 @@ static void ip4_frag_init(struct inet_frag_queue *q, const void *a) qp->ecn = ip4_frag_ecn(arg->iph->tos); qp->saddr = arg->iph->saddr; qp->daddr = arg->iph->daddr; + qp->vif = arg->vif; qp->user = arg->user; qp->peer = sysctl_ipfrag_max_dist ? - inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, 1) : NULL; + inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, arg->vif, 1) : + NULL; } static void ip4_frag_free(struct inet_frag_queue *q) @@ -244,7 +250,8 @@ out: /* Find the correct entry in the "incomplete datagrams" queue for * this IP datagram, and create new one, if nothing is found. */ -static struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user) +static struct ipq *ip_find(struct net *net, struct iphdr *iph, + u32 user, int vif) { struct inet_frag_queue *q; struct ip4_create_arg arg; @@ -252,6 +259,7 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user) arg.iph = iph; arg.user = user; + arg.vif = vif; hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol); @@ -522,7 +530,6 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, int len; int ihlen; int err; - int sum_truesize; u8 ecn; ipq_kill(qp); @@ -590,32 +597,19 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, add_frag_mem_limit(qp->q.net, clone->truesize); } + skb_shinfo(head)->frag_list = head->next; skb_push(head, head->data - skb_network_header(head)); - sum_truesize = head->truesize; - for (fp = head->next; fp;) { - bool headstolen; - int delta; - struct sk_buff *next = fp->next; - - sum_truesize += fp->truesize; + for (fp=head->next; fp; fp = fp->next) { + head->data_len += fp->len; + head->len += fp->len; if (head->ip_summed != fp->ip_summed) head->ip_summed = CHECKSUM_NONE; else if (head->ip_summed == CHECKSUM_COMPLETE) head->csum = csum_add(head->csum, fp->csum); - - if (skb_try_coalesce(head, fp, &headstolen, &delta)) { - kfree_skb_partial(fp, headstolen); - } else { - if (!skb_shinfo(head)->frag_list) - skb_shinfo(head)->frag_list = fp; - head->data_len += fp->len; - head->len += fp->len; - head->truesize += fp->truesize; - } - fp = next; + head->truesize += fp->truesize; } - sub_frag_mem_limit(qp->q.net, sum_truesize); + sub_frag_mem_limit(qp->q.net, head->truesize); head->next = NULL; head->dev = dev; @@ -662,14 +656,15 @@ out_fail: /* Process an incoming IP datagram fragment. */ int ip_defrag(struct sk_buff *skb, u32 user) { + struct net_device *dev = skb->dev ? : skb_dst(skb)->dev; + int vif = vrf_master_ifindex_rcu(dev); + struct net *net = dev_net(dev); struct ipq *qp; - struct net *net; - net = skb->dev ? dev_net(skb->dev) : dev_net(skb_dst(skb)->dev); IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS); /* Lookup (or create) queue header */ - qp = ip_find(net, ip_hdr(skb), user); + qp = ip_find(net, ip_hdr(skb), user, vif); if (qp) { int ret; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 5fd706473..614521437 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -25,6 +25,7 @@ #include <linux/udp.h> #include <linux/if_arp.h> #include <linux/mroute.h> +#include <linux/if_vlan.h> #include <linux/init.h> #include <linux/in6.h> #include <linux/inetdevice.h> @@ -47,6 +48,7 @@ #include <net/netns/generic.h> #include <net/rtnetlink.h> #include <net/gre.h> +#include <net/dst_metadata.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6.h> @@ -121,8 +123,127 @@ static int ipgre_tunnel_init(struct net_device *dev); static int ipgre_net_id __read_mostly; static int gre_tap_net_id __read_mostly; -static int ipgre_err(struct sk_buff *skb, u32 info, - const struct tnl_ptk_info *tpi) +static int ip_gre_calc_hlen(__be16 o_flags) +{ + int addend = 4; + + if (o_flags & TUNNEL_CSUM) + addend += 4; + if (o_flags & TUNNEL_KEY) + addend += 4; + if (o_flags & TUNNEL_SEQ) + addend += 4; + return addend; +} + +static __be16 gre_flags_to_tnl_flags(__be16 flags) +{ + __be16 tflags = 0; + + if (flags & GRE_CSUM) + tflags |= TUNNEL_CSUM; + if (flags & GRE_ROUTING) + tflags |= TUNNEL_ROUTING; + if (flags & GRE_KEY) + tflags |= TUNNEL_KEY; + if (flags & GRE_SEQ) + tflags |= TUNNEL_SEQ; + if (flags & GRE_STRICT) + tflags |= TUNNEL_STRICT; + if (flags & GRE_REC) + tflags |= TUNNEL_REC; + if (flags & GRE_VERSION) + tflags |= TUNNEL_VERSION; + + return tflags; +} + +static __be16 tnl_flags_to_gre_flags(__be16 tflags) +{ + __be16 flags = 0; + + if (tflags & TUNNEL_CSUM) + flags |= GRE_CSUM; + if (tflags & TUNNEL_ROUTING) + flags |= GRE_ROUTING; + if (tflags & TUNNEL_KEY) + flags |= GRE_KEY; + if (tflags & TUNNEL_SEQ) + flags |= GRE_SEQ; + if (tflags & TUNNEL_STRICT) + flags |= GRE_STRICT; + if (tflags & TUNNEL_REC) + flags |= GRE_REC; + if (tflags & TUNNEL_VERSION) + flags |= GRE_VERSION; + + return flags; +} + +static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, + bool *csum_err) +{ + const struct gre_base_hdr *greh; + __be32 *options; + int hdr_len; + + if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr)))) + return -EINVAL; + + greh = (struct gre_base_hdr *)skb_transport_header(skb); + if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING))) + return -EINVAL; + + tpi->flags = gre_flags_to_tnl_flags(greh->flags); + hdr_len = ip_gre_calc_hlen(tpi->flags); + + if (!pskb_may_pull(skb, hdr_len)) + return -EINVAL; + + greh = (struct gre_base_hdr *)skb_transport_header(skb); + tpi->proto = greh->protocol; + + options = (__be32 *)(greh + 1); + if (greh->flags & GRE_CSUM) { + if (skb_checksum_simple_validate(skb)) { + *csum_err = true; + return -EINVAL; + } + + skb_checksum_try_convert(skb, IPPROTO_GRE, 0, + null_compute_pseudo); + options++; + } + + if (greh->flags & GRE_KEY) { + tpi->key = *options; + options++; + } else { + tpi->key = 0; + } + if (unlikely(greh->flags & GRE_SEQ)) { + tpi->seq = *options; + options++; + } else { + tpi->seq = 0; + } + /* WCCP version 1 and 2 protocol decoding. + * - Change protocol to IP + * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header + */ + if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) { + tpi->proto = htons(ETH_P_IP); + if ((*(u8 *)options & 0xF0) != 0x40) { + hdr_len += 4; + if (!pskb_may_pull(skb, hdr_len)) + return -EINVAL; + } + } + return iptunnel_pull_header(skb, hdr_len, tpi->proto); +} + +static void ipgre_err(struct sk_buff *skb, u32 info, + const struct tnl_ptk_info *tpi) { /* All the routers (except for Linux) return only @@ -148,14 +269,14 @@ static int ipgre_err(struct sk_buff *skb, u32 info, switch (type) { default: case ICMP_PARAMETERPROB: - return PACKET_RCVD; + return; case ICMP_DEST_UNREACH: switch (code) { case ICMP_SR_FAILED: case ICMP_PORT_UNREACH: /* Impossible event. */ - return PACKET_RCVD; + return; default: /* All others are translated to HOST_UNREACH. rfc2003 contains "deep thoughts" about NET_UNREACH, @@ -164,9 +285,10 @@ static int ipgre_err(struct sk_buff *skb, u32 info, break; } break; + case ICMP_TIME_EXCEEDED: if (code != ICMP_EXC_TTL) - return PACKET_RCVD; + return; break; case ICMP_REDIRECT: @@ -183,26 +305,85 @@ static int ipgre_err(struct sk_buff *skb, u32 info, iph->daddr, iph->saddr, tpi->key); if (!t) - return PACKET_REJECT; + return; if (t->parms.iph.daddr == 0 || ipv4_is_multicast(t->parms.iph.daddr)) - return PACKET_RCVD; + return; if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) - return PACKET_RCVD; + return; if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) t->err_count++; else t->err_count = 1; t->err_time = jiffies; - return PACKET_RCVD; +} + +static void gre_err(struct sk_buff *skb, u32 info) +{ + /* All the routers (except for Linux) return only + * 8 bytes of packet payload. It means, that precise relaying of + * ICMP in the real Internet is absolutely infeasible. + * + * Moreover, Cisco "wise men" put GRE key to the third word + * in GRE header. It makes impossible maintaining even soft + * state for keyed + * GRE tunnels with enabled checksum. Tell them "thank you". + * + * Well, I wonder, rfc1812 was written by Cisco employee, + * what the hell these idiots break standards established + * by themselves??? + */ + + const int type = icmp_hdr(skb)->type; + const int code = icmp_hdr(skb)->code; + struct tnl_ptk_info tpi; + bool csum_err = false; + + if (parse_gre_header(skb, &tpi, &csum_err)) { + if (!csum_err) /* ignore csum errors. */ + return; + } + + if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { + ipv4_update_pmtu(skb, dev_net(skb->dev), info, + skb->dev->ifindex, 0, IPPROTO_GRE, 0); + return; + } + if (type == ICMP_REDIRECT) { + ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, 0, + IPPROTO_GRE, 0); + return; + } + + ipgre_err(skb, info, &tpi); +} + +static __be64 key_to_tunnel_id(__be32 key) +{ +#ifdef __BIG_ENDIAN + return (__force __be64)((__force u32)key); +#else + return (__force __be64)((__force u64)key << 32); +#endif +} + +/* Returns the least-significant 32 bits of a __be64. */ +static __be32 tunnel_id_to_key(__be64 x) +{ +#ifdef __BIG_ENDIAN + return (__force __be32)x; +#else + return (__force __be32)((__force u64)x >> 32); +#endif } static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) { struct net *net = dev_net(skb->dev); + struct metadata_dst *tun_dst = NULL; struct ip_tunnel_net *itn; const struct iphdr *iph; struct ip_tunnel *tunnel; @@ -218,40 +399,211 @@ static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) if (tunnel) { skb_pop_mac_header(skb); - ip_tunnel_rcv(tunnel, skb, tpi, log_ecn_error); + if (tunnel->collect_md) { + __be16 flags; + __be64 tun_id; + + flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY); + tun_id = key_to_tunnel_id(tpi->key); + tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0); + if (!tun_dst) + return PACKET_REJECT; + } + + ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); return PACKET_RCVD; } return PACKET_REJECT; } +static int gre_rcv(struct sk_buff *skb) +{ + struct tnl_ptk_info tpi; + bool csum_err = false; + +#ifdef CONFIG_NET_IPGRE_BROADCAST + if (ipv4_is_multicast(ip_hdr(skb)->daddr)) { + /* Looped back packet, drop it! */ + if (rt_is_output_route(skb_rtable(skb))) + goto drop; + } +#endif + + if (parse_gre_header(skb, &tpi, &csum_err) < 0) + goto drop; + + if (ipgre_rcv(skb, &tpi) == PACKET_RCVD) + return 0; + + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); +drop: + kfree_skb(skb); + return 0; +} + +static void build_header(struct sk_buff *skb, int hdr_len, __be16 flags, + __be16 proto, __be32 key, __be32 seq) +{ + struct gre_base_hdr *greh; + + skb_push(skb, hdr_len); + + skb_reset_transport_header(skb); + greh = (struct gre_base_hdr *)skb->data; + greh->flags = tnl_flags_to_gre_flags(flags); + greh->protocol = proto; + + if (flags & (TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_SEQ)) { + __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4); + + if (flags & TUNNEL_SEQ) { + *ptr = seq; + ptr--; + } + if (flags & TUNNEL_KEY) { + *ptr = key; + ptr--; + } + if (flags & TUNNEL_CSUM && + !(skb_shinfo(skb)->gso_type & + (SKB_GSO_GRE | SKB_GSO_GRE_CSUM))) { + *ptr = 0; + *(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0, + skb->len, 0)); + } + } +} + static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, __be16 proto) { struct ip_tunnel *tunnel = netdev_priv(dev); - struct tnl_ptk_info tpi; - tpi.flags = tunnel->parms.o_flags; - tpi.proto = proto; - tpi.key = tunnel->parms.o_key; if (tunnel->parms.o_flags & TUNNEL_SEQ) tunnel->o_seqno++; - tpi.seq = htonl(tunnel->o_seqno); /* Push GRE header. */ - gre_build_header(skb, &tpi, tunnel->tun_hlen); - - skb_set_inner_protocol(skb, tpi.proto); + build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags, + proto, tunnel->parms.o_key, htonl(tunnel->o_seqno)); + skb_set_inner_protocol(skb, proto); ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol); } +static struct sk_buff *gre_handle_offloads(struct sk_buff *skb, + bool csum) +{ + return iptunnel_handle_offloads(skb, csum, + csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE); +} + +static struct rtable *gre_get_rt(struct sk_buff *skb, + struct net_device *dev, + struct flowi4 *fl, + const struct ip_tunnel_key *key) +{ + struct net *net = dev_net(dev); + + memset(fl, 0, sizeof(*fl)); + fl->daddr = key->u.ipv4.dst; + fl->saddr = key->u.ipv4.src; + fl->flowi4_tos = RT_TOS(key->tos); + fl->flowi4_mark = skb->mark; + fl->flowi4_proto = IPPROTO_GRE; + + return ip_route_output_key(net, fl); +} + +static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct ip_tunnel_info *tun_info; + const struct ip_tunnel_key *key; + struct flowi4 fl; + struct rtable *rt; + int min_headroom; + int tunnel_hlen; + __be16 df, flags; + int err; + + tun_info = skb_tunnel_info(skb); + if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || + ip_tunnel_info_af(tun_info) != AF_INET)) + goto err_free_skb; + + key = &tun_info->key; + rt = gre_get_rt(skb, dev, &fl, key); + if (IS_ERR(rt)) + goto err_free_skb; + + tunnel_hlen = ip_gre_calc_hlen(key->tun_flags); + + min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len + + tunnel_hlen + sizeof(struct iphdr); + if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) { + int head_delta = SKB_DATA_ALIGN(min_headroom - + skb_headroom(skb) + + 16); + err = pskb_expand_head(skb, max_t(int, head_delta, 0), + 0, GFP_ATOMIC); + if (unlikely(err)) + goto err_free_rt; + } + + /* Push Tunnel header. */ + skb = gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM)); + if (IS_ERR(skb)) { + skb = NULL; + goto err_free_rt; + } + + flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY); + build_header(skb, tunnel_hlen, flags, htons(ETH_P_TEB), + tunnel_id_to_key(tun_info->key.tun_id), 0); + + df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; + err = iptunnel_xmit(skb->sk, rt, skb, fl.saddr, + key->u.ipv4.dst, IPPROTO_GRE, + key->tos, key->ttl, df, false); + iptunnel_xmit_stats(err, &dev->stats, dev->tstats); + return; + +err_free_rt: + ip_rt_put(rt); +err_free_skb: + kfree_skb(skb); + dev->stats.tx_dropped++; +} + +static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) +{ + struct ip_tunnel_info *info = skb_tunnel_info(skb); + struct rtable *rt; + struct flowi4 fl4; + + if (ip_tunnel_info_af(info) != AF_INET) + return -EINVAL; + + rt = gre_get_rt(skb, dev, &fl4, &info->key); + if (IS_ERR(rt)) + return PTR_ERR(rt); + + ip_rt_put(rt); + info->key.u.ipv4.src = fl4.saddr; + return 0; +} + static netdev_tx_t ipgre_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *tnl_params; + if (tunnel->collect_md) { + gre_fb_xmit(skb, dev); + return NETDEV_TX_OK; + } + if (dev->header_ops) { /* Need space for new headers */ if (skb_cow_head(skb, dev->needed_headroom - @@ -277,7 +629,6 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb, goto out; __gre_xmit(skb, dev, tnl_params, skb->protocol); - return NETDEV_TX_OK; free_skb: @@ -292,6 +643,11 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb, { struct ip_tunnel *tunnel = netdev_priv(dev); + if (tunnel->collect_md) { + gre_fb_xmit(skb, dev); + return NETDEV_TX_OK; + } + skb = gre_handle_offloads(skb, !!(tunnel->parms.o_flags&TUNNEL_CSUM)); if (IS_ERR(skb)) goto out; @@ -300,7 +656,6 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb, goto free_skb; __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB)); - return NETDEV_TX_OK; free_skb: @@ -530,10 +885,9 @@ static int ipgre_tunnel_init(struct net_device *dev) return ip_tunnel_init(dev); } -static struct gre_cisco_protocol ipgre_protocol = { - .handler = ipgre_rcv, - .err_handler = ipgre_err, - .priority = 0, +static const struct gre_protocol ipgre_protocol = { + .handler = gre_rcv, + .err_handler = gre_err, }; static int __net_init ipgre_init_net(struct net *net) @@ -596,8 +950,10 @@ out: return ipgre_tunnel_validate(tb, data); } -static void ipgre_netlink_parms(struct nlattr *data[], struct nlattr *tb[], - struct ip_tunnel_parm *parms) +static void ipgre_netlink_parms(struct net_device *dev, + struct nlattr *data[], + struct nlattr *tb[], + struct ip_tunnel_parm *parms) { memset(parms, 0, sizeof(*parms)); @@ -635,6 +991,12 @@ static void ipgre_netlink_parms(struct nlattr *data[], struct nlattr *tb[], if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC])) parms->iph.frag_off = htons(IP_DF); + + if (data[IFLA_GRE_COLLECT_METADATA]) { + struct ip_tunnel *t = netdev_priv(dev); + + t->collect_md = true; + } } /* This function returns true when ENCAP attributes are present in the nl msg */ @@ -688,6 +1050,7 @@ static const struct net_device_ops gre_tap_netdev_ops = { .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = ip_tunnel_get_stats64, .ndo_get_iflink = ip_tunnel_get_iflink, + .ndo_fill_metadata_dst = gre_fill_metadata_dst, }; static void ipgre_tap_setup(struct net_device *dev) @@ -712,7 +1075,7 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev, return err; } - ipgre_netlink_parms(data, tb, &p); + ipgre_netlink_parms(dev, data, tb, &p); return ip_tunnel_newlink(dev, tb, &p); } @@ -730,7 +1093,7 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], return err; } - ipgre_netlink_parms(data, tb, &p); + ipgre_netlink_parms(dev, data, tb, &p); return ip_tunnel_changelink(dev, tb, &p); } @@ -765,6 +1128,8 @@ static size_t ipgre_get_size(const struct net_device *dev) nla_total_size(2) + /* IFLA_GRE_ENCAP_DPORT */ nla_total_size(2) + + /* IFLA_GRE_COLLECT_METADATA */ + nla_total_size(0) + 0; } @@ -796,6 +1161,11 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) t->encap.flags)) goto nla_put_failure; + if (t->collect_md) { + if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA)) + goto nla_put_failure; + } + return 0; nla_put_failure: @@ -817,6 +1187,7 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = { [IFLA_GRE_ENCAP_FLAGS] = { .type = NLA_U16 }, [IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 }, + [IFLA_GRE_COLLECT_METADATA] = { .type = NLA_FLAG }, }; static struct rtnl_link_ops ipgre_link_ops __read_mostly = { @@ -849,9 +1220,38 @@ static struct rtnl_link_ops ipgre_tap_ops __read_mostly = { .get_link_net = ip_tunnel_get_link_net, }; +struct net_device *gretap_fb_dev_create(struct net *net, const char *name, + u8 name_assign_type) +{ + struct nlattr *tb[IFLA_MAX + 1]; + struct net_device *dev; + struct ip_tunnel *t; + int err; + + memset(&tb, 0, sizeof(tb)); + + dev = rtnl_create_link(net, name, name_assign_type, + &ipgre_tap_ops, tb); + if (IS_ERR(dev)) + return dev; + + /* Configure flow based GRE device. */ + t = netdev_priv(dev); + t->collect_md = true; + + err = ipgre_newlink(net, dev, tb, NULL); + if (err < 0) + goto out; + return dev; +out: + free_netdev(dev); + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(gretap_fb_dev_create); + static int __net_init ipgre_tap_init_net(struct net *net) { - return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, NULL); + return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0"); } static void __net_exit ipgre_tap_exit_net(struct net *net) @@ -881,7 +1281,7 @@ static int __init ipgre_init(void) if (err < 0) goto pnet_tap_faied; - err = gre_cisco_register(&ipgre_protocol); + err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO); if (err < 0) { pr_info("%s: can't add protocol\n", __func__); goto add_proto_failed; @@ -900,7 +1300,7 @@ static int __init ipgre_init(void) tap_ops_failed: rtnl_link_unregister(&ipgre_link_ops); rtnl_link_failed: - gre_cisco_unregister(&ipgre_protocol); + gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); add_proto_failed: unregister_pernet_device(&ipgre_tap_net_ops); pnet_tap_faied: @@ -912,7 +1312,7 @@ static void __exit ipgre_fini(void) { rtnl_link_unregister(&ipgre_tap_ops); rtnl_link_unregister(&ipgre_link_ops); - gre_cisco_unregister(&ipgre_protocol); + gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); unregister_pernet_device(&ipgre_tap_net_ops); unregister_pernet_device(&ipgre_net_ops); } diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 2db4c8773..f4fc8a77a 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -146,6 +146,7 @@ #include <net/xfrm.h> #include <linux/mroute.h> #include <linux/netlink.h> +#include <net/dst_metadata.h> /* * Process Router Attention IP option (RFC 2113) @@ -331,7 +332,7 @@ static int ip_rcv_finish(struct sock *sk, struct sk_buff *skb) * Initialise the virtual path cache for the packet. It describes * how the packet travels inside Linux networking. */ - if (!skb_dst(skb)) { + if (!skb_valid_dst(skb)) { int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, iph->tos, skb->dev); if (unlikely(err)) { diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 6bf89a631..0138fada0 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1542,6 +1542,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, struct net *net = sock_net(sk); struct sk_buff *nskb; int err; + int oif; if (__ip_options_echo(&replyopts.opt.opt, skb, sopt)) return; @@ -1559,7 +1560,11 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, daddr = replyopts.opt.opt.faddr; } - flowi4_init_output(&fl4, arg->bound_dev_if, + oif = arg->bound_dev_if; + if (!oif && netif_index_is_vrf(net, skb->skb_iif)) + oif = skb->skb_iif; + + flowi4_init_output(&fl4, oif, IP4_REPLY_MARK(net, skb->mark), RT_TOS(arg->tos), RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol, diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 626d9e56a..cbb51f3fa 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -230,10 +230,13 @@ skip_key_lookup: if (cand) return cand; + t = rcu_dereference(itn->collect_md_tun); + if (t) + return t; + if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP) return netdev_priv(itn->fb_tunnel_dev); - return NULL; } EXPORT_SYMBOL_GPL(ip_tunnel_lookup); @@ -261,11 +264,15 @@ static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t) { struct hlist_head *head = ip_bucket(itn, &t->parms); + if (t->collect_md) + rcu_assign_pointer(itn->collect_md_tun, t); hlist_add_head_rcu(&t->hash_node, head); } -static void ip_tunnel_del(struct ip_tunnel *t) +static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t) { + if (t->collect_md) + rcu_assign_pointer(itn->collect_md_tun, NULL); hlist_del_init_rcu(&t->hash_node); } @@ -419,7 +426,8 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net, } int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, - const struct tnl_ptk_info *tpi, bool log_ecn_error) + const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, + bool log_ecn_error) { struct pcpu_sw_netstats *tstats; const struct iphdr *iph = ip_hdr(skb); @@ -478,6 +486,9 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, skb->dev = tunnel->dev; } + if (tun_dst) + skb_dst_set(skb, (struct dst_entry *)tun_dst); + gro_cells_receive(&tunnel->gro_cells, skb); return 0; @@ -806,7 +817,7 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn, struct ip_tunnel_parm *p, bool set_mtu) { - ip_tunnel_del(t); + ip_tunnel_del(itn, t); t->parms.iph.saddr = p->iph.saddr; t->parms.iph.daddr = p->iph.daddr; t->parms.i_key = p->i_key; @@ -967,7 +978,7 @@ void ip_tunnel_dellink(struct net_device *dev, struct list_head *head) itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id); if (itn->fb_tunnel_dev != dev) { - ip_tunnel_del(netdev_priv(dev)); + ip_tunnel_del(itn, netdev_priv(dev)); unregister_netdevice_queue(dev, head); } } @@ -1072,8 +1083,13 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], nt = netdev_priv(dev); itn = net_generic(net, nt->ip_tnl_net_id); - if (ip_tunnel_find(itn, p, dev->type)) - return -EEXIST; + if (nt->collect_md) { + if (rtnl_dereference(itn->collect_md_tun)) + return -EEXIST; + } else { + if (ip_tunnel_find(itn, p, dev->type)) + return -EEXIST; + } nt->net = net; nt->parms = *p; @@ -1089,7 +1105,6 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], dev->mtu = mtu; ip_tunnel_add(itn, nt); - out: return err; } @@ -1163,6 +1178,10 @@ int ip_tunnel_init(struct net_device *dev) iph->version = 4; iph->ihl = 5; + if (tunnel->collect_md) { + dev->features |= NETIF_F_NETNS_LOCAL; + netif_keep_dst(dev); + } return 0; } EXPORT_SYMBOL_GPL(ip_tunnel_init); @@ -1176,7 +1195,7 @@ void ip_tunnel_uninit(struct net_device *dev) itn = net_generic(net, tunnel->ip_tnl_net_id); /* fb_tunnel_dev will be unregisted in net-exit call. */ if (itn->fb_tunnel_dev != dev) - ip_tunnel_del(netdev_priv(dev)); + ip_tunnel_del(itn, netdev_priv(dev)); ip_tunnel_dst_reset_all(tunnel); } diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 6a51a71a6..84dce6a92 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -32,6 +32,7 @@ #include <linux/etherdevice.h> #include <linux/if_ether.h> #include <linux/if_vlan.h> +#include <linux/static_key.h> #include <net/ip.h> #include <net/icmp.h> @@ -45,12 +46,13 @@ #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/rtnetlink.h> +#include <net/dst_metadata.h> int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, __u8 proto, __u8 tos, __u8 ttl, __be16 df, bool xnet) { - int pkt_len = skb->len; + int pkt_len = skb->len - skb_inner_network_offset(skb); struct iphdr *iph; int err; @@ -118,6 +120,33 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto) } EXPORT_SYMBOL_GPL(iptunnel_pull_header); +struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, + gfp_t flags) +{ + struct metadata_dst *res; + struct ip_tunnel_info *dst, *src; + + if (!md || md->u.tun_info.mode & IP_TUNNEL_INFO_TX) + return NULL; + + res = metadata_dst_alloc(0, flags); + if (!res) + return NULL; + + dst = &res->u.tun_info; + src = &md->u.tun_info; + dst->key.tun_id = src->key.tun_id; + if (src->mode & IP_TUNNEL_INFO_IPV6) + memcpy(&dst->key.u.ipv6.dst, &src->key.u.ipv6.src, + sizeof(struct in6_addr)); + else + dst->key.u.ipv4.dst = src->key.u.ipv4.src; + dst->mode = src->mode | IP_TUNNEL_INFO_TX; + + return res; +} +EXPORT_SYMBOL_GPL(iptunnel_metadata_reply); + struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, bool csum_help, int gso_type_mask) @@ -190,3 +219,208 @@ struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev, return tot; } EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64); + +static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = { + [LWTUNNEL_IP_ID] = { .type = NLA_U64 }, + [LWTUNNEL_IP_DST] = { .type = NLA_U32 }, + [LWTUNNEL_IP_SRC] = { .type = NLA_U32 }, + [LWTUNNEL_IP_TTL] = { .type = NLA_U8 }, + [LWTUNNEL_IP_TOS] = { .type = NLA_U8 }, + [LWTUNNEL_IP_FLAGS] = { .type = NLA_U16 }, +}; + +static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr, + unsigned int family, const void *cfg, + struct lwtunnel_state **ts) +{ + struct ip_tunnel_info *tun_info; + struct lwtunnel_state *new_state; + struct nlattr *tb[LWTUNNEL_IP_MAX + 1]; + int err; + + err = nla_parse_nested(tb, LWTUNNEL_IP_MAX, attr, ip_tun_policy); + if (err < 0) + return err; + + new_state = lwtunnel_state_alloc(sizeof(*tun_info)); + if (!new_state) + return -ENOMEM; + + new_state->type = LWTUNNEL_ENCAP_IP; + + tun_info = lwt_tun_info(new_state); + + if (tb[LWTUNNEL_IP_ID]) + tun_info->key.tun_id = nla_get_u64(tb[LWTUNNEL_IP_ID]); + + if (tb[LWTUNNEL_IP_DST]) + tun_info->key.u.ipv4.dst = nla_get_be32(tb[LWTUNNEL_IP_DST]); + + if (tb[LWTUNNEL_IP_SRC]) + tun_info->key.u.ipv4.src = nla_get_be32(tb[LWTUNNEL_IP_SRC]); + + if (tb[LWTUNNEL_IP_TTL]) + tun_info->key.ttl = nla_get_u8(tb[LWTUNNEL_IP_TTL]); + + if (tb[LWTUNNEL_IP_TOS]) + tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP_TOS]); + + if (tb[LWTUNNEL_IP_FLAGS]) + tun_info->key.tun_flags = nla_get_u16(tb[LWTUNNEL_IP_FLAGS]); + + tun_info->mode = IP_TUNNEL_INFO_TX; + tun_info->options_len = 0; + + *ts = new_state; + + return 0; +} + +static int ip_tun_fill_encap_info(struct sk_buff *skb, + struct lwtunnel_state *lwtstate) +{ + struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate); + + if (nla_put_u64(skb, LWTUNNEL_IP_ID, tun_info->key.tun_id) || + nla_put_be32(skb, LWTUNNEL_IP_DST, tun_info->key.u.ipv4.dst) || + nla_put_be32(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) || + nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.tos) || + nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ttl) || + nla_put_u16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags)) + return -ENOMEM; + + return 0; +} + +static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate) +{ + return nla_total_size(8) /* LWTUNNEL_IP_ID */ + + nla_total_size(4) /* LWTUNNEL_IP_DST */ + + nla_total_size(4) /* LWTUNNEL_IP_SRC */ + + nla_total_size(1) /* LWTUNNEL_IP_TOS */ + + nla_total_size(1) /* LWTUNNEL_IP_TTL */ + + nla_total_size(2); /* LWTUNNEL_IP_FLAGS */ +} + +static int ip_tun_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b) +{ + return memcmp(lwt_tun_info(a), lwt_tun_info(b), + sizeof(struct ip_tunnel_info)); +} + +static const struct lwtunnel_encap_ops ip_tun_lwt_ops = { + .build_state = ip_tun_build_state, + .fill_encap = ip_tun_fill_encap_info, + .get_encap_size = ip_tun_encap_nlsize, + .cmp_encap = ip_tun_cmp_encap, +}; + +static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = { + [LWTUNNEL_IP6_ID] = { .type = NLA_U64 }, + [LWTUNNEL_IP6_DST] = { .len = sizeof(struct in6_addr) }, + [LWTUNNEL_IP6_SRC] = { .len = sizeof(struct in6_addr) }, + [LWTUNNEL_IP6_HOPLIMIT] = { .type = NLA_U8 }, + [LWTUNNEL_IP6_TC] = { .type = NLA_U8 }, + [LWTUNNEL_IP6_FLAGS] = { .type = NLA_U16 }, +}; + +static int ip6_tun_build_state(struct net_device *dev, struct nlattr *attr, + unsigned int family, const void *cfg, + struct lwtunnel_state **ts) +{ + struct ip_tunnel_info *tun_info; + struct lwtunnel_state *new_state; + struct nlattr *tb[LWTUNNEL_IP6_MAX + 1]; + int err; + + err = nla_parse_nested(tb, LWTUNNEL_IP6_MAX, attr, ip6_tun_policy); + if (err < 0) + return err; + + new_state = lwtunnel_state_alloc(sizeof(*tun_info)); + if (!new_state) + return -ENOMEM; + + new_state->type = LWTUNNEL_ENCAP_IP6; + + tun_info = lwt_tun_info(new_state); + + if (tb[LWTUNNEL_IP6_ID]) + tun_info->key.tun_id = nla_get_u64(tb[LWTUNNEL_IP6_ID]); + + if (tb[LWTUNNEL_IP6_DST]) + tun_info->key.u.ipv6.dst = nla_get_in6_addr(tb[LWTUNNEL_IP6_DST]); + + if (tb[LWTUNNEL_IP6_SRC]) + tun_info->key.u.ipv6.src = nla_get_in6_addr(tb[LWTUNNEL_IP6_SRC]); + + if (tb[LWTUNNEL_IP6_HOPLIMIT]) + tun_info->key.ttl = nla_get_u8(tb[LWTUNNEL_IP6_HOPLIMIT]); + + if (tb[LWTUNNEL_IP6_TC]) + tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP6_TC]); + + if (tb[LWTUNNEL_IP6_FLAGS]) + tun_info->key.tun_flags = nla_get_u16(tb[LWTUNNEL_IP6_FLAGS]); + + tun_info->mode = IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_IPV6; + tun_info->options_len = 0; + + *ts = new_state; + + return 0; +} + +static int ip6_tun_fill_encap_info(struct sk_buff *skb, + struct lwtunnel_state *lwtstate) +{ + struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate); + + if (nla_put_u64(skb, LWTUNNEL_IP6_ID, tun_info->key.tun_id) || + nla_put_in6_addr(skb, LWTUNNEL_IP6_DST, &tun_info->key.u.ipv6.dst) || + nla_put_in6_addr(skb, LWTUNNEL_IP6_SRC, &tun_info->key.u.ipv6.src) || + nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.tos) || + nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.ttl) || + nla_put_u16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags)) + return -ENOMEM; + + return 0; +} + +static int ip6_tun_encap_nlsize(struct lwtunnel_state *lwtstate) +{ + return nla_total_size(8) /* LWTUNNEL_IP6_ID */ + + nla_total_size(16) /* LWTUNNEL_IP6_DST */ + + nla_total_size(16) /* LWTUNNEL_IP6_SRC */ + + nla_total_size(1) /* LWTUNNEL_IP6_HOPLIMIT */ + + nla_total_size(1) /* LWTUNNEL_IP6_TC */ + + nla_total_size(2); /* LWTUNNEL_IP6_FLAGS */ +} + +static const struct lwtunnel_encap_ops ip6_tun_lwt_ops = { + .build_state = ip6_tun_build_state, + .fill_encap = ip6_tun_fill_encap_info, + .get_encap_size = ip6_tun_encap_nlsize, + .cmp_encap = ip_tun_cmp_encap, +}; + +void __init ip_tunnel_core_init(void) +{ + lwtunnel_encap_add_ops(&ip_tun_lwt_ops, LWTUNNEL_ENCAP_IP); + lwtunnel_encap_add_ops(&ip6_tun_lwt_ops, LWTUNNEL_ENCAP_IP6); +} + +struct static_key ip_tunnel_metadata_cnt = STATIC_KEY_INIT_FALSE; +EXPORT_SYMBOL(ip_tunnel_metadata_cnt); + +void ip_tunnel_need_metadata(void) +{ + static_key_slow_inc(&ip_tunnel_metadata_cnt); +} +EXPORT_SYMBOL_GPL(ip_tunnel_need_metadata); + +void ip_tunnel_unneed_metadata(void) +{ + static_key_slow_dec(&ip_tunnel_metadata_cnt); +} +EXPORT_SYMBOL_GPL(ip_tunnel_unneed_metadata); diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 8e7328c6a..ed4ef09c2 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -94,7 +94,7 @@ /* Define the timeout for waiting for a DHCP/BOOTP/RARP reply */ #define CONF_OPEN_RETRIES 2 /* (Re)open devices twice */ #define CONF_SEND_RETRIES 6 /* Send six requests per open */ -#define CONF_INTER_TIMEOUT (HZ/2) /* Inter-device timeout: 1/2 second */ +#define CONF_INTER_TIMEOUT (HZ) /* Inter-device timeout: 1 second */ #define CONF_BASE_TIMEOUT (HZ*2) /* Initial timeout: 2 seconds */ #define CONF_TIMEOUT_RANDOM (HZ) /* Maximum amount of randomization */ #define CONF_TIMEOUT_MULT *7/4 /* Rate of timeout growth */ diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 254238daf..f34c31def 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -198,7 +198,7 @@ static int ipip_rcv(struct sk_buff *skb) goto drop; if (iptunnel_pull_header(skb, 0, tpi.proto)) goto drop; - return ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error); + return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, log_ecn_error); } return -1; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 3a2c0162c..8e8203d5c 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -233,7 +233,6 @@ static const struct fib_rules_ops __net_initconst ipmr_rules_ops_template = { .match = ipmr_rule_match, .configure = ipmr_rule_configure, .compare = ipmr_rule_compare, - .default_pref = fib_default_rule_pref, .fill = ipmr_rule_fill, .nlgroup = RTNLGRP_IPV4_RULE, .policy = ipmr_rule_policy, @@ -1683,8 +1682,8 @@ static inline int ipmr_forward_finish(struct sock *sk, struct sk_buff *skb) { struct ip_options *opt = &(IPCB(skb)->opt); - IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); - IP_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTOCTETS, skb->len); + IP_INC_STATS(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); + IP_ADD_STATS(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTOCTETS, skb->len); if (unlikely(opt->optlen)) ip_forward_options(skb); @@ -1746,7 +1745,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, * to blackhole. */ - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); ip_rt_put(rt); goto out_free; } diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 2199a5db2..a35584176 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -58,6 +58,12 @@ config NFT_REJECT_IPV4 default NFT_REJECT tristate +config NFT_DUP_IPV4 + tristate "IPv4 nf_tables packet duplication support" + select NF_DUP_IPV4 + help + This module enables IPv4 packet duplication support for nf_tables. + endif # NF_TABLES_IPV4 config NF_TABLES_ARP @@ -67,6 +73,13 @@ config NF_TABLES_ARP endif # NF_TABLES +config NF_DUP_IPV4 + tristate "Netfilter IPv4 packet duplication to alternate destination" + depends on !NF_CONNTRACK || NF_CONNTRACK + help + This option enables the nf_dup_ipv4 core, which duplicates an IPv4 + packet to be rerouted to another destination. + config NF_LOG_ARP tristate "ARP packet logging" default m if NETFILTER_ADVANCED=n diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 7fe6c7035..87b073da1 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -41,6 +41,7 @@ obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o obj-$(CONFIG_NFT_MASQ_IPV4) += nft_masq_ipv4.o obj-$(CONFIG_NFT_REDIR_IPV4) += nft_redir_ipv4.o +obj-$(CONFIG_NFT_DUP_IPV4) += nft_dup_ipv4.o obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o # generic IP tables @@ -70,3 +71,5 @@ obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o # just filtering instance of ARP tables for now obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o + +obj-$(CONFIG_NF_DUP_IPV4) += nf_dup_ipv4.o diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 92305a1a0..8f87fc38c 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -240,7 +240,7 @@ get_entry(const void *base, unsigned int offset) return (struct arpt_entry *)(base + offset); } -static inline __pure +static inline struct arpt_entry *arpt_next_entry(const struct arpt_entry *entry) { return (void *)entry + entry->next_offset; @@ -280,6 +280,9 @@ unsigned int arpt_do_table(struct sk_buff *skb, table_base = private->entries; jumpstack = (struct arpt_entry **)private->jumpstack[cpu]; + /* No TEE support for arptables, so no need to switch to alternate + * stack. All targets that reenter must return absolute verdicts. + */ e = get_entry(table_base, private->hook_entry[hook]); acpar.in = state->in; @@ -325,11 +328,6 @@ unsigned int arpt_do_table(struct sk_buff *skb, } if (table_base + v != arpt_next_entry(e)) { - - if (stackidx >= private->stacksize) { - verdict = NF_DROP; - break; - } jumpstack[stackidx++] = e; } @@ -337,9 +335,6 @@ unsigned int arpt_do_table(struct sk_buff *skb, continue; } - /* Targets which reenter must return - * abs. verdicts - */ acpar.target = t->u.kernel.target; acpar.targinfo = t->data; verdict = t->u.kernel.target->target(skb, &acpar); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 6c72fbb7b..b0a86e734 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -276,7 +276,7 @@ static void trace_packet(const struct sk_buff *skb, } #endif -static inline __pure +static inline struct ipt_entry *ipt_next_entry(const struct ipt_entry *entry) { return (void *)entry + entry->next_offset; @@ -296,12 +296,13 @@ ipt_do_table(struct sk_buff *skb, const char *indev, *outdev; const void *table_base; struct ipt_entry *e, **jumpstack; - unsigned int *stackptr, origptr, cpu; + unsigned int stackidx, cpu; const struct xt_table_info *private; struct xt_action_param acpar; unsigned int addend; /* Initialization */ + stackidx = 0; ip = ip_hdr(skb); indev = state->in ? state->in->name : nulldevname; outdev = state->out ? state->out->name : nulldevname; @@ -331,13 +332,21 @@ ipt_do_table(struct sk_buff *skb, smp_read_barrier_depends(); table_base = private->entries; jumpstack = (struct ipt_entry **)private->jumpstack[cpu]; - stackptr = per_cpu_ptr(private->stackptr, cpu); - origptr = *stackptr; + + /* Switch to alternate jumpstack if we're being invoked via TEE. + * TEE issues XT_CONTINUE verdict on original skb so we must not + * clobber the jumpstack. + * + * For recursion via REJECT or SYNPROXY the stack will be clobbered + * but it is no problem since absolute verdict is issued by these. + */ + if (static_key_false(&xt_tee_enabled)) + jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated); e = get_entry(table_base, private->hook_entry[hook]); - pr_debug("Entering %s(hook %u); sp at %u (UF %p)\n", - table->name, hook, origptr, + pr_debug("Entering %s(hook %u), UF %p\n", + table->name, hook, get_entry(table_base, private->underflow[hook])); do { @@ -383,28 +392,24 @@ ipt_do_table(struct sk_buff *skb, verdict = (unsigned int)(-v) - 1; break; } - if (*stackptr <= origptr) { + if (stackidx == 0) { e = get_entry(table_base, private->underflow[hook]); pr_debug("Underflow (this is normal) " "to %p\n", e); } else { - e = jumpstack[--*stackptr]; + e = jumpstack[--stackidx]; pr_debug("Pulled %p out from pos %u\n", - e, *stackptr); + e, stackidx); e = ipt_next_entry(e); } continue; } if (table_base + v != ipt_next_entry(e) && !(e->ip.flags & IPT_F_GOTO)) { - if (*stackptr >= private->stacksize) { - verdict = NF_DROP; - break; - } - jumpstack[(*stackptr)++] = e; + jumpstack[stackidx++] = e; pr_debug("Pushed %p into pos %u\n", - e, *stackptr - 1); + e, stackidx - 1); } e = get_entry(table_base, v); @@ -423,9 +428,8 @@ ipt_do_table(struct sk_buff *skb, /* Verdict */ break; } while (!acpar.hotdrop); - pr_debug("Exiting %s; resetting sp from %u to %u\n", - __func__, *stackptr, origptr); - *stackptr = origptr; + pr_debug("Exiting %s; sp at %u\n", __func__, stackidx); + xt_write_recseq_end(addend); local_bh_enable(); diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c index 4bf3dc49a..270765236 100644 --- a/net/ipv4/netfilter/ipt_ECN.c +++ b/net/ipv4/netfilter/ipt_ECN.c @@ -72,7 +72,7 @@ set_ect_tcp(struct sk_buff *skb, const struct ipt_ECN_info *einfo) tcph->cwr = einfo->proto.tcp.cwr; inet_proto_csum_replace2(&tcph->check, skb, - oldval, ((__be16 *)tcph)[6], 0); + oldval, ((__be16 *)tcph)[6], false); return true; } diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index 8618fd150..c4ffc9de1 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -61,9 +61,7 @@ static bool rpfilter_lookup_reverse(struct flowi4 *fl4, if (FIB_RES_DEV(res) == dev) dev_match = true; #endif - if (dev_match || flags & XT_RPFILTER_LOOSE) - return FIB_RES_NH(res).nh_scope <= RT_SCOPE_HOST; - return dev_match; + return dev_match || flags & XT_RPFILTER_LOOSE; } static bool rpfilter_is_local(const struct sk_buff *skb) diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 30ad9554b..8a2caaf39 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -280,7 +280,7 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) return -EINVAL; } - h = nf_conntrack_find_get(sock_net(sk), NF_CT_DEFAULT_ZONE, &tuple); + h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple); if (h) { struct sockaddr_in sin; struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 80d5554b9..cdde3ec49 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -134,9 +134,11 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, struct nf_conntrack_tuple innertuple, origtuple; const struct nf_conntrack_l4proto *innerproto; const struct nf_conntrack_tuple_hash *h; - u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE; + const struct nf_conntrack_zone *zone; + struct nf_conntrack_zone tmp; NF_CT_ASSERT(skb->nfct == NULL); + zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); /* Are they talking about one of our connections? */ if (!nf_ct_get_tuplepr(skb, diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index c88b7d434..9306ec4fa 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -43,22 +43,22 @@ static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user) static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum, struct sk_buff *skb) { - u16 zone = NF_CT_DEFAULT_ZONE; - + u16 zone_id = NF_CT_DEFAULT_ZONE_ID; #if IS_ENABLED(CONFIG_NF_CONNTRACK) - if (skb->nfct) - zone = nf_ct_zone((struct nf_conn *)skb->nfct); -#endif + if (skb->nfct) { + enum ip_conntrack_info ctinfo; + const struct nf_conn *ct = nf_ct_get(skb, &ctinfo); -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - if (skb->nf_bridge && - skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING) - return IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone; + zone_id = nf_ct_zone_id(nf_ct_zone(ct), CTINFO2DIR(ctinfo)); + } #endif + if (nf_bridge_in_prerouting(skb)) + return IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id; + if (hooknum == NF_INET_PRE_ROUTING) - return IP_DEFRAG_CONNTRACK_IN + zone; + return IP_DEFRAG_CONNTRACK_IN + zone_id; else - return IP_DEFRAG_CONNTRACK_OUT + zone; + return IP_DEFRAG_CONNTRACK_OUT + zone_id; } static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops, diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c new file mode 100644 index 000000000..2d79e6e8d --- /dev/null +++ b/net/ipv4/netfilter/nf_dup_ipv4.c @@ -0,0 +1,121 @@ +/* + * (C) 2007 by Sebastian Claßen <sebastian.classen@freenet.ag> + * (C) 2007-2010 by Jan Engelhardt <jengelh@medozas.de> + * + * Extracted from xt_TEE.c + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 or later, as + * published by the Free Software Foundation. + */ +#include <linux/ip.h> +#include <linux/module.h> +#include <linux/percpu.h> +#include <linux/route.h> +#include <linux/skbuff.h> +#include <linux/netfilter.h> +#include <net/checksum.h> +#include <net/icmp.h> +#include <net/ip.h> +#include <net/route.h> +#include <net/netfilter/ipv4/nf_dup_ipv4.h> +#if IS_ENABLED(CONFIG_NF_CONNTRACK) +#include <net/netfilter/nf_conntrack.h> +#endif + +static struct net *pick_net(struct sk_buff *skb) +{ +#ifdef CONFIG_NET_NS + const struct dst_entry *dst; + + if (skb->dev != NULL) + return dev_net(skb->dev); + dst = skb_dst(skb); + if (dst != NULL && dst->dev != NULL) + return dev_net(dst->dev); +#endif + return &init_net; +} + +static bool nf_dup_ipv4_route(struct sk_buff *skb, const struct in_addr *gw, + int oif) +{ + const struct iphdr *iph = ip_hdr(skb); + struct net *net = pick_net(skb); + struct rtable *rt; + struct flowi4 fl4; + + memset(&fl4, 0, sizeof(fl4)); + if (oif != -1) + fl4.flowi4_oif = oif; + + fl4.daddr = gw->s_addr; + fl4.flowi4_tos = RT_TOS(iph->tos); + fl4.flowi4_scope = RT_SCOPE_UNIVERSE; + fl4.flowi4_flags = FLOWI_FLAG_KNOWN_NH; + rt = ip_route_output_key(net, &fl4); + if (IS_ERR(rt)) + return false; + + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + skb->dev = rt->dst.dev; + skb->protocol = htons(ETH_P_IP); + + return true; +} + +void nf_dup_ipv4(struct sk_buff *skb, unsigned int hooknum, + const struct in_addr *gw, int oif) +{ + struct iphdr *iph; + + if (this_cpu_read(nf_skb_duplicated)) + return; + /* + * Copy the skb, and route the copy. Will later return %XT_CONTINUE for + * the original skb, which should continue on its way as if nothing has + * happened. The copy should be independently delivered to the gateway. + */ + skb = pskb_copy(skb, GFP_ATOMIC); + if (skb == NULL) + return; + +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + /* Avoid counting cloned packets towards the original connection. */ + nf_conntrack_put(skb->nfct); + skb->nfct = &nf_ct_untracked_get()->ct_general; + skb->nfctinfo = IP_CT_NEW; + nf_conntrack_get(skb->nfct); +#endif + /* + * If we are in PREROUTING/INPUT, the checksum must be recalculated + * since the length could have changed as a result of defragmentation. + * + * We also decrease the TTL to mitigate potential loops between two + * hosts. + * + * Set %IP_DF so that the original source is notified of a potentially + * decreased MTU on the clone route. IPv6 does this too. + */ + iph = ip_hdr(skb); + iph->frag_off |= htons(IP_DF); + if (hooknum == NF_INET_PRE_ROUTING || + hooknum == NF_INET_LOCAL_IN) + --iph->ttl; + ip_send_check(iph); + + if (nf_dup_ipv4_route(skb, gw, oif)) { + __this_cpu_write(nf_skb_duplicated, true); + ip_local_out(skb); + __this_cpu_write(nf_skb_duplicated, false); + } else { + kfree_skb(skb); + } +} +EXPORT_SYMBOL_GPL(nf_dup_ipv4); + +MODULE_AUTHOR("Sebastian Claßen <sebastian.classen@freenet.ag>"); +MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>"); +MODULE_DESCRIPTION("nf_dup_ipv4: Duplicate IPv4 packet"); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index e59cc05c0..22f4579b0 100644 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -120,7 +120,7 @@ static void nf_nat_ipv4_csum_update(struct sk_buff *skb, oldip = iph->daddr; newip = t->dst.u3.ip; } - inet_proto_csum_replace4(check, skb, oldip, newip, 1); + inet_proto_csum_replace4(check, skb, oldip, newip, true); } static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb, @@ -151,7 +151,7 @@ static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb, } } else inet_proto_csum_replace2(check, skb, - htons(oldlen), htons(datalen), 1); + htons(oldlen), htons(datalen), true); } #if IS_ENABLED(CONFIG_NF_CT_NETLINK) diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c index 4557b4ab8..7b98baa13 100644 --- a/net/ipv4/netfilter/nf_nat_proto_icmp.c +++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c @@ -67,7 +67,7 @@ icmp_manip_pkt(struct sk_buff *skb, hdr = (struct icmphdr *)(skb->data + hdroff); inet_proto_csum_replace2(&hdr->checksum, skb, - hdr->un.echo.id, tuple->src.u.icmp.id, 0); + hdr->un.echo.id, tuple->src.u.icmp.id, false); hdr->un.echo.id = tuple->src.u.icmp.id; return true; } diff --git a/net/ipv4/netfilter/nft_dup_ipv4.c b/net/ipv4/netfilter/nft_dup_ipv4.c new file mode 100644 index 000000000..b45932d43 --- /dev/null +++ b/net/ipv4/netfilter/nft_dup_ipv4.c @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2015 Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/ipv4/nf_dup_ipv4.h> + +struct nft_dup_ipv4 { + enum nft_registers sreg_addr:8; + enum nft_registers sreg_dev:8; +}; + +static void nft_dup_ipv4_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_dup_ipv4 *priv = nft_expr_priv(expr); + struct in_addr gw = { + .s_addr = (__force __be32)regs->data[priv->sreg_addr], + }; + int oif = regs->data[priv->sreg_dev]; + + nf_dup_ipv4(pkt->skb, pkt->ops->hooknum, &gw, oif); +} + +static int nft_dup_ipv4_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_dup_ipv4 *priv = nft_expr_priv(expr); + int err; + + if (tb[NFTA_DUP_SREG_ADDR] == NULL) + return -EINVAL; + + priv->sreg_addr = nft_parse_register(tb[NFTA_DUP_SREG_ADDR]); + err = nft_validate_register_load(priv->sreg_addr, sizeof(struct in_addr)); + if (err < 0) + return err; + + if (tb[NFTA_DUP_SREG_DEV] != NULL) { + priv->sreg_dev = nft_parse_register(tb[NFTA_DUP_SREG_DEV]); + return nft_validate_register_load(priv->sreg_dev, sizeof(int)); + } + return 0; +} + +static int nft_dup_ipv4_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + struct nft_dup_ipv4 *priv = nft_expr_priv(expr); + + if (nft_dump_register(skb, NFTA_DUP_SREG_ADDR, priv->sreg_addr) || + nft_dump_register(skb, NFTA_DUP_SREG_DEV, priv->sreg_dev)) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_dup_ipv4_type; +static const struct nft_expr_ops nft_dup_ipv4_ops = { + .type = &nft_dup_ipv4_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_dup_ipv4)), + .eval = nft_dup_ipv4_eval, + .init = nft_dup_ipv4_init, + .dump = nft_dup_ipv4_dump, +}; + +static const struct nla_policy nft_dup_ipv4_policy[NFTA_DUP_MAX + 1] = { + [NFTA_DUP_SREG_ADDR] = { .type = NLA_U32 }, + [NFTA_DUP_SREG_DEV] = { .type = NLA_U32 }, +}; + +static struct nft_expr_type nft_dup_ipv4_type __read_mostly = { + .family = NFPROTO_IPV4, + .name = "dup", + .ops = &nft_dup_ipv4_ops, + .policy = nft_dup_ipv4_policy, + .maxattr = NFTA_DUP_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_dup_ipv4_module_init(void) +{ + return nft_register_expr(&nft_dup_ipv4_type); +} + +static void __exit nft_dup_ipv4_module_exit(void) +{ + nft_unregister_expr(&nft_dup_ipv4_type); +} + +module_init(nft_dup_ipv4_module_init); +module_exit(nft_dup_ipv4_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); +MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "dup"); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 05ff44b75..e89094ab5 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -363,7 +363,8 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, scoped); rcu_read_unlock(); - if (!(isk->freebind || isk->transparent || has_addr || + if (!(net->ipv6.sysctl.ip_nonlocal_bind || + isk->freebind || isk->transparent || has_addr || addr_type == IPV6_ADDR_ANY)) return -EADDRNOTAVAIL; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index da5d483e2..3abd9d7a3 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -300,6 +300,8 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPACKSkippedChallenge", LINUX_MIB_TCPACKSKIPPEDCHALLENGE), SNMP_MIB_ITEM("TCPWinProbe", LINUX_MIB_TCPWINPROBE), SNMP_MIB_ITEM("TCPKeepAlive", LINUX_MIB_TCPKEEPALIVE), + SNMP_MIB_ITEM("TCPMTUPFail", LINUX_MIB_TCPMTUPFAIL), + SNMP_MIB_ITEM("TCPMTUPSuccess", LINUX_MIB_TCPMTUPSUCCESS), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index e681b852c..c81deb85a 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -91,6 +91,7 @@ #include <linux/slab.h> #include <linux/jhash.h> #include <net/dst.h> +#include <net/dst_metadata.h> #include <net/net_namespace.h> #include <net/protocol.h> #include <net/ip.h> @@ -102,6 +103,7 @@ #include <net/tcp.h> #include <net/icmp.h> #include <net/xfrm.h> +#include <net/lwtunnel.h> #include <net/netevent.h> #include <net/rtnetlink.h> #ifdef CONFIG_SYSCTL @@ -109,6 +111,8 @@ #include <linux/kmemleak.h> #endif #include <net/secure_seq.h> +#include <net/ip_tunnels.h> +#include <net/vrf.h> #define RT_FL_TOS(oldflp4) \ ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)) @@ -834,6 +838,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) struct inet_peer *peer; struct net *net; int log_martians; + int vif; rcu_read_lock(); in_dev = __in_dev_get_rcu(rt->dst.dev); @@ -842,10 +847,11 @@ void ip_rt_send_redirect(struct sk_buff *skb) return; } log_martians = IN_DEV_LOG_MARTIANS(in_dev); + vif = vrf_master_ifindex_rcu(rt->dst.dev); rcu_read_unlock(); net = dev_net(rt->dst.dev); - peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1); + peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1); if (!peer) { icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt_nexthop(rt, ip_hdr(skb)->daddr)); @@ -934,7 +940,8 @@ static int ip_error(struct sk_buff *skb) break; } - peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1); + peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, + vrf_master_ifindex(skb->dev), 1); send = true; if (peer) { @@ -1403,6 +1410,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, #ifdef CONFIG_IP_ROUTE_CLASSID rt->dst.tclassid = nh->nh_tclassid; #endif + rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate); if (unlikely(fnhe)) cached = rt_bind_exception(rt, fnhe, daddr); else if (!(rt->dst.flags & DST_NOCACHE)) @@ -1546,7 +1554,6 @@ static int __mkroute_input(struct sk_buff *skb, struct rtable *rth; int err; struct in_device *out_dev; - unsigned int flags = 0; bool do_cache; u32 itag = 0; @@ -1610,7 +1617,7 @@ static int __mkroute_input(struct sk_buff *skb, } rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev)); - rth->rt_flags = flags; + rth->rt_flags = 0; rth->rt_type = res->type; rth->rt_is_input = 1; rth->rt_iif = 0; @@ -1624,6 +1631,14 @@ static int __mkroute_input(struct sk_buff *skb, rth->dst.output = ip_output; rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag); + if (lwtunnel_output_redirect(rth->dst.lwtstate)) { + rth->dst.lwtstate->orig_output = rth->dst.output; + rth->dst.output = lwtunnel_output; + } + if (lwtunnel_input_redirect(rth->dst.lwtstate)) { + rth->dst.lwtstate->orig_input = rth->dst.input; + rth->dst.input = lwtunnel_input; + } skb_dst_set(skb, &rth->dst); out: err = 0; @@ -1662,6 +1677,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, { struct fib_result res; struct in_device *in_dev = __in_dev_get_rcu(dev); + struct ip_tunnel_info *tun_info; struct flowi4 fl4; unsigned int flags = 0; u32 itag = 0; @@ -1679,6 +1695,13 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, by fib_lookup. */ + tun_info = skb_tunnel_info(skb); + if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX)) + fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id; + else + fl4.flowi4_tun_key.tun_id = 0; + skb_dst_drop(skb); + if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) goto martian_source; @@ -1710,10 +1733,11 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, * Now we are ready to route packet. */ fl4.flowi4_oif = 0; - fl4.flowi4_iif = dev->ifindex; + fl4.flowi4_iif = vrf_master_ifindex_rcu(dev) ? : dev->ifindex; fl4.flowi4_mark = skb->mark; fl4.flowi4_tos = tos; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; + fl4.flowi4_flags = 0; fl4.daddr = daddr; fl4.saddr = saddr; err = fib_lookup(net, &fl4, &res, 0); @@ -1792,6 +1816,7 @@ local_input: rth->rt_gateway = 0; rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); + RT_CACHE_STAT_INC(in_slow_tot); if (res.type == RTN_UNREACHABLE) { rth->dst.input= ip_error; @@ -1981,7 +2006,6 @@ add: rth->rt_gateway = 0; rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); - RT_CACHE_STAT_INC(out_slow_tot); if (flags & RTCF_LOCAL) @@ -2004,6 +2028,8 @@ add: } rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0); + if (lwtunnel_output_redirect(rth->dst.lwtstate)) + rth->dst.output = lwtunnel_output; return rth; } @@ -2020,6 +2046,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) struct fib_result res; struct rtable *rth; int orig_oif; + int err = -ENETUNREACH; res.tclassid = 0; res.fi = NULL; @@ -2110,6 +2137,11 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_HOST); } + if (netif_is_vrf(dev_out) && + !(fl4->flowi4_flags & FLOWI_FLAG_VRFSRC)) { + rth = vrf_dev_get_rth(dev_out); + goto out; + } } if (!fl4->daddr) { @@ -2123,7 +2155,8 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) goto make_route; } - if (fib_lookup(net, fl4, &res, 0)) { + err = fib_lookup(net, fl4, &res, 0); + if (err) { res.fi = NULL; res.table = NULL; if (fl4->flowi4_oif) { @@ -2151,7 +2184,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) res.type = RTN_UNICAST; goto make_route; } - rth = ERR_PTR(-ENETUNREACH); + rth = ERR_PTR(err); goto out; } @@ -2261,7 +2294,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or rt->rt_uses_gateway = ort->rt_uses_gateway; INIT_LIST_HEAD(&rt->rt_uncached); - dst_free(new); } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 0330ab2e2..ade773744 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -29,6 +29,7 @@ static int zero; static int one = 1; static int four = 4; +static int thousand = 1000; static int gso_max_segs = GSO_MAX_SEGS; static int tcp_retr1_max = 255; static int ip_local_port_range_min[] = { 1, 1 }; @@ -47,14 +48,14 @@ static void set_local_port_range(struct net *net, int range[2]) { bool same_parity = !((range[0] ^ range[1]) & 1); - write_seqlock(&net->ipv4.ip_local_ports.lock); + write_seqlock_bh(&net->ipv4.ip_local_ports.lock); if (same_parity && !net->ipv4.ip_local_ports.warned) { net->ipv4.ip_local_ports.warned = true; pr_err_ratelimited("ip_local_port_range: prefer different parity for start/end values.\n"); } net->ipv4.ip_local_ports.range[0] = range[0]; net->ipv4.ip_local_ports.range[1] = range[1]; - write_sequnlock(&net->ipv4.ip_local_ports.lock); + write_sequnlock_bh(&net->ipv4.ip_local_ports.lock); } /* Validate changes from /proc interface. */ @@ -712,6 +713,24 @@ static struct ctl_table ipv4_table[] = { .extra2 = &gso_max_segs, }, { + .procname = "tcp_pacing_ss_ratio", + .data = &sysctl_tcp_pacing_ss_ratio, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &thousand, + }, + { + .procname = "tcp_pacing_ca_ratio", + .data = &sysctl_tcp_pacing_ca_ratio, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &thousand, + }, + { .procname = "tcp_autocorking", .data = &sysctl_tcp_autocorking, .maxlen = sizeof(int), @@ -910,6 +929,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "igmp_link_local_mcast_reports", + .data = &sysctl_igmp_llm_reports, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f6afbdc99..631540add 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -628,6 +628,8 @@ static void skb_entail(struct sock *sk, struct sk_buff *skb) sk_mem_charge(sk, skb->truesize); if (tp->nonagle & TCP_NAGLE_PUSH) tp->nonagle &= ~TCP_NAGLE_PUSH; + + tcp_slow_start_after_idle_check(sk); } static inline void tcp_mark_urg(struct tcp_sock *tp, int flags) diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index c037644ea..fd1405d37 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -146,7 +146,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) if (!tcp_is_cwnd_limited(sk)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) + if (tcp_in_slow_start(tp)) tcp_slow_start(tp, acked); else { bictcp_update(ca, tp->snd_cwnd); diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c index 8c6fd3d5e..167b6a3e1 100644 --- a/net/ipv4/tcp_cdg.c +++ b/net/ipv4/tcp_cdg.c @@ -264,7 +264,7 @@ static void tcp_cdg_cong_avoid(struct sock *sk, u32 ack, u32 acked) u32 prior_snd_cwnd; u32 incr; - if (tp->snd_cwnd < tp->snd_ssthresh && hystart_detect) + if (tcp_in_slow_start(tp) && hystart_detect) tcp_cdg_hystart_update(sk); if (after(ack, ca->rtt_seq) && ca->rtt.v64) { diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 84be008c9..93c4dc3ab 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -114,16 +114,19 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) } EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); -u32 tcp_ca_get_key_by_name(const char *name) +u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca) { const struct tcp_congestion_ops *ca; - u32 key; + u32 key = TCP_CA_UNSPEC; might_sleep(); rcu_read_lock(); ca = __tcp_ca_find_autoload(name); - key = ca ? ca->key : TCP_CA_UNSPEC; + if (ca) { + key = ca->key; + *ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN; + } rcu_read_unlock(); return key; @@ -365,10 +368,8 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) */ u32 tcp_slow_start(struct tcp_sock *tp, u32 acked) { - u32 cwnd = tp->snd_cwnd + acked; + u32 cwnd = min(tp->snd_cwnd + acked, tp->snd_ssthresh); - if (cwnd > tp->snd_ssthresh) - cwnd = tp->snd_ssthresh + 1; acked -= cwnd - tp->snd_cwnd; tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); @@ -413,7 +414,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked) return; /* In "safe" area, increase. */ - if (tp->snd_cwnd <= tp->snd_ssthresh) { + if (tcp_in_slow_start(tp)) { acked = tcp_slow_start(tp, acked); if (!acked) return; diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 4f0c402a1..448c2615f 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -154,14 +154,20 @@ static void bictcp_init(struct sock *sk) static void bictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event) { if (event == CA_EVENT_TX_START) { - s32 delta = tcp_time_stamp - tcp_sk(sk)->lsndtime; struct bictcp *ca = inet_csk_ca(sk); + u32 now = tcp_time_stamp; + s32 delta; + + delta = now - tcp_sk(sk)->lsndtime; /* We were application limited (idle) for a while. * Shift epoch_start to keep cwnd growth to cubic curve. */ - if (ca->epoch_start && delta > 0) + if (ca->epoch_start && delta > 0) { ca->epoch_start += delta; + if (after(ca->epoch_start, now)) + ca->epoch_start = now; + } return; } } @@ -335,7 +341,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) if (!tcp_is_cwnd_limited(sk)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) { + if (tcp_in_slow_start(tp)) { if (hystart && after(ack, ca->end_seq)) bictcp_hystart_reset(sk); acked = tcp_slow_start(tp, acked); @@ -454,7 +460,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us) ca->delay_min = delay; /* hystart triggers when cwnd is larger than some threshold */ - if (hystart && tp->snd_cwnd <= tp->snd_ssthresh && + if (hystart && tcp_in_slow_start(tp) && tp->snd_cwnd >= hystart_low_window) hystart_update(sk, delay); } diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index 7092a61c4..7e538f71f 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -209,7 +209,7 @@ static void dctcp_update_alpha(struct sock *sk, u32 flags) /* alpha = (1 - g) * alpha + g * F */ - alpha -= alpha >> dctcp_shift_g; + alpha -= min_not_zero(alpha, alpha >> dctcp_shift_g); if (bytes_ecn) { /* If dctcp_shift_g == 1, a 32bit value would overflow * after 8 Mbytes. diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index 882c08aae..db7842495 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -116,7 +116,7 @@ static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) if (!tcp_is_cwnd_limited(sk)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) + if (tcp_in_slow_start(tp)) tcp_slow_start(tp, acked); else { /* Update AIMD parameters. diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 58469fff6..82f0d9ed6 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -236,7 +236,7 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) if (!tcp_is_cwnd_limited(sk)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) + if (tcp_in_slow_start(tp)) tcp_slow_start(tp, acked); else { /* In dangerous area, increase slowly. diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c index f963b274f..083831e35 100644 --- a/net/ipv4/tcp_hybla.c +++ b/net/ipv4/tcp_hybla.c @@ -112,7 +112,7 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked) rho_fractions = ca->rho_3ls - (ca->rho << 3); - if (tp->snd_cwnd < tp->snd_ssthresh) { + if (tcp_in_slow_start(tp)) { /* * slow start * INC = 2^RHO - 1 diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index f71002e4d..2ab9bbb6f 100644 --- a/net/ipv4/tcp_illinois.c +++ b/net/ipv4/tcp_illinois.c @@ -268,7 +268,7 @@ static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked) return; /* In slow start */ - if (tp->snd_cwnd <= tp->snd_ssthresh) + if (tcp_in_slow_start(tp)) tcp_slow_start(tp, acked); else { diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index bf0636da8..3d12c5e94 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -112,6 +112,7 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; #define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */ #define FLAG_DATA_SACKED 0x20 /* New SACK. */ #define FLAG_ECE 0x40 /* ECE in this ACK */ +#define FLAG_LOST_RETRANS 0x80 /* This ACK marks some retransmission lost */ #define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ #define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */ #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ @@ -199,11 +200,13 @@ static void tcp_enter_quickack_mode(struct sock *sk) * and the session is not interactive. */ -static inline bool tcp_in_quickack_mode(const struct sock *sk) +static bool tcp_in_quickack_mode(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); + const struct dst_entry *dst = __sk_dst_get(sk); - return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong; + return (dst && dst_metric(dst, RTAX_QUICKACK)) || + (icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong); } static void tcp_ecn_queue_cwr(struct tcp_sock *tp) @@ -753,13 +756,29 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) * TCP pacing, to smooth the burst on large writes when packets * in flight is significantly lower than cwnd (or rwin) */ +int sysctl_tcp_pacing_ss_ratio __read_mostly = 200; +int sysctl_tcp_pacing_ca_ratio __read_mostly = 120; + static void tcp_update_pacing_rate(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); u64 rate; /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */ - rate = (u64)tp->mss_cache * 2 * (USEC_PER_SEC << 3); + rate = (u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3); + + /* current rate is (cwnd * mss) / srtt + * In Slow Start [1], set sk_pacing_rate to 200 % the current rate. + * In Congestion Avoidance phase, set it to 120 % the current rate. + * + * [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh) + * If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching + * end of slow start and should slow down. + */ + if (tp->snd_cwnd < tp->snd_ssthresh / 2) + rate *= sysctl_tcp_pacing_ss_ratio; + else + rate *= sysctl_tcp_pacing_ca_ratio; rate *= max(tp->snd_cwnd, tp->packets_out); @@ -1040,7 +1059,7 @@ static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack, * highest SACK block). Also calculate the lowest snd_nxt among the remaining * retransmitted skbs to avoid some costly processing per ACKs. */ -static void tcp_mark_lost_retrans(struct sock *sk) +static void tcp_mark_lost_retrans(struct sock *sk, int *flag) { const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -1081,7 +1100,7 @@ static void tcp_mark_lost_retrans(struct sock *sk) if (after(received_upto, ack_seq)) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); - + *flag |= FLAG_LOST_RETRANS; tcp_skb_mark_lost_uncond_verify(tp, skb); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT); } else { @@ -1821,7 +1840,7 @@ advance_sp: ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker)) tcp_update_reordering(sk, tp->fackets_out - state->reord, 0); - tcp_mark_lost_retrans(sk); + tcp_mark_lost_retrans(sk, &state->flag); tcp_verify_left_out(tp); out: @@ -2477,15 +2496,14 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) return false; } -/* The cwnd reduction in CWR and Recovery use the PRR algorithm - * https://datatracker.ietf.org/doc/draft-ietf-tcpm-proportional-rate-reduction/ +/* The cwnd reduction in CWR and Recovery uses the PRR algorithm in RFC 6937. * It computes the number of packets to send (sndcnt) based on packets newly * delivered: * 1) If the packets in flight is larger than ssthresh, PRR spreads the * cwnd reductions across a full RTT. - * 2) If packets in flight is lower than ssthresh (such as due to excess - * losses and/or application stalls), do not perform any further cwnd - * reductions, but instead slow start up to ssthresh. + * 2) Otherwise PRR uses packet conservation to send as much as delivered. + * But when the retransmits are acked without further losses, PRR + * slow starts cwnd up to ssthresh to speed up the recovery. */ static void tcp_init_cwnd_reduction(struct sock *sk) { @@ -2502,7 +2520,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk) } static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked, - int fast_rexmit) + int fast_rexmit, int flag) { struct tcp_sock *tp = tcp_sk(sk); int sndcnt = 0; @@ -2511,16 +2529,18 @@ static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked, (tp->packets_out - tp->sacked_out); tp->prr_delivered += newly_acked_sacked; - if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) { + if (delta < 0) { u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered + tp->prior_cwnd - 1; sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out; - } else { + } else if ((flag & FLAG_RETRANS_DATA_ACKED) && + !(flag & FLAG_LOST_RETRANS)) { sndcnt = min_t(int, delta, max_t(int, tp->prr_delivered - tp->prr_out, newly_acked_sacked) + 1); + } else { + sndcnt = min(delta, newly_acked_sacked); } - sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0)); tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt; } @@ -2581,7 +2601,7 @@ static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked) if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { tcp_try_keep_open(sk); } else { - tcp_cwnd_reduction(sk, prior_unsacked, 0); + tcp_cwnd_reduction(sk, prior_unsacked, 0, flag); } } @@ -2591,6 +2611,7 @@ static void tcp_mtup_probe_failed(struct sock *sk) icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1; icsk->icsk_mtup.probe_size = 0; + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMTUPFAIL); } static void tcp_mtup_probe_success(struct sock *sk) @@ -2610,6 +2631,7 @@ static void tcp_mtup_probe_success(struct sock *sk) icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size; icsk->icsk_mtup.probe_size = 0; tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS); } /* Do a simple retransmit without using the backoff mechanisms in @@ -2678,7 +2700,7 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack) tp->prior_ssthresh = 0; tcp_init_undo(tp); - if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { + if (!tcp_in_cwnd_reduction(sk)) { if (!ece_ack) tp->prior_ssthresh = tcp_current_ssthresh(sk); tcp_init_cwnd_reduction(sk); @@ -2738,7 +2760,7 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) /* Undo during fast recovery after partial ACK. */ static bool tcp_try_undo_partial(struct sock *sk, const int acked, - const int prior_unsacked) + const int prior_unsacked, int flag) { struct tcp_sock *tp = tcp_sk(sk); @@ -2754,7 +2776,7 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked, * mark more packets lost or retransmit more. */ if (tp->retrans_out) { - tcp_cwnd_reduction(sk, prior_unsacked, 0); + tcp_cwnd_reduction(sk, prior_unsacked, 0, flag); return true; } @@ -2841,7 +2863,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, if (tcp_is_reno(tp) && is_dupack) tcp_add_reno_sack(sk); } else { - if (tcp_try_undo_partial(sk, acked, prior_unsacked)) + if (tcp_try_undo_partial(sk, acked, prior_unsacked, flag)) return; /* Partial ACK arrived. Force fast retransmit. */ do_lost = tcp_is_reno(tp) || @@ -2854,9 +2876,10 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, break; case TCP_CA_Loss: tcp_process_loss(sk, flag, is_dupack); - if (icsk->icsk_ca_state != TCP_CA_Open) + if (icsk->icsk_ca_state != TCP_CA_Open && + !(flag & FLAG_LOST_RETRANS)) return; - /* Fall through to processing in Open state. */ + /* Change state if cwnd is undone or retransmits are lost */ default: if (tcp_is_reno(tp)) { if (flag & FLAG_SND_UNA_ADVANCED) @@ -2891,7 +2914,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, if (do_lost) tcp_update_scoreboard(sk, fast_rexmit); - tcp_cwnd_reduction(sk, prior_unsacked, fast_rexmit); + tcp_cwnd_reduction(sk, prior_unsacked, fast_rexmit, flag); tcp_xmit_retransmit_queue(sk); } @@ -3328,6 +3351,9 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 tp->pred_flags = 0; tcp_fast_path_check(sk); + if (tcp_send_head(sk)) + tcp_slow_start_after_idle_check(sk); + if (nwin > tp->max_window) { tp->max_window = nwin; tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie); @@ -3565,10 +3591,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) &sack_state); acked -= tp->packets_out; - /* Advance cwnd if state allows */ - if (tcp_may_raise_cwnd(sk, flag)) - tcp_cong_avoid(sk, ack, acked); - if (tcp_ack_is_dubious(sk, flag)) { is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); tcp_fastretrans_alert(sk, acked, prior_unsacked, @@ -3577,6 +3599,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); + /* Advance cwnd if state allows */ + if (tcp_may_raise_cwnd(sk, flag)) + tcp_cong_avoid(sk, ack, acked); + if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) { struct dst_entry *dst = __sk_dst_get(sk); if (dst) @@ -3991,7 +4017,6 @@ void tcp_reset(struct sock *sk) static void tcp_fin(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - const struct dst_entry *dst; inet_csk_schedule_ack(sk); @@ -4003,9 +4028,7 @@ static void tcp_fin(struct sock *sk) case TCP_ESTABLISHED: /* Move to CLOSE_WAIT */ tcp_set_state(sk, TCP_CLOSE_WAIT); - dst = __sk_dst_get(sk); - if (!dst || !dst_metric(dst, RTAX_QUICKACK)) - inet_csk(sk)->icsk_ack.pingpong = 1; + inet_csk(sk)->icsk_ack.pingpong = 1; break; case TCP_CLOSE_WAIT: @@ -6066,14 +6089,17 @@ static void tcp_ecn_create_request(struct request_sock *req, const struct net *net = sock_net(listen_sk); bool th_ecn = th->ece && th->cwr; bool ect, ecn_ok; + u32 ecn_ok_dst; if (!th_ecn) return; ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield); - ecn_ok = net->ipv4.sysctl_tcp_ecn || dst_feature(dst, RTAX_FEATURE_ECN); + ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK); + ecn_ok = net->ipv4.sysctl_tcp_ecn || ecn_ok_dst; - if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk)) + if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk) || + (ecn_ok_dst & DST_FEATURE_ECN_CA)) inet_rsk(req)->ecn_ok = 1; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 1eef463f8..42562a7f8 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -223,7 +223,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (err) goto failure; - inet_set_txhash(sk); + sk_set_txhash(sk); rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, inet->inet_sport, inet->inet_dport, sk); @@ -1293,7 +1293,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newinet->mc_ttl = ip_hdr(skb)->ttl; newinet->rcv_tos = ip_hdr(skb)->tos; inet_csk(newsk)->icsk_ext_hdr_len = 0; - inet_set_txhash(newsk); + sk_set_txhash(newsk); if (inet_opt) inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; newinet->inet_id = newtp->write_seq ^ jiffies; @@ -1710,8 +1710,7 @@ do_time_wait: iph->daddr, th->dest, inet_iif(skb)); if (sk2) { - inet_twsk_deschedule(inet_twsk(sk)); - inet_twsk_put(inet_twsk(sk)); + inet_twsk_deschedule_put(inet_twsk(sk)); sk = sk2; goto process; } diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index a51d63a43..c8cbc2b4b 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -81,11 +81,7 @@ static void tcp_metric_set(struct tcp_metrics_block *tm, static bool addr_same(const struct inetpeer_addr *a, const struct inetpeer_addr *b) { - if (a->family != b->family) - return false; - if (a->family == AF_INET) - return a->addr.a4 == b->addr.a4; - return ipv6_addr_equal(&a->addr.in6, &b->addr.in6); + return inetpeer_addr_cmp(a, b) == 0; } struct tcpm_hash_bucket { @@ -247,14 +243,14 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, daddr.family = req->rsk_ops->family; switch (daddr.family) { case AF_INET: - saddr.addr.a4 = inet_rsk(req)->ir_loc_addr; - daddr.addr.a4 = inet_rsk(req)->ir_rmt_addr; - hash = (__force unsigned int) daddr.addr.a4; + inetpeer_set_addr_v4(&saddr, inet_rsk(req)->ir_loc_addr); + inetpeer_set_addr_v4(&daddr, inet_rsk(req)->ir_rmt_addr); + hash = ipv4_addr_hash(inet_rsk(req)->ir_rmt_addr); break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - saddr.addr.in6 = inet_rsk(req)->ir_v6_loc_addr; - daddr.addr.in6 = inet_rsk(req)->ir_v6_rmt_addr; + inetpeer_set_addr_v6(&saddr, &inet_rsk(req)->ir_v6_loc_addr); + inetpeer_set_addr_v6(&daddr, &inet_rsk(req)->ir_v6_rmt_addr); hash = ipv6_addr_hash(&inet_rsk(req)->ir_v6_rmt_addr); break; #endif @@ -285,25 +281,19 @@ static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock struct net *net; if (tw->tw_family == AF_INET) { - saddr.family = AF_INET; - saddr.addr.a4 = tw->tw_rcv_saddr; - daddr.family = AF_INET; - daddr.addr.a4 = tw->tw_daddr; - hash = (__force unsigned int) daddr.addr.a4; + inetpeer_set_addr_v4(&saddr, tw->tw_rcv_saddr); + inetpeer_set_addr_v4(&daddr, tw->tw_daddr); + hash = ipv4_addr_hash(tw->tw_daddr); } #if IS_ENABLED(CONFIG_IPV6) else if (tw->tw_family == AF_INET6) { if (ipv6_addr_v4mapped(&tw->tw_v6_daddr)) { - saddr.family = AF_INET; - saddr.addr.a4 = tw->tw_rcv_saddr; - daddr.family = AF_INET; - daddr.addr.a4 = tw->tw_daddr; - hash = (__force unsigned int) daddr.addr.a4; + inetpeer_set_addr_v4(&saddr, tw->tw_rcv_saddr); + inetpeer_set_addr_v4(&daddr, tw->tw_daddr); + hash = ipv4_addr_hash(tw->tw_daddr); } else { - saddr.family = AF_INET6; - saddr.addr.in6 = tw->tw_v6_rcv_saddr; - daddr.family = AF_INET6; - daddr.addr.in6 = tw->tw_v6_daddr; + inetpeer_set_addr_v6(&saddr, &tw->tw_v6_rcv_saddr); + inetpeer_set_addr_v6(&daddr, &tw->tw_v6_daddr); hash = ipv6_addr_hash(&tw->tw_v6_daddr); } } @@ -335,25 +325,19 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, struct net *net; if (sk->sk_family == AF_INET) { - saddr.family = AF_INET; - saddr.addr.a4 = inet_sk(sk)->inet_saddr; - daddr.family = AF_INET; - daddr.addr.a4 = inet_sk(sk)->inet_daddr; - hash = (__force unsigned int) daddr.addr.a4; + inetpeer_set_addr_v4(&saddr, inet_sk(sk)->inet_saddr); + inetpeer_set_addr_v4(&daddr, inet_sk(sk)->inet_daddr); + hash = ipv4_addr_hash(inet_sk(sk)->inet_daddr); } #if IS_ENABLED(CONFIG_IPV6) else if (sk->sk_family == AF_INET6) { if (ipv6_addr_v4mapped(&sk->sk_v6_daddr)) { - saddr.family = AF_INET; - saddr.addr.a4 = inet_sk(sk)->inet_saddr; - daddr.family = AF_INET; - daddr.addr.a4 = inet_sk(sk)->inet_daddr; - hash = (__force unsigned int) daddr.addr.a4; + inetpeer_set_addr_v4(&saddr, inet_sk(sk)->inet_saddr); + inetpeer_set_addr_v4(&daddr, inet_sk(sk)->inet_daddr); + hash = ipv4_addr_hash(inet_sk(sk)->inet_daddr); } else { - saddr.family = AF_INET6; - saddr.addr.in6 = sk->sk_v6_rcv_saddr; - daddr.family = AF_INET6; - daddr.addr.in6 = sk->sk_v6_daddr; + inetpeer_set_addr_v6(&saddr, &sk->sk_v6_rcv_saddr); + inetpeer_set_addr_v6(&daddr, &sk->sk_v6_daddr); hash = ipv6_addr_hash(&sk->sk_v6_daddr); } } @@ -461,7 +445,7 @@ void tcp_update_metrics(struct sock *sk) tcp_metric_set(tm, TCP_METRIC_CWND, tp->snd_cwnd); } - } else if (tp->snd_cwnd > tp->snd_ssthresh && + } else if (!tcp_in_slow_start(tp) && icsk->icsk_ca_state == TCP_CA_Open) { /* Cong. avoidance phase, cwnd is reliable. */ if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) @@ -796,18 +780,18 @@ static int tcp_metrics_fill_info(struct sk_buff *msg, switch (tm->tcpm_daddr.family) { case AF_INET: if (nla_put_in_addr(msg, TCP_METRICS_ATTR_ADDR_IPV4, - tm->tcpm_daddr.addr.a4) < 0) + inetpeer_get_addr_v4(&tm->tcpm_daddr)) < 0) goto nla_put_failure; if (nla_put_in_addr(msg, TCP_METRICS_ATTR_SADDR_IPV4, - tm->tcpm_saddr.addr.a4) < 0) + inetpeer_get_addr_v4(&tm->tcpm_saddr)) < 0) goto nla_put_failure; break; case AF_INET6: if (nla_put_in6_addr(msg, TCP_METRICS_ATTR_ADDR_IPV6, - &tm->tcpm_daddr.addr.in6) < 0) + inetpeer_get_addr_v6(&tm->tcpm_daddr)) < 0) goto nla_put_failure; if (nla_put_in6_addr(msg, TCP_METRICS_ATTR_SADDR_IPV6, - &tm->tcpm_saddr.addr.in6) < 0) + inetpeer_get_addr_v6(&tm->tcpm_saddr)) < 0) goto nla_put_failure; break; default: @@ -956,20 +940,21 @@ static int __parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr, a = info->attrs[v4]; if (a) { - addr->family = AF_INET; - addr->addr.a4 = nla_get_in_addr(a); + inetpeer_set_addr_v4(addr, nla_get_in_addr(a)); if (hash) - *hash = (__force unsigned int) addr->addr.a4; + *hash = ipv4_addr_hash(inetpeer_get_addr_v4(addr)); return 0; } a = info->attrs[v6]; if (a) { + struct in6_addr in6; + if (nla_len(a) != sizeof(struct in6_addr)) return -EINVAL; - addr->family = AF_INET6; - addr->addr.in6 = nla_get_in6_addr(a); + in6 = nla_get_in6_addr(a); + inetpeer_set_addr_v6(addr, &in6); if (hash) - *hash = ipv6_addr_hash(&addr->addr.in6); + *hash = ipv6_addr_hash(inetpeer_get_addr_v6(addr)); return 0; } return optional ? 1 : -EAFNOSUPPORT; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 4bc00cb79..def765911 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -147,8 +147,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, if (!th->fin || TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) { kill_with_rst: - inet_twsk_deschedule(tw); - inet_twsk_put(tw); + inet_twsk_deschedule_put(tw); return TCP_TW_RST; } @@ -163,9 +162,9 @@ kill_with_rst: if (tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp && tcp_tw_remember_stamp(tw)) - inet_twsk_schedule(tw, tw->tw_timeout); + inet_twsk_reschedule(tw, tw->tw_timeout); else - inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN); + inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); return TCP_TW_ACK; } @@ -198,12 +197,11 @@ kill_with_rst: */ if (sysctl_tcp_rfc1337 == 0) { kill: - inet_twsk_deschedule(tw); - inet_twsk_put(tw); + inet_twsk_deschedule_put(tw); return TCP_TW_SUCCESS; } } - inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN); + inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); if (tmp_opt.saw_tstamp) { tcptw->tw_ts_recent = tmp_opt.rcv_tsval; @@ -253,7 +251,7 @@ kill: * Do not reschedule in the last case. */ if (paws_reject || th->ack) - inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN); + inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); return tcp_timewait_check_oow_rate_limit( tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT); @@ -324,9 +322,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) } while (0); #endif - /* Linkage updates. */ - __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); - /* Get the TIME_WAIT timeout firing. */ if (timeo < rto) timeo = rto; @@ -340,6 +335,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) } inet_twsk_schedule(tw, timeo); + /* Linkage updates. */ + __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); inet_twsk_put(tw); } else { /* Sorry, if we're out of memory, just CLOSE this diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 81c640fe0..f5ecbbbc6 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -137,12 +137,12 @@ static __u16 tcp_advertise_mss(struct sock *sk) } /* RFC2861. Reset CWND after idle period longer RTO to "restart window". - * This is the first part of cwnd validation mechanism. */ -static void tcp_cwnd_restart(struct sock *sk, const struct dst_entry *dst) + * This is the first part of cwnd validation mechanism. + */ +void tcp_cwnd_restart(struct sock *sk, s32 delta) { struct tcp_sock *tp = tcp_sk(sk); - s32 delta = tcp_time_stamp - tp->lsndtime; - u32 restart_cwnd = tcp_init_cwnd(tp, dst); + u32 restart_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk)); u32 cwnd = tp->snd_cwnd; tcp_ca_event(sk, CA_EVENT_CWND_RESTART); @@ -163,20 +163,17 @@ static void tcp_event_data_sent(struct tcp_sock *tp, { struct inet_connection_sock *icsk = inet_csk(sk); const u32 now = tcp_time_stamp; - const struct dst_entry *dst = __sk_dst_get(sk); - if (sysctl_tcp_slow_start_after_idle && - (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto)) - tcp_cwnd_restart(sk, __sk_dst_get(sk)); + if (tcp_packets_in_flight(tp) == 0) + tcp_ca_event(sk, CA_EVENT_TX_START); tp->lsndtime = now; /* If it is a reply for ato after last received * packet, enter pingpong mode. */ - if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato && - (!dst || !dst_metric(dst, RTAX_QUICKACK))) - icsk->icsk_ack.pingpong = 1; + if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato) + icsk->icsk_ack.pingpong = 1; } /* Account for an ACK we sent. */ @@ -953,9 +950,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, &md5); tcp_header_size = tcp_options_size + sizeof(struct tcphdr); - if (tcp_packets_in_flight(tp) == 0) - tcp_ca_event(sk, CA_EVENT_TX_START); - /* if no packet is in qdisc/device queue, then allow XPS to select * another queue. We can be called from tcp_tsq_handler() * which holds one reference to sk_wmem_alloc. @@ -1783,7 +1777,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto send_now; - if (!((1 << icsk->icsk_ca_state) & (TCPF_CA_Open | TCPF_CA_CWR))) + if (icsk->icsk_ca_state >= TCP_CA_Recovery) goto send_now; /* Avoid bursty behavior by allowing defer @@ -2158,7 +2152,7 @@ repair: tcp_cwnd_validate(sk, is_cwnd_limited); return false; } - return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk)); + return !tp->packets_out && tcp_send_head(sk); } bool tcp_schedule_loss_probe(struct sock *sk) @@ -2235,7 +2229,7 @@ static bool skb_still_in_host_queue(const struct sock *sk, return false; } -/* When probe timeout (PTO) fires, send a new segment if one exists, else +/* When probe timeout (PTO) fires, try send a new segment if possible, else * retransmit the last segment. */ void tcp_send_loss_probe(struct sock *sk) @@ -2244,11 +2238,19 @@ void tcp_send_loss_probe(struct sock *sk) struct sk_buff *skb; int pcount; int mss = tcp_current_mss(sk); - int err = -1; - if (tcp_send_head(sk)) { - err = tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC); - goto rearm_timer; + skb = tcp_send_head(sk); + if (skb) { + if (tcp_snd_wnd_test(tp, skb, mss)) { + pcount = tp->packets_out; + tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC); + if (tp->packets_out > pcount) + goto probe_sent; + goto rearm_timer; + } + skb = tcp_write_queue_prev(sk, skb); + } else { + skb = tcp_write_queue_tail(sk); } /* At most one outstanding TLP retransmission. */ @@ -2256,7 +2258,6 @@ void tcp_send_loss_probe(struct sock *sk) goto rearm_timer; /* Retransmit last segment. */ - skb = tcp_write_queue_tail(sk); if (WARN_ON(!skb)) goto rearm_timer; @@ -2271,26 +2272,24 @@ void tcp_send_loss_probe(struct sock *sk) if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss, GFP_ATOMIC))) goto rearm_timer; - skb = tcp_write_queue_tail(sk); + skb = tcp_write_queue_next(sk, skb); } if (WARN_ON(!skb || !tcp_skb_pcount(skb))) goto rearm_timer; - err = __tcp_retransmit_skb(sk, skb); + if (__tcp_retransmit_skb(sk, skb)) + goto rearm_timer; /* Record snd_nxt for loss detection. */ - if (likely(!err)) - tp->tlp_high_seq = tp->snd_nxt; + tp->tlp_high_seq = tp->snd_nxt; +probe_sent: + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSPROBES); + /* Reset s.t. tcp_rearm_rto will restart timer from now */ + inet_csk(sk)->icsk_pending = 0; rearm_timer: - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - inet_csk(sk)->icsk_rto, - TCP_RTO_MAX); - - if (likely(!err)) - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPLOSSPROBES); + tcp_rearm_rto(sk); } /* Push out any pending frames which were held back due to @@ -3421,7 +3420,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib) */ tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK); skb_mstamp_get(&skb->skb_mstamp); - NET_INC_STATS_BH(sock_net(sk), mib); + NET_INC_STATS(sock_net(sk), mib); return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC); } diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index 333bcb241..bf5ea9e9b 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -22,7 +22,7 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked) if (!tcp_is_cwnd_limited(sk)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) + if (tcp_in_slow_start(tp)) tcp_slow_start(tp, acked); else tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT), diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 5b752f58a..7149ebc82 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -649,4 +649,3 @@ void tcp_init_xmit_timers(struct sock *sk) inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer, &tcp_keepalive_timer); } -EXPORT_SYMBOL(tcp_init_xmit_timers); diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index a6cea1d5e..13951c408 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -225,7 +225,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) */ diff = tp->snd_cwnd * (rtt-vegas->baseRTT) / vegas->baseRTT; - if (diff > gamma && tp->snd_cwnd <= tp->snd_ssthresh) { + if (diff > gamma && tcp_in_slow_start(tp)) { /* Going too fast. Time to slow down * and switch to congestion avoidance. */ @@ -240,7 +240,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) tp->snd_cwnd = min(tp->snd_cwnd, (u32)target_cwnd+1); tp->snd_ssthresh = tcp_vegas_ssthresh(tp); - } else if (tp->snd_cwnd <= tp->snd_ssthresh) { + } else if (tcp_in_slow_start(tp)) { /* Slow start. */ tcp_slow_start(tp, acked); } else { @@ -281,7 +281,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) vegas->minRTT = 0x7fffffff; } /* Use normal slow start */ - else if (tp->snd_cwnd <= tp->snd_ssthresh) + else if (tcp_in_slow_start(tp)) tcp_slow_start(tp, acked); } diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index 112151eee..0d094b995 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -150,7 +150,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked) veno->diff = (tp->snd_cwnd << V_PARAM_SHIFT) - target_cwnd; - if (tp->snd_cwnd <= tp->snd_ssthresh) { + if (tcp_in_slow_start(tp)) { /* Slow start. */ tcp_slow_start(tp, acked); } else { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 1b8c5ba7d..f7d1d5e19 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1013,11 +1013,32 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (!rt) { struct net *net = sock_net(sk); + __u8 flow_flags = inet_sk_flowi_flags(sk); fl4 = &fl4_stack; + + /* unconnected socket. If output device is enslaved to a VRF + * device lookup source address from VRF table. This mimics + * behavior of ip_route_connect{_init}. + */ + if (netif_index_is_vrf(net, ipc.oif)) { + flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos, + RT_SCOPE_UNIVERSE, sk->sk_protocol, + (flow_flags | FLOWI_FLAG_VRFSRC | + FLOWI_FLAG_SKIP_NH_OIF), + faddr, saddr, dport, + inet->inet_sport); + + rt = ip_route_output_flow(net, fl4, sk); + if (!IS_ERR(rt)) { + saddr = fl4->saddr; + ip_rt_put(rt); + } + } + flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, sk->sk_protocol, - inet_sk_flowi_flags(sk), + flow_flags, faddr, saddr, dport, inet->inet_sport); security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 933ea903f..aba428626 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -4,9 +4,10 @@ #include <linux/udp.h> #include <linux/types.h> #include <linux/kernel.h> +#include <net/dst_metadata.h> +#include <net/net_namespace.h> #include <net/udp.h> #include <net/udp_tunnel.h> -#include <net/net_namespace.h> int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, struct socket **sockp) @@ -103,4 +104,26 @@ void udp_tunnel_sock_release(struct socket *sock) } EXPORT_SYMBOL_GPL(udp_tunnel_sock_release); +struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb, unsigned short family, + __be16 flags, __be64 tunnel_id, int md_size) +{ + struct metadata_dst *tun_dst; + struct ip_tunnel_info *info; + + if (family == AF_INET) + tun_dst = ip_tun_rx_dst(skb, flags, tunnel_id, md_size); + else + tun_dst = ipv6_tun_rx_dst(skb, flags, tunnel_id, md_size); + if (!tun_dst) + return NULL; + + info = &tun_dst->u.tun_info; + info->key.tp_src = udp_hdr(skb)->source; + info->key.tp_dst = udp_hdr(skb)->dest; + if (udp_hdr(skb)->check) + info->key.tun_flags |= TUNNEL_CSUM; + return tun_dst; +} +EXPORT_SYMBOL_GPL(udp_tun_rx_dst); + MODULE_LICENSE("GPL"); diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 2878dbfff..41a261355 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -30,6 +30,8 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb) mtu = dst_mtu(skb_dst(skb)); if (skb->len > mtu) { + skb->protocol = htons(ETH_P_IP); + if (skb->sk) xfrm_local_error(skb, mtu); else diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index bff69746e..c10a9ee68 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -15,11 +15,12 @@ #include <net/dst.h> #include <net/xfrm.h> #include <net/ip.h> +#include <net/vrf.h> static struct xfrm_policy_afinfo xfrm4_policy_afinfo; static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, - int tos, + int tos, int oif, const xfrm_address_t *saddr, const xfrm_address_t *daddr) { @@ -28,9 +29,12 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, memset(fl4, 0, sizeof(*fl4)); fl4->daddr = daddr->a4; fl4->flowi4_tos = tos; + fl4->flowi4_oif = oif; if (saddr) fl4->saddr = saddr->a4; + fl4->flowi4_flags = FLOWI_FLAG_SKIP_NH_OIF; + rt = __ip_route_output_key(net, fl4); if (!IS_ERR(rt)) return &rt->dst; @@ -38,22 +42,22 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, return ERR_CAST(rt); } -static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos, +static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos, int oif, const xfrm_address_t *saddr, const xfrm_address_t *daddr) { struct flowi4 fl4; - return __xfrm4_dst_lookup(net, &fl4, tos, saddr, daddr); + return __xfrm4_dst_lookup(net, &fl4, tos, oif, saddr, daddr); } -static int xfrm4_get_saddr(struct net *net, +static int xfrm4_get_saddr(struct net *net, int oif, xfrm_address_t *saddr, xfrm_address_t *daddr) { struct dst_entry *dst; struct flowi4 fl4; - dst = __xfrm4_dst_lookup(net, &fl4, 0, NULL, daddr); + dst = __xfrm4_dst_lookup(net, &fl4, 0, oif, NULL, daddr); if (IS_ERR(dst)) return -EHOSTUNREACH; @@ -106,8 +110,10 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) struct flowi4 *fl4 = &fl->u.ip4; int oif = 0; - if (skb_dst(skb)) - oif = skb_dst(skb)->dev->ifindex; + if (skb_dst(skb)) { + oif = vrf_master_ifindex(skb_dst(skb)->dev) ? + : skb_dst(skb)->dev->ifindex; + } memset(fl4, 0, sizeof(struct flowi4)); fl4->flowi4_mark = skb->mark; |