diff options
Diffstat (limited to 'net')
454 files changed, 18944 insertions, 10347 deletions
diff --git a/net/6lowpan/6lowpan_i.h b/net/6lowpan/6lowpan_i.h index d16bb4b14..97ecc27ae 100644 --- a/net/6lowpan/6lowpan_i.h +++ b/net/6lowpan/6lowpan_i.h @@ -3,6 +3,15 @@ #include <linux/netdevice.h> +#include <net/6lowpan.h> + +/* caller need to be sure it's dev->type is ARPHRD_6LOWPAN */ +static inline bool lowpan_is_ll(const struct net_device *dev, + enum lowpan_lltypes lltype) +{ + return lowpan_dev(dev)->lltype == lltype; +} + #ifdef CONFIG_6LOWPAN_DEBUGFS int lowpan_dev_debugfs_init(struct net_device *dev); void lowpan_dev_debugfs_exit(struct net_device *dev); diff --git a/net/6lowpan/core.c b/net/6lowpan/core.c index 34e44c0c0..7a240b3ea 100644 --- a/net/6lowpan/core.c +++ b/net/6lowpan/core.c @@ -27,11 +27,11 @@ int lowpan_register_netdevice(struct net_device *dev, dev->mtu = IPV6_MIN_MTU; dev->priv_flags |= IFF_NO_QUEUE; - lowpan_priv(dev)->lltype = lltype; + lowpan_dev(dev)->lltype = lltype; - spin_lock_init(&lowpan_priv(dev)->ctx.lock); + spin_lock_init(&lowpan_dev(dev)->ctx.lock); for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) - lowpan_priv(dev)->ctx.table[i].id = i; + lowpan_dev(dev)->ctx.table[i].id = i; ret = register_netdevice(dev); if (ret < 0) @@ -85,7 +85,7 @@ static int lowpan_event(struct notifier_block *unused, case NETDEV_DOWN: for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) clear_bit(LOWPAN_IPHC_CTX_FLAG_ACTIVE, - &lowpan_priv(dev)->ctx.table[i].flags); + &lowpan_dev(dev)->ctx.table[i].flags); break; default: return NOTIFY_DONE; diff --git a/net/6lowpan/debugfs.c b/net/6lowpan/debugfs.c index 0793a8157..acbaa3db4 100644 --- a/net/6lowpan/debugfs.c +++ b/net/6lowpan/debugfs.c @@ -172,7 +172,7 @@ static const struct file_operations lowpan_ctx_pfx_fops = { static int lowpan_dev_debugfs_ctx_init(struct net_device *dev, struct dentry *ctx, u8 id) { - struct lowpan_priv *lpriv = lowpan_priv(dev); + struct lowpan_dev *ldev = lowpan_dev(dev); struct dentry *dentry, *root; char buf[32]; @@ -185,25 +185,25 @@ static int lowpan_dev_debugfs_ctx_init(struct net_device *dev, return -EINVAL; dentry = debugfs_create_file("active", 0644, root, - &lpriv->ctx.table[id], + &ldev->ctx.table[id], &lowpan_ctx_flag_active_fops); if (!dentry) return -EINVAL; dentry = debugfs_create_file("compression", 0644, root, - &lpriv->ctx.table[id], + &ldev->ctx.table[id], &lowpan_ctx_flag_c_fops); if (!dentry) return -EINVAL; dentry = debugfs_create_file("prefix", 0644, root, - &lpriv->ctx.table[id], + &ldev->ctx.table[id], &lowpan_ctx_pfx_fops); if (!dentry) return -EINVAL; dentry = debugfs_create_file("prefix_len", 0644, root, - &lpriv->ctx.table[id], + &ldev->ctx.table[id], &lowpan_ctx_plen_fops); if (!dentry) return -EINVAL; @@ -247,21 +247,21 @@ static const struct file_operations lowpan_context_fops = { int lowpan_dev_debugfs_init(struct net_device *dev) { - struct lowpan_priv *lpriv = lowpan_priv(dev); + struct lowpan_dev *ldev = lowpan_dev(dev); struct dentry *contexts, *dentry; int ret, i; /* creating the root */ - lpriv->iface_debugfs = debugfs_create_dir(dev->name, lowpan_debugfs); - if (!lpriv->iface_debugfs) + ldev->iface_debugfs = debugfs_create_dir(dev->name, lowpan_debugfs); + if (!ldev->iface_debugfs) goto fail; - contexts = debugfs_create_dir("contexts", lpriv->iface_debugfs); + contexts = debugfs_create_dir("contexts", ldev->iface_debugfs); if (!contexts) goto remove_root; dentry = debugfs_create_file("show", 0644, contexts, - &lowpan_priv(dev)->ctx, + &lowpan_dev(dev)->ctx, &lowpan_context_fops); if (!dentry) goto remove_root; @@ -282,7 +282,7 @@ fail: void lowpan_dev_debugfs_exit(struct net_device *dev) { - debugfs_remove_recursive(lowpan_priv(dev)->iface_debugfs); + debugfs_remove_recursive(lowpan_dev(dev)->iface_debugfs); } int __init lowpan_debugfs_init(void) diff --git a/net/6lowpan/iphc.c b/net/6lowpan/iphc.c index 99bb22aea..8501dd532 100644 --- a/net/6lowpan/iphc.c +++ b/net/6lowpan/iphc.c @@ -53,9 +53,6 @@ #include <net/6lowpan.h> #include <net/ipv6.h> -/* special link-layer handling */ -#include <net/mac802154.h> - #include "6lowpan_i.h" #include "nhc.h" @@ -148,35 +145,25 @@ (((a)->s6_addr16[6]) == 0) && \ (((a)->s6_addr[14]) == 0)) +#define lowpan_is_linklocal_zero_padded(a) \ + (!(hdr->saddr.s6_addr[1] & 0x3f) && \ + !hdr->saddr.s6_addr16[1] && \ + !hdr->saddr.s6_addr32[1]) + #define LOWPAN_IPHC_CID_DCI(cid) (cid & 0x0f) #define LOWPAN_IPHC_CID_SCI(cid) ((cid & 0xf0) >> 4) -static inline void iphc_uncompress_eui64_lladdr(struct in6_addr *ipaddr, - const void *lladdr) -{ - /* fe:80::XXXX:XXXX:XXXX:XXXX - * \_________________/ - * hwaddr - */ - ipaddr->s6_addr[0] = 0xFE; - ipaddr->s6_addr[1] = 0x80; - memcpy(&ipaddr->s6_addr[8], lladdr, EUI64_ADDR_LEN); - /* second bit-flip (Universe/Local) - * is done according RFC2464 - */ - ipaddr->s6_addr[8] ^= 0x02; -} - -static inline void iphc_uncompress_802154_lladdr(struct in6_addr *ipaddr, - const void *lladdr) +static inline void +lowpan_iphc_uncompress_802154_lladdr(struct in6_addr *ipaddr, + const void *lladdr) { const struct ieee802154_addr *addr = lladdr; - u8 eui64[EUI64_ADDR_LEN] = { }; + u8 eui64[EUI64_ADDR_LEN]; switch (addr->mode) { case IEEE802154_ADDR_LONG: ieee802154_le64_to_be64(eui64, &addr->extended_addr); - iphc_uncompress_eui64_lladdr(ipaddr, eui64); + lowpan_iphc_uncompress_eui64_lladdr(ipaddr, eui64); break; case IEEE802154_ADDR_SHORT: /* fe:80::ff:fe00:XXXX @@ -202,7 +189,7 @@ static inline void iphc_uncompress_802154_lladdr(struct in6_addr *ipaddr, static struct lowpan_iphc_ctx * lowpan_iphc_ctx_get_by_id(const struct net_device *dev, u8 id) { - struct lowpan_iphc_ctx *ret = &lowpan_priv(dev)->ctx.table[id]; + struct lowpan_iphc_ctx *ret = &lowpan_dev(dev)->ctx.table[id]; if (!lowpan_iphc_ctx_is_active(ret)) return NULL; @@ -214,7 +201,7 @@ static struct lowpan_iphc_ctx * lowpan_iphc_ctx_get_by_addr(const struct net_device *dev, const struct in6_addr *addr) { - struct lowpan_iphc_ctx *table = lowpan_priv(dev)->ctx.table; + struct lowpan_iphc_ctx *table = lowpan_dev(dev)->ctx.table; struct lowpan_iphc_ctx *ret = NULL; struct in6_addr addr_pfx; u8 addr_plen; @@ -258,7 +245,7 @@ static struct lowpan_iphc_ctx * lowpan_iphc_ctx_get_by_mcast_addr(const struct net_device *dev, const struct in6_addr *addr) { - struct lowpan_iphc_ctx *table = lowpan_priv(dev)->ctx.table; + struct lowpan_iphc_ctx *table = lowpan_dev(dev)->ctx.table; struct lowpan_iphc_ctx *ret = NULL; struct in6_addr addr_mcast, network_pfx = {}; int i; @@ -296,9 +283,10 @@ lowpan_iphc_ctx_get_by_mcast_addr(const struct net_device *dev, * * address_mode is the masked value for sam or dam value */ -static int uncompress_addr(struct sk_buff *skb, const struct net_device *dev, - struct in6_addr *ipaddr, u8 address_mode, - const void *lladdr) +static int lowpan_iphc_uncompress_addr(struct sk_buff *skb, + const struct net_device *dev, + struct in6_addr *ipaddr, + u8 address_mode, const void *lladdr) { bool fail; @@ -327,12 +315,12 @@ static int uncompress_addr(struct sk_buff *skb, const struct net_device *dev, case LOWPAN_IPHC_SAM_11: case LOWPAN_IPHC_DAM_11: fail = false; - switch (lowpan_priv(dev)->lltype) { + switch (lowpan_dev(dev)->lltype) { case LOWPAN_LLTYPE_IEEE802154: - iphc_uncompress_802154_lladdr(ipaddr, lladdr); + lowpan_iphc_uncompress_802154_lladdr(ipaddr, lladdr); break; default: - iphc_uncompress_eui64_lladdr(ipaddr, lladdr); + lowpan_iphc_uncompress_eui64_lladdr(ipaddr, lladdr); break; } break; @@ -355,11 +343,11 @@ static int uncompress_addr(struct sk_buff *skb, const struct net_device *dev, /* Uncompress address function for source context * based address(non-multicast). */ -static int uncompress_ctx_addr(struct sk_buff *skb, - const struct net_device *dev, - const struct lowpan_iphc_ctx *ctx, - struct in6_addr *ipaddr, u8 address_mode, - const void *lladdr) +static int lowpan_iphc_uncompress_ctx_addr(struct sk_buff *skb, + const struct net_device *dev, + const struct lowpan_iphc_ctx *ctx, + struct in6_addr *ipaddr, + u8 address_mode, const void *lladdr) { bool fail; @@ -388,12 +376,12 @@ static int uncompress_ctx_addr(struct sk_buff *skb, case LOWPAN_IPHC_SAM_11: case LOWPAN_IPHC_DAM_11: fail = false; - switch (lowpan_priv(dev)->lltype) { + switch (lowpan_dev(dev)->lltype) { case LOWPAN_LLTYPE_IEEE802154: - iphc_uncompress_802154_lladdr(ipaddr, lladdr); + lowpan_iphc_uncompress_802154_lladdr(ipaddr, lladdr); break; default: - iphc_uncompress_eui64_lladdr(ipaddr, lladdr); + lowpan_iphc_uncompress_eui64_lladdr(ipaddr, lladdr); break; } ipv6_addr_prefix_copy(ipaddr, &ctx->pfx, ctx->plen); @@ -652,22 +640,24 @@ int lowpan_header_decompress(struct sk_buff *skb, const struct net_device *dev, } if (iphc1 & LOWPAN_IPHC_SAC) { - spin_lock_bh(&lowpan_priv(dev)->ctx.lock); + spin_lock_bh(&lowpan_dev(dev)->ctx.lock); ci = lowpan_iphc_ctx_get_by_id(dev, LOWPAN_IPHC_CID_SCI(cid)); if (!ci) { - spin_unlock_bh(&lowpan_priv(dev)->ctx.lock); + spin_unlock_bh(&lowpan_dev(dev)->ctx.lock); return -EINVAL; } pr_debug("SAC bit is set. Handle context based source address.\n"); - err = uncompress_ctx_addr(skb, dev, ci, &hdr.saddr, - iphc1 & LOWPAN_IPHC_SAM_MASK, saddr); - spin_unlock_bh(&lowpan_priv(dev)->ctx.lock); + err = lowpan_iphc_uncompress_ctx_addr(skb, dev, ci, &hdr.saddr, + iphc1 & LOWPAN_IPHC_SAM_MASK, + saddr); + spin_unlock_bh(&lowpan_dev(dev)->ctx.lock); } else { /* Source address uncompression */ pr_debug("source address stateless compression\n"); - err = uncompress_addr(skb, dev, &hdr.saddr, - iphc1 & LOWPAN_IPHC_SAM_MASK, saddr); + err = lowpan_iphc_uncompress_addr(skb, dev, &hdr.saddr, + iphc1 & LOWPAN_IPHC_SAM_MASK, + saddr); } /* Check on error of previous branch */ @@ -676,10 +666,10 @@ int lowpan_header_decompress(struct sk_buff *skb, const struct net_device *dev, switch (iphc1 & (LOWPAN_IPHC_M | LOWPAN_IPHC_DAC)) { case LOWPAN_IPHC_M | LOWPAN_IPHC_DAC: - spin_lock_bh(&lowpan_priv(dev)->ctx.lock); + spin_lock_bh(&lowpan_dev(dev)->ctx.lock); ci = lowpan_iphc_ctx_get_by_id(dev, LOWPAN_IPHC_CID_DCI(cid)); if (!ci) { - spin_unlock_bh(&lowpan_priv(dev)->ctx.lock); + spin_unlock_bh(&lowpan_dev(dev)->ctx.lock); return -EINVAL; } @@ -688,7 +678,7 @@ int lowpan_header_decompress(struct sk_buff *skb, const struct net_device *dev, err = lowpan_uncompress_multicast_ctx_daddr(skb, ci, &hdr.daddr, iphc1 & LOWPAN_IPHC_DAM_MASK); - spin_unlock_bh(&lowpan_priv(dev)->ctx.lock); + spin_unlock_bh(&lowpan_dev(dev)->ctx.lock); break; case LOWPAN_IPHC_M: /* multicast */ @@ -696,22 +686,24 @@ int lowpan_header_decompress(struct sk_buff *skb, const struct net_device *dev, iphc1 & LOWPAN_IPHC_DAM_MASK); break; case LOWPAN_IPHC_DAC: - spin_lock_bh(&lowpan_priv(dev)->ctx.lock); + spin_lock_bh(&lowpan_dev(dev)->ctx.lock); ci = lowpan_iphc_ctx_get_by_id(dev, LOWPAN_IPHC_CID_DCI(cid)); if (!ci) { - spin_unlock_bh(&lowpan_priv(dev)->ctx.lock); + spin_unlock_bh(&lowpan_dev(dev)->ctx.lock); return -EINVAL; } /* Destination address context based uncompression */ pr_debug("DAC bit is set. Handle context based destination address.\n"); - err = uncompress_ctx_addr(skb, dev, ci, &hdr.daddr, - iphc1 & LOWPAN_IPHC_DAM_MASK, daddr); - spin_unlock_bh(&lowpan_priv(dev)->ctx.lock); + err = lowpan_iphc_uncompress_ctx_addr(skb, dev, ci, &hdr.daddr, + iphc1 & LOWPAN_IPHC_DAM_MASK, + daddr); + spin_unlock_bh(&lowpan_dev(dev)->ctx.lock); break; default: - err = uncompress_addr(skb, dev, &hdr.daddr, - iphc1 & LOWPAN_IPHC_DAM_MASK, daddr); + err = lowpan_iphc_uncompress_addr(skb, dev, &hdr.daddr, + iphc1 & LOWPAN_IPHC_DAM_MASK, + daddr); pr_debug("dest: stateless compression mode %d dest %pI6c\n", iphc1 & LOWPAN_IPHC_DAM_MASK, &hdr.daddr); break; @@ -731,7 +723,7 @@ int lowpan_header_decompress(struct sk_buff *skb, const struct net_device *dev, return err; } - switch (lowpan_priv(dev)->lltype) { + switch (lowpan_dev(dev)->lltype) { case LOWPAN_LLTYPE_IEEE802154: if (lowpan_802154_cb(skb)->d_size) hdr.payload_len = htons(lowpan_802154_cb(skb)->d_size - @@ -1028,7 +1020,7 @@ int lowpan_header_compress(struct sk_buff *skb, const struct net_device *dev, skb->data, skb->len); ipv6_daddr_type = ipv6_addr_type(&hdr->daddr); - spin_lock_bh(&lowpan_priv(dev)->ctx.lock); + spin_lock_bh(&lowpan_dev(dev)->ctx.lock); if (ipv6_daddr_type & IPV6_ADDR_MULTICAST) dci = lowpan_iphc_ctx_get_by_mcast_addr(dev, &hdr->daddr); else @@ -1037,15 +1029,15 @@ int lowpan_header_compress(struct sk_buff *skb, const struct net_device *dev, memcpy(&dci_entry, dci, sizeof(*dci)); cid |= dci->id; } - spin_unlock_bh(&lowpan_priv(dev)->ctx.lock); + spin_unlock_bh(&lowpan_dev(dev)->ctx.lock); - spin_lock_bh(&lowpan_priv(dev)->ctx.lock); + spin_lock_bh(&lowpan_dev(dev)->ctx.lock); sci = lowpan_iphc_ctx_get_by_addr(dev, &hdr->saddr); if (sci) { memcpy(&sci_entry, sci, sizeof(*sci)); cid |= (sci->id << 4); } - spin_unlock_bh(&lowpan_priv(dev)->ctx.lock); + spin_unlock_bh(&lowpan_dev(dev)->ctx.lock); /* if cid is zero it will be compressed */ if (cid) { @@ -1101,7 +1093,8 @@ int lowpan_header_compress(struct sk_buff *skb, const struct net_device *dev, true); iphc1 |= LOWPAN_IPHC_SAC; } else { - if (ipv6_saddr_type & IPV6_ADDR_LINKLOCAL) { + if (ipv6_saddr_type & IPV6_ADDR_LINKLOCAL && + lowpan_is_linklocal_zero_padded(hdr->saddr)) { iphc1 |= lowpan_compress_addr_64(&hc_ptr, &hdr->saddr, saddr, true); @@ -1135,7 +1128,8 @@ int lowpan_header_compress(struct sk_buff *skb, const struct net_device *dev, false); iphc1 |= LOWPAN_IPHC_DAC; } else { - if (ipv6_daddr_type & IPV6_ADDR_LINKLOCAL) { + if (ipv6_daddr_type & IPV6_ADDR_LINKLOCAL && + lowpan_is_linklocal_zero_padded(hdr->daddr)) { iphc1 |= lowpan_compress_addr_64(&hc_ptr, &hdr->daddr, daddr, false); diff --git a/net/6lowpan/nhc_udp.c b/net/6lowpan/nhc_udp.c index 69537a2ea..225d91906 100644 --- a/net/6lowpan/nhc_udp.c +++ b/net/6lowpan/nhc_udp.c @@ -91,7 +91,7 @@ static int udp_uncompress(struct sk_buff *skb, size_t needed) * here, we obtain the hint from the remaining size of the * frame */ - switch (lowpan_priv(skb->dev)->lltype) { + switch (lowpan_dev(skb->dev)->lltype) { case LOWPAN_LLTYPE_IEEE802154: if (lowpan_802154_cb(skb)->d_size) uh.len = htons(lowpan_802154_cb(skb)->d_size - diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index a1e273af6..82a116ba5 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -290,6 +290,10 @@ static void vlan_sync_address(struct net_device *dev, if (ether_addr_equal(vlan->real_dev_addr, dev->dev_addr)) return; + /* vlan continues to inherit address of lower device */ + if (vlan_dev_inherit_address(vlandev, dev)) + goto out; + /* vlan address was different from the old address and is equal to * the new address */ if (!ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) && @@ -302,6 +306,7 @@ static void vlan_sync_address(struct net_device *dev, !ether_addr_equal(vlandev->dev_addr, dev->dev_addr)) dev_uc_add(dev, vlandev->dev_addr); +out: ether_addr_copy(vlan->real_dev_addr, dev->dev_addr); } diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h index 9d010a09a..cc1557978 100644 --- a/net/8021q/vlan.h +++ b/net/8021q/vlan.h @@ -109,6 +109,8 @@ int vlan_check_real_dev(struct net_device *real_dev, void vlan_setup(struct net_device *dev); int register_vlan_dev(struct net_device *dev); void unregister_vlan_dev(struct net_device *dev, struct list_head *head); +bool vlan_dev_inherit_address(struct net_device *dev, + struct net_device *real_dev); static inline u32 vlan_get_ingress_priority(struct net_device *dev, u16 vlan_tci) diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index e7e62570b..516b0e732 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -146,10 +146,12 @@ static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb, static int vlan_dev_change_mtu(struct net_device *dev, int new_mtu) { - /* TODO: gotta make sure the underlying layer can handle it, - * maybe an IFF_VLAN_CAPABLE flag for devices? - */ - if (vlan_dev_priv(dev)->real_dev->mtu < new_mtu) + struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; + unsigned int max_mtu = real_dev->mtu; + + if (netif_reduces_vlan_mtu(real_dev)) + max_mtu -= VLAN_HLEN; + if (max_mtu < new_mtu) return -ERANGE; dev->mtu = new_mtu; @@ -245,6 +247,17 @@ void vlan_dev_get_realdev_name(const struct net_device *dev, char *result) strncpy(result, vlan_dev_priv(dev)->real_dev->name, 23); } +bool vlan_dev_inherit_address(struct net_device *dev, + struct net_device *real_dev) +{ + if (dev->addr_assign_type != NET_ADDR_STOLEN) + return false; + + ether_addr_copy(dev->dev_addr, real_dev->dev_addr); + call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + return true; +} + static int vlan_dev_open(struct net_device *dev) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); @@ -255,7 +268,8 @@ static int vlan_dev_open(struct net_device *dev) !(vlan->flags & VLAN_FLAG_LOOSE_BINDING)) return -ENETDOWN; - if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr)) { + if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr) && + !vlan_dev_inherit_address(dev, real_dev)) { err = dev_uc_add(real_dev, dev->dev_addr); if (err < 0) goto out; @@ -560,8 +574,10 @@ static int vlan_dev_init(struct net_device *dev) /* ipv6 shared card related stuff */ dev->dev_id = real_dev->dev_id; - if (is_zero_ether_addr(dev->dev_addr)) - eth_hw_addr_inherit(dev, real_dev); + if (is_zero_ether_addr(dev->dev_addr)) { + ether_addr_copy(dev->dev_addr, real_dev->dev_addr); + dev->addr_assign_type = NET_ADDR_STOLEN; + } if (is_zero_ether_addr(dev->broadcast)) memcpy(dev->broadcast, real_dev->broadcast, dev->addr_len); diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c index c92b52f37..1270207f3 100644 --- a/net/8021q/vlan_netlink.c +++ b/net/8021q/vlan_netlink.c @@ -118,6 +118,7 @@ static int vlan_newlink(struct net *src_net, struct net_device *dev, { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev; + unsigned int max_mtu; __be16 proto; int err; @@ -144,9 +145,11 @@ static int vlan_newlink(struct net *src_net, struct net_device *dev, if (err < 0) return err; + max_mtu = netif_reduces_vlan_mtu(real_dev) ? real_dev->mtu - VLAN_HLEN : + real_dev->mtu; if (!tb[IFLA_MTU]) - dev->mtu = real_dev->mtu; - else if (dev->mtu > real_dev->mtu) + dev->mtu = max_mtu; + else if (dev->mtu > max_mtu) return -EINVAL; err = vlan_changelink(dev, tb, data); diff --git a/net/9p/client.c b/net/9p/client.c index ea79ee9a7..3fc94a49c 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -518,10 +518,10 @@ static int p9_check_errors(struct p9_client *c, struct p9_req_t *req) if (err) goto out_err; - if (p9_is_proto_dotu(c)) + if (p9_is_proto_dotu(c) && ecode < 512) err = -ecode; - if (!err || !IS_ERR_VALUE(err)) { + if (!err) { err = p9_errstr2errno(ename, strlen(ename)); p9_debug(P9_DEBUG_9P, "<<< RERROR (%d) %s\n", @@ -605,10 +605,10 @@ static int p9_check_zc_errors(struct p9_client *c, struct p9_req_t *req, if (err) goto out_err; - if (p9_is_proto_dotu(c)) + if (p9_is_proto_dotu(c) && ecode < 512) err = -ecode; - if (!err || !IS_ERR_VALUE(err)) { + if (!err) { err = p9_errstr2errno(ename, strlen(ename)); p9_debug(P9_DEBUG_9P, "<<< RERROR (%d) %s\n", diff --git a/net/Kconfig b/net/Kconfig index a8934d8c8..ff40562a7 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -236,6 +236,7 @@ source "net/mpls/Kconfig" source "net/hsr/Kconfig" source "net/switchdev/Kconfig" source "net/l3mdev/Kconfig" +source "net/qrtr/Kconfig" config RPS bool @@ -288,14 +289,17 @@ config BQL config BPF_JIT bool "enable BPF Just In Time compiler" - depends on HAVE_BPF_JIT + depends on HAVE_CBPF_JIT || HAVE_EBPF_JIT depends on MODULES ---help--- Berkeley Packet Filter filtering capabilities are normally handled by an interpreter. This option allows kernel to generate a native code when filter is loaded in memory. This should speedup - packet sniffing (libpcap/tcpdump). Note : Admin should enable - this feature changing /proc/sys/net/core/bpf_jit_enable + packet sniffing (libpcap/tcpdump). + + Note, admin should enable this feature changing: + /proc/sys/net/core/bpf_jit_enable + /proc/sys/net/core/bpf_jit_harden (optional) config NET_FLOW_LIMIT bool @@ -418,6 +422,14 @@ config MAY_USE_DEVLINK endif # if NET -# Used by archs to tell that they support BPF_JIT -config HAVE_BPF_JIT +# Used by archs to tell that they support BPF JIT compiler plus which flavour. +# Only one of the two can be selected for a specific arch since eBPF JIT supersedes +# the cBPF JIT. + +# Classic BPF JIT (cBPF) +config HAVE_CBPF_JIT + bool + +# Extended BPF JIT (eBPF) +config HAVE_EBPF_JIT bool diff --git a/net/Makefile b/net/Makefile index 81d14119e..bdd14553a 100644 --- a/net/Makefile +++ b/net/Makefile @@ -78,3 +78,4 @@ endif ifneq ($(CONFIG_NET_L3_MASTER_DEV),) obj-y += l3mdev/ endif +obj-$(CONFIG_QRTR) += qrtr/ diff --git a/net/atm/lec.c b/net/atm/lec.c index cd3b37989..e574a7e9d 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -194,7 +194,7 @@ lec_send(struct atm_vcc *vcc, struct sk_buff *skb) static void lec_tx_timeout(struct net_device *dev) { pr_info("%s\n", dev->name); - dev->trans_start = jiffies; + netif_trans_update(dev); netif_wake_queue(dev); } @@ -324,7 +324,7 @@ static netdev_tx_t lec_start_xmit(struct sk_buff *skb, out: if (entry) lec_arp_put(entry); - dev->trans_start = jiffies; + netif_trans_update(dev); return NETDEV_TX_OK; } diff --git a/net/atm/signaling.c b/net/atm/signaling.c index 4fd6af473..adb6e3d21 100644 --- a/net/atm/signaling.c +++ b/net/atm/signaling.c @@ -124,7 +124,7 @@ as_indicate_complete: break; case as_addparty: case as_dropparty: - sk->sk_err_soft = msg->reply; + sk->sk_err_soft = -msg->reply; /* < 0 failure, otherwise ep_ref */ clear_bit(ATM_VF_WAITING, &vcc->flags); break; diff --git a/net/atm/svc.c b/net/atm/svc.c index 3fa0a9ee9..878563a83 100644 --- a/net/atm/svc.c +++ b/net/atm/svc.c @@ -546,7 +546,7 @@ static int svc_addparty(struct socket *sock, struct sockaddr *sockaddr, schedule(); } finish_wait(sk_sleep(sk), &wait); - error = xchg(&sk->sk_err_soft, 0); + error = -xchg(&sk->sk_err_soft, 0); out: release_sock(sk); return error; @@ -573,7 +573,7 @@ static int svc_dropparty(struct socket *sock, int ep_ref) error = -EUNATCH; goto out; } - error = xchg(&sk->sk_err_soft, 0); + error = -xchg(&sk->sk_err_soft, 0); out: release_sock(sk); return error; diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index cb2d1b9b0..ce2f20304 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -32,6 +32,7 @@ #include <linux/jiffies.h> #include <linux/list.h> #include <linux/kref.h> +#include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/pkt_sched.h> #include <linux/printk.h> @@ -156,10 +157,8 @@ static int batadv_iv_ogm_orig_add_if(struct batadv_orig_node *orig_node, orig_node->bat_iv.bcast_own = data_ptr; data_ptr = kmalloc_array(max_if_num, sizeof(u8), GFP_ATOMIC); - if (!data_ptr) { - kfree(orig_node->bat_iv.bcast_own); + if (!data_ptr) goto unlock; - } memcpy(data_ptr, orig_node->bat_iv.bcast_own_sum, (max_if_num - 1) * sizeof(u8)); @@ -175,71 +174,107 @@ unlock: } /** - * batadv_iv_ogm_orig_del_if - change the private structures of the orig_node to - * exclude the removed interface + * batadv_iv_ogm_drop_bcast_own_entry - drop section of bcast_own * @orig_node: the orig_node that has to be changed * @max_if_num: the current amount of interfaces * @del_if_num: the index of the interface being removed - * - * Return: 0 on success, a negative error code otherwise. */ -static int batadv_iv_ogm_orig_del_if(struct batadv_orig_node *orig_node, - int max_if_num, int del_if_num) +static void +batadv_iv_ogm_drop_bcast_own_entry(struct batadv_orig_node *orig_node, + int max_if_num, int del_if_num) { - int ret = -ENOMEM; - size_t chunk_size, if_offset; - void *data_ptr = NULL; - - spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock); + size_t chunk_size; + size_t if_offset; + void *data_ptr; - /* last interface was removed */ - if (max_if_num == 0) - goto free_bcast_own; + lockdep_assert_held(&orig_node->bat_iv.ogm_cnt_lock); chunk_size = sizeof(unsigned long) * BATADV_NUM_WORDS; data_ptr = kmalloc_array(max_if_num, chunk_size, GFP_ATOMIC); if (!data_ptr) - goto unlock; + /* use old buffer when new one could not be allocated */ + data_ptr = orig_node->bat_iv.bcast_own; /* copy first part */ - memcpy(data_ptr, orig_node->bat_iv.bcast_own, del_if_num * chunk_size); + memmove(data_ptr, orig_node->bat_iv.bcast_own, del_if_num * chunk_size); /* copy second part */ if_offset = (del_if_num + 1) * chunk_size; - memcpy((char *)data_ptr + del_if_num * chunk_size, - (uint8_t *)orig_node->bat_iv.bcast_own + if_offset, - (max_if_num - del_if_num) * chunk_size); + memmove((char *)data_ptr + del_if_num * chunk_size, + (uint8_t *)orig_node->bat_iv.bcast_own + if_offset, + (max_if_num - del_if_num) * chunk_size); -free_bcast_own: - kfree(orig_node->bat_iv.bcast_own); - orig_node->bat_iv.bcast_own = data_ptr; + /* bcast_own was shrunk down in new buffer; free old one */ + if (orig_node->bat_iv.bcast_own != data_ptr) { + kfree(orig_node->bat_iv.bcast_own); + orig_node->bat_iv.bcast_own = data_ptr; + } +} + +/** + * batadv_iv_ogm_drop_bcast_own_sum_entry - drop section of bcast_own_sum + * @orig_node: the orig_node that has to be changed + * @max_if_num: the current amount of interfaces + * @del_if_num: the index of the interface being removed + */ +static void +batadv_iv_ogm_drop_bcast_own_sum_entry(struct batadv_orig_node *orig_node, + int max_if_num, int del_if_num) +{ + size_t if_offset; + void *data_ptr; - if (max_if_num == 0) - goto free_own_sum; + lockdep_assert_held(&orig_node->bat_iv.ogm_cnt_lock); data_ptr = kmalloc_array(max_if_num, sizeof(u8), GFP_ATOMIC); - if (!data_ptr) { - kfree(orig_node->bat_iv.bcast_own); - goto unlock; - } + if (!data_ptr) + /* use old buffer when new one could not be allocated */ + data_ptr = orig_node->bat_iv.bcast_own_sum; - memcpy(data_ptr, orig_node->bat_iv.bcast_own_sum, - del_if_num * sizeof(u8)); + memmove(data_ptr, orig_node->bat_iv.bcast_own_sum, + del_if_num * sizeof(u8)); if_offset = (del_if_num + 1) * sizeof(u8); - memcpy((char *)data_ptr + del_if_num * sizeof(u8), - orig_node->bat_iv.bcast_own_sum + if_offset, - (max_if_num - del_if_num) * sizeof(u8)); + memmove((char *)data_ptr + del_if_num * sizeof(u8), + orig_node->bat_iv.bcast_own_sum + if_offset, + (max_if_num - del_if_num) * sizeof(u8)); + + /* bcast_own_sum was shrunk down in new buffer; free old one */ + if (orig_node->bat_iv.bcast_own_sum != data_ptr) { + kfree(orig_node->bat_iv.bcast_own_sum); + orig_node->bat_iv.bcast_own_sum = data_ptr; + } +} -free_own_sum: - kfree(orig_node->bat_iv.bcast_own_sum); - orig_node->bat_iv.bcast_own_sum = data_ptr; +/** + * batadv_iv_ogm_orig_del_if - change the private structures of the orig_node to + * exclude the removed interface + * @orig_node: the orig_node that has to be changed + * @max_if_num: the current amount of interfaces + * @del_if_num: the index of the interface being removed + * + * Return: 0 on success, a negative error code otherwise. + */ +static int batadv_iv_ogm_orig_del_if(struct batadv_orig_node *orig_node, + int max_if_num, int del_if_num) +{ + spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock); + + if (max_if_num == 0) { + kfree(orig_node->bat_iv.bcast_own); + kfree(orig_node->bat_iv.bcast_own_sum); + orig_node->bat_iv.bcast_own = NULL; + orig_node->bat_iv.bcast_own_sum = NULL; + } else { + batadv_iv_ogm_drop_bcast_own_entry(orig_node, max_if_num, + del_if_num); + batadv_iv_ogm_drop_bcast_own_sum_entry(orig_node, max_if_num, + del_if_num); + } - ret = 0; -unlock: spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock); - return ret; + return 0; } /** @@ -644,18 +679,12 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, unsigned char *skb_buff; unsigned int skb_size; - if (!kref_get_unless_zero(&if_incoming->refcount)) - return; - - if (!kref_get_unless_zero(&if_outgoing->refcount)) - goto out_free_incoming; - /* own packet should always be scheduled */ if (!own_packet) { if (!batadv_atomic_dec_not_zero(&bat_priv->batman_queue_left)) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "batman packet queue full\n"); - goto out_free_outgoing; + return; } } @@ -681,6 +710,8 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, forw_packet_aggr->packet_len = packet_len; memcpy(skb_buff, packet_buff, packet_len); + kref_get(&if_incoming->refcount); + kref_get(&if_outgoing->refcount); forw_packet_aggr->own = own_packet; forw_packet_aggr->if_incoming = if_incoming; forw_packet_aggr->if_outgoing = if_outgoing; @@ -710,10 +741,6 @@ out_free_forw_packet: out_nomem: if (!own_packet) atomic_inc(&bat_priv->batman_queue_left); -out_free_outgoing: - batadv_hardif_put(if_outgoing); -out_free_incoming: - batadv_hardif_put(if_incoming); } /* aggregate a new packet into the existing ogm packet */ @@ -950,9 +977,15 @@ static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface) list_for_each_entry_rcu(tmp_hard_iface, &batadv_hardif_list, list) { if (tmp_hard_iface->soft_iface != hard_iface->soft_iface) continue; + + if (!kref_get_unless_zero(&tmp_hard_iface->refcount)) + continue; + batadv_iv_ogm_queue_add(bat_priv, *ogm_buff, *ogm_buff_len, hard_iface, tmp_hard_iface, 1, send_time); + + batadv_hardif_put(tmp_hard_iface); } rcu_read_unlock(); @@ -1133,13 +1166,13 @@ out: * @if_incoming: interface where the packet was received * @if_outgoing: interface for which the retransmission should be considered * - * Return: 1 if the link can be considered bidirectional, 0 otherwise + * Return: true if the link can be considered bidirectional, false otherwise */ -static int batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, - struct batadv_orig_node *orig_neigh_node, - struct batadv_ogm_packet *batadv_ogm_packet, - struct batadv_hard_iface *if_incoming, - struct batadv_hard_iface *if_outgoing) +static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, + struct batadv_orig_node *orig_neigh_node, + struct batadv_ogm_packet *batadv_ogm_packet, + struct batadv_hard_iface *if_incoming, + struct batadv_hard_iface *if_outgoing) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); struct batadv_neigh_node *neigh_node = NULL, *tmp_neigh_node; @@ -1147,9 +1180,11 @@ static int batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, u8 total_count; u8 orig_eq_count, neigh_rq_count, neigh_rq_inv, tq_own; unsigned int neigh_rq_inv_cube, neigh_rq_max_cube; - int tq_asym_penalty, inv_asym_penalty, if_num, ret = 0; + int if_num; + unsigned int tq_asym_penalty, inv_asym_penalty; unsigned int combined_tq; - int tq_iface_penalty; + unsigned int tq_iface_penalty; + bool ret = false; /* find corresponding one hop neighbor */ rcu_read_lock(); @@ -1261,7 +1296,7 @@ static int batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, * consider it bidirectional */ if (batadv_ogm_packet->tq >= BATADV_TQ_TOTAL_BIDRECT_LIMIT) - ret = 1; + ret = true; out: if (neigh_node) @@ -1290,9 +1325,9 @@ batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr, struct batadv_orig_ifinfo *orig_ifinfo = NULL; struct batadv_neigh_node *neigh_node; struct batadv_neigh_ifinfo *neigh_ifinfo; - int is_dup; + bool is_dup; s32 seq_diff; - int need_update = 0; + bool need_update = false; int set_mark; enum batadv_dup_status ret = BATADV_NO_DUP; u32 seqno = ntohl(batadv_ogm_packet->seqno); @@ -1402,7 +1437,7 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset, struct sk_buff *skb_priv; struct ethhdr *ethhdr; u8 *prev_sender; - int is_bidirect; + bool is_bidirect; /* create a private copy of the skb, as some functions change tq value * and/or flags. @@ -1730,8 +1765,13 @@ static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset, if (hard_iface->soft_iface != bat_priv->soft_iface) continue; + if (!kref_get_unless_zero(&hard_iface->refcount)) + continue; + batadv_iv_ogm_process_per_outif(skb, ogm_offset, orig_node, if_incoming, hard_iface); + + batadv_hardif_put(hard_iface); } rcu_read_unlock(); @@ -1829,9 +1869,8 @@ static void batadv_iv_ogm_orig_print(struct batadv_priv *bat_priv, int batman_count = 0; u32 i; - seq_printf(seq, " %-15s %s (%s/%i) %17s [%10s]: %20s ...\n", - "Originator", "last-seen", "#", BATADV_TQ_MAX_VALUE, - "Nexthop", "outgoingIF", "Potential nexthops"); + seq_puts(seq, + " Originator last-seen (#/255) Nexthop [outgoingIF]: Potential nexthops ...\n"); for (i = 0; i < hash->size; i++) { head = &hash->table[i]; @@ -1911,8 +1950,7 @@ static void batadv_iv_neigh_print(struct batadv_priv *bat_priv, struct batadv_hard_iface *hard_iface; int batman_count = 0; - seq_printf(seq, " %10s %-13s %s\n", - "IF", "Neighbor", "last-seen"); + seq_puts(seq, " IF Neighbor last-seen\n"); rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index 4026f198a..0a12e5cdd 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -27,6 +27,7 @@ #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/seq_file.h> +#include <linux/stddef.h> #include <linux/types.h> #include <linux/workqueue.h> @@ -39,6 +40,16 @@ static void batadv_v_iface_activate(struct batadv_hard_iface *hard_iface) { + struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); + struct batadv_hard_iface *primary_if; + + primary_if = batadv_primary_if_get_selected(bat_priv); + + if (primary_if) { + batadv_v_elp_iface_activate(primary_if, hard_iface); + batadv_hardif_put(primary_if); + } + /* B.A.T.M.A.N. V does not use any queuing mechanism, therefore it can * set the interface as ACTIVE right away, without any risk of race * condition @@ -72,16 +83,34 @@ static void batadv_v_iface_disable(struct batadv_hard_iface *hard_iface) batadv_v_elp_iface_disable(hard_iface); } -static void batadv_v_iface_update_mac(struct batadv_hard_iface *hard_iface) -{ -} - static void batadv_v_primary_iface_set(struct batadv_hard_iface *hard_iface) { batadv_v_elp_primary_iface_set(hard_iface); batadv_v_ogm_primary_iface_set(hard_iface); } +/** + * batadv_v_iface_update_mac - react to hard-interface MAC address change + * @hard_iface: the modified interface + * + * If the modified interface is the primary one, update the originator + * address in the ELP and OGM messages to reflect the new MAC address. + */ +static void batadv_v_iface_update_mac(struct batadv_hard_iface *hard_iface) +{ + struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); + struct batadv_hard_iface *primary_if; + + primary_if = batadv_primary_if_get_selected(bat_priv); + if (primary_if != hard_iface) + goto out; + + batadv_v_primary_iface_set(hard_iface); +out: + if (primary_if) + batadv_hardif_put(primary_if); +} + static void batadv_v_hardif_neigh_init(struct batadv_hardif_neigh_node *hardif_neigh) { @@ -162,8 +191,8 @@ static void batadv_v_neigh_print(struct batadv_priv *bat_priv, struct batadv_hard_iface *hard_iface; int batman_count = 0; - seq_printf(seq, " %-15s %s (%11s) [%10s]\n", "Neighbor", - "last-seen", "throughput", "IF"); + seq_puts(seq, + " Neighbor last-seen ( throughput) [ IF]\n"); rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { @@ -202,9 +231,8 @@ static void batadv_v_orig_print(struct batadv_priv *bat_priv, int batman_count = 0; u32 i; - seq_printf(seq, " %-15s %s (%11s) %17s [%10s]: %20s ...\n", - "Originator", "last-seen", "throughput", "Nexthop", - "outgoingIF", "Potential nexthops"); + seq_puts(seq, + " Originator last-seen ( throughput) Nexthop [outgoingIF]: Potential nexthops ...\n"); for (i = 0; i < hash->size; i++) { head = &hash->table[i]; @@ -256,14 +284,23 @@ static int batadv_v_neigh_cmp(struct batadv_neigh_node *neigh1, struct batadv_hard_iface *if_outgoing2) { struct batadv_neigh_ifinfo *ifinfo1, *ifinfo2; + int ret = 0; ifinfo1 = batadv_neigh_ifinfo_get(neigh1, if_outgoing1); + if (WARN_ON(!ifinfo1)) + goto err_ifinfo1; + ifinfo2 = batadv_neigh_ifinfo_get(neigh2, if_outgoing2); + if (WARN_ON(!ifinfo2)) + goto err_ifinfo2; - if (WARN_ON(!ifinfo1 || !ifinfo2)) - return 0; + ret = ifinfo1->bat_v.throughput - ifinfo2->bat_v.throughput; - return ifinfo1->bat_v.throughput - ifinfo2->bat_v.throughput; + batadv_neigh_ifinfo_put(ifinfo2); +err_ifinfo2: + batadv_neigh_ifinfo_put(ifinfo1); +err_ifinfo1: + return ret; } static bool batadv_v_neigh_is_sob(struct batadv_neigh_node *neigh1, @@ -273,14 +310,26 @@ static bool batadv_v_neigh_is_sob(struct batadv_neigh_node *neigh1, { struct batadv_neigh_ifinfo *ifinfo1, *ifinfo2; u32 threshold; + bool ret = false; ifinfo1 = batadv_neigh_ifinfo_get(neigh1, if_outgoing1); + if (WARN_ON(!ifinfo1)) + goto err_ifinfo1; + ifinfo2 = batadv_neigh_ifinfo_get(neigh2, if_outgoing2); + if (WARN_ON(!ifinfo2)) + goto err_ifinfo2; threshold = ifinfo1->bat_v.throughput / 4; threshold = ifinfo1->bat_v.throughput - threshold; - return ifinfo2->bat_v.throughput > threshold; + ret = ifinfo2->bat_v.throughput > threshold; + + batadv_neigh_ifinfo_put(ifinfo2); +err_ifinfo2: + batadv_neigh_ifinfo_put(ifinfo1); +err_ifinfo1: + return ret; } static struct batadv_algo_ops batadv_batman_v __read_mostly = { diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 3844e7efd..df42eb136 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -377,6 +377,27 @@ void batadv_v_elp_iface_disable(struct batadv_hard_iface *hard_iface) } /** + * batadv_v_elp_iface_activate - update the ELP buffer belonging to the given + * hard-interface + * @primary_iface: the new primary interface + * @hard_iface: interface holding the to-be-updated buffer + */ +void batadv_v_elp_iface_activate(struct batadv_hard_iface *primary_iface, + struct batadv_hard_iface *hard_iface) +{ + struct batadv_elp_packet *elp_packet; + struct sk_buff *skb; + + if (!hard_iface->bat_v.elp_skb) + return; + + skb = hard_iface->bat_v.elp_skb; + elp_packet = (struct batadv_elp_packet *)skb->data; + ether_addr_copy(elp_packet->orig, + primary_iface->net_dev->dev_addr); +} + +/** * batadv_v_elp_primary_iface_set - change internal data to reflect the new * primary interface * @primary_iface: the new primary interface @@ -384,8 +405,6 @@ void batadv_v_elp_iface_disable(struct batadv_hard_iface *hard_iface) void batadv_v_elp_primary_iface_set(struct batadv_hard_iface *primary_iface) { struct batadv_hard_iface *hard_iface; - struct batadv_elp_packet *elp_packet; - struct sk_buff *skb; /* update orig field of every elp iface belonging to this mesh */ rcu_read_lock(); @@ -393,13 +412,7 @@ void batadv_v_elp_primary_iface_set(struct batadv_hard_iface *primary_iface) if (primary_iface->soft_iface != hard_iface->soft_iface) continue; - if (!hard_iface->bat_v.elp_skb) - continue; - - skb = hard_iface->bat_v.elp_skb; - elp_packet = (struct batadv_elp_packet *)skb->data; - ether_addr_copy(elp_packet->orig, - primary_iface->net_dev->dev_addr); + batadv_v_elp_iface_activate(primary_iface, hard_iface); } rcu_read_unlock(); } diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h index e95f1bca0..cc130b2d0 100644 --- a/net/batman-adv/bat_v_elp.h +++ b/net/batman-adv/bat_v_elp.h @@ -25,6 +25,8 @@ struct work_struct; int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface); void batadv_v_elp_iface_disable(struct batadv_hard_iface *hard_iface); +void batadv_v_elp_iface_activate(struct batadv_hard_iface *primary_iface, + struct batadv_hard_iface *hard_iface); void batadv_v_elp_primary_iface_set(struct batadv_hard_iface *primary_iface); int batadv_v_elp_packet_recv(struct sk_buff *skb, struct batadv_hard_iface *if_incoming); diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index 91df28a10..473ebb9a0 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -26,6 +26,7 @@ #include <linux/if_ether.h> #include <linux/jiffies.h> #include <linux/kernel.h> +#include <linux/kref.h> #include <linux/list.h> #include <linux/netdevice.h> #include <linux/random.h> @@ -176,6 +177,9 @@ static void batadv_v_ogm_send(struct work_struct *work) if (hard_iface->soft_iface != bat_priv->soft_iface) continue; + if (!kref_get_unless_zero(&hard_iface->refcount)) + continue; + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Sending own OGM2 packet (originator %pM, seqno %u, throughput %u, TTL %d) on interface %s [%pM]\n", ogm_packet->orig, ntohl(ogm_packet->seqno), @@ -185,10 +189,13 @@ static void batadv_v_ogm_send(struct work_struct *work) /* this skb gets consumed by batadv_v_ogm_send_to_if() */ skb_tmp = skb_clone(skb, GFP_ATOMIC); - if (!skb_tmp) + if (!skb_tmp) { + batadv_hardif_put(hard_iface); break; + } batadv_v_ogm_send_to_if(skb_tmp, hard_iface); + batadv_hardif_put(hard_iface); } rcu_read_unlock(); @@ -234,73 +241,6 @@ void batadv_v_ogm_primary_iface_set(struct batadv_hard_iface *primary_iface) } /** - * batadv_v_ogm_orig_update - update the originator status based on the received - * OGM - * @bat_priv: the bat priv with all the soft interface information - * @orig_node: the originator to update - * @neigh_node: the neighbour the OGM has been received from (to update) - * @ogm2: the received OGM - * @if_outgoing: the interface where this OGM is going to be forwarded through - */ -static void -batadv_v_ogm_orig_update(struct batadv_priv *bat_priv, - struct batadv_orig_node *orig_node, - struct batadv_neigh_node *neigh_node, - const struct batadv_ogm2_packet *ogm2, - struct batadv_hard_iface *if_outgoing) -{ - struct batadv_neigh_ifinfo *router_ifinfo = NULL, *neigh_ifinfo = NULL; - struct batadv_neigh_node *router = NULL; - s32 neigh_seq_diff; - u32 neigh_last_seqno; - u32 router_last_seqno; - u32 router_throughput, neigh_throughput; - - batadv_dbg(BATADV_DBG_BATMAN, bat_priv, - "Searching and updating originator entry of received packet\n"); - - /* if this neighbor already is our next hop there is nothing - * to change - */ - router = batadv_orig_router_get(orig_node, if_outgoing); - if (router == neigh_node) - goto out; - - /* don't consider neighbours with worse throughput. - * also switch route if this seqno is BATADV_V_MAX_ORIGDIFF newer than - * the last received seqno from our best next hop. - */ - if (router) { - router_ifinfo = batadv_neigh_ifinfo_get(router, if_outgoing); - neigh_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing); - - /* if these are not allocated, something is wrong. */ - if (!router_ifinfo || !neigh_ifinfo) - goto out; - - neigh_last_seqno = neigh_ifinfo->bat_v.last_seqno; - router_last_seqno = router_ifinfo->bat_v.last_seqno; - neigh_seq_diff = neigh_last_seqno - router_last_seqno; - router_throughput = router_ifinfo->bat_v.throughput; - neigh_throughput = neigh_ifinfo->bat_v.throughput; - - if ((neigh_seq_diff < BATADV_OGM_MAX_ORIGDIFF) && - (router_throughput >= neigh_throughput)) - goto out; - } - - batadv_update_route(bat_priv, orig_node, if_outgoing, neigh_node); - -out: - if (router_ifinfo) - batadv_neigh_ifinfo_put(router_ifinfo); - if (neigh_ifinfo) - batadv_neigh_ifinfo_put(neigh_ifinfo); - if (router) - batadv_neigh_node_put(router); -} - -/** * batadv_v_forward_penalty - apply a penalty to the throughput metric forwarded * with B.A.T.M.A.N. V OGMs * @bat_priv: the bat priv with all the soft interface information @@ -347,10 +287,12 @@ static u32 batadv_v_forward_penalty(struct batadv_priv *bat_priv, } /** - * batadv_v_ogm_forward - forward an OGM to the given outgoing interface + * batadv_v_ogm_forward - check conditions and forward an OGM to the given + * outgoing interface * @bat_priv: the bat priv with all the soft interface information * @ogm_received: previously received OGM to be forwarded - * @throughput: throughput to announce, may vary per outgoing interface + * @orig_node: the originator which has been updated + * @neigh_node: the neigh_node through with the OGM has been received * @if_incoming: the interface on which this OGM was received on * @if_outgoing: the interface to which the OGM has to be forwarded to * @@ -359,28 +301,57 @@ static u32 batadv_v_forward_penalty(struct batadv_priv *bat_priv, */ static void batadv_v_ogm_forward(struct batadv_priv *bat_priv, const struct batadv_ogm2_packet *ogm_received, - u32 throughput, + struct batadv_orig_node *orig_node, + struct batadv_neigh_node *neigh_node, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing) { + struct batadv_neigh_ifinfo *neigh_ifinfo = NULL; + struct batadv_orig_ifinfo *orig_ifinfo = NULL; + struct batadv_neigh_node *router = NULL; struct batadv_ogm2_packet *ogm_forward; unsigned char *skb_buff; struct sk_buff *skb; size_t packet_len; u16 tvlv_len; + /* only forward for specific interfaces, not for the default one. */ + if (if_outgoing == BATADV_IF_DEFAULT) + goto out; + + orig_ifinfo = batadv_orig_ifinfo_new(orig_node, if_outgoing); + if (!orig_ifinfo) + goto out; + + /* acquire possibly updated router */ + router = batadv_orig_router_get(orig_node, if_outgoing); + + /* strict rule: forward packets coming from the best next hop only */ + if (neigh_node != router) + goto out; + + /* don't forward the same seqno twice on one interface */ + if (orig_ifinfo->last_seqno_forwarded == ntohl(ogm_received->seqno)) + goto out; + + orig_ifinfo->last_seqno_forwarded = ntohl(ogm_received->seqno); + if (ogm_received->ttl <= 1) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "ttl exceeded\n"); - return; + goto out; } + neigh_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing); + if (!neigh_ifinfo) + goto out; + tvlv_len = ntohs(ogm_received->tvlv_len); packet_len = BATADV_OGM2_HLEN + tvlv_len; skb = netdev_alloc_skb_ip_align(if_outgoing->net_dev, ETH_HLEN + packet_len); if (!skb) - return; + goto out; skb_reserve(skb, ETH_HLEN); skb_buff = skb_put(skb, packet_len); @@ -388,15 +359,23 @@ static void batadv_v_ogm_forward(struct batadv_priv *bat_priv, /* apply forward penalty */ ogm_forward = (struct batadv_ogm2_packet *)skb_buff; - ogm_forward->throughput = htonl(throughput); + ogm_forward->throughput = htonl(neigh_ifinfo->bat_v.throughput); ogm_forward->ttl--; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Forwarding OGM2 packet on %s: throughput %u, ttl %u, received via %s\n", - if_outgoing->net_dev->name, throughput, ogm_forward->ttl, - if_incoming->net_dev->name); + if_outgoing->net_dev->name, ntohl(ogm_forward->throughput), + ogm_forward->ttl, if_incoming->net_dev->name); batadv_v_ogm_send_to_if(skb, if_outgoing); + +out: + if (orig_ifinfo) + batadv_orig_ifinfo_put(orig_ifinfo); + if (router) + batadv_neigh_node_put(router); + if (neigh_ifinfo) + batadv_neigh_ifinfo_put(neigh_ifinfo); } /** @@ -493,8 +472,10 @@ out: * @neigh_node: the neigh_node through with the OGM has been received * @if_incoming: the interface where this packet was received * @if_outgoing: the interface for which the packet should be considered + * + * Return: true if the packet should be forwarded, false otherwise */ -static void batadv_v_ogm_route_update(struct batadv_priv *bat_priv, +static bool batadv_v_ogm_route_update(struct batadv_priv *bat_priv, const struct ethhdr *ethhdr, const struct batadv_ogm2_packet *ogm2, struct batadv_orig_node *orig_node, @@ -503,14 +484,14 @@ static void batadv_v_ogm_route_update(struct batadv_priv *bat_priv, struct batadv_hard_iface *if_outgoing) { struct batadv_neigh_node *router = NULL; - struct batadv_neigh_ifinfo *neigh_ifinfo = NULL; struct batadv_orig_node *orig_neigh_node = NULL; - struct batadv_orig_ifinfo *orig_ifinfo = NULL; struct batadv_neigh_node *orig_neigh_router = NULL; - - neigh_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing); - if (!neigh_ifinfo) - goto out; + struct batadv_neigh_ifinfo *router_ifinfo = NULL, *neigh_ifinfo = NULL; + u32 router_throughput, neigh_throughput; + u32 router_last_seqno; + u32 neigh_last_seqno; + s32 neigh_seq_diff; + bool forward = false; orig_neigh_node = batadv_v_ogm_orig_get(bat_priv, ethhdr->h_source); if (!orig_neigh_node) @@ -529,49 +510,57 @@ static void batadv_v_ogm_route_update(struct batadv_priv *bat_priv, goto out; } - if (router) { - batadv_neigh_node_put(router); - router = NULL; - } + /* Mark the OGM to be considered for forwarding, and update routes + * if needed. + */ + forward = true; - /* Update routes, and check if the OGM is from the best next hop */ - batadv_v_ogm_orig_update(bat_priv, orig_node, neigh_node, ogm2, - if_outgoing); + batadv_dbg(BATADV_DBG_BATMAN, bat_priv, + "Searching and updating originator entry of received packet\n"); - orig_ifinfo = batadv_orig_ifinfo_new(orig_node, if_outgoing); - if (!orig_ifinfo) + /* if this neighbor already is our next hop there is nothing + * to change + */ + if (router == neigh_node) goto out; - /* don't forward the same seqno twice on one interface */ - if (orig_ifinfo->last_seqno_forwarded == ntohl(ogm2->seqno)) - goto out; + /* don't consider neighbours with worse throughput. + * also switch route if this seqno is BATADV_V_MAX_ORIGDIFF newer than + * the last received seqno from our best next hop. + */ + if (router) { + router_ifinfo = batadv_neigh_ifinfo_get(router, if_outgoing); + neigh_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing); - /* acquire possibly updated router */ - router = batadv_orig_router_get(orig_node, if_outgoing); + /* if these are not allocated, something is wrong. */ + if (!router_ifinfo || !neigh_ifinfo) + goto out; - /* strict rule: forward packets coming from the best next hop only */ - if (neigh_node != router) - goto out; + neigh_last_seqno = neigh_ifinfo->bat_v.last_seqno; + router_last_seqno = router_ifinfo->bat_v.last_seqno; + neigh_seq_diff = neigh_last_seqno - router_last_seqno; + router_throughput = router_ifinfo->bat_v.throughput; + neigh_throughput = neigh_ifinfo->bat_v.throughput; - /* only forward for specific interface, not for the default one. */ - if (if_outgoing != BATADV_IF_DEFAULT) { - orig_ifinfo->last_seqno_forwarded = ntohl(ogm2->seqno); - batadv_v_ogm_forward(bat_priv, ogm2, - neigh_ifinfo->bat_v.throughput, - if_incoming, if_outgoing); + if ((neigh_seq_diff < BATADV_OGM_MAX_ORIGDIFF) && + (router_throughput >= neigh_throughput)) + goto out; } + batadv_update_route(bat_priv, orig_node, if_outgoing, neigh_node); out: - if (orig_ifinfo) - batadv_orig_ifinfo_put(orig_ifinfo); if (router) batadv_neigh_node_put(router); if (orig_neigh_router) batadv_neigh_node_put(orig_neigh_router); if (orig_neigh_node) batadv_orig_node_put(orig_neigh_node); + if (router_ifinfo) + batadv_neigh_ifinfo_put(router_ifinfo); if (neigh_ifinfo) batadv_neigh_ifinfo_put(neigh_ifinfo); + + return forward; } /** @@ -594,6 +583,7 @@ batadv_v_ogm_process_per_outif(struct batadv_priv *bat_priv, struct batadv_hard_iface *if_outgoing) { int seqno_age; + bool forward; /* first, update the metric with according sanity checks */ seqno_age = batadv_v_ogm_metric_update(bat_priv, ogm2, orig_node, @@ -612,8 +602,14 @@ batadv_v_ogm_process_per_outif(struct batadv_priv *bat_priv, ntohs(ogm2->tvlv_len)); /* if the metric update went through, update routes if needed */ - batadv_v_ogm_route_update(bat_priv, ethhdr, ogm2, orig_node, - neigh_node, if_incoming, if_outgoing); + forward = batadv_v_ogm_route_update(bat_priv, ethhdr, ogm2, orig_node, + neigh_node, if_incoming, + if_outgoing); + + /* if the routes have been processed correctly, check and forward */ + if (forward) + batadv_v_ogm_forward(bat_priv, ogm2, orig_node, neigh_node, + if_incoming, if_outgoing); } /** @@ -715,9 +711,14 @@ static void batadv_v_ogm_process(const struct sk_buff *skb, int ogm_offset, if (hard_iface->soft_iface != bat_priv->soft_iface) continue; + if (!kref_get_unless_zero(&hard_iface->refcount)) + continue; + batadv_v_ogm_process_per_outif(bat_priv, ethhdr, ogm_packet, orig_node, neigh_node, if_incoming, hard_iface); + + batadv_hardif_put(hard_iface); } rcu_read_unlock(); out: diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c index b56bb000a..a0c791383 100644 --- a/net/batman-adv/bitarray.c +++ b/net/batman-adv/bitarray.c @@ -38,11 +38,11 @@ static void batadv_bitmap_shift_left(unsigned long *seq_bits, s32 n) * the last sequence number * @set_mark: whether this packet should be marked in seq_bits * - * Return: 1 if the window was moved (either new or very old), - * 0 if the window was not moved/shifted. + * Return: true if the window was moved (either new or very old), + * false if the window was not moved/shifted. */ -int batadv_bit_get_packet(void *priv, unsigned long *seq_bits, s32 seq_num_diff, - int set_mark) +bool batadv_bit_get_packet(void *priv, unsigned long *seq_bits, + s32 seq_num_diff, int set_mark) { struct batadv_priv *bat_priv = priv; @@ -52,7 +52,7 @@ int batadv_bit_get_packet(void *priv, unsigned long *seq_bits, s32 seq_num_diff, if (seq_num_diff <= 0 && seq_num_diff > -BATADV_TQ_LOCAL_WINDOW_SIZE) { if (set_mark) batadv_set_bit(seq_bits, -seq_num_diff); - return 0; + return false; } /* sequence number is slightly newer, so we shift the window and @@ -63,7 +63,7 @@ int batadv_bit_get_packet(void *priv, unsigned long *seq_bits, s32 seq_num_diff, if (set_mark) batadv_set_bit(seq_bits, 0); - return 1; + return true; } /* sequence number is much newer, probably missed a lot of packets */ @@ -75,7 +75,7 @@ int batadv_bit_get_packet(void *priv, unsigned long *seq_bits, s32 seq_num_diff, bitmap_zero(seq_bits, BATADV_TQ_LOCAL_WINDOW_SIZE); if (set_mark) batadv_set_bit(seq_bits, 0); - return 1; + return true; } /* received a much older packet. The other host either restarted @@ -94,5 +94,5 @@ int batadv_bit_get_packet(void *priv, unsigned long *seq_bits, s32 seq_num_diff, if (set_mark) batadv_set_bit(seq_bits, 0); - return 1; + return true; } diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h index 3e41bb80e..0e6e9d090 100644 --- a/net/batman-adv/bitarray.h +++ b/net/batman-adv/bitarray.h @@ -22,6 +22,7 @@ #include <linux/bitops.h> #include <linux/compiler.h> +#include <linux/stddef.h> #include <linux/types.h> /** @@ -31,17 +32,17 @@ * @last_seqno: latest sequence number in seq_bits * @curr_seqno: sequence number to test for * - * Return: 1 if the corresponding bit in the given seq_bits indicates true - * and curr_seqno is within range of last_seqno. Otherwise returns 0. + * Return: true if the corresponding bit in the given seq_bits indicates true + * and curr_seqno is within range of last_seqno. Otherwise returns false. */ -static inline int batadv_test_bit(const unsigned long *seq_bits, - u32 last_seqno, u32 curr_seqno) +static inline bool batadv_test_bit(const unsigned long *seq_bits, + u32 last_seqno, u32 curr_seqno) { s32 diff; diff = last_seqno - curr_seqno; if (diff < 0 || diff >= BATADV_TQ_LOCAL_WINDOW_SIZE) - return 0; + return false; return test_bit(diff, seq_bits) != 0; } @@ -55,7 +56,7 @@ static inline void batadv_set_bit(unsigned long *seq_bits, s32 n) set_bit(n, seq_bits); /* turn the position on */ } -int batadv_bit_get_packet(void *priv, unsigned long *seq_bits, s32 seq_num_diff, - int set_mark); +bool batadv_bit_get_packet(void *priv, unsigned long *seq_bits, + s32 seq_num_diff, int set_mark); #endif /* _NET_BATMAN_ADV_BITARRAY_H_ */ diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 0a6c8b824..825a5cdf4 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -50,6 +50,7 @@ #include "hash.h" #include "originator.h" #include "packet.h" +#include "sysfs.h" #include "translation-table.h" static const u8 batadv_announce_mac[4] = {0x43, 0x05, 0x43, 0x05}; @@ -100,10 +101,10 @@ static inline u32 batadv_choose_backbone_gw(const void *data, u32 size) * @node: list node of the first entry to compare * @data2: pointer to the second backbone gateway * - * Return: 1 if the backbones have the same data, 0 otherwise + * Return: true if the backbones have the same data, false otherwise */ -static int batadv_compare_backbone_gw(const struct hlist_node *node, - const void *data2) +static bool batadv_compare_backbone_gw(const struct hlist_node *node, + const void *data2) { const void *data1 = container_of(node, struct batadv_bla_backbone_gw, hash_entry); @@ -111,23 +112,23 @@ static int batadv_compare_backbone_gw(const struct hlist_node *node, const struct batadv_bla_backbone_gw *gw2 = data2; if (!batadv_compare_eth(gw1->orig, gw2->orig)) - return 0; + return false; if (gw1->vid != gw2->vid) - return 0; + return false; - return 1; + return true; } /** - * batadv_compare_backbone_gw - compare address and vid of two claims + * batadv_compare_claim - compare address and vid of two claims * @node: list node of the first entry to compare * @data2: pointer to the second claims * - * Return: 1 if the claim have the same data, 0 otherwise + * Return: true if the claim have the same data, 0 otherwise */ -static int batadv_compare_claim(const struct hlist_node *node, - const void *data2) +static bool batadv_compare_claim(const struct hlist_node *node, + const void *data2) { const void *data1 = container_of(node, struct batadv_bla_claim, hash_entry); @@ -135,12 +136,12 @@ static int batadv_compare_claim(const struct hlist_node *node, const struct batadv_bla_claim *cl2 = data2; if (!batadv_compare_eth(cl1->addr, cl2->addr)) - return 0; + return false; if (cl1->vid != cl2->vid) - return 0; + return false; - return 1; + return true; } /** @@ -176,10 +177,21 @@ static void batadv_backbone_gw_put(struct batadv_bla_backbone_gw *backbone_gw) static void batadv_claim_release(struct kref *ref) { struct batadv_bla_claim *claim; + struct batadv_bla_backbone_gw *old_backbone_gw; claim = container_of(ref, struct batadv_bla_claim, refcount); - batadv_backbone_gw_put(claim->backbone_gw); + spin_lock_bh(&claim->backbone_lock); + old_backbone_gw = claim->backbone_gw; + claim->backbone_gw = NULL; + spin_unlock_bh(&claim->backbone_lock); + + spin_lock_bh(&old_backbone_gw->crc_lock); + old_backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN); + spin_unlock_bh(&old_backbone_gw->crc_lock); + + batadv_backbone_gw_put(old_backbone_gw); + kfree_rcu(claim, rcu); } @@ -200,9 +212,9 @@ static void batadv_claim_put(struct batadv_bla_claim *claim) * * Return: claim if found or NULL otherwise. */ -static struct batadv_bla_claim -*batadv_claim_hash_find(struct batadv_priv *bat_priv, - struct batadv_bla_claim *data) +static struct batadv_bla_claim * +batadv_claim_hash_find(struct batadv_priv *bat_priv, + struct batadv_bla_claim *data) { struct batadv_hashtable *hash = bat_priv->bla.claim_hash; struct hlist_head *head; @@ -407,11 +419,22 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, u8 *mac, ethhdr->h_source, ethhdr->h_dest, BATADV_PRINT_VID(vid)); break; + case BATADV_CLAIM_TYPE_LOOPDETECT: + ether_addr_copy(ethhdr->h_source, mac); + batadv_dbg(BATADV_DBG_BLA, bat_priv, + "bla_send_claim(): LOOPDETECT of %pM to %pM on vid %d\n", + ethhdr->h_source, ethhdr->h_dest, + BATADV_PRINT_VID(vid)); + + break; } - if (vid & BATADV_VLAN_HAS_TAG) + if (vid & BATADV_VLAN_HAS_TAG) { skb = vlan_insert_tag(skb, htons(ETH_P_8021Q), vid & VLAN_VID_MASK); + if (!skb) + goto out; + } skb_reset_mac_header(skb); skb->protocol = eth_type_trans(skb, soft_iface); @@ -427,6 +450,36 @@ out: } /** + * batadv_bla_loopdetect_report - worker for reporting the loop + * @work: work queue item + * + * Throws an uevent, as the loopdetect check function can't do that itself + * since the kernel may sleep while throwing uevents. + */ +static void batadv_bla_loopdetect_report(struct work_struct *work) +{ + struct batadv_bla_backbone_gw *backbone_gw; + struct batadv_priv *bat_priv; + char vid_str[6] = { '\0' }; + + backbone_gw = container_of(work, struct batadv_bla_backbone_gw, + report_work); + bat_priv = backbone_gw->bat_priv; + + batadv_info(bat_priv->soft_iface, + "Possible loop on VLAN %d detected which can't be handled by BLA - please check your network setup!\n", + BATADV_PRINT_VID(backbone_gw->vid)); + snprintf(vid_str, sizeof(vid_str), "%d", + BATADV_PRINT_VID(backbone_gw->vid)); + vid_str[sizeof(vid_str) - 1] = 0; + + batadv_throw_uevent(bat_priv, BATADV_UEV_BLA, BATADV_UEV_LOOPDETECT, + vid_str); + + batadv_backbone_gw_put(backbone_gw); +} + +/** * batadv_bla_get_backbone_gw - finds or creates a backbone gateway * @bat_priv: the bat priv with all the soft interface information * @orig: the mac address of the originator @@ -464,6 +517,7 @@ batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, u8 *orig, atomic_set(&entry->request_sent, 0); atomic_set(&entry->wait_periods, 0); ether_addr_copy(entry->orig, orig); + INIT_WORK(&entry->report_work, batadv_bla_loopdetect_report); /* one for the hash, one for returning */ kref_init(&entry->refcount); @@ -634,8 +688,10 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv, const u8 *mac, const unsigned short vid, struct batadv_bla_backbone_gw *backbone_gw) { + struct batadv_bla_backbone_gw *old_backbone_gw; struct batadv_bla_claim *claim; struct batadv_bla_claim search_claim; + bool remove_crc = false; int hash_added; ether_addr_copy(search_claim.addr, mac); @@ -649,8 +705,10 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv, return; ether_addr_copy(claim->addr, mac); + spin_lock_init(&claim->backbone_lock); claim->vid = vid; claim->lasttime = jiffies; + kref_get(&backbone_gw->refcount); claim->backbone_gw = backbone_gw; kref_init(&claim->refcount); @@ -678,15 +736,26 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv, "bla_add_claim(): changing ownership for %pM, vid %d\n", mac, BATADV_PRINT_VID(vid)); - spin_lock_bh(&claim->backbone_gw->crc_lock); - claim->backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN); - spin_unlock_bh(&claim->backbone_gw->crc_lock); - batadv_backbone_gw_put(claim->backbone_gw); + remove_crc = true; } - /* set (new) backbone gw */ + + /* replace backbone_gw atomically and adjust reference counters */ + spin_lock_bh(&claim->backbone_lock); + old_backbone_gw = claim->backbone_gw; kref_get(&backbone_gw->refcount); claim->backbone_gw = backbone_gw; + spin_unlock_bh(&claim->backbone_lock); + if (remove_crc) { + /* remove claim address from old backbone_gw */ + spin_lock_bh(&old_backbone_gw->crc_lock); + old_backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN); + spin_unlock_bh(&old_backbone_gw->crc_lock); + } + + batadv_backbone_gw_put(old_backbone_gw); + + /* add claim address to new backbone_gw */ spin_lock_bh(&backbone_gw->crc_lock); backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN); spin_unlock_bh(&backbone_gw->crc_lock); @@ -697,6 +766,26 @@ claim_free_ref: } /** + * batadv_bla_claim_get_backbone_gw - Get valid reference for backbone_gw of + * claim + * @claim: claim whose backbone_gw should be returned + * + * Return: valid reference to claim::backbone_gw + */ +static struct batadv_bla_backbone_gw * +batadv_bla_claim_get_backbone_gw(struct batadv_bla_claim *claim) +{ + struct batadv_bla_backbone_gw *backbone_gw; + + spin_lock_bh(&claim->backbone_lock); + backbone_gw = claim->backbone_gw; + kref_get(&backbone_gw->refcount); + spin_unlock_bh(&claim->backbone_lock); + + return backbone_gw; +} + +/** * batadv_bla_del_claim - delete a claim from the claim hash * @bat_priv: the bat priv with all the soft interface information * @mac: mac address of the claim to be removed @@ -720,10 +809,6 @@ static void batadv_bla_del_claim(struct batadv_priv *bat_priv, batadv_choose_claim, claim); batadv_claim_put(claim); /* reference from the hash is gone */ - spin_lock_bh(&claim->backbone_gw->crc_lock); - claim->backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN); - spin_unlock_bh(&claim->backbone_gw->crc_lock); - /* don't need the reference from hash_find() anymore */ batadv_claim_put(claim); } @@ -735,22 +820,22 @@ static void batadv_bla_del_claim(struct batadv_priv *bat_priv, * @backbone_addr: originator address of the sender (Ethernet source MAC) * @vid: the VLAN ID of the frame * - * Return: 1 if handled + * Return: true if handled */ -static int batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr, - u8 *backbone_addr, unsigned short vid) +static bool batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr, + u8 *backbone_addr, unsigned short vid) { struct batadv_bla_backbone_gw *backbone_gw; u16 backbone_crc, crc; if (memcmp(an_addr, batadv_announce_mac, 4) != 0) - return 0; + return false; backbone_gw = batadv_bla_get_backbone_gw(bat_priv, backbone_addr, vid, false); if (unlikely(!backbone_gw)) - return 1; + return true; /* handle as ANNOUNCE frame */ backbone_gw->lasttime = jiffies; @@ -783,7 +868,7 @@ static int batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr, } batadv_backbone_gw_put(backbone_gw); - return 1; + return true; } /** @@ -794,29 +879,29 @@ static int batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr, * @ethhdr: ethernet header of a packet * @vid: the VLAN ID of the frame * - * Return: 1 if handled + * Return: true if handled */ -static int batadv_handle_request(struct batadv_priv *bat_priv, - struct batadv_hard_iface *primary_if, - u8 *backbone_addr, struct ethhdr *ethhdr, - unsigned short vid) +static bool batadv_handle_request(struct batadv_priv *bat_priv, + struct batadv_hard_iface *primary_if, + u8 *backbone_addr, struct ethhdr *ethhdr, + unsigned short vid) { /* check for REQUEST frame */ if (!batadv_compare_eth(backbone_addr, ethhdr->h_dest)) - return 0; + return false; /* sanity check, this should not happen on a normal switch, * we ignore it in this case. */ if (!batadv_compare_eth(ethhdr->h_dest, primary_if->net_dev->dev_addr)) - return 1; + return true; batadv_dbg(BATADV_DBG_BLA, bat_priv, "handle_request(): REQUEST vid %d (sent by %pM)...\n", BATADV_PRINT_VID(vid), ethhdr->h_source); batadv_bla_answer_request(bat_priv, primary_if, vid); - return 1; + return true; } /** @@ -827,12 +912,12 @@ static int batadv_handle_request(struct batadv_priv *bat_priv, * @claim_addr: Client to be unclaimed (ARP sender HW MAC) * @vid: the VLAN ID of the frame * - * Return: 1 if handled + * Return: true if handled */ -static int batadv_handle_unclaim(struct batadv_priv *bat_priv, - struct batadv_hard_iface *primary_if, - u8 *backbone_addr, u8 *claim_addr, - unsigned short vid) +static bool batadv_handle_unclaim(struct batadv_priv *bat_priv, + struct batadv_hard_iface *primary_if, + u8 *backbone_addr, u8 *claim_addr, + unsigned short vid) { struct batadv_bla_backbone_gw *backbone_gw; @@ -845,7 +930,7 @@ static int batadv_handle_unclaim(struct batadv_priv *bat_priv, backbone_gw = batadv_backbone_hash_find(bat_priv, backbone_addr, vid); if (!backbone_gw) - return 1; + return true; /* this must be an UNCLAIM frame */ batadv_dbg(BATADV_DBG_BLA, bat_priv, @@ -854,7 +939,7 @@ static int batadv_handle_unclaim(struct batadv_priv *bat_priv, batadv_bla_del_claim(bat_priv, claim_addr, vid); batadv_backbone_gw_put(backbone_gw); - return 1; + return true; } /** @@ -865,12 +950,12 @@ static int batadv_handle_unclaim(struct batadv_priv *bat_priv, * @claim_addr: client mac address to be claimed (ARP sender HW MAC) * @vid: the VLAN ID of the frame * - * Return: 1 if handled + * Return: true if handled */ -static int batadv_handle_claim(struct batadv_priv *bat_priv, - struct batadv_hard_iface *primary_if, - u8 *backbone_addr, u8 *claim_addr, - unsigned short vid) +static bool batadv_handle_claim(struct batadv_priv *bat_priv, + struct batadv_hard_iface *primary_if, + u8 *backbone_addr, u8 *claim_addr, + unsigned short vid) { struct batadv_bla_backbone_gw *backbone_gw; @@ -880,7 +965,7 @@ static int batadv_handle_claim(struct batadv_priv *bat_priv, false); if (unlikely(!backbone_gw)) - return 1; + return true; /* this must be a CLAIM frame */ batadv_bla_add_claim(bat_priv, claim_addr, vid, backbone_gw); @@ -891,7 +976,7 @@ static int batadv_handle_claim(struct batadv_priv *bat_priv, /* TODO: we could call something like tt_local_del() here. */ batadv_backbone_gw_put(backbone_gw); - return 1; + return true; } /** @@ -975,12 +1060,12 @@ static int batadv_check_claim_group(struct batadv_priv *bat_priv, * @primary_if: the primary hard interface of this batman soft interface * @skb: the frame to be checked * - * Return: 1 if it was a claim frame, otherwise return 0 to + * Return: true if it was a claim frame, otherwise return false to * tell the callee that it can use the frame on its own. */ -static int batadv_bla_process_claim(struct batadv_priv *bat_priv, - struct batadv_hard_iface *primary_if, - struct sk_buff *skb) +static bool batadv_bla_process_claim(struct batadv_priv *bat_priv, + struct batadv_hard_iface *primary_if, + struct sk_buff *skb) { struct batadv_bla_claim_dst *bla_dst, *bla_dst_own; u8 *hw_src, *hw_dst; @@ -1011,7 +1096,7 @@ static int batadv_bla_process_claim(struct batadv_priv *bat_priv, vhdr = skb_header_pointer(skb, headlen, VLAN_HLEN, &vhdr_buf); if (!vhdr) - return 0; + return false; proto = vhdr->h_vlan_encapsulated_proto; headlen += VLAN_HLEN; @@ -1020,12 +1105,12 @@ static int batadv_bla_process_claim(struct batadv_priv *bat_priv, } if (proto != htons(ETH_P_ARP)) - return 0; /* not a claim frame */ + return false; /* not a claim frame */ /* this must be a ARP frame. check if it is a claim. */ if (unlikely(!pskb_may_pull(skb, headlen + arp_hdr_len(skb->dev)))) - return 0; + return false; /* pskb_may_pull() may have modified the pointers, get ethhdr again */ ethhdr = eth_hdr(skb); @@ -1035,13 +1120,13 @@ static int batadv_bla_process_claim(struct batadv_priv *bat_priv, * IP information */ if (arphdr->ar_hrd != htons(ARPHRD_ETHER)) - return 0; + return false; if (arphdr->ar_pro != htons(ETH_P_IP)) - return 0; + return false; if (arphdr->ar_hln != ETH_ALEN) - return 0; + return false; if (arphdr->ar_pln != 4) - return 0; + return false; hw_src = (u8 *)arphdr + sizeof(struct arphdr); hw_dst = hw_src + ETH_ALEN + 4; @@ -1051,14 +1136,18 @@ static int batadv_bla_process_claim(struct batadv_priv *bat_priv, /* check if it is a claim frame in general */ if (memcmp(bla_dst->magic, bla_dst_own->magic, sizeof(bla_dst->magic)) != 0) - return 0; + return false; /* check if there is a claim frame encapsulated deeper in (QinQ) and * drop that, as this is not supported by BLA but should also not be * sent via the mesh. */ if (vlan_depth > 1) - return 1; + return true; + + /* Let the loopdetect frames on the mesh in any case. */ + if (bla_dst->type == BATADV_CLAIM_TYPE_LOOPDETECT) + return 0; /* check if it is a claim frame. */ ret = batadv_check_claim_group(bat_priv, primary_if, hw_src, hw_dst, @@ -1070,7 +1159,7 @@ static int batadv_bla_process_claim(struct batadv_priv *bat_priv, hw_dst); if (ret < 2) - return ret; + return !!ret; /* become a backbone gw ourselves on this vlan if not happened yet */ batadv_bla_update_own_backbone_gw(bat_priv, primary_if, vid); @@ -1080,30 +1169,30 @@ static int batadv_bla_process_claim(struct batadv_priv *bat_priv, case BATADV_CLAIM_TYPE_CLAIM: if (batadv_handle_claim(bat_priv, primary_if, hw_src, ethhdr->h_source, vid)) - return 1; + return true; break; case BATADV_CLAIM_TYPE_UNCLAIM: if (batadv_handle_unclaim(bat_priv, primary_if, ethhdr->h_source, hw_src, vid)) - return 1; + return true; break; case BATADV_CLAIM_TYPE_ANNOUNCE: if (batadv_handle_announce(bat_priv, hw_src, ethhdr->h_source, vid)) - return 1; + return true; break; case BATADV_CLAIM_TYPE_REQUEST: if (batadv_handle_request(bat_priv, primary_if, hw_src, ethhdr, vid)) - return 1; + return true; break; } batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla_process_claim(): ERROR - this looks like a claim frame, but is useless. eth src %pM on vid %d ...(hw_src %pM, hw_dst %pM)\n", ethhdr->h_source, BATADV_PRINT_VID(vid), hw_src, hw_dst); - return 1; + return true; } /** @@ -1172,6 +1261,7 @@ static void batadv_bla_purge_claims(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, int now) { + struct batadv_bla_backbone_gw *backbone_gw; struct batadv_bla_claim *claim; struct hlist_head *head; struct batadv_hashtable *hash; @@ -1186,14 +1276,17 @@ static void batadv_bla_purge_claims(struct batadv_priv *bat_priv, rcu_read_lock(); hlist_for_each_entry_rcu(claim, head, hash_entry) { + backbone_gw = batadv_bla_claim_get_backbone_gw(claim); if (now) goto purge_now; - if (!batadv_compare_eth(claim->backbone_gw->orig, + + if (!batadv_compare_eth(backbone_gw->orig, primary_if->net_dev->dev_addr)) - continue; + goto skip; + if (!batadv_has_timed_out(claim->lasttime, BATADV_BLA_CLAIM_TIMEOUT)) - continue; + goto skip; batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla_purge_claims(): %pM, vid %d, time out\n", @@ -1201,8 +1294,10 @@ static void batadv_bla_purge_claims(struct batadv_priv *bat_priv, purge_now: batadv_handle_unclaim(bat_priv, primary_if, - claim->backbone_gw->orig, + backbone_gw->orig, claim->addr, claim->vid); +skip: + batadv_backbone_gw_put(backbone_gw); } rcu_read_unlock(); } @@ -1265,6 +1360,26 @@ void batadv_bla_update_orig_address(struct batadv_priv *bat_priv, } /** + * batadv_bla_send_loopdetect - send a loopdetect frame + * @bat_priv: the bat priv with all the soft interface information + * @backbone_gw: the backbone gateway for which a loop should be detected + * + * To detect loops that the bridge loop avoidance can't handle, send a loop + * detection packet on the backbone. Unlike other BLA frames, this frame will + * be allowed on the mesh by other nodes. If it is received on the mesh, this + * indicates that there is a loop. + */ +static void +batadv_bla_send_loopdetect(struct batadv_priv *bat_priv, + struct batadv_bla_backbone_gw *backbone_gw) +{ + batadv_dbg(BATADV_DBG_BLA, bat_priv, "Send loopdetect frame for vid %d\n", + backbone_gw->vid); + batadv_bla_send_claim(bat_priv, bat_priv->bla.loopdetect_addr, + backbone_gw->vid, BATADV_CLAIM_TYPE_LOOPDETECT); +} + +/** * batadv_bla_status_update - purge bla interfaces if necessary * @net_dev: the soft interface net device */ @@ -1301,9 +1416,10 @@ static void batadv_bla_periodic_work(struct work_struct *work) struct batadv_bla_backbone_gw *backbone_gw; struct batadv_hashtable *hash; struct batadv_hard_iface *primary_if; + bool send_loopdetect = false; int i; - delayed_work = container_of(work, struct delayed_work, work); + delayed_work = to_delayed_work(work); priv_bla = container_of(delayed_work, struct batadv_priv_bla, work); bat_priv = container_of(priv_bla, struct batadv_priv, bla); primary_if = batadv_primary_if_get_selected(bat_priv); @@ -1316,6 +1432,22 @@ static void batadv_bla_periodic_work(struct work_struct *work) if (!atomic_read(&bat_priv->bridge_loop_avoidance)) goto out; + if (atomic_dec_and_test(&bat_priv->bla.loopdetect_next)) { + /* set a new random mac address for the next bridge loop + * detection frames. Set the locally administered bit to avoid + * collisions with users mac addresses. + */ + random_ether_addr(bat_priv->bla.loopdetect_addr); + bat_priv->bla.loopdetect_addr[0] = 0xba; + bat_priv->bla.loopdetect_addr[1] = 0xbe; + bat_priv->bla.loopdetect_lasttime = jiffies; + atomic_set(&bat_priv->bla.loopdetect_next, + BATADV_BLA_LOOPDETECT_PERIODS); + + /* mark for sending loop detect on all VLANs */ + send_loopdetect = true; + } + hash = bat_priv->bla.backbone_hash; if (!hash) goto out; @@ -1332,6 +1464,9 @@ static void batadv_bla_periodic_work(struct work_struct *work) backbone_gw->lasttime = jiffies; batadv_bla_send_announce(bat_priv, backbone_gw); + if (send_loopdetect) + batadv_bla_send_loopdetect(bat_priv, + backbone_gw); /* request_sent is only set after creation to avoid * problems when we are not yet known as backbone gw @@ -1405,6 +1540,9 @@ int batadv_bla_init(struct batadv_priv *bat_priv) bat_priv->bla.bcast_duplist[i].entrytime = entrytime; bat_priv->bla.bcast_duplist_curr = 0; + atomic_set(&bat_priv->bla.loopdetect_next, + BATADV_BLA_LOOPDETECT_PERIODS); + if (bat_priv->bla.claim_hash) return 0; @@ -1442,15 +1580,16 @@ int batadv_bla_init(struct batadv_priv *bat_priv) * sent by another host, drop it. We allow equal packets from * the same host however as this might be intended. * - * Return: 1 if a packet is in the duplicate list, 0 otherwise. + * Return: true if a packet is in the duplicate list, false otherwise. */ -int batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv, - struct sk_buff *skb) +bool batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv, + struct sk_buff *skb) { - int i, curr, ret = 0; + int i, curr; __be32 crc; struct batadv_bcast_packet *bcast_packet; struct batadv_bcast_duplist_entry *entry; + bool ret = false; bcast_packet = (struct batadv_bcast_packet *)skb->data; @@ -1478,9 +1617,9 @@ int batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv, continue; /* this entry seems to match: same crc, not too old, - * and from another gw. therefore return 1 to forbid it. + * and from another gw. therefore return true to forbid it. */ - ret = 1; + ret = true; goto out; } /* not found, add a new entry (overwrite the oldest entry) @@ -1546,21 +1685,21 @@ bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig, * @orig_node: the orig_node of the frame * @hdr_size: maximum length of the frame * - * Return: 1 if the orig_node is also a gateway on the soft interface, otherwise - * it returns 0. + * Return: true if the orig_node is also a gateway on the soft interface, + * otherwise it returns false. */ -int batadv_bla_is_backbone_gw(struct sk_buff *skb, - struct batadv_orig_node *orig_node, int hdr_size) +bool batadv_bla_is_backbone_gw(struct sk_buff *skb, + struct batadv_orig_node *orig_node, int hdr_size) { struct batadv_bla_backbone_gw *backbone_gw; unsigned short vid; if (!atomic_read(&orig_node->bat_priv->bridge_loop_avoidance)) - return 0; + return false; /* first, find out the vid. */ if (!pskb_may_pull(skb, hdr_size + ETH_HLEN)) - return 0; + return false; vid = batadv_get_vid(skb, hdr_size); @@ -1568,14 +1707,14 @@ int batadv_bla_is_backbone_gw(struct sk_buff *skb, backbone_gw = batadv_backbone_hash_find(orig_node->bat_priv, orig_node->orig, vid); if (!backbone_gw) - return 0; + return false; batadv_backbone_gw_put(backbone_gw); - return 1; + return true; } /** - * batadv_bla_init - free all bla structures + * batadv_bla_free - free all bla structures * @bat_priv: the bat priv with all the soft interface information * * for softinterface free or module unload @@ -1602,6 +1741,55 @@ void batadv_bla_free(struct batadv_priv *bat_priv) } /** + * batadv_bla_loopdetect_check - check and handle a detected loop + * @bat_priv: the bat priv with all the soft interface information + * @skb: the packet to check + * @primary_if: interface where the request came on + * @vid: the VLAN ID of the frame + * + * Checks if this packet is a loop detect frame which has been sent by us, + * throw an uevent and log the event if that is the case. + * + * Return: true if it is a loop detect frame which is to be dropped, false + * otherwise. + */ +static bool +batadv_bla_loopdetect_check(struct batadv_priv *bat_priv, struct sk_buff *skb, + struct batadv_hard_iface *primary_if, + unsigned short vid) +{ + struct batadv_bla_backbone_gw *backbone_gw; + struct ethhdr *ethhdr; + + ethhdr = eth_hdr(skb); + + /* Only check for the MAC address and skip more checks here for + * performance reasons - this function is on the hotpath, after all. + */ + if (!batadv_compare_eth(ethhdr->h_source, + bat_priv->bla.loopdetect_addr)) + return false; + + /* If the packet came too late, don't forward it on the mesh + * but don't consider that as loop. It might be a coincidence. + */ + if (batadv_has_timed_out(bat_priv->bla.loopdetect_lasttime, + BATADV_BLA_LOOPDETECT_TIMEOUT)) + return true; + + backbone_gw = batadv_bla_get_backbone_gw(bat_priv, + primary_if->net_dev->dev_addr, + vid, true); + if (unlikely(!backbone_gw)) + return true; + + queue_work(batadv_event_workqueue, &backbone_gw->report_work); + /* backbone_gw is unreferenced in the report work function function */ + + return true; +} + +/** * batadv_bla_rx - check packets coming from the mesh. * @bat_priv: the bat priv with all the soft interface information * @skb: the frame to be checked @@ -1614,16 +1802,18 @@ void batadv_bla_free(struct batadv_priv *bat_priv) * * in these cases, the skb is further handled by this function * - * Return: 1 if handled, otherwise it returns 0 and the caller shall further - * process the skb. + * Return: true if handled, otherwise it returns false and the caller shall + * further process the skb. */ -int batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb, - unsigned short vid, bool is_bcast) +bool batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb, + unsigned short vid, bool is_bcast) { + struct batadv_bla_backbone_gw *backbone_gw; struct ethhdr *ethhdr; struct batadv_bla_claim search_claim, *claim = NULL; struct batadv_hard_iface *primary_if; - int ret; + bool own_claim; + bool ret; ethhdr = eth_hdr(skb); @@ -1634,6 +1824,9 @@ int batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb, if (!atomic_read(&bat_priv->bridge_loop_avoidance)) goto allow; + if (batadv_bla_loopdetect_check(bat_priv, skb, primary_if, vid)) + goto handled; + if (unlikely(atomic_read(&bat_priv->bla.num_requests))) /* don't allow broadcasts while requests are in flight */ if (is_multicast_ether_addr(ethhdr->h_dest) && is_bcast) @@ -1654,8 +1847,12 @@ int batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb, } /* if it is our own claim ... */ - if (batadv_compare_eth(claim->backbone_gw->orig, - primary_if->net_dev->dev_addr)) { + backbone_gw = batadv_bla_claim_get_backbone_gw(claim); + own_claim = batadv_compare_eth(backbone_gw->orig, + primary_if->net_dev->dev_addr); + batadv_backbone_gw_put(backbone_gw); + + if (own_claim) { /* ... allow it in any case */ claim->lasttime = jiffies; goto allow; @@ -1682,12 +1879,12 @@ int batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb, } allow: batadv_bla_update_own_backbone_gw(bat_priv, primary_if, vid); - ret = 0; + ret = false; goto out; handled: kfree_skb(skb); - ret = 1; + ret = true; out: if (primary_if) @@ -1711,16 +1908,18 @@ out: * * This call might reallocate skb data. * - * Return: 1 if handled, otherwise it returns 0 and the caller shall further - * process the skb. + * Return: true if handled, otherwise it returns false and the caller shall + * further process the skb. */ -int batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb, - unsigned short vid) +bool batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb, + unsigned short vid) { struct ethhdr *ethhdr; struct batadv_bla_claim search_claim, *claim = NULL; + struct batadv_bla_backbone_gw *backbone_gw; struct batadv_hard_iface *primary_if; - int ret = 0; + bool client_roamed; + bool ret = false; primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) @@ -1749,8 +1948,12 @@ int batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb, goto allow; /* check if we are responsible. */ - if (batadv_compare_eth(claim->backbone_gw->orig, - primary_if->net_dev->dev_addr)) { + backbone_gw = batadv_bla_claim_get_backbone_gw(claim); + client_roamed = batadv_compare_eth(backbone_gw->orig, + primary_if->net_dev->dev_addr); + batadv_backbone_gw_put(backbone_gw); + + if (client_roamed) { /* if yes, the client has roamed and we have * to unclaim it. */ @@ -1774,10 +1977,10 @@ int batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb, } allow: batadv_bla_update_own_backbone_gw(bat_priv, primary_if, vid); - ret = 0; + ret = false; goto out; handled: - ret = 1; + ret = true; out: if (primary_if) batadv_hardif_put(primary_if); @@ -1798,6 +2001,7 @@ int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset) struct net_device *net_dev = (struct net_device *)seq->private; struct batadv_priv *bat_priv = netdev_priv(net_dev); struct batadv_hashtable *hash = bat_priv->bla.claim_hash; + struct batadv_bla_backbone_gw *backbone_gw; struct batadv_bla_claim *claim; struct batadv_hard_iface *primary_if; struct hlist_head *head; @@ -1815,24 +2019,28 @@ int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset) "Claims announced for the mesh %s (orig %pM, group id %#.4x)\n", net_dev->name, primary_addr, ntohs(bat_priv->bla.claim_dest.group)); - seq_printf(seq, " %-17s %-5s %-17s [o] (%-6s)\n", - "Client", "VID", "Originator", "CRC"); + seq_puts(seq, + " Client VID Originator [o] (CRC )\n"); for (i = 0; i < hash->size; i++) { head = &hash->table[i]; rcu_read_lock(); hlist_for_each_entry_rcu(claim, head, hash_entry) { - is_own = batadv_compare_eth(claim->backbone_gw->orig, + backbone_gw = batadv_bla_claim_get_backbone_gw(claim); + + is_own = batadv_compare_eth(backbone_gw->orig, primary_addr); - spin_lock_bh(&claim->backbone_gw->crc_lock); - backbone_crc = claim->backbone_gw->crc; - spin_unlock_bh(&claim->backbone_gw->crc_lock); + spin_lock_bh(&backbone_gw->crc_lock); + backbone_crc = backbone_gw->crc; + spin_unlock_bh(&backbone_gw->crc_lock); seq_printf(seq, " * %pM on %5d by %pM [%c] (%#.4x)\n", claim->addr, BATADV_PRINT_VID(claim->vid), - claim->backbone_gw->orig, + backbone_gw->orig, (is_own ? 'x' : ' '), backbone_crc); + + batadv_backbone_gw_put(backbone_gw); } rcu_read_unlock(); } @@ -1873,8 +2081,7 @@ int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq, void *offset) "Backbones announced for the mesh %s (orig %pM, group id %#.4x)\n", net_dev->name, primary_addr, ntohs(bat_priv->bla.claim_dest.group)); - seq_printf(seq, " %-17s %-5s %-9s (%-6s)\n", - "Originator", "VID", "last seen", "CRC"); + seq_puts(seq, " Originator VID last seen (CRC )\n"); for (i = 0; i < hash->size; i++) { head = &hash->table[i]; diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h index 579f0fa6f..0f01daeb3 100644 --- a/net/batman-adv/bridge_loop_avoidance.h +++ b/net/batman-adv/bridge_loop_avoidance.h @@ -27,19 +27,20 @@ struct seq_file; struct sk_buff; #ifdef CONFIG_BATMAN_ADV_BLA -int batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb, - unsigned short vid, bool is_bcast); -int batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb, - unsigned short vid); -int batadv_bla_is_backbone_gw(struct sk_buff *skb, - struct batadv_orig_node *orig_node, int hdr_size); +bool batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb, + unsigned short vid, bool is_bcast); +bool batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb, + unsigned short vid); +bool batadv_bla_is_backbone_gw(struct sk_buff *skb, + struct batadv_orig_node *orig_node, + int hdr_size); int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset); int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq, void *offset); bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig, unsigned short vid); -int batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv, - struct sk_buff *skb); +bool batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv, + struct sk_buff *skb); void batadv_bla_update_orig_address(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, struct batadv_hard_iface *oldif); @@ -50,24 +51,24 @@ void batadv_bla_free(struct batadv_priv *bat_priv); #define BATADV_BLA_CRC_INIT 0 #else /* ifdef CONFIG_BATMAN_ADV_BLA */ -static inline int batadv_bla_rx(struct batadv_priv *bat_priv, - struct sk_buff *skb, unsigned short vid, - bool is_bcast) +static inline bool batadv_bla_rx(struct batadv_priv *bat_priv, + struct sk_buff *skb, unsigned short vid, + bool is_bcast) { - return 0; + return false; } -static inline int batadv_bla_tx(struct batadv_priv *bat_priv, - struct sk_buff *skb, unsigned short vid) +static inline bool batadv_bla_tx(struct batadv_priv *bat_priv, + struct sk_buff *skb, unsigned short vid) { - return 0; + return false; } -static inline int batadv_bla_is_backbone_gw(struct sk_buff *skb, - struct batadv_orig_node *orig_node, - int hdr_size) +static inline bool batadv_bla_is_backbone_gw(struct sk_buff *skb, + struct batadv_orig_node *orig_node, + int hdr_size) { - return 0; + return false; } static inline int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, @@ -88,11 +89,11 @@ static inline bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, return false; } -static inline int +static inline bool batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv, struct sk_buff *skb) { - return 0; + return false; } static inline void diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c index 48253cf83..952900466 100644 --- a/net/batman-adv/debugfs.c +++ b/net/batman-adv/debugfs.c @@ -134,7 +134,7 @@ static int batadv_log_release(struct inode *inode, struct file *file) return 0; } -static int batadv_log_empty(struct batadv_priv_debug_log *debug_log) +static bool batadv_log_empty(struct batadv_priv_debug_log *debug_log) { return !(debug_log->log_start - debug_log->log_end); } @@ -365,14 +365,17 @@ static int batadv_nc_nodes_open(struct inode *inode, struct file *file) #define BATADV_DEBUGINFO(_name, _mode, _open) \ struct batadv_debuginfo batadv_debuginfo_##_name = { \ - .attr = { .name = __stringify(_name), \ - .mode = _mode, }, \ - .fops = { .owner = THIS_MODULE, \ - .open = _open, \ - .read = seq_read, \ - .llseek = seq_lseek, \ - .release = single_release, \ - } \ + .attr = { \ + .name = __stringify(_name), \ + .mode = _mode, \ + }, \ + .fops = { \ + .owner = THIS_MODULE, \ + .open = _open, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ + }, \ } /* the following attributes are general and therefore they will be directly diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index 3e6b2624f..aee3b3991 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -152,7 +152,7 @@ static void batadv_dat_purge(struct work_struct *work) struct batadv_priv_dat *priv_dat; struct batadv_priv *bat_priv; - delayed_work = container_of(work, struct delayed_work, work); + delayed_work = to_delayed_work(work); priv_dat = container_of(delayed_work, struct batadv_priv_dat, work); bat_priv = container_of(priv_dat, struct batadv_priv, dat); @@ -165,14 +165,14 @@ static void batadv_dat_purge(struct work_struct *work) * @node: node in the local table * @data2: second object to compare the node to * - * Return: 1 if the two entries are the same, 0 otherwise. + * Return: true if the two entries are the same, false otherwise. */ -static int batadv_compare_dat(const struct hlist_node *node, const void *data2) +static bool batadv_compare_dat(const struct hlist_node *node, const void *data2) { const void *data1 = container_of(node, struct batadv_dat_entry, hash_entry); - return memcmp(data1, data2, sizeof(__be32)) == 0 ? 1 : 0; + return memcmp(data1, data2, sizeof(__be32)) == 0; } /** @@ -720,7 +720,7 @@ void batadv_dat_status_update(struct net_device *net_dev) } /** - * batadv_gw_tvlv_ogm_handler_v1 - process incoming dat tvlv container + * batadv_dat_tvlv_ogm_handler_v1 - process incoming dat tvlv container * @bat_priv: the bat priv with all the soft interface information * @orig: the orig_node of the ogm * @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags) @@ -817,8 +817,8 @@ int batadv_dat_cache_seq_print_text(struct seq_file *seq, void *offset) goto out; seq_printf(seq, "Distributed ARP Table (%s):\n", net_dev->name); - seq_printf(seq, " %-7s %-9s %4s %11s\n", "IPv4", - "MAC", "VID", "last-seen"); + seq_puts(seq, + " IPv4 MAC VID last-seen\n"); for (i = 0; i < hash->size; i++) { head = &hash->table[i]; @@ -1009,9 +1009,12 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv, if (!skb_new) goto out; - if (vid & BATADV_VLAN_HAS_TAG) + if (vid & BATADV_VLAN_HAS_TAG) { skb_new = vlan_insert_tag(skb_new, htons(ETH_P_8021Q), vid & VLAN_VID_MASK); + if (!skb_new) + goto out; + } skb_reset_mac_header(skb_new); skb_new->protocol = eth_type_trans(skb_new, @@ -1089,9 +1092,12 @@ bool batadv_dat_snoop_incoming_arp_request(struct batadv_priv *bat_priv, */ skb_reset_mac_header(skb_new); - if (vid & BATADV_VLAN_HAS_TAG) + if (vid & BATADV_VLAN_HAS_TAG) { skb_new = vlan_insert_tag(skb_new, htons(ETH_P_8021Q), vid & VLAN_VID_MASK); + if (!skb_new) + goto out; + } /* To preserve backwards compatibility, the node has choose the outgoing * format based on the incoming request packet type. The assumption is diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index e6956d074..65536db1b 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -407,8 +407,8 @@ static struct sk_buff *batadv_frag_create(struct sk_buff *skb, unsigned int mtu) { struct sk_buff *skb_fragment; - unsigned header_size = sizeof(*frag_head); - unsigned fragment_size = mtu - header_size; + unsigned int header_size = sizeof(*frag_head); + unsigned int fragment_size = mtu - header_size; skb_fragment = netdev_alloc_skb(NULL, mtu + ETH_HLEN); if (!skb_fragment) @@ -444,15 +444,15 @@ bool batadv_frag_send_packet(struct sk_buff *skb, struct batadv_hard_iface *primary_if = NULL; struct batadv_frag_packet frag_header; struct sk_buff *skb_fragment; - unsigned mtu = neigh_node->if_incoming->net_dev->mtu; - unsigned header_size = sizeof(frag_header); - unsigned max_fragment_size, max_packet_size; + unsigned int mtu = neigh_node->if_incoming->net_dev->mtu; + unsigned int header_size = sizeof(frag_header); + unsigned int max_fragment_size, max_packet_size; bool ret = false; /* To avoid merge and refragmentation at next-hops we never send * fragments larger than BATADV_FRAG_MAX_FRAG_SIZE */ - mtu = min_t(unsigned, mtu, BATADV_FRAG_MAX_FRAG_SIZE); + mtu = min_t(unsigned int, mtu, BATADV_FRAG_MAX_FRAG_SIZE); max_fragment_size = mtu - header_size; max_packet_size = max_fragment_size * BATADV_FRAG_MAX_FRAGMENTS; diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index c59aff5cc..5839c569f 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -135,8 +135,8 @@ static void batadv_gw_select(struct batadv_priv *bat_priv, spin_lock_bh(&bat_priv->gw.list_lock); - if (new_gw_node && !kref_get_unless_zero(&new_gw_node->refcount)) - new_gw_node = NULL; + if (new_gw_node) + kref_get(&new_gw_node->refcount); curr_gw_node = rcu_dereference_protected(bat_priv->gw.curr_gw, 1); rcu_assign_pointer(bat_priv->gw.curr_gw, new_gw_node); @@ -440,15 +440,11 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv, if (gateway->bandwidth_down == 0) return; - if (!kref_get_unless_zero(&orig_node->refcount)) - return; - gw_node = kzalloc(sizeof(*gw_node), GFP_ATOMIC); - if (!gw_node) { - batadv_orig_node_put(orig_node); + if (!gw_node) return; - } + kref_get(&orig_node->refcount); INIT_HLIST_NODE(&gw_node->list); gw_node->orig_node = orig_node; gw_node->bandwidth_down = ntohl(gateway->bandwidth_down); diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 0a7deaf26..8c2f39962 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -36,7 +36,6 @@ #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/workqueue.h> -#include <net/net_namespace.h> #include "bridge_loop_avoidance.h" #include "debugfs.h" @@ -121,6 +120,7 @@ static bool batadv_mutual_parents(const struct net_device *dev1, static bool batadv_is_on_batman_iface(const struct net_device *net_dev) { struct net_device *parent_dev; + struct net *net = dev_net(net_dev); bool ret; /* check if this is a batman-adv mesh interface */ @@ -133,7 +133,7 @@ static bool batadv_is_on_batman_iface(const struct net_device *net_dev) return false; /* recurse over the parent device */ - parent_dev = __dev_get_by_index(&init_net, dev_get_iflink(net_dev)); + parent_dev = __dev_get_by_index(net, dev_get_iflink(net_dev)); /* if we got a NULL parent_dev there is something broken.. */ if (WARN(!parent_dev, "Cannot find parent device")) return false; @@ -146,22 +146,22 @@ static bool batadv_is_on_batman_iface(const struct net_device *net_dev) return ret; } -static int batadv_is_valid_iface(const struct net_device *net_dev) +static bool batadv_is_valid_iface(const struct net_device *net_dev) { if (net_dev->flags & IFF_LOOPBACK) - return 0; + return false; if (net_dev->type != ARPHRD_ETHER) - return 0; + return false; if (net_dev->addr_len != ETH_ALEN) - return 0; + return false; /* no batman over batman */ if (batadv_is_on_batman_iface(net_dev)) - return 0; + return false; - return 1; + return true; } /** @@ -236,8 +236,8 @@ static void batadv_primary_if_select(struct batadv_priv *bat_priv, ASSERT_RTNL(); - if (new_hard_iface && !kref_get_unless_zero(&new_hard_iface->refcount)) - new_hard_iface = NULL; + if (new_hard_iface) + kref_get(&new_hard_iface->refcount); curr_hard_iface = rcu_dereference_protected(bat_priv->primary_if, 1); rcu_assign_pointer(bat_priv->primary_if, new_hard_iface); @@ -456,7 +456,7 @@ static int batadv_master_del_slave(struct batadv_hard_iface *slave, } int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, - const char *iface_name) + struct net *net, const char *iface_name) { struct batadv_priv *bat_priv; struct net_device *soft_iface, *master; @@ -467,13 +467,12 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, if (hard_iface->if_status != BATADV_IF_NOT_IN_USE) goto out; - if (!kref_get_unless_zero(&hard_iface->refcount)) - goto out; + kref_get(&hard_iface->refcount); - soft_iface = dev_get_by_name(&init_net, iface_name); + soft_iface = dev_get_by_name(net, iface_name); if (!soft_iface) { - soft_iface = batadv_softif_create(iface_name); + soft_iface = batadv_softif_create(net, iface_name); if (!soft_iface) { ret = -ENOMEM; @@ -522,6 +521,7 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, goto err_upper; } + kref_get(&hard_iface->refcount); hard_iface->batman_adv_ptype.type = ethertype; hard_iface->batman_adv_ptype.func = batadv_batman_skb_recv; hard_iface->batman_adv_ptype.dev = hard_iface->net_dev; @@ -583,6 +583,7 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface, batadv_info(hard_iface->soft_iface, "Removing interface: %s\n", hard_iface->net_dev->name); dev_remove_pack(&hard_iface->batman_adv_ptype); + batadv_hardif_put(hard_iface); bat_priv->num_ifaces--; batadv_orig_hash_del_if(hard_iface, bat_priv->num_ifaces); @@ -652,8 +653,7 @@ batadv_hardif_add_interface(struct net_device *net_dev) ASSERT_RTNL(); - ret = batadv_is_valid_iface(net_dev); - if (ret != 1) + if (!batadv_is_valid_iface(net_dev)) goto out; dev_hold(net_dev); diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h index d74f1983f..a76724d36 100644 --- a/net/batman-adv/hard-interface.h +++ b/net/batman-adv/hard-interface.h @@ -28,6 +28,7 @@ #include <linux/types.h> struct net_device; +struct net; enum batadv_hard_if_state { BATADV_IF_NOT_IN_USE, @@ -55,7 +56,7 @@ bool batadv_is_wifi_iface(int ifindex); struct batadv_hard_iface* batadv_hardif_get_by_netdev(const struct net_device *net_dev); int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, - const char *iface_name); + struct net *net, const char *iface_name); void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface, enum batadv_hard_if_cleanup autodel); void batadv_hardif_remove_interfaces(void); diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h index 9bb57b874..cbbf87075 100644 --- a/net/batman-adv/hash.h +++ b/net/batman-adv/hash.h @@ -32,10 +32,10 @@ struct lock_class_key; /* callback to a compare function. should compare 2 element datas for their * keys * - * Return: 0 if same and not 0 if not same + * Return: true if same and false if not same */ -typedef int (*batadv_hashdata_compare_cb)(const struct hlist_node *, - const void *); +typedef bool (*batadv_hashdata_compare_cb)(const struct hlist_node *, + const void *); /* the hashfunction * diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c index 14d0013b3..777aea10c 100644 --- a/net/batman-adv/icmp_socket.c +++ b/net/batman-adv/icmp_socket.c @@ -104,25 +104,21 @@ static int batadv_socket_open(struct inode *inode, struct file *file) static int batadv_socket_release(struct inode *inode, struct file *file) { - struct batadv_socket_client *socket_client = file->private_data; - struct batadv_socket_packet *socket_packet; - struct list_head *list_pos, *list_pos_tmp; + struct batadv_socket_client *client = file->private_data; + struct batadv_socket_packet *packet, *tmp; - spin_lock_bh(&socket_client->lock); + spin_lock_bh(&client->lock); /* for all packets in the queue ... */ - list_for_each_safe(list_pos, list_pos_tmp, &socket_client->queue_list) { - socket_packet = list_entry(list_pos, - struct batadv_socket_packet, list); - - list_del(list_pos); - kfree(socket_packet); + list_for_each_entry_safe(packet, tmp, &client->queue_list, list) { + list_del(&packet->list); + kfree(packet); } - batadv_socket_client_hash[socket_client->index] = NULL; - spin_unlock_bh(&socket_client->lock); + batadv_socket_client_hash[client->index] = NULL; + spin_unlock_bh(&client->lock); - kfree(socket_client); + kfree(client); module_put(THIS_MODULE); return 0; @@ -337,7 +333,7 @@ err: } /** - * batadv_socket_receive_packet - schedule an icmp packet to be sent to + * batadv_socket_add_packet - schedule an icmp packet to be sent to * userspace on an icmp socket. * @socket_client: the socket this packet belongs to * @icmph: pointer to the header of the icmp packet diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index d64ddb961..5f2974bd1 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -401,11 +401,19 @@ int batadv_batman_skb_recv(struct sk_buff *skb, struct net_device *dev, hard_iface = container_of(ptype, struct batadv_hard_iface, batman_adv_ptype); + + /* Prevent processing a packet received on an interface which is getting + * shut down otherwise the packet may trigger de-reference errors + * further down in the receive path. + */ + if (!kref_get_unless_zero(&hard_iface->refcount)) + goto err_out; + skb = skb_share_check(skb, GFP_ATOMIC); /* skb was released by skb_share_check() */ if (!skb) - goto err_out; + goto err_put; /* packet should hold at least type and version */ if (unlikely(!pskb_may_pull(skb, 2))) @@ -448,6 +456,8 @@ int batadv_batman_skb_recv(struct sk_buff *skb, struct net_device *dev, if (ret == NET_RX_DROP) kfree_skb(skb); + batadv_hardif_put(hard_iface); + /* return NET_RX_SUCCESS in any case as we * most probably dropped the packet for * routing-logical reasons. @@ -456,6 +466,8 @@ int batadv_batman_skb_recv(struct sk_buff *skb, struct net_device *dev, err_free: kfree_skb(skb); +err_put: + batadv_hardif_put(hard_iface); err_out: return NET_RX_DROP; } @@ -663,8 +675,8 @@ static void batadv_tvlv_handler_put(struct batadv_tvlv_handler *tvlv_handler) * * Return: tvlv handler if found or NULL otherwise. */ -static struct batadv_tvlv_handler -*batadv_tvlv_handler_get(struct batadv_priv *bat_priv, u8 type, u8 version) +static struct batadv_tvlv_handler * +batadv_tvlv_handler_get(struct batadv_priv *bat_priv, u8 type, u8 version) { struct batadv_tvlv_handler *tvlv_handler_tmp, *tvlv_handler = NULL; @@ -722,8 +734,8 @@ static void batadv_tvlv_container_put(struct batadv_tvlv_container *tvlv) * * Return: tvlv container if found or NULL otherwise. */ -static struct batadv_tvlv_container -*batadv_tvlv_container_get(struct batadv_priv *bat_priv, u8 type, u8 version) +static struct batadv_tvlv_container * +batadv_tvlv_container_get(struct batadv_priv *bat_priv, u8 type, u8 version) { struct batadv_tvlv_container *tvlv_tmp, *tvlv = NULL; @@ -736,9 +748,7 @@ static struct batadv_tvlv_container if (tvlv_tmp->tvlv_hdr.version != version) continue; - if (!kref_get_unless_zero(&tvlv_tmp->refcount)) - continue; - + kref_get(&tvlv_tmp->refcount); tvlv = tvlv_tmp; break; } diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index db4533631..76925266d 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -24,7 +24,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2016.1" +#define BATADV_SOURCE_VERSION "2016.2" #endif /* B.A.T.M.A.N. parameters */ @@ -120,6 +120,8 @@ #define BATADV_BLA_BACKBONE_TIMEOUT (BATADV_BLA_PERIOD_LENGTH * 6) #define BATADV_BLA_CLAIM_TIMEOUT (BATADV_BLA_PERIOD_LENGTH * 10) #define BATADV_BLA_WAIT_PERIODS 3 +#define BATADV_BLA_LOOPDETECT_PERIODS 6 +#define BATADV_BLA_LOOPDETECT_TIMEOUT 3000 /* 3 seconds */ #define BATADV_DUPLIST_SIZE 16 #define BATADV_DUPLIST_TIMEOUT 500 /* 500 ms */ @@ -142,10 +144,12 @@ enum batadv_uev_action { BATADV_UEV_ADD = 0, BATADV_UEV_DEL, BATADV_UEV_CHANGE, + BATADV_UEV_LOOPDETECT, }; enum batadv_uev_type { BATADV_UEV_GW = 0, + BATADV_UEV_BLA, }; #define BATADV_GW_THRESHOLD 50 @@ -288,7 +292,7 @@ static inline void _batadv_dbg(int type __always_unused, * * note: can't use ether_addr_equal() as it requires aligned memory * - * Return: 1 if they are the same ethernet addr + * Return: true if they are the same ethernet addr */ static inline bool batadv_compare_eth(const void *data1, const void *data2) { @@ -296,7 +300,8 @@ static inline bool batadv_compare_eth(const void *data1, const void *data2) } /** - * has_timed_out - compares current time (jiffies) and timestamp + timeout + * batadv_has_timed_out - compares current time (jiffies) and timestamp + + * timeout * @timestamp: base value to compare with (in jiffies) * @timeout: added to base value before comparing (in milliseconds) * diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 8caa2c72e..c32f24faf 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -394,7 +394,8 @@ static int batadv_mcast_forw_mode_check(struct batadv_priv *bat_priv, } /** - * batadv_mcast_want_all_ip_count - count nodes with unspecific mcast interest + * batadv_mcast_forw_want_all_ip_count - count nodes with unspecific mcast + * interest * @bat_priv: the bat priv with all the soft interface information * @ethhdr: ethernet header of a packet * @@ -433,7 +434,7 @@ batadv_mcast_forw_tt_node_get(struct batadv_priv *bat_priv, } /** - * batadv_mcast_want_forw_ipv4_node_get - get a node with an ipv4 flag + * batadv_mcast_forw_ipv4_node_get - get a node with an ipv4 flag * @bat_priv: the bat priv with all the soft interface information * * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_IPV4 flag set and @@ -460,7 +461,7 @@ batadv_mcast_forw_ipv4_node_get(struct batadv_priv *bat_priv) } /** - * batadv_mcast_want_forw_ipv6_node_get - get a node with an ipv6 flag + * batadv_mcast_forw_ipv6_node_get - get a node with an ipv6 flag * @bat_priv: the bat priv with all the soft interface information * * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_IPV6 flag set @@ -487,7 +488,7 @@ batadv_mcast_forw_ipv6_node_get(struct batadv_priv *bat_priv) } /** - * batadv_mcast_want_forw_ip_node_get - get a node with an ipv4/ipv6 flag + * batadv_mcast_forw_ip_node_get - get a node with an ipv4/ipv6 flag * @bat_priv: the bat priv with all the soft interface information * @ethhdr: an ethernet header to determine the protocol family from * @@ -511,7 +512,7 @@ batadv_mcast_forw_ip_node_get(struct batadv_priv *bat_priv, } /** - * batadv_mcast_want_forw_unsnoop_node_get - get a node with an unsnoopable flag + * batadv_mcast_forw_unsnoop_node_get - get a node with an unsnoopable flag * @bat_priv: the bat priv with all the soft interface information * * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_UNSNOOPABLES flag diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index b41719b64..678f06865 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -510,10 +510,10 @@ static u32 batadv_nc_hash_choose(const void *data, u32 size) * @node: node in the local table * @data2: second object to compare the node to * - * Return: 1 if the two entry are the same, 0 otherwise + * Return: true if the two entry are the same, false otherwise */ -static int batadv_nc_hash_compare(const struct hlist_node *node, - const void *data2) +static bool batadv_nc_hash_compare(const struct hlist_node *node, + const void *data2) { const struct batadv_nc_path *nc_path1, *nc_path2; @@ -521,15 +521,13 @@ static int batadv_nc_hash_compare(const struct hlist_node *node, nc_path2 = data2; /* Return 1 if the two keys are identical */ - if (memcmp(nc_path1->prev_hop, nc_path2->prev_hop, - sizeof(nc_path1->prev_hop)) != 0) - return 0; + if (!batadv_compare_eth(nc_path1->prev_hop, nc_path2->prev_hop)) + return false; - if (memcmp(nc_path1->next_hop, nc_path2->next_hop, - sizeof(nc_path1->next_hop)) != 0) - return 0; + if (!batadv_compare_eth(nc_path1->next_hop, nc_path2->next_hop)) + return false; - return 1; + return true; } /** @@ -714,7 +712,7 @@ static void batadv_nc_worker(struct work_struct *work) struct batadv_priv *bat_priv; unsigned long timeout; - delayed_work = container_of(work, struct delayed_work, work); + delayed_work = to_delayed_work(work); priv_nc = container_of(delayed_work, struct batadv_priv_nc, work); bat_priv = container_of(priv_nc, struct batadv_priv, nc); @@ -793,10 +791,10 @@ static bool batadv_can_nc_with_orig(struct batadv_priv *bat_priv, * * Return: the nc_node if found, NULL otherwise. */ -static struct batadv_nc_node -*batadv_nc_find_nc_node(struct batadv_orig_node *orig_node, - struct batadv_orig_node *orig_neigh_node, - bool in_coding) +static struct batadv_nc_node * +batadv_nc_find_nc_node(struct batadv_orig_node *orig_node, + struct batadv_orig_node *orig_neigh_node, + bool in_coding) { struct batadv_nc_node *nc_node, *nc_node_out = NULL; struct list_head *list; @@ -835,11 +833,11 @@ static struct batadv_nc_node * * Return: the nc_node if found or created, NULL in case of an error. */ -static struct batadv_nc_node -*batadv_nc_get_nc_node(struct batadv_priv *bat_priv, - struct batadv_orig_node *orig_node, - struct batadv_orig_node *orig_neigh_node, - bool in_coding) +static struct batadv_nc_node * +batadv_nc_get_nc_node(struct batadv_priv *bat_priv, + struct batadv_orig_node *orig_node, + struct batadv_orig_node *orig_neigh_node, + bool in_coding) { struct batadv_nc_node *nc_node; spinlock_t *lock; /* Used to lock list selected by "int in_coding" */ @@ -856,8 +854,7 @@ static struct batadv_nc_node if (!nc_node) return NULL; - if (!kref_get_unless_zero(&orig_neigh_node->refcount)) - goto free; + kref_get(&orig_neigh_node->refcount); /* Initialize nc_node */ INIT_LIST_HEAD(&nc_node->list); @@ -884,10 +881,6 @@ static struct batadv_nc_node spin_unlock_bh(lock); return nc_node; - -free: - kfree(nc_node); - return NULL; } /** diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index c355a8247..ab8c4f973 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -54,9 +54,9 @@ static void batadv_purge_orig(struct work_struct *work); * @node: node in the local table * @data2: second object to compare the node to * - * Return: 1 if they are the same originator + * Return: true if they are the same originator */ -int batadv_compare_orig(const struct hlist_node *node, const void *data2) +bool batadv_compare_orig(const struct hlist_node *node, const void *data2) { const void *data1 = container_of(node, struct batadv_orig_node, hash_entry); @@ -282,7 +282,7 @@ void batadv_neigh_node_put(struct batadv_neigh_node *neigh_node) } /** - * batadv_orig_node_get_router - router to the originator depending on iface + * batadv_orig_router_get - router to the originator depending on iface * @orig_node: the orig node for the router * @if_outgoing: the interface where the payload packet has been received or * the OGM should be sent to @@ -374,12 +374,8 @@ batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node, if (!orig_ifinfo) goto out; - if (if_outgoing != BATADV_IF_DEFAULT && - !kref_get_unless_zero(&if_outgoing->refcount)) { - kfree(orig_ifinfo); - orig_ifinfo = NULL; - goto out; - } + if (if_outgoing != BATADV_IF_DEFAULT) + kref_get(&if_outgoing->refcount); reset_time = jiffies - 1; reset_time -= msecs_to_jiffies(BATADV_RESET_PROTECTION_MS); @@ -455,11 +451,8 @@ batadv_neigh_ifinfo_new(struct batadv_neigh_node *neigh, if (!neigh_ifinfo) goto out; - if (if_outgoing && !kref_get_unless_zero(&if_outgoing->refcount)) { - kfree(neigh_ifinfo); - neigh_ifinfo = NULL; - goto out; - } + if (if_outgoing) + kref_get(&if_outgoing->refcount); INIT_HLIST_NODE(&neigh_ifinfo->list); kref_init(&neigh_ifinfo->refcount); @@ -532,15 +525,11 @@ batadv_hardif_neigh_create(struct batadv_hard_iface *hard_iface, if (hardif_neigh) goto out; - if (!kref_get_unless_zero(&hard_iface->refcount)) - goto out; - hardif_neigh = kzalloc(sizeof(*hardif_neigh), GFP_ATOMIC); - if (!hardif_neigh) { - batadv_hardif_put(hard_iface); + if (!hardif_neigh) goto out; - } + kref_get(&hard_iface->refcount); INIT_HLIST_NODE(&hardif_neigh->list); ether_addr_copy(hardif_neigh->addr, neigh_addr); hardif_neigh->if_incoming = hard_iface; @@ -630,6 +619,8 @@ batadv_neigh_node_new(struct batadv_orig_node *orig_node, struct batadv_neigh_node *neigh_node; struct batadv_hardif_neigh_node *hardif_neigh = NULL; + spin_lock_bh(&orig_node->neigh_list_lock); + neigh_node = batadv_neigh_node_get(orig_node, hard_iface, neigh_addr); if (neigh_node) goto out; @@ -643,16 +634,11 @@ batadv_neigh_node_new(struct batadv_orig_node *orig_node, if (!neigh_node) goto out; - if (!kref_get_unless_zero(&hard_iface->refcount)) { - kfree(neigh_node); - neigh_node = NULL; - goto out; - } - INIT_HLIST_NODE(&neigh_node->list); INIT_HLIST_HEAD(&neigh_node->ifinfo_list); spin_lock_init(&neigh_node->ifinfo_lock); + kref_get(&hard_iface->refcount); ether_addr_copy(neigh_node->addr, neigh_addr); neigh_node->if_incoming = hard_iface; neigh_node->orig_node = orig_node; @@ -666,15 +652,15 @@ batadv_neigh_node_new(struct batadv_orig_node *orig_node, kref_init(&neigh_node->refcount); kref_get(&neigh_node->refcount); - spin_lock_bh(&orig_node->neigh_list_lock); hlist_add_head_rcu(&neigh_node->list, &orig_node->neigh_list); - spin_unlock_bh(&orig_node->neigh_list_lock); batadv_dbg(BATADV_DBG_BATMAN, orig_node->bat_priv, "Creating new neighbor %pM for orig_node %pM on interface %s\n", neigh_addr, orig_node->orig, hard_iface->net_dev->name); out: + spin_unlock_bh(&orig_node->neigh_list_lock); + if (hardif_neigh) batadv_hardif_neigh_put(hardif_neigh); return neigh_node; @@ -779,6 +765,8 @@ static void batadv_orig_node_release(struct kref *ref) struct batadv_neigh_node *neigh_node; struct batadv_orig_node *orig_node; struct batadv_orig_ifinfo *orig_ifinfo; + struct batadv_orig_node_vlan *vlan; + struct batadv_orig_ifinfo *last_candidate; orig_node = container_of(ref, struct batadv_orig_node, refcount); @@ -796,8 +784,21 @@ static void batadv_orig_node_release(struct kref *ref) hlist_del_rcu(&orig_ifinfo->list); batadv_orig_ifinfo_put(orig_ifinfo); } + + last_candidate = orig_node->last_bonding_candidate; + orig_node->last_bonding_candidate = NULL; spin_unlock_bh(&orig_node->neigh_list_lock); + if (last_candidate) + batadv_orig_ifinfo_put(last_candidate); + + spin_lock_bh(&orig_node->vlan_list_lock); + hlist_for_each_entry_safe(vlan, node_tmp, &orig_node->vlan_list, list) { + hlist_del_rcu(&vlan->list); + batadv_orig_node_vlan_put(vlan); + } + spin_unlock_bh(&orig_node->vlan_list_lock); + /* Free nc_nodes */ batadv_nc_purge_orig(orig_node->bat_priv, orig_node, NULL); @@ -1160,6 +1161,9 @@ static bool batadv_purge_orig_node(struct batadv_priv *bat_priv, if (hard_iface->soft_iface != bat_priv->soft_iface) continue; + if (!kref_get_unless_zero(&hard_iface->refcount)) + continue; + best_neigh_node = batadv_find_best_neighbor(bat_priv, orig_node, hard_iface); @@ -1167,6 +1171,8 @@ static bool batadv_purge_orig_node(struct batadv_priv *bat_priv, best_neigh_node); if (best_neigh_node) batadv_neigh_node_put(best_neigh_node); + + batadv_hardif_put(hard_iface); } rcu_read_unlock(); @@ -1217,7 +1223,7 @@ static void batadv_purge_orig(struct work_struct *work) struct delayed_work *delayed_work; struct batadv_priv *bat_priv; - delayed_work = container_of(work, struct delayed_work, work); + delayed_work = to_delayed_work(work); bat_priv = container_of(delayed_work, struct batadv_priv, orig_work); _batadv_purge_orig(bat_priv); queue_delayed_work(batadv_event_workqueue, diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index 4e8b67f11..64a8951e5 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -33,7 +33,7 @@ struct seq_file; -int batadv_compare_orig(const struct hlist_node *node, const void *data2); +bool batadv_compare_orig(const struct hlist_node *node, const void *data2); int batadv_originator_init(struct batadv_priv *bat_priv); void batadv_originator_free(struct batadv_priv *bat_priv); void batadv_purge_orig_ref(struct batadv_priv *bat_priv); diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h index 8a8d7ca1a..372128ddb 100644 --- a/net/batman-adv/packet.h +++ b/net/batman-adv/packet.h @@ -175,6 +175,7 @@ enum batadv_bla_claimframe { BATADV_CLAIM_TYPE_UNCLAIM = 0x01, BATADV_CLAIM_TYPE_ANNOUNCE = 0x02, BATADV_CLAIM_TYPE_REQUEST = 0x03, + BATADV_CLAIM_TYPE_LOOPDETECT = 0x04, }; /** @@ -501,7 +502,7 @@ struct batadv_coded_packet { #pragma pack() /** - * struct batadv_unicast_tvlv - generic unicast packet with tvlv payload + * struct batadv_unicast_tvlv_packet - generic unicast packet with tvlv payload * @packet_type: batman-adv packet type, part of the general header * @version: batman-adv protocol version, part of the genereal header * @ttl: time to live for this packet, part of the genereal header diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index b781bf753..bfac086b4 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -100,10 +100,6 @@ static void _batadv_update_route(struct batadv_priv *bat_priv, if (curr_router) batadv_neigh_node_put(curr_router); - /* increase refcount of new best neighbor */ - if (neigh_node && !kref_get_unless_zero(&neigh_node->refcount)) - neigh_node = NULL; - spin_lock_bh(&orig_node->neigh_list_lock); /* curr_router used earlier may not be the current orig_ifinfo->router * anymore because it was dereferenced outside of the neigh_list_lock @@ -114,6 +110,10 @@ static void _batadv_update_route(struct batadv_priv *bat_priv, */ curr_router = rcu_dereference_protected(orig_ifinfo->router, true); + /* increase refcount of new best neighbor */ + if (neigh_node) + kref_get(&neigh_node->refcount); + rcu_assign_pointer(orig_ifinfo->router, neigh_node); spin_unlock_bh(&orig_node->neigh_list_lock); batadv_orig_ifinfo_put(orig_ifinfo); @@ -163,18 +163,18 @@ out: * doesn't change otherwise. * * Return: - * 0 if the packet is to be accepted. - * 1 if the packet is to be ignored. + * false if the packet is to be accepted. + * true if the packet is to be ignored. */ -int batadv_window_protected(struct batadv_priv *bat_priv, s32 seq_num_diff, - s32 seq_old_max_diff, unsigned long *last_reset, - bool *protection_started) +bool batadv_window_protected(struct batadv_priv *bat_priv, s32 seq_num_diff, + s32 seq_old_max_diff, unsigned long *last_reset, + bool *protection_started) { if (seq_num_diff <= -seq_old_max_diff || seq_num_diff >= BATADV_EXPECTED_SEQNO_RANGE) { if (!batadv_has_timed_out(*last_reset, BATADV_RESET_PROTECTION_MS)) - return 1; + return true; *last_reset = jiffies; if (protection_started) @@ -183,7 +183,7 @@ int batadv_window_protected(struct batadv_priv *bat_priv, s32 seq_num_diff, "old packet received, start protection\n"); } - return 0; + return false; } bool batadv_check_management_packet(struct sk_buff *skb, @@ -374,6 +374,7 @@ int batadv_recv_icmp_packet(struct sk_buff *skb, if (skb_cow(skb, ETH_HLEN) < 0) goto out; + ethhdr = eth_hdr(skb); icmph = (struct batadv_icmp_header *)skb->data; icmp_packet_rr = (struct batadv_icmp_packet_rr *)icmph; if (icmp_packet_rr->rr_cur >= BATADV_RR_LEN) @@ -455,6 +456,29 @@ static int batadv_check_unicast_packet(struct batadv_priv *bat_priv, } /** + * batadv_last_bonding_replace - Replace last_bonding_candidate of orig_node + * @orig_node: originator node whose bonding candidates should be replaced + * @new_candidate: new bonding candidate or NULL + */ +static void +batadv_last_bonding_replace(struct batadv_orig_node *orig_node, + struct batadv_orig_ifinfo *new_candidate) +{ + struct batadv_orig_ifinfo *old_candidate; + + spin_lock_bh(&orig_node->neigh_list_lock); + old_candidate = orig_node->last_bonding_candidate; + + if (new_candidate) + kref_get(&new_candidate->refcount); + orig_node->last_bonding_candidate = new_candidate; + spin_unlock_bh(&orig_node->neigh_list_lock); + + if (old_candidate) + batadv_orig_ifinfo_put(old_candidate); +} + +/** * batadv_find_router - find a suitable router for this originator * @bat_priv: the bat priv with all the soft interface information * @orig_node: the destination node @@ -561,10 +585,6 @@ next: } rcu_read_unlock(); - /* last_bonding_candidate is reset below, remove the old reference. */ - if (orig_node->last_bonding_candidate) - batadv_orig_ifinfo_put(orig_node->last_bonding_candidate); - /* After finding candidates, handle the three cases: * 1) there is a next candidate, use that * 2) there is no next candidate, use the first of the list @@ -573,21 +593,28 @@ next: if (next_candidate) { batadv_neigh_node_put(router); - /* remove references to first candidate, we don't need it. */ - if (first_candidate) { - batadv_neigh_node_put(first_candidate_router); - batadv_orig_ifinfo_put(first_candidate); - } + kref_get(&next_candidate_router->refcount); router = next_candidate_router; - orig_node->last_bonding_candidate = next_candidate; + batadv_last_bonding_replace(orig_node, next_candidate); } else if (first_candidate) { batadv_neigh_node_put(router); - /* refcounting has already been done in the loop above. */ + kref_get(&first_candidate_router->refcount); router = first_candidate_router; - orig_node->last_bonding_candidate = first_candidate; + batadv_last_bonding_replace(orig_node, first_candidate); } else { - orig_node->last_bonding_candidate = NULL; + batadv_last_bonding_replace(orig_node, NULL); + } + + /* cleanup of candidates */ + if (first_candidate) { + batadv_neigh_node_put(first_candidate_router); + batadv_orig_ifinfo_put(first_candidate); + } + + if (next_candidate) { + batadv_neigh_node_put(next_candidate_router); + batadv_orig_ifinfo_put(next_candidate); } return router; @@ -601,6 +628,7 @@ static int batadv_route_unicast_packet(struct sk_buff *skb, struct batadv_unicast_packet *unicast_packet; struct ethhdr *ethhdr = eth_hdr(skb); int res, hdr_len, ret = NET_RX_DROP; + unsigned int len; unicast_packet = (struct batadv_unicast_packet *)skb->data; @@ -641,6 +669,7 @@ static int batadv_route_unicast_packet(struct sk_buff *skb, if (hdr_len > 0) batadv_skb_set_priority(skb, hdr_len); + len = skb->len; res = batadv_send_skb_to_orig(skb, orig_node, recv_if); /* translate transmit result into receive result */ @@ -648,7 +677,7 @@ static int batadv_route_unicast_packet(struct sk_buff *skb, /* skb was transmitted and consumed */ batadv_inc_counter(bat_priv, BATADV_CNT_FORWARD); batadv_add_counter(bat_priv, BATADV_CNT_FORWARD_BYTES, - skb->len + ETH_HLEN); + len + ETH_HLEN); ret = NET_RX_SUCCESS; } else if (res == NET_XMIT_POLICED) { @@ -718,8 +747,9 @@ out: return ret; } -static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, - struct sk_buff *skb, int hdr_len) { +static bool batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, + struct sk_buff *skb, int hdr_len) +{ struct batadv_unicast_packet *unicast_packet; struct batadv_hard_iface *primary_if; struct batadv_orig_node *orig_node; @@ -730,11 +760,11 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, /* check if there is enough data before accessing it */ if (!pskb_may_pull(skb, hdr_len + ETH_HLEN)) - return 0; + return false; /* create a copy of the skb (in case of for re-routing) to modify it. */ if (skb_cow(skb, sizeof(*unicast_packet)) < 0) - return 0; + return false; unicast_packet = (struct batadv_unicast_packet *)skb->data; vid = batadv_get_vid(skb, hdr_len); @@ -758,7 +788,7 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, * table. If not, let the packet go untouched anyway because * there is nothing the node can do */ - return 1; + return true; } /* retrieve the TTVN known by this node for the packet destination. This @@ -774,7 +804,7 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, * not be possible to deliver it */ if (!orig_node) - return 0; + return false; curr_ttvn = (u8)atomic_read(&orig_node->last_ttvn); batadv_orig_node_put(orig_node); @@ -785,7 +815,7 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, */ is_old_ttvn = batadv_seq_before(unicast_packet->ttvn, curr_ttvn); if (!is_old_ttvn) - return 1; + return true; old_ttvn = unicast_packet->ttvn; /* the packet was forged based on outdated network information. Its @@ -798,7 +828,7 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, "Rerouting unicast packet to %pM (dst=%pM): TTVN mismatch old_ttvn=%u new_ttvn=%u\n", unicast_packet->dest, ethhdr->h_dest, old_ttvn, curr_ttvn); - return 1; + return true; } /* the packet has not been re-routed: either the destination is @@ -806,14 +836,14 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, * it is possible to drop the packet */ if (!batadv_is_my_client(bat_priv, ethhdr->h_dest, vid)) - return 0; + return false; /* update the header in order to let the packet be delivered to this * node's soft interface */ primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) - return 0; + return false; ether_addr_copy(unicast_packet->dest, primary_if->net_dev->dev_addr); @@ -821,7 +851,7 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, unicast_packet->ttvn = curr_ttvn; - return 1; + return true; } /** @@ -912,7 +942,7 @@ int batadv_recv_unicast_packet(struct sk_buff *skb, hdr_size)) goto rx_success; - batadv_interface_rx(recv_if->soft_iface, skb, recv_if, hdr_size, + batadv_interface_rx(recv_if->soft_iface, skb, hdr_size, orig_node); rx_success: @@ -1122,8 +1152,7 @@ int batadv_recv_bcast_packet(struct sk_buff *skb, goto rx_success; /* broadcast for me */ - batadv_interface_rx(recv_if->soft_iface, skb, recv_if, hdr_size, - orig_node); + batadv_interface_rx(recv_if->soft_iface, skb, hdr_size, orig_node); rx_success: ret = NET_RX_SUCCESS; diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h index 02a5caa84..05c3ff42e 100644 --- a/net/batman-adv/routing.h +++ b/net/batman-adv/routing.h @@ -51,8 +51,8 @@ struct batadv_neigh_node * batadv_find_router(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, struct batadv_hard_iface *recv_if); -int batadv_window_protected(struct batadv_priv *bat_priv, s32 seq_num_diff, - s32 seq_old_max_diff, unsigned long *last_reset, - bool *protection_started); +bool batadv_window_protected(struct batadv_priv *bat_priv, s32 seq_num_diff, + s32 seq_old_max_diff, unsigned long *last_reset, + bool *protection_started); #endif /* _NET_BATMAN_ADV_ROUTING_H_ */ diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 76417850d..010397650 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -26,6 +26,7 @@ #include <linux/if.h> #include <linux/jiffies.h> #include <linux/kernel.h> +#include <linux/kref.h> #include <linux/list.h> #include <linux/netdevice.h> #include <linux/printk.h> @@ -423,8 +424,8 @@ int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb, struct batadv_orig_node *orig_node; orig_node = batadv_gw_get_selected_orig(bat_priv); - return batadv_send_skb_unicast(bat_priv, skb, BATADV_UNICAST, 0, - orig_node, vid); + return batadv_send_skb_unicast(bat_priv, skb, BATADV_UNICAST_4ADDR, + BATADV_P_DATA, orig_node, vid); } void batadv_schedule_bat_ogm(struct batadv_hard_iface *hard_iface) @@ -552,7 +553,7 @@ static void batadv_send_outstanding_bcast_packet(struct work_struct *work) struct net_device *soft_iface; struct batadv_priv *bat_priv; - delayed_work = container_of(work, struct delayed_work, work); + delayed_work = to_delayed_work(work); forw_packet = container_of(delayed_work, struct batadv_forw_packet, delayed_work); soft_iface = forw_packet->if_incoming->soft_iface; @@ -577,10 +578,15 @@ static void batadv_send_outstanding_bcast_packet(struct work_struct *work) if (forw_packet->num_packets >= hard_iface->num_bcasts) continue; + if (!kref_get_unless_zero(&hard_iface->refcount)) + continue; + /* send a copy of the saved skb */ skb1 = skb_clone(forw_packet->skb, GFP_ATOMIC); if (skb1) batadv_send_broadcast_skb(skb1, hard_iface); + + batadv_hardif_put(hard_iface); } rcu_read_unlock(); @@ -604,7 +610,7 @@ void batadv_send_outstanding_bat_ogm_packet(struct work_struct *work) struct batadv_forw_packet *forw_packet; struct batadv_priv *bat_priv; - delayed_work = container_of(work, struct delayed_work, work); + delayed_work = to_delayed_work(work); forw_packet = container_of(delayed_work, struct batadv_forw_packet, delayed_work); bat_priv = netdev_priv(forw_packet->if_incoming->soft_iface); diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 8a136b6a1..287a3879e 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -186,7 +186,6 @@ static int batadv_interface_tx(struct sk_buff *skb, struct batadv_priv *bat_priv = netdev_priv(soft_iface); struct batadv_hard_iface *primary_if = NULL; struct batadv_bcast_packet *bcast_packet; - __be16 ethertype = htons(ETH_P_BATMAN); static const u8 stp_addr[ETH_ALEN] = {0x01, 0x80, 0xC2, 0x00, 0x00, 0x00}; static const u8 ectp_addr[ETH_ALEN] = {0xCF, 0x00, 0x00, 0x00, @@ -208,7 +207,7 @@ static int batadv_interface_tx(struct sk_buff *skb, if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) goto dropped; - soft_iface->trans_start = jiffies; + netif_trans_update(soft_iface); vid = batadv_get_vid(skb, 0); ethhdr = eth_hdr(skb); @@ -216,7 +215,8 @@ static int batadv_interface_tx(struct sk_buff *skb, case ETH_P_8021Q: vhdr = vlan_eth_hdr(skb); - if (vhdr->h_vlan_encapsulated_proto != ethertype) { + /* drop batman-in-batman packets to prevent loops */ + if (vhdr->h_vlan_encapsulated_proto != htons(ETH_P_BATMAN)) { network_offset += VLAN_HLEN; break; } @@ -381,13 +381,29 @@ end: return NETDEV_TX_OK; } +/** + * batadv_interface_rx - receive ethernet frame on local batman-adv interface + * @soft_iface: local interface which will receive the ethernet frame + * @skb: ethernet frame for @soft_iface + * @hdr_size: size of already parsed batman-adv header + * @orig_node: originator from which the batman-adv packet was sent + * + * Sends a ethernet frame to the receive path of the local @soft_iface. + * skb->data has still point to the batman-adv header with the size @hdr_size. + * The caller has to have parsed this header already and made sure that at least + * @hdr_size bytes are still available for pull in @skb. + * + * The packet may still get dropped. This can happen when the encapsulated + * ethernet frame is invalid or contains again an batman-adv packet. Also + * unicast packets will be dropped directly when it was sent between two + * isolated clients. + */ void batadv_interface_rx(struct net_device *soft_iface, - struct sk_buff *skb, struct batadv_hard_iface *recv_if, - int hdr_size, struct batadv_orig_node *orig_node) + struct sk_buff *skb, int hdr_size, + struct batadv_orig_node *orig_node) { struct batadv_bcast_packet *batadv_bcast_packet; struct batadv_priv *bat_priv = netdev_priv(soft_iface); - __be16 ethertype = htons(ETH_P_BATMAN); struct vlan_ethhdr *vhdr; struct ethhdr *ethhdr; unsigned short vid; @@ -396,10 +412,6 @@ void batadv_interface_rx(struct net_device *soft_iface, batadv_bcast_packet = (struct batadv_bcast_packet *)skb->data; is_bcast = (batadv_bcast_packet->packet_type == BATADV_BCAST); - /* check if enough space is available for pulling, and pull */ - if (!pskb_may_pull(skb, hdr_size)) - goto dropped; - skb_pull_rcsum(skb, hdr_size); skb_reset_mac_header(skb); @@ -421,7 +433,8 @@ void batadv_interface_rx(struct net_device *soft_iface, vhdr = (struct vlan_ethhdr *)skb->data; - if (vhdr->h_vlan_encapsulated_proto != ethertype) + /* drop batman-in-batman packets to prevent loops */ + if (vhdr->h_vlan_encapsulated_proto != htons(ETH_P_BATMAN)) break; /* fall through */ @@ -543,7 +556,7 @@ struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv, } /** - * batadv_create_vlan - allocate the needed resources for a new vlan + * batadv_softif_create_vlan - allocate the needed resources for a new vlan * @bat_priv: the bat priv with all the soft interface information * @vid: the VLAN identifier * @@ -872,13 +885,14 @@ static int batadv_softif_slave_add(struct net_device *dev, struct net_device *slave_dev) { struct batadv_hard_iface *hard_iface; + struct net *net = dev_net(dev); int ret = -EINVAL; hard_iface = batadv_hardif_get_by_netdev(slave_dev); if (!hard_iface || hard_iface->soft_iface) goto out; - ret = batadv_hardif_enable_interface(hard_iface, dev->name); + ret = batadv_hardif_enable_interface(hard_iface, net, dev->name); out: if (hard_iface) @@ -959,7 +973,7 @@ static void batadv_softif_init_early(struct net_device *dev) dev->netdev_ops = &batadv_netdev_ops; dev->destructor = batadv_softif_free; - dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER; + dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_NETNS_LOCAL; dev->priv_flags |= IFF_NO_QUEUE; /* can't call min_mtu, because the needed variables @@ -975,7 +989,7 @@ static void batadv_softif_init_early(struct net_device *dev) memset(priv, 0, sizeof(*priv)); } -struct net_device *batadv_softif_create(const char *name) +struct net_device *batadv_softif_create(struct net *net, const char *name) { struct net_device *soft_iface; int ret; @@ -985,6 +999,8 @@ struct net_device *batadv_softif_create(const char *name) if (!soft_iface) return NULL; + dev_net_set(soft_iface, net); + soft_iface->rtnl_link_ops = &batadv_link_ops; ret = register_netdevice(soft_iface); @@ -1017,7 +1033,9 @@ void batadv_softif_destroy_sysfs(struct net_device *soft_iface) static void batadv_softif_destroy_netlink(struct net_device *soft_iface, struct list_head *head) { + struct batadv_priv *bat_priv = netdev_priv(soft_iface); struct batadv_hard_iface *hard_iface; + struct batadv_softif_vlan *vlan; list_for_each_entry(hard_iface, &batadv_hardif_list, list) { if (hard_iface->soft_iface == soft_iface) @@ -1025,16 +1043,23 @@ static void batadv_softif_destroy_netlink(struct net_device *soft_iface, BATADV_IF_CLEANUP_KEEP); } + /* destroy the "untagged" VLAN */ + vlan = batadv_softif_vlan_get(bat_priv, BATADV_NO_FLAGS); + if (vlan) { + batadv_softif_destroy_vlan(bat_priv, vlan); + batadv_softif_vlan_put(vlan); + } + batadv_sysfs_del_meshif(soft_iface); unregister_netdevice_queue(soft_iface, head); } -int batadv_softif_is_valid(const struct net_device *net_dev) +bool batadv_softif_is_valid(const struct net_device *net_dev) { if (net_dev->netdev_ops->ndo_start_xmit == batadv_interface_tx) - return 1; + return true; - return 0; + return false; } struct rtnl_link_ops batadv_link_ops __read_mostly = { diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h index 9ae265703..ec303ddbf 100644 --- a/net/batman-adv/soft-interface.h +++ b/net/batman-adv/soft-interface.h @@ -20,18 +20,20 @@ #include "main.h" +#include <linux/types.h> #include <net/rtnetlink.h> struct net_device; +struct net; struct sk_buff; int batadv_skb_head_push(struct sk_buff *skb, unsigned int len); void batadv_interface_rx(struct net_device *soft_iface, - struct sk_buff *skb, struct batadv_hard_iface *recv_if, - int hdr_size, struct batadv_orig_node *orig_node); -struct net_device *batadv_softif_create(const char *name); + struct sk_buff *skb, int hdr_size, + struct batadv_orig_node *orig_node); +struct net_device *batadv_softif_create(struct net *net, const char *name); void batadv_softif_destroy_sysfs(struct net_device *soft_iface); -int batadv_softif_is_valid(const struct net_device *net_dev); +bool batadv_softif_is_valid(const struct net_device *net_dev); extern struct rtnl_link_ops batadv_link_ops; int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid); void batadv_softif_vlan_put(struct batadv_softif_vlan *softif_vlan); diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index e7cf51333..414b20741 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -116,11 +116,13 @@ batadv_kobj_to_vlan(struct batadv_priv *bat_priv, struct kobject *obj) static char *batadv_uev_action_str[] = { "add", "del", - "change" + "change", + "loopdetect", }; static char *batadv_uev_type_str[] = { - "gw" + "gw", + "bla", }; /* Use this, if you have customized show and store functions for vlan attrs */ @@ -830,6 +832,7 @@ static ssize_t batadv_store_mesh_iface(struct kobject *kobj, size_t count) { struct net_device *net_dev = batadv_kobj_to_netdev(kobj); + struct net *net = dev_net(net_dev); struct batadv_hard_iface *hard_iface; int status_tmp = -1; int ret = count; @@ -873,7 +876,7 @@ static ssize_t batadv_store_mesh_iface(struct kobject *kobj, batadv_hardif_disable_interface(hard_iface, BATADV_IF_CLEANUP_AUTO); - ret = batadv_hardif_enable_interface(hard_iface, buff); + ret = batadv_hardif_enable_interface(hard_iface, net, buff); unlock: rtnl_unlock(); diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 9b4551a86..57ec87f37 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -43,7 +43,6 @@ #include <linux/stddef.h> #include <linux/string.h> #include <linux/workqueue.h> -#include <net/net_namespace.h> #include "bridge_loop_avoidance.h" #include "hard-interface.h" @@ -76,9 +75,9 @@ static void batadv_tt_global_del(struct batadv_priv *bat_priv, * * Compare the MAC address and the VLAN ID of the two TT entries and check if * they are the same TT client. - * Return: 1 if the two TT clients are the same, 0 otherwise + * Return: true if the two TT clients are the same, false otherwise */ -static int batadv_compare_tt(const struct hlist_node *node, const void *data2) +static bool batadv_compare_tt(const struct hlist_node *node, const void *data2) { const void *data1 = container_of(node, struct batadv_tt_common_entry, hash_entry); @@ -585,6 +584,7 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, struct batadv_priv *bat_priv = netdev_priv(soft_iface); struct batadv_tt_local_entry *tt_local; struct batadv_tt_global_entry *tt_global = NULL; + struct net *net = dev_net(soft_iface); struct batadv_softif_vlan *vlan; struct net_device *in_dev = NULL; struct hlist_head *head; @@ -596,7 +596,7 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, u32 match_mark; if (ifindex != BATADV_NULL_IFINDEX) - in_dev = dev_get_by_index(&init_net, ifindex); + in_dev = dev_get_by_index(net, ifindex); tt_local = batadv_tt_local_hash_find(bat_priv, addr, vid); @@ -650,8 +650,10 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, /* increase the refcounter of the related vlan */ vlan = batadv_softif_vlan_get(bat_priv, vid); - if (WARN(!vlan, "adding TT local entry %pM to non-existent VLAN %d", - addr, BATADV_PRINT_VID(vid))) { + if (!vlan) { + net_ratelimited_function(batadv_info, soft_iface, + "adding TT local entry %pM to non-existent VLAN %d\n", + addr, BATADV_PRINT_VID(vid)); kfree(tt_local); tt_local = NULL; goto out; @@ -691,7 +693,6 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, if (unlikely(hash_added != 0)) { /* remove the reference for the hash */ batadv_tt_local_entry_put(tt_local); - batadv_softif_vlan_put(vlan); goto out; } @@ -1010,8 +1011,8 @@ int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset) seq_printf(seq, "Locally retrieved addresses (from %s) announced via TT (TTVN: %u):\n", net_dev->name, (u8)atomic_read(&bat_priv->tt.vn)); - seq_printf(seq, " %-13s %s %-8s %-9s (%-10s)\n", "Client", "VID", - "Flags", "Last seen", "CRC"); + seq_puts(seq, + " Client VID Flags Last seen (CRC )\n"); for (i = 0; i < hash->size; i++) { head = &hash->table[i]; @@ -1680,9 +1681,8 @@ int batadv_tt_global_seq_print_text(struct seq_file *seq, void *offset) seq_printf(seq, "Globally announced TT entries received via the mesh %s\n", net_dev->name); - seq_printf(seq, " %-13s %s %s %-15s %s (%-10s) %s\n", - "Client", "VID", "(TTVN)", "Originator", "(Curr TTVN)", - "CRC", "Flags"); + seq_puts(seq, + " Client VID (TTVN) Originator (Curr TTVN) (CRC ) Flags\n"); for (i = 0; i < hash->size; i++) { head = &hash->table[i]; @@ -2270,6 +2270,29 @@ static u32 batadv_tt_local_crc(struct batadv_priv *bat_priv, return crc; } +/** + * batadv_tt_req_node_release - free tt_req node entry + * @ref: kref pointer of the tt req_node entry + */ +static void batadv_tt_req_node_release(struct kref *ref) +{ + struct batadv_tt_req_node *tt_req_node; + + tt_req_node = container_of(ref, struct batadv_tt_req_node, refcount); + + kfree(tt_req_node); +} + +/** + * batadv_tt_req_node_put - decrement the tt_req_node refcounter and + * possibly release it + * @tt_req_node: tt_req_node to be free'd + */ +static void batadv_tt_req_node_put(struct batadv_tt_req_node *tt_req_node) +{ + kref_put(&tt_req_node->refcount, batadv_tt_req_node_release); +} + static void batadv_tt_req_list_free(struct batadv_priv *bat_priv) { struct batadv_tt_req_node *node; @@ -2279,7 +2302,7 @@ static void batadv_tt_req_list_free(struct batadv_priv *bat_priv) hlist_for_each_entry_safe(node, safe, &bat_priv->tt.req_list, list) { hlist_del_init(&node->list); - kfree(node); + batadv_tt_req_node_put(node); } spin_unlock_bh(&bat_priv->tt.req_list_lock); @@ -2316,7 +2339,7 @@ static void batadv_tt_req_purge(struct batadv_priv *bat_priv) if (batadv_has_timed_out(node->issued_at, BATADV_TT_REQUEST_TIMEOUT)) { hlist_del_init(&node->list); - kfree(node); + batadv_tt_req_node_put(node); } } spin_unlock_bh(&bat_priv->tt.req_list_lock); @@ -2348,9 +2371,11 @@ batadv_tt_req_node_new(struct batadv_priv *bat_priv, if (!tt_req_node) goto unlock; + kref_init(&tt_req_node->refcount); ether_addr_copy(tt_req_node->addr, orig_node->orig); tt_req_node->issued_at = jiffies; + kref_get(&tt_req_node->refcount); hlist_add_head(&tt_req_node->list, &bat_priv->tt.req_list); unlock: spin_unlock_bh(&bat_priv->tt.req_list_lock); @@ -2362,19 +2387,19 @@ unlock: * @entry_ptr: to be checked local tt entry * @data_ptr: not used but definition required to satisfy the callback prototype * - * Return: 1 if the entry is a valid, 0 otherwise. + * Return: true if the entry is a valid, false otherwise. */ -static int batadv_tt_local_valid(const void *entry_ptr, const void *data_ptr) +static bool batadv_tt_local_valid(const void *entry_ptr, const void *data_ptr) { const struct batadv_tt_common_entry *tt_common_entry = entry_ptr; if (tt_common_entry->flags & BATADV_TT_CLIENT_NEW) - return 0; - return 1; + return false; + return true; } -static int batadv_tt_global_valid(const void *entry_ptr, - const void *data_ptr) +static bool batadv_tt_global_valid(const void *entry_ptr, + const void *data_ptr) { const struct batadv_tt_common_entry *tt_common_entry = entry_ptr; const struct batadv_tt_global_entry *tt_global_entry; @@ -2382,7 +2407,7 @@ static int batadv_tt_global_valid(const void *entry_ptr, if (tt_common_entry->flags & BATADV_TT_CLIENT_ROAM || tt_common_entry->flags & BATADV_TT_CLIENT_TEMP) - return 0; + return false; tt_global_entry = container_of(tt_common_entry, struct batadv_tt_global_entry, @@ -2404,7 +2429,8 @@ static int batadv_tt_global_valid(const void *entry_ptr, static void batadv_tt_tvlv_generate(struct batadv_priv *bat_priv, struct batadv_hashtable *hash, void *tvlv_buff, u16 tt_len, - int (*valid_cb)(const void *, const void *), + bool (*valid_cb)(const void *, + const void *), void *cb_data) { struct batadv_tt_common_entry *tt_common_entry; @@ -2553,11 +2579,11 @@ static void batadv_tt_global_update_crc(struct batadv_priv *bat_priv, * * Return: true if the TT Request was sent, false otherwise */ -static int batadv_send_tt_request(struct batadv_priv *bat_priv, - struct batadv_orig_node *dst_orig_node, - u8 ttvn, - struct batadv_tvlv_tt_vlan_data *tt_vlan, - u16 num_vlan, bool full_table) +static bool batadv_send_tt_request(struct batadv_priv *bat_priv, + struct batadv_orig_node *dst_orig_node, + u8 ttvn, + struct batadv_tvlv_tt_vlan_data *tt_vlan, + u16 num_vlan, bool full_table) { struct batadv_tvlv_tt_data *tvlv_tt_data = NULL; struct batadv_tt_req_node *tt_req_node = NULL; @@ -2613,13 +2639,19 @@ static int batadv_send_tt_request(struct batadv_priv *bat_priv, out: if (primary_if) batadv_hardif_put(primary_if); + if (ret && tt_req_node) { spin_lock_bh(&bat_priv->tt.req_list_lock); - /* hlist_del_init() verifies tt_req_node still is in the list */ - hlist_del_init(&tt_req_node->list); + if (!hlist_unhashed(&tt_req_node->list)) { + hlist_del_init(&tt_req_node->list); + batadv_tt_req_node_put(tt_req_node); + } spin_unlock_bh(&bat_priv->tt.req_list_lock); - kfree(tt_req_node); } + + if (tt_req_node) + batadv_tt_req_node_put(tt_req_node); + kfree(tvlv_tt_data); return ret; } @@ -3055,7 +3087,7 @@ static void batadv_handle_tt_response(struct batadv_priv *bat_priv, if (!batadv_compare_eth(node->addr, resp_src)) continue; hlist_del_init(&node->list); - kfree(node); + batadv_tt_req_node_put(node); } spin_unlock_bh(&bat_priv->tt.req_list_lock); @@ -3201,7 +3233,7 @@ static void batadv_tt_purge(struct work_struct *work) struct batadv_priv_tt *priv_tt; struct batadv_priv *bat_priv; - delayed_work = container_of(work, struct delayed_work, work); + delayed_work = to_delayed_work(work); priv_tt = container_of(delayed_work, struct batadv_priv_tt, work); bat_priv = container_of(priv_tt, struct batadv_priv, tt); diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 1e47fbe8b..74d865a4d 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -330,7 +330,9 @@ struct batadv_orig_node { DECLARE_BITMAP(bcast_bits, BATADV_TQ_LOCAL_WINDOW_SIZE); u32 last_bcast_seqno; struct hlist_head neigh_list; - /* neigh_list_lock protects: neigh_list and router */ + /* neigh_list_lock protects: neigh_list, ifinfo_list, + * last_bonding_candidate and router + */ spinlock_t neigh_list_lock; struct hlist_node hash_entry; struct batadv_priv *bat_priv; @@ -657,6 +659,9 @@ struct batadv_priv_tt { * @num_requests: number of bla requests in flight * @claim_hash: hash table containing mesh nodes this host has claimed * @backbone_hash: hash table containing all detected backbone gateways + * @loopdetect_addr: MAC address used for own loopdetection frames + * @loopdetect_lasttime: time when the loopdetection frames were sent + * @loopdetect_next: how many periods to wait for the next loopdetect process * @bcast_duplist: recently received broadcast packets array (for broadcast * duplicate suppression) * @bcast_duplist_curr: index of last broadcast packet added to bcast_duplist @@ -668,6 +673,9 @@ struct batadv_priv_bla { atomic_t num_requests; struct batadv_hashtable *claim_hash; struct batadv_hashtable *backbone_hash; + u8 loopdetect_addr[ETH_ALEN]; + unsigned long loopdetect_lasttime; + atomic_t loopdetect_next; struct batadv_bcast_duplist_entry bcast_duplist[BATADV_DUPLIST_SIZE]; int bcast_duplist_curr; /* protects bcast_duplist & bcast_duplist_curr */ @@ -1012,6 +1020,7 @@ struct batadv_socket_packet { * resolved * @crc: crc16 checksum over all claims * @crc_lock: lock protecting crc + * @report_work: work struct for reporting detected loops * @refcount: number of contexts the object is used * @rcu: struct used for freeing in an RCU-safe manner */ @@ -1025,6 +1034,7 @@ struct batadv_bla_backbone_gw { atomic_t request_sent; u16 crc; spinlock_t crc_lock; /* protects crc */ + struct work_struct report_work; struct kref refcount; struct rcu_head rcu; }; @@ -1034,6 +1044,7 @@ struct batadv_bla_backbone_gw { * @addr: mac address of claimed non-mesh client * @vid: vlan id this client was detected on * @backbone_gw: pointer to backbone gw claiming this client + * @backbone_lock: lock protecting backbone_gw pointer * @lasttime: last time we heard of claim (locals only) * @hash_entry: hlist node for batadv_priv_bla::claim_hash * @refcount: number of contexts the object is used @@ -1043,6 +1054,7 @@ struct batadv_bla_claim { u8 addr[ETH_ALEN]; unsigned short vid; struct batadv_bla_backbone_gw *backbone_gw; + spinlock_t backbone_lock; /* protects backbone_gw */ unsigned long lasttime; struct hlist_node hash_entry; struct rcu_head rcu; @@ -1129,11 +1141,13 @@ struct batadv_tt_change_node { * struct batadv_tt_req_node - data to keep track of the tt requests in flight * @addr: mac address address of the originator this request was sent to * @issued_at: timestamp used for purging stale tt requests + * @refcount: number of contexts the object is used by * @list: list node for batadv_priv_tt::req_list */ struct batadv_tt_req_node { u8 addr[ETH_ALEN]; unsigned long issued_at; + struct kref refcount; struct hlist_node list; }; diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index e45cb9155..780089d75 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -68,7 +68,7 @@ struct lowpan_peer { struct in6_addr peer_addr; }; -struct lowpan_dev { +struct lowpan_btle_dev { struct list_head list; struct hci_dev *hdev; @@ -80,18 +80,21 @@ struct lowpan_dev { struct delayed_work notify_peers; }; -static inline struct lowpan_dev *lowpan_dev(const struct net_device *netdev) +static inline struct lowpan_btle_dev * +lowpan_btle_dev(const struct net_device *netdev) { - return (struct lowpan_dev *)lowpan_priv(netdev)->priv; + return (struct lowpan_btle_dev *)lowpan_dev(netdev)->priv; } -static inline void peer_add(struct lowpan_dev *dev, struct lowpan_peer *peer) +static inline void peer_add(struct lowpan_btle_dev *dev, + struct lowpan_peer *peer) { list_add_rcu(&peer->list, &dev->peers); atomic_inc(&dev->peer_count); } -static inline bool peer_del(struct lowpan_dev *dev, struct lowpan_peer *peer) +static inline bool peer_del(struct lowpan_btle_dev *dev, + struct lowpan_peer *peer) { list_del_rcu(&peer->list); kfree_rcu(peer, rcu); @@ -106,7 +109,7 @@ static inline bool peer_del(struct lowpan_dev *dev, struct lowpan_peer *peer) return false; } -static inline struct lowpan_peer *peer_lookup_ba(struct lowpan_dev *dev, +static inline struct lowpan_peer *peer_lookup_ba(struct lowpan_btle_dev *dev, bdaddr_t *ba, __u8 type) { struct lowpan_peer *peer; @@ -134,8 +137,8 @@ static inline struct lowpan_peer *peer_lookup_ba(struct lowpan_dev *dev, return NULL; } -static inline struct lowpan_peer *__peer_lookup_chan(struct lowpan_dev *dev, - struct l2cap_chan *chan) +static inline struct lowpan_peer * +__peer_lookup_chan(struct lowpan_btle_dev *dev, struct l2cap_chan *chan) { struct lowpan_peer *peer; @@ -147,8 +150,8 @@ static inline struct lowpan_peer *__peer_lookup_chan(struct lowpan_dev *dev, return NULL; } -static inline struct lowpan_peer *__peer_lookup_conn(struct lowpan_dev *dev, - struct l2cap_conn *conn) +static inline struct lowpan_peer * +__peer_lookup_conn(struct lowpan_btle_dev *dev, struct l2cap_conn *conn) { struct lowpan_peer *peer; @@ -160,7 +163,7 @@ static inline struct lowpan_peer *__peer_lookup_conn(struct lowpan_dev *dev, return NULL; } -static inline struct lowpan_peer *peer_lookup_dst(struct lowpan_dev *dev, +static inline struct lowpan_peer *peer_lookup_dst(struct lowpan_btle_dev *dev, struct in6_addr *daddr, struct sk_buff *skb) { @@ -220,7 +223,7 @@ static inline struct lowpan_peer *peer_lookup_dst(struct lowpan_dev *dev, static struct lowpan_peer *lookup_peer(struct l2cap_conn *conn) { - struct lowpan_dev *entry; + struct lowpan_btle_dev *entry; struct lowpan_peer *peer = NULL; rcu_read_lock(); @@ -236,10 +239,10 @@ static struct lowpan_peer *lookup_peer(struct l2cap_conn *conn) return peer; } -static struct lowpan_dev *lookup_dev(struct l2cap_conn *conn) +static struct lowpan_btle_dev *lookup_dev(struct l2cap_conn *conn) { - struct lowpan_dev *entry; - struct lowpan_dev *dev = NULL; + struct lowpan_btle_dev *entry; + struct lowpan_btle_dev *dev = NULL; rcu_read_lock(); @@ -270,10 +273,10 @@ static int iphc_decompress(struct sk_buff *skb, struct net_device *netdev, struct l2cap_chan *chan) { const u8 *saddr, *daddr; - struct lowpan_dev *dev; + struct lowpan_btle_dev *dev; struct lowpan_peer *peer; - dev = lowpan_dev(netdev); + dev = lowpan_btle_dev(netdev); rcu_read_lock(); peer = __peer_lookup_chan(dev, chan); @@ -375,7 +378,7 @@ drop: /* Packet from BT LE device */ static int chan_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb) { - struct lowpan_dev *dev; + struct lowpan_btle_dev *dev; struct lowpan_peer *peer; int err; @@ -432,7 +435,7 @@ static int setup_header(struct sk_buff *skb, struct net_device *netdev, { struct in6_addr ipv6_daddr; struct ipv6hdr *hdr; - struct lowpan_dev *dev; + struct lowpan_btle_dev *dev; struct lowpan_peer *peer; bdaddr_t addr, *any = BDADDR_ANY; u8 *daddr = any->b; @@ -440,7 +443,7 @@ static int setup_header(struct sk_buff *skb, struct net_device *netdev, hdr = ipv6_hdr(skb); - dev = lowpan_dev(netdev); + dev = lowpan_btle_dev(netdev); memcpy(&ipv6_daddr, &hdr->daddr, sizeof(ipv6_daddr)); @@ -540,19 +543,19 @@ static int send_pkt(struct l2cap_chan *chan, struct sk_buff *skb, static int send_mcast_pkt(struct sk_buff *skb, struct net_device *netdev) { struct sk_buff *local_skb; - struct lowpan_dev *entry; + struct lowpan_btle_dev *entry; int err = 0; rcu_read_lock(); list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) { struct lowpan_peer *pentry; - struct lowpan_dev *dev; + struct lowpan_btle_dev *dev; if (entry->netdev != netdev) continue; - dev = lowpan_dev(entry->netdev); + dev = lowpan_btle_dev(entry->netdev); list_for_each_entry_rcu(pentry, &dev->peers, list) { int ret; @@ -720,8 +723,8 @@ static void ifdown(struct net_device *netdev) static void do_notify_peers(struct work_struct *work) { - struct lowpan_dev *dev = container_of(work, struct lowpan_dev, - notify_peers.work); + struct lowpan_btle_dev *dev = container_of(work, struct lowpan_btle_dev, + notify_peers.work); netdev_notify_peers(dev->netdev); /* send neighbour adv at startup */ } @@ -763,7 +766,7 @@ static void set_ip_addr_bits(u8 addr_type, u8 *addr) } static struct l2cap_chan *add_peer_chan(struct l2cap_chan *chan, - struct lowpan_dev *dev) + struct lowpan_btle_dev *dev) { struct lowpan_peer *peer; @@ -800,12 +803,12 @@ static struct l2cap_chan *add_peer_chan(struct l2cap_chan *chan, return peer->chan; } -static int setup_netdev(struct l2cap_chan *chan, struct lowpan_dev **dev) +static int setup_netdev(struct l2cap_chan *chan, struct lowpan_btle_dev **dev) { struct net_device *netdev; int err = 0; - netdev = alloc_netdev(LOWPAN_PRIV_SIZE(sizeof(struct lowpan_dev)), + netdev = alloc_netdev(LOWPAN_PRIV_SIZE(sizeof(struct lowpan_btle_dev)), IFACE_NAME_TEMPLATE, NET_NAME_UNKNOWN, netdev_setup); if (!netdev) @@ -817,7 +820,7 @@ static int setup_netdev(struct l2cap_chan *chan, struct lowpan_dev **dev) SET_NETDEV_DEV(netdev, &chan->conn->hcon->hdev->dev); SET_NETDEV_DEVTYPE(netdev, &bt_type); - *dev = lowpan_dev(netdev); + *dev = lowpan_btle_dev(netdev); (*dev)->netdev = netdev; (*dev)->hdev = chan->conn->hcon->hdev; INIT_LIST_HEAD(&(*dev)->peers); @@ -850,7 +853,7 @@ out: static inline void chan_ready_cb(struct l2cap_chan *chan) { - struct lowpan_dev *dev; + struct lowpan_btle_dev *dev; dev = lookup_dev(chan->conn); @@ -887,8 +890,9 @@ static inline struct l2cap_chan *chan_new_conn_cb(struct l2cap_chan *pchan) static void delete_netdev(struct work_struct *work) { - struct lowpan_dev *entry = container_of(work, struct lowpan_dev, - delete_netdev); + struct lowpan_btle_dev *entry = container_of(work, + struct lowpan_btle_dev, + delete_netdev); lowpan_unregister_netdev(entry->netdev); @@ -897,8 +901,8 @@ static void delete_netdev(struct work_struct *work) static void chan_close_cb(struct l2cap_chan *chan) { - struct lowpan_dev *entry; - struct lowpan_dev *dev = NULL; + struct lowpan_btle_dev *entry; + struct lowpan_btle_dev *dev = NULL; struct lowpan_peer *peer; int err = -ENOENT; bool last = false, remove = true; @@ -918,7 +922,7 @@ static void chan_close_cb(struct l2cap_chan *chan) spin_lock(&devices_lock); list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) { - dev = lowpan_dev(entry->netdev); + dev = lowpan_btle_dev(entry->netdev); peer = __peer_lookup_chan(dev, chan); if (peer) { last = peer_del(dev, peer); @@ -1128,7 +1132,7 @@ static int get_l2cap_conn(char *buf, bdaddr_t *addr, u8 *addr_type, static void disconnect_all_peers(void) { - struct lowpan_dev *entry; + struct lowpan_btle_dev *entry; struct lowpan_peer *peer, *tmp_peer, *new_peer; struct list_head peers; @@ -1288,7 +1292,7 @@ static ssize_t lowpan_control_write(struct file *fp, static int lowpan_control_show(struct seq_file *f, void *ptr) { - struct lowpan_dev *entry; + struct lowpan_btle_dev *entry; struct lowpan_peer *peer; spin_lock(&devices_lock); @@ -1319,7 +1323,7 @@ static const struct file_operations lowpan_control_fops = { static void disconnect_devices(void) { - struct lowpan_dev *entry, *tmp, *new_dev; + struct lowpan_btle_dev *entry, *tmp, *new_dev; struct list_head devices; INIT_LIST_HEAD(&devices); @@ -1357,7 +1361,7 @@ static int device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); - struct lowpan_dev *entry; + struct lowpan_btle_dev *entry; if (netdev->type != ARPHRD_6LOWPAN) return NOTIFY_DONE; diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 955eda93e..3df7aefb7 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -65,7 +65,7 @@ static const char *const bt_slock_key_strings[BT_MAX_PROTO] = { void bt_sock_reclassify_lock(struct sock *sk, int proto) { BUG_ON(!sk); - BUG_ON(sock_owned_by_user(sk)); + BUG_ON(!sock_allow_reclassification(sk)); sock_lock_init_class_and_name(sk, bt_slock_key_strings[proto], &bt_slock_key[proto], diff --git a/net/bluetooth/bnep/netdev.c b/net/bluetooth/bnep/netdev.c index 6ceb5d36a..f4fcb4a9d 100644 --- a/net/bluetooth/bnep/netdev.c +++ b/net/bluetooth/bnep/netdev.c @@ -188,7 +188,7 @@ static netdev_tx_t bnep_net_xmit(struct sk_buff *skb, * So we have to queue them and wake up session thread which is sleeping * on the sk_sleep(sk). */ - dev->trans_start = jiffies; + netif_trans_update(dev); skb_queue_tail(&sk->sk_write_queue, skb); wake_up_interruptible(sk_sleep(sk)); diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 2713fc86e..45a9fc68c 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3139,10 +3139,10 @@ void hci_unregister_dev(struct hci_dev *hdev) list_del(&hdev->list); write_unlock(&hci_dev_list_lock); - hci_dev_do_close(hdev); - cancel_work_sync(&hdev->power_on); + hci_dev_do_close(hdev); + if (!test_bit(HCI_INIT, &hdev->flags) && !hci_dev_test_flag(hdev, HCI_SETUP) && !hci_dev_test_flag(hdev, HCI_CONFIG)) { diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index c162af5d1..d4b3dd541 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -4727,6 +4727,19 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, u32 flags; u8 *ptr, real_len; + switch (type) { + case LE_ADV_IND: + case LE_ADV_DIRECT_IND: + case LE_ADV_SCAN_IND: + case LE_ADV_NONCONN_IND: + case LE_ADV_SCAN_RSP: + break; + default: + BT_ERR_RATELIMITED("Unknown advetising packet type: 0x%02x", + type); + return; + } + /* Find the end of the data in case the report contains padded zero * bytes at the end causing an invalid length value. * diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 6e125d76d..c045b3c54 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -1065,6 +1065,9 @@ static u8 create_instance_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr) if (instance_flags & MGMT_ADV_FLAG_LIMITED_DISCOV) flags |= LE_AD_LIMITED; + if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) + flags |= LE_AD_NO_BREDR; + if (flags || (instance_flags & MGMT_ADV_FLAG_MANAGED_FLAGS)) { /* If a discovery flag wasn't provided, simply use the global * settings. @@ -1072,9 +1075,6 @@ static u8 create_instance_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr) if (!flags) flags |= mgmt_get_adv_discov_flags(hdev); - if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) - flags |= LE_AD_NO_BREDR; - /* If flags would still be empty, then there is no need to * include the "Flags" AD field". */ diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index e4cae7289..388ee8b59 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -778,7 +778,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, } if (sec.level < BT_SECURITY_LOW || - sec.level > BT_SECURITY_HIGH) { + sec.level > BT_SECURITY_FIPS) { err = -EINVAL; break; } diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 160797722..28d5ec269 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -213,8 +213,7 @@ drop: } EXPORT_SYMBOL_GPL(br_handle_frame_finish); -/* note: already called with rcu_read_lock */ -static int br_handle_local_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +static void __br_handle_local_finish(struct sk_buff *skb) { struct net_bridge_port *p = br_port_get_rcu(skb->dev); u16 vid = 0; @@ -222,6 +221,14 @@ static int br_handle_local_finish(struct net *net, struct sock *sk, struct sk_bu /* check if vlan is allowed, to avoid spoofing */ if (p->flags & BR_LEARNING && br_should_learn(p, skb, &vid)) br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid, false); +} + +/* note: already called with rcu_read_lock */ +static int br_handle_local_finish(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + struct net_bridge_port *p = br_port_get_rcu(skb->dev); + + __br_handle_local_finish(skb); BR_INPUT_SKB_CB(skb)->brdev = p->br->dev; br_pass_frame_up(skb); @@ -274,11 +281,21 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb) if (p->br->stp_enabled == BR_NO_STP || fwd_mask & (1u << dest[5])) goto forward; - break; + *pskb = skb; + __br_handle_local_finish(skb); + return RX_HANDLER_PASS; case 0x01: /* IEEE MAC (Pause) */ goto drop; + case 0x0E: /* 802.1AB LLDP */ + fwd_mask |= p->br->group_fwd_mask; + if (fwd_mask & (1u << dest[5])) + goto forward; + *pskb = skb; + __br_handle_local_finish(skb); + return RX_HANDLER_PASS; + default: /* Allow selective forwarding for most other protocols */ fwd_mask |= p->br->group_fwd_mask; diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c index 60a3dbfca..d99b20097 100644 --- a/net/bridge/br_ioctl.c +++ b/net/bridge/br_ioctl.c @@ -113,7 +113,9 @@ static int add_del_if(struct net_bridge *br, int ifindex, int isadd) static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) { struct net_bridge *br = netdev_priv(dev); + struct net_bridge_port *p = NULL; unsigned long args[4]; + int ret = -EOPNOTSUPP; if (copy_from_user(args, rq->ifr_data, sizeof(args))) return -EFAULT; @@ -183,25 +185,29 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; - return br_set_forward_delay(br, args[1]); + ret = br_set_forward_delay(br, args[1]); + break; case BRCTL_SET_BRIDGE_HELLO_TIME: if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; - return br_set_hello_time(br, args[1]); + ret = br_set_hello_time(br, args[1]); + break; case BRCTL_SET_BRIDGE_MAX_AGE: if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; - return br_set_max_age(br, args[1]); + ret = br_set_max_age(br, args[1]); + break; case BRCTL_SET_AGEING_TIME: if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; - return br_set_ageing_time(br, args[1]); + ret = br_set_ageing_time(br, args[1]); + break; case BRCTL_GET_PORT_INFO: { @@ -241,20 +247,19 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) return -EPERM; br_stp_set_enabled(br, args[1]); - return 0; + ret = 0; + break; case BRCTL_SET_BRIDGE_PRIORITY: if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; br_stp_set_bridge_priority(br, args[1]); - return 0; + ret = 0; + break; case BRCTL_SET_PORT_PRIORITY: { - struct net_bridge_port *p; - int ret; - if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; @@ -264,14 +269,11 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) else ret = br_stp_set_port_priority(p, args[2]); spin_unlock_bh(&br->lock); - return ret; + break; } case BRCTL_SET_PATH_COST: { - struct net_bridge_port *p; - int ret; - if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; @@ -281,8 +283,7 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) else ret = br_stp_set_path_cost(p, args[2]); spin_unlock_bh(&br->lock); - - return ret; + break; } case BRCTL_GET_FDB_ENTRIES: @@ -290,7 +291,14 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) args[2], args[3]); } - return -EOPNOTSUPP; + if (!ret) { + if (p) + br_ifinfo_notify(RTM_NEWLINK, p); + else + netdev_state_change(br->dev); + } + + return ret; } static int old_deviceless(struct net *net, void __user *uarg) diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 44114a94c..77e7f69bf 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -217,13 +217,13 @@ static int br_validate_ipv4(struct net *net, struct sk_buff *skb) len = ntohs(iph->tot_len); if (skb->len < len) { - IP_INC_STATS_BH(net, IPSTATS_MIB_INTRUNCATEDPKTS); + __IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; } else if (len < (iph->ihl*4)) goto inhdr_error; if (pskb_trim_rcsum(skb, len)) { - IP_INC_STATS_BH(net, IPSTATS_MIB_INDISCARDS); + __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS); goto drop; } @@ -236,7 +236,7 @@ static int br_validate_ipv4(struct net *net, struct sk_buff *skb) return 0; inhdr_error: - IP_INC_STATS_BH(net, IPSTATS_MIB_INHDRERRORS); + __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS); drop: return -1; } @@ -700,7 +700,7 @@ static int br_nf_ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, int (*output)(struct net *, struct sock *, struct sk_buff *)) { - unsigned int mtu = ip_skb_dst_mtu(skb); + unsigned int mtu = ip_skb_dst_mtu(sk, skb); struct iphdr *iph = ip_hdr(skb); if (unlikely(((iph->frag_off & htons(IP_DF)) && !skb->ignore_df) || diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c index d61f56efc..5e59a8457 100644 --- a/net/bridge/br_netfilter_ipv6.c +++ b/net/bridge/br_netfilter_ipv6.c @@ -122,13 +122,13 @@ int br_validate_ipv6(struct net *net, struct sk_buff *skb) if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) { if (pkt_len + ip6h_len > skb->len) { - IP6_INC_STATS_BH(net, idev, - IPSTATS_MIB_INTRUNCATEDPKTS); + __IP6_INC_STATS(net, idev, + IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; } if (pskb_trim_rcsum(skb, pkt_len + ip6h_len)) { - IP6_INC_STATS_BH(net, idev, - IPSTATS_MIB_INDISCARDS); + __IP6_INC_STATS(net, idev, + IPSTATS_MIB_INDISCARDS); goto drop; } } @@ -142,7 +142,7 @@ int br_validate_ipv6(struct net *net, struct sk_buff *skb) return 0; inhdr_error: - IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); drop: return -1; } diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index e9c635eae..85e89f693 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -135,9 +135,9 @@ static inline size_t br_port_info_size(void) + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_NO */ + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_TOPOLOGY_CHANGE_ACK */ + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_CONFIG_PENDING */ - + nla_total_size(sizeof(u64)) /* IFLA_BRPORT_MESSAGE_AGE_TIMER */ - + nla_total_size(sizeof(u64)) /* IFLA_BRPORT_FORWARD_DELAY_TIMER */ - + nla_total_size(sizeof(u64)) /* IFLA_BRPORT_HOLD_TIMER */ + + nla_total_size_64bit(sizeof(u64)) /* IFLA_BRPORT_MESSAGE_AGE_TIMER */ + + nla_total_size_64bit(sizeof(u64)) /* IFLA_BRPORT_FORWARD_DELAY_TIMER */ + + nla_total_size_64bit(sizeof(u64)) /* IFLA_BRPORT_HOLD_TIMER */ #ifdef CONFIG_BRIDGE_IGMP_SNOOPING + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MULTICAST_ROUTER */ #endif @@ -190,13 +190,16 @@ static int br_port_fill_attrs(struct sk_buff *skb, return -EMSGSIZE; timerval = br_timer_value(&p->message_age_timer); - if (nla_put_u64(skb, IFLA_BRPORT_MESSAGE_AGE_TIMER, timerval)) + if (nla_put_u64_64bit(skb, IFLA_BRPORT_MESSAGE_AGE_TIMER, timerval, + IFLA_BRPORT_PAD)) return -EMSGSIZE; timerval = br_timer_value(&p->forward_delay_timer); - if (nla_put_u64(skb, IFLA_BRPORT_FORWARD_DELAY_TIMER, timerval)) + if (nla_put_u64_64bit(skb, IFLA_BRPORT_FORWARD_DELAY_TIMER, timerval, + IFLA_BRPORT_PAD)) return -EMSGSIZE; timerval = br_timer_value(&p->hold_timer); - if (nla_put_u64(skb, IFLA_BRPORT_HOLD_TIMER, timerval)) + if (nla_put_u64_64bit(skb, IFLA_BRPORT_HOLD_TIMER, timerval, + IFLA_BRPORT_PAD)) return -EMSGSIZE; #ifdef CONFIG_BRIDGE_IGMP_SNOOPING @@ -847,6 +850,7 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = { [IFLA_BR_NF_CALL_IP6TABLES] = { .type = NLA_U8 }, [IFLA_BR_NF_CALL_ARPTABLES] = { .type = NLA_U8 }, [IFLA_BR_VLAN_DEFAULT_PVID] = { .type = NLA_U16 }, + [IFLA_BR_VLAN_STATS_ENABLED] = { .type = NLA_U8 }, }; static int br_changelink(struct net_device *brdev, struct nlattr *tb[], @@ -918,6 +922,14 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], if (err) return err; } + + if (data[IFLA_BR_VLAN_STATS_ENABLED]) { + __u8 vlan_stats = nla_get_u8(data[IFLA_BR_VLAN_STATS_ENABLED]); + + err = br_vlan_set_stats(br, vlan_stats); + if (err) + return err; + } #endif if (data[IFLA_BR_GROUP_FWD_MASK]) { @@ -1079,6 +1091,7 @@ static size_t br_get_size(const struct net_device *brdev) #ifdef CONFIG_BRIDGE_VLAN_FILTERING nla_total_size(sizeof(__be16)) + /* IFLA_BR_VLAN_PROTOCOL */ nla_total_size(sizeof(u16)) + /* IFLA_BR_VLAN_DEFAULT_PVID */ + nla_total_size(sizeof(u8)) + /* IFLA_BR_VLAN_STATS_ENABLED */ #endif nla_total_size(sizeof(u16)) + /* IFLA_BR_GROUP_FWD_MASK */ nla_total_size(sizeof(struct ifla_bridge_id)) + /* IFLA_BR_ROOT_ID */ @@ -1087,10 +1100,10 @@ static size_t br_get_size(const struct net_device *brdev) nla_total_size(sizeof(u32)) + /* IFLA_BR_ROOT_PATH_COST */ nla_total_size(sizeof(u8)) + /* IFLA_BR_TOPOLOGY_CHANGE */ nla_total_size(sizeof(u8)) + /* IFLA_BR_TOPOLOGY_CHANGE_DETECTED */ - nla_total_size(sizeof(u64)) + /* IFLA_BR_HELLO_TIMER */ - nla_total_size(sizeof(u64)) + /* IFLA_BR_TCN_TIMER */ - nla_total_size(sizeof(u64)) + /* IFLA_BR_TOPOLOGY_CHANGE_TIMER */ - nla_total_size(sizeof(u64)) + /* IFLA_BR_GC_TIMER */ + nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_HELLO_TIMER */ + nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_TCN_TIMER */ + nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_TOPOLOGY_CHANGE_TIMER */ + nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_GC_TIMER */ nla_total_size(ETH_ALEN) + /* IFLA_BR_GROUP_ADDR */ #ifdef CONFIG_BRIDGE_IGMP_SNOOPING nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_ROUTER */ @@ -1101,12 +1114,12 @@ static size_t br_get_size(const struct net_device *brdev) nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_HASH_MAX */ nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_LAST_MEMBER_CNT */ nla_total_size(sizeof(u32)) + /* IFLA_BR_MCAST_STARTUP_QUERY_CNT */ - nla_total_size(sizeof(u64)) + /* IFLA_BR_MCAST_LAST_MEMBER_INTVL */ - nla_total_size(sizeof(u64)) + /* IFLA_BR_MCAST_MEMBERSHIP_INTVL */ - nla_total_size(sizeof(u64)) + /* IFLA_BR_MCAST_QUERIER_INTVL */ - nla_total_size(sizeof(u64)) + /* IFLA_BR_MCAST_QUERY_INTVL */ - nla_total_size(sizeof(u64)) + /* IFLA_BR_MCAST_QUERY_RESPONSE_INTVL */ - nla_total_size(sizeof(u64)) + /* IFLA_BR_MCAST_STARTUP_QUERY_INTVL */ + nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_LAST_MEMBER_INTVL */ + nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_MEMBERSHIP_INTVL */ + nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_QUERIER_INTVL */ + nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_QUERY_INTVL */ + nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_QUERY_RESPONSE_INTVL */ + nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_STARTUP_QUERY_INTVL */ #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_IPTABLES */ @@ -1129,16 +1142,17 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) u64 clockval; clockval = br_timer_value(&br->hello_timer); - if (nla_put_u64(skb, IFLA_BR_HELLO_TIMER, clockval)) + if (nla_put_u64_64bit(skb, IFLA_BR_HELLO_TIMER, clockval, IFLA_BR_PAD)) return -EMSGSIZE; clockval = br_timer_value(&br->tcn_timer); - if (nla_put_u64(skb, IFLA_BR_TCN_TIMER, clockval)) + if (nla_put_u64_64bit(skb, IFLA_BR_TCN_TIMER, clockval, IFLA_BR_PAD)) return -EMSGSIZE; clockval = br_timer_value(&br->topology_change_timer); - if (nla_put_u64(skb, IFLA_BR_TOPOLOGY_CHANGE_TIMER, clockval)) + if (nla_put_u64_64bit(skb, IFLA_BR_TOPOLOGY_CHANGE_TIMER, clockval, + IFLA_BR_PAD)) return -EMSGSIZE; clockval = br_timer_value(&br->gc_timer); - if (nla_put_u64(skb, IFLA_BR_GC_TIMER, clockval)) + if (nla_put_u64_64bit(skb, IFLA_BR_GC_TIMER, clockval, IFLA_BR_PAD)) return -EMSGSIZE; if (nla_put_u32(skb, IFLA_BR_FORWARD_DELAY, forward_delay) || @@ -1163,7 +1177,8 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) #ifdef CONFIG_BRIDGE_VLAN_FILTERING if (nla_put_be16(skb, IFLA_BR_VLAN_PROTOCOL, br->vlan_proto) || - nla_put_u16(skb, IFLA_BR_VLAN_DEFAULT_PVID, br->default_pvid)) + nla_put_u16(skb, IFLA_BR_VLAN_DEFAULT_PVID, br->default_pvid) || + nla_put_u8(skb, IFLA_BR_VLAN_STATS_ENABLED, br->vlan_stats_enabled)) return -EMSGSIZE; #endif #ifdef CONFIG_BRIDGE_IGMP_SNOOPING @@ -1182,22 +1197,28 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) return -EMSGSIZE; clockval = jiffies_to_clock_t(br->multicast_last_member_interval); - if (nla_put_u64(skb, IFLA_BR_MCAST_LAST_MEMBER_INTVL, clockval)) + if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_LAST_MEMBER_INTVL, clockval, + IFLA_BR_PAD)) return -EMSGSIZE; clockval = jiffies_to_clock_t(br->multicast_membership_interval); - if (nla_put_u64(skb, IFLA_BR_MCAST_MEMBERSHIP_INTVL, clockval)) + if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_MEMBERSHIP_INTVL, clockval, + IFLA_BR_PAD)) return -EMSGSIZE; clockval = jiffies_to_clock_t(br->multicast_querier_interval); - if (nla_put_u64(skb, IFLA_BR_MCAST_QUERIER_INTVL, clockval)) + if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_QUERIER_INTVL, clockval, + IFLA_BR_PAD)) return -EMSGSIZE; clockval = jiffies_to_clock_t(br->multicast_query_interval); - if (nla_put_u64(skb, IFLA_BR_MCAST_QUERY_INTVL, clockval)) + if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_QUERY_INTVL, clockval, + IFLA_BR_PAD)) return -EMSGSIZE; clockval = jiffies_to_clock_t(br->multicast_query_response_interval); - if (nla_put_u64(skb, IFLA_BR_MCAST_QUERY_RESPONSE_INTVL, clockval)) + if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_QUERY_RESPONSE_INTVL, clockval, + IFLA_BR_PAD)) return -EMSGSIZE; clockval = jiffies_to_clock_t(br->multicast_startup_query_interval); - if (nla_put_u64(skb, IFLA_BR_MCAST_STARTUP_QUERY_INTVL, clockval)) + if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_STARTUP_QUERY_INTVL, clockval, + IFLA_BR_PAD)) return -EMSGSIZE; #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) @@ -1213,6 +1234,69 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) return 0; } +static size_t br_get_linkxstats_size(const struct net_device *dev) +{ + struct net_bridge *br = netdev_priv(dev); + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *v; + int numvls = 0; + + vg = br_vlan_group(br); + if (!vg) + return 0; + + /* we need to count all, even placeholder entries */ + list_for_each_entry(v, &vg->vlan_list, vlist) + numvls++; + + /* account for the vlans and the link xstats type nest attribute */ + return numvls * nla_total_size(sizeof(struct bridge_vlan_xstats)) + + nla_total_size(0); +} + +static int br_fill_linkxstats(struct sk_buff *skb, const struct net_device *dev, + int *prividx) +{ + struct net_bridge *br = netdev_priv(dev); + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *v; + struct nlattr *nest; + int vl_idx = 0; + + vg = br_vlan_group(br); + if (!vg) + goto out; + nest = nla_nest_start(skb, LINK_XSTATS_TYPE_BRIDGE); + if (!nest) + return -EMSGSIZE; + list_for_each_entry(v, &vg->vlan_list, vlist) { + struct bridge_vlan_xstats vxi; + struct br_vlan_stats stats; + + if (++vl_idx < *prividx) + continue; + memset(&vxi, 0, sizeof(vxi)); + vxi.vid = v->vid; + br_vlan_get_stats(v, &stats); + vxi.rx_bytes = stats.rx_bytes; + vxi.rx_packets = stats.rx_packets; + vxi.tx_bytes = stats.tx_bytes; + vxi.tx_packets = stats.tx_packets; + + if (nla_put(skb, BRIDGE_XSTATS_VLAN, sizeof(vxi), &vxi)) + goto nla_put_failure; + } + nla_nest_end(skb, nest); + *prividx = 0; +out: + return 0; + +nla_put_failure: + nla_nest_end(skb, nest); + *prividx = vl_idx; + + return -EMSGSIZE; +} static struct rtnl_af_ops br_af_ops __read_mostly = { .family = AF_BRIDGE, @@ -1231,6 +1315,8 @@ struct rtnl_link_ops br_link_ops __read_mostly = { .dellink = br_dev_delete, .get_size = br_get_size, .fill_info = br_fill_info, + .fill_linkxstats = br_fill_linkxstats, + .get_linkxstats_size = br_get_linkxstats_size, .slave_maxtype = IFLA_BRPORT_MAX, .slave_policy = br_port_policy, diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index f516c53ba..52edecf3c 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -77,12 +77,21 @@ struct bridge_mcast_querier { }; #endif +struct br_vlan_stats { + u64 rx_bytes; + u64 rx_packets; + u64 tx_bytes; + u64 tx_packets; + struct u64_stats_sync syncp; +}; + /** * struct net_bridge_vlan - per-vlan entry * * @vnode: rhashtable member * @vid: VLAN id * @flags: bridge vlan flags + * @stats: per-cpu VLAN statistics * @br: if MASTER flag set, this points to a bridge struct * @port: if MASTER flag unset, this points to a port struct * @refcnt: if MASTER flag set, this is bumped for each port referencing it @@ -100,6 +109,7 @@ struct net_bridge_vlan { struct rhash_head vnode; u16 vid; u16 flags; + struct br_vlan_stats __percpu *stats; union { struct net_bridge *br; struct net_bridge_port *port; @@ -343,6 +353,7 @@ struct net_bridge #ifdef CONFIG_BRIDGE_VLAN_FILTERING struct net_bridge_vlan_group __rcu *vlgrp; u8 vlan_enabled; + u8 vlan_stats_enabled; __be16 vlan_proto; u16 default_pvid; #endif @@ -706,6 +717,7 @@ int __br_vlan_filter_toggle(struct net_bridge *br, unsigned long val); int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val); int __br_vlan_set_proto(struct net_bridge *br, __be16 proto); int br_vlan_set_proto(struct net_bridge *br, unsigned long val); +int br_vlan_set_stats(struct net_bridge *br, unsigned long val); int br_vlan_init(struct net_bridge *br); int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val); int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid); @@ -714,6 +726,8 @@ int nbp_vlan_delete(struct net_bridge_port *port, u16 vid); void nbp_vlan_flush(struct net_bridge_port *port); int nbp_vlan_init(struct net_bridge_port *port); int nbp_get_num_vlan_infos(struct net_bridge_port *p, u32 filter_mask); +void br_vlan_get_stats(const struct net_bridge_vlan *v, + struct br_vlan_stats *stats); static inline struct net_bridge_vlan_group *br_vlan_group( const struct net_bridge *br) @@ -896,6 +910,10 @@ static inline struct net_bridge_vlan_group *nbp_vlan_group_rcu( return NULL; } +static inline void br_vlan_get_stats(const struct net_bridge_vlan *v, + struct br_vlan_stats *stats) +{ +} #endif struct nf_br_ops { diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index 6b8091407..beb47071e 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -43,7 +43,14 @@ static ssize_t store_bridge_parm(struct device *d, if (endp == buf) return -EINVAL; + if (!rtnl_trylock()) + return restart_syscall(); + err = (*set)(br, val); + if (!err) + netdev_state_change(br->dev); + rtnl_unlock(); + return err ? err : len; } @@ -101,15 +108,7 @@ static ssize_t ageing_time_show(struct device *d, static int set_ageing_time(struct net_bridge *br, unsigned long val) { - int ret; - - if (!rtnl_trylock()) - return restart_syscall(); - - ret = br_set_ageing_time(br, val); - rtnl_unlock(); - - return ret; + return br_set_ageing_time(br, val); } static ssize_t ageing_time_store(struct device *d, @@ -128,27 +127,18 @@ static ssize_t stp_state_show(struct device *d, } +static int set_stp_state(struct net_bridge *br, unsigned long val) +{ + br_stp_set_enabled(br, val); + + return 0; +} + static ssize_t stp_state_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { - struct net_bridge *br = to_bridge(d); - char *endp; - unsigned long val; - - if (!ns_capable(dev_net(br->dev)->user_ns, CAP_NET_ADMIN)) - return -EPERM; - - val = simple_strtoul(buf, &endp, 0); - if (endp == buf) - return -EINVAL; - - if (!rtnl_trylock()) - return restart_syscall(); - br_stp_set_enabled(br, val); - rtnl_unlock(); - - return len; + return store_bridge_parm(d, buf, len, set_stp_state); } static DEVICE_ATTR_RW(stp_state); @@ -160,29 +150,22 @@ static ssize_t group_fwd_mask_show(struct device *d, return sprintf(buf, "%#x\n", br->group_fwd_mask); } - -static ssize_t group_fwd_mask_store(struct device *d, - struct device_attribute *attr, - const char *buf, - size_t len) +static int set_group_fwd_mask(struct net_bridge *br, unsigned long val) { - struct net_bridge *br = to_bridge(d); - char *endp; - unsigned long val; - - if (!ns_capable(dev_net(br->dev)->user_ns, CAP_NET_ADMIN)) - return -EPERM; - - val = simple_strtoul(buf, &endp, 0); - if (endp == buf) - return -EINVAL; - if (val & BR_GROUPFWD_RESTRICTED) return -EINVAL; br->group_fwd_mask = val; - return len; + return 0; +} + +static ssize_t group_fwd_mask_store(struct device *d, + struct device_attribute *attr, + const char *buf, + size_t len) +{ + return store_bridge_parm(d, buf, len, set_group_fwd_mask); } static DEVICE_ATTR_RW(group_fwd_mask); @@ -328,6 +311,7 @@ static ssize_t group_addr_store(struct device *d, br->group_addr_set = true; br_recalculate_fwd_mask(br); + netdev_state_change(br->dev); rtnl_unlock(); @@ -336,17 +320,17 @@ static ssize_t group_addr_store(struct device *d, static DEVICE_ATTR_RW(group_addr); +static int set_flush(struct net_bridge *br, unsigned long val) +{ + br_fdb_flush(br); + return 0; +} + static ssize_t flush_store(struct device *d, struct device_attribute *attr, const char *buf, size_t len) { - struct net_bridge *br = to_bridge(d); - - if (!ns_capable(dev_net(br->dev)->user_ns, CAP_NET_ADMIN)) - return -EPERM; - - br_fdb_flush(br); - return len; + return store_bridge_parm(d, buf, len, set_flush); } static DEVICE_ATTR_WO(flush); @@ -747,6 +731,22 @@ static ssize_t default_pvid_store(struct device *d, return store_bridge_parm(d, buf, len, br_vlan_set_default_pvid); } static DEVICE_ATTR_RW(default_pvid); + +static ssize_t vlan_stats_enabled_show(struct device *d, + struct device_attribute *attr, + char *buf) +{ + struct net_bridge *br = to_bridge(d); + return sprintf(buf, "%u\n", br->vlan_stats_enabled); +} + +static ssize_t vlan_stats_enabled_store(struct device *d, + struct device_attribute *attr, + const char *buf, size_t len) +{ + return store_bridge_parm(d, buf, len, br_vlan_set_stats); +} +static DEVICE_ATTR_RW(vlan_stats_enabled); #endif static struct attribute *bridge_attrs[] = { @@ -794,6 +794,7 @@ static struct attribute *bridge_attrs[] = { &dev_attr_vlan_filtering.attr, &dev_attr_vlan_protocol.attr, &dev_attr_default_pvid.attr, + &dev_attr_vlan_stats_enabled.attr, #endif NULL }; diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index efe415ad8..1e04d4d44 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -61,7 +61,6 @@ static int store_flag(struct net_bridge_port *p, unsigned long v, if (flags != p->flags) { p->flags = flags; br_port_flags_change(p, mask); - br_ifinfo_notify(RTM_NEWLINK, p); } return 0; } @@ -253,8 +252,10 @@ static ssize_t brport_store(struct kobject *kobj, spin_lock_bh(&p->br->lock); ret = brport_attr->store(p, val); spin_unlock_bh(&p->br->lock); - if (ret == 0) + if (!ret) { + br_ifinfo_notify(RTM_NEWLINK, p); ret = count; + } } rtnl_unlock(); } diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 9309bb4f2..b6de4f457 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -162,6 +162,17 @@ static struct net_bridge_vlan *br_vlan_get_master(struct net_bridge *br, u16 vid return masterv; } +static void br_master_vlan_rcu_free(struct rcu_head *rcu) +{ + struct net_bridge_vlan *v; + + v = container_of(rcu, struct net_bridge_vlan, rcu); + WARN_ON(!br_vlan_is_master(v)); + free_percpu(v->stats); + v->stats = NULL; + kfree(v); +} + static void br_vlan_put_master(struct net_bridge_vlan *masterv) { struct net_bridge_vlan_group *vg; @@ -174,7 +185,7 @@ static void br_vlan_put_master(struct net_bridge_vlan *masterv) rhashtable_remove_fast(&vg->vlan_hash, &masterv->vnode, br_vlan_rht_params); __vlan_del_list(masterv); - kfree_rcu(masterv, rcu); + call_rcu(&masterv->rcu, br_master_vlan_rcu_free); } } @@ -230,6 +241,7 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags) if (!masterv) goto out_filt; v->brvlan = masterv; + v->stats = masterv->stats; } /* Add the dev mac and count the vlan only if it's usable */ @@ -329,6 +341,7 @@ struct sk_buff *br_handle_vlan(struct net_bridge *br, struct net_bridge_vlan_group *vg, struct sk_buff *skb) { + struct br_vlan_stats *stats; struct net_bridge_vlan *v; u16 vid; @@ -355,18 +368,27 @@ struct sk_buff *br_handle_vlan(struct net_bridge *br, return NULL; } } + if (br->vlan_stats_enabled) { + stats = this_cpu_ptr(v->stats); + u64_stats_update_begin(&stats->syncp); + stats->tx_bytes += skb->len; + stats->tx_packets++; + u64_stats_update_end(&stats->syncp); + } + if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED) skb->vlan_tci = 0; - out: return skb; } /* Called under RCU */ -static bool __allowed_ingress(struct net_bridge_vlan_group *vg, __be16 proto, +static bool __allowed_ingress(const struct net_bridge *br, + struct net_bridge_vlan_group *vg, struct sk_buff *skb, u16 *vid) { - const struct net_bridge_vlan *v; + struct br_vlan_stats *stats; + struct net_bridge_vlan *v; bool tagged; BR_INPUT_SKB_CB(skb)->vlan_filtered = true; @@ -375,7 +397,7 @@ static bool __allowed_ingress(struct net_bridge_vlan_group *vg, __be16 proto, * HW accelerated vlan tag. */ if (unlikely(!skb_vlan_tag_present(skb) && - skb->protocol == proto)) { + skb->protocol == br->vlan_proto)) { skb = skb_vlan_untag(skb); if (unlikely(!skb)) return false; @@ -383,7 +405,7 @@ static bool __allowed_ingress(struct net_bridge_vlan_group *vg, __be16 proto, if (!br_vlan_get_tag(skb, vid)) { /* Tagged frame */ - if (skb->vlan_proto != proto) { + if (skb->vlan_proto != br->vlan_proto) { /* Protocol-mismatch, empty out vlan_tci for new tag */ skb_push(skb, ETH_HLEN); skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, @@ -419,7 +441,7 @@ static bool __allowed_ingress(struct net_bridge_vlan_group *vg, __be16 proto, *vid = pvid; if (likely(!tagged)) /* Untagged Frame. */ - __vlan_hwaccel_put_tag(skb, proto, pvid); + __vlan_hwaccel_put_tag(skb, br->vlan_proto, pvid); else /* Priority-tagged Frame. * At this point, We know that skb->vlan_tci had @@ -428,13 +450,24 @@ static bool __allowed_ingress(struct net_bridge_vlan_group *vg, __be16 proto, */ skb->vlan_tci |= pvid; - return true; + /* if stats are disabled we can avoid the lookup */ + if (!br->vlan_stats_enabled) + return true; } - - /* Frame had a valid vlan tag. See if vlan is allowed */ v = br_vlan_find(vg, *vid); - if (v && br_vlan_should_use(v)) - return true; + if (!v || !br_vlan_should_use(v)) + goto drop; + + if (br->vlan_stats_enabled) { + stats = this_cpu_ptr(v->stats); + u64_stats_update_begin(&stats->syncp); + stats->rx_bytes += skb->len; + stats->rx_packets++; + u64_stats_update_end(&stats->syncp); + } + + return true; + drop: kfree_skb(skb); return false; @@ -452,7 +485,7 @@ bool br_allowed_ingress(const struct net_bridge *br, return true; } - return __allowed_ingress(vg, br->vlan_proto, skb, vid); + return __allowed_ingress(br, vg, skb, vid); } /* Called under RCU. */ @@ -542,6 +575,11 @@ int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags) if (!vlan) return -ENOMEM; + vlan->stats = netdev_alloc_pcpu_stats(struct br_vlan_stats); + if (!vlan->stats) { + kfree(vlan); + return -ENOMEM; + } vlan->vid = vid; vlan->flags = flags | BRIDGE_VLAN_INFO_MASTER; vlan->flags &= ~BRIDGE_VLAN_INFO_PVID; @@ -549,8 +587,10 @@ int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags) if (flags & BRIDGE_VLAN_INFO_BRENTRY) atomic_set(&vlan->refcnt, 1); ret = __vlan_add(vlan, flags); - if (ret) + if (ret) { + free_percpu(vlan->stats); kfree(vlan); + } return ret; } @@ -651,15 +691,7 @@ int __br_vlan_filter_toggle(struct net_bridge *br, unsigned long val) int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val) { - int err; - - if (!rtnl_trylock()) - return restart_syscall(); - - err = __br_vlan_filter_toggle(br, val); - rtnl_unlock(); - - return err; + return __br_vlan_filter_toggle(br, val); } int __br_vlan_set_proto(struct net_bridge *br, __be16 proto) @@ -713,18 +745,24 @@ err_filt: int br_vlan_set_proto(struct net_bridge *br, unsigned long val) { - int err; - if (val != ETH_P_8021Q && val != ETH_P_8021AD) return -EPROTONOSUPPORT; - if (!rtnl_trylock()) - return restart_syscall(); + return __br_vlan_set_proto(br, htons(val)); +} - err = __br_vlan_set_proto(br, htons(val)); - rtnl_unlock(); +int br_vlan_set_stats(struct net_bridge *br, unsigned long val) +{ + switch (val) { + case 0: + case 1: + br->vlan_stats_enabled = val; + break; + default: + return -EINVAL; + } - return err; + return 0; } static bool vlan_default_pvid(struct net_bridge_vlan_group *vg, u16 vid) @@ -855,21 +893,17 @@ int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val) if (val >= VLAN_VID_MASK) return -EINVAL; - if (!rtnl_trylock()) - return restart_syscall(); - if (pvid == br->default_pvid) - goto unlock; + goto out; /* Only allow default pvid change when filtering is disabled */ if (br->vlan_enabled) { pr_info_once("Please disable vlan filtering to change default_pvid\n"); err = -EPERM; - goto unlock; + goto out; } err = __br_vlan_set_default_pvid(br, pvid); -unlock: - rtnl_unlock(); +out: return err; } @@ -1020,3 +1054,30 @@ void nbp_vlan_flush(struct net_bridge_port *port) synchronize_rcu(); __vlan_group_free(vg); } + +void br_vlan_get_stats(const struct net_bridge_vlan *v, + struct br_vlan_stats *stats) +{ + int i; + + memset(stats, 0, sizeof(*stats)); + for_each_possible_cpu(i) { + u64 rxpackets, rxbytes, txpackets, txbytes; + struct br_vlan_stats *cpu_stats; + unsigned int start; + + cpu_stats = per_cpu_ptr(v->stats, i); + do { + start = u64_stats_fetch_begin_irq(&cpu_stats->syncp); + rxpackets = cpu_stats->rx_packets; + rxbytes = cpu_stats->rx_bytes; + txbytes = cpu_stats->tx_bytes; + txpackets = cpu_stats->tx_packets; + } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start)); + + stats->rx_packets += rxpackets; + stats->rx_bytes += rxbytes; + stats->tx_bytes += txbytes; + stats->tx_packets += txpackets; + } +} diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c index 7fcdd7261..a78c4e282 100644 --- a/net/bridge/netfilter/nf_tables_bridge.c +++ b/net/bridge/netfilter/nf_tables_bridge.c @@ -162,15 +162,57 @@ static const struct nf_chain_type filter_bridge = { (1 << NF_BR_POST_ROUTING), }; +static void nf_br_saveroute(const struct sk_buff *skb, + struct nf_queue_entry *entry) +{ +} + +static int nf_br_reroute(struct net *net, struct sk_buff *skb, + const struct nf_queue_entry *entry) +{ + return 0; +} + +static __sum16 nf_br_checksum(struct sk_buff *skb, unsigned int hook, + unsigned int dataoff, u_int8_t protocol) +{ + return 0; +} + +static __sum16 nf_br_checksum_partial(struct sk_buff *skb, unsigned int hook, + unsigned int dataoff, unsigned int len, + u_int8_t protocol) +{ + return 0; +} + +static int nf_br_route(struct net *net, struct dst_entry **dst, + struct flowi *fl, bool strict __always_unused) +{ + return 0; +} + +static const struct nf_afinfo nf_br_afinfo = { + .family = AF_BRIDGE, + .checksum = nf_br_checksum, + .checksum_partial = nf_br_checksum_partial, + .route = nf_br_route, + .saveroute = nf_br_saveroute, + .reroute = nf_br_reroute, + .route_key_size = 0, +}; + static int __init nf_tables_bridge_init(void) { int ret; + nf_register_afinfo(&nf_br_afinfo); nft_register_chain_type(&filter_bridge); ret = register_pernet_subsys(&nf_tables_bridge_net_ops); - if (ret < 0) + if (ret < 0) { nft_unregister_chain_type(&filter_bridge); - + nf_unregister_afinfo(&nf_br_afinfo); + } return ret; } @@ -178,6 +220,7 @@ static void __exit nf_tables_bridge_exit(void) { unregister_pernet_subsys(&nf_tables_bridge_net_ops); nft_unregister_chain_type(&filter_bridge); + nf_unregister_afinfo(&nf_br_afinfo); } module_init(nf_tables_bridge_init); diff --git a/net/can/raw.c b/net/can/raw.c index 2e67b1423..972c187d4 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -755,7 +755,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) if (err < 0) goto free_skb; - sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags); + sock_tx_timestamp(sk, sk->sk_tsflags, &skb_shinfo(skb)->tx_flags); skb->dev = dev; skb->sk = sk; diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index dcc18c6f7..55d2bfee1 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -651,7 +651,7 @@ EXPORT_SYMBOL(ceph_destroy_client); /* * true if we have the mon map (and have thus joined the cluster) */ -static int have_mon_and_osd_map(struct ceph_client *client) +static bool have_mon_and_osd_map(struct ceph_client *client) { return client->monc.monmap && client->monc.monmap->epoch && client->osdc.osdmap && client->osdc.osdmap->epoch; diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c index 139a9cb19..3773a4fa1 100644 --- a/net/ceph/ceph_strings.c +++ b/net/ceph/ceph_strings.c @@ -27,6 +27,22 @@ __CEPH_FORALL_OSD_OPS(GENERATE_CASE) } } +const char *ceph_osd_watch_op_name(int o) +{ + switch (o) { + case CEPH_OSD_WATCH_OP_UNWATCH: + return "unwatch"; + case CEPH_OSD_WATCH_OP_WATCH: + return "watch"; + case CEPH_OSD_WATCH_OP_RECONNECT: + return "reconnect"; + case CEPH_OSD_WATCH_OP_PING: + return "ping"; + default: + return "???"; + } +} + const char *ceph_osd_state_name(int s) { switch (s) { diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index b902fbc78..e77b04ca7 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -54,24 +54,25 @@ static int osdmap_show(struct seq_file *s, void *p) { int i; struct ceph_client *client = s->private; - struct ceph_osdmap *map = client->osdc.osdmap; + struct ceph_osd_client *osdc = &client->osdc; + struct ceph_osdmap *map = osdc->osdmap; struct rb_node *n; if (map == NULL) return 0; - seq_printf(s, "epoch %d\n", map->epoch); - seq_printf(s, "flags%s%s\n", - (map->flags & CEPH_OSDMAP_NEARFULL) ? " NEARFULL" : "", - (map->flags & CEPH_OSDMAP_FULL) ? " FULL" : ""); + down_read(&osdc->lock); + seq_printf(s, "epoch %d flags 0x%x\n", map->epoch, map->flags); for (n = rb_first(&map->pg_pools); n; n = rb_next(n)) { - struct ceph_pg_pool_info *pool = + struct ceph_pg_pool_info *pi = rb_entry(n, struct ceph_pg_pool_info, node); - seq_printf(s, "pool %lld pg_num %u (%d) read_tier %lld write_tier %lld\n", - pool->id, pool->pg_num, pool->pg_num_mask, - pool->read_tier, pool->write_tier); + seq_printf(s, "pool %lld '%s' type %d size %d min_size %d pg_num %u pg_num_mask %d flags 0x%llx lfor %u read_tier %lld write_tier %lld\n", + pi->id, pi->name, pi->type, pi->size, pi->min_size, + pi->pg_num, pi->pg_num_mask, pi->flags, + pi->last_force_request_resend, pi->read_tier, + pi->write_tier); } for (i = 0; i < map->max_osd; i++) { struct ceph_entity_addr *addr = &map->osd_addr[i]; @@ -103,6 +104,7 @@ static int osdmap_show(struct seq_file *s, void *p) pg->pgid.seed, pg->primary_temp.osd); } + up_read(&osdc->lock); return 0; } @@ -126,6 +128,7 @@ static int monc_show(struct seq_file *s, void *p) CEPH_SUBSCRIBE_ONETIME ? "" : "+")); seq_putc(s, '\n'); } + seq_printf(s, "fs_cluster_id %d\n", monc->fs_cluster_id); for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) { __u16 op; @@ -143,43 +146,113 @@ static int monc_show(struct seq_file *s, void *p) return 0; } -static int osdc_show(struct seq_file *s, void *pp) +static void dump_target(struct seq_file *s, struct ceph_osd_request_target *t) { - struct ceph_client *client = s->private; - struct ceph_osd_client *osdc = &client->osdc; - struct rb_node *p; + int i; - mutex_lock(&osdc->request_mutex); - for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { - struct ceph_osd_request *req; - unsigned int i; - int opcode; + seq_printf(s, "osd%d\t%llu.%x\t[", t->osd, t->pgid.pool, t->pgid.seed); + for (i = 0; i < t->up.size; i++) + seq_printf(s, "%s%d", (!i ? "" : ","), t->up.osds[i]); + seq_printf(s, "]/%d\t[", t->up.primary); + for (i = 0; i < t->acting.size; i++) + seq_printf(s, "%s%d", (!i ? "" : ","), t->acting.osds[i]); + seq_printf(s, "]/%d\t%*pE\t0x%x", t->acting.primary, + t->target_oid.name_len, t->target_oid.name, t->flags); + if (t->paused) + seq_puts(s, "\tP"); +} - req = rb_entry(p, struct ceph_osd_request, r_node); +static void dump_request(struct seq_file *s, struct ceph_osd_request *req) +{ + int i; - seq_printf(s, "%lld\tosd%d\t%lld.%x\t", req->r_tid, - req->r_osd ? req->r_osd->o_osd : -1, - req->r_pgid.pool, req->r_pgid.seed); + seq_printf(s, "%llu\t", req->r_tid); + dump_target(s, &req->r_t); - seq_printf(s, "%.*s", req->r_base_oid.name_len, - req->r_base_oid.name); + seq_printf(s, "\t%d\t%u'%llu", req->r_attempts, + le32_to_cpu(req->r_replay_version.epoch), + le64_to_cpu(req->r_replay_version.version)); - if (req->r_reassert_version.epoch) - seq_printf(s, "\t%u'%llu", - (unsigned int)le32_to_cpu(req->r_reassert_version.epoch), - le64_to_cpu(req->r_reassert_version.version)); - else - seq_printf(s, "\t"); + for (i = 0; i < req->r_num_ops; i++) { + struct ceph_osd_req_op *op = &req->r_ops[i]; + + seq_printf(s, "%s%s", (i == 0 ? "\t" : ","), + ceph_osd_op_name(op->op)); + if (op->op == CEPH_OSD_OP_WATCH) + seq_printf(s, "-%s", + ceph_osd_watch_op_name(op->watch.op)); + } + + seq_putc(s, '\n'); +} + +static void dump_requests(struct seq_file *s, struct ceph_osd *osd) +{ + struct rb_node *n; + + mutex_lock(&osd->lock); + for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) { + struct ceph_osd_request *req = + rb_entry(n, struct ceph_osd_request, r_node); + + dump_request(s, req); + } + + mutex_unlock(&osd->lock); +} - for (i = 0; i < req->r_num_ops; i++) { - opcode = req->r_ops[i].op; - seq_printf(s, "%s%s", (i == 0 ? "\t" : ","), - ceph_osd_op_name(opcode)); - } +static void dump_linger_request(struct seq_file *s, + struct ceph_osd_linger_request *lreq) +{ + seq_printf(s, "%llu\t", lreq->linger_id); + dump_target(s, &lreq->t); + + seq_printf(s, "\t%u\t%s%s/%d\n", lreq->register_gen, + lreq->is_watch ? "W" : "N", lreq->committed ? "C" : "", + lreq->last_error); +} + +static void dump_linger_requests(struct seq_file *s, struct ceph_osd *osd) +{ + struct rb_node *n; + + mutex_lock(&osd->lock); + for (n = rb_first(&osd->o_linger_requests); n; n = rb_next(n)) { + struct ceph_osd_linger_request *lreq = + rb_entry(n, struct ceph_osd_linger_request, node); + + dump_linger_request(s, lreq); + } + + mutex_unlock(&osd->lock); +} - seq_printf(s, "\n"); +static int osdc_show(struct seq_file *s, void *pp) +{ + struct ceph_client *client = s->private; + struct ceph_osd_client *osdc = &client->osdc; + struct rb_node *n; + + down_read(&osdc->lock); + seq_printf(s, "REQUESTS %d homeless %d\n", + atomic_read(&osdc->num_requests), + atomic_read(&osdc->num_homeless)); + for (n = rb_first(&osdc->osds); n; n = rb_next(n)) { + struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); + + dump_requests(s, osd); } - mutex_unlock(&osdc->request_mutex); + dump_requests(s, &osdc->homeless_osd); + + seq_puts(s, "LINGER REQUESTS\n"); + for (n = rb_first(&osdc->osds); n; n = rb_next(n)) { + struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); + + dump_linger_requests(s, osd); + } + dump_linger_requests(s, &osdc->homeless_osd); + + up_read(&osdc->lock); return 0; } diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index cf638c009..37c38a7fb 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -260,20 +260,26 @@ static void __send_subscribe(struct ceph_mon_client *monc) BUG_ON(num < 1); /* monmap sub is always there */ ceph_encode_32(&p, num); for (i = 0; i < ARRAY_SIZE(monc->subs); i++) { - const char *s = ceph_sub_str[i]; + char buf[32]; + int len; if (!monc->subs[i].want) continue; - dout("%s %s start %llu flags 0x%x\n", __func__, s, + len = sprintf(buf, "%s", ceph_sub_str[i]); + if (i == CEPH_SUB_MDSMAP && + monc->fs_cluster_id != CEPH_FS_CLUSTER_ID_NONE) + len += sprintf(buf + len, ".%d", monc->fs_cluster_id); + + dout("%s %s start %llu flags 0x%x\n", __func__, buf, le64_to_cpu(monc->subs[i].item.start), monc->subs[i].item.flags); - ceph_encode_string(&p, end, s, strlen(s)); + ceph_encode_string(&p, end, buf, len); memcpy(p, &monc->subs[i].item, sizeof(monc->subs[i].item)); p += sizeof(monc->subs[i].item); } - BUG_ON(p != (end - 35 - (ARRAY_SIZE(monc->subs) - num) * 19)); + BUG_ON(p > end); msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); ceph_msg_revoke(msg); @@ -376,19 +382,13 @@ void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch) } EXPORT_SYMBOL(ceph_monc_got_map); -/* - * Register interest in the next osdmap - */ -void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc) +void ceph_monc_renew_subs(struct ceph_mon_client *monc) { - dout("%s have %u\n", __func__, monc->subs[CEPH_SUB_OSDMAP].have); mutex_lock(&monc->mutex); - if (__ceph_monc_want_map(monc, CEPH_SUB_OSDMAP, - monc->subs[CEPH_SUB_OSDMAP].have + 1, false)) - __send_subscribe(monc); + __send_subscribe(monc); mutex_unlock(&monc->mutex); } -EXPORT_SYMBOL(ceph_monc_request_next_osdmap); +EXPORT_SYMBOL(ceph_monc_renew_subs); /* * Wait for an osdmap with a given epoch. @@ -478,51 +478,17 @@ out: /* * generic requests (currently statfs, mon_get_version) */ -static struct ceph_mon_generic_request *__lookup_generic_req( - struct ceph_mon_client *monc, u64 tid) -{ - struct ceph_mon_generic_request *req; - struct rb_node *n = monc->generic_request_tree.rb_node; - - while (n) { - req = rb_entry(n, struct ceph_mon_generic_request, node); - if (tid < req->tid) - n = n->rb_left; - else if (tid > req->tid) - n = n->rb_right; - else - return req; - } - return NULL; -} - -static void __insert_generic_request(struct ceph_mon_client *monc, - struct ceph_mon_generic_request *new) -{ - struct rb_node **p = &monc->generic_request_tree.rb_node; - struct rb_node *parent = NULL; - struct ceph_mon_generic_request *req = NULL; - - while (*p) { - parent = *p; - req = rb_entry(parent, struct ceph_mon_generic_request, node); - if (new->tid < req->tid) - p = &(*p)->rb_left; - else if (new->tid > req->tid) - p = &(*p)->rb_right; - else - BUG(); - } - - rb_link_node(&new->node, parent, p); - rb_insert_color(&new->node, &monc->generic_request_tree); -} +DEFINE_RB_FUNCS(generic_request, struct ceph_mon_generic_request, tid, node) static void release_generic_request(struct kref *kref) { struct ceph_mon_generic_request *req = container_of(kref, struct ceph_mon_generic_request, kref); + dout("%s greq %p request %p reply %p\n", __func__, req, req->request, + req->reply); + WARN_ON(!RB_EMPTY_NODE(&req->node)); + if (req->reply) ceph_msg_put(req->reply); if (req->request) @@ -533,7 +499,8 @@ static void release_generic_request(struct kref *kref) static void put_generic_request(struct ceph_mon_generic_request *req) { - kref_put(&req->kref, release_generic_request); + if (req) + kref_put(&req->kref, release_generic_request); } static void get_generic_request(struct ceph_mon_generic_request *req) @@ -541,6 +508,103 @@ static void get_generic_request(struct ceph_mon_generic_request *req) kref_get(&req->kref); } +static struct ceph_mon_generic_request * +alloc_generic_request(struct ceph_mon_client *monc, gfp_t gfp) +{ + struct ceph_mon_generic_request *req; + + req = kzalloc(sizeof(*req), gfp); + if (!req) + return NULL; + + req->monc = monc; + kref_init(&req->kref); + RB_CLEAR_NODE(&req->node); + init_completion(&req->completion); + + dout("%s greq %p\n", __func__, req); + return req; +} + +static void register_generic_request(struct ceph_mon_generic_request *req) +{ + struct ceph_mon_client *monc = req->monc; + + WARN_ON(req->tid); + + get_generic_request(req); + req->tid = ++monc->last_tid; + insert_generic_request(&monc->generic_request_tree, req); +} + +static void send_generic_request(struct ceph_mon_client *monc, + struct ceph_mon_generic_request *req) +{ + WARN_ON(!req->tid); + + dout("%s greq %p tid %llu\n", __func__, req, req->tid); + req->request->hdr.tid = cpu_to_le64(req->tid); + ceph_con_send(&monc->con, ceph_msg_get(req->request)); +} + +static void __finish_generic_request(struct ceph_mon_generic_request *req) +{ + struct ceph_mon_client *monc = req->monc; + + dout("%s greq %p tid %llu\n", __func__, req, req->tid); + erase_generic_request(&monc->generic_request_tree, req); + + ceph_msg_revoke(req->request); + ceph_msg_revoke_incoming(req->reply); +} + +static void finish_generic_request(struct ceph_mon_generic_request *req) +{ + __finish_generic_request(req); + put_generic_request(req); +} + +static void complete_generic_request(struct ceph_mon_generic_request *req) +{ + if (req->complete_cb) + req->complete_cb(req); + else + complete_all(&req->completion); + put_generic_request(req); +} + +void cancel_generic_request(struct ceph_mon_generic_request *req) +{ + struct ceph_mon_client *monc = req->monc; + struct ceph_mon_generic_request *lookup_req; + + dout("%s greq %p tid %llu\n", __func__, req, req->tid); + + mutex_lock(&monc->mutex); + lookup_req = lookup_generic_request(&monc->generic_request_tree, + req->tid); + if (lookup_req) { + WARN_ON(lookup_req != req); + finish_generic_request(req); + } + + mutex_unlock(&monc->mutex); +} + +static int wait_generic_request(struct ceph_mon_generic_request *req) +{ + int ret; + + dout("%s greq %p tid %llu\n", __func__, req, req->tid); + ret = wait_for_completion_interruptible(&req->completion); + if (ret) + cancel_generic_request(req); + else + ret = req->result; /* completed */ + + return ret; +} + static struct ceph_msg *get_generic_reply(struct ceph_connection *con, struct ceph_msg_header *hdr, int *skip) @@ -551,7 +615,7 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con, struct ceph_msg *m; mutex_lock(&monc->mutex); - req = __lookup_generic_req(monc, tid); + req = lookup_generic_request(&monc->generic_request_tree, tid); if (!req) { dout("get_generic_reply %lld dne\n", tid); *skip = 1; @@ -570,42 +634,6 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con, return m; } -static int __do_generic_request(struct ceph_mon_client *monc, u64 tid, - struct ceph_mon_generic_request *req) -{ - int err; - - /* register request */ - req->tid = tid != 0 ? tid : ++monc->last_tid; - req->request->hdr.tid = cpu_to_le64(req->tid); - __insert_generic_request(monc, req); - monc->num_generic_requests++; - ceph_con_send(&monc->con, ceph_msg_get(req->request)); - mutex_unlock(&monc->mutex); - - err = wait_for_completion_interruptible(&req->completion); - - mutex_lock(&monc->mutex); - rb_erase(&req->node, &monc->generic_request_tree); - monc->num_generic_requests--; - - if (!err) - err = req->result; - return err; -} - -static int do_generic_request(struct ceph_mon_client *monc, - struct ceph_mon_generic_request *req) -{ - int err; - - mutex_lock(&monc->mutex); - err = __do_generic_request(monc, 0, req); - mutex_unlock(&monc->mutex); - - return err; -} - /* * statfs */ @@ -616,22 +644,24 @@ static void handle_statfs_reply(struct ceph_mon_client *monc, struct ceph_mon_statfs_reply *reply = msg->front.iov_base; u64 tid = le64_to_cpu(msg->hdr.tid); + dout("%s msg %p tid %llu\n", __func__, msg, tid); + if (msg->front.iov_len != sizeof(*reply)) goto bad; - dout("handle_statfs_reply %p tid %llu\n", msg, tid); mutex_lock(&monc->mutex); - req = __lookup_generic_req(monc, tid); - if (req) { - *(struct ceph_statfs *)req->buf = reply->st; - req->result = 0; - get_generic_request(req); + req = lookup_generic_request(&monc->generic_request_tree, tid); + if (!req) { + mutex_unlock(&monc->mutex); + return; } + + req->result = 0; + *req->u.st = reply->st; /* struct */ + __finish_generic_request(req); mutex_unlock(&monc->mutex); - if (req) { - complete_all(&req->completion); - put_generic_request(req); - } + + complete_generic_request(req); return; bad: @@ -646,38 +676,38 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) { struct ceph_mon_generic_request *req; struct ceph_mon_statfs *h; - int err; + int ret = -ENOMEM; - req = kzalloc(sizeof(*req), GFP_NOFS); + req = alloc_generic_request(monc, GFP_NOFS); if (!req) - return -ENOMEM; - - kref_init(&req->kref); - req->buf = buf; - init_completion(&req->completion); + goto out; - err = -ENOMEM; req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS, true); if (!req->request) goto out; - req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS, - true); + + req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 64, GFP_NOFS, true); if (!req->reply) goto out; + req->u.st = buf; + + mutex_lock(&monc->mutex); + register_generic_request(req); /* fill out request */ h = req->request->front.iov_base; h->monhdr.have_version = 0; h->monhdr.session_mon = cpu_to_le16(-1); h->monhdr.session_mon_tid = 0; h->fsid = monc->monmap->fsid; + send_generic_request(monc, req); + mutex_unlock(&monc->mutex); - err = do_generic_request(monc, req); - + ret = wait_generic_request(req); out: put_generic_request(req); - return err; + return ret; } EXPORT_SYMBOL(ceph_monc_do_statfs); @@ -690,7 +720,7 @@ static void handle_get_version_reply(struct ceph_mon_client *monc, void *end = p + msg->front_alloc_len; u64 handle; - dout("%s %p tid %llu\n", __func__, msg, tid); + dout("%s msg %p tid %llu\n", __func__, msg, tid); ceph_decode_need(&p, end, 2*sizeof(u64), bad); handle = ceph_decode_64(&p); @@ -698,77 +728,111 @@ static void handle_get_version_reply(struct ceph_mon_client *monc, goto bad; mutex_lock(&monc->mutex); - req = __lookup_generic_req(monc, handle); - if (req) { - *(u64 *)req->buf = ceph_decode_64(&p); - req->result = 0; - get_generic_request(req); + req = lookup_generic_request(&monc->generic_request_tree, handle); + if (!req) { + mutex_unlock(&monc->mutex); + return; } + + req->result = 0; + req->u.newest = ceph_decode_64(&p); + __finish_generic_request(req); mutex_unlock(&monc->mutex); - if (req) { - complete_all(&req->completion); - put_generic_request(req); - } + complete_generic_request(req); return; + bad: pr_err("corrupt mon_get_version reply, tid %llu\n", tid); ceph_msg_dump(msg); } -/* - * Send MMonGetVersion and wait for the reply. - * - * @what: one of "mdsmap", "osdmap" or "monmap" - */ -int ceph_monc_do_get_version(struct ceph_mon_client *monc, const char *what, - u64 *newest) +static struct ceph_mon_generic_request * +__ceph_monc_get_version(struct ceph_mon_client *monc, const char *what, + ceph_monc_callback_t cb, u64 private_data) { struct ceph_mon_generic_request *req; - void *p, *end; - u64 tid; - int err; - req = kzalloc(sizeof(*req), GFP_NOFS); + req = alloc_generic_request(monc, GFP_NOIO); if (!req) - return -ENOMEM; - - kref_init(&req->kref); - req->buf = newest; - init_completion(&req->completion); + goto err_put_req; req->request = ceph_msg_new(CEPH_MSG_MON_GET_VERSION, sizeof(u64) + sizeof(u32) + strlen(what), - GFP_NOFS, true); - if (!req->request) { - err = -ENOMEM; - goto out; - } + GFP_NOIO, true); + if (!req->request) + goto err_put_req; - req->reply = ceph_msg_new(CEPH_MSG_MON_GET_VERSION_REPLY, 1024, - GFP_NOFS, true); - if (!req->reply) { - err = -ENOMEM; - goto out; - } + req->reply = ceph_msg_new(CEPH_MSG_MON_GET_VERSION_REPLY, 32, GFP_NOIO, + true); + if (!req->reply) + goto err_put_req; - p = req->request->front.iov_base; - end = p + req->request->front_alloc_len; + req->complete_cb = cb; + req->private_data = private_data; - /* fill out request */ mutex_lock(&monc->mutex); - tid = ++monc->last_tid; - ceph_encode_64(&p, tid); /* handle */ - ceph_encode_string(&p, end, what, strlen(what)); + register_generic_request(req); + { + void *p = req->request->front.iov_base; + void *const end = p + req->request->front_alloc_len; + + ceph_encode_64(&p, req->tid); /* handle */ + ceph_encode_string(&p, end, what, strlen(what)); + WARN_ON(p != end); + } + send_generic_request(monc, req); + mutex_unlock(&monc->mutex); - err = __do_generic_request(monc, tid, req); + return req; - mutex_unlock(&monc->mutex); -out: +err_put_req: put_generic_request(req); - return err; + return ERR_PTR(-ENOMEM); +} + +/* + * Send MMonGetVersion and wait for the reply. + * + * @what: one of "mdsmap", "osdmap" or "monmap" + */ +int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what, + u64 *newest) +{ + struct ceph_mon_generic_request *req; + int ret; + + req = __ceph_monc_get_version(monc, what, NULL, 0); + if (IS_ERR(req)) + return PTR_ERR(req); + + ret = wait_generic_request(req); + if (!ret) + *newest = req->u.newest; + + put_generic_request(req); + return ret; } -EXPORT_SYMBOL(ceph_monc_do_get_version); +EXPORT_SYMBOL(ceph_monc_get_version); + +/* + * Send MMonGetVersion, + * + * @what: one of "mdsmap", "osdmap" or "monmap" + */ +int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what, + ceph_monc_callback_t cb, u64 private_data) +{ + struct ceph_mon_generic_request *req; + + req = __ceph_monc_get_version(monc, what, cb, private_data); + if (IS_ERR(req)) + return PTR_ERR(req); + + put_generic_request(req); + return 0; +} +EXPORT_SYMBOL(ceph_monc_get_version_async); /* * Resend pending generic requests. @@ -890,7 +954,7 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) if (!monc->m_subscribe_ack) goto out_auth; - monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS, + monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 128, GFP_NOFS, true); if (!monc->m_subscribe) goto out_subscribe_ack; @@ -914,9 +978,10 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) INIT_DELAYED_WORK(&monc->delayed_work, delayed_work); monc->generic_request_tree = RB_ROOT; - monc->num_generic_requests = 0; monc->last_tid = 0; + monc->fs_cluster_id = CEPH_FS_CLUSTER_ID_NONE; + return 0; out_auth_reply: @@ -954,6 +1019,8 @@ void ceph_monc_stop(struct ceph_mon_client *monc) ceph_auth_destroy(monc->auth); + WARN_ON(!RB_EMPTY_ROOT(&monc->generic_request_tree)); + ceph_msg_put(monc->m_auth); ceph_msg_put(monc->m_auth_reply); ceph_msg_put(monc->m_subscribe); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 40a53a70e..894695920 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -19,25 +19,12 @@ #include <linux/ceph/auth.h> #include <linux/ceph/pagelist.h> -#define OSD_OP_FRONT_LEN 4096 #define OSD_OPREPLY_FRONT_LEN 512 static struct kmem_cache *ceph_osd_request_cache; static const struct ceph_connection_operations osd_con_ops; -static void __send_queued(struct ceph_osd_client *osdc); -static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd); -static void __register_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req); -static void __unregister_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req); -static void __unregister_linger_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req); -static void __enqueue_request(struct ceph_osd_request *req); -static void __send_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req); - /* * Implement client access to distributed object storage cluster. * @@ -56,6 +43,52 @@ static void __send_request(struct ceph_osd_client *osdc, * channel with an OSD is reset. */ +static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req); +static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req); +static void link_linger(struct ceph_osd *osd, + struct ceph_osd_linger_request *lreq); +static void unlink_linger(struct ceph_osd *osd, + struct ceph_osd_linger_request *lreq); + +#if 1 +static inline bool rwsem_is_wrlocked(struct rw_semaphore *sem) +{ + bool wrlocked = true; + + if (unlikely(down_read_trylock(sem))) { + wrlocked = false; + up_read(sem); + } + + return wrlocked; +} +static inline void verify_osdc_locked(struct ceph_osd_client *osdc) +{ + WARN_ON(!rwsem_is_locked(&osdc->lock)); +} +static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc) +{ + WARN_ON(!rwsem_is_wrlocked(&osdc->lock)); +} +static inline void verify_osd_locked(struct ceph_osd *osd) +{ + struct ceph_osd_client *osdc = osd->o_osdc; + + WARN_ON(!(mutex_is_locked(&osd->lock) && + rwsem_is_locked(&osdc->lock)) && + !rwsem_is_wrlocked(&osdc->lock)); +} +static inline void verify_lreq_locked(struct ceph_osd_linger_request *lreq) +{ + WARN_ON(!mutex_is_locked(&lreq->lock)); +} +#else +static inline void verify_osdc_locked(struct ceph_osd_client *osdc) { } +static inline void verify_osdc_wrlocked(struct ceph_osd_client *osdc) { } +static inline void verify_osd_locked(struct ceph_osd *osd) { } +static inline void verify_lreq_locked(struct ceph_osd_linger_request *lreq) { } +#endif + /* * calculate the mapping of a file extent onto an object, and fill out the * request accordingly. shorten extent as necessary if it crosses an @@ -144,14 +177,6 @@ osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req, } EXPORT_SYMBOL(osd_req_op_extent_osd_data); -struct ceph_osd_data * -osd_req_op_cls_response_data(struct ceph_osd_request *osd_req, - unsigned int which) -{ - return osd_req_op_data(osd_req, which, cls, response_data); -} -EXPORT_SYMBOL(osd_req_op_cls_response_data); /* ??? */ - void osd_req_op_raw_data_in_pages(struct ceph_osd_request *osd_req, unsigned int which, struct page **pages, u64 length, u32 alignment, @@ -218,6 +243,8 @@ void osd_req_op_cls_request_data_pagelist( osd_data = osd_req_op_data(osd_req, which, cls, request_data); ceph_osd_data_pagelist_init(osd_data, pagelist); + osd_req->r_ops[which].cls.indata_len += pagelist->length; + osd_req->r_ops[which].indata_len += pagelist->length; } EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist); @@ -230,6 +257,8 @@ void osd_req_op_cls_request_data_pages(struct ceph_osd_request *osd_req, osd_data = osd_req_op_data(osd_req, which, cls, request_data); ceph_osd_data_pages_init(osd_data, pages, length, alignment, pages_from_pool, own_pages); + osd_req->r_ops[which].cls.indata_len += length; + osd_req->r_ops[which].indata_len += length; } EXPORT_SYMBOL(osd_req_op_cls_request_data_pages); @@ -302,14 +331,76 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req, case CEPH_OSD_OP_STAT: ceph_osd_data_release(&op->raw_data_in); break; + case CEPH_OSD_OP_NOTIFY_ACK: + ceph_osd_data_release(&op->notify_ack.request_data); + break; + case CEPH_OSD_OP_NOTIFY: + ceph_osd_data_release(&op->notify.request_data); + ceph_osd_data_release(&op->notify.response_data); + break; default: break; } } /* + * Assumes @t is zero-initialized. + */ +static void target_init(struct ceph_osd_request_target *t) +{ + ceph_oid_init(&t->base_oid); + ceph_oloc_init(&t->base_oloc); + ceph_oid_init(&t->target_oid); + ceph_oloc_init(&t->target_oloc); + + ceph_osds_init(&t->acting); + ceph_osds_init(&t->up); + t->size = -1; + t->min_size = -1; + + t->osd = CEPH_HOMELESS_OSD; +} + +static void target_copy(struct ceph_osd_request_target *dest, + const struct ceph_osd_request_target *src) +{ + ceph_oid_copy(&dest->base_oid, &src->base_oid); + ceph_oloc_copy(&dest->base_oloc, &src->base_oloc); + ceph_oid_copy(&dest->target_oid, &src->target_oid); + ceph_oloc_copy(&dest->target_oloc, &src->target_oloc); + + dest->pgid = src->pgid; /* struct */ + dest->pg_num = src->pg_num; + dest->pg_num_mask = src->pg_num_mask; + ceph_osds_copy(&dest->acting, &src->acting); + ceph_osds_copy(&dest->up, &src->up); + dest->size = src->size; + dest->min_size = src->min_size; + dest->sort_bitwise = src->sort_bitwise; + + dest->flags = src->flags; + dest->paused = src->paused; + + dest->osd = src->osd; +} + +static void target_destroy(struct ceph_osd_request_target *t) +{ + ceph_oid_destroy(&t->base_oid); + ceph_oid_destroy(&t->target_oid); +} + +/* * requests */ +static void request_release_checks(struct ceph_osd_request *req) +{ + WARN_ON(!RB_EMPTY_NODE(&req->r_node)); + WARN_ON(!RB_EMPTY_NODE(&req->r_mc_node)); + WARN_ON(!list_empty(&req->r_unsafe_item)); + WARN_ON(req->r_osd); +} + static void ceph_osdc_release_request(struct kref *kref) { struct ceph_osd_request *req = container_of(kref, @@ -318,24 +409,19 @@ static void ceph_osdc_release_request(struct kref *kref) dout("%s %p (r_request %p r_reply %p)\n", __func__, req, req->r_request, req->r_reply); - WARN_ON(!RB_EMPTY_NODE(&req->r_node)); - WARN_ON(!list_empty(&req->r_req_lru_item)); - WARN_ON(!list_empty(&req->r_osd_item)); - WARN_ON(!list_empty(&req->r_linger_item)); - WARN_ON(!list_empty(&req->r_linger_osd_item)); - WARN_ON(req->r_osd); + request_release_checks(req); if (req->r_request) ceph_msg_put(req->r_request); - if (req->r_reply) { - ceph_msg_revoke_incoming(req->r_reply); + if (req->r_reply) ceph_msg_put(req->r_reply); - } for (which = 0; which < req->r_num_ops; which++) osd_req_op_data_release(req, which); + target_destroy(&req->r_t); ceph_put_snap_context(req->r_snapc); + if (req->r_mempool) mempool_free(req, req->r_osdc->req_mempool); else if (req->r_num_ops <= CEPH_OSD_SLAB_OPS) @@ -354,12 +440,66 @@ EXPORT_SYMBOL(ceph_osdc_get_request); void ceph_osdc_put_request(struct ceph_osd_request *req) { - dout("%s %p (was %d)\n", __func__, req, - atomic_read(&req->r_kref.refcount)); - kref_put(&req->r_kref, ceph_osdc_release_request); + if (req) { + dout("%s %p (was %d)\n", __func__, req, + atomic_read(&req->r_kref.refcount)); + kref_put(&req->r_kref, ceph_osdc_release_request); + } } EXPORT_SYMBOL(ceph_osdc_put_request); +static void request_init(struct ceph_osd_request *req) +{ + /* req only, each op is zeroed in _osd_req_op_init() */ + memset(req, 0, sizeof(*req)); + + kref_init(&req->r_kref); + init_completion(&req->r_completion); + init_completion(&req->r_safe_completion); + RB_CLEAR_NODE(&req->r_node); + RB_CLEAR_NODE(&req->r_mc_node); + INIT_LIST_HEAD(&req->r_unsafe_item); + + target_init(&req->r_t); +} + +/* + * This is ugly, but it allows us to reuse linger registration and ping + * requests, keeping the structure of the code around send_linger{_ping}() + * reasonable. Setting up a min_nr=2 mempool for each linger request + * and dealing with copying ops (this blasts req only, watch op remains + * intact) isn't any better. + */ +static void request_reinit(struct ceph_osd_request *req) +{ + struct ceph_osd_client *osdc = req->r_osdc; + bool mempool = req->r_mempool; + unsigned int num_ops = req->r_num_ops; + u64 snapid = req->r_snapid; + struct ceph_snap_context *snapc = req->r_snapc; + bool linger = req->r_linger; + struct ceph_msg *request_msg = req->r_request; + struct ceph_msg *reply_msg = req->r_reply; + + dout("%s req %p\n", __func__, req); + WARN_ON(atomic_read(&req->r_kref.refcount) != 1); + request_release_checks(req); + + WARN_ON(atomic_read(&request_msg->kref.refcount) != 1); + WARN_ON(atomic_read(&reply_msg->kref.refcount) != 1); + target_destroy(&req->r_t); + + request_init(req); + req->r_osdc = osdc; + req->r_mempool = mempool; + req->r_num_ops = num_ops; + req->r_snapid = snapid; + req->r_snapc = snapc; + req->r_linger = linger; + req->r_request = request_msg; + req->r_reply = reply_msg; +} + struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, struct ceph_snap_context *snapc, unsigned int num_ops, @@ -367,8 +507,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, gfp_t gfp_flags) { struct ceph_osd_request *req; - struct ceph_msg *msg; - size_t msg_size; if (use_mempool) { BUG_ON(num_ops > CEPH_OSD_SLAB_OPS); @@ -383,73 +521,65 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, if (unlikely(!req)) return NULL; - /* req only, each op is zeroed in _osd_req_op_init() */ - memset(req, 0, sizeof(*req)); - + request_init(req); req->r_osdc = osdc; req->r_mempool = use_mempool; req->r_num_ops = num_ops; + req->r_snapid = CEPH_NOSNAP; + req->r_snapc = ceph_get_snap_context(snapc); - kref_init(&req->r_kref); - init_completion(&req->r_completion); - init_completion(&req->r_safe_completion); - RB_CLEAR_NODE(&req->r_node); - INIT_LIST_HEAD(&req->r_unsafe_item); - INIT_LIST_HEAD(&req->r_linger_item); - INIT_LIST_HEAD(&req->r_linger_osd_item); - INIT_LIST_HEAD(&req->r_req_lru_item); - INIT_LIST_HEAD(&req->r_osd_item); - - req->r_base_oloc.pool = -1; - req->r_target_oloc.pool = -1; + dout("%s req %p\n", __func__, req); + return req; +} +EXPORT_SYMBOL(ceph_osdc_alloc_request); - msg_size = OSD_OPREPLY_FRONT_LEN; - if (num_ops > CEPH_OSD_SLAB_OPS) { - /* ceph_osd_op and rval */ - msg_size += (num_ops - CEPH_OSD_SLAB_OPS) * - (sizeof(struct ceph_osd_op) + 4); - } +int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp) +{ + struct ceph_osd_client *osdc = req->r_osdc; + struct ceph_msg *msg; + int msg_size; - /* create reply message */ - if (use_mempool) - msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); - else - msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size, - gfp_flags, true); - if (!msg) { - ceph_osdc_put_request(req); - return NULL; - } - req->r_reply = msg; + WARN_ON(ceph_oid_empty(&req->r_base_oid)); + /* create request message */ msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */ msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */ msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */ msg_size += 1 + 8 + 4 + 4; /* pgid */ - msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */ - msg_size += 2 + num_ops * sizeof(struct ceph_osd_op); + msg_size += 4 + req->r_base_oid.name_len; /* oid */ + msg_size += 2 + req->r_num_ops * sizeof(struct ceph_osd_op); msg_size += 8; /* snapid */ msg_size += 8; /* snap_seq */ - msg_size += 4 + 8 * (snapc ? snapc->num_snaps : 0); /* snaps */ + msg_size += 4 + 8 * (req->r_snapc ? req->r_snapc->num_snaps : 0); msg_size += 4; /* retry_attempt */ - /* create request message; allow space for oid */ - if (use_mempool) + if (req->r_mempool) msg = ceph_msgpool_get(&osdc->msgpool_op, 0); else - msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags, true); - if (!msg) { - ceph_osdc_put_request(req); - return NULL; - } + msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp, true); + if (!msg) + return -ENOMEM; memset(msg->front.iov_base, 0, msg->front.iov_len); - req->r_request = msg; - return req; + /* create reply message */ + msg_size = OSD_OPREPLY_FRONT_LEN; + msg_size += req->r_base_oid.name_len; + msg_size += req->r_num_ops * sizeof(struct ceph_osd_op); + + if (req->r_mempool) + msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); + else + msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size, gfp, true); + if (!msg) + return -ENOMEM; + + req->r_reply = msg; + + return 0; } -EXPORT_SYMBOL(ceph_osdc_alloc_request); +EXPORT_SYMBOL(ceph_osdc_alloc_messages); static bool osd_req_opcode_valid(u16 opcode) { @@ -587,8 +717,6 @@ void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist); - op->cls.argc = 0; /* currently unused */ - op->indata_len = payload_len; } EXPORT_SYMBOL(osd_req_op_cls_init); @@ -627,21 +755,19 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, } EXPORT_SYMBOL(osd_req_op_xattr_init); -void osd_req_op_watch_init(struct ceph_osd_request *osd_req, - unsigned int which, u16 opcode, - u64 cookie, u64 version, int flag) +/* + * @watch_opcode: CEPH_OSD_WATCH_OP_* + */ +static void osd_req_op_watch_init(struct ceph_osd_request *req, int which, + u64 cookie, u8 watch_opcode) { - struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, - opcode, 0); - - BUG_ON(opcode != CEPH_OSD_OP_NOTIFY_ACK && opcode != CEPH_OSD_OP_WATCH); + struct ceph_osd_req_op *op; + op = _osd_req_op_init(req, which, CEPH_OSD_OP_WATCH, 0); op->watch.cookie = cookie; - op->watch.ver = version; - if (opcode == CEPH_OSD_OP_WATCH && flag) - op->watch.flag = (u8)1; + op->watch.op = watch_opcode; + op->watch.gen = 0; } -EXPORT_SYMBOL(osd_req_op_watch_init); void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, unsigned int which, @@ -686,16 +812,9 @@ static void ceph_osdc_msg_data_add(struct ceph_msg *msg, } } -static u64 osd_req_encode_op(struct ceph_osd_request *req, - struct ceph_osd_op *dst, unsigned int which) +static u32 osd_req_encode_op(struct ceph_osd_op *dst, + const struct ceph_osd_req_op *src) { - struct ceph_osd_req_op *src; - struct ceph_osd_data *osd_data; - u64 request_data_len = 0; - u64 data_length; - - BUG_ON(which >= req->r_num_ops); - src = &req->r_ops[which]; if (WARN_ON(!osd_req_opcode_valid(src->op))) { pr_err("unrecognized osd opcode %d\n", src->op); @@ -704,57 +823,36 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, switch (src->op) { case CEPH_OSD_OP_STAT: - osd_data = &src->raw_data_in; - ceph_osdc_msg_data_add(req->r_reply, osd_data); break; case CEPH_OSD_OP_READ: case CEPH_OSD_OP_WRITE: case CEPH_OSD_OP_WRITEFULL: case CEPH_OSD_OP_ZERO: case CEPH_OSD_OP_TRUNCATE: - if (src->op == CEPH_OSD_OP_WRITE || - src->op == CEPH_OSD_OP_WRITEFULL) - request_data_len = src->extent.length; dst->extent.offset = cpu_to_le64(src->extent.offset); dst->extent.length = cpu_to_le64(src->extent.length); dst->extent.truncate_size = cpu_to_le64(src->extent.truncate_size); dst->extent.truncate_seq = cpu_to_le32(src->extent.truncate_seq); - osd_data = &src->extent.osd_data; - if (src->op == CEPH_OSD_OP_WRITE || - src->op == CEPH_OSD_OP_WRITEFULL) - ceph_osdc_msg_data_add(req->r_request, osd_data); - else - ceph_osdc_msg_data_add(req->r_reply, osd_data); break; case CEPH_OSD_OP_CALL: dst->cls.class_len = src->cls.class_len; dst->cls.method_len = src->cls.method_len; - osd_data = &src->cls.request_info; - ceph_osdc_msg_data_add(req->r_request, osd_data); - BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGELIST); - request_data_len = osd_data->pagelist->length; - - osd_data = &src->cls.request_data; - data_length = ceph_osd_data_length(osd_data); - if (data_length) { - BUG_ON(osd_data->type == CEPH_OSD_DATA_TYPE_NONE); - dst->cls.indata_len = cpu_to_le32(data_length); - ceph_osdc_msg_data_add(req->r_request, osd_data); - src->indata_len += data_length; - request_data_len += data_length; - } - osd_data = &src->cls.response_data; - ceph_osdc_msg_data_add(req->r_reply, osd_data); + dst->cls.indata_len = cpu_to_le32(src->cls.indata_len); break; case CEPH_OSD_OP_STARTSYNC: break; - case CEPH_OSD_OP_NOTIFY_ACK: case CEPH_OSD_OP_WATCH: dst->watch.cookie = cpu_to_le64(src->watch.cookie); - dst->watch.ver = cpu_to_le64(src->watch.ver); - dst->watch.flag = src->watch.flag; + dst->watch.ver = cpu_to_le64(0); + dst->watch.op = src->watch.op; + dst->watch.gen = cpu_to_le32(src->watch.gen); + break; + case CEPH_OSD_OP_NOTIFY_ACK: + break; + case CEPH_OSD_OP_NOTIFY: + dst->notify.cookie = cpu_to_le64(src->notify.cookie); break; case CEPH_OSD_OP_SETALLOCHINT: dst->alloc_hint.expected_object_size = @@ -768,9 +866,6 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, dst->xattr.value_len = cpu_to_le32(src->xattr.value_len); dst->xattr.cmp_op = src->xattr.cmp_op; dst->xattr.cmp_mode = src->xattr.cmp_mode; - osd_data = &src->xattr.osd_data; - ceph_osdc_msg_data_add(req->r_request, osd_data); - request_data_len = osd_data->pagelist->length; break; case CEPH_OSD_OP_CREATE: case CEPH_OSD_OP_DELETE: @@ -787,7 +882,7 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, dst->flags = cpu_to_le32(src->flags); dst->payload_len = cpu_to_le32(src->indata_len); - return request_data_len; + return src->indata_len; } /* @@ -824,17 +919,15 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool, GFP_NOFS); - if (!req) - return ERR_PTR(-ENOMEM); - - req->r_flags = flags; + if (!req) { + r = -ENOMEM; + goto fail; + } /* calculate max write size */ r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen); - if (r < 0) { - ceph_osdc_put_request(req); - return ERR_PTR(r); - } + if (r) + goto fail; if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) { osd_req_op_init(req, which, opcode, 0); @@ -854,194 +947,71 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, truncate_size, truncate_seq); } + req->r_flags = flags; req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout); + ceph_oid_printf(&req->r_base_oid, "%llx.%08llx", vino.ino, objnum); + + req->r_snapid = vino.snap; + if (flags & CEPH_OSD_FLAG_WRITE) + req->r_data_offset = off; - snprintf(req->r_base_oid.name, sizeof(req->r_base_oid.name), - "%llx.%08llx", vino.ino, objnum); - req->r_base_oid.name_len = strlen(req->r_base_oid.name); + r = ceph_osdc_alloc_messages(req, GFP_NOFS); + if (r) + goto fail; return req; + +fail: + ceph_osdc_put_request(req); + return ERR_PTR(r); } EXPORT_SYMBOL(ceph_osdc_new_request); /* * We keep osd requests in an rbtree, sorted by ->r_tid. */ -static void __insert_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *new) -{ - struct rb_node **p = &osdc->requests.rb_node; - struct rb_node *parent = NULL; - struct ceph_osd_request *req = NULL; - - while (*p) { - parent = *p; - req = rb_entry(parent, struct ceph_osd_request, r_node); - if (new->r_tid < req->r_tid) - p = &(*p)->rb_left; - else if (new->r_tid > req->r_tid) - p = &(*p)->rb_right; - else - BUG(); - } - - rb_link_node(&new->r_node, parent, p); - rb_insert_color(&new->r_node, &osdc->requests); -} +DEFINE_RB_FUNCS(request, struct ceph_osd_request, r_tid, r_node) +DEFINE_RB_FUNCS(request_mc, struct ceph_osd_request, r_tid, r_mc_node) -static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc, - u64 tid) +static bool osd_homeless(struct ceph_osd *osd) { - struct ceph_osd_request *req; - struct rb_node *n = osdc->requests.rb_node; - - while (n) { - req = rb_entry(n, struct ceph_osd_request, r_node); - if (tid < req->r_tid) - n = n->rb_left; - else if (tid > req->r_tid) - n = n->rb_right; - else - return req; - } - return NULL; + return osd->o_osd == CEPH_HOMELESS_OSD; } -static struct ceph_osd_request * -__lookup_request_ge(struct ceph_osd_client *osdc, - u64 tid) +static bool osd_registered(struct ceph_osd *osd) { - struct ceph_osd_request *req; - struct rb_node *n = osdc->requests.rb_node; - - while (n) { - req = rb_entry(n, struct ceph_osd_request, r_node); - if (tid < req->r_tid) { - if (!n->rb_left) - return req; - n = n->rb_left; - } else if (tid > req->r_tid) { - n = n->rb_right; - } else { - return req; - } - } - return NULL; -} - -static void __kick_linger_request(struct ceph_osd_request *req) -{ - struct ceph_osd_client *osdc = req->r_osdc; - struct ceph_osd *osd = req->r_osd; + verify_osdc_locked(osd->o_osdc); - /* - * Linger requests need to be resent with a new tid to avoid - * the dup op detection logic on the OSDs. Achieve this with - * a re-register dance instead of open-coding. - */ - ceph_osdc_get_request(req); - if (!list_empty(&req->r_linger_item)) - __unregister_linger_request(osdc, req); - else - __unregister_request(osdc, req); - __register_request(osdc, req); - ceph_osdc_put_request(req); - - /* - * Unless request has been registered as both normal and - * lingering, __unregister{,_linger}_request clears r_osd. - * However, here we need to preserve r_osd to make sure we - * requeue on the same OSD. - */ - WARN_ON(req->r_osd || !osd); - req->r_osd = osd; - - dout("%s requeueing %p tid %llu\n", __func__, req, req->r_tid); - __enqueue_request(req); + return !RB_EMPTY_NODE(&osd->o_node); } /* - * Resubmit requests pending on the given osd. + * Assumes @osd is zero-initialized. */ -static void __kick_osd_requests(struct ceph_osd_client *osdc, - struct ceph_osd *osd) +static void osd_init(struct ceph_osd *osd) { - struct ceph_osd_request *req, *nreq; - LIST_HEAD(resend); - LIST_HEAD(resend_linger); - int err; - - dout("%s osd%d\n", __func__, osd->o_osd); - err = __reset_osd(osdc, osd); - if (err) - return; - - /* - * Build up a list of requests to resend by traversing the - * osd's list of requests. Requests for a given object are - * sent in tid order, and that is also the order they're - * kept on this list. Therefore all requests that are in - * flight will be found first, followed by all requests that - * have not yet been sent. And to resend requests while - * preserving this order we will want to put any sent - * requests back on the front of the osd client's unsent - * list. - * - * So we build a separate ordered list of already-sent - * requests for the affected osd and splice it onto the - * front of the osd client's unsent list. Once we've seen a - * request that has not yet been sent we're done. Those - * requests are already sitting right where they belong. - */ - list_for_each_entry(req, &osd->o_requests, r_osd_item) { - if (!req->r_sent) - break; - - if (!req->r_linger) { - dout("%s requeueing %p tid %llu\n", __func__, req, - req->r_tid); - list_move_tail(&req->r_req_lru_item, &resend); - req->r_flags |= CEPH_OSD_FLAG_RETRY; - } else { - list_move_tail(&req->r_req_lru_item, &resend_linger); - } - } - list_splice(&resend, &osdc->req_unsent); - - /* - * Both registered and not yet registered linger requests are - * enqueued with a new tid on the same OSD. We add/move them - * to req_unsent/o_requests at the end to keep things in tid - * order. - */ - list_for_each_entry_safe(req, nreq, &osd->o_linger_requests, - r_linger_osd_item) { - WARN_ON(!list_empty(&req->r_req_lru_item)); - __kick_linger_request(req); - } - - list_for_each_entry_safe(req, nreq, &resend_linger, r_req_lru_item) - __kick_linger_request(req); + atomic_set(&osd->o_ref, 1); + RB_CLEAR_NODE(&osd->o_node); + osd->o_requests = RB_ROOT; + osd->o_linger_requests = RB_ROOT; + INIT_LIST_HEAD(&osd->o_osd_lru); + INIT_LIST_HEAD(&osd->o_keepalive_item); + osd->o_incarnation = 1; + mutex_init(&osd->lock); } -/* - * If the osd connection drops, we need to resubmit all requests. - */ -static void osd_reset(struct ceph_connection *con) +static void osd_cleanup(struct ceph_osd *osd) { - struct ceph_osd *osd = con->private; - struct ceph_osd_client *osdc; - - if (!osd) - return; - dout("osd_reset osd%d\n", osd->o_osd); - osdc = osd->o_osdc; - down_read(&osdc->map_sem); - mutex_lock(&osdc->request_mutex); - __kick_osd_requests(osdc, osd); - __send_queued(osdc); - mutex_unlock(&osdc->request_mutex); - up_read(&osdc->map_sem); + WARN_ON(!RB_EMPTY_NODE(&osd->o_node)); + WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests)); + WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests)); + WARN_ON(!list_empty(&osd->o_osd_lru)); + WARN_ON(!list_empty(&osd->o_keepalive_item)); + + if (osd->o_auth.authorizer) { + WARN_ON(osd_homeless(osd)); + ceph_auth_destroy_authorizer(osd->o_auth.authorizer); + } } /* @@ -1051,22 +1021,15 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum) { struct ceph_osd *osd; - osd = kzalloc(sizeof(*osd), GFP_NOFS); - if (!osd) - return NULL; + WARN_ON(onum == CEPH_HOMELESS_OSD); - atomic_set(&osd->o_ref, 1); + osd = kzalloc(sizeof(*osd), GFP_NOIO | __GFP_NOFAIL); + osd_init(osd); osd->o_osdc = osdc; osd->o_osd = onum; - RB_CLEAR_NODE(&osd->o_node); - INIT_LIST_HEAD(&osd->o_requests); - INIT_LIST_HEAD(&osd->o_linger_requests); - INIT_LIST_HEAD(&osd->o_osd_lru); - osd->o_incarnation = 1; ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr); - INIT_LIST_HEAD(&osd->o_keepalive_item); return osd; } @@ -1087,114 +1050,115 @@ static void put_osd(struct ceph_osd *osd) dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref), atomic_read(&osd->o_ref) - 1); if (atomic_dec_and_test(&osd->o_ref)) { - if (osd->o_auth.authorizer) - ceph_auth_destroy_authorizer(osd->o_auth.authorizer); + osd_cleanup(osd); kfree(osd); } } -/* - * remove an osd from our map - */ -static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) -{ - dout("%s %p osd%d\n", __func__, osd, osd->o_osd); - WARN_ON(!list_empty(&osd->o_requests)); - WARN_ON(!list_empty(&osd->o_linger_requests)); - - list_del_init(&osd->o_osd_lru); - rb_erase(&osd->o_node, &osdc->osds); - RB_CLEAR_NODE(&osd->o_node); -} - -static void remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) -{ - dout("%s %p osd%d\n", __func__, osd, osd->o_osd); - - if (!RB_EMPTY_NODE(&osd->o_node)) { - ceph_con_close(&osd->o_con); - __remove_osd(osdc, osd); - put_osd(osd); - } -} +DEFINE_RB_FUNCS(osd, struct ceph_osd, o_osd, o_node) -static void remove_all_osds(struct ceph_osd_client *osdc) +static void __move_osd_to_lru(struct ceph_osd *osd) { - dout("%s %p\n", __func__, osdc); - mutex_lock(&osdc->request_mutex); - while (!RB_EMPTY_ROOT(&osdc->osds)) { - struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds), - struct ceph_osd, o_node); - remove_osd(osdc, osd); - } - mutex_unlock(&osdc->request_mutex); -} + struct ceph_osd_client *osdc = osd->o_osdc; -static void __move_osd_to_lru(struct ceph_osd_client *osdc, - struct ceph_osd *osd) -{ - dout("%s %p\n", __func__, osd); + dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd); BUG_ON(!list_empty(&osd->o_osd_lru)); + spin_lock(&osdc->osd_lru_lock); list_add_tail(&osd->o_osd_lru, &osdc->osd_lru); + spin_unlock(&osdc->osd_lru_lock); + osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl; } -static void maybe_move_osd_to_lru(struct ceph_osd_client *osdc, - struct ceph_osd *osd) +static void maybe_move_osd_to_lru(struct ceph_osd *osd) { - dout("%s %p\n", __func__, osd); - - if (list_empty(&osd->o_requests) && - list_empty(&osd->o_linger_requests)) - __move_osd_to_lru(osdc, osd); + if (RB_EMPTY_ROOT(&osd->o_requests) && + RB_EMPTY_ROOT(&osd->o_linger_requests)) + __move_osd_to_lru(osd); } static void __remove_osd_from_lru(struct ceph_osd *osd) { - dout("__remove_osd_from_lru %p\n", osd); + struct ceph_osd_client *osdc = osd->o_osdc; + + dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd); + + spin_lock(&osdc->osd_lru_lock); if (!list_empty(&osd->o_osd_lru)) list_del_init(&osd->o_osd_lru); + spin_unlock(&osdc->osd_lru_lock); } -static void remove_old_osds(struct ceph_osd_client *osdc) +/* + * Close the connection and assign any leftover requests to the + * homeless session. + */ +static void close_osd(struct ceph_osd *osd) { - struct ceph_osd *osd, *nosd; + struct ceph_osd_client *osdc = osd->o_osdc; + struct rb_node *n; - dout("__remove_old_osds %p\n", osdc); - mutex_lock(&osdc->request_mutex); - list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) { - if (time_before(jiffies, osd->lru_ttl)) - break; - remove_osd(osdc, osd); + verify_osdc_wrlocked(osdc); + dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd); + + ceph_con_close(&osd->o_con); + + for (n = rb_first(&osd->o_requests); n; ) { + struct ceph_osd_request *req = + rb_entry(n, struct ceph_osd_request, r_node); + + n = rb_next(n); /* unlink_request() */ + + dout(" reassigning req %p tid %llu\n", req, req->r_tid); + unlink_request(osd, req); + link_request(&osdc->homeless_osd, req); + } + for (n = rb_first(&osd->o_linger_requests); n; ) { + struct ceph_osd_linger_request *lreq = + rb_entry(n, struct ceph_osd_linger_request, node); + + n = rb_next(n); /* unlink_linger() */ + + dout(" reassigning lreq %p linger_id %llu\n", lreq, + lreq->linger_id); + unlink_linger(osd, lreq); + link_linger(&osdc->homeless_osd, lreq); } - mutex_unlock(&osdc->request_mutex); + + __remove_osd_from_lru(osd); + erase_osd(&osdc->osds, osd); + put_osd(osd); } /* * reset osd connect */ -static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) +static int reopen_osd(struct ceph_osd *osd) { struct ceph_entity_addr *peer_addr; - dout("__reset_osd %p osd%d\n", osd, osd->o_osd); - if (list_empty(&osd->o_requests) && - list_empty(&osd->o_linger_requests)) { - remove_osd(osdc, osd); + dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd); + + if (RB_EMPTY_ROOT(&osd->o_requests) && + RB_EMPTY_ROOT(&osd->o_linger_requests)) { + close_osd(osd); return -ENODEV; } - peer_addr = &osdc->osdmap->osd_addr[osd->o_osd]; + peer_addr = &osd->o_osdc->osdmap->osd_addr[osd->o_osd]; if (!memcmp(peer_addr, &osd->o_con.peer_addr, sizeof (*peer_addr)) && !ceph_con_opened(&osd->o_con)) { - struct ceph_osd_request *req; + struct rb_node *n; dout("osd addr hasn't changed and connection never opened, " "letting msgr retry\n"); /* touch each r_stamp for handle_timeout()'s benfit */ - list_for_each_entry(req, &osd->o_requests, r_osd_item) + for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) { + struct ceph_osd_request *req = + rb_entry(n, struct ceph_osd_request, r_node); req->r_stamp = jiffies; + } return -EAGAIN; } @@ -1206,455 +1170,1369 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) return 0; } -static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new) +static struct ceph_osd *lookup_create_osd(struct ceph_osd_client *osdc, int o, + bool wrlocked) { - struct rb_node **p = &osdc->osds.rb_node; - struct rb_node *parent = NULL; - struct ceph_osd *osd = NULL; + struct ceph_osd *osd; - dout("__insert_osd %p osd%d\n", new, new->o_osd); - while (*p) { - parent = *p; - osd = rb_entry(parent, struct ceph_osd, o_node); - if (new->o_osd < osd->o_osd) - p = &(*p)->rb_left; - else if (new->o_osd > osd->o_osd) - p = &(*p)->rb_right; - else - BUG(); + if (wrlocked) + verify_osdc_wrlocked(osdc); + else + verify_osdc_locked(osdc); + + if (o != CEPH_HOMELESS_OSD) + osd = lookup_osd(&osdc->osds, o); + else + osd = &osdc->homeless_osd; + if (!osd) { + if (!wrlocked) + return ERR_PTR(-EAGAIN); + + osd = create_osd(osdc, o); + insert_osd(&osdc->osds, osd); + ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd, + &osdc->osdmap->osd_addr[osd->o_osd]); } - rb_link_node(&new->o_node, parent, p); - rb_insert_color(&new->o_node, &osdc->osds); + dout("%s osdc %p osd%d -> osd %p\n", __func__, osdc, o, osd); + return osd; } -static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o) +/* + * Create request <-> OSD session relation. + * + * @req has to be assigned a tid, @osd may be homeless. + */ +static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req) { - struct ceph_osd *osd; - struct rb_node *n = osdc->osds.rb_node; - - while (n) { - osd = rb_entry(n, struct ceph_osd, o_node); - if (o < osd->o_osd) - n = n->rb_left; - else if (o > osd->o_osd) - n = n->rb_right; - else - return osd; - } - return NULL; + verify_osd_locked(osd); + WARN_ON(!req->r_tid || req->r_osd); + dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd, + req, req->r_tid); + + if (!osd_homeless(osd)) + __remove_osd_from_lru(osd); + else + atomic_inc(&osd->o_osdc->num_homeless); + + get_osd(osd); + insert_request(&osd->o_requests, req); + req->r_osd = osd; } -static void __schedule_osd_timeout(struct ceph_osd_client *osdc) +static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req) { - schedule_delayed_work(&osdc->timeout_work, - osdc->client->options->osd_keepalive_timeout); + verify_osd_locked(osd); + WARN_ON(req->r_osd != osd); + dout("%s osd %p osd%d req %p tid %llu\n", __func__, osd, osd->o_osd, + req, req->r_tid); + + req->r_osd = NULL; + erase_request(&osd->o_requests, req); + put_osd(osd); + + if (!osd_homeless(osd)) + maybe_move_osd_to_lru(osd); + else + atomic_dec(&osd->o_osdc->num_homeless); } -static void __cancel_osd_timeout(struct ceph_osd_client *osdc) +static bool __pool_full(struct ceph_pg_pool_info *pi) { - cancel_delayed_work(&osdc->timeout_work); + return pi->flags & CEPH_POOL_FLAG_FULL; } -/* - * Register request, assign tid. If this is the first request, set up - * the timeout event. - */ -static void __register_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req) +static bool have_pool_full(struct ceph_osd_client *osdc) { - req->r_tid = ++osdc->last_tid; - req->r_request->hdr.tid = cpu_to_le64(req->r_tid); - dout("__register_request %p tid %lld\n", req, req->r_tid); - __insert_request(osdc, req); - ceph_osdc_get_request(req); - osdc->num_requests++; - if (osdc->num_requests == 1) { - dout(" first request, scheduling timeout\n"); - __schedule_osd_timeout(osdc); + struct rb_node *n; + + for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) { + struct ceph_pg_pool_info *pi = + rb_entry(n, struct ceph_pg_pool_info, node); + + if (__pool_full(pi)) + return true; } + + return false; +} + +static bool pool_full(struct ceph_osd_client *osdc, s64 pool_id) +{ + struct ceph_pg_pool_info *pi; + + pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id); + if (!pi) + return false; + + return __pool_full(pi); } /* - * called under osdc->request_mutex + * Returns whether a request should be blocked from being sent + * based on the current osdmap and osd_client settings. */ -static void __unregister_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req) +static bool target_should_be_paused(struct ceph_osd_client *osdc, + const struct ceph_osd_request_target *t, + struct ceph_pg_pool_info *pi) { - if (RB_EMPTY_NODE(&req->r_node)) { - dout("__unregister_request %p tid %lld not registered\n", - req, req->r_tid); - return; + bool pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD); + bool pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) || + ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || + __pool_full(pi); + + WARN_ON(pi->id != t->base_oloc.pool); + return (t->flags & CEPH_OSD_FLAG_READ && pauserd) || + (t->flags & CEPH_OSD_FLAG_WRITE && pausewr); +} + +enum calc_target_result { + CALC_TARGET_NO_ACTION = 0, + CALC_TARGET_NEED_RESEND, + CALC_TARGET_POOL_DNE, +}; + +static enum calc_target_result calc_target(struct ceph_osd_client *osdc, + struct ceph_osd_request_target *t, + u32 *last_force_resend, + bool any_change) +{ + struct ceph_pg_pool_info *pi; + struct ceph_pg pgid, last_pgid; + struct ceph_osds up, acting; + bool force_resend = false; + bool need_check_tiering = false; + bool need_resend = false; + bool sort_bitwise = ceph_osdmap_flag(osdc, CEPH_OSDMAP_SORTBITWISE); + enum calc_target_result ct_res; + int ret; + + pi = ceph_pg_pool_by_id(osdc->osdmap, t->base_oloc.pool); + if (!pi) { + t->osd = CEPH_HOMELESS_OSD; + ct_res = CALC_TARGET_POOL_DNE; + goto out; } - dout("__unregister_request %p tid %lld\n", req, req->r_tid); - rb_erase(&req->r_node, &osdc->requests); - RB_CLEAR_NODE(&req->r_node); - osdc->num_requests--; + if (osdc->osdmap->epoch == pi->last_force_request_resend) { + if (last_force_resend && + *last_force_resend < pi->last_force_request_resend) { + *last_force_resend = pi->last_force_request_resend; + force_resend = true; + } else if (!last_force_resend) { + force_resend = true; + } + } + if (ceph_oid_empty(&t->target_oid) || force_resend) { + ceph_oid_copy(&t->target_oid, &t->base_oid); + need_check_tiering = true; + } + if (ceph_oloc_empty(&t->target_oloc) || force_resend) { + ceph_oloc_copy(&t->target_oloc, &t->base_oloc); + need_check_tiering = true; + } - if (req->r_osd) { - /* make sure the original request isn't in flight. */ - ceph_msg_revoke(req->r_request); + if (need_check_tiering && + (t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) { + if (t->flags & CEPH_OSD_FLAG_READ && pi->read_tier >= 0) + t->target_oloc.pool = pi->read_tier; + if (t->flags & CEPH_OSD_FLAG_WRITE && pi->write_tier >= 0) + t->target_oloc.pool = pi->write_tier; + } - list_del_init(&req->r_osd_item); - maybe_move_osd_to_lru(osdc, req->r_osd); - if (list_empty(&req->r_linger_osd_item)) - req->r_osd = NULL; + ret = ceph_object_locator_to_pg(osdc->osdmap, &t->target_oid, + &t->target_oloc, &pgid); + if (ret) { + WARN_ON(ret != -ENOENT); + t->osd = CEPH_HOMELESS_OSD; + ct_res = CALC_TARGET_POOL_DNE; + goto out; + } + last_pgid.pool = pgid.pool; + last_pgid.seed = ceph_stable_mod(pgid.seed, t->pg_num, t->pg_num_mask); + + ceph_pg_to_up_acting_osds(osdc->osdmap, &pgid, &up, &acting); + if (any_change && + ceph_is_new_interval(&t->acting, + &acting, + &t->up, + &up, + t->size, + pi->size, + t->min_size, + pi->min_size, + t->pg_num, + pi->pg_num, + t->sort_bitwise, + sort_bitwise, + &last_pgid)) + force_resend = true; + + if (t->paused && !target_should_be_paused(osdc, t, pi)) { + t->paused = false; + need_resend = true; } - list_del_init(&req->r_req_lru_item); - ceph_osdc_put_request(req); + if (ceph_pg_compare(&t->pgid, &pgid) || + ceph_osds_changed(&t->acting, &acting, any_change) || + force_resend) { + t->pgid = pgid; /* struct */ + ceph_osds_copy(&t->acting, &acting); + ceph_osds_copy(&t->up, &up); + t->size = pi->size; + t->min_size = pi->min_size; + t->pg_num = pi->pg_num; + t->pg_num_mask = pi->pg_num_mask; + t->sort_bitwise = sort_bitwise; + + t->osd = acting.primary; + need_resend = true; + } + + ct_res = need_resend ? CALC_TARGET_NEED_RESEND : CALC_TARGET_NO_ACTION; +out: + dout("%s t %p -> ct_res %d osd %d\n", __func__, t, ct_res, t->osd); + return ct_res; +} + +static void setup_request_data(struct ceph_osd_request *req, + struct ceph_msg *msg) +{ + u32 data_len = 0; + int i; + + if (!list_empty(&msg->data)) + return; + + WARN_ON(msg->data_length); + for (i = 0; i < req->r_num_ops; i++) { + struct ceph_osd_req_op *op = &req->r_ops[i]; + + switch (op->op) { + /* request */ + case CEPH_OSD_OP_WRITE: + case CEPH_OSD_OP_WRITEFULL: + WARN_ON(op->indata_len != op->extent.length); + ceph_osdc_msg_data_add(msg, &op->extent.osd_data); + break; + case CEPH_OSD_OP_SETXATTR: + case CEPH_OSD_OP_CMPXATTR: + WARN_ON(op->indata_len != op->xattr.name_len + + op->xattr.value_len); + ceph_osdc_msg_data_add(msg, &op->xattr.osd_data); + break; + case CEPH_OSD_OP_NOTIFY_ACK: + ceph_osdc_msg_data_add(msg, + &op->notify_ack.request_data); + break; + + /* reply */ + case CEPH_OSD_OP_STAT: + ceph_osdc_msg_data_add(req->r_reply, + &op->raw_data_in); + break; + case CEPH_OSD_OP_READ: + ceph_osdc_msg_data_add(req->r_reply, + &op->extent.osd_data); + break; + + /* both */ + case CEPH_OSD_OP_CALL: + WARN_ON(op->indata_len != op->cls.class_len + + op->cls.method_len + + op->cls.indata_len); + ceph_osdc_msg_data_add(msg, &op->cls.request_info); + /* optional, can be NONE */ + ceph_osdc_msg_data_add(msg, &op->cls.request_data); + /* optional, can be NONE */ + ceph_osdc_msg_data_add(req->r_reply, + &op->cls.response_data); + break; + case CEPH_OSD_OP_NOTIFY: + ceph_osdc_msg_data_add(msg, + &op->notify.request_data); + ceph_osdc_msg_data_add(req->r_reply, + &op->notify.response_data); + break; + } + + data_len += op->indata_len; + } + + WARN_ON(data_len != msg->data_length); +} + +static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg) +{ + void *p = msg->front.iov_base; + void *const end = p + msg->front_alloc_len; + u32 data_len = 0; + int i; + + if (req->r_flags & CEPH_OSD_FLAG_WRITE) { + /* snapshots aren't writeable */ + WARN_ON(req->r_snapid != CEPH_NOSNAP); + } else { + WARN_ON(req->r_mtime.tv_sec || req->r_mtime.tv_nsec || + req->r_data_offset || req->r_snapc); + } + + setup_request_data(req, msg); + + ceph_encode_32(&p, 1); /* client_inc, always 1 */ + ceph_encode_32(&p, req->r_osdc->osdmap->epoch); + ceph_encode_32(&p, req->r_flags); + ceph_encode_timespec(p, &req->r_mtime); + p += sizeof(struct ceph_timespec); + /* aka reassert_version */ + memcpy(p, &req->r_replay_version, sizeof(req->r_replay_version)); + p += sizeof(req->r_replay_version); + + /* oloc */ + ceph_encode_8(&p, 4); + ceph_encode_8(&p, 4); + ceph_encode_32(&p, 8 + 4 + 4); + ceph_encode_64(&p, req->r_t.target_oloc.pool); + ceph_encode_32(&p, -1); /* preferred */ + ceph_encode_32(&p, 0); /* key len */ + + /* pgid */ + ceph_encode_8(&p, 1); + ceph_encode_64(&p, req->r_t.pgid.pool); + ceph_encode_32(&p, req->r_t.pgid.seed); + ceph_encode_32(&p, -1); /* preferred */ + + /* oid */ + ceph_encode_32(&p, req->r_t.target_oid.name_len); + memcpy(p, req->r_t.target_oid.name, req->r_t.target_oid.name_len); + p += req->r_t.target_oid.name_len; - if (osdc->num_requests == 0) { - dout(" no requests, canceling timeout\n"); - __cancel_osd_timeout(osdc); + /* ops, can imply data */ + ceph_encode_16(&p, req->r_num_ops); + for (i = 0; i < req->r_num_ops; i++) { + data_len += osd_req_encode_op(p, &req->r_ops[i]); + p += sizeof(struct ceph_osd_op); } + + ceph_encode_64(&p, req->r_snapid); /* snapid */ + if (req->r_snapc) { + ceph_encode_64(&p, req->r_snapc->seq); + ceph_encode_32(&p, req->r_snapc->num_snaps); + for (i = 0; i < req->r_snapc->num_snaps; i++) + ceph_encode_64(&p, req->r_snapc->snaps[i]); + } else { + ceph_encode_64(&p, 0); /* snap_seq */ + ceph_encode_32(&p, 0); /* snaps len */ + } + + ceph_encode_32(&p, req->r_attempts); /* retry_attempt */ + + BUG_ON(p > end); + msg->front.iov_len = p - msg->front.iov_base; + msg->hdr.version = cpu_to_le16(4); /* MOSDOp v4 */ + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); + msg->hdr.data_len = cpu_to_le32(data_len); + /* + * The header "data_off" is a hint to the receiver allowing it + * to align received data into its buffers such that there's no + * need to re-copy it before writing it to disk (direct I/O). + */ + msg->hdr.data_off = cpu_to_le16(req->r_data_offset); + + dout("%s req %p oid %s oid_len %d front %zu data %u\n", __func__, + req, req->r_t.target_oid.name, req->r_t.target_oid.name_len, + msg->front.iov_len, data_len); } /* - * Cancel a previously queued request message + * @req has to be assigned a tid and registered. */ -static void __cancel_request(struct ceph_osd_request *req) +static void send_request(struct ceph_osd_request *req) { - if (req->r_sent && req->r_osd) { + struct ceph_osd *osd = req->r_osd; + + verify_osd_locked(osd); + WARN_ON(osd->o_osd != req->r_t.osd); + + /* + * We may have a previously queued request message hanging + * around. Cancel it to avoid corrupting the msgr. + */ + if (req->r_sent) ceph_msg_revoke(req->r_request); - req->r_sent = 0; + + req->r_flags |= CEPH_OSD_FLAG_KNOWN_REDIR; + if (req->r_attempts) + req->r_flags |= CEPH_OSD_FLAG_RETRY; + else + WARN_ON(req->r_flags & CEPH_OSD_FLAG_RETRY); + + encode_request(req, req->r_request); + + dout("%s req %p tid %llu to pg %llu.%x osd%d flags 0x%x attempt %d\n", + __func__, req, req->r_tid, req->r_t.pgid.pool, req->r_t.pgid.seed, + req->r_t.osd, req->r_flags, req->r_attempts); + + req->r_t.paused = false; + req->r_stamp = jiffies; + req->r_attempts++; + + req->r_sent = osd->o_incarnation; + req->r_request->hdr.tid = cpu_to_le64(req->r_tid); + ceph_con_send(&osd->o_con, ceph_msg_get(req->r_request)); +} + +static void maybe_request_map(struct ceph_osd_client *osdc) +{ + bool continuous = false; + + verify_osdc_locked(osdc); + WARN_ON(!osdc->osdmap->epoch); + + if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || + ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD) || + ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) { + dout("%s osdc %p continuous\n", __func__, osdc); + continuous = true; + } else { + dout("%s osdc %p onetime\n", __func__, osdc); } + + if (ceph_monc_want_map(&osdc->client->monc, CEPH_SUB_OSDMAP, + osdc->osdmap->epoch + 1, continuous)) + ceph_monc_renew_subs(&osdc->client->monc); } -static void __register_linger_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req) +static void send_map_check(struct ceph_osd_request *req); + +static void __submit_request(struct ceph_osd_request *req, bool wrlocked) { - dout("%s %p tid %llu\n", __func__, req, req->r_tid); - WARN_ON(!req->r_linger); + struct ceph_osd_client *osdc = req->r_osdc; + struct ceph_osd *osd; + enum calc_target_result ct_res; + bool need_send = false; + bool promoted = false; + + WARN_ON(req->r_tid || req->r_got_reply); + dout("%s req %p wrlocked %d\n", __func__, req, wrlocked); + +again: + ct_res = calc_target(osdc, &req->r_t, &req->r_last_force_resend, false); + if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked) + goto promote; + + osd = lookup_create_osd(osdc, req->r_t.osd, wrlocked); + if (IS_ERR(osd)) { + WARN_ON(PTR_ERR(osd) != -EAGAIN || wrlocked); + goto promote; + } + + if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && + ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) { + dout("req %p pausewr\n", req); + req->r_t.paused = true; + maybe_request_map(osdc); + } else if ((req->r_flags & CEPH_OSD_FLAG_READ) && + ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD)) { + dout("req %p pauserd\n", req); + req->r_t.paused = true; + maybe_request_map(osdc); + } else if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && + !(req->r_flags & (CEPH_OSD_FLAG_FULL_TRY | + CEPH_OSD_FLAG_FULL_FORCE)) && + (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || + pool_full(osdc, req->r_t.base_oloc.pool))) { + dout("req %p full/pool_full\n", req); + pr_warn_ratelimited("FULL or reached pool quota\n"); + req->r_t.paused = true; + maybe_request_map(osdc); + } else if (!osd_homeless(osd)) { + need_send = true; + } else { + maybe_request_map(osdc); + } + + mutex_lock(&osd->lock); + /* + * Assign the tid atomically with send_request() to protect + * multiple writes to the same object from racing with each + * other, resulting in out of order ops on the OSDs. + */ + req->r_tid = atomic64_inc_return(&osdc->last_tid); + link_request(osd, req); + if (need_send) + send_request(req); + mutex_unlock(&osd->lock); + if (ct_res == CALC_TARGET_POOL_DNE) + send_map_check(req); + + if (promoted) + downgrade_write(&osdc->lock); + return; + +promote: + up_read(&osdc->lock); + down_write(&osdc->lock); + wrlocked = true; + promoted = true; + goto again; +} + +static void account_request(struct ceph_osd_request *req) +{ + unsigned int mask = CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK; + + if (req->r_flags & CEPH_OSD_FLAG_READ) { + WARN_ON(req->r_flags & mask); + req->r_flags |= CEPH_OSD_FLAG_ACK; + } else if (req->r_flags & CEPH_OSD_FLAG_WRITE) + WARN_ON(!(req->r_flags & mask)); + else + WARN_ON(1); + + WARN_ON(req->r_unsafe_callback && (req->r_flags & mask) != mask); + atomic_inc(&req->r_osdc->num_requests); +} + +static void submit_request(struct ceph_osd_request *req, bool wrlocked) +{ ceph_osdc_get_request(req); - list_add_tail(&req->r_linger_item, &osdc->req_linger); - if (req->r_osd) - list_add_tail(&req->r_linger_osd_item, - &req->r_osd->o_linger_requests); + account_request(req); + __submit_request(req, wrlocked); } -static void __unregister_linger_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req) +static void __finish_request(struct ceph_osd_request *req) { - WARN_ON(!req->r_linger); + struct ceph_osd_client *osdc = req->r_osdc; + struct ceph_osd *osd = req->r_osd; - if (list_empty(&req->r_linger_item)) { - dout("%s %p tid %llu not registered\n", __func__, req, - req->r_tid); + verify_osd_locked(osd); + dout("%s req %p tid %llu\n", __func__, req, req->r_tid); + + WARN_ON(lookup_request_mc(&osdc->map_checks, req->r_tid)); + unlink_request(osd, req); + atomic_dec(&osdc->num_requests); + + /* + * If an OSD has failed or returned and a request has been sent + * twice, it's possible to get a reply and end up here while the + * request message is queued for delivery. We will ignore the + * reply, so not a big deal, but better to try and catch it. + */ + ceph_msg_revoke(req->r_request); + ceph_msg_revoke_incoming(req->r_reply); +} + +static void finish_request(struct ceph_osd_request *req) +{ + __finish_request(req); + ceph_osdc_put_request(req); +} + +static void __complete_request(struct ceph_osd_request *req) +{ + if (req->r_callback) + req->r_callback(req); + else + complete_all(&req->r_completion); +} + +/* + * Note that this is open-coded in handle_reply(), which has to deal + * with ack vs commit, dup acks, etc. + */ +static void complete_request(struct ceph_osd_request *req, int err) +{ + dout("%s req %p tid %llu err %d\n", __func__, req, req->r_tid, err); + + req->r_result = err; + __finish_request(req); + __complete_request(req); + complete_all(&req->r_safe_completion); + ceph_osdc_put_request(req); +} + +static void cancel_map_check(struct ceph_osd_request *req) +{ + struct ceph_osd_client *osdc = req->r_osdc; + struct ceph_osd_request *lookup_req; + + verify_osdc_wrlocked(osdc); + + lookup_req = lookup_request_mc(&osdc->map_checks, req->r_tid); + if (!lookup_req) return; + + WARN_ON(lookup_req != req); + erase_request_mc(&osdc->map_checks, req); + ceph_osdc_put_request(req); +} + +static void cancel_request(struct ceph_osd_request *req) +{ + dout("%s req %p tid %llu\n", __func__, req, req->r_tid); + + cancel_map_check(req); + finish_request(req); +} + +static void check_pool_dne(struct ceph_osd_request *req) +{ + struct ceph_osd_client *osdc = req->r_osdc; + struct ceph_osdmap *map = osdc->osdmap; + + verify_osdc_wrlocked(osdc); + WARN_ON(!map->epoch); + + if (req->r_attempts) { + /* + * We sent a request earlier, which means that + * previously the pool existed, and now it does not + * (i.e., it was deleted). + */ + req->r_map_dne_bound = map->epoch; + dout("%s req %p tid %llu pool disappeared\n", __func__, req, + req->r_tid); + } else { + dout("%s req %p tid %llu map_dne_bound %u have %u\n", __func__, + req, req->r_tid, req->r_map_dne_bound, map->epoch); + } + + if (req->r_map_dne_bound) { + if (map->epoch >= req->r_map_dne_bound) { + /* we had a new enough map */ + pr_info_ratelimited("tid %llu pool does not exist\n", + req->r_tid); + complete_request(req, -ENOENT); + } + } else { + send_map_check(req); } +} + +static void map_check_cb(struct ceph_mon_generic_request *greq) +{ + struct ceph_osd_client *osdc = &greq->monc->client->osdc; + struct ceph_osd_request *req; + u64 tid = greq->private_data; - dout("%s %p tid %llu\n", __func__, req, req->r_tid); - list_del_init(&req->r_linger_item); + WARN_ON(greq->result || !greq->u.newest); - if (req->r_osd) { - list_del_init(&req->r_linger_osd_item); - maybe_move_osd_to_lru(osdc, req->r_osd); - if (list_empty(&req->r_osd_item)) - req->r_osd = NULL; + down_write(&osdc->lock); + req = lookup_request_mc(&osdc->map_checks, tid); + if (!req) { + dout("%s tid %llu dne\n", __func__, tid); + goto out_unlock; } + + dout("%s req %p tid %llu map_dne_bound %u newest %llu\n", __func__, + req, req->r_tid, req->r_map_dne_bound, greq->u.newest); + if (!req->r_map_dne_bound) + req->r_map_dne_bound = greq->u.newest; + erase_request_mc(&osdc->map_checks, req); + check_pool_dne(req); + ceph_osdc_put_request(req); +out_unlock: + up_write(&osdc->lock); } -void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, - struct ceph_osd_request *req) +static void send_map_check(struct ceph_osd_request *req) { - if (!req->r_linger) { - dout("set_request_linger %p\n", req); - req->r_linger = 1; + struct ceph_osd_client *osdc = req->r_osdc; + struct ceph_osd_request *lookup_req; + int ret; + + verify_osdc_wrlocked(osdc); + + lookup_req = lookup_request_mc(&osdc->map_checks, req->r_tid); + if (lookup_req) { + WARN_ON(lookup_req != req); + return; } + + ceph_osdc_get_request(req); + insert_request_mc(&osdc->map_checks, req); + ret = ceph_monc_get_version_async(&osdc->client->monc, "osdmap", + map_check_cb, req->r_tid); + WARN_ON(ret); } -EXPORT_SYMBOL(ceph_osdc_set_request_linger); /* - * Returns whether a request should be blocked from being sent - * based on the current osdmap and osd_client settings. - * - * Caller should hold map_sem for read. + * lingering requests, watch/notify v2 infrastructure */ -static bool __req_should_be_paused(struct ceph_osd_client *osdc, - struct ceph_osd_request *req) +static void linger_release(struct kref *kref) +{ + struct ceph_osd_linger_request *lreq = + container_of(kref, struct ceph_osd_linger_request, kref); + + dout("%s lreq %p reg_req %p ping_req %p\n", __func__, lreq, + lreq->reg_req, lreq->ping_req); + WARN_ON(!RB_EMPTY_NODE(&lreq->node)); + WARN_ON(!RB_EMPTY_NODE(&lreq->osdc_node)); + WARN_ON(!RB_EMPTY_NODE(&lreq->mc_node)); + WARN_ON(!list_empty(&lreq->scan_item)); + WARN_ON(!list_empty(&lreq->pending_lworks)); + WARN_ON(lreq->osd); + + if (lreq->reg_req) + ceph_osdc_put_request(lreq->reg_req); + if (lreq->ping_req) + ceph_osdc_put_request(lreq->ping_req); + target_destroy(&lreq->t); + kfree(lreq); +} + +static void linger_put(struct ceph_osd_linger_request *lreq) { - bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD); - bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) || - ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL); - return (req->r_flags & CEPH_OSD_FLAG_READ && pauserd) || - (req->r_flags & CEPH_OSD_FLAG_WRITE && pausewr); + if (lreq) + kref_put(&lreq->kref, linger_release); +} + +static struct ceph_osd_linger_request * +linger_get(struct ceph_osd_linger_request *lreq) +{ + kref_get(&lreq->kref); + return lreq; +} + +static struct ceph_osd_linger_request * +linger_alloc(struct ceph_osd_client *osdc) +{ + struct ceph_osd_linger_request *lreq; + + lreq = kzalloc(sizeof(*lreq), GFP_NOIO); + if (!lreq) + return NULL; + + kref_init(&lreq->kref); + mutex_init(&lreq->lock); + RB_CLEAR_NODE(&lreq->node); + RB_CLEAR_NODE(&lreq->osdc_node); + RB_CLEAR_NODE(&lreq->mc_node); + INIT_LIST_HEAD(&lreq->scan_item); + INIT_LIST_HEAD(&lreq->pending_lworks); + init_completion(&lreq->reg_commit_wait); + init_completion(&lreq->notify_finish_wait); + + lreq->osdc = osdc; + target_init(&lreq->t); + + dout("%s lreq %p\n", __func__, lreq); + return lreq; } +DEFINE_RB_INSDEL_FUNCS(linger, struct ceph_osd_linger_request, linger_id, node) +DEFINE_RB_FUNCS(linger_osdc, struct ceph_osd_linger_request, linger_id, osdc_node) +DEFINE_RB_FUNCS(linger_mc, struct ceph_osd_linger_request, linger_id, mc_node) + /* - * Calculate mapping of a request to a PG. Takes tiering into account. + * Create linger request <-> OSD session relation. + * + * @lreq has to be registered, @osd may be homeless. */ -static int __calc_request_pg(struct ceph_osdmap *osdmap, - struct ceph_osd_request *req, - struct ceph_pg *pg_out) +static void link_linger(struct ceph_osd *osd, + struct ceph_osd_linger_request *lreq) { - bool need_check_tiering; + verify_osd_locked(osd); + WARN_ON(!lreq->linger_id || lreq->osd); + dout("%s osd %p osd%d lreq %p linger_id %llu\n", __func__, osd, + osd->o_osd, lreq, lreq->linger_id); - need_check_tiering = false; - if (req->r_target_oloc.pool == -1) { - req->r_target_oloc = req->r_base_oloc; /* struct */ - need_check_tiering = true; + if (!osd_homeless(osd)) + __remove_osd_from_lru(osd); + else + atomic_inc(&osd->o_osdc->num_homeless); + + get_osd(osd); + insert_linger(&osd->o_linger_requests, lreq); + lreq->osd = osd; +} + +static void unlink_linger(struct ceph_osd *osd, + struct ceph_osd_linger_request *lreq) +{ + verify_osd_locked(osd); + WARN_ON(lreq->osd != osd); + dout("%s osd %p osd%d lreq %p linger_id %llu\n", __func__, osd, + osd->o_osd, lreq, lreq->linger_id); + + lreq->osd = NULL; + erase_linger(&osd->o_linger_requests, lreq); + put_osd(osd); + + if (!osd_homeless(osd)) + maybe_move_osd_to_lru(osd); + else + atomic_dec(&osd->o_osdc->num_homeless); +} + +static bool __linger_registered(struct ceph_osd_linger_request *lreq) +{ + verify_osdc_locked(lreq->osdc); + + return !RB_EMPTY_NODE(&lreq->osdc_node); +} + +static bool linger_registered(struct ceph_osd_linger_request *lreq) +{ + struct ceph_osd_client *osdc = lreq->osdc; + bool registered; + + down_read(&osdc->lock); + registered = __linger_registered(lreq); + up_read(&osdc->lock); + + return registered; +} + +static void linger_register(struct ceph_osd_linger_request *lreq) +{ + struct ceph_osd_client *osdc = lreq->osdc; + + verify_osdc_wrlocked(osdc); + WARN_ON(lreq->linger_id); + + linger_get(lreq); + lreq->linger_id = ++osdc->last_linger_id; + insert_linger_osdc(&osdc->linger_requests, lreq); +} + +static void linger_unregister(struct ceph_osd_linger_request *lreq) +{ + struct ceph_osd_client *osdc = lreq->osdc; + + verify_osdc_wrlocked(osdc); + + erase_linger_osdc(&osdc->linger_requests, lreq); + linger_put(lreq); +} + +static void cancel_linger_request(struct ceph_osd_request *req) +{ + struct ceph_osd_linger_request *lreq = req->r_priv; + + WARN_ON(!req->r_linger); + cancel_request(req); + linger_put(lreq); +} + +struct linger_work { + struct work_struct work; + struct ceph_osd_linger_request *lreq; + struct list_head pending_item; + unsigned long queued_stamp; + + union { + struct { + u64 notify_id; + u64 notifier_id; + void *payload; /* points into @msg front */ + size_t payload_len; + + struct ceph_msg *msg; /* for ceph_msg_put() */ + } notify; + struct { + int err; + } error; + }; +}; + +static struct linger_work *lwork_alloc(struct ceph_osd_linger_request *lreq, + work_func_t workfn) +{ + struct linger_work *lwork; + + lwork = kzalloc(sizeof(*lwork), GFP_NOIO); + if (!lwork) + return NULL; + + INIT_WORK(&lwork->work, workfn); + INIT_LIST_HEAD(&lwork->pending_item); + lwork->lreq = linger_get(lreq); + + return lwork; +} + +static void lwork_free(struct linger_work *lwork) +{ + struct ceph_osd_linger_request *lreq = lwork->lreq; + + mutex_lock(&lreq->lock); + list_del(&lwork->pending_item); + mutex_unlock(&lreq->lock); + + linger_put(lreq); + kfree(lwork); +} + +static void lwork_queue(struct linger_work *lwork) +{ + struct ceph_osd_linger_request *lreq = lwork->lreq; + struct ceph_osd_client *osdc = lreq->osdc; + + verify_lreq_locked(lreq); + WARN_ON(!list_empty(&lwork->pending_item)); + + lwork->queued_stamp = jiffies; + list_add_tail(&lwork->pending_item, &lreq->pending_lworks); + queue_work(osdc->notify_wq, &lwork->work); +} + +static void do_watch_notify(struct work_struct *w) +{ + struct linger_work *lwork = container_of(w, struct linger_work, work); + struct ceph_osd_linger_request *lreq = lwork->lreq; + + if (!linger_registered(lreq)) { + dout("%s lreq %p not registered\n", __func__, lreq); + goto out; } - if (req->r_target_oid.name_len == 0) { - ceph_oid_copy(&req->r_target_oid, &req->r_base_oid); - need_check_tiering = true; + + WARN_ON(!lreq->is_watch); + dout("%s lreq %p notify_id %llu notifier_id %llu payload_len %zu\n", + __func__, lreq, lwork->notify.notify_id, lwork->notify.notifier_id, + lwork->notify.payload_len); + lreq->wcb(lreq->data, lwork->notify.notify_id, lreq->linger_id, + lwork->notify.notifier_id, lwork->notify.payload, + lwork->notify.payload_len); + +out: + ceph_msg_put(lwork->notify.msg); + lwork_free(lwork); +} + +static void do_watch_error(struct work_struct *w) +{ + struct linger_work *lwork = container_of(w, struct linger_work, work); + struct ceph_osd_linger_request *lreq = lwork->lreq; + + if (!linger_registered(lreq)) { + dout("%s lreq %p not registered\n", __func__, lreq); + goto out; } - if (need_check_tiering && - (req->r_flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) { - struct ceph_pg_pool_info *pi; - - pi = ceph_pg_pool_by_id(osdmap, req->r_target_oloc.pool); - if (pi) { - if ((req->r_flags & CEPH_OSD_FLAG_READ) && - pi->read_tier >= 0) - req->r_target_oloc.pool = pi->read_tier; - if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && - pi->write_tier >= 0) - req->r_target_oloc.pool = pi->write_tier; + dout("%s lreq %p err %d\n", __func__, lreq, lwork->error.err); + lreq->errcb(lreq->data, lreq->linger_id, lwork->error.err); + +out: + lwork_free(lwork); +} + +static void queue_watch_error(struct ceph_osd_linger_request *lreq) +{ + struct linger_work *lwork; + + lwork = lwork_alloc(lreq, do_watch_error); + if (!lwork) { + pr_err("failed to allocate error-lwork\n"); + return; + } + + lwork->error.err = lreq->last_error; + lwork_queue(lwork); +} + +static void linger_reg_commit_complete(struct ceph_osd_linger_request *lreq, + int result) +{ + if (!completion_done(&lreq->reg_commit_wait)) { + lreq->reg_commit_error = (result <= 0 ? result : 0); + complete_all(&lreq->reg_commit_wait); + } +} + +static void linger_commit_cb(struct ceph_osd_request *req) +{ + struct ceph_osd_linger_request *lreq = req->r_priv; + + mutex_lock(&lreq->lock); + dout("%s lreq %p linger_id %llu result %d\n", __func__, lreq, + lreq->linger_id, req->r_result); + WARN_ON(!__linger_registered(lreq)); + linger_reg_commit_complete(lreq, req->r_result); + lreq->committed = true; + + if (!lreq->is_watch) { + struct ceph_osd_data *osd_data = + osd_req_op_data(req, 0, notify, response_data); + void *p = page_address(osd_data->pages[0]); + + WARN_ON(req->r_ops[0].op != CEPH_OSD_OP_NOTIFY || + osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); + + /* make note of the notify_id */ + if (req->r_ops[0].outdata_len >= sizeof(u64)) { + lreq->notify_id = ceph_decode_64(&p); + dout("lreq %p notify_id %llu\n", lreq, + lreq->notify_id); + } else { + dout("lreq %p no notify_id\n", lreq); } - /* !pi is caught in ceph_oloc_oid_to_pg() */ } - return ceph_oloc_oid_to_pg(osdmap, &req->r_target_oloc, - &req->r_target_oid, pg_out); + mutex_unlock(&lreq->lock); + linger_put(lreq); } -static void __enqueue_request(struct ceph_osd_request *req) +static int normalize_watch_error(int err) { - struct ceph_osd_client *osdc = req->r_osdc; + /* + * Translate ENOENT -> ENOTCONN so that a delete->disconnection + * notification and a failure to reconnect because we raced with + * the delete appear the same to the user. + */ + if (err == -ENOENT) + err = -ENOTCONN; + + return err; +} + +static void linger_reconnect_cb(struct ceph_osd_request *req) +{ + struct ceph_osd_linger_request *lreq = req->r_priv; + + mutex_lock(&lreq->lock); + dout("%s lreq %p linger_id %llu result %d last_error %d\n", __func__, + lreq, lreq->linger_id, req->r_result, lreq->last_error); + if (req->r_result < 0) { + if (!lreq->last_error) { + lreq->last_error = normalize_watch_error(req->r_result); + queue_watch_error(lreq); + } + } + + mutex_unlock(&lreq->lock); + linger_put(lreq); +} + +static void send_linger(struct ceph_osd_linger_request *lreq) +{ + struct ceph_osd_request *req = lreq->reg_req; + struct ceph_osd_req_op *op = &req->r_ops[0]; - dout("%s %p tid %llu to osd%d\n", __func__, req, req->r_tid, - req->r_osd ? req->r_osd->o_osd : -1); + verify_osdc_wrlocked(req->r_osdc); + dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id); - if (req->r_osd) { - __remove_osd_from_lru(req->r_osd); - list_add_tail(&req->r_osd_item, &req->r_osd->o_requests); - list_move_tail(&req->r_req_lru_item, &osdc->req_unsent); + if (req->r_osd) + cancel_linger_request(req); + + request_reinit(req); + ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid); + ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc); + req->r_flags = lreq->t.flags; + req->r_mtime = lreq->mtime; + + mutex_lock(&lreq->lock); + if (lreq->is_watch && lreq->committed) { + WARN_ON(op->op != CEPH_OSD_OP_WATCH || + op->watch.cookie != lreq->linger_id); + op->watch.op = CEPH_OSD_WATCH_OP_RECONNECT; + op->watch.gen = ++lreq->register_gen; + dout("lreq %p reconnect register_gen %u\n", lreq, + op->watch.gen); + req->r_callback = linger_reconnect_cb; } else { - list_move_tail(&req->r_req_lru_item, &osdc->req_notarget); + if (!lreq->is_watch) + lreq->notify_id = 0; + else + WARN_ON(op->watch.op != CEPH_OSD_WATCH_OP_WATCH); + dout("lreq %p register\n", lreq); + req->r_callback = linger_commit_cb; } + mutex_unlock(&lreq->lock); + + req->r_priv = linger_get(lreq); + req->r_linger = true; + + submit_request(req, true); } -/* - * Pick an osd (the first 'up' osd in the pg), allocate the osd struct - * (as needed), and set the request r_osd appropriately. If there is - * no up osd, set r_osd to NULL. Move the request to the appropriate list - * (unsent, homeless) or leave on in-flight lru. - * - * Return 0 if unchanged, 1 if changed, or negative on error. - * - * Caller should hold map_sem for read and request_mutex. - */ -static int __map_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req, int force_resend) +static void linger_ping_cb(struct ceph_osd_request *req) { - struct ceph_pg pgid; - int acting[CEPH_PG_MAX_SIZE]; - int num, o; - int err; - bool was_paused; - - dout("map_request %p tid %lld\n", req, req->r_tid); - - err = __calc_request_pg(osdc->osdmap, req, &pgid); - if (err) { - list_move(&req->r_req_lru_item, &osdc->req_notarget); - return err; - } - req->r_pgid = pgid; - - num = ceph_calc_pg_acting(osdc->osdmap, pgid, acting, &o); - if (num < 0) - num = 0; - - was_paused = req->r_paused; - req->r_paused = __req_should_be_paused(osdc, req); - if (was_paused && !req->r_paused) - force_resend = 1; - - if ((!force_resend && - req->r_osd && req->r_osd->o_osd == o && - req->r_sent >= req->r_osd->o_incarnation && - req->r_num_pg_osds == num && - memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) || - (req->r_osd == NULL && o == -1) || - req->r_paused) - return 0; /* no change */ - - dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n", - req->r_tid, pgid.pool, pgid.seed, o, - req->r_osd ? req->r_osd->o_osd : -1); - - /* record full pg acting set */ - memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num); - req->r_num_pg_osds = num; - - if (req->r_osd) { - __cancel_request(req); - list_del_init(&req->r_osd_item); - list_del_init(&req->r_linger_osd_item); - req->r_osd = NULL; - } - - req->r_osd = __lookup_osd(osdc, o); - if (!req->r_osd && o >= 0) { - err = -ENOMEM; - req->r_osd = create_osd(osdc, o); - if (!req->r_osd) { - list_move(&req->r_req_lru_item, &osdc->req_notarget); - goto out; + struct ceph_osd_linger_request *lreq = req->r_priv; + + mutex_lock(&lreq->lock); + dout("%s lreq %p linger_id %llu result %d ping_sent %lu last_error %d\n", + __func__, lreq, lreq->linger_id, req->r_result, lreq->ping_sent, + lreq->last_error); + if (lreq->register_gen == req->r_ops[0].watch.gen) { + if (!req->r_result) { + lreq->watch_valid_thru = lreq->ping_sent; + } else if (!lreq->last_error) { + lreq->last_error = normalize_watch_error(req->r_result); + queue_watch_error(lreq); } + } else { + dout("lreq %p register_gen %u ignoring old pong %u\n", lreq, + lreq->register_gen, req->r_ops[0].watch.gen); + } + + mutex_unlock(&lreq->lock); + linger_put(lreq); +} - dout("map_request osd %p is osd%d\n", req->r_osd, o); - __insert_osd(osdc, req->r_osd); +static void send_linger_ping(struct ceph_osd_linger_request *lreq) +{ + struct ceph_osd_client *osdc = lreq->osdc; + struct ceph_osd_request *req = lreq->ping_req; + struct ceph_osd_req_op *op = &req->r_ops[0]; - ceph_con_open(&req->r_osd->o_con, - CEPH_ENTITY_TYPE_OSD, o, - &osdc->osdmap->osd_addr[o]); + if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD)) { + dout("%s PAUSERD\n", __func__); + return; } - __enqueue_request(req); - err = 1; /* osd or pg changed */ + lreq->ping_sent = jiffies; + dout("%s lreq %p linger_id %llu ping_sent %lu register_gen %u\n", + __func__, lreq, lreq->linger_id, lreq->ping_sent, + lreq->register_gen); -out: - return err; + if (req->r_osd) + cancel_linger_request(req); + + request_reinit(req); + target_copy(&req->r_t, &lreq->t); + + WARN_ON(op->op != CEPH_OSD_OP_WATCH || + op->watch.cookie != lreq->linger_id || + op->watch.op != CEPH_OSD_WATCH_OP_PING); + op->watch.gen = lreq->register_gen; + req->r_callback = linger_ping_cb; + req->r_priv = linger_get(lreq); + req->r_linger = true; + + ceph_osdc_get_request(req); + account_request(req); + req->r_tid = atomic64_inc_return(&osdc->last_tid); + link_request(lreq->osd, req); + send_request(req); } -/* - * caller should hold map_sem (for read) and request_mutex - */ -static void __send_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req) +static void linger_submit(struct ceph_osd_linger_request *lreq) { - void *p; + struct ceph_osd_client *osdc = lreq->osdc; + struct ceph_osd *osd; - dout("send_request %p tid %llu to osd%d flags %d pg %lld.%x\n", - req, req->r_tid, req->r_osd->o_osd, req->r_flags, - (unsigned long long)req->r_pgid.pool, req->r_pgid.seed); + calc_target(osdc, &lreq->t, &lreq->last_force_resend, false); + osd = lookup_create_osd(osdc, lreq->t.osd, true); + link_linger(osd, lreq); - /* fill in message content that changes each time we send it */ - put_unaligned_le32(osdc->osdmap->epoch, req->r_request_osdmap_epoch); - put_unaligned_le32(req->r_flags, req->r_request_flags); - put_unaligned_le64(req->r_target_oloc.pool, req->r_request_pool); - p = req->r_request_pgid; - ceph_encode_64(&p, req->r_pgid.pool); - ceph_encode_32(&p, req->r_pgid.seed); - put_unaligned_le64(1, req->r_request_attempts); /* FIXME */ - memcpy(req->r_request_reassert_version, &req->r_reassert_version, - sizeof(req->r_reassert_version)); + send_linger(lreq); +} - req->r_stamp = jiffies; - list_move_tail(&req->r_req_lru_item, &osdc->req_lru); +static void cancel_linger_map_check(struct ceph_osd_linger_request *lreq) +{ + struct ceph_osd_client *osdc = lreq->osdc; + struct ceph_osd_linger_request *lookup_lreq; - ceph_msg_get(req->r_request); /* send consumes a ref */ + verify_osdc_wrlocked(osdc); - req->r_sent = req->r_osd->o_incarnation; + lookup_lreq = lookup_linger_mc(&osdc->linger_map_checks, + lreq->linger_id); + if (!lookup_lreq) + return; - ceph_con_send(&req->r_osd->o_con, req->r_request); + WARN_ON(lookup_lreq != lreq); + erase_linger_mc(&osdc->linger_map_checks, lreq); + linger_put(lreq); } /* - * Send any requests in the queue (req_unsent). + * @lreq has to be both registered and linked. */ -static void __send_queued(struct ceph_osd_client *osdc) +static void __linger_cancel(struct ceph_osd_linger_request *lreq) { - struct ceph_osd_request *req, *tmp; + if (lreq->is_watch && lreq->ping_req->r_osd) + cancel_linger_request(lreq->ping_req); + if (lreq->reg_req->r_osd) + cancel_linger_request(lreq->reg_req); + cancel_linger_map_check(lreq); + unlink_linger(lreq->osd, lreq); + linger_unregister(lreq); +} + +static void linger_cancel(struct ceph_osd_linger_request *lreq) +{ + struct ceph_osd_client *osdc = lreq->osdc; - dout("__send_queued\n"); - list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item) - __send_request(osdc, req); + down_write(&osdc->lock); + if (__linger_registered(lreq)) + __linger_cancel(lreq); + up_write(&osdc->lock); } -/* - * Caller should hold map_sem for read and request_mutex. - */ -static int __ceph_osdc_start_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req, - bool nofail) -{ - int rc; - - __register_request(osdc, req); - req->r_sent = 0; - req->r_got_reply = 0; - rc = __map_request(osdc, req, 0); - if (rc < 0) { - if (nofail) { - dout("osdc_start_request failed map, " - " will retry %lld\n", req->r_tid); - rc = 0; - } else { - __unregister_request(osdc, req); - } - return rc; +static void send_linger_map_check(struct ceph_osd_linger_request *lreq); + +static void check_linger_pool_dne(struct ceph_osd_linger_request *lreq) +{ + struct ceph_osd_client *osdc = lreq->osdc; + struct ceph_osdmap *map = osdc->osdmap; + + verify_osdc_wrlocked(osdc); + WARN_ON(!map->epoch); + + if (lreq->register_gen) { + lreq->map_dne_bound = map->epoch; + dout("%s lreq %p linger_id %llu pool disappeared\n", __func__, + lreq, lreq->linger_id); + } else { + dout("%s lreq %p linger_id %llu map_dne_bound %u have %u\n", + __func__, lreq, lreq->linger_id, lreq->map_dne_bound, + map->epoch); } - if (req->r_osd == NULL) { - dout("send_request %p no up osds in pg\n", req); - ceph_monc_request_next_osdmap(&osdc->client->monc); + if (lreq->map_dne_bound) { + if (map->epoch >= lreq->map_dne_bound) { + /* we had a new enough map */ + pr_info("linger_id %llu pool does not exist\n", + lreq->linger_id); + linger_reg_commit_complete(lreq, -ENOENT); + __linger_cancel(lreq); + } } else { - __send_queued(osdc); + send_linger_map_check(lreq); } +} - return 0; +static void linger_map_check_cb(struct ceph_mon_generic_request *greq) +{ + struct ceph_osd_client *osdc = &greq->monc->client->osdc; + struct ceph_osd_linger_request *lreq; + u64 linger_id = greq->private_data; + + WARN_ON(greq->result || !greq->u.newest); + + down_write(&osdc->lock); + lreq = lookup_linger_mc(&osdc->linger_map_checks, linger_id); + if (!lreq) { + dout("%s linger_id %llu dne\n", __func__, linger_id); + goto out_unlock; + } + + dout("%s lreq %p linger_id %llu map_dne_bound %u newest %llu\n", + __func__, lreq, lreq->linger_id, lreq->map_dne_bound, + greq->u.newest); + if (!lreq->map_dne_bound) + lreq->map_dne_bound = greq->u.newest; + erase_linger_mc(&osdc->linger_map_checks, lreq); + check_linger_pool_dne(lreq); + + linger_put(lreq); +out_unlock: + up_write(&osdc->lock); +} + +static void send_linger_map_check(struct ceph_osd_linger_request *lreq) +{ + struct ceph_osd_client *osdc = lreq->osdc; + struct ceph_osd_linger_request *lookup_lreq; + int ret; + + verify_osdc_wrlocked(osdc); + + lookup_lreq = lookup_linger_mc(&osdc->linger_map_checks, + lreq->linger_id); + if (lookup_lreq) { + WARN_ON(lookup_lreq != lreq); + return; + } + + linger_get(lreq); + insert_linger_mc(&osdc->linger_map_checks, lreq); + ret = ceph_monc_get_version_async(&osdc->client->monc, "osdmap", + linger_map_check_cb, lreq->linger_id); + WARN_ON(ret); +} + +static int linger_reg_commit_wait(struct ceph_osd_linger_request *lreq) +{ + int ret; + + dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id); + ret = wait_for_completion_interruptible(&lreq->reg_commit_wait); + return ret ?: lreq->reg_commit_error; +} + +static int linger_notify_finish_wait(struct ceph_osd_linger_request *lreq) +{ + int ret; + + dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id); + ret = wait_for_completion_interruptible(&lreq->notify_finish_wait); + return ret ?: lreq->notify_finish_error; } /* - * Timeout callback, called every N seconds when 1 or more osd - * requests has been active for more than N seconds. When this - * happens, we ping all OSDs with requests who have timed out to - * ensure any communications channel reset is detected. Reset the - * request timeouts another N seconds in the future as we go. - * Reschedule the timeout event another N seconds in future (unless - * there are no open requests). + * Timeout callback, called every N seconds. When 1 or more OSD + * requests has been active for more than N seconds, we send a keepalive + * (tag + timestamp) to its OSD to ensure any communications channel + * reset is detected. */ static void handle_timeout(struct work_struct *work) { struct ceph_osd_client *osdc = container_of(work, struct ceph_osd_client, timeout_work.work); struct ceph_options *opts = osdc->client->options; - struct ceph_osd_request *req; - struct ceph_osd *osd; - struct list_head slow_osds; - dout("timeout\n"); - down_read(&osdc->map_sem); - - ceph_monc_request_next_osdmap(&osdc->client->monc); + unsigned long cutoff = jiffies - opts->osd_keepalive_timeout; + LIST_HEAD(slow_osds); + struct rb_node *n, *p; - mutex_lock(&osdc->request_mutex); + dout("%s osdc %p\n", __func__, osdc); + down_write(&osdc->lock); /* * ping osds that are a bit slow. this ensures that if there * is a break in the TCP connection we will notice, and reopen * a connection with that osd (from the fault callback). */ - INIT_LIST_HEAD(&slow_osds); - list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) { - if (time_before(jiffies, - req->r_stamp + opts->osd_keepalive_timeout)) - break; + for (n = rb_first(&osdc->osds); n; n = rb_next(n)) { + struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); + bool found = false; + + for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) { + struct ceph_osd_request *req = + rb_entry(p, struct ceph_osd_request, r_node); + + if (time_before(req->r_stamp, cutoff)) { + dout(" req %p tid %llu on osd%d is laggy\n", + req, req->r_tid, osd->o_osd); + found = true; + } + } + for (p = rb_first(&osd->o_linger_requests); p; p = rb_next(p)) { + struct ceph_osd_linger_request *lreq = + rb_entry(p, struct ceph_osd_linger_request, node); + + dout(" lreq %p linger_id %llu is served by osd%d\n", + lreq, lreq->linger_id, osd->o_osd); + found = true; + + mutex_lock(&lreq->lock); + if (lreq->is_watch && lreq->committed && !lreq->last_error) + send_linger_ping(lreq); + mutex_unlock(&lreq->lock); + } - osd = req->r_osd; - BUG_ON(!osd); - dout(" tid %llu is slow, will send keepalive on osd%d\n", - req->r_tid, osd->o_osd); - list_move_tail(&osd->o_keepalive_item, &slow_osds); + if (found) + list_move_tail(&osd->o_keepalive_item, &slow_osds); } + + if (atomic_read(&osdc->num_homeless) || !list_empty(&slow_osds)) + maybe_request_map(osdc); + while (!list_empty(&slow_osds)) { - osd = list_entry(slow_osds.next, struct ceph_osd, - o_keepalive_item); + struct ceph_osd *osd = list_first_entry(&slow_osds, + struct ceph_osd, + o_keepalive_item); list_del_init(&osd->o_keepalive_item); ceph_con_keepalive(&osd->o_con); } - __schedule_osd_timeout(osdc); - __send_queued(osdc); - mutex_unlock(&osdc->request_mutex); - up_read(&osdc->map_sem); + up_write(&osdc->lock); + schedule_delayed_work(&osdc->timeout_work, + osdc->client->options->osd_keepalive_timeout); } static void handle_osds_timeout(struct work_struct *work) @@ -1663,12 +2541,20 @@ static void handle_osds_timeout(struct work_struct *work) container_of(work, struct ceph_osd_client, osds_timeout_work.work); unsigned long delay = osdc->client->options->osd_idle_ttl / 4; + struct ceph_osd *osd, *nosd; - dout("osds timeout\n"); - down_read(&osdc->map_sem); - remove_old_osds(osdc); - up_read(&osdc->map_sem); + dout("%s osdc %p\n", __func__, osdc); + down_write(&osdc->lock); + list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) { + if (time_before(jiffies, osd->lru_ttl)) + break; + WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests)); + WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests)); + close_osd(osd); + } + + up_write(&osdc->lock); schedule_delayed_work(&osdc->osds_timeout_work, round_jiffies_relative(delay)); } @@ -1776,107 +2662,76 @@ e_inval: goto out; } -static void complete_request(struct ceph_osd_request *req) -{ - complete_all(&req->r_safe_completion); /* fsync waiter */ -} +struct MOSDOpReply { + struct ceph_pg pgid; + u64 flags; + int result; + u32 epoch; + int num_ops; + u32 outdata_len[CEPH_OSD_MAX_OPS]; + s32 rval[CEPH_OSD_MAX_OPS]; + int retry_attempt; + struct ceph_eversion replay_version; + u64 user_version; + struct ceph_request_redirect redirect; +}; -/* - * handle osd op reply. either call the callback if it is specified, - * or do the completion to wake up the waiting thread. - */ -static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) +static int decode_MOSDOpReply(const struct ceph_msg *msg, struct MOSDOpReply *m) { - void *p, *end; - struct ceph_osd_request *req; - struct ceph_request_redirect redir; - u64 tid; - int object_len; - unsigned int numops; - int payload_len, flags; - s32 result; - s32 retry_attempt; - struct ceph_pg pg; - int err; - u32 reassert_epoch; - u64 reassert_version; - u32 osdmap_epoch; - int already_completed; - u32 bytes; + void *p = msg->front.iov_base; + void *const end = p + msg->front.iov_len; + u16 version = le16_to_cpu(msg->hdr.version); + struct ceph_eversion bad_replay_version; u8 decode_redir; - unsigned int i; - - tid = le64_to_cpu(msg->hdr.tid); - dout("handle_reply %p tid %llu\n", msg, tid); + u32 len; + int ret; + int i; - p = msg->front.iov_base; - end = p + msg->front.iov_len; + ceph_decode_32_safe(&p, end, len, e_inval); + ceph_decode_need(&p, end, len, e_inval); + p += len; /* skip oid */ - ceph_decode_need(&p, end, 4, bad); - object_len = ceph_decode_32(&p); - ceph_decode_need(&p, end, object_len, bad); - p += object_len; + ret = ceph_decode_pgid(&p, end, &m->pgid); + if (ret) + return ret; - err = ceph_decode_pgid(&p, end, &pg); - if (err) - goto bad; + ceph_decode_64_safe(&p, end, m->flags, e_inval); + ceph_decode_32_safe(&p, end, m->result, e_inval); + ceph_decode_need(&p, end, sizeof(bad_replay_version), e_inval); + memcpy(&bad_replay_version, p, sizeof(bad_replay_version)); + p += sizeof(bad_replay_version); + ceph_decode_32_safe(&p, end, m->epoch, e_inval); - ceph_decode_need(&p, end, 8 + 4 + 4 + 8 + 4, bad); - flags = ceph_decode_64(&p); - result = ceph_decode_32(&p); - reassert_epoch = ceph_decode_32(&p); - reassert_version = ceph_decode_64(&p); - osdmap_epoch = ceph_decode_32(&p); - - /* lookup */ - down_read(&osdc->map_sem); - mutex_lock(&osdc->request_mutex); - req = __lookup_request(osdc, tid); - if (req == NULL) { - dout("handle_reply tid %llu dne\n", tid); - goto bad_mutex; - } - ceph_osdc_get_request(req); + ceph_decode_32_safe(&p, end, m->num_ops, e_inval); + if (m->num_ops > ARRAY_SIZE(m->outdata_len)) + goto e_inval; - dout("handle_reply %p tid %llu req %p result %d\n", msg, tid, - req, result); - - ceph_decode_need(&p, end, 4, bad_put); - numops = ceph_decode_32(&p); - if (numops > CEPH_OSD_MAX_OPS) - goto bad_put; - if (numops != req->r_num_ops) - goto bad_put; - payload_len = 0; - ceph_decode_need(&p, end, numops * sizeof(struct ceph_osd_op), bad_put); - for (i = 0; i < numops; i++) { + ceph_decode_need(&p, end, m->num_ops * sizeof(struct ceph_osd_op), + e_inval); + for (i = 0; i < m->num_ops; i++) { struct ceph_osd_op *op = p; - int len; - len = le32_to_cpu(op->payload_len); - req->r_ops[i].outdata_len = len; - dout(" op %d has %d bytes\n", i, len); - payload_len += len; + m->outdata_len[i] = le32_to_cpu(op->payload_len); p += sizeof(*op); } - bytes = le32_to_cpu(msg->hdr.data_len); - if (payload_len != bytes) { - pr_warn("sum of op payload lens %d != data_len %d\n", - payload_len, bytes); - goto bad_put; - } - ceph_decode_need(&p, end, 4 + numops * 4, bad_put); - retry_attempt = ceph_decode_32(&p); - for (i = 0; i < numops; i++) - req->r_ops[i].rval = ceph_decode_32(&p); + ceph_decode_32_safe(&p, end, m->retry_attempt, e_inval); + for (i = 0; i < m->num_ops; i++) + ceph_decode_32_safe(&p, end, m->rval[i], e_inval); - if (le16_to_cpu(msg->hdr.version) >= 6) { - p += 8 + 4; /* skip replay_version */ - p += 8; /* skip user_version */ + if (version >= 5) { + ceph_decode_need(&p, end, sizeof(m->replay_version), e_inval); + memcpy(&m->replay_version, p, sizeof(m->replay_version)); + p += sizeof(m->replay_version); + ceph_decode_64_safe(&p, end, m->user_version, e_inval); + } else { + m->replay_version = bad_replay_version; /* struct */ + m->user_version = le64_to_cpu(m->replay_version.version); + } - if (le16_to_cpu(msg->hdr.version) >= 7) - ceph_decode_8_safe(&p, end, decode_redir, bad_put); + if (version >= 6) { + if (version >= 7) + ceph_decode_8_safe(&p, end, decode_redir, e_inval); else decode_redir = 1; } else { @@ -1884,228 +2739,410 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) } if (decode_redir) { - err = ceph_redirect_decode(&p, end, &redir); - if (err) - goto bad_put; + ret = ceph_redirect_decode(&p, end, &m->redirect); + if (ret) + return ret; } else { - redir.oloc.pool = -1; + ceph_oloc_init(&m->redirect.oloc); } - if (redir.oloc.pool != -1) { - dout("redirect pool %lld\n", redir.oloc.pool); + return 0; - __unregister_request(osdc, req); +e_inval: + return -EINVAL; +} - req->r_target_oloc = redir.oloc; /* struct */ +/* + * We are done with @req if + * - @m is a safe reply, or + * - @m is an unsafe reply and we didn't want a safe one + */ +static bool done_request(const struct ceph_osd_request *req, + const struct MOSDOpReply *m) +{ + return (m->result < 0 || + (m->flags & CEPH_OSD_FLAG_ONDISK) || + !(req->r_flags & CEPH_OSD_FLAG_ONDISK)); +} - /* - * Start redirect requests with nofail=true. If - * mapping fails, request will end up on the notarget - * list, waiting for the new osdmap (which can take - * a while), even though the original request mapped - * successfully. In the future we might want to follow - * original request's nofail setting here. - */ - err = __ceph_osdc_start_request(osdc, req, true); - BUG_ON(err); +/* + * handle osd op reply. either call the callback if it is specified, + * or do the completion to wake up the waiting thread. + * + * ->r_unsafe_callback is set? yes no + * + * first reply is OK (needed r_cb/r_completion, r_cb/r_completion, + * any or needed/got safe) r_safe_completion r_safe_completion + * + * first reply is unsafe r_unsafe_cb(true) (nothing) + * + * when we get the safe reply r_unsafe_cb(false), r_cb/r_completion, + * r_safe_completion r_safe_completion + */ +static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg) +{ + struct ceph_osd_client *osdc = osd->o_osdc; + struct ceph_osd_request *req; + struct MOSDOpReply m; + u64 tid = le64_to_cpu(msg->hdr.tid); + u32 data_len = 0; + bool already_acked; + int ret; + int i; - goto out_unlock; - } + dout("%s msg %p tid %llu\n", __func__, msg, tid); - already_completed = req->r_got_reply; - if (!req->r_got_reply) { - req->r_result = result; - dout("handle_reply result %d bytes %d\n", req->r_result, - bytes); - if (req->r_result == 0) - req->r_result = bytes; + down_read(&osdc->lock); + if (!osd_registered(osd)) { + dout("%s osd%d unknown\n", __func__, osd->o_osd); + goto out_unlock_osdc; + } + WARN_ON(osd->o_osd != le64_to_cpu(msg->hdr.src.num)); - /* in case this is a write and we need to replay, */ - req->r_reassert_version.epoch = cpu_to_le32(reassert_epoch); - req->r_reassert_version.version = cpu_to_le64(reassert_version); + mutex_lock(&osd->lock); + req = lookup_request(&osd->o_requests, tid); + if (!req) { + dout("%s osd%d tid %llu unknown\n", __func__, osd->o_osd, tid); + goto out_unlock_session; + } - req->r_got_reply = 1; - } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) { - dout("handle_reply tid %llu dup ack\n", tid); - goto out_unlock; + ret = decode_MOSDOpReply(msg, &m); + if (ret) { + pr_err("failed to decode MOSDOpReply for tid %llu: %d\n", + req->r_tid, ret); + ceph_msg_dump(msg); + goto fail_request; + } + dout("%s req %p tid %llu flags 0x%llx pgid %llu.%x epoch %u attempt %d v %u'%llu uv %llu\n", + __func__, req, req->r_tid, m.flags, m.pgid.pool, m.pgid.seed, + m.epoch, m.retry_attempt, le32_to_cpu(m.replay_version.epoch), + le64_to_cpu(m.replay_version.version), m.user_version); + + if (m.retry_attempt >= 0) { + if (m.retry_attempt != req->r_attempts - 1) { + dout("req %p tid %llu retry_attempt %d != %d, ignoring\n", + req, req->r_tid, m.retry_attempt, + req->r_attempts - 1); + goto out_unlock_session; + } + } else { + WARN_ON(1); /* MOSDOpReply v4 is assumed */ } - dout("handle_reply tid %llu flags %d\n", tid, flags); + if (!ceph_oloc_empty(&m.redirect.oloc)) { + dout("req %p tid %llu redirect pool %lld\n", req, req->r_tid, + m.redirect.oloc.pool); + unlink_request(osd, req); + mutex_unlock(&osd->lock); + + ceph_oloc_copy(&req->r_t.target_oloc, &m.redirect.oloc); + req->r_flags |= CEPH_OSD_FLAG_REDIRECTED; + req->r_tid = 0; + __submit_request(req, false); + goto out_unlock_osdc; + } - if (req->r_linger && (flags & CEPH_OSD_FLAG_ONDISK)) - __register_linger_request(osdc, req); + if (m.num_ops != req->r_num_ops) { + pr_err("num_ops %d != %d for tid %llu\n", m.num_ops, + req->r_num_ops, req->r_tid); + goto fail_request; + } + for (i = 0; i < req->r_num_ops; i++) { + dout(" req %p tid %llu op %d rval %d len %u\n", req, + req->r_tid, i, m.rval[i], m.outdata_len[i]); + req->r_ops[i].rval = m.rval[i]; + req->r_ops[i].outdata_len = m.outdata_len[i]; + data_len += m.outdata_len[i]; + } + if (data_len != le32_to_cpu(msg->hdr.data_len)) { + pr_err("sum of lens %u != %u for tid %llu\n", data_len, + le32_to_cpu(msg->hdr.data_len), req->r_tid); + goto fail_request; + } + dout("%s req %p tid %llu acked %d result %d data_len %u\n", __func__, + req, req->r_tid, req->r_got_reply, m.result, data_len); + + already_acked = req->r_got_reply; + if (!already_acked) { + req->r_result = m.result ?: data_len; + req->r_replay_version = m.replay_version; /* struct */ + req->r_got_reply = true; + } else if (!(m.flags & CEPH_OSD_FLAG_ONDISK)) { + dout("req %p tid %llu dup ack\n", req, req->r_tid); + goto out_unlock_session; + } - /* either this is a read, or we got the safe response */ - if (result < 0 || - (flags & CEPH_OSD_FLAG_ONDISK) || - ((flags & CEPH_OSD_FLAG_WRITE) == 0)) - __unregister_request(osdc, req); + if (done_request(req, &m)) { + __finish_request(req); + if (req->r_linger) { + WARN_ON(req->r_unsafe_callback); + dout("req %p tid %llu cb (locked)\n", req, req->r_tid); + __complete_request(req); + } + } - mutex_unlock(&osdc->request_mutex); - up_read(&osdc->map_sem); + mutex_unlock(&osd->lock); + up_read(&osdc->lock); - if (!already_completed) { - if (req->r_unsafe_callback && - result >= 0 && !(flags & CEPH_OSD_FLAG_ONDISK)) + if (done_request(req, &m)) { + if (already_acked && req->r_unsafe_callback) { + dout("req %p tid %llu safe-cb\n", req, req->r_tid); + req->r_unsafe_callback(req, false); + } else if (!req->r_linger) { + dout("req %p tid %llu cb\n", req, req->r_tid); + __complete_request(req); + } + if (m.flags & CEPH_OSD_FLAG_ONDISK) + complete_all(&req->r_safe_completion); + ceph_osdc_put_request(req); + } else { + if (req->r_unsafe_callback) { + dout("req %p tid %llu unsafe-cb\n", req, req->r_tid); req->r_unsafe_callback(req, true); - if (req->r_callback) - req->r_callback(req, msg); - else - complete_all(&req->r_completion); + } else { + WARN_ON(1); + } } - if (flags & CEPH_OSD_FLAG_ONDISK) { - if (req->r_unsafe_callback && already_completed) - req->r_unsafe_callback(req, false); - complete_request(req); + return; + +fail_request: + complete_request(req, -EIO); +out_unlock_session: + mutex_unlock(&osd->lock); +out_unlock_osdc: + up_read(&osdc->lock); +} + +static void set_pool_was_full(struct ceph_osd_client *osdc) +{ + struct rb_node *n; + + for (n = rb_first(&osdc->osdmap->pg_pools); n; n = rb_next(n)) { + struct ceph_pg_pool_info *pi = + rb_entry(n, struct ceph_pg_pool_info, node); + + pi->was_full = __pool_full(pi); } +} -out: - dout("req=%p req->r_linger=%d\n", req, req->r_linger); - ceph_osdc_put_request(req); - return; -out_unlock: - mutex_unlock(&osdc->request_mutex); - up_read(&osdc->map_sem); - goto out; +static bool pool_cleared_full(struct ceph_osd_client *osdc, s64 pool_id) +{ + struct ceph_pg_pool_info *pi; -bad_put: - req->r_result = -EIO; - __unregister_request(osdc, req); - if (req->r_callback) - req->r_callback(req, msg); - else - complete_all(&req->r_completion); - complete_request(req); - ceph_osdc_put_request(req); -bad_mutex: - mutex_unlock(&osdc->request_mutex); - up_read(&osdc->map_sem); -bad: - pr_err("corrupt osd_op_reply got %d %d\n", - (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len)); - ceph_msg_dump(msg); + pi = ceph_pg_pool_by_id(osdc->osdmap, pool_id); + if (!pi) + return false; + + return pi->was_full && !__pool_full(pi); } -static void reset_changed_osds(struct ceph_osd_client *osdc) +static enum calc_target_result +recalc_linger_target(struct ceph_osd_linger_request *lreq) { - struct rb_node *p, *n; + struct ceph_osd_client *osdc = lreq->osdc; + enum calc_target_result ct_res; - dout("%s %p\n", __func__, osdc); - for (p = rb_first(&osdc->osds); p; p = n) { - struct ceph_osd *osd = rb_entry(p, struct ceph_osd, o_node); + ct_res = calc_target(osdc, &lreq->t, &lreq->last_force_resend, true); + if (ct_res == CALC_TARGET_NEED_RESEND) { + struct ceph_osd *osd; - n = rb_next(p); - if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) || - memcmp(&osd->o_con.peer_addr, - ceph_osd_addr(osdc->osdmap, - osd->o_osd), - sizeof(struct ceph_entity_addr)) != 0) - __reset_osd(osdc, osd); + osd = lookup_create_osd(osdc, lreq->t.osd, true); + if (osd != lreq->osd) { + unlink_linger(lreq->osd, lreq); + link_linger(osd, lreq); + } } + + return ct_res; } /* - * Requeue requests whose mapping to an OSD has changed. If requests map to - * no osd, request a new map. - * - * Caller should hold map_sem for read. + * Requeue requests whose mapping to an OSD has changed. */ -static void kick_requests(struct ceph_osd_client *osdc, bool force_resend, - bool force_resend_writes) +static void scan_requests(struct ceph_osd *osd, + bool force_resend, + bool cleared_full, + bool check_pool_cleared_full, + struct rb_root *need_resend, + struct list_head *need_resend_linger) { - struct ceph_osd_request *req, *nreq; - struct rb_node *p; - int needmap = 0; - int err; - bool force_resend_req; + struct ceph_osd_client *osdc = osd->o_osdc; + struct rb_node *n; + bool force_resend_writes; + + for (n = rb_first(&osd->o_linger_requests); n; ) { + struct ceph_osd_linger_request *lreq = + rb_entry(n, struct ceph_osd_linger_request, node); + enum calc_target_result ct_res; + + n = rb_next(n); /* recalc_linger_target() */ + + dout("%s lreq %p linger_id %llu\n", __func__, lreq, + lreq->linger_id); + ct_res = recalc_linger_target(lreq); + switch (ct_res) { + case CALC_TARGET_NO_ACTION: + force_resend_writes = cleared_full || + (check_pool_cleared_full && + pool_cleared_full(osdc, lreq->t.base_oloc.pool)); + if (!force_resend && !force_resend_writes) + break; + + /* fall through */ + case CALC_TARGET_NEED_RESEND: + cancel_linger_map_check(lreq); + /* + * scan_requests() for the previous epoch(s) + * may have already added it to the list, since + * it's not unlinked here. + */ + if (list_empty(&lreq->scan_item)) + list_add_tail(&lreq->scan_item, need_resend_linger); + break; + case CALC_TARGET_POOL_DNE: + check_linger_pool_dne(lreq); + break; + } + } - dout("kick_requests %s %s\n", force_resend ? " (force resend)" : "", - force_resend_writes ? " (force resend writes)" : ""); - mutex_lock(&osdc->request_mutex); - for (p = rb_first(&osdc->requests); p; ) { - req = rb_entry(p, struct ceph_osd_request, r_node); - p = rb_next(p); + for (n = rb_first(&osd->o_requests); n; ) { + struct ceph_osd_request *req = + rb_entry(n, struct ceph_osd_request, r_node); + enum calc_target_result ct_res; + + n = rb_next(n); /* unlink_request(), check_pool_dne() */ + + dout("%s req %p tid %llu\n", __func__, req, req->r_tid); + ct_res = calc_target(osdc, &req->r_t, + &req->r_last_force_resend, false); + switch (ct_res) { + case CALC_TARGET_NO_ACTION: + force_resend_writes = cleared_full || + (check_pool_cleared_full && + pool_cleared_full(osdc, req->r_t.base_oloc.pool)); + if (!force_resend && + (!(req->r_flags & CEPH_OSD_FLAG_WRITE) || + !force_resend_writes)) + break; + + /* fall through */ + case CALC_TARGET_NEED_RESEND: + cancel_map_check(req); + unlink_request(osd, req); + insert_request(need_resend, req); + break; + case CALC_TARGET_POOL_DNE: + check_pool_dne(req); + break; + } + } +} +static int handle_one_map(struct ceph_osd_client *osdc, + void *p, void *end, bool incremental, + struct rb_root *need_resend, + struct list_head *need_resend_linger) +{ + struct ceph_osdmap *newmap; + struct rb_node *n; + bool skipped_map = false; + bool was_full; + + was_full = ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL); + set_pool_was_full(osdc); + + if (incremental) + newmap = osdmap_apply_incremental(&p, end, osdc->osdmap); + else + newmap = ceph_osdmap_decode(&p, end); + if (IS_ERR(newmap)) + return PTR_ERR(newmap); + + if (newmap != osdc->osdmap) { /* - * For linger requests that have not yet been - * registered, move them to the linger list; they'll - * be sent to the osd in the loop below. Unregister - * the request before re-registering it as a linger - * request to ensure the __map_request() below - * will decide it needs to be sent. + * Preserve ->was_full before destroying the old map. + * For pools that weren't in the old map, ->was_full + * should be false. */ - if (req->r_linger && list_empty(&req->r_linger_item)) { - dout("%p tid %llu restart on osd%d\n", - req, req->r_tid, - req->r_osd ? req->r_osd->o_osd : -1); - ceph_osdc_get_request(req); - __unregister_request(osdc, req); - __register_linger_request(osdc, req); - ceph_osdc_put_request(req); - continue; + for (n = rb_first(&newmap->pg_pools); n; n = rb_next(n)) { + struct ceph_pg_pool_info *pi = + rb_entry(n, struct ceph_pg_pool_info, node); + struct ceph_pg_pool_info *old_pi; + + old_pi = ceph_pg_pool_by_id(osdc->osdmap, pi->id); + if (old_pi) + pi->was_full = old_pi->was_full; + else + WARN_ON(pi->was_full); } - force_resend_req = force_resend || - (force_resend_writes && - req->r_flags & CEPH_OSD_FLAG_WRITE); - err = __map_request(osdc, req, force_resend_req); - if (err < 0) - continue; /* error */ - if (req->r_osd == NULL) { - dout("%p tid %llu maps to no osd\n", req, req->r_tid); - needmap++; /* request a newer map */ - } else if (err > 0) { - if (!req->r_linger) { - dout("%p tid %llu requeued on osd%d\n", req, - req->r_tid, - req->r_osd ? req->r_osd->o_osd : -1); - req->r_flags |= CEPH_OSD_FLAG_RETRY; - } + if (osdc->osdmap->epoch && + osdc->osdmap->epoch + 1 < newmap->epoch) { + WARN_ON(incremental); + skipped_map = true; } + + ceph_osdmap_destroy(osdc->osdmap); + osdc->osdmap = newmap; } - list_for_each_entry_safe(req, nreq, &osdc->req_linger, - r_linger_item) { - dout("linger req=%p req->r_osd=%p\n", req, req->r_osd); - - err = __map_request(osdc, req, - force_resend || force_resend_writes); - dout("__map_request returned %d\n", err); - if (err < 0) - continue; /* hrm! */ - if (req->r_osd == NULL || err > 0) { - if (req->r_osd == NULL) { - dout("lingering %p tid %llu maps to no osd\n", - req, req->r_tid); - /* - * A homeless lingering request makes - * no sense, as it's job is to keep - * a particular OSD connection open. - * Request a newer map and kick the - * request, knowing that it won't be - * resent until we actually get a map - * that can tell us where to send it. - */ - needmap++; - } + was_full &= !ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL); + scan_requests(&osdc->homeless_osd, skipped_map, was_full, true, + need_resend, need_resend_linger); + + for (n = rb_first(&osdc->osds); n; ) { + struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); + + n = rb_next(n); /* close_osd() */ + + scan_requests(osd, skipped_map, was_full, true, need_resend, + need_resend_linger); + if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) || + memcmp(&osd->o_con.peer_addr, + ceph_osd_addr(osdc->osdmap, osd->o_osd), + sizeof(struct ceph_entity_addr))) + close_osd(osd); + } - dout("kicking lingering %p tid %llu osd%d\n", req, - req->r_tid, req->r_osd ? req->r_osd->o_osd : -1); - __register_request(osdc, req); - __unregister_linger_request(osdc, req); + return 0; +} + +static void kick_requests(struct ceph_osd_client *osdc, + struct rb_root *need_resend, + struct list_head *need_resend_linger) +{ + struct ceph_osd_linger_request *lreq, *nlreq; + struct rb_node *n; + + for (n = rb_first(need_resend); n; ) { + struct ceph_osd_request *req = + rb_entry(n, struct ceph_osd_request, r_node); + struct ceph_osd *osd; + + n = rb_next(n); + erase_request(need_resend, req); /* before link_request() */ + + WARN_ON(req->r_osd); + calc_target(osdc, &req->r_t, NULL, false); + osd = lookup_create_osd(osdc, req->r_t.osd, true); + link_request(osd, req); + if (!req->r_linger) { + if (!osd_homeless(osd) && !req->r_t.paused) + send_request(req); + } else { + cancel_linger_request(req); } } - reset_changed_osds(osdc); - mutex_unlock(&osdc->request_mutex); - if (needmap) { - dout("%d requests for down osds, need new map\n", needmap); - ceph_monc_request_next_osdmap(&osdc->client->monc); + list_for_each_entry_safe(lreq, nlreq, need_resend_linger, scan_item) { + if (!osd_homeless(lreq->osd)) + send_linger(lreq); + + list_del_init(&lreq->scan_item); } } - /* * Process updated osd map. * @@ -2115,27 +3152,31 @@ static void kick_requests(struct ceph_osd_client *osdc, bool force_resend, */ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) { - void *p, *end, *next; + void *p = msg->front.iov_base; + void *const end = p + msg->front.iov_len; u32 nr_maps, maplen; u32 epoch; - struct ceph_osdmap *newmap = NULL, *oldmap; - int err; struct ceph_fsid fsid; - bool was_full; + struct rb_root need_resend = RB_ROOT; + LIST_HEAD(need_resend_linger); + bool handled_incremental = false; + bool was_pauserd, was_pausewr; + bool pauserd, pausewr; + int err; - dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0); - p = msg->front.iov_base; - end = p + msg->front.iov_len; + dout("%s have %u\n", __func__, osdc->osdmap->epoch); + down_write(&osdc->lock); /* verify fsid */ ceph_decode_need(&p, end, sizeof(fsid), bad); ceph_decode_copy(&p, &fsid, sizeof(fsid)); if (ceph_check_fsid(osdc->client, &fsid) < 0) - return; - - down_write(&osdc->map_sem); + goto bad; - was_full = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL); + was_pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD); + was_pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) || + ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || + have_pool_full(osdc); /* incremental maps */ ceph_decode_32_safe(&p, end, nr_maps, bad); @@ -2145,34 +3186,23 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) epoch = ceph_decode_32(&p); maplen = ceph_decode_32(&p); ceph_decode_need(&p, end, maplen, bad); - next = p + maplen; - if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) { + if (osdc->osdmap->epoch && + osdc->osdmap->epoch + 1 == epoch) { dout("applying incremental map %u len %d\n", epoch, maplen); - newmap = osdmap_apply_incremental(&p, next, - osdc->osdmap, - &osdc->client->msgr); - if (IS_ERR(newmap)) { - err = PTR_ERR(newmap); + err = handle_one_map(osdc, p, p + maplen, true, + &need_resend, &need_resend_linger); + if (err) goto bad; - } - BUG_ON(!newmap); - if (newmap != osdc->osdmap) { - ceph_osdmap_destroy(osdc->osdmap); - osdc->osdmap = newmap; - } - was_full = was_full || - ceph_osdmap_flag(osdc->osdmap, - CEPH_OSDMAP_FULL); - kick_requests(osdc, 0, was_full); + handled_incremental = true; } else { dout("ignoring incremental map %u len %d\n", epoch, maplen); } - p = next; + p += maplen; nr_maps--; } - if (newmap) + if (handled_incremental) goto done; /* full maps */ @@ -2186,455 +3216,647 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) if (nr_maps > 1) { dout("skipping non-latest full map %u len %d\n", epoch, maplen); - } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) { + } else if (osdc->osdmap->epoch >= epoch) { dout("skipping full map %u len %d, " "older than our %u\n", epoch, maplen, osdc->osdmap->epoch); } else { - int skipped_map = 0; - dout("taking full map %u len %d\n", epoch, maplen); - newmap = ceph_osdmap_decode(&p, p+maplen); - if (IS_ERR(newmap)) { - err = PTR_ERR(newmap); + err = handle_one_map(osdc, p, p + maplen, false, + &need_resend, &need_resend_linger); + if (err) goto bad; - } - BUG_ON(!newmap); - oldmap = osdc->osdmap; - osdc->osdmap = newmap; - if (oldmap) { - if (oldmap->epoch + 1 < newmap->epoch) - skipped_map = 1; - ceph_osdmap_destroy(oldmap); - } - was_full = was_full || - ceph_osdmap_flag(osdc->osdmap, - CEPH_OSDMAP_FULL); - kick_requests(osdc, skipped_map, was_full); } p += maplen; nr_maps--; } - if (!osdc->osdmap) - goto bad; done: - downgrade_write(&osdc->map_sem); - ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP, - osdc->osdmap->epoch); - /* * subscribe to subsequent osdmap updates if full to ensure * we find out when we are no longer full and stop returning * ENOSPC. */ - if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || - ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD) || - ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR)) - ceph_monc_request_next_osdmap(&osdc->client->monc); - - mutex_lock(&osdc->request_mutex); - __send_queued(osdc); - mutex_unlock(&osdc->request_mutex); - up_read(&osdc->map_sem); + pauserd = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD); + pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) || + ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || + have_pool_full(osdc); + if (was_pauserd || was_pausewr || pauserd || pausewr) + maybe_request_map(osdc); + + kick_requests(osdc, &need_resend, &need_resend_linger); + + ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP, + osdc->osdmap->epoch); + up_write(&osdc->lock); wake_up_all(&osdc->client->auth_wq); return; bad: pr_err("osdc handle_map corrupt msg\n"); ceph_msg_dump(msg); - up_write(&osdc->map_sem); + up_write(&osdc->lock); } /* - * watch/notify callback event infrastructure - * - * These callbacks are used both for watch and notify operations. + * Resubmit requests pending on the given osd. */ -static void __release_event(struct kref *kref) +static void kick_osd_requests(struct ceph_osd *osd) { - struct ceph_osd_event *event = - container_of(kref, struct ceph_osd_event, kref); + struct rb_node *n; - dout("__release_event %p\n", event); - kfree(event); -} + for (n = rb_first(&osd->o_requests); n; ) { + struct ceph_osd_request *req = + rb_entry(n, struct ceph_osd_request, r_node); -static void get_event(struct ceph_osd_event *event) -{ - kref_get(&event->kref); -} + n = rb_next(n); /* cancel_linger_request() */ -void ceph_osdc_put_event(struct ceph_osd_event *event) -{ - kref_put(&event->kref, __release_event); + if (!req->r_linger) { + if (!req->r_t.paused) + send_request(req); + } else { + cancel_linger_request(req); + } + } + for (n = rb_first(&osd->o_linger_requests); n; n = rb_next(n)) { + struct ceph_osd_linger_request *lreq = + rb_entry(n, struct ceph_osd_linger_request, node); + + send_linger(lreq); + } } -EXPORT_SYMBOL(ceph_osdc_put_event); -static void __insert_event(struct ceph_osd_client *osdc, - struct ceph_osd_event *new) +/* + * If the osd connection drops, we need to resubmit all requests. + */ +static void osd_fault(struct ceph_connection *con) { - struct rb_node **p = &osdc->event_tree.rb_node; - struct rb_node *parent = NULL; - struct ceph_osd_event *event = NULL; + struct ceph_osd *osd = con->private; + struct ceph_osd_client *osdc = osd->o_osdc; - while (*p) { - parent = *p; - event = rb_entry(parent, struct ceph_osd_event, node); - if (new->cookie < event->cookie) - p = &(*p)->rb_left; - else if (new->cookie > event->cookie) - p = &(*p)->rb_right; - else - BUG(); + dout("%s osd %p osd%d\n", __func__, osd, osd->o_osd); + + down_write(&osdc->lock); + if (!osd_registered(osd)) { + dout("%s osd%d unknown\n", __func__, osd->o_osd); + goto out_unlock; } - rb_link_node(&new->node, parent, p); - rb_insert_color(&new->node, &osdc->event_tree); + if (!reopen_osd(osd)) + kick_osd_requests(osd); + maybe_request_map(osdc); + +out_unlock: + up_write(&osdc->lock); } -static struct ceph_osd_event *__find_event(struct ceph_osd_client *osdc, - u64 cookie) +/* + * Process osd watch notifications + */ +static void handle_watch_notify(struct ceph_osd_client *osdc, + struct ceph_msg *msg) { - struct rb_node **p = &osdc->event_tree.rb_node; - struct rb_node *parent = NULL; - struct ceph_osd_event *event = NULL; + void *p = msg->front.iov_base; + void *const end = p + msg->front.iov_len; + struct ceph_osd_linger_request *lreq; + struct linger_work *lwork; + u8 proto_ver, opcode; + u64 cookie, notify_id; + u64 notifier_id = 0; + s32 return_code = 0; + void *payload = NULL; + u32 payload_len = 0; - while (*p) { - parent = *p; - event = rb_entry(parent, struct ceph_osd_event, node); - if (cookie < event->cookie) - p = &(*p)->rb_left; - else if (cookie > event->cookie) - p = &(*p)->rb_right; - else - return event; + ceph_decode_8_safe(&p, end, proto_ver, bad); + ceph_decode_8_safe(&p, end, opcode, bad); + ceph_decode_64_safe(&p, end, cookie, bad); + p += 8; /* skip ver */ + ceph_decode_64_safe(&p, end, notify_id, bad); + + if (proto_ver >= 1) { + ceph_decode_32_safe(&p, end, payload_len, bad); + ceph_decode_need(&p, end, payload_len, bad); + payload = p; + p += payload_len; } - return NULL; -} -static void __remove_event(struct ceph_osd_event *event) -{ - struct ceph_osd_client *osdc = event->osdc; + if (le16_to_cpu(msg->hdr.version) >= 2) + ceph_decode_32_safe(&p, end, return_code, bad); + + if (le16_to_cpu(msg->hdr.version) >= 3) + ceph_decode_64_safe(&p, end, notifier_id, bad); - if (!RB_EMPTY_NODE(&event->node)) { - dout("__remove_event removed %p\n", event); - rb_erase(&event->node, &osdc->event_tree); - ceph_osdc_put_event(event); + down_read(&osdc->lock); + lreq = lookup_linger_osdc(&osdc->linger_requests, cookie); + if (!lreq) { + dout("%s opcode %d cookie %llu dne\n", __func__, opcode, + cookie); + goto out_unlock_osdc; + } + + mutex_lock(&lreq->lock); + dout("%s opcode %d cookie %llu lreq %p is_watch %d\n", __func__, + opcode, cookie, lreq, lreq->is_watch); + if (opcode == CEPH_WATCH_EVENT_DISCONNECT) { + if (!lreq->last_error) { + lreq->last_error = -ENOTCONN; + queue_watch_error(lreq); + } + } else if (!lreq->is_watch) { + /* CEPH_WATCH_EVENT_NOTIFY_COMPLETE */ + if (lreq->notify_id && lreq->notify_id != notify_id) { + dout("lreq %p notify_id %llu != %llu, ignoring\n", lreq, + lreq->notify_id, notify_id); + } else if (!completion_done(&lreq->notify_finish_wait)) { + struct ceph_msg_data *data = + list_first_entry_or_null(&msg->data, + struct ceph_msg_data, + links); + + if (data) { + if (lreq->preply_pages) { + WARN_ON(data->type != + CEPH_MSG_DATA_PAGES); + *lreq->preply_pages = data->pages; + *lreq->preply_len = data->length; + } else { + ceph_release_page_vector(data->pages, + calc_pages_for(0, data->length)); + } + } + lreq->notify_finish_error = return_code; + complete_all(&lreq->notify_finish_wait); + } } else { - dout("__remove_event didn't remove %p\n", event); + /* CEPH_WATCH_EVENT_NOTIFY */ + lwork = lwork_alloc(lreq, do_watch_notify); + if (!lwork) { + pr_err("failed to allocate notify-lwork\n"); + goto out_unlock_lreq; + } + + lwork->notify.notify_id = notify_id; + lwork->notify.notifier_id = notifier_id; + lwork->notify.payload = payload; + lwork->notify.payload_len = payload_len; + lwork->notify.msg = ceph_msg_get(msg); + lwork_queue(lwork); } + +out_unlock_lreq: + mutex_unlock(&lreq->lock); +out_unlock_osdc: + up_read(&osdc->lock); + return; + +bad: + pr_err("osdc handle_watch_notify corrupt msg\n"); } -int ceph_osdc_create_event(struct ceph_osd_client *osdc, - void (*event_cb)(u64, u64, u8, void *), - void *data, struct ceph_osd_event **pevent) +/* + * Register request, send initial attempt. + */ +int ceph_osdc_start_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req, + bool nofail) { - struct ceph_osd_event *event; + down_read(&osdc->lock); + submit_request(req, false); + up_read(&osdc->lock); - event = kmalloc(sizeof(*event), GFP_NOIO); - if (!event) - return -ENOMEM; - - dout("create_event %p\n", event); - event->cb = event_cb; - event->one_shot = 0; - event->data = data; - event->osdc = osdc; - INIT_LIST_HEAD(&event->osd_node); - RB_CLEAR_NODE(&event->node); - kref_init(&event->kref); /* one ref for us */ - kref_get(&event->kref); /* one ref for the caller */ - - spin_lock(&osdc->event_lock); - event->cookie = ++osdc->event_count; - __insert_event(osdc, event); - spin_unlock(&osdc->event_lock); - - *pevent = event; return 0; } -EXPORT_SYMBOL(ceph_osdc_create_event); +EXPORT_SYMBOL(ceph_osdc_start_request); -void ceph_osdc_cancel_event(struct ceph_osd_event *event) +/* + * Unregister a registered request. The request is not completed (i.e. + * no callbacks or wakeups) - higher layers are supposed to know what + * they are canceling. + */ +void ceph_osdc_cancel_request(struct ceph_osd_request *req) { - struct ceph_osd_client *osdc = event->osdc; + struct ceph_osd_client *osdc = req->r_osdc; - dout("cancel_event %p\n", event); - spin_lock(&osdc->event_lock); - __remove_event(event); - spin_unlock(&osdc->event_lock); - ceph_osdc_put_event(event); /* caller's */ + down_write(&osdc->lock); + if (req->r_osd) + cancel_request(req); + up_write(&osdc->lock); } -EXPORT_SYMBOL(ceph_osdc_cancel_event); - +EXPORT_SYMBOL(ceph_osdc_cancel_request); -static void do_event_work(struct work_struct *work) +/* + * @timeout: in jiffies, 0 means "wait forever" + */ +static int wait_request_timeout(struct ceph_osd_request *req, + unsigned long timeout) { - struct ceph_osd_event_work *event_work = - container_of(work, struct ceph_osd_event_work, work); - struct ceph_osd_event *event = event_work->event; - u64 ver = event_work->ver; - u64 notify_id = event_work->notify_id; - u8 opcode = event_work->opcode; + long left; + + dout("%s req %p tid %llu\n", __func__, req, req->r_tid); + left = wait_for_completion_killable_timeout(&req->r_completion, + ceph_timeout_jiffies(timeout)); + if (left <= 0) { + left = left ?: -ETIMEDOUT; + ceph_osdc_cancel_request(req); - dout("do_event_work completing %p\n", event); - event->cb(ver, notify_id, opcode, event->data); - dout("do_event_work completed %p\n", event); - ceph_osdc_put_event(event); - kfree(event_work); + /* kludge - need to to wake ceph_osdc_sync() */ + complete_all(&req->r_safe_completion); + } else { + left = req->r_result; /* completed */ + } + + return left; } +/* + * wait for a request to complete + */ +int ceph_osdc_wait_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) +{ + return wait_request_timeout(req, 0); +} +EXPORT_SYMBOL(ceph_osdc_wait_request); /* - * Process osd watch notifications + * sync - wait for all in-flight requests to flush. avoid starvation. */ -static void handle_watch_notify(struct ceph_osd_client *osdc, - struct ceph_msg *msg) +void ceph_osdc_sync(struct ceph_osd_client *osdc) { - void *p, *end; - u8 proto_ver; - u64 cookie, ver, notify_id; - u8 opcode; - struct ceph_osd_event *event; - struct ceph_osd_event_work *event_work; + struct rb_node *n, *p; + u64 last_tid = atomic64_read(&osdc->last_tid); - p = msg->front.iov_base; - end = p + msg->front.iov_len; +again: + down_read(&osdc->lock); + for (n = rb_first(&osdc->osds); n; n = rb_next(n)) { + struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); - ceph_decode_8_safe(&p, end, proto_ver, bad); - ceph_decode_8_safe(&p, end, opcode, bad); - ceph_decode_64_safe(&p, end, cookie, bad); - ceph_decode_64_safe(&p, end, ver, bad); - ceph_decode_64_safe(&p, end, notify_id, bad); + mutex_lock(&osd->lock); + for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) { + struct ceph_osd_request *req = + rb_entry(p, struct ceph_osd_request, r_node); + + if (req->r_tid > last_tid) + break; - spin_lock(&osdc->event_lock); - event = __find_event(osdc, cookie); - if (event) { - BUG_ON(event->one_shot); - get_event(event); - } - spin_unlock(&osdc->event_lock); - dout("handle_watch_notify cookie %lld ver %lld event %p\n", - cookie, ver, event); - if (event) { - event_work = kmalloc(sizeof(*event_work), GFP_NOIO); - if (!event_work) { - pr_err("couldn't allocate event_work\n"); - ceph_osdc_put_event(event); - return; + if (!(req->r_flags & CEPH_OSD_FLAG_WRITE)) + continue; + + ceph_osdc_get_request(req); + mutex_unlock(&osd->lock); + up_read(&osdc->lock); + dout("%s waiting on req %p tid %llu last_tid %llu\n", + __func__, req, req->r_tid, last_tid); + wait_for_completion(&req->r_safe_completion); + ceph_osdc_put_request(req); + goto again; } - INIT_WORK(&event_work->work, do_event_work); - event_work->event = event; - event_work->ver = ver; - event_work->notify_id = notify_id; - event_work->opcode = opcode; - queue_work(osdc->notify_wq, &event_work->work); + mutex_unlock(&osd->lock); } - return; + up_read(&osdc->lock); + dout("%s done last_tid %llu\n", __func__, last_tid); +} +EXPORT_SYMBOL(ceph_osdc_sync); -bad: - pr_err("osdc handle_watch_notify corrupt msg\n"); +static struct ceph_osd_request * +alloc_linger_request(struct ceph_osd_linger_request *lreq) +{ + struct ceph_osd_request *req; + + req = ceph_osdc_alloc_request(lreq->osdc, NULL, 1, false, GFP_NOIO); + if (!req) + return NULL; + + ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid); + ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc); + + if (ceph_osdc_alloc_messages(req, GFP_NOIO)) { + ceph_osdc_put_request(req); + return NULL; + } + + return req; } /* - * build new request AND message - * + * Returns a handle, caller owns a ref. */ -void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, - struct ceph_snap_context *snapc, u64 snap_id, - struct timespec *mtime) -{ - struct ceph_msg *msg = req->r_request; - void *p; - size_t msg_size; - int flags = req->r_flags; - u64 data_len; - unsigned int i; - - req->r_snapid = snap_id; - req->r_snapc = ceph_get_snap_context(snapc); - - /* encode request */ - msg->hdr.version = cpu_to_le16(4); - - p = msg->front.iov_base; - ceph_encode_32(&p, 1); /* client_inc is always 1 */ - req->r_request_osdmap_epoch = p; - p += 4; - req->r_request_flags = p; - p += 4; - if (req->r_flags & CEPH_OSD_FLAG_WRITE) - ceph_encode_timespec(p, mtime); - p += sizeof(struct ceph_timespec); - req->r_request_reassert_version = p; - p += sizeof(struct ceph_eversion); /* will get filled in */ - - /* oloc */ - ceph_encode_8(&p, 4); - ceph_encode_8(&p, 4); - ceph_encode_32(&p, 8 + 4 + 4); - req->r_request_pool = p; - p += 8; - ceph_encode_32(&p, -1); /* preferred */ - ceph_encode_32(&p, 0); /* key len */ +struct ceph_osd_linger_request * +ceph_osdc_watch(struct ceph_osd_client *osdc, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + rados_watchcb2_t wcb, + rados_watcherrcb_t errcb, + void *data) +{ + struct ceph_osd_linger_request *lreq; + int ret; - ceph_encode_8(&p, 1); - req->r_request_pgid = p; - p += 8 + 4; - ceph_encode_32(&p, -1); /* preferred */ + lreq = linger_alloc(osdc); + if (!lreq) + return ERR_PTR(-ENOMEM); - /* oid */ - ceph_encode_32(&p, req->r_base_oid.name_len); - memcpy(p, req->r_base_oid.name, req->r_base_oid.name_len); - dout("oid '%.*s' len %d\n", req->r_base_oid.name_len, - req->r_base_oid.name, req->r_base_oid.name_len); - p += req->r_base_oid.name_len; - - /* ops--can imply data */ - ceph_encode_16(&p, (u16)req->r_num_ops); - data_len = 0; - for (i = 0; i < req->r_num_ops; i++) { - data_len += osd_req_encode_op(req, p, i); - p += sizeof(struct ceph_osd_op); + lreq->is_watch = true; + lreq->wcb = wcb; + lreq->errcb = errcb; + lreq->data = data; + lreq->watch_valid_thru = jiffies; + + ceph_oid_copy(&lreq->t.base_oid, oid); + ceph_oloc_copy(&lreq->t.base_oloc, oloc); + lreq->t.flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; + lreq->mtime = CURRENT_TIME; + + lreq->reg_req = alloc_linger_request(lreq); + if (!lreq->reg_req) { + ret = -ENOMEM; + goto err_put_lreq; } - /* snaps */ - ceph_encode_64(&p, req->r_snapid); - ceph_encode_64(&p, req->r_snapc ? req->r_snapc->seq : 0); - ceph_encode_32(&p, req->r_snapc ? req->r_snapc->num_snaps : 0); - if (req->r_snapc) { - for (i = 0; i < snapc->num_snaps; i++) { - ceph_encode_64(&p, req->r_snapc->snaps[i]); - } + lreq->ping_req = alloc_linger_request(lreq); + if (!lreq->ping_req) { + ret = -ENOMEM; + goto err_put_lreq; } - req->r_request_attempts = p; - p += 4; - - /* data */ - if (flags & CEPH_OSD_FLAG_WRITE) { - u16 data_off; - - /* - * The header "data_off" is a hint to the receiver - * allowing it to align received data into its - * buffers such that there's no need to re-copy - * it before writing it to disk (direct I/O). - */ - data_off = (u16) (off & 0xffff); - req->r_request->hdr.data_off = cpu_to_le16(data_off); + down_write(&osdc->lock); + linger_register(lreq); /* before osd_req_op_* */ + osd_req_op_watch_init(lreq->reg_req, 0, lreq->linger_id, + CEPH_OSD_WATCH_OP_WATCH); + osd_req_op_watch_init(lreq->ping_req, 0, lreq->linger_id, + CEPH_OSD_WATCH_OP_PING); + linger_submit(lreq); + up_write(&osdc->lock); + + ret = linger_reg_commit_wait(lreq); + if (ret) { + linger_cancel(lreq); + goto err_put_lreq; } - req->r_request->hdr.data_len = cpu_to_le32(data_len); - BUG_ON(p > msg->front.iov_base + msg->front.iov_len); - msg_size = p - msg->front.iov_base; - msg->front.iov_len = msg_size; - msg->hdr.front_len = cpu_to_le32(msg_size); + return lreq; - dout("build_request msg_size was %d\n", (int)msg_size); +err_put_lreq: + linger_put(lreq); + return ERR_PTR(ret); } -EXPORT_SYMBOL(ceph_osdc_build_request); +EXPORT_SYMBOL(ceph_osdc_watch); /* - * Register request, send initial attempt. + * Releases a ref. + * + * Times out after mount_timeout to preserve rbd unmap behaviour + * introduced in 2894e1d76974 ("rbd: timeout watch teardown on unmap + * with mount_timeout"). */ -int ceph_osdc_start_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req, - bool nofail) +int ceph_osdc_unwatch(struct ceph_osd_client *osdc, + struct ceph_osd_linger_request *lreq) { - int rc; + struct ceph_options *opts = osdc->client->options; + struct ceph_osd_request *req; + int ret; - down_read(&osdc->map_sem); - mutex_lock(&osdc->request_mutex); + req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO); + if (!req) + return -ENOMEM; - rc = __ceph_osdc_start_request(osdc, req, nofail); + ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid); + ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc); + req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; + req->r_mtime = CURRENT_TIME; + osd_req_op_watch_init(req, 0, lreq->linger_id, + CEPH_OSD_WATCH_OP_UNWATCH); - mutex_unlock(&osdc->request_mutex); - up_read(&osdc->map_sem); + ret = ceph_osdc_alloc_messages(req, GFP_NOIO); + if (ret) + goto out_put_req; - return rc; + ceph_osdc_start_request(osdc, req, false); + linger_cancel(lreq); + linger_put(lreq); + ret = wait_request_timeout(req, opts->mount_timeout); + +out_put_req: + ceph_osdc_put_request(req); + return ret; } -EXPORT_SYMBOL(ceph_osdc_start_request); +EXPORT_SYMBOL(ceph_osdc_unwatch); -/* - * Unregister a registered request. The request is not completed (i.e. - * no callbacks or wakeups) - higher layers are supposed to know what - * they are canceling. - */ -void ceph_osdc_cancel_request(struct ceph_osd_request *req) +static int osd_req_op_notify_ack_init(struct ceph_osd_request *req, int which, + u64 notify_id, u64 cookie, void *payload, + size_t payload_len) { - struct ceph_osd_client *osdc = req->r_osdc; + struct ceph_osd_req_op *op; + struct ceph_pagelist *pl; + int ret; + + op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY_ACK, 0); + + pl = kmalloc(sizeof(*pl), GFP_NOIO); + if (!pl) + return -ENOMEM; - mutex_lock(&osdc->request_mutex); - if (req->r_linger) - __unregister_linger_request(osdc, req); - __unregister_request(osdc, req); - mutex_unlock(&osdc->request_mutex); + ceph_pagelist_init(pl); + ret = ceph_pagelist_encode_64(pl, notify_id); + ret |= ceph_pagelist_encode_64(pl, cookie); + if (payload) { + ret |= ceph_pagelist_encode_32(pl, payload_len); + ret |= ceph_pagelist_append(pl, payload, payload_len); + } else { + ret |= ceph_pagelist_encode_32(pl, 0); + } + if (ret) { + ceph_pagelist_release(pl); + return -ENOMEM; + } - dout("%s %p tid %llu canceled\n", __func__, req, req->r_tid); + ceph_osd_data_pagelist_init(&op->notify_ack.request_data, pl); + op->indata_len = pl->length; + return 0; } -EXPORT_SYMBOL(ceph_osdc_cancel_request); -/* - * wait for a request to complete - */ -int ceph_osdc_wait_request(struct ceph_osd_client *osdc, - struct ceph_osd_request *req) +int ceph_osdc_notify_ack(struct ceph_osd_client *osdc, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + u64 notify_id, + u64 cookie, + void *payload, + size_t payload_len) +{ + struct ceph_osd_request *req; + int ret; + + req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO); + if (!req) + return -ENOMEM; + + ceph_oid_copy(&req->r_base_oid, oid); + ceph_oloc_copy(&req->r_base_oloc, oloc); + req->r_flags = CEPH_OSD_FLAG_READ; + + ret = ceph_osdc_alloc_messages(req, GFP_NOIO); + if (ret) + goto out_put_req; + + ret = osd_req_op_notify_ack_init(req, 0, notify_id, cookie, payload, + payload_len); + if (ret) + goto out_put_req; + + ceph_osdc_start_request(osdc, req, false); + ret = ceph_osdc_wait_request(osdc, req); + +out_put_req: + ceph_osdc_put_request(req); + return ret; +} +EXPORT_SYMBOL(ceph_osdc_notify_ack); + +static int osd_req_op_notify_init(struct ceph_osd_request *req, int which, + u64 cookie, u32 prot_ver, u32 timeout, + void *payload, size_t payload_len) { - int rc; + struct ceph_osd_req_op *op; + struct ceph_pagelist *pl; + int ret; - dout("%s %p tid %llu\n", __func__, req, req->r_tid); + op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY, 0); + op->notify.cookie = cookie; - rc = wait_for_completion_interruptible(&req->r_completion); - if (rc < 0) { - dout("%s %p tid %llu interrupted\n", __func__, req, req->r_tid); - ceph_osdc_cancel_request(req); - complete_request(req); - return rc; + pl = kmalloc(sizeof(*pl), GFP_NOIO); + if (!pl) + return -ENOMEM; + + ceph_pagelist_init(pl); + ret = ceph_pagelist_encode_32(pl, 1); /* prot_ver */ + ret |= ceph_pagelist_encode_32(pl, timeout); + ret |= ceph_pagelist_encode_32(pl, payload_len); + ret |= ceph_pagelist_append(pl, payload, payload_len); + if (ret) { + ceph_pagelist_release(pl); + return -ENOMEM; } - dout("%s %p tid %llu result %d\n", __func__, req, req->r_tid, - req->r_result); - return req->r_result; + ceph_osd_data_pagelist_init(&op->notify.request_data, pl); + op->indata_len = pl->length; + return 0; } -EXPORT_SYMBOL(ceph_osdc_wait_request); /* - * sync - wait for all in-flight requests to flush. avoid starvation. + * @timeout: in seconds + * + * @preply_{pages,len} are initialized both on success and error. + * The caller is responsible for: + * + * ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len)) */ -void ceph_osdc_sync(struct ceph_osd_client *osdc) +int ceph_osdc_notify(struct ceph_osd_client *osdc, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + void *payload, + size_t payload_len, + u32 timeout, + struct page ***preply_pages, + size_t *preply_len) { - struct ceph_osd_request *req; - u64 last_tid, next_tid = 0; + struct ceph_osd_linger_request *lreq; + struct page **pages; + int ret; - mutex_lock(&osdc->request_mutex); - last_tid = osdc->last_tid; - while (1) { - req = __lookup_request_ge(osdc, next_tid); - if (!req) - break; - if (req->r_tid > last_tid) - break; + WARN_ON(!timeout); + if (preply_pages) { + *preply_pages = NULL; + *preply_len = 0; + } - next_tid = req->r_tid + 1; - if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0) - continue; + lreq = linger_alloc(osdc); + if (!lreq) + return -ENOMEM; - ceph_osdc_get_request(req); - mutex_unlock(&osdc->request_mutex); - dout("sync waiting on tid %llu (last is %llu)\n", - req->r_tid, last_tid); - wait_for_completion(&req->r_safe_completion); - mutex_lock(&osdc->request_mutex); - ceph_osdc_put_request(req); + lreq->preply_pages = preply_pages; + lreq->preply_len = preply_len; + + ceph_oid_copy(&lreq->t.base_oid, oid); + ceph_oloc_copy(&lreq->t.base_oloc, oloc); + lreq->t.flags = CEPH_OSD_FLAG_READ; + + lreq->reg_req = alloc_linger_request(lreq); + if (!lreq->reg_req) { + ret = -ENOMEM; + goto out_put_lreq; } - mutex_unlock(&osdc->request_mutex); - dout("sync done (thru tid %llu)\n", last_tid); + + /* for notify_id */ + pages = ceph_alloc_page_vector(1, GFP_NOIO); + if (IS_ERR(pages)) { + ret = PTR_ERR(pages); + goto out_put_lreq; + } + + down_write(&osdc->lock); + linger_register(lreq); /* before osd_req_op_* */ + ret = osd_req_op_notify_init(lreq->reg_req, 0, lreq->linger_id, 1, + timeout, payload, payload_len); + if (ret) { + linger_unregister(lreq); + up_write(&osdc->lock); + ceph_release_page_vector(pages, 1); + goto out_put_lreq; + } + ceph_osd_data_pages_init(osd_req_op_data(lreq->reg_req, 0, notify, + response_data), + pages, PAGE_SIZE, 0, false, true); + linger_submit(lreq); + up_write(&osdc->lock); + + ret = linger_reg_commit_wait(lreq); + if (!ret) + ret = linger_notify_finish_wait(lreq); + else + dout("lreq %p failed to initiate notify %d\n", lreq, ret); + + linger_cancel(lreq); +out_put_lreq: + linger_put(lreq); + return ret; +} +EXPORT_SYMBOL(ceph_osdc_notify); + +/* + * Return the number of milliseconds since the watch was last + * confirmed, or an error. If there is an error, the watch is no + * longer valid, and should be destroyed with ceph_osdc_unwatch(). + */ +int ceph_osdc_watch_check(struct ceph_osd_client *osdc, + struct ceph_osd_linger_request *lreq) +{ + unsigned long stamp, age; + int ret; + + down_read(&osdc->lock); + mutex_lock(&lreq->lock); + stamp = lreq->watch_valid_thru; + if (!list_empty(&lreq->pending_lworks)) { + struct linger_work *lwork = + list_first_entry(&lreq->pending_lworks, + struct linger_work, + pending_item); + + if (time_before(lwork->queued_stamp, stamp)) + stamp = lwork->queued_stamp; + } + age = jiffies - stamp; + dout("%s lreq %p linger_id %llu age %lu last_error %d\n", __func__, + lreq, lreq->linger_id, age, lreq->last_error); + /* we are truncating to msecs, so return a safe upper bound */ + ret = lreq->last_error ?: 1 + jiffies_to_msecs(age); + + mutex_unlock(&lreq->lock); + up_read(&osdc->lock); + return ret; } -EXPORT_SYMBOL(ceph_osdc_sync); /* * Call all pending notify callbacks - for use after a watch is @@ -2646,6 +3868,13 @@ void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc) } EXPORT_SYMBOL(ceph_osdc_flush_notifies); +void ceph_osdc_maybe_request_map(struct ceph_osd_client *osdc) +{ + down_read(&osdc->lock); + maybe_request_map(osdc); + up_read(&osdc->lock); +} +EXPORT_SYMBOL(ceph_osdc_maybe_request_map); /* * init, shutdown @@ -2656,43 +3885,35 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) dout("init\n"); osdc->client = client; - osdc->osdmap = NULL; - init_rwsem(&osdc->map_sem); - init_completion(&osdc->map_waiters); - osdc->last_requested_map = 0; - mutex_init(&osdc->request_mutex); - osdc->last_tid = 0; + init_rwsem(&osdc->lock); osdc->osds = RB_ROOT; INIT_LIST_HEAD(&osdc->osd_lru); - osdc->requests = RB_ROOT; - INIT_LIST_HEAD(&osdc->req_lru); - INIT_LIST_HEAD(&osdc->req_unsent); - INIT_LIST_HEAD(&osdc->req_notarget); - INIT_LIST_HEAD(&osdc->req_linger); - osdc->num_requests = 0; + spin_lock_init(&osdc->osd_lru_lock); + osd_init(&osdc->homeless_osd); + osdc->homeless_osd.o_osdc = osdc; + osdc->homeless_osd.o_osd = CEPH_HOMELESS_OSD; + osdc->linger_requests = RB_ROOT; + osdc->map_checks = RB_ROOT; + osdc->linger_map_checks = RB_ROOT; INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout); INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout); - spin_lock_init(&osdc->event_lock); - osdc->event_tree = RB_ROOT; - osdc->event_count = 0; - - schedule_delayed_work(&osdc->osds_timeout_work, - round_jiffies_relative(osdc->client->options->osd_idle_ttl)); err = -ENOMEM; + osdc->osdmap = ceph_osdmap_alloc(); + if (!osdc->osdmap) + goto out; + osdc->req_mempool = mempool_create_slab_pool(10, ceph_osd_request_cache); if (!osdc->req_mempool) - goto out; + goto out_map; err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP, - OSD_OP_FRONT_LEN, 10, true, - "osd_op"); + PAGE_SIZE, 10, true, "osd_op"); if (err < 0) goto out_mempool; err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY, - OSD_OPREPLY_FRONT_LEN, 10, true, - "osd_op_reply"); + PAGE_SIZE, 10, true, "osd_op_reply"); if (err < 0) goto out_msgpool; @@ -2701,6 +3922,11 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) if (!osdc->notify_wq) goto out_msgpool_reply; + schedule_delayed_work(&osdc->timeout_work, + osdc->client->options->osd_keepalive_timeout); + schedule_delayed_work(&osdc->osds_timeout_work, + round_jiffies_relative(osdc->client->options->osd_idle_ttl)); + return 0; out_msgpool_reply: @@ -2709,6 +3935,8 @@ out_msgpool: ceph_msgpool_destroy(&osdc->msgpool_op); out_mempool: mempool_destroy(osdc->req_mempool); +out_map: + ceph_osdmap_destroy(osdc->osdmap); out: return err; } @@ -2719,11 +3947,25 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc) destroy_workqueue(osdc->notify_wq); cancel_delayed_work_sync(&osdc->timeout_work); cancel_delayed_work_sync(&osdc->osds_timeout_work); - if (osdc->osdmap) { - ceph_osdmap_destroy(osdc->osdmap); - osdc->osdmap = NULL; + + down_write(&osdc->lock); + while (!RB_EMPTY_ROOT(&osdc->osds)) { + struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds), + struct ceph_osd, o_node); + close_osd(osd); } - remove_all_osds(osdc); + up_write(&osdc->lock); + WARN_ON(atomic_read(&osdc->homeless_osd.o_ref) != 1); + osd_cleanup(&osdc->homeless_osd); + + WARN_ON(!list_empty(&osdc->osd_lru)); + WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_requests)); + WARN_ON(!RB_EMPTY_ROOT(&osdc->map_checks)); + WARN_ON(!RB_EMPTY_ROOT(&osdc->linger_map_checks)); + WARN_ON(atomic_read(&osdc->num_requests)); + WARN_ON(atomic_read(&osdc->num_homeless)); + + ceph_osdmap_destroy(osdc->osdmap); mempool_destroy(osdc->req_mempool); ceph_msgpool_destroy(&osdc->msgpool_op); ceph_msgpool_destroy(&osdc->msgpool_op_reply); @@ -2752,15 +3994,12 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, return PTR_ERR(req); /* it may be a short read due to an object boundary */ - osd_req_op_extent_osd_data_pages(req, 0, pages, *plen, page_align, false, false); dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n", off, *plen, *plen, page_align); - ceph_osdc_build_request(req, off, NULL, vino.snap, NULL); - rc = ceph_osdc_start_request(osdc, req, false); if (!rc) rc = ceph_osdc_wait_request(osdc, req); @@ -2786,7 +4025,6 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, int rc = 0; int page_align = off & ~PAGE_MASK; - BUG_ON(vino.snap != CEPH_NOSNAP); /* snapshots aren't writeable */ req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1, CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, @@ -2800,8 +4038,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, false, false); dout("writepages %llu~%llu (%llu bytes)\n", off, len, len); - ceph_osdc_build_request(req, off, snapc, CEPH_NOSNAP, mtime); - + req->r_mtime = *mtime; rc = ceph_osdc_start_request(osdc, req, true); if (!rc) rc = ceph_osdc_wait_request(osdc, req); @@ -2841,19 +4078,15 @@ EXPORT_SYMBOL(ceph_osdc_cleanup); static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) { struct ceph_osd *osd = con->private; - struct ceph_osd_client *osdc; + struct ceph_osd_client *osdc = osd->o_osdc; int type = le16_to_cpu(msg->hdr.type); - if (!osd) - goto out; - osdc = osd->o_osdc; - switch (type) { case CEPH_MSG_OSD_MAP: ceph_osdc_handle_map(osdc, msg); break; case CEPH_MSG_OSD_OPREPLY: - handle_reply(osdc, msg); + handle_reply(osd, msg); break; case CEPH_MSG_WATCH_NOTIFY: handle_watch_notify(osdc, msg); @@ -2863,7 +4096,7 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) pr_err("received unknown message type %d %s\n", type, ceph_msg_type_name(type)); } -out: + ceph_msg_put(msg); } @@ -2878,21 +4111,27 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, { struct ceph_osd *osd = con->private; struct ceph_osd_client *osdc = osd->o_osdc; - struct ceph_msg *m; + struct ceph_msg *m = NULL; struct ceph_osd_request *req; int front_len = le32_to_cpu(hdr->front_len); int data_len = le32_to_cpu(hdr->data_len); - u64 tid; + u64 tid = le64_to_cpu(hdr->tid); - tid = le64_to_cpu(hdr->tid); - mutex_lock(&osdc->request_mutex); - req = __lookup_request(osdc, tid); + down_read(&osdc->lock); + if (!osd_registered(osd)) { + dout("%s osd%d unknown, skipping\n", __func__, osd->o_osd); + *skip = 1; + goto out_unlock_osdc; + } + WARN_ON(osd->o_osd != le64_to_cpu(hdr->src.num)); + + mutex_lock(&osd->lock); + req = lookup_request(&osd->o_requests, tid); if (!req) { dout("%s osd%d tid %llu unknown, skipping\n", __func__, osd->o_osd, tid); - m = NULL; *skip = 1; - goto out; + goto out_unlock_session; } ceph_msg_revoke_incoming(req->r_reply); @@ -2904,7 +4143,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS, false); if (!m) - goto out; + goto out_unlock_session; ceph_msg_put(req->r_reply); req->r_reply = m; } @@ -2915,14 +4154,49 @@ static struct ceph_msg *get_reply(struct ceph_connection *con, req->r_reply->data_length); m = NULL; *skip = 1; - goto out; + goto out_unlock_session; } m = ceph_msg_get(req->r_reply); dout("get_reply tid %lld %p\n", tid, m); -out: - mutex_unlock(&osdc->request_mutex); +out_unlock_session: + mutex_unlock(&osd->lock); +out_unlock_osdc: + up_read(&osdc->lock); + return m; +} + +/* + * TODO: switch to a msg-owned pagelist + */ +static struct ceph_msg *alloc_msg_with_page_vector(struct ceph_msg_header *hdr) +{ + struct ceph_msg *m; + int type = le16_to_cpu(hdr->type); + u32 front_len = le32_to_cpu(hdr->front_len); + u32 data_len = le32_to_cpu(hdr->data_len); + + m = ceph_msg_new(type, front_len, GFP_NOIO, false); + if (!m) + return NULL; + + if (data_len) { + struct page **pages; + struct ceph_osd_data osd_data; + + pages = ceph_alloc_page_vector(calc_pages_for(0, data_len), + GFP_NOIO); + if (!pages) { + ceph_msg_put(m); + return NULL; + } + + ceph_osd_data_pages_init(&osd_data, pages, data_len, 0, false, + false); + ceph_osdc_msg_data_add(m, &osd_data); + } + return m; } @@ -2932,18 +4206,17 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con, { struct ceph_osd *osd = con->private; int type = le16_to_cpu(hdr->type); - int front = le32_to_cpu(hdr->front_len); *skip = 0; switch (type) { case CEPH_MSG_OSD_MAP: case CEPH_MSG_WATCH_NOTIFY: - return ceph_msg_new(type, front, GFP_NOFS, false); + return alloc_msg_with_page_vector(hdr); case CEPH_MSG_OSD_OPREPLY: return get_reply(con, hdr, skip); default: - pr_info("alloc_msg unexpected msg type %d from osd%d\n", type, - osd->o_osd); + pr_warn("%s osd%d unknown msg type %d, skipping\n", __func__, + osd->o_osd, type); *skip = 1; return NULL; } @@ -3047,5 +4320,5 @@ static const struct ceph_connection_operations osd_con_ops = { .alloc_msg = alloc_msg, .sign_message = osd_sign_message, .check_message_signature = osd_check_message_signature, - .fault = osd_reset, + .fault = osd_fault, }; diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 243574c8c..7e480bf75 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -380,23 +380,24 @@ bad: return ERR_PTR(err); } -/* - * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid - * to a set of osds) and primary_temp (explicit primary setting) - */ -static int pgid_cmp(struct ceph_pg l, struct ceph_pg r) +int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs) { - if (l.pool < r.pool) + if (lhs->pool < rhs->pool) return -1; - if (l.pool > r.pool) + if (lhs->pool > rhs->pool) return 1; - if (l.seed < r.seed) + if (lhs->seed < rhs->seed) return -1; - if (l.seed > r.seed) + if (lhs->seed > rhs->seed) return 1; + return 0; } +/* + * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid + * to a set of osds) and primary_temp (explicit primary setting) + */ static int __insert_pg_mapping(struct ceph_pg_mapping *new, struct rb_root *root) { @@ -409,7 +410,7 @@ static int __insert_pg_mapping(struct ceph_pg_mapping *new, while (*p) { parent = *p; pg = rb_entry(parent, struct ceph_pg_mapping, node); - c = pgid_cmp(new->pgid, pg->pgid); + c = ceph_pg_compare(&new->pgid, &pg->pgid); if (c < 0) p = &(*p)->rb_left; else if (c > 0) @@ -432,7 +433,7 @@ static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root, while (n) { pg = rb_entry(n, struct ceph_pg_mapping, node); - c = pgid_cmp(pgid, pg->pgid); + c = ceph_pg_compare(&pgid, &pg->pgid); if (c < 0) { n = n->rb_left; } else if (c > 0) { @@ -596,7 +597,9 @@ static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) *p += 4; /* skip crash_replay_interval */ if (ev >= 7) - *p += 1; /* skip min_size */ + pi->min_size = ceph_decode_8(p); + else + pi->min_size = pi->size - pi->size / 2; if (ev >= 8) *p += 8 + 8; /* skip quota_max_* */ @@ -616,6 +619,50 @@ static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) pi->write_tier = -1; } + if (ev >= 10) { + /* skip properties */ + num = ceph_decode_32(p); + while (num--) { + len = ceph_decode_32(p); + *p += len; /* key */ + len = ceph_decode_32(p); + *p += len; /* val */ + } + } + + if (ev >= 11) { + /* skip hit_set_params */ + *p += 1 + 1; /* versions */ + len = ceph_decode_32(p); + *p += len; + + *p += 4; /* skip hit_set_period */ + *p += 4; /* skip hit_set_count */ + } + + if (ev >= 12) + *p += 4; /* skip stripe_width */ + + if (ev >= 13) { + *p += 8; /* skip target_max_bytes */ + *p += 8; /* skip target_max_objects */ + *p += 4; /* skip cache_target_dirty_ratio_micro */ + *p += 4; /* skip cache_target_full_ratio_micro */ + *p += 4; /* skip cache_min_flush_age */ + *p += 4; /* skip cache_min_evict_age */ + } + + if (ev >= 14) { + /* skip erasure_code_profile */ + len = ceph_decode_32(p); + *p += len; + } + + if (ev >= 15) + pi->last_force_request_resend = ceph_decode_32(p); + else + pi->last_force_request_resend = 0; + /* ignore the rest */ *p = pool_end; @@ -660,6 +707,23 @@ bad: /* * osd map */ +struct ceph_osdmap *ceph_osdmap_alloc(void) +{ + struct ceph_osdmap *map; + + map = kzalloc(sizeof(*map), GFP_NOIO); + if (!map) + return NULL; + + map->pg_pools = RB_ROOT; + map->pool_max = -1; + map->pg_temp = RB_ROOT; + map->primary_temp = RB_ROOT; + mutex_init(&map->crush_scratch_mutex); + + return map; +} + void ceph_osdmap_destroy(struct ceph_osdmap *map) { dout("osdmap_destroy %p\n", map); @@ -1183,14 +1247,10 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) struct ceph_osdmap *map; int ret; - map = kzalloc(sizeof(*map), GFP_NOFS); + map = ceph_osdmap_alloc(); if (!map) return ERR_PTR(-ENOMEM); - map->pg_temp = RB_ROOT; - map->primary_temp = RB_ROOT; - mutex_init(&map->crush_scratch_mutex); - ret = osdmap_decode(p, end, map); if (ret) { ceph_osdmap_destroy(map); @@ -1201,11 +1261,119 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) } /* + * Encoding order is (new_up_client, new_state, new_weight). Need to + * apply in the (new_weight, new_state, new_up_client) order, because + * an incremental map may look like e.g. + * + * new_up_client: { osd=6, addr=... } # set osd_state and addr + * new_state: { osd=6, xorstate=EXISTS } # clear osd_state + */ +static int decode_new_up_state_weight(void **p, void *end, + struct ceph_osdmap *map) +{ + void *new_up_client; + void *new_state; + void *new_weight_end; + u32 len; + + new_up_client = *p; + ceph_decode_32_safe(p, end, len, e_inval); + len *= sizeof(u32) + sizeof(struct ceph_entity_addr); + ceph_decode_need(p, end, len, e_inval); + *p += len; + + new_state = *p; + ceph_decode_32_safe(p, end, len, e_inval); + len *= sizeof(u32) + sizeof(u8); + ceph_decode_need(p, end, len, e_inval); + *p += len; + + /* new_weight */ + ceph_decode_32_safe(p, end, len, e_inval); + while (len--) { + s32 osd; + u32 w; + + ceph_decode_need(p, end, 2*sizeof(u32), e_inval); + osd = ceph_decode_32(p); + w = ceph_decode_32(p); + BUG_ON(osd >= map->max_osd); + pr_info("osd%d weight 0x%x %s\n", osd, w, + w == CEPH_OSD_IN ? "(in)" : + (w == CEPH_OSD_OUT ? "(out)" : "")); + map->osd_weight[osd] = w; + + /* + * If we are marking in, set the EXISTS, and clear the + * AUTOOUT and NEW bits. + */ + if (w) { + map->osd_state[osd] |= CEPH_OSD_EXISTS; + map->osd_state[osd] &= ~(CEPH_OSD_AUTOOUT | + CEPH_OSD_NEW); + } + } + new_weight_end = *p; + + /* new_state (up/down) */ + *p = new_state; + len = ceph_decode_32(p); + while (len--) { + s32 osd; + u8 xorstate; + int ret; + + osd = ceph_decode_32(p); + xorstate = ceph_decode_8(p); + if (xorstate == 0) + xorstate = CEPH_OSD_UP; + BUG_ON(osd >= map->max_osd); + if ((map->osd_state[osd] & CEPH_OSD_UP) && + (xorstate & CEPH_OSD_UP)) + pr_info("osd%d down\n", osd); + if ((map->osd_state[osd] & CEPH_OSD_EXISTS) && + (xorstate & CEPH_OSD_EXISTS)) { + pr_info("osd%d does not exist\n", osd); + map->osd_weight[osd] = CEPH_OSD_IN; + ret = set_primary_affinity(map, osd, + CEPH_OSD_DEFAULT_PRIMARY_AFFINITY); + if (ret) + return ret; + memset(map->osd_addr + osd, 0, sizeof(*map->osd_addr)); + map->osd_state[osd] = 0; + } else { + map->osd_state[osd] ^= xorstate; + } + } + + /* new_up_client */ + *p = new_up_client; + len = ceph_decode_32(p); + while (len--) { + s32 osd; + struct ceph_entity_addr addr; + + osd = ceph_decode_32(p); + ceph_decode_copy(p, &addr, sizeof(addr)); + ceph_decode_addr(&addr); + BUG_ON(osd >= map->max_osd); + pr_info("osd%d up\n", osd); + map->osd_state[osd] |= CEPH_OSD_EXISTS | CEPH_OSD_UP; + map->osd_addr[osd] = addr; + } + + *p = new_weight_end; + return 0; + +e_inval: + return -EINVAL; +} + +/* * decode and apply an incremental map update. */ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, - struct ceph_osdmap *map, - struct ceph_messenger *msgr) + struct ceph_osdmap *map) { struct crush_map *newcrush = NULL; struct ceph_fsid fsid; @@ -1299,49 +1467,10 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, __remove_pg_pool(&map->pg_pools, pi); } - /* new_up */ - ceph_decode_32_safe(p, end, len, e_inval); - while (len--) { - u32 osd; - struct ceph_entity_addr addr; - ceph_decode_32_safe(p, end, osd, e_inval); - ceph_decode_copy_safe(p, end, &addr, sizeof(addr), e_inval); - ceph_decode_addr(&addr); - pr_info("osd%d up\n", osd); - BUG_ON(osd >= map->max_osd); - map->osd_state[osd] |= CEPH_OSD_UP | CEPH_OSD_EXISTS; - map->osd_addr[osd] = addr; - } - - /* new_state */ - ceph_decode_32_safe(p, end, len, e_inval); - while (len--) { - u32 osd; - u8 xorstate; - ceph_decode_32_safe(p, end, osd, e_inval); - xorstate = **(u8 **)p; - (*p)++; /* clean flag */ - if (xorstate == 0) - xorstate = CEPH_OSD_UP; - if (xorstate & CEPH_OSD_UP) - pr_info("osd%d down\n", osd); - if (osd < map->max_osd) - map->osd_state[osd] ^= xorstate; - } - - /* new_weight */ - ceph_decode_32_safe(p, end, len, e_inval); - while (len--) { - u32 osd, off; - ceph_decode_need(p, end, sizeof(u32)*2, e_inval); - osd = ceph_decode_32(p); - off = ceph_decode_32(p); - pr_info("osd%d weight 0x%x %s\n", osd, off, - off == CEPH_OSD_IN ? "(in)" : - (off == CEPH_OSD_OUT ? "(out)" : "")); - if (osd < map->max_osd) - map->osd_weight[osd] = off; - } + /* new_up_client, new_state, new_weight */ + err = decode_new_up_state_weight(p, end, map); + if (err) + goto bad; /* new_pg_temp */ err = decode_new_pg_temp(p, end, map); @@ -1381,8 +1510,252 @@ bad: return ERR_PTR(err); } +void ceph_oid_copy(struct ceph_object_id *dest, + const struct ceph_object_id *src) +{ + WARN_ON(!ceph_oid_empty(dest)); + + if (src->name != src->inline_name) { + /* very rare, see ceph_object_id definition */ + dest->name = kmalloc(src->name_len + 1, + GFP_NOIO | __GFP_NOFAIL); + } + memcpy(dest->name, src->name, src->name_len + 1); + dest->name_len = src->name_len; +} +EXPORT_SYMBOL(ceph_oid_copy); +static __printf(2, 0) +int oid_printf_vargs(struct ceph_object_id *oid, const char *fmt, va_list ap) +{ + int len; + + WARN_ON(!ceph_oid_empty(oid)); + + len = vsnprintf(oid->inline_name, sizeof(oid->inline_name), fmt, ap); + if (len >= sizeof(oid->inline_name)) + return len; + + oid->name_len = len; + return 0; +} + +/* + * If oid doesn't fit into inline buffer, BUG. + */ +void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + BUG_ON(oid_printf_vargs(oid, fmt, ap)); + va_end(ap); +} +EXPORT_SYMBOL(ceph_oid_printf); + +static __printf(3, 0) +int oid_aprintf_vargs(struct ceph_object_id *oid, gfp_t gfp, + const char *fmt, va_list ap) +{ + va_list aq; + int len; + + va_copy(aq, ap); + len = oid_printf_vargs(oid, fmt, aq); + va_end(aq); + + if (len) { + char *external_name; + + external_name = kmalloc(len + 1, gfp); + if (!external_name) + return -ENOMEM; + + oid->name = external_name; + WARN_ON(vsnprintf(oid->name, len + 1, fmt, ap) != len); + oid->name_len = len; + } + + return 0; +} + +/* + * If oid doesn't fit into inline buffer, allocate. + */ +int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp, + const char *fmt, ...) +{ + va_list ap; + int ret; + + va_start(ap, fmt); + ret = oid_aprintf_vargs(oid, gfp, fmt, ap); + va_end(ap); + + return ret; +} +EXPORT_SYMBOL(ceph_oid_aprintf); + +void ceph_oid_destroy(struct ceph_object_id *oid) +{ + if (oid->name != oid->inline_name) + kfree(oid->name); +} +EXPORT_SYMBOL(ceph_oid_destroy); + +/* + * osds only + */ +static bool __osds_equal(const struct ceph_osds *lhs, + const struct ceph_osds *rhs) +{ + if (lhs->size == rhs->size && + !memcmp(lhs->osds, rhs->osds, rhs->size * sizeof(rhs->osds[0]))) + return true; + + return false; +} + +/* + * osds + primary + */ +static bool osds_equal(const struct ceph_osds *lhs, + const struct ceph_osds *rhs) +{ + if (__osds_equal(lhs, rhs) && + lhs->primary == rhs->primary) + return true; + + return false; +} + +static bool osds_valid(const struct ceph_osds *set) +{ + /* non-empty set */ + if (set->size > 0 && set->primary >= 0) + return true; + + /* empty can_shift_osds set */ + if (!set->size && set->primary == -1) + return true; + + /* empty !can_shift_osds set - all NONE */ + if (set->size > 0 && set->primary == -1) { + int i; + + for (i = 0; i < set->size; i++) { + if (set->osds[i] != CRUSH_ITEM_NONE) + break; + } + if (i == set->size) + return true; + } + + return false; +} + +void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src) +{ + memcpy(dest->osds, src->osds, src->size * sizeof(src->osds[0])); + dest->size = src->size; + dest->primary = src->primary; +} + +static bool is_split(const struct ceph_pg *pgid, + u32 old_pg_num, + u32 new_pg_num) +{ + int old_bits = calc_bits_of(old_pg_num); + int old_mask = (1 << old_bits) - 1; + int n; + + WARN_ON(pgid->seed >= old_pg_num); + if (new_pg_num <= old_pg_num) + return false; + + for (n = 1; ; n++) { + int next_bit = n << (old_bits - 1); + u32 s = next_bit | pgid->seed; + + if (s < old_pg_num || s == pgid->seed) + continue; + if (s >= new_pg_num) + break; + + s = ceph_stable_mod(s, old_pg_num, old_mask); + if (s == pgid->seed) + return true; + } + + return false; +} + +bool ceph_is_new_interval(const struct ceph_osds *old_acting, + const struct ceph_osds *new_acting, + const struct ceph_osds *old_up, + const struct ceph_osds *new_up, + int old_size, + int new_size, + int old_min_size, + int new_min_size, + u32 old_pg_num, + u32 new_pg_num, + bool old_sort_bitwise, + bool new_sort_bitwise, + const struct ceph_pg *pgid) +{ + return !osds_equal(old_acting, new_acting) || + !osds_equal(old_up, new_up) || + old_size != new_size || + old_min_size != new_min_size || + is_split(pgid, old_pg_num, new_pg_num) || + old_sort_bitwise != new_sort_bitwise; +} + +static int calc_pg_rank(int osd, const struct ceph_osds *acting) +{ + int i; + + for (i = 0; i < acting->size; i++) { + if (acting->osds[i] == osd) + return i; + } + + return -1; +} + +static bool primary_changed(const struct ceph_osds *old_acting, + const struct ceph_osds *new_acting) +{ + if (!old_acting->size && !new_acting->size) + return false; /* both still empty */ + + if (!old_acting->size ^ !new_acting->size) + return true; /* was empty, now not, or vice versa */ + + if (old_acting->primary != new_acting->primary) + return true; /* primary changed */ + + if (calc_pg_rank(old_acting->primary, old_acting) != + calc_pg_rank(new_acting->primary, new_acting)) + return true; + + return false; /* same primary (tho replicas may have changed) */ +} + +bool ceph_osds_changed(const struct ceph_osds *old_acting, + const struct ceph_osds *new_acting, + bool any_change) +{ + if (primary_changed(old_acting, new_acting)) + return true; + + if (any_change && !__osds_equal(old_acting, new_acting)) + return true; + + return false; +} /* * calculate file layout from given offset, length. @@ -1455,30 +1828,71 @@ invalid: EXPORT_SYMBOL(ceph_calc_file_object_mapping); /* - * Calculate mapping of a (oloc, oid) pair to a PG. Should only be - * called with target's (oloc, oid), since tiering isn't taken into - * account. + * Map an object into a PG. + * + * Should only be called with target_oid and target_oloc (as opposed to + * base_oid and base_oloc), since tiering isn't taken into account. */ -int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap, - struct ceph_object_locator *oloc, - struct ceph_object_id *oid, - struct ceph_pg *pg_out) +int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + struct ceph_pg *raw_pgid) { struct ceph_pg_pool_info *pi; - pi = __lookup_pg_pool(&osdmap->pg_pools, oloc->pool); + pi = ceph_pg_pool_by_id(osdmap, oloc->pool); if (!pi) - return -EIO; + return -ENOENT; - pg_out->pool = oloc->pool; - pg_out->seed = ceph_str_hash(pi->object_hash, oid->name, - oid->name_len); + raw_pgid->pool = oloc->pool; + raw_pgid->seed = ceph_str_hash(pi->object_hash, oid->name, + oid->name_len); - dout("%s '%.*s' pgid %llu.%x\n", __func__, oid->name_len, oid->name, - pg_out->pool, pg_out->seed); + dout("%s %s -> raw_pgid %llu.%x\n", __func__, oid->name, + raw_pgid->pool, raw_pgid->seed); return 0; } -EXPORT_SYMBOL(ceph_oloc_oid_to_pg); +EXPORT_SYMBOL(ceph_object_locator_to_pg); + +/* + * Map a raw PG (full precision ps) into an actual PG. + */ +static void raw_pg_to_pg(struct ceph_pg_pool_info *pi, + const struct ceph_pg *raw_pgid, + struct ceph_pg *pgid) +{ + pgid->pool = raw_pgid->pool; + pgid->seed = ceph_stable_mod(raw_pgid->seed, pi->pg_num, + pi->pg_num_mask); +} + +/* + * Map a raw PG (full precision ps) into a placement ps (placement + * seed). Include pool id in that value so that different pools don't + * use the same seeds. + */ +static u32 raw_pg_to_pps(struct ceph_pg_pool_info *pi, + const struct ceph_pg *raw_pgid) +{ + if (pi->flags & CEPH_POOL_FLAG_HASHPSPOOL) { + /* hash pool id and seed so that pool PGs do not overlap */ + return crush_hash32_2(CRUSH_HASH_RJENKINS1, + ceph_stable_mod(raw_pgid->seed, + pi->pgp_num, + pi->pgp_num_mask), + raw_pgid->pool); + } else { + /* + * legacy behavior: add ps and pool together. this is + * not a great approach because the PGs from each pool + * will overlap on top of each other: 0.5 == 1.4 == + * 2.3 == ... + */ + return ceph_stable_mod(raw_pgid->seed, pi->pgp_num, + pi->pgp_num_mask) + + (unsigned)raw_pgid->pool; + } +} static int do_crush(struct ceph_osdmap *map, int ruleno, int x, int *result, int result_max, @@ -1497,84 +1911,92 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x, } /* - * Calculate raw (crush) set for given pgid. + * Calculate raw set (CRUSH output) for given PG. The result may + * contain nonexistent OSDs. ->primary is undefined for a raw set. * - * Return raw set length, or error. + * Placement seed (CRUSH input) is returned through @ppps. */ -static int pg_to_raw_osds(struct ceph_osdmap *osdmap, - struct ceph_pg_pool_info *pool, - struct ceph_pg pgid, u32 pps, int *osds) +static void pg_to_raw_osds(struct ceph_osdmap *osdmap, + struct ceph_pg_pool_info *pi, + const struct ceph_pg *raw_pgid, + struct ceph_osds *raw, + u32 *ppps) { + u32 pps = raw_pg_to_pps(pi, raw_pgid); int ruleno; int len; - /* crush */ - ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset, - pool->type, pool->size); + ceph_osds_init(raw); + if (ppps) + *ppps = pps; + + ruleno = crush_find_rule(osdmap->crush, pi->crush_ruleset, pi->type, + pi->size); if (ruleno < 0) { pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n", - pgid.pool, pool->crush_ruleset, pool->type, - pool->size); - return -ENOENT; + pi->id, pi->crush_ruleset, pi->type, pi->size); + return; } - len = do_crush(osdmap, ruleno, pps, osds, - min_t(int, pool->size, CEPH_PG_MAX_SIZE), + len = do_crush(osdmap, ruleno, pps, raw->osds, + min_t(int, pi->size, ARRAY_SIZE(raw->osds)), osdmap->osd_weight, osdmap->max_osd); if (len < 0) { pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n", - len, ruleno, pgid.pool, pool->crush_ruleset, - pool->type, pool->size); - return len; + len, ruleno, pi->id, pi->crush_ruleset, pi->type, + pi->size); + return; } - return len; + raw->size = len; } /* - * Given raw set, calculate up set and up primary. + * Given raw set, calculate up set and up primary. By definition of an + * up set, the result won't contain nonexistent or down OSDs. * - * Return up set length. *primary is set to up primary osd id, or -1 - * if up set is empty. + * This is done in-place - on return @set is the up set. If it's + * empty, ->primary will remain undefined. */ -static int raw_to_up_osds(struct ceph_osdmap *osdmap, - struct ceph_pg_pool_info *pool, - int *osds, int len, int *primary) +static void raw_to_up_osds(struct ceph_osdmap *osdmap, + struct ceph_pg_pool_info *pi, + struct ceph_osds *set) { - int up_primary = -1; int i; - if (ceph_can_shift_osds(pool)) { + /* ->primary is undefined for a raw set */ + BUG_ON(set->primary != -1); + + if (ceph_can_shift_osds(pi)) { int removed = 0; - for (i = 0; i < len; i++) { - if (ceph_osd_is_down(osdmap, osds[i])) { + /* shift left */ + for (i = 0; i < set->size; i++) { + if (ceph_osd_is_down(osdmap, set->osds[i])) { removed++; continue; } if (removed) - osds[i - removed] = osds[i]; + set->osds[i - removed] = set->osds[i]; } - - len -= removed; - if (len > 0) - up_primary = osds[0]; + set->size -= removed; + if (set->size > 0) + set->primary = set->osds[0]; } else { - for (i = len - 1; i >= 0; i--) { - if (ceph_osd_is_down(osdmap, osds[i])) - osds[i] = CRUSH_ITEM_NONE; + /* set down/dne devices to NONE */ + for (i = set->size - 1; i >= 0; i--) { + if (ceph_osd_is_down(osdmap, set->osds[i])) + set->osds[i] = CRUSH_ITEM_NONE; else - up_primary = osds[i]; + set->primary = set->osds[i]; } } - - *primary = up_primary; - return len; } -static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, - struct ceph_pg_pool_info *pool, - int *osds, int len, int *primary) +static void apply_primary_affinity(struct ceph_osdmap *osdmap, + struct ceph_pg_pool_info *pi, + u32 pps, + struct ceph_osds *up) { int i; int pos = -1; @@ -1586,8 +2008,8 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, if (!osdmap->osd_primary_affinity) return; - for (i = 0; i < len; i++) { - int osd = osds[i]; + for (i = 0; i < up->size; i++) { + int osd = up->osds[i]; if (osd != CRUSH_ITEM_NONE && osdmap->osd_primary_affinity[osd] != @@ -1595,7 +2017,7 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, break; } } - if (i == len) + if (i == up->size) return; /* @@ -1603,8 +2025,8 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, * osd into the hash/rng so that a proportional fraction of an * osd's pgs get rejected as primary. */ - for (i = 0; i < len; i++) { - int osd = osds[i]; + for (i = 0; i < up->size; i++) { + int osd = up->osds[i]; u32 aff; if (osd == CRUSH_ITEM_NONE) @@ -1629,135 +2051,110 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, if (pos < 0) return; - *primary = osds[pos]; + up->primary = up->osds[pos]; - if (ceph_can_shift_osds(pool) && pos > 0) { + if (ceph_can_shift_osds(pi) && pos > 0) { /* move the new primary to the front */ for (i = pos; i > 0; i--) - osds[i] = osds[i - 1]; - osds[0] = *primary; + up->osds[i] = up->osds[i - 1]; + up->osds[0] = up->primary; } } /* - * Given up set, apply pg_temp and primary_temp mappings. + * Get pg_temp and primary_temp mappings for given PG. * - * Return acting set length. *primary is set to acting primary osd id, - * or -1 if acting set is empty. + * Note that a PG may have none, only pg_temp, only primary_temp or + * both pg_temp and primary_temp mappings. This means @temp isn't + * always a valid OSD set on return: in the "only primary_temp" case, + * @temp will have its ->primary >= 0 but ->size == 0. */ -static int apply_temps(struct ceph_osdmap *osdmap, - struct ceph_pg_pool_info *pool, struct ceph_pg pgid, - int *osds, int len, int *primary) +static void get_temp_osds(struct ceph_osdmap *osdmap, + struct ceph_pg_pool_info *pi, + const struct ceph_pg *raw_pgid, + struct ceph_osds *temp) { + struct ceph_pg pgid; struct ceph_pg_mapping *pg; - int temp_len; - int temp_primary; int i; - /* raw_pg -> pg */ - pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num, - pool->pg_num_mask); + raw_pg_to_pg(pi, raw_pgid, &pgid); + ceph_osds_init(temp); /* pg_temp? */ pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); if (pg) { - temp_len = 0; - temp_primary = -1; - for (i = 0; i < pg->pg_temp.len; i++) { if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) { - if (ceph_can_shift_osds(pool)) + if (ceph_can_shift_osds(pi)) continue; - else - osds[temp_len++] = CRUSH_ITEM_NONE; + + temp->osds[temp->size++] = CRUSH_ITEM_NONE; } else { - osds[temp_len++] = pg->pg_temp.osds[i]; + temp->osds[temp->size++] = pg->pg_temp.osds[i]; } } /* apply pg_temp's primary */ - for (i = 0; i < temp_len; i++) { - if (osds[i] != CRUSH_ITEM_NONE) { - temp_primary = osds[i]; + for (i = 0; i < temp->size; i++) { + if (temp->osds[i] != CRUSH_ITEM_NONE) { + temp->primary = temp->osds[i]; break; } } - } else { - temp_len = len; - temp_primary = *primary; } /* primary_temp? */ pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid); if (pg) - temp_primary = pg->primary_temp.osd; - - *primary = temp_primary; - return temp_len; + temp->primary = pg->primary_temp.osd; } /* - * Calculate acting set for given pgid. + * Map a PG to its acting set as well as its up set. * - * Return acting set length, or error. *primary is set to acting - * primary osd id, or -1 if acting set is empty or on error. + * Acting set is used for data mapping purposes, while up set can be + * recorded for detecting interval changes and deciding whether to + * resend a request. */ -int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, - int *osds, int *primary) +void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap, + const struct ceph_pg *raw_pgid, + struct ceph_osds *up, + struct ceph_osds *acting) { - struct ceph_pg_pool_info *pool; + struct ceph_pg_pool_info *pi; u32 pps; - int len; - pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool); - if (!pool) { - *primary = -1; - return -ENOENT; + pi = ceph_pg_pool_by_id(osdmap, raw_pgid->pool); + if (!pi) { + ceph_osds_init(up); + ceph_osds_init(acting); + goto out; } - if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) { - /* hash pool id and seed so that pool PGs do not overlap */ - pps = crush_hash32_2(CRUSH_HASH_RJENKINS1, - ceph_stable_mod(pgid.seed, pool->pgp_num, - pool->pgp_num_mask), - pgid.pool); - } else { - /* - * legacy behavior: add ps and pool together. this is - * not a great approach because the PGs from each pool - * will overlap on top of each other: 0.5 == 1.4 == - * 2.3 == ... - */ - pps = ceph_stable_mod(pgid.seed, pool->pgp_num, - pool->pgp_num_mask) + - (unsigned)pgid.pool; + pg_to_raw_osds(osdmap, pi, raw_pgid, up, &pps); + raw_to_up_osds(osdmap, pi, up); + apply_primary_affinity(osdmap, pi, pps, up); + get_temp_osds(osdmap, pi, raw_pgid, acting); + if (!acting->size) { + memcpy(acting->osds, up->osds, up->size * sizeof(up->osds[0])); + acting->size = up->size; + if (acting->primary == -1) + acting->primary = up->primary; } - - len = pg_to_raw_osds(osdmap, pool, pgid, pps, osds); - if (len < 0) { - *primary = -1; - return len; - } - - len = raw_to_up_osds(osdmap, pool, osds, len, primary); - - apply_primary_affinity(osdmap, pps, pool, osds, len, primary); - - len = apply_temps(osdmap, pool, pgid, osds, len, primary); - - return len; +out: + WARN_ON(!osds_valid(up) || !osds_valid(acting)); } /* - * Return primary osd for given pgid, or -1 if none. + * Return acting primary for given PG, or -1 if none. */ -int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid) +int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap, + const struct ceph_pg *raw_pgid) { - int osds[CEPH_PG_MAX_SIZE]; - int primary; - - ceph_calc_pg_acting(osdmap, pgid, osds, &primary); + struct ceph_osds up, acting; - return primary; + ceph_pg_to_up_acting_osds(osdmap, raw_pgid, &up, &acting); + return acting.primary; } -EXPORT_SYMBOL(ceph_calc_pg_primary); +EXPORT_SYMBOL(ceph_pg_to_acting_primary); diff --git a/net/compat.c b/net/compat.c index 5cfd26a00..1cd2ec046 100644 --- a/net/compat.c +++ b/net/compat.c @@ -309,8 +309,8 @@ void scm_detach_fds_compat(struct msghdr *kmsg, struct scm_cookie *scm) __scm_destroy(scm); } -static int do_set_attach_filter(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) +/* allocate a 64-bit sock_fprog on the user stack for duration of syscall. */ +struct sock_fprog __user *get_compat_bpf_fprog(char __user *optval) { struct compat_sock_fprog __user *fprog32 = (struct compat_sock_fprog __user *)optval; struct sock_fprog __user *kfprog = compat_alloc_user_space(sizeof(struct sock_fprog)); @@ -323,6 +323,19 @@ static int do_set_attach_filter(struct socket *sock, int level, int optname, __get_user(ptr, &fprog32->filter) || __put_user(len, &kfprog->len) || __put_user(compat_ptr(ptr), &kfprog->filter)) + return NULL; + + return kfprog; +} +EXPORT_SYMBOL_GPL(get_compat_bpf_fprog); + +static int do_set_attach_filter(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct sock_fprog __user *kfprog; + + kfprog = get_compat_bpf_fprog(optval); + if (!kfprog) return -EFAULT; return sock_setsockopt(sock, level, optname, (char __user *)kfprog, @@ -354,7 +367,8 @@ static int do_set_sock_timeout(struct socket *sock, int level, static int compat_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) { - if (optname == SO_ATTACH_FILTER) + if (optname == SO_ATTACH_FILTER || + optname == SO_ATTACH_REUSEPORT_CBPF) return do_set_attach_filter(sock, level, optname, optval, optlen); if (optname == SO_RCVTIMEO || optname == SO_SNDTIMEO) diff --git a/net/core/datagram.c b/net/core/datagram.c index fa9dc6450..b7de71f8d 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -301,16 +301,19 @@ void skb_free_datagram(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(skb_free_datagram); -void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb) +void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len) { bool slow; if (likely(atomic_read(&skb->users) == 1)) smp_rmb(); - else if (likely(!atomic_dec_and_test(&skb->users))) + else if (likely(!atomic_dec_and_test(&skb->users))) { + sk_peek_offset_bwd(sk, len); return; + } slow = lock_sock_fast(sk); + sk_peek_offset_bwd(sk, len); skb_orphan(skb); sk_mem_reclaim_partial(sk); unlock_sock_fast(sk, slow); @@ -318,7 +321,7 @@ void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb) /* skb is now orphaned, can be freed outside of locked section */ __kfree_skb(skb); } -EXPORT_SYMBOL(skb_free_datagram_locked); +EXPORT_SYMBOL(__skb_free_datagram_locked); /** * skb_kill_datagram - Free a datagram skbuff forcibly diff --git a/net/core/dev.c b/net/core/dev.c index 5c925ac50..904ff431d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1741,7 +1741,7 @@ static inline void net_timestamp_set(struct sk_buff *skb) __net_timestamp(SKB); \ } \ -bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb) +bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb) { unsigned int len; @@ -1850,7 +1850,7 @@ static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) * taps currently in use. */ -static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) +void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) { struct packet_type *ptype; struct sk_buff *skb2 = NULL; @@ -1907,6 +1907,7 @@ out_unlock: pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); rcu_read_unlock(); } +EXPORT_SYMBOL_GPL(dev_queue_xmit_nit); /** * netif_setup_tc - Handle tc mappings on real_num_tx_queues change @@ -2711,6 +2712,19 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb, return ERR_PTR(err); } + /* Only report GSO partial support if it will enable us to + * support segmentation on this frame without needing additional + * work. + */ + if (features & NETIF_F_GSO_PARTIAL) { + netdev_features_t partial_features = NETIF_F_GSO_ROBUST; + struct net_device *dev = skb->dev; + + partial_features |= dev->features & dev->gso_partial_features; + if (!skb_gso_ok(skb, features | partial_features)) + features &= ~NETIF_F_GSO_PARTIAL; + } + BUILD_BUG_ON(SKB_SGO_CB_OFFSET + sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb)); @@ -2825,14 +2839,45 @@ static netdev_features_t dflt_features_check(const struct sk_buff *skb, return vlan_features_check(skb, features); } +static netdev_features_t gso_features_check(const struct sk_buff *skb, + struct net_device *dev, + netdev_features_t features) +{ + u16 gso_segs = skb_shinfo(skb)->gso_segs; + + if (gso_segs > dev->gso_max_segs) + return features & ~NETIF_F_GSO_MASK; + + /* Support for GSO partial features requires software + * intervention before we can actually process the packets + * so we need to strip support for any partial features now + * and we can pull them back in after we have partially + * segmented the frame. + */ + if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL)) + features &= ~dev->gso_partial_features; + + /* Make sure to clear the IPv4 ID mangling feature if the + * IPv4 header has the potential to be fragmented. + */ + if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) { + struct iphdr *iph = skb->encapsulation ? + inner_ip_hdr(skb) : ip_hdr(skb); + + if (!(iph->frag_off & htons(IP_DF))) + features &= ~NETIF_F_TSO_MANGLEID; + } + + return features; +} + netdev_features_t netif_skb_features(struct sk_buff *skb) { struct net_device *dev = skb->dev; netdev_features_t features = dev->features; - u16 gso_segs = skb_shinfo(skb)->gso_segs; - if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs) - features &= ~NETIF_F_GSO_MASK; + if (skb_is_gso(skb)) + features = gso_features_check(skb, dev, features); /* If encapsulation offload request, verify we are testing * hardware encapsulation features instead of standard @@ -2915,9 +2960,6 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device { netdev_features_t features; - if (skb->next) - return skb; - features = netif_skb_features(skb); skb = validate_xmit_vlan(skb, features); if (unlikely(!skb)) @@ -2960,6 +3002,7 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device out_kfree_skb: kfree_skb(skb); out_null: + atomic_long_inc(&dev->tx_dropped); return NULL; } @@ -3143,12 +3186,12 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) case TC_ACT_SHOT: qdisc_qstats_cpu_drop(cl->q); *ret = NET_XMIT_DROP; - goto drop; + kfree_skb(skb); + return NULL; case TC_ACT_STOLEN: case TC_ACT_QUEUED: *ret = NET_XMIT_SUCCESS; -drop: - kfree_skb(skb); + consume_skb(skb); return NULL; case TC_ACT_REDIRECT: /* No need to push/pop skb's mac_header here on egress! */ @@ -3349,7 +3392,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) skb = validate_xmit_skb(skb, dev); if (!skb) - goto drop; + goto out; HARD_TX_LOCK(dev, txq, cpu); @@ -3376,7 +3419,6 @@ recursion_alert: } rc = -ENETDOWN; -drop: rcu_read_unlock_bh(); atomic_long_inc(&dev->tx_dropped); @@ -3428,6 +3470,7 @@ u32 rps_cpu_mask __read_mostly; EXPORT_SYMBOL(rps_cpu_mask); struct static_key rps_needed __read_mostly; +EXPORT_SYMBOL(rps_needed); static struct rps_dev_flow * set_rps_cpu(struct net_device *dev, struct sk_buff *skb, @@ -3914,9 +3957,11 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, break; case TC_ACT_SHOT: qdisc_qstats_cpu_drop(cl->q); + kfree_skb(skb); + return NULL; case TC_ACT_STOLEN: case TC_ACT_QUEUED: - kfree_skb(skb); + consume_skb(skb); return NULL; case TC_ACT_REDIRECT: /* skb_mac_header check was done by cls/act_bpf, so @@ -4440,6 +4485,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff NAPI_GRO_CB(skb)->free = 0; NAPI_GRO_CB(skb)->encap_mark = 0; NAPI_GRO_CB(skb)->is_fou = 0; + NAPI_GRO_CB(skb)->is_atomic = 1; NAPI_GRO_CB(skb)->gro_remcsum_start = 0; /* Setup for GRO checksum validation */ @@ -4664,6 +4710,8 @@ static struct sk_buff *napi_frags_skb(struct napi_struct *napi) if (unlikely(skb_gro_header_hard(skb, hlen))) { eth = skb_gro_header_slow(skb, hlen, 0); if (unlikely(!eth)) { + net_warn_ratelimited("%s: dropping impossible skb from %s\n", + __func__, napi->dev->name); napi_reuse_skb(napi, skb); return NULL; } @@ -4938,8 +4986,8 @@ bool sk_busy_loop(struct sock *sk, int nonblock) netpoll_poll_unlock(have); } if (rc > 0) - NET_ADD_STATS_BH(sock_net(sk), - LINUX_MIB_BUSYPOLLRXPACKETS, rc); + __NET_ADD_STATS(sock_net(sk), + LINUX_MIB_BUSYPOLLRXPACKETS, rc); local_bh_enable(); if (rc == LL_FLUSH_FAILED) @@ -6676,6 +6724,10 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, features &= ~NETIF_F_TSO6; } + /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */ + if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO)) + features &= ~NETIF_F_TSO_MANGLEID; + /* TSO ECN requires that TSO is present as well. */ if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN) features &= ~NETIF_F_TSO_ECN; @@ -6704,6 +6756,14 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, } } + /* GSO partial features require GSO partial be set */ + if ((features & dev->gso_partial_features) && + !(features & NETIF_F_GSO_PARTIAL)) { + netdev_dbg(dev, + "Dropping partially supported GSO features since no GSO partial.\n"); + features &= ~dev->gso_partial_features; + } + #ifdef CONFIG_NET_RX_BUSY_POLL if (dev->netdev_ops->ndo_busy_poll) features |= NETIF_F_BUSY_POLL; @@ -6974,9 +7034,22 @@ int register_netdevice(struct net_device *dev) dev->features |= NETIF_F_SOFT_FEATURES; dev->wanted_features = dev->features & dev->hw_features; - if (!(dev->flags & IFF_LOOPBACK)) { + if (!(dev->flags & IFF_LOOPBACK)) dev->hw_features |= NETIF_F_NOCACHE_COPY; - } + + /* If IPv4 TCP segmentation offload is supported we should also + * allow the device to enable segmenting the frame with the option + * of ignoring a static IP ID value. This doesn't enable the + * feature itself but allows the user to enable it later. + */ + if (dev->hw_features & NETIF_F_TSO) + dev->hw_features |= NETIF_F_TSO_MANGLEID; + if (dev->vlan_features & NETIF_F_TSO) + dev->vlan_features |= NETIF_F_TSO_MANGLEID; + if (dev->mpls_features & NETIF_F_TSO) + dev->mpls_features |= NETIF_F_TSO_MANGLEID; + if (dev->hw_enc_features & NETIF_F_TSO) + dev->hw_enc_features |= NETIF_F_TSO_MANGLEID; /* Make NETIF_F_HIGHDMA inheritable to VLAN devices. */ @@ -6984,7 +7057,7 @@ int register_netdevice(struct net_device *dev) /* Make NETIF_F_SG inheritable to tunnel devices. */ - dev->hw_enc_features |= NETIF_F_SG; + dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL; /* Make NETIF_F_SG inheritable to MPLS. */ @@ -7427,7 +7500,6 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->gso_max_size = GSO_MAX_SIZE; dev->gso_max_segs = GSO_MAX_SEGS; - dev->gso_min_segs = 0; INIT_LIST_HEAD(&dev->napi_list); INIT_LIST_HEAD(&dev->unreg_list); diff --git a/net/core/devlink.c b/net/core/devlink.c index 590fa561c..933e8d4d3 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -119,7 +119,171 @@ static struct devlink_port *devlink_port_get_from_info(struct devlink *devlink, return devlink_port_get_from_attrs(devlink, info->attrs); } -#define DEVLINK_NL_FLAG_NEED_PORT BIT(0) +struct devlink_sb { + struct list_head list; + unsigned int index; + u32 size; + u16 ingress_pools_count; + u16 egress_pools_count; + u16 ingress_tc_count; + u16 egress_tc_count; +}; + +static u16 devlink_sb_pool_count(struct devlink_sb *devlink_sb) +{ + return devlink_sb->ingress_pools_count + devlink_sb->egress_pools_count; +} + +static struct devlink_sb *devlink_sb_get_by_index(struct devlink *devlink, + unsigned int sb_index) +{ + struct devlink_sb *devlink_sb; + + list_for_each_entry(devlink_sb, &devlink->sb_list, list) { + if (devlink_sb->index == sb_index) + return devlink_sb; + } + return NULL; +} + +static bool devlink_sb_index_exists(struct devlink *devlink, + unsigned int sb_index) +{ + return devlink_sb_get_by_index(devlink, sb_index); +} + +static struct devlink_sb *devlink_sb_get_from_attrs(struct devlink *devlink, + struct nlattr **attrs) +{ + if (attrs[DEVLINK_ATTR_SB_INDEX]) { + u32 sb_index = nla_get_u32(attrs[DEVLINK_ATTR_SB_INDEX]); + struct devlink_sb *devlink_sb; + + devlink_sb = devlink_sb_get_by_index(devlink, sb_index); + if (!devlink_sb) + return ERR_PTR(-ENODEV); + return devlink_sb; + } + return ERR_PTR(-EINVAL); +} + +static struct devlink_sb *devlink_sb_get_from_info(struct devlink *devlink, + struct genl_info *info) +{ + return devlink_sb_get_from_attrs(devlink, info->attrs); +} + +static int devlink_sb_pool_index_get_from_attrs(struct devlink_sb *devlink_sb, + struct nlattr **attrs, + u16 *p_pool_index) +{ + u16 val; + + if (!attrs[DEVLINK_ATTR_SB_POOL_INDEX]) + return -EINVAL; + + val = nla_get_u16(attrs[DEVLINK_ATTR_SB_POOL_INDEX]); + if (val >= devlink_sb_pool_count(devlink_sb)) + return -EINVAL; + *p_pool_index = val; + return 0; +} + +static int devlink_sb_pool_index_get_from_info(struct devlink_sb *devlink_sb, + struct genl_info *info, + u16 *p_pool_index) +{ + return devlink_sb_pool_index_get_from_attrs(devlink_sb, info->attrs, + p_pool_index); +} + +static int +devlink_sb_pool_type_get_from_attrs(struct nlattr **attrs, + enum devlink_sb_pool_type *p_pool_type) +{ + u8 val; + + if (!attrs[DEVLINK_ATTR_SB_POOL_TYPE]) + return -EINVAL; + + val = nla_get_u8(attrs[DEVLINK_ATTR_SB_POOL_TYPE]); + if (val != DEVLINK_SB_POOL_TYPE_INGRESS && + val != DEVLINK_SB_POOL_TYPE_EGRESS) + return -EINVAL; + *p_pool_type = val; + return 0; +} + +static int +devlink_sb_pool_type_get_from_info(struct genl_info *info, + enum devlink_sb_pool_type *p_pool_type) +{ + return devlink_sb_pool_type_get_from_attrs(info->attrs, p_pool_type); +} + +static int +devlink_sb_th_type_get_from_attrs(struct nlattr **attrs, + enum devlink_sb_threshold_type *p_th_type) +{ + u8 val; + + if (!attrs[DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE]) + return -EINVAL; + + val = nla_get_u8(attrs[DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE]); + if (val != DEVLINK_SB_THRESHOLD_TYPE_STATIC && + val != DEVLINK_SB_THRESHOLD_TYPE_DYNAMIC) + return -EINVAL; + *p_th_type = val; + return 0; +} + +static int +devlink_sb_th_type_get_from_info(struct genl_info *info, + enum devlink_sb_threshold_type *p_th_type) +{ + return devlink_sb_th_type_get_from_attrs(info->attrs, p_th_type); +} + +static int +devlink_sb_tc_index_get_from_attrs(struct devlink_sb *devlink_sb, + struct nlattr **attrs, + enum devlink_sb_pool_type pool_type, + u16 *p_tc_index) +{ + u16 val; + + if (!attrs[DEVLINK_ATTR_SB_TC_INDEX]) + return -EINVAL; + + val = nla_get_u16(attrs[DEVLINK_ATTR_SB_TC_INDEX]); + if (pool_type == DEVLINK_SB_POOL_TYPE_INGRESS && + val >= devlink_sb->ingress_tc_count) + return -EINVAL; + if (pool_type == DEVLINK_SB_POOL_TYPE_EGRESS && + val >= devlink_sb->egress_tc_count) + return -EINVAL; + *p_tc_index = val; + return 0; +} + +static int +devlink_sb_tc_index_get_from_info(struct devlink_sb *devlink_sb, + struct genl_info *info, + enum devlink_sb_pool_type pool_type, + u16 *p_tc_index) +{ + return devlink_sb_tc_index_get_from_attrs(devlink_sb, info->attrs, + pool_type, p_tc_index); +} + +#define DEVLINK_NL_FLAG_NEED_DEVLINK BIT(0) +#define DEVLINK_NL_FLAG_NEED_PORT BIT(1) +#define DEVLINK_NL_FLAG_NEED_SB BIT(2) +#define DEVLINK_NL_FLAG_LOCK_PORTS BIT(3) + /* port is not needed but we need to ensure they don't + * change in the middle of command + */ static int devlink_nl_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, struct genl_info *info) @@ -132,8 +296,9 @@ static int devlink_nl_pre_doit(const struct genl_ops *ops, mutex_unlock(&devlink_mutex); return PTR_ERR(devlink); } - info->user_ptr[0] = devlink; - if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) { + if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_DEVLINK) { + info->user_ptr[0] = devlink; + } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) { struct devlink_port *devlink_port; mutex_lock(&devlink_port_mutex); @@ -143,7 +308,22 @@ static int devlink_nl_pre_doit(const struct genl_ops *ops, mutex_unlock(&devlink_mutex); return PTR_ERR(devlink_port); } - info->user_ptr[1] = devlink_port; + info->user_ptr[0] = devlink_port; + } + if (ops->internal_flags & DEVLINK_NL_FLAG_LOCK_PORTS) { + mutex_lock(&devlink_port_mutex); + } + if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_SB) { + struct devlink_sb *devlink_sb; + + devlink_sb = devlink_sb_get_from_info(devlink, info); + if (IS_ERR(devlink_sb)) { + if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) + mutex_unlock(&devlink_port_mutex); + mutex_unlock(&devlink_mutex); + return PTR_ERR(devlink_sb); + } + info->user_ptr[1] = devlink_sb; } return 0; } @@ -151,7 +331,8 @@ static int devlink_nl_pre_doit(const struct genl_ops *ops, static void devlink_nl_post_doit(const struct genl_ops *ops, struct sk_buff *skb, struct genl_info *info) { - if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) + if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT || + ops->internal_flags & DEVLINK_NL_FLAG_LOCK_PORTS) mutex_unlock(&devlink_port_mutex); mutex_unlock(&devlink_mutex); } @@ -356,8 +537,8 @@ out: static int devlink_nl_cmd_port_get_doit(struct sk_buff *skb, struct genl_info *info) { - struct devlink *devlink = info->user_ptr[0]; - struct devlink_port *devlink_port = info->user_ptr[1]; + struct devlink_port *devlink_port = info->user_ptr[0]; + struct devlink *devlink = devlink_port->devlink; struct sk_buff *msg; int err; @@ -436,8 +617,8 @@ static int devlink_port_type_set(struct devlink *devlink, static int devlink_nl_cmd_port_set_doit(struct sk_buff *skb, struct genl_info *info) { - struct devlink *devlink = info->user_ptr[0]; - struct devlink_port *devlink_port = info->user_ptr[1]; + struct devlink_port *devlink_port = info->user_ptr[0]; + struct devlink *devlink = devlink_port->devlink; int err; if (info->attrs[DEVLINK_ATTR_PORT_TYPE]) { @@ -497,12 +678,735 @@ static int devlink_nl_cmd_port_unsplit_doit(struct sk_buff *skb, return devlink_port_unsplit(devlink, port_index); } +static int devlink_nl_sb_fill(struct sk_buff *msg, struct devlink *devlink, + struct devlink_sb *devlink_sb, + enum devlink_command cmd, u32 portid, + u32 seq, int flags) +{ + void *hdr; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index)) + goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_SB_SIZE, devlink_sb->size)) + goto nla_put_failure; + if (nla_put_u16(msg, DEVLINK_ATTR_SB_INGRESS_POOL_COUNT, + devlink_sb->ingress_pools_count)) + goto nla_put_failure; + if (nla_put_u16(msg, DEVLINK_ATTR_SB_EGRESS_POOL_COUNT, + devlink_sb->egress_pools_count)) + goto nla_put_failure; + if (nla_put_u16(msg, DEVLINK_ATTR_SB_INGRESS_TC_COUNT, + devlink_sb->ingress_tc_count)) + goto nla_put_failure; + if (nla_put_u16(msg, DEVLINK_ATTR_SB_EGRESS_TC_COUNT, + devlink_sb->egress_tc_count)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int devlink_nl_cmd_sb_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_sb *devlink_sb = info->user_ptr[1]; + struct sk_buff *msg; + int err; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_sb_fill(msg, devlink, devlink_sb, + DEVLINK_CMD_SB_NEW, + info->snd_portid, info->snd_seq, 0); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int devlink_nl_cmd_sb_get_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct devlink *devlink; + struct devlink_sb *devlink_sb; + int start = cb->args[0]; + int idx = 0; + int err; + + mutex_lock(&devlink_mutex); + list_for_each_entry(devlink, &devlink_list, list) { + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + continue; + list_for_each_entry(devlink_sb, &devlink->sb_list, list) { + if (idx < start) { + idx++; + continue; + } + err = devlink_nl_sb_fill(msg, devlink, devlink_sb, + DEVLINK_CMD_SB_NEW, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI); + if (err) + goto out; + idx++; + } + } +out: + mutex_unlock(&devlink_mutex); + + cb->args[0] = idx; + return msg->len; +} + +static int devlink_nl_sb_pool_fill(struct sk_buff *msg, struct devlink *devlink, + struct devlink_sb *devlink_sb, + u16 pool_index, enum devlink_command cmd, + u32 portid, u32 seq, int flags) +{ + struct devlink_sb_pool_info pool_info; + void *hdr; + int err; + + err = devlink->ops->sb_pool_get(devlink, devlink_sb->index, + pool_index, &pool_info); + if (err) + return err; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index)) + goto nla_put_failure; + if (nla_put_u16(msg, DEVLINK_ATTR_SB_POOL_INDEX, pool_index)) + goto nla_put_failure; + if (nla_put_u8(msg, DEVLINK_ATTR_SB_POOL_TYPE, pool_info.pool_type)) + goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_SB_POOL_SIZE, pool_info.size)) + goto nla_put_failure; + if (nla_put_u8(msg, DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE, + pool_info.threshold_type)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int devlink_nl_cmd_sb_pool_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_sb *devlink_sb = info->user_ptr[1]; + struct sk_buff *msg; + u16 pool_index; + int err; + + err = devlink_sb_pool_index_get_from_info(devlink_sb, info, + &pool_index); + if (err) + return err; + + if (!devlink->ops || !devlink->ops->sb_pool_get) + return -EOPNOTSUPP; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_sb_pool_fill(msg, devlink, devlink_sb, pool_index, + DEVLINK_CMD_SB_POOL_NEW, + info->snd_portid, info->snd_seq, 0); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int __sb_pool_get_dumpit(struct sk_buff *msg, int start, int *p_idx, + struct devlink *devlink, + struct devlink_sb *devlink_sb, + u32 portid, u32 seq) +{ + u16 pool_count = devlink_sb_pool_count(devlink_sb); + u16 pool_index; + int err; + + for (pool_index = 0; pool_index < pool_count; pool_index++) { + if (*p_idx < start) { + (*p_idx)++; + continue; + } + err = devlink_nl_sb_pool_fill(msg, devlink, + devlink_sb, + pool_index, + DEVLINK_CMD_SB_POOL_NEW, + portid, seq, NLM_F_MULTI); + if (err) + return err; + (*p_idx)++; + } + return 0; +} + +static int devlink_nl_cmd_sb_pool_get_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct devlink *devlink; + struct devlink_sb *devlink_sb; + int start = cb->args[0]; + int idx = 0; + int err; + + mutex_lock(&devlink_mutex); + list_for_each_entry(devlink, &devlink_list, list) { + if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) || + !devlink->ops || !devlink->ops->sb_pool_get) + continue; + list_for_each_entry(devlink_sb, &devlink->sb_list, list) { + err = __sb_pool_get_dumpit(msg, start, &idx, devlink, + devlink_sb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq); + if (err && err != -EOPNOTSUPP) + goto out; + } + } +out: + mutex_unlock(&devlink_mutex); + + cb->args[0] = idx; + return msg->len; +} + +static int devlink_sb_pool_set(struct devlink *devlink, unsigned int sb_index, + u16 pool_index, u32 size, + enum devlink_sb_threshold_type threshold_type) + +{ + const struct devlink_ops *ops = devlink->ops; + + if (ops && ops->sb_pool_set) + return ops->sb_pool_set(devlink, sb_index, pool_index, + size, threshold_type); + return -EOPNOTSUPP; +} + +static int devlink_nl_cmd_sb_pool_set_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_sb *devlink_sb = info->user_ptr[1]; + enum devlink_sb_threshold_type threshold_type; + u16 pool_index; + u32 size; + int err; + + err = devlink_sb_pool_index_get_from_info(devlink_sb, info, + &pool_index); + if (err) + return err; + + err = devlink_sb_th_type_get_from_info(info, &threshold_type); + if (err) + return err; + + if (!info->attrs[DEVLINK_ATTR_SB_POOL_SIZE]) + return -EINVAL; + + size = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_POOL_SIZE]); + return devlink_sb_pool_set(devlink, devlink_sb->index, + pool_index, size, threshold_type); +} + +static int devlink_nl_sb_port_pool_fill(struct sk_buff *msg, + struct devlink *devlink, + struct devlink_port *devlink_port, + struct devlink_sb *devlink_sb, + u16 pool_index, + enum devlink_command cmd, + u32 portid, u32 seq, int flags) +{ + const struct devlink_ops *ops = devlink->ops; + u32 threshold; + void *hdr; + int err; + + err = ops->sb_port_pool_get(devlink_port, devlink_sb->index, + pool_index, &threshold); + if (err) + return err; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index)) + goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index)) + goto nla_put_failure; + if (nla_put_u16(msg, DEVLINK_ATTR_SB_POOL_INDEX, pool_index)) + goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_SB_THRESHOLD, threshold)) + goto nla_put_failure; + + if (ops->sb_occ_port_pool_get) { + u32 cur; + u32 max; + + err = ops->sb_occ_port_pool_get(devlink_port, devlink_sb->index, + pool_index, &cur, &max); + if (err && err != -EOPNOTSUPP) + return err; + if (!err) { + if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_CUR, cur)) + goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_MAX, max)) + goto nla_put_failure; + } + } + + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int devlink_nl_cmd_sb_port_pool_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink_port *devlink_port = info->user_ptr[0]; + struct devlink *devlink = devlink_port->devlink; + struct devlink_sb *devlink_sb = info->user_ptr[1]; + struct sk_buff *msg; + u16 pool_index; + int err; + + err = devlink_sb_pool_index_get_from_info(devlink_sb, info, + &pool_index); + if (err) + return err; + + if (!devlink->ops || !devlink->ops->sb_port_pool_get) + return -EOPNOTSUPP; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_sb_port_pool_fill(msg, devlink, devlink_port, + devlink_sb, pool_index, + DEVLINK_CMD_SB_PORT_POOL_NEW, + info->snd_portid, info->snd_seq, 0); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int __sb_port_pool_get_dumpit(struct sk_buff *msg, int start, int *p_idx, + struct devlink *devlink, + struct devlink_sb *devlink_sb, + u32 portid, u32 seq) +{ + struct devlink_port *devlink_port; + u16 pool_count = devlink_sb_pool_count(devlink_sb); + u16 pool_index; + int err; + + list_for_each_entry(devlink_port, &devlink->port_list, list) { + for (pool_index = 0; pool_index < pool_count; pool_index++) { + if (*p_idx < start) { + (*p_idx)++; + continue; + } + err = devlink_nl_sb_port_pool_fill(msg, devlink, + devlink_port, + devlink_sb, + pool_index, + DEVLINK_CMD_SB_PORT_POOL_NEW, + portid, seq, + NLM_F_MULTI); + if (err) + return err; + (*p_idx)++; + } + } + return 0; +} + +static int devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct devlink *devlink; + struct devlink_sb *devlink_sb; + int start = cb->args[0]; + int idx = 0; + int err; + + mutex_lock(&devlink_mutex); + mutex_lock(&devlink_port_mutex); + list_for_each_entry(devlink, &devlink_list, list) { + if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) || + !devlink->ops || !devlink->ops->sb_port_pool_get) + continue; + list_for_each_entry(devlink_sb, &devlink->sb_list, list) { + err = __sb_port_pool_get_dumpit(msg, start, &idx, + devlink, devlink_sb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq); + if (err && err != -EOPNOTSUPP) + goto out; + } + } +out: + mutex_unlock(&devlink_port_mutex); + mutex_unlock(&devlink_mutex); + + cb->args[0] = idx; + return msg->len; +} + +static int devlink_sb_port_pool_set(struct devlink_port *devlink_port, + unsigned int sb_index, u16 pool_index, + u32 threshold) + +{ + const struct devlink_ops *ops = devlink_port->devlink->ops; + + if (ops && ops->sb_port_pool_set) + return ops->sb_port_pool_set(devlink_port, sb_index, + pool_index, threshold); + return -EOPNOTSUPP; +} + +static int devlink_nl_cmd_sb_port_pool_set_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink_port *devlink_port = info->user_ptr[0]; + struct devlink_sb *devlink_sb = info->user_ptr[1]; + u16 pool_index; + u32 threshold; + int err; + + err = devlink_sb_pool_index_get_from_info(devlink_sb, info, + &pool_index); + if (err) + return err; + + if (!info->attrs[DEVLINK_ATTR_SB_THRESHOLD]) + return -EINVAL; + + threshold = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_THRESHOLD]); + return devlink_sb_port_pool_set(devlink_port, devlink_sb->index, + pool_index, threshold); +} + +static int +devlink_nl_sb_tc_pool_bind_fill(struct sk_buff *msg, struct devlink *devlink, + struct devlink_port *devlink_port, + struct devlink_sb *devlink_sb, u16 tc_index, + enum devlink_sb_pool_type pool_type, + enum devlink_command cmd, + u32 portid, u32 seq, int flags) +{ + const struct devlink_ops *ops = devlink->ops; + u16 pool_index; + u32 threshold; + void *hdr; + int err; + + err = ops->sb_tc_pool_bind_get(devlink_port, devlink_sb->index, + tc_index, pool_type, + &pool_index, &threshold); + if (err) + return err; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index)) + goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index)) + goto nla_put_failure; + if (nla_put_u16(msg, DEVLINK_ATTR_SB_TC_INDEX, tc_index)) + goto nla_put_failure; + if (nla_put_u8(msg, DEVLINK_ATTR_SB_POOL_TYPE, pool_type)) + goto nla_put_failure; + if (nla_put_u16(msg, DEVLINK_ATTR_SB_POOL_INDEX, pool_index)) + goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_SB_THRESHOLD, threshold)) + goto nla_put_failure; + + if (ops->sb_occ_tc_port_bind_get) { + u32 cur; + u32 max; + + err = ops->sb_occ_tc_port_bind_get(devlink_port, + devlink_sb->index, + tc_index, pool_type, + &cur, &max); + if (err && err != -EOPNOTSUPP) + return err; + if (!err) { + if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_CUR, cur)) + goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_MAX, max)) + goto nla_put_failure; + } + } + + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int devlink_nl_cmd_sb_tc_pool_bind_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink_port *devlink_port = info->user_ptr[0]; + struct devlink *devlink = devlink_port->devlink; + struct devlink_sb *devlink_sb = info->user_ptr[1]; + struct sk_buff *msg; + enum devlink_sb_pool_type pool_type; + u16 tc_index; + int err; + + err = devlink_sb_pool_type_get_from_info(info, &pool_type); + if (err) + return err; + + err = devlink_sb_tc_index_get_from_info(devlink_sb, info, + pool_type, &tc_index); + if (err) + return err; + + if (!devlink->ops || !devlink->ops->sb_tc_pool_bind_get) + return -EOPNOTSUPP; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_sb_tc_pool_bind_fill(msg, devlink, devlink_port, + devlink_sb, tc_index, pool_type, + DEVLINK_CMD_SB_TC_POOL_BIND_NEW, + info->snd_portid, + info->snd_seq, 0); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int __sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, + int start, int *p_idx, + struct devlink *devlink, + struct devlink_sb *devlink_sb, + u32 portid, u32 seq) +{ + struct devlink_port *devlink_port; + u16 tc_index; + int err; + + list_for_each_entry(devlink_port, &devlink->port_list, list) { + for (tc_index = 0; + tc_index < devlink_sb->ingress_tc_count; tc_index++) { + if (*p_idx < start) { + (*p_idx)++; + continue; + } + err = devlink_nl_sb_tc_pool_bind_fill(msg, devlink, + devlink_port, + devlink_sb, + tc_index, + DEVLINK_SB_POOL_TYPE_INGRESS, + DEVLINK_CMD_SB_TC_POOL_BIND_NEW, + portid, seq, + NLM_F_MULTI); + if (err) + return err; + (*p_idx)++; + } + for (tc_index = 0; + tc_index < devlink_sb->egress_tc_count; tc_index++) { + if (*p_idx < start) { + (*p_idx)++; + continue; + } + err = devlink_nl_sb_tc_pool_bind_fill(msg, devlink, + devlink_port, + devlink_sb, + tc_index, + DEVLINK_SB_POOL_TYPE_EGRESS, + DEVLINK_CMD_SB_TC_POOL_BIND_NEW, + portid, seq, + NLM_F_MULTI); + if (err) + return err; + (*p_idx)++; + } + } + return 0; +} + +static int +devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct devlink *devlink; + struct devlink_sb *devlink_sb; + int start = cb->args[0]; + int idx = 0; + int err; + + mutex_lock(&devlink_mutex); + mutex_lock(&devlink_port_mutex); + list_for_each_entry(devlink, &devlink_list, list) { + if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) || + !devlink->ops || !devlink->ops->sb_tc_pool_bind_get) + continue; + list_for_each_entry(devlink_sb, &devlink->sb_list, list) { + err = __sb_tc_pool_bind_get_dumpit(msg, start, &idx, + devlink, + devlink_sb, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq); + if (err && err != -EOPNOTSUPP) + goto out; + } + } +out: + mutex_unlock(&devlink_port_mutex); + mutex_unlock(&devlink_mutex); + + cb->args[0] = idx; + return msg->len; +} + +static int devlink_sb_tc_pool_bind_set(struct devlink_port *devlink_port, + unsigned int sb_index, u16 tc_index, + enum devlink_sb_pool_type pool_type, + u16 pool_index, u32 threshold) + +{ + const struct devlink_ops *ops = devlink_port->devlink->ops; + + if (ops && ops->sb_tc_pool_bind_set) + return ops->sb_tc_pool_bind_set(devlink_port, sb_index, + tc_index, pool_type, + pool_index, threshold); + return -EOPNOTSUPP; +} + +static int devlink_nl_cmd_sb_tc_pool_bind_set_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink_port *devlink_port = info->user_ptr[0]; + struct devlink_sb *devlink_sb = info->user_ptr[1]; + enum devlink_sb_pool_type pool_type; + u16 tc_index; + u16 pool_index; + u32 threshold; + int err; + + err = devlink_sb_pool_type_get_from_info(info, &pool_type); + if (err) + return err; + + err = devlink_sb_tc_index_get_from_info(devlink_sb, info, + pool_type, &tc_index); + if (err) + return err; + + err = devlink_sb_pool_index_get_from_info(devlink_sb, info, + &pool_index); + if (err) + return err; + + if (!info->attrs[DEVLINK_ATTR_SB_THRESHOLD]) + return -EINVAL; + + threshold = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_THRESHOLD]); + return devlink_sb_tc_pool_bind_set(devlink_port, devlink_sb->index, + tc_index, pool_type, + pool_index, threshold); +} + +static int devlink_nl_cmd_sb_occ_snapshot_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_sb *devlink_sb = info->user_ptr[1]; + const struct devlink_ops *ops = devlink->ops; + + if (ops && ops->sb_occ_snapshot) + return ops->sb_occ_snapshot(devlink, devlink_sb->index); + return -EOPNOTSUPP; +} + +static int devlink_nl_cmd_sb_occ_max_clear_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_sb *devlink_sb = info->user_ptr[1]; + const struct devlink_ops *ops = devlink->ops; + + if (ops && ops->sb_occ_max_clear) + return ops->sb_occ_max_clear(devlink, devlink_sb->index); + return -EOPNOTSUPP; +} + static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32 }, [DEVLINK_ATTR_PORT_TYPE] = { .type = NLA_U16 }, [DEVLINK_ATTR_PORT_SPLIT_COUNT] = { .type = NLA_U32 }, + [DEVLINK_ATTR_SB_INDEX] = { .type = NLA_U32 }, + [DEVLINK_ATTR_SB_POOL_INDEX] = { .type = NLA_U16 }, + [DEVLINK_ATTR_SB_POOL_TYPE] = { .type = NLA_U8 }, + [DEVLINK_ATTR_SB_POOL_SIZE] = { .type = NLA_U32 }, + [DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE] = { .type = NLA_U8 }, + [DEVLINK_ATTR_SB_THRESHOLD] = { .type = NLA_U32 }, + [DEVLINK_ATTR_SB_TC_INDEX] = { .type = NLA_U16 }, }; static const struct genl_ops devlink_nl_ops[] = { @@ -511,6 +1415,7 @@ static const struct genl_ops devlink_nl_ops[] = { .doit = devlink_nl_cmd_get_doit, .dumpit = devlink_nl_cmd_get_dumpit, .policy = devlink_nl_policy, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, /* can be retrieved by unprivileged users */ }, { @@ -533,12 +1438,92 @@ static const struct genl_ops devlink_nl_ops[] = { .doit = devlink_nl_cmd_port_split_doit, .policy = devlink_nl_policy, .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, { .cmd = DEVLINK_CMD_PORT_UNSPLIT, .doit = devlink_nl_cmd_port_unsplit_doit, .policy = devlink_nl_policy, .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, + { + .cmd = DEVLINK_CMD_SB_GET, + .doit = devlink_nl_cmd_sb_get_doit, + .dumpit = devlink_nl_cmd_sb_get_dumpit, + .policy = devlink_nl_policy, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | + DEVLINK_NL_FLAG_NEED_SB, + /* can be retrieved by unprivileged users */ + }, + { + .cmd = DEVLINK_CMD_SB_POOL_GET, + .doit = devlink_nl_cmd_sb_pool_get_doit, + .dumpit = devlink_nl_cmd_sb_pool_get_dumpit, + .policy = devlink_nl_policy, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | + DEVLINK_NL_FLAG_NEED_SB, + /* can be retrieved by unprivileged users */ + }, + { + .cmd = DEVLINK_CMD_SB_POOL_SET, + .doit = devlink_nl_cmd_sb_pool_set_doit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | + DEVLINK_NL_FLAG_NEED_SB, + }, + { + .cmd = DEVLINK_CMD_SB_PORT_POOL_GET, + .doit = devlink_nl_cmd_sb_port_pool_get_doit, + .dumpit = devlink_nl_cmd_sb_port_pool_get_dumpit, + .policy = devlink_nl_policy, + .internal_flags = DEVLINK_NL_FLAG_NEED_PORT | + DEVLINK_NL_FLAG_NEED_SB, + /* can be retrieved by unprivileged users */ + }, + { + .cmd = DEVLINK_CMD_SB_PORT_POOL_SET, + .doit = devlink_nl_cmd_sb_port_pool_set_doit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_PORT | + DEVLINK_NL_FLAG_NEED_SB, + }, + { + .cmd = DEVLINK_CMD_SB_TC_POOL_BIND_GET, + .doit = devlink_nl_cmd_sb_tc_pool_bind_get_doit, + .dumpit = devlink_nl_cmd_sb_tc_pool_bind_get_dumpit, + .policy = devlink_nl_policy, + .internal_flags = DEVLINK_NL_FLAG_NEED_PORT | + DEVLINK_NL_FLAG_NEED_SB, + /* can be retrieved by unprivileged users */ + }, + { + .cmd = DEVLINK_CMD_SB_TC_POOL_BIND_SET, + .doit = devlink_nl_cmd_sb_tc_pool_bind_set_doit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_PORT | + DEVLINK_NL_FLAG_NEED_SB, + }, + { + .cmd = DEVLINK_CMD_SB_OCC_SNAPSHOT, + .doit = devlink_nl_cmd_sb_occ_snapshot_doit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | + DEVLINK_NL_FLAG_NEED_SB | + DEVLINK_NL_FLAG_LOCK_PORTS, + }, + { + .cmd = DEVLINK_CMD_SB_OCC_MAX_CLEAR, + .doit = devlink_nl_cmd_sb_occ_max_clear_doit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | + DEVLINK_NL_FLAG_NEED_SB | + DEVLINK_NL_FLAG_LOCK_PORTS, }, }; @@ -561,6 +1546,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) devlink->ops = ops; devlink_net_set(devlink, &init_net); INIT_LIST_HEAD(&devlink->port_list); + INIT_LIST_HEAD(&devlink->sb_list); return devlink; } EXPORT_SYMBOL_GPL(devlink_alloc); @@ -630,7 +1616,6 @@ int devlink_port_register(struct devlink *devlink, } devlink_port->devlink = devlink; devlink_port->index = port_index; - devlink_port->type = DEVLINK_PORT_TYPE_NOTSET; devlink_port->registered = true; list_add_tail(&devlink_port->list, &devlink->port_list); mutex_unlock(&devlink_port_mutex); @@ -717,6 +1702,51 @@ void devlink_port_split_set(struct devlink_port *devlink_port, } EXPORT_SYMBOL_GPL(devlink_port_split_set); +int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, + u32 size, u16 ingress_pools_count, + u16 egress_pools_count, u16 ingress_tc_count, + u16 egress_tc_count) +{ + struct devlink_sb *devlink_sb; + int err = 0; + + mutex_lock(&devlink_mutex); + if (devlink_sb_index_exists(devlink, sb_index)) { + err = -EEXIST; + goto unlock; + } + + devlink_sb = kzalloc(sizeof(*devlink_sb), GFP_KERNEL); + if (!devlink_sb) { + err = -ENOMEM; + goto unlock; + } + devlink_sb->index = sb_index; + devlink_sb->size = size; + devlink_sb->ingress_pools_count = ingress_pools_count; + devlink_sb->egress_pools_count = egress_pools_count; + devlink_sb->ingress_tc_count = ingress_tc_count; + devlink_sb->egress_tc_count = egress_tc_count; + list_add_tail(&devlink_sb->list, &devlink->sb_list); +unlock: + mutex_unlock(&devlink_mutex); + return err; +} +EXPORT_SYMBOL_GPL(devlink_sb_register); + +void devlink_sb_unregister(struct devlink *devlink, unsigned int sb_index) +{ + struct devlink_sb *devlink_sb; + + mutex_lock(&devlink_mutex); + devlink_sb = devlink_sb_get_by_index(devlink, sb_index); + WARN_ON(!devlink_sb); + list_del(&devlink_sb->list); + mutex_unlock(&devlink_mutex); + kfree(devlink_sb); +} +EXPORT_SYMBOL_GPL(devlink_sb_unregister); + static int __init devlink_module_init(void) { return genl_register_family_with_ops_groups(&devlink_nl_family, diff --git a/net/core/ethtool.c b/net/core/ethtool.c index f426c5ad6..f4034817d 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -79,12 +79,16 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_UFO_BIT] = "tx-udp-fragmentation", [NETIF_F_GSO_ROBUST_BIT] = "tx-gso-robust", [NETIF_F_TSO_ECN_BIT] = "tx-tcp-ecn-segmentation", + [NETIF_F_TSO_MANGLEID_BIT] = "tx-tcp-mangleid-segmentation", [NETIF_F_TSO6_BIT] = "tx-tcp6-segmentation", [NETIF_F_FSO_BIT] = "tx-fcoe-segmentation", [NETIF_F_GSO_GRE_BIT] = "tx-gre-segmentation", - [NETIF_F_GSO_IPIP_BIT] = "tx-ipip-segmentation", - [NETIF_F_GSO_SIT_BIT] = "tx-sit-segmentation", + [NETIF_F_GSO_GRE_CSUM_BIT] = "tx-gre-csum-segmentation", + [NETIF_F_GSO_IPXIP4_BIT] = "tx-ipxip4-segmentation", + [NETIF_F_GSO_IPXIP6_BIT] = "tx-ipxip6-segmentation", [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation", + [NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT] = "tx-udp_tnl-csum-segmentation", + [NETIF_F_GSO_PARTIAL_BIT] = "tx-gso-partial", [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc", [NETIF_F_SCTP_CRC_BIT] = "tx-checksum-sctp", @@ -387,15 +391,17 @@ static int __ethtool_set_flags(struct net_device *dev, u32 data) return 0; } -static void convert_legacy_u32_to_link_mode(unsigned long *dst, u32 legacy_u32) +void ethtool_convert_legacy_u32_to_link_mode(unsigned long *dst, + u32 legacy_u32) { bitmap_zero(dst, __ETHTOOL_LINK_MODE_MASK_NBITS); dst[0] = legacy_u32; } +EXPORT_SYMBOL(ethtool_convert_legacy_u32_to_link_mode); /* return false if src had higher bits set. lower bits always updated. */ -static bool convert_link_mode_to_legacy_u32(u32 *legacy_u32, - const unsigned long *src) +bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, + const unsigned long *src) { bool retval = true; @@ -415,6 +421,7 @@ static bool convert_link_mode_to_legacy_u32(u32 *legacy_u32, *legacy_u32 = src[0]; return retval; } +EXPORT_SYMBOL(ethtool_convert_link_mode_to_legacy_u32); /* return false if legacy contained non-0 deprecated fields * transceiver/maxtxpkt/maxrxpkt. rest of ksettings always updated @@ -437,13 +444,13 @@ convert_legacy_settings_to_link_ksettings( legacy_settings->maxrxpkt) retval = false; - convert_legacy_u32_to_link_mode( + ethtool_convert_legacy_u32_to_link_mode( link_ksettings->link_modes.supported, legacy_settings->supported); - convert_legacy_u32_to_link_mode( + ethtool_convert_legacy_u32_to_link_mode( link_ksettings->link_modes.advertising, legacy_settings->advertising); - convert_legacy_u32_to_link_mode( + ethtool_convert_legacy_u32_to_link_mode( link_ksettings->link_modes.lp_advertising, legacy_settings->lp_advertising); link_ksettings->base.speed @@ -482,13 +489,13 @@ convert_link_ksettings_to_legacy_settings( * __u32 maxrxpkt; */ - retval &= convert_link_mode_to_legacy_u32( + retval &= ethtool_convert_link_mode_to_legacy_u32( &legacy_settings->supported, link_ksettings->link_modes.supported); - retval &= convert_link_mode_to_legacy_u32( + retval &= ethtool_convert_link_mode_to_legacy_u32( &legacy_settings->advertising, link_ksettings->link_modes.advertising); - retval &= convert_link_mode_to_legacy_u32( + retval &= ethtool_convert_link_mode_to_legacy_u32( &legacy_settings->lp_advertising, link_ksettings->link_modes.lp_advertising); ethtool_cmd_speed_set(legacy_settings, link_ksettings->base.speed); diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 365de6643..840acebbb 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -549,7 +549,7 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, + nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */ + nla_total_size(4) /* FRA_FWMARK */ + nla_total_size(4) /* FRA_FWMASK */ - + nla_total_size(8); /* FRA_TUN_ID */ + + nla_total_size_64bit(8); /* FRA_TUN_ID */ if (ops->nlmsg_payload) payload += ops->nlmsg_payload(rule); @@ -607,7 +607,7 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, (rule->target && nla_put_u32(skb, FRA_GOTO, rule->target)) || (rule->tun_id && - nla_put_be64(skb, FRA_TUN_ID, rule->tun_id))) + nla_put_be64(skb, FRA_TUN_ID, rule->tun_id, FRA_PAD))) goto nla_put_failure; if (rule->suppress_ifgroup != -1) { diff --git a/net/core/filter.c b/net/core/filter.c index ca7f832b2..e759d90e8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -53,9 +53,10 @@ #include <net/sock_reuseport.h> /** - * sk_filter - run a packet through a socket filter + * sk_filter_trim_cap - run a packet through a socket filter * @sk: sock associated with &sk_buff * @skb: buffer to filter + * @cap: limit on how short the eBPF program may trim the packet * * Run the eBPF program and then cut skb->data to correct size returned by * the program. If pkt_len is 0 we toss packet. If skb->len is smaller @@ -64,7 +65,7 @@ * be accepted or -EPERM if the packet should be tossed. * */ -int sk_filter(struct sock *sk, struct sk_buff *skb) +int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) { int err; struct sk_filter *filter; @@ -85,14 +86,13 @@ int sk_filter(struct sock *sk, struct sk_buff *skb) filter = rcu_dereference(sk->sk_filter); if (filter) { unsigned int pkt_len = bpf_prog_run_save_cb(filter->prog, skb); - - err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM; + err = pkt_len ? pskb_trim(skb, max(cap, pkt_len)) : -EPERM; } rcu_read_unlock(); return err; } -EXPORT_SYMBOL(sk_filter); +EXPORT_SYMBOL(sk_filter_trim_cap); static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) { @@ -994,7 +994,11 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) */ goto out_err_free; - bpf_prog_select_runtime(fp); + /* We are guaranteed to never error here with cBPF to eBPF + * transitions, since there's no issue with type compatibility + * checks on program arrays. + */ + fp = bpf_prog_select_runtime(fp, &err); kfree(old_prog); return fp; @@ -1149,8 +1153,7 @@ void bpf_prog_destroy(struct bpf_prog *fp) } EXPORT_SYMBOL_GPL(bpf_prog_destroy); -static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk, - bool locked) +static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk) { struct sk_filter *fp, *old_fp; @@ -1166,8 +1169,10 @@ static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk, return -ENOMEM; } - old_fp = rcu_dereference_protected(sk->sk_filter, locked); + old_fp = rcu_dereference_protected(sk->sk_filter, + lockdep_sock_is_held(sk)); rcu_assign_pointer(sk->sk_filter, fp); + if (old_fp) sk_filter_uncharge(sk, old_fp); @@ -1246,8 +1251,7 @@ struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk) * occurs or there is insufficient memory for the filter a negative * errno code is returned. On success the return is zero. */ -int __sk_attach_filter(struct sock_fprog *fprog, struct sock *sk, - bool locked) +int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) { struct bpf_prog *prog = __get_filter(fprog, sk); int err; @@ -1255,7 +1259,7 @@ int __sk_attach_filter(struct sock_fprog *fprog, struct sock *sk, if (IS_ERR(prog)) return PTR_ERR(prog); - err = __sk_attach_prog(prog, sk, locked); + err = __sk_attach_prog(prog, sk); if (err < 0) { __bpf_prog_release(prog); return err; @@ -1263,12 +1267,7 @@ int __sk_attach_filter(struct sock_fprog *fprog, struct sock *sk, return 0; } -EXPORT_SYMBOL_GPL(__sk_attach_filter); - -int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) -{ - return __sk_attach_filter(fprog, sk, sock_owned_by_user(sk)); -} +EXPORT_SYMBOL_GPL(sk_attach_filter); int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk) { @@ -1314,7 +1313,7 @@ int sk_attach_bpf(u32 ufd, struct sock *sk) if (IS_ERR(prog)) return PTR_ERR(prog); - err = __sk_attach_prog(prog, sk, sock_owned_by_user(sk)); + err = __sk_attach_prog(prog, sk); if (err < 0) { bpf_prog_put(prog); return err; @@ -1349,6 +1348,21 @@ struct bpf_scratchpad { static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp); +static inline int bpf_try_make_writable(struct sk_buff *skb, + unsigned int write_len) +{ + int err; + + if (!skb_cloned(skb)) + return 0; + if (skb_clone_writable(skb, write_len)) + return 0; + err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); + if (!err) + bpf_compute_data_end(skb); + return err; +} + static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) { struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp); @@ -1371,7 +1385,7 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) */ if (unlikely((u32) offset > 0xffff || len > sizeof(sp->buff))) return -EFAULT; - if (unlikely(skb_try_make_writable(skb, offset + len))) + if (unlikely(bpf_try_make_writable(skb, offset + len))) return -EFAULT; ptr = skb_header_pointer(skb, offset, len, sp->buff); @@ -1414,16 +1428,19 @@ static u64 bpf_skb_load_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) unsigned int len = (unsigned int) r4; void *ptr; - if (unlikely((u32) offset > 0xffff || len > MAX_BPF_STACK)) - return -EFAULT; + if (unlikely((u32) offset > 0xffff)) + goto err_clear; ptr = skb_header_pointer(skb, offset, len, to); if (unlikely(!ptr)) - return -EFAULT; + goto err_clear; if (ptr != to) memcpy(to, ptr, len); return 0; +err_clear: + memset(to, 0, len); + return -EFAULT; } static const struct bpf_func_proto bpf_skb_load_bytes_proto = { @@ -1432,7 +1449,7 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_PTR_TO_STACK, + .arg3_type = ARG_PTR_TO_RAW_STACK, .arg4_type = ARG_CONST_STACK_SIZE, }; @@ -1446,7 +1463,7 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) return -EINVAL; if (unlikely((u32) offset > 0xffff)) return -EFAULT; - if (unlikely(skb_try_make_writable(skb, offset + sizeof(sum)))) + if (unlikely(bpf_try_make_writable(skb, offset + sizeof(sum)))) return -EFAULT; ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum); @@ -1501,7 +1518,7 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) return -EINVAL; if (unlikely((u32) offset > 0xffff)) return -EFAULT; - if (unlikely(skb_try_make_writable(skb, offset + sizeof(sum)))) + if (unlikely(bpf_try_make_writable(skb, offset + sizeof(sum)))) return -EFAULT; ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum); @@ -1701,12 +1718,15 @@ static u64 bpf_skb_vlan_push(u64 r1, u64 r2, u64 vlan_tci, u64 r4, u64 r5) { struct sk_buff *skb = (struct sk_buff *) (long) r1; __be16 vlan_proto = (__force __be16) r2; + int ret; if (unlikely(vlan_proto != htons(ETH_P_8021Q) && vlan_proto != htons(ETH_P_8021AD))) vlan_proto = htons(ETH_P_8021Q); - return skb_vlan_push(skb, vlan_proto, vlan_tci); + ret = skb_vlan_push(skb, vlan_proto, vlan_tci); + bpf_compute_data_end(skb); + return ret; } const struct bpf_func_proto bpf_skb_vlan_push_proto = { @@ -1722,8 +1742,11 @@ EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto); static u64 bpf_skb_vlan_pop(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) { struct sk_buff *skb = (struct sk_buff *) (long) r1; + int ret; - return skb_vlan_pop(skb); + ret = skb_vlan_pop(skb); + bpf_compute_data_end(skb); + return ret; } const struct bpf_func_proto bpf_skb_vlan_pop_proto = { @@ -1761,12 +1784,19 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) struct bpf_tunnel_key *to = (struct bpf_tunnel_key *) (long) r2; const struct ip_tunnel_info *info = skb_tunnel_info(skb); u8 compat[sizeof(struct bpf_tunnel_key)]; + void *to_orig = to; + int err; - if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6)))) - return -EINVAL; - if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags)) - return -EPROTO; + if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6)))) { + err = -EINVAL; + goto err_clear; + } + if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags)) { + err = -EPROTO; + goto err_clear; + } if (unlikely(size != sizeof(struct bpf_tunnel_key))) { + err = -EINVAL; switch (size) { case offsetof(struct bpf_tunnel_key, tunnel_label): case offsetof(struct bpf_tunnel_key, tunnel_ext): @@ -1776,12 +1806,12 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) * a common path later on. */ if (ip_tunnel_info_af(info) != AF_INET) - return -EINVAL; + goto err_clear; set_compat: to = (struct bpf_tunnel_key *)compat; break; default: - return -EINVAL; + goto err_clear; } } @@ -1798,9 +1828,12 @@ set_compat: } if (unlikely(size != sizeof(struct bpf_tunnel_key))) - memcpy((void *)(long) r2, to, size); + memcpy(to_orig, to, size); return 0; +err_clear: + memset(to_orig, 0, size); + return err; } static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = { @@ -1808,7 +1841,7 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_STACK, + .arg2_type = ARG_PTR_TO_RAW_STACK, .arg3_type = ARG_CONST_STACK_SIZE, .arg4_type = ARG_ANYTHING, }; @@ -1818,16 +1851,26 @@ static u64 bpf_skb_get_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5) struct sk_buff *skb = (struct sk_buff *) (long) r1; u8 *to = (u8 *) (long) r2; const struct ip_tunnel_info *info = skb_tunnel_info(skb); + int err; if (unlikely(!info || - !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))) - return -ENOENT; - if (unlikely(size < info->options_len)) - return -ENOMEM; + !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))) { + err = -ENOENT; + goto err_clear; + } + if (unlikely(size < info->options_len)) { + err = -ENOMEM; + goto err_clear; + } ip_tunnel_info_opts_get(to, info); + if (size > info->options_len) + memset(to + info->options_len, 0, size - info->options_len); return info->options_len; +err_clear: + memset(to, 0, size); + return err; } static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = { @@ -1835,7 +1878,7 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_STACK, + .arg2_type = ARG_PTR_TO_RAW_STACK, .arg3_type = ARG_CONST_STACK_SIZE, }; @@ -2021,6 +2064,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_redirect_proto; case BPF_FUNC_get_route_realm: return &bpf_get_route_realm_proto; + case BPF_FUNC_perf_event_output: + return bpf_get_event_output_proto(); default: return sk_filter_func_proto(func_id); } @@ -2028,31 +2073,32 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) static bool __is_valid_access(int off, int size, enum bpf_access_type type) { - /* check bounds */ if (off < 0 || off >= sizeof(struct __sk_buff)) return false; - - /* disallow misaligned access */ + /* The verifier guarantees that size > 0. */ if (off % size != 0) return false; - - /* all __sk_buff fields are __u32 */ - if (size != 4) + if (size != sizeof(__u32)) return false; return true; } static bool sk_filter_is_valid_access(int off, int size, - enum bpf_access_type type) + enum bpf_access_type type, + enum bpf_reg_type *reg_type) { - if (off == offsetof(struct __sk_buff, tc_classid)) + switch (off) { + case offsetof(struct __sk_buff, tc_classid): + case offsetof(struct __sk_buff, data): + case offsetof(struct __sk_buff, data_end): return false; + } if (type == BPF_WRITE) { switch (off) { case offsetof(struct __sk_buff, cb[0]) ... - offsetof(struct __sk_buff, cb[4]): + offsetof(struct __sk_buff, cb[4]): break; default: return false; @@ -2063,7 +2109,8 @@ static bool sk_filter_is_valid_access(int off, int size, } static bool tc_cls_act_is_valid_access(int off, int size, - enum bpf_access_type type) + enum bpf_access_type type, + enum bpf_reg_type *reg_type) { if (type == BPF_WRITE) { switch (off) { @@ -2078,6 +2125,16 @@ static bool tc_cls_act_is_valid_access(int off, int size, return false; } } + + switch (off) { + case offsetof(struct __sk_buff, data): + *reg_type = PTR_TO_PACKET; + break; + case offsetof(struct __sk_buff, data_end): + *reg_type = PTR_TO_PACKET_END; + break; + } + return __is_valid_access(off, size, type); } @@ -2195,6 +2252,20 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, ctx_off); break; + case offsetof(struct __sk_buff, data): + *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, data)), + dst_reg, src_reg, + offsetof(struct sk_buff, data)); + break; + + case offsetof(struct __sk_buff, data_end): + ctx_off -= offsetof(struct __sk_buff, data_end); + ctx_off += offsetof(struct sk_buff, cb); + ctx_off += offsetof(struct bpf_skb_data_end, data_end); + *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(sizeof(void *)), + dst_reg, src_reg, ctx_off); + break; + case offsetof(struct __sk_buff, tc_index): #ifdef CONFIG_NET_SCHED BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2); @@ -2219,30 +2290,30 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, } static const struct bpf_verifier_ops sk_filter_ops = { - .get_func_proto = sk_filter_func_proto, - .is_valid_access = sk_filter_is_valid_access, - .convert_ctx_access = bpf_net_convert_ctx_access, + .get_func_proto = sk_filter_func_proto, + .is_valid_access = sk_filter_is_valid_access, + .convert_ctx_access = bpf_net_convert_ctx_access, }; static const struct bpf_verifier_ops tc_cls_act_ops = { - .get_func_proto = tc_cls_act_func_proto, - .is_valid_access = tc_cls_act_is_valid_access, - .convert_ctx_access = bpf_net_convert_ctx_access, + .get_func_proto = tc_cls_act_func_proto, + .is_valid_access = tc_cls_act_is_valid_access, + .convert_ctx_access = bpf_net_convert_ctx_access, }; static struct bpf_prog_type_list sk_filter_type __read_mostly = { - .ops = &sk_filter_ops, - .type = BPF_PROG_TYPE_SOCKET_FILTER, + .ops = &sk_filter_ops, + .type = BPF_PROG_TYPE_SOCKET_FILTER, }; static struct bpf_prog_type_list sched_cls_type __read_mostly = { - .ops = &tc_cls_act_ops, - .type = BPF_PROG_TYPE_SCHED_CLS, + .ops = &tc_cls_act_ops, + .type = BPF_PROG_TYPE_SCHED_CLS, }; static struct bpf_prog_type_list sched_act_type __read_mostly = { - .ops = &tc_cls_act_ops, - .type = BPF_PROG_TYPE_SCHED_ACT, + .ops = &tc_cls_act_ops, + .type = BPF_PROG_TYPE_SCHED_ACT, }; static int __init register_sk_filter_ops(void) @@ -2255,7 +2326,7 @@ static int __init register_sk_filter_ops(void) } late_initcall(register_sk_filter_ops); -int __sk_detach_filter(struct sock *sk, bool locked) +int sk_detach_filter(struct sock *sk) { int ret = -ENOENT; struct sk_filter *filter; @@ -2263,7 +2334,8 @@ int __sk_detach_filter(struct sock *sk, bool locked) if (sock_flag(sk, SOCK_FILTER_LOCKED)) return -EPERM; - filter = rcu_dereference_protected(sk->sk_filter, locked); + filter = rcu_dereference_protected(sk->sk_filter, + lockdep_sock_is_held(sk)); if (filter) { RCU_INIT_POINTER(sk->sk_filter, NULL); sk_filter_uncharge(sk, filter); @@ -2272,12 +2344,7 @@ int __sk_detach_filter(struct sock *sk, bool locked) return ret; } -EXPORT_SYMBOL_GPL(__sk_detach_filter); - -int sk_detach_filter(struct sock *sk) -{ - return __sk_detach_filter(sk, sock_owned_by_user(sk)); -} +EXPORT_SYMBOL_GPL(sk_detach_filter); int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, unsigned int len) @@ -2288,7 +2355,7 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, lock_sock(sk); filter = rcu_dereference_protected(sk->sk_filter, - sock_owned_by_user(sk)); + lockdep_sock_is_held(sk)); if (!filter) goto out; diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index e640462ea..be873e4e3 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -25,9 +25,9 @@ static inline int -gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size) +gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size, int padattr) { - if (nla_put(d->skb, type, size, buf)) + if (nla_put_64bit(d->skb, type, size, buf, padattr)) goto nla_put_failure; return 0; @@ -47,6 +47,7 @@ nla_put_failure: * @xstats_type: TLV type for backward compatibility xstats TLV * @lock: statistics lock * @d: dumping handle + * @padattr: padding attribute * * Initializes the dumping handle, grabs the statistic lock and appends * an empty TLV header to the socket buffer for use a container for all @@ -59,7 +60,8 @@ nla_put_failure: */ int gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type, - int xstats_type, spinlock_t *lock, struct gnet_dump *d) + int xstats_type, spinlock_t *lock, + struct gnet_dump *d, int padattr) __acquires(lock) { memset(d, 0, sizeof(*d)); @@ -71,20 +73,22 @@ gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type, d->skb = skb; d->compat_tc_stats = tc_stats_type; d->compat_xstats = xstats_type; + d->padattr = padattr; if (d->tail) - return gnet_stats_copy(d, type, NULL, 0); + return gnet_stats_copy(d, type, NULL, 0, padattr); return 0; } EXPORT_SYMBOL(gnet_stats_start_copy_compat); /** - * gnet_stats_start_copy_compat - start dumping procedure in compatibility mode + * gnet_stats_start_copy - start dumping procedure in compatibility mode * @skb: socket buffer to put statistics TLVs into * @type: TLV type for top level statistic TLV * @lock: statistics lock * @d: dumping handle + * @padattr: padding attribute * * Initializes the dumping handle, grabs the statistic lock and appends * an empty TLV header to the socket buffer for use a container for all @@ -94,9 +98,9 @@ EXPORT_SYMBOL(gnet_stats_start_copy_compat); */ int gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock, - struct gnet_dump *d) + struct gnet_dump *d, int padattr) { - return gnet_stats_start_copy_compat(skb, type, 0, 0, lock, d); + return gnet_stats_start_copy_compat(skb, type, 0, 0, lock, d, padattr); } EXPORT_SYMBOL(gnet_stats_start_copy); @@ -169,7 +173,8 @@ gnet_stats_copy_basic(struct gnet_dump *d, memset(&sb, 0, sizeof(sb)); sb.bytes = bstats.bytes; sb.packets = bstats.packets; - return gnet_stats_copy(d, TCA_STATS_BASIC, &sb, sizeof(sb)); + return gnet_stats_copy(d, TCA_STATS_BASIC, &sb, sizeof(sb), + TCA_STATS_PAD); } return 0; } @@ -208,11 +213,13 @@ gnet_stats_copy_rate_est(struct gnet_dump *d, } if (d->tail) { - res = gnet_stats_copy(d, TCA_STATS_RATE_EST, &est, sizeof(est)); + res = gnet_stats_copy(d, TCA_STATS_RATE_EST, &est, sizeof(est), + TCA_STATS_PAD); if (res < 0 || est.bps == r->bps) return res; /* emit 64bit stats only if needed */ - return gnet_stats_copy(d, TCA_STATS_RATE_EST64, r, sizeof(*r)); + return gnet_stats_copy(d, TCA_STATS_RATE_EST64, r, sizeof(*r), + TCA_STATS_PAD); } return 0; @@ -286,7 +293,8 @@ gnet_stats_copy_queue(struct gnet_dump *d, if (d->tail) return gnet_stats_copy(d, TCA_STATS_QUEUE, - &qstats, sizeof(qstats)); + &qstats, sizeof(qstats), + TCA_STATS_PAD); return 0; } @@ -316,7 +324,8 @@ gnet_stats_copy_app(struct gnet_dump *d, void *st, int len) } if (d->tail) - return gnet_stats_copy(d, TCA_STATS_APP, st, len); + return gnet_stats_copy(d, TCA_STATS_APP, st, len, + TCA_STATS_PAD); return 0; @@ -347,12 +356,12 @@ gnet_stats_finish_copy(struct gnet_dump *d) if (d->compat_tc_stats) if (gnet_stats_copy(d, d->compat_tc_stats, &d->tc_stats, - sizeof(d->tc_stats)) < 0) + sizeof(d->tc_stats), d->padattr) < 0) return -1; if (d->compat_xstats && d->xstats) { if (gnet_stats_copy(d, d->compat_xstats, d->xstats, - d->xstats_len) < 0) + d->xstats_len, d->padattr) < 0) return -1; } diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 769cece9b..510cd62fc 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1763,21 +1763,22 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms) NEIGH_VAR(parms, MCAST_PROBES)) || nla_put_u32(skb, NDTPA_MCAST_REPROBES, NEIGH_VAR(parms, MCAST_REPROBES)) || - nla_put_msecs(skb, NDTPA_REACHABLE_TIME, parms->reachable_time) || + nla_put_msecs(skb, NDTPA_REACHABLE_TIME, parms->reachable_time, + NDTPA_PAD) || nla_put_msecs(skb, NDTPA_BASE_REACHABLE_TIME, - NEIGH_VAR(parms, BASE_REACHABLE_TIME)) || + NEIGH_VAR(parms, BASE_REACHABLE_TIME), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_GC_STALETIME, - NEIGH_VAR(parms, GC_STALETIME)) || + NEIGH_VAR(parms, GC_STALETIME), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_DELAY_PROBE_TIME, - NEIGH_VAR(parms, DELAY_PROBE_TIME)) || + NEIGH_VAR(parms, DELAY_PROBE_TIME), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_RETRANS_TIME, - NEIGH_VAR(parms, RETRANS_TIME)) || + NEIGH_VAR(parms, RETRANS_TIME), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_ANYCAST_DELAY, - NEIGH_VAR(parms, ANYCAST_DELAY)) || + NEIGH_VAR(parms, ANYCAST_DELAY), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_PROXY_DELAY, - NEIGH_VAR(parms, PROXY_DELAY)) || + NEIGH_VAR(parms, PROXY_DELAY), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_LOCKTIME, - NEIGH_VAR(parms, LOCKTIME))) + NEIGH_VAR(parms, LOCKTIME), NDTPA_PAD)) goto nla_put_failure; return nla_nest_end(skb, nest); @@ -1804,7 +1805,7 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, ndtmsg->ndtm_pad2 = 0; if (nla_put_string(skb, NDTA_NAME, tbl->id) || - nla_put_msecs(skb, NDTA_GC_INTERVAL, tbl->gc_interval) || + nla_put_msecs(skb, NDTA_GC_INTERVAL, tbl->gc_interval, NDTA_PAD) || nla_put_u32(skb, NDTA_THRESH1, tbl->gc_thresh1) || nla_put_u32(skb, NDTA_THRESH2, tbl->gc_thresh2) || nla_put_u32(skb, NDTA_THRESH3, tbl->gc_thresh3)) @@ -1856,7 +1857,8 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, ndst.ndts_table_fulls += st->table_fulls; } - if (nla_put(skb, NDTA_STATS, sizeof(ndst), &ndst)) + if (nla_put_64bit(skb, NDTA_STATS, sizeof(ndst), &ndst, + NDTA_PAD)) goto nla_put_failure; } diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 2bf832996..14d09345f 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -162,7 +162,8 @@ static int softnet_seq_show(struct seq_file *seq, void *v) "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n", sd->processed, sd->dropped, sd->time_squeeze, 0, 0, 0, 0, 0, /* was fastroute */ - sd->cpu_collision, sd->received_rps, flow_limit_count); + 0, /* was cpu_collision */ + sd->received_rps, flow_limit_count); return 0; } diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 2b3f76fe6..7a0b61655 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -24,6 +24,7 @@ #include <linux/jiffies.h> #include <linux/pm_runtime.h> #include <linux/of.h> +#include <linux/of_net.h> #include "net-sysfs.h" diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 20999aa59..8b02df0d3 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2245,10 +2245,8 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until) hrtimer_set_expires(&t.timer, spin_until); remaining = ktime_to_ns(hrtimer_expires_remaining(&t.timer)); - if (remaining <= 0) { - pkt_dev->next_tx = ktime_add_ns(spin_until, pkt_dev->delay); - return; - } + if (remaining <= 0) + goto out; start_time = ktime_get(); if (remaining < 100000) { @@ -2273,7 +2271,9 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until) } pkt_dev->idle_acc += ktime_to_ns(ktime_sub(end_time, start_time)); +out: pkt_dev->next_tx = ktime_add_ns(spin_until, pkt_dev->delay); + destroy_hrtimer_on_stack(&t.timer); } static inline void set_pkt_overhead(struct pktgen_dev *pkt_dev) @@ -3472,7 +3472,6 @@ xmit_more: pkt_dev->odevname, ret); pkt_dev->errors++; /* fallthru */ - case NETDEV_TX_LOCKED: case NETDEV_TX_BUSY: /* Retry it next time */ atomic_dec(&(pkt_dev->skb->users)); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 65763c29f..d69c4644f 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -808,11 +808,6 @@ static void copy_rtnl_link_stats(struct rtnl_link_stats *a, a->rx_nohandler = b->rx_nohandler; } -static void copy_rtnl_link_stats64(void *v, const struct rtnl_link_stats64 *b) -{ - memcpy(v, b, sizeof(*b)); -} - /* All VF info */ static inline int rtnl_vfinfo_size(const struct net_device *dev, u32 ext_filter_mask) @@ -830,17 +825,17 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev, nla_total_size(sizeof(struct ifla_vf_link_state)) + nla_total_size(sizeof(struct ifla_vf_rss_query_en)) + /* IFLA_VF_STATS_RX_PACKETS */ - nla_total_size(sizeof(__u64)) + + nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_TX_PACKETS */ - nla_total_size(sizeof(__u64)) + + nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_RX_BYTES */ - nla_total_size(sizeof(__u64)) + + nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_TX_BYTES */ - nla_total_size(sizeof(__u64)) + + nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_BROADCAST */ - nla_total_size(sizeof(__u64)) + + nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_MULTICAST */ - nla_total_size(sizeof(__u64)) + + nla_total_size_64bit(sizeof(__u64)) + nla_total_size(sizeof(struct ifla_vf_trust))); return size; } else @@ -881,9 +876,9 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */ + nla_total_size(IFALIASZ) /* IFLA_IFALIAS */ + nla_total_size(IFNAMSIZ) /* IFLA_QDISC */ - + nla_total_size(sizeof(struct rtnl_link_ifmap)) + + nla_total_size_64bit(sizeof(struct rtnl_link_ifmap)) + nla_total_size(sizeof(struct rtnl_link_stats)) - + nla_total_size(sizeof(struct rtnl_link_stats64)) + + nla_total_size_64bit(sizeof(struct rtnl_link_stats64)) + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */ + nla_total_size(MAX_ADDR_LEN) /* IFLA_BROADCAST */ + nla_total_size(4) /* IFLA_TXQLEN */ @@ -1054,25 +1049,23 @@ static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev) static noinline_for_stack int rtnl_fill_stats(struct sk_buff *skb, struct net_device *dev) { - const struct rtnl_link_stats64 *stats; - struct rtnl_link_stats64 temp; + struct rtnl_link_stats64 *sp; struct nlattr *attr; - stats = dev_get_stats(dev, &temp); - - attr = nla_reserve(skb, IFLA_STATS, - sizeof(struct rtnl_link_stats)); + attr = nla_reserve_64bit(skb, IFLA_STATS64, + sizeof(struct rtnl_link_stats64), IFLA_PAD); if (!attr) return -EMSGSIZE; - copy_rtnl_link_stats(nla_data(attr), stats); + sp = nla_data(attr); + dev_get_stats(dev, sp); - attr = nla_reserve(skb, IFLA_STATS64, - sizeof(struct rtnl_link_stats64)); + attr = nla_reserve(skb, IFLA_STATS, + sizeof(struct rtnl_link_stats)); if (!attr) return -EMSGSIZE; - copy_rtnl_link_stats64(nla_data(attr), stats); + copy_rtnl_link_stats(nla_data(attr), sp); return 0; } @@ -1160,18 +1153,18 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, nla_nest_cancel(skb, vfinfo); return -EMSGSIZE; } - if (nla_put_u64(skb, IFLA_VF_STATS_RX_PACKETS, - vf_stats.rx_packets) || - nla_put_u64(skb, IFLA_VF_STATS_TX_PACKETS, - vf_stats.tx_packets) || - nla_put_u64(skb, IFLA_VF_STATS_RX_BYTES, - vf_stats.rx_bytes) || - nla_put_u64(skb, IFLA_VF_STATS_TX_BYTES, - vf_stats.tx_bytes) || - nla_put_u64(skb, IFLA_VF_STATS_BROADCAST, - vf_stats.broadcast) || - nla_put_u64(skb, IFLA_VF_STATS_MULTICAST, - vf_stats.multicast)) + if (nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_PACKETS, + vf_stats.rx_packets, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_PACKETS, + vf_stats.tx_packets, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_BYTES, + vf_stats.rx_bytes, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_BYTES, + vf_stats.tx_bytes, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST, + vf_stats.broadcast, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST, + vf_stats.multicast, IFLA_VF_STATS_PAD)) return -EMSGSIZE; nla_nest_end(skb, vfstats); nla_nest_end(skb, vf); @@ -1190,7 +1183,7 @@ static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) map.dma = dev->dma; map.port = dev->if_port; - if (nla_put(skb, IFLA_MAP, sizeof(map), &map)) + if (nla_put_64bit(skb, IFLA_MAP, sizeof(map), &map, IFLA_PAD)) return -EMSGSIZE; return 0; @@ -3453,6 +3446,202 @@ out: return err; } +static bool stats_attr_valid(unsigned int mask, int attrid, int idxattr) +{ + return (mask & IFLA_STATS_FILTER_BIT(attrid)) && + (!idxattr || idxattr == attrid); +} + +static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, + int type, u32 pid, u32 seq, u32 change, + unsigned int flags, unsigned int filter_mask, + int *idxattr, int *prividx) +{ + struct if_stats_msg *ifsm; + struct nlmsghdr *nlh; + struct nlattr *attr; + int s_prividx = *prividx; + + ASSERT_RTNL(); + + nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifsm), flags); + if (!nlh) + return -EMSGSIZE; + + ifsm = nlmsg_data(nlh); + ifsm->ifindex = dev->ifindex; + ifsm->filter_mask = filter_mask; + + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_64, *idxattr)) { + struct rtnl_link_stats64 *sp; + + attr = nla_reserve_64bit(skb, IFLA_STATS_LINK_64, + sizeof(struct rtnl_link_stats64), + IFLA_STATS_UNSPEC); + if (!attr) + goto nla_put_failure; + + sp = nla_data(attr); + dev_get_stats(dev, sp); + } + + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS, *idxattr)) { + const struct rtnl_link_ops *ops = dev->rtnl_link_ops; + + if (ops && ops->fill_linkxstats) { + int err; + + *idxattr = IFLA_STATS_LINK_XSTATS; + attr = nla_nest_start(skb, + IFLA_STATS_LINK_XSTATS); + if (!attr) + goto nla_put_failure; + + err = ops->fill_linkxstats(skb, dev, prividx); + nla_nest_end(skb, attr); + if (err) + goto nla_put_failure; + *idxattr = 0; + } + } + + nlmsg_end(skb, nlh); + + return 0; + +nla_put_failure: + /* not a multi message or no progress mean a real error */ + if (!(flags & NLM_F_MULTI) || s_prividx == *prividx) + nlmsg_cancel(skb, nlh); + else + nlmsg_end(skb, nlh); + + return -EMSGSIZE; +} + +static const struct nla_policy ifla_stats_policy[IFLA_STATS_MAX + 1] = { + [IFLA_STATS_LINK_64] = { .len = sizeof(struct rtnl_link_stats64) }, +}; + +static size_t if_nlmsg_stats_size(const struct net_device *dev, + u32 filter_mask) +{ + size_t size = 0; + + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_64, 0)) + size += nla_total_size_64bit(sizeof(struct rtnl_link_stats64)); + + if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS, 0)) { + const struct rtnl_link_ops *ops = dev->rtnl_link_ops; + + if (ops && ops->get_linkxstats_size) { + size += nla_total_size(ops->get_linkxstats_size(dev)); + /* for IFLA_STATS_LINK_XSTATS */ + size += nla_total_size(0); + } + } + + return size; +} + +static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + struct net *net = sock_net(skb->sk); + struct net_device *dev = NULL; + int idxattr = 0, prividx = 0; + struct if_stats_msg *ifsm; + struct sk_buff *nskb; + u32 filter_mask; + int err; + + ifsm = nlmsg_data(nlh); + if (ifsm->ifindex > 0) + dev = __dev_get_by_index(net, ifsm->ifindex); + else + return -EINVAL; + + if (!dev) + return -ENODEV; + + filter_mask = ifsm->filter_mask; + if (!filter_mask) + return -EINVAL; + + nskb = nlmsg_new(if_nlmsg_stats_size(dev, filter_mask), GFP_KERNEL); + if (!nskb) + return -ENOBUFS; + + err = rtnl_fill_statsinfo(nskb, dev, RTM_NEWSTATS, + NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, + 0, filter_mask, &idxattr, &prividx); + if (err < 0) { + /* -EMSGSIZE implies BUG in if_nlmsg_stats_size */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(nskb); + } else { + err = rtnl_unicast(nskb, net, NETLINK_CB(skb).portid); + } + + return err; +} + +static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + int h, s_h, err, s_idx, s_idxattr, s_prividx; + struct net *net = sock_net(skb->sk); + unsigned int flags = NLM_F_MULTI; + struct if_stats_msg *ifsm; + struct hlist_head *head; + struct net_device *dev; + u32 filter_mask = 0; + int idx = 0; + + s_h = cb->args[0]; + s_idx = cb->args[1]; + s_idxattr = cb->args[2]; + s_prividx = cb->args[3]; + + cb->seq = net->dev_base_seq; + + ifsm = nlmsg_data(cb->nlh); + filter_mask = ifsm->filter_mask; + if (!filter_mask) + return -EINVAL; + + for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { + idx = 0; + head = &net->dev_index_head[h]; + hlist_for_each_entry(dev, head, index_hlist) { + if (idx < s_idx) + goto cont; + err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, 0, + flags, filter_mask, + &s_idxattr, &s_prividx); + /* If we ran out of room on the first message, + * we're in trouble + */ + WARN_ON((err == -EMSGSIZE) && (skb->len == 0)); + + if (err < 0) + goto out; + s_prividx = 0; + s_idxattr = 0; + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); +cont: + idx++; + } + } +out: + cb->args[3] = s_prividx; + cb->args[2] = s_idxattr; + cb->args[1] = idx; + cb->args[0] = h; + + return skb->len; +} + /* Process one rtnetlink message. */ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) @@ -3602,4 +3791,7 @@ void __init rtnetlink_init(void) rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, NULL); rtnl_register(PF_BRIDGE, RTM_DELLINK, rtnl_bridge_dellink, NULL, NULL); rtnl_register(PF_BRIDGE, RTM_SETLINK, rtnl_bridge_setlink, NULL, NULL); + + rtnl_register(PF_UNSPEC, RTM_GETSTATS, rtnl_stats_get, rtnl_stats_dump, + NULL); } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 59bf4d771..eb12d2161 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3058,11 +3058,11 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, struct sk_buff *frag_skb = head_skb; unsigned int offset = doffset; unsigned int tnl_hlen = skb_tnl_header_len(head_skb); + unsigned int partial_segs = 0; unsigned int headroom; - unsigned int len; + unsigned int len = head_skb->len; __be16 proto; - bool csum; - int sg = !!(features & NETIF_F_SG); + bool csum, sg; int nfrags = skb_shinfo(head_skb)->nr_frags; int err = -ENOMEM; int i = 0; @@ -3074,8 +3074,21 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, if (unlikely(!proto)) return ERR_PTR(-EINVAL); + sg = !!(features & NETIF_F_SG); csum = !!can_checksum_protocol(features, proto); + /* GSO partial only requires that we trim off any excess that + * doesn't fit into an MSS sized block, so take care of that + * now. + */ + if (sg && csum && (features & NETIF_F_GSO_PARTIAL)) { + partial_segs = len / mss; + if (partial_segs > 1) + mss *= partial_segs; + else + partial_segs = 0; + } + headroom = skb_headroom(head_skb); pos = skb_headlen(head_skb); @@ -3263,6 +3276,23 @@ perform_csum_check: */ segs->prev = tail; + /* Update GSO info on first skb in partial sequence. */ + if (partial_segs) { + int type = skb_shinfo(head_skb)->gso_type; + + /* Update type to add partial and then remove dodgy if set */ + type |= SKB_GSO_PARTIAL; + type &= ~SKB_GSO_DODGY; + + /* Update GSO info and prepare to start updating headers on + * our way back down the stack of protocols. + */ + skb_shinfo(segs)->gso_size = skb_shinfo(head_skb)->gso_size; + skb_shinfo(segs)->gso_segs = partial_segs; + skb_shinfo(segs)->gso_type = type; + SKB_GSO_CB(segs)->data_offset = skb_headroom(segs) + doffset; + } + /* Following permits correct backpressure, for protocols * using skb_set_owner_w(). * Idea is to tranfert ownership from head_skb to last segment. @@ -4577,3 +4607,239 @@ failure: return NULL; } EXPORT_SYMBOL(alloc_skb_with_frags); + +/* carve out the first off bytes from skb when off < headlen */ +static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, + const int headlen, gfp_t gfp_mask) +{ + int i; + int size = skb_end_offset(skb); + int new_hlen = headlen - off; + u8 *data; + + size = SKB_DATA_ALIGN(size); + + if (skb_pfmemalloc(skb)) + gfp_mask |= __GFP_MEMALLOC; + data = kmalloc_reserve(size + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), + gfp_mask, NUMA_NO_NODE, NULL); + if (!data) + return -ENOMEM; + + size = SKB_WITH_OVERHEAD(ksize(data)); + + /* Copy real data, and all frags */ + skb_copy_from_linear_data_offset(skb, off, data, new_hlen); + skb->len -= off; + + memcpy((struct skb_shared_info *)(data + size), + skb_shinfo(skb), + offsetof(struct skb_shared_info, + frags[skb_shinfo(skb)->nr_frags])); + if (skb_cloned(skb)) { + /* drop the old head gracefully */ + if (skb_orphan_frags(skb, gfp_mask)) { + kfree(data); + return -ENOMEM; + } + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + skb_frag_ref(skb, i); + if (skb_has_frag_list(skb)) + skb_clone_fraglist(skb); + skb_release_data(skb); + } else { + /* we can reuse existing recount- all we did was + * relocate values + */ + skb_free_head(skb); + } + + skb->head = data; + skb->data = data; + skb->head_frag = 0; +#ifdef NET_SKBUFF_DATA_USES_OFFSET + skb->end = size; +#else + skb->end = skb->head + size; +#endif + skb_set_tail_pointer(skb, skb_headlen(skb)); + skb_headers_offset_update(skb, 0); + skb->cloned = 0; + skb->hdr_len = 0; + skb->nohdr = 0; + atomic_set(&skb_shinfo(skb)->dataref, 1); + + return 0; +} + +static int pskb_carve(struct sk_buff *skb, const u32 off, gfp_t gfp); + +/* carve out the first eat bytes from skb's frag_list. May recurse into + * pskb_carve() + */ +static int pskb_carve_frag_list(struct sk_buff *skb, + struct skb_shared_info *shinfo, int eat, + gfp_t gfp_mask) +{ + struct sk_buff *list = shinfo->frag_list; + struct sk_buff *clone = NULL; + struct sk_buff *insp = NULL; + + do { + if (!list) { + pr_err("Not enough bytes to eat. Want %d\n", eat); + return -EFAULT; + } + if (list->len <= eat) { + /* Eaten as whole. */ + eat -= list->len; + list = list->next; + insp = list; + } else { + /* Eaten partially. */ + if (skb_shared(list)) { + clone = skb_clone(list, gfp_mask); + if (!clone) + return -ENOMEM; + insp = list->next; + list = clone; + } else { + /* This may be pulled without problems. */ + insp = list; + } + if (pskb_carve(list, eat, gfp_mask) < 0) { + kfree_skb(clone); + return -ENOMEM; + } + break; + } + } while (eat); + + /* Free pulled out fragments. */ + while ((list = shinfo->frag_list) != insp) { + shinfo->frag_list = list->next; + kfree_skb(list); + } + /* And insert new clone at head. */ + if (clone) { + clone->next = list; + shinfo->frag_list = clone; + } + return 0; +} + +/* carve off first len bytes from skb. Split line (off) is in the + * non-linear part of skb + */ +static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, + int pos, gfp_t gfp_mask) +{ + int i, k = 0; + int size = skb_end_offset(skb); + u8 *data; + const int nfrags = skb_shinfo(skb)->nr_frags; + struct skb_shared_info *shinfo; + + size = SKB_DATA_ALIGN(size); + + if (skb_pfmemalloc(skb)) + gfp_mask |= __GFP_MEMALLOC; + data = kmalloc_reserve(size + + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), + gfp_mask, NUMA_NO_NODE, NULL); + if (!data) + return -ENOMEM; + + size = SKB_WITH_OVERHEAD(ksize(data)); + + memcpy((struct skb_shared_info *)(data + size), + skb_shinfo(skb), offsetof(struct skb_shared_info, + frags[skb_shinfo(skb)->nr_frags])); + if (skb_orphan_frags(skb, gfp_mask)) { + kfree(data); + return -ENOMEM; + } + shinfo = (struct skb_shared_info *)(data + size); + for (i = 0; i < nfrags; i++) { + int fsize = skb_frag_size(&skb_shinfo(skb)->frags[i]); + + if (pos + fsize > off) { + shinfo->frags[k] = skb_shinfo(skb)->frags[i]; + + if (pos < off) { + /* Split frag. + * We have two variants in this case: + * 1. Move all the frag to the second + * part, if it is possible. F.e. + * this approach is mandatory for TUX, + * where splitting is expensive. + * 2. Split is accurately. We make this. + */ + shinfo->frags[0].page_offset += off - pos; + skb_frag_size_sub(&shinfo->frags[0], off - pos); + } + skb_frag_ref(skb, i); + k++; + } + pos += fsize; + } + shinfo->nr_frags = k; + if (skb_has_frag_list(skb)) + skb_clone_fraglist(skb); + + if (k == 0) { + /* split line is in frag list */ + pskb_carve_frag_list(skb, shinfo, off - pos, gfp_mask); + } + skb_release_data(skb); + + skb->head = data; + skb->head_frag = 0; + skb->data = data; +#ifdef NET_SKBUFF_DATA_USES_OFFSET + skb->end = size; +#else + skb->end = skb->head + size; +#endif + skb_reset_tail_pointer(skb); + skb_headers_offset_update(skb, 0); + skb->cloned = 0; + skb->hdr_len = 0; + skb->nohdr = 0; + skb->len -= off; + skb->data_len = skb->len; + atomic_set(&skb_shinfo(skb)->dataref, 1); + return 0; +} + +/* remove len bytes from the beginning of the skb */ +static int pskb_carve(struct sk_buff *skb, const u32 len, gfp_t gfp) +{ + int headlen = skb_headlen(skb); + + if (len < headlen) + return pskb_carve_inside_header(skb, len, headlen, gfp); + else + return pskb_carve_inside_nonlinear(skb, len, headlen, gfp); +} + +/* Extract to_copy bytes starting at off from skb, and return this in + * a new skb + */ +struct sk_buff *pskb_extract(struct sk_buff *skb, int off, + int to_copy, gfp_t gfp) +{ + struct sk_buff *clone = skb_clone(skb, gfp); + + if (!clone) + return NULL; + + if (pskb_carve(clone, off, gfp) < 0 || + pskb_trim(clone, to_copy)) { + kfree_skb(clone); + return NULL; + } + return clone; +} +EXPORT_SYMBOL(pskb_extract); diff --git a/net/core/sock.c b/net/core/sock.c index 7e73c26b6..25dab8b60 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -405,9 +405,8 @@ static void sock_disable_timestamp(struct sock *sk, unsigned long flags) } -int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { - int err; unsigned long flags; struct sk_buff_head *list = &sk->sk_receive_queue; @@ -417,10 +416,6 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) return -ENOMEM; } - err = sk_filter(sk, skb); - if (err) - return err; - if (!sk_rmem_schedule(sk, skb, skb->truesize)) { atomic_inc(&sk->sk_drops); return -ENOBUFS; @@ -443,13 +438,26 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) sk->sk_data_ready(sk); return 0; } +EXPORT_SYMBOL(__sock_queue_rcv_skb); + +int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) +{ + int err; + + err = sk_filter(sk, skb); + if (err) + return err; + + return __sock_queue_rcv_skb(sk, skb); +} EXPORT_SYMBOL(sock_queue_rcv_skb); -int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested) +int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, + const int nested, unsigned int trim_cap) { int rc = NET_RX_SUCCESS; - if (sk_filter(sk, skb)) + if (sk_filter_trim_cap(sk, skb, trim_cap)) goto discard_and_relse; skb->dev = NULL; @@ -485,7 +493,7 @@ discard_and_relse: kfree_skb(skb); goto out; } -EXPORT_SYMBOL(sk_receive_skb); +EXPORT_SYMBOL(__sk_receive_skb); struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) { @@ -835,7 +843,8 @@ set_rcvbuf: !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { if (sk->sk_protocol == IPPROTO_TCP && sk->sk_type == SOCK_STREAM) { - if (sk->sk_state != TCP_ESTABLISHED) { + if ((1 << sk->sk_state) & + (TCPF_CLOSE | TCPF_LISTEN)) { ret = -EINVAL; break; } @@ -1421,8 +1430,12 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, } EXPORT_SYMBOL(sk_alloc); -void sk_destruct(struct sock *sk) +/* Sockets having SOCK_RCU_FREE will call this function after one RCU + * grace period. This is the case for UDP sockets and TCP listeners. + */ +static void __sk_destruct(struct rcu_head *head) { + struct sock *sk = container_of(head, struct sock, sk_rcu); struct sk_filter *filter; if (sk->sk_destruct) @@ -1451,6 +1464,14 @@ void sk_destruct(struct sock *sk) sk_prot_free(sk->sk_prot_creator, sk); } +void sk_destruct(struct sock *sk) +{ + if (sock_flag(sk, SOCK_RCU_FREE)) + call_rcu(&sk->sk_rcu, __sk_destruct); + else + __sk_destruct(&sk->sk_rcu); +} + static void __sk_free(struct sock *sk) { if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt)) @@ -1515,6 +1536,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_dst_cache = NULL; newsk->sk_wmem_queued = 0; newsk->sk_forward_alloc = 0; + atomic_set(&newsk->sk_drops, 0); newsk->sk_send_head = NULL; newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; @@ -1634,6 +1656,17 @@ void sock_wfree(struct sk_buff *skb) } EXPORT_SYMBOL(sock_wfree); +/* This variant of sock_wfree() is used by TCP, + * since it sets SOCK_USE_WRITE_QUEUE. + */ +void __sock_wfree(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + + if (atomic_sub_and_test(skb->truesize, &sk->sk_wmem_alloc)) + __sk_free(sk); +} + void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) { skb_orphan(skb); @@ -1656,8 +1689,21 @@ void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) } EXPORT_SYMBOL(skb_set_owner_w); +/* This helper is used by netem, as it can hold packets in its + * delay queue. We want to allow the owner socket to send more + * packets, as if they were already TX completed by a typical driver. + * But we also want to keep skb->sk set because some packet schedulers + * rely on it (sch_fq for example). So we set skb->truesize to a small + * amount (1) and decrease sk_wmem_alloc accordingly. + */ void skb_orphan_partial(struct sk_buff *skb) { + /* If this skb is a TCP pure ACK or already went here, + * we have nothing to do. 2 is already a very small truesize. + */ + if (skb->truesize <= 2) + return; + /* TCP stack sets skb->ooo_okay based on sk_wmem_alloc, * so we do not completely orphan skb, but transfert all * accounted bytes but one, to avoid unexpected reorders. @@ -1869,27 +1915,55 @@ struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, } EXPORT_SYMBOL(sock_alloc_send_skb); +int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, + struct sockcm_cookie *sockc) +{ + u32 tsflags; + + switch (cmsg->cmsg_type) { + case SO_MARK: + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) + return -EPERM; + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) + return -EINVAL; + sockc->mark = *(u32 *)CMSG_DATA(cmsg); + break; + case SO_TIMESTAMPING: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) + return -EINVAL; + + tsflags = *(u32 *)CMSG_DATA(cmsg); + if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK) + return -EINVAL; + + sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK; + sockc->tsflags |= tsflags; + break; + /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */ + case SCM_RIGHTS: + case SCM_CREDENTIALS: + break; + default: + return -EINVAL; + } + return 0; +} +EXPORT_SYMBOL(__sock_cmsg_send); + int sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct sockcm_cookie *sockc) { struct cmsghdr *cmsg; + int ret; for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; if (cmsg->cmsg_level != SOL_SOCKET) continue; - switch (cmsg->cmsg_type) { - case SO_MARK: - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) - return -EPERM; - if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) - return -EINVAL; - sockc->mark = *(u32 *)CMSG_DATA(cmsg); - break; - default: - return -EINVAL; - } + ret = __sock_cmsg_send(sk, msg, cmsg, sockc); + if (ret) + return ret; } return 0; } @@ -1974,33 +2048,27 @@ static void __release_sock(struct sock *sk) __releases(&sk->sk_lock.slock) __acquires(&sk->sk_lock.slock) { - struct sk_buff *skb = sk->sk_backlog.head; + struct sk_buff *skb, *next; - do { + while ((skb = sk->sk_backlog.head) != NULL) { sk->sk_backlog.head = sk->sk_backlog.tail = NULL; - bh_unlock_sock(sk); - do { - struct sk_buff *next = skb->next; + spin_unlock_bh(&sk->sk_lock.slock); + do { + next = skb->next; prefetch(next); WARN_ON_ONCE(skb_dst_is_noref(skb)); skb->next = NULL; sk_backlog_rcv(sk, skb); - /* - * We are in process context here with softirqs - * disabled, use cond_resched_softirq() to preempt. - * This is safe to do because we've taken the backlog - * queue private: - */ - cond_resched_softirq(); + cond_resched(); skb = next; } while (skb != NULL); - bh_lock_sock(sk); - } while ((skb = sk->sk_backlog.head) != NULL); + spin_lock_bh(&sk->sk_lock.slock); + } /* * Doing the zeroing here guarantee we can not loop forever @@ -2009,6 +2077,13 @@ static void __release_sock(struct sock *sk) sk->sk_backlog.len = 0; } +void __sk_flush_backlog(struct sock *sk) +{ + spin_lock_bh(&sk->sk_lock.slock); + __release_sock(sk); + spin_unlock_bh(&sk->sk_lock.slock); +} + /** * sk_wait_data - wait for data to arrive at sk_receive_queue * @sk: sock to wait on @@ -2145,6 +2220,15 @@ void __sk_mem_reclaim(struct sock *sk, int amount) } EXPORT_SYMBOL(__sk_mem_reclaim); +int sk_set_peek_off(struct sock *sk, int val) +{ + if (val < 0) + return -EINVAL; + + sk->sk_peek_off = val; + return 0; +} +EXPORT_SYMBOL_GPL(sk_set_peek_off); /* * Set of default routines for initialising struct proto_ops when @@ -2432,11 +2516,6 @@ EXPORT_SYMBOL(lock_sock_nested); void release_sock(struct sock *sk) { - /* - * The sk_lock has mutex_unlock() semantics: - */ - mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_); - spin_lock_bh(&sk->sk_lock.slock); if (sk->sk_backlog.tail) __release_sock(sk); diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index a996ce8c8..6b10573cc 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -67,6 +67,7 @@ int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attrtype) mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued; mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len; + mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); return nla_put(skb, attrtype, sizeof(mem), &mem); } @@ -119,7 +120,7 @@ static size_t sock_diag_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct inet_diag_msg) + nla_total_size(sizeof(u8)) /* INET_DIAG_PROTOCOL */ - + nla_total_size(sizeof(struct tcp_info))); /* INET_DIAG_INFO */ + + nla_total_size_64bit(sizeof(struct tcp_info))); /* INET_DIAG_INFO */ } static void sock_diag_broadcast_destroy_work(struct work_struct *work) diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index a6beb7b6a..0df2aa652 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -294,6 +294,15 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, +# ifdef CONFIG_HAVE_EBPF_JIT + { + .procname = "bpf_jit_harden", + .data = &bpf_jit_harden, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec, + }, +# endif #endif { .procname = "netdev_tstamp_prequeue", diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index b0e28d24e..0c55ffb85 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -198,9 +198,9 @@ struct dccp_mib { }; DECLARE_SNMP_STAT(struct dccp_mib, dccp_statistics); -#define DCCP_INC_STATS(field) SNMP_INC_STATS(dccp_statistics, field) -#define DCCP_INC_STATS_BH(field) SNMP_INC_STATS_BH(dccp_statistics, field) -#define DCCP_DEC_STATS(field) SNMP_DEC_STATS(dccp_statistics, field) +#define DCCP_INC_STATS(field) SNMP_INC_STATS(dccp_statistics, field) +#define __DCCP_INC_STATS(field) __SNMP_INC_STATS(dccp_statistics, field) +#define DCCP_DEC_STATS(field) SNMP_DEC_STATS(dccp_statistics, field) /* * Checksumming routines diff --git a/net/dccp/input.c b/net/dccp/input.c index 3bd14e885..ba347184b 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -359,7 +359,7 @@ send_sync: goto discard; } - DCCP_INC_STATS_BH(DCCP_MIB_INERRS); + DCCP_INC_STATS(DCCP_MIB_INERRS); discard: __kfree_skb(skb); return 0; diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 9c67a961b..345a3aeb8 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -62,7 +62,7 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) nexthop = daddr = usin->sin_addr.s_addr; inet_opt = rcu_dereference_protected(inet->inet_opt, - sock_owned_by_user(sk)); + lockdep_sock_is_held(sk)); if (inet_opt != NULL && inet_opt->opt.srr) { if (daddr == 0) return -EINVAL; @@ -205,7 +205,7 @@ void dccp_req_err(struct sock *sk, u64 seq) * socket here. */ if (!between48(seq, dccp_rsk(req)->dreq_iss, dccp_rsk(req)->dreq_gss)) { - NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); + __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); } else { /* * Still in RESPOND, just remove it silently. @@ -247,7 +247,7 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info) if (skb->len < offset + sizeof(*dh) || skb->len < offset + __dccp_basic_hdr_len(dh)) { - ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); + __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return; } @@ -256,7 +256,7 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info) iph->saddr, ntohs(dh->dccph_sport), inet_iif(skb)); if (!sk) { - ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); + __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return; } @@ -273,7 +273,7 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info) * servers this needs to be solved differently. */ if (sock_owned_by_user(sk)) - NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); + __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); if (sk->sk_state == DCCP_CLOSED) goto out; @@ -281,7 +281,7 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info) dp = dccp_sk(sk); if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_LISTEN) && !between48(seq, dp->dccps_awl, dp->dccps_awh)) { - NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); + __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); goto out; } @@ -318,7 +318,7 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info) case DCCP_REQUESTING: case DCCP_RESPOND: if (!sock_owned_by_user(sk)) { - DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS); + __DCCP_INC_STATS(DCCP_MIB_ATTEMPTFAILS); sk->sk_err = err; sk->sk_error_report(sk); @@ -431,11 +431,11 @@ struct sock *dccp_v4_request_recv_sock(const struct sock *sk, return newsk; exit_overflow: - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); exit_nonewsk: dst_release(dst); exit: - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS); return NULL; put_and_exit: inet_csk_prepare_forced_close(newsk); @@ -462,7 +462,7 @@ static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk, security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); rt = ip_route_output_flow(net, &fl4, sk); if (IS_ERR(rt)) { - IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); + IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); return NULL; } @@ -527,17 +527,19 @@ static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb) rxiph->daddr); skb_dst_set(skb, dst_clone(dst)); + local_bh_disable(); bh_lock_sock(ctl_sk); err = ip_build_and_send_pkt(skb, ctl_sk, rxiph->daddr, rxiph->saddr, NULL); bh_unlock_sock(ctl_sk); if (net_xmit_eval(err) == 0) { - DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS); - DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS); + __DCCP_INC_STATS(DCCP_MIB_OUTSEGS); + __DCCP_INC_STATS(DCCP_MIB_OUTRSTS); } + local_bh_enable(); out: - dst_release(dst); + dst_release(dst); } static void dccp_v4_reqsk_destructor(struct request_sock *req) @@ -637,7 +639,7 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) drop_and_free: reqsk_free(req); drop: - DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS); + __DCCP_INC_STATS(DCCP_MIB_ATTEMPTFAILS); return -1; } EXPORT_SYMBOL_GPL(dccp_v4_conn_request); @@ -764,6 +766,7 @@ static int dccp_v4_rcv(struct sk_buff *skb) { const struct dccp_hdr *dh; const struct iphdr *iph; + bool refcounted; struct sock *sk; int min_cov; @@ -801,7 +804,7 @@ static int dccp_v4_rcv(struct sk_buff *skb) lookup: sk = __inet_lookup_skb(&dccp_hashinfo, skb, __dccp_hdr_len(dh), - dh->dccph_sport, dh->dccph_dport); + dh->dccph_sport, dh->dccph_dport, &refcounted); if (!sk) { dccp_pr_debug("failed to look up flow ID in table and " "get corresponding socket\n"); @@ -830,6 +833,7 @@ lookup: goto lookup; } sock_hold(sk); + refcounted = true; nsk = dccp_check_req(sk, skb, req); if (!nsk) { reqsk_put(req); @@ -864,7 +868,7 @@ lookup: goto discard_and_relse; nf_reset(skb); - return sk_receive_skb(sk, skb, 1); + return __sk_receive_skb(sk, skb, 1, dh->dccph_doff * 4); no_dccp_socket: if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) @@ -886,7 +890,8 @@ discard_it: return 0; discard_and_relse: - sock_put(sk); + if (refcounted) + sock_put(sk); goto discard_it; } diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 4663a01d5..3ff137d94 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -80,8 +80,8 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (skb->len < offset + sizeof(*dh) || skb->len < offset + __dccp_basic_hdr_len(dh)) { - ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev), - ICMP6_MIB_INERRORS); + __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), + ICMP6_MIB_INERRORS); return; } @@ -91,8 +91,8 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, inet6_iif(skb)); if (!sk) { - ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev), - ICMP6_MIB_INERRORS); + __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), + ICMP6_MIB_INERRORS); return; } @@ -106,7 +106,7 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, bh_lock_sock(sk); if (sock_owned_by_user(sk)) - NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); + __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); if (sk->sk_state == DCCP_CLOSED) goto out; @@ -114,7 +114,7 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, dp = dccp_sk(sk); if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_LISTEN) && !between48(seq, dp->dccps_awl, dp->dccps_awh)) { - NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); + __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); goto out; } @@ -156,7 +156,7 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, case DCCP_RESPOND: /* Cannot happen. It can, it SYNs are crossed. --ANK */ if (!sock_owned_by_user(sk)) { - DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS); + __DCCP_INC_STATS(DCCP_MIB_ATTEMPTFAILS); sk->sk_err = err; /* * Wake people up to see the error @@ -277,8 +277,8 @@ static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb) if (!IS_ERR(dst)) { skb_dst_set(skb, dst); ip6_xmit(ctl_sk, skb, &fl6, NULL, 0); - DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS); - DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS); + DCCP_INC_STATS(DCCP_MIB_OUTSEGS); + DCCP_INC_STATS(DCCP_MIB_OUTRSTS); return; } @@ -378,7 +378,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) drop_and_free: reqsk_free(req); drop: - DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS); + __DCCP_INC_STATS(DCCP_MIB_ATTEMPTFAILS); return -1; } @@ -527,11 +527,11 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk, return newsk; out_overflow: - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); out_nonewsk: dst_release(dst); out: - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS); return NULL; } @@ -642,6 +642,7 @@ discard: static int dccp_v6_rcv(struct sk_buff *skb) { const struct dccp_hdr *dh; + bool refcounted; struct sock *sk; int min_cov; @@ -670,7 +671,7 @@ static int dccp_v6_rcv(struct sk_buff *skb) lookup: sk = __inet6_lookup_skb(&dccp_hashinfo, skb, __dccp_hdr_len(dh), dh->dccph_sport, dh->dccph_dport, - inet6_iif(skb)); + inet6_iif(skb), &refcounted); if (!sk) { dccp_pr_debug("failed to look up flow ID in table and " "get corresponding socket\n"); @@ -699,6 +700,7 @@ lookup: goto lookup; } sock_hold(sk); + refcounted = true; nsk = dccp_check_req(sk, skb, req); if (!nsk) { reqsk_put(req); @@ -730,7 +732,7 @@ lookup: if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; - return sk_receive_skb(sk, skb, 1) ? -1 : 0; + return __sk_receive_skb(sk, skb, 1, dh->dccph_doff * 4) ? -1 : 0; no_dccp_socket: if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) @@ -752,7 +754,8 @@ discard_it: return 0; discard_and_relse: - sock_put(sk); + if (refcounted) + sock_put(sk); goto discard_it; } @@ -865,7 +868,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, fl6.fl6_sport = inet->inet_sport; security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); - opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk)); + opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); final_p = fl6_update_dst(&fl6, opt, &final); dst = ip6_dst_lookup_flow(sk, &fl6, final_p); diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 1994f8af6..53eddf99e 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -127,7 +127,7 @@ struct sock *dccp_create_openreq_child(const struct sock *sk, } dccp_init_xmit_timers(newsk); - DCCP_INC_STATS_BH(DCCP_MIB_PASSIVEOPENS); + __DCCP_INC_STATS(DCCP_MIB_PASSIVEOPENS); } return newsk; } diff --git a/net/dccp/options.c b/net/dccp/options.c index 9bce31886..74d29c56c 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -253,7 +253,7 @@ out_nonsensical_length: return 0; out_invalid_option: - DCCP_INC_STATS_BH(DCCP_MIB_INVALIDOPT); + DCCP_INC_STATS(DCCP_MIB_INVALIDOPT); rc = DCCP_RESET_CODE_OPTION_ERROR; out_featneg_failed: DCCP_WARN("DCCP(%p): Option %d (len=%d) error=%u\n", sk, opt, len, rc); diff --git a/net/dccp/timer.c b/net/dccp/timer.c index 3ef7acef3..3a2c34027 100644 --- a/net/dccp/timer.c +++ b/net/dccp/timer.c @@ -28,7 +28,7 @@ static void dccp_write_err(struct sock *sk) dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED); dccp_done(sk); - DCCP_INC_STATS_BH(DCCP_MIB_ABORTONTIMEOUT); + __DCCP_INC_STATS(DCCP_MIB_ABORTONTIMEOUT); } /* A write timeout has occurred. Process the after effects. */ @@ -100,7 +100,7 @@ static void dccp_retransmit_timer(struct sock *sk) * total number of retransmissions of clones of original packets. */ if (icsk->icsk_retransmits == 0) - DCCP_INC_STATS_BH(DCCP_MIB_TIMEOUTS); + __DCCP_INC_STATS(DCCP_MIB_TIMEOUTS); if (dccp_retransmit_skb(sk) != 0) { /* @@ -179,7 +179,7 @@ static void dccp_delack_timer(unsigned long data) if (sock_owned_by_user(sk)) { /* Try again later. */ icsk->icsk_ack.blocked = 1; - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN); goto out; @@ -209,7 +209,7 @@ static void dccp_delack_timer(unsigned long data) icsk->icsk_ack.ato = TCP_ATO_MIN; } dccp_send_ack(sk); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKS); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS); } out: bh_unlock_sock(sk); diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c index df4803437..a796fc7cb 100644 --- a/net/decnet/dn_fib.c +++ b/net/decnet/dn_fib.c @@ -41,6 +41,7 @@ #include <net/dn_fib.h> #include <net/dn_neigh.h> #include <net/dn_dev.h> +#include <net/nexthop.h> #define RT_MIN_TABLE 1 @@ -150,14 +151,13 @@ static int dn_fib_count_nhs(const struct nlattr *attr) struct rtnexthop *nhp = nla_data(attr); int nhs = 0, nhlen = nla_len(attr); - while(nhlen >= (int)sizeof(struct rtnexthop)) { - if ((nhlen -= nhp->rtnh_len) < 0) - return 0; + while (rtnh_ok(nhp, nhlen)) { nhs++; - nhp = RTNH_NEXT(nhp); + nhp = rtnh_next(nhp, &nhlen); } - return nhs; + /* leftover implies invalid nexthop configuration, discard it */ + return nhlen > 0 ? 0 : nhs; } static int dn_fib_get_nhs(struct dn_fib_info *fi, const struct nlattr *attr, @@ -167,21 +167,24 @@ static int dn_fib_get_nhs(struct dn_fib_info *fi, const struct nlattr *attr, int nhlen = nla_len(attr); change_nexthops(fi) { - int attrlen = nhlen - sizeof(struct rtnexthop); - if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0) + int attrlen; + + if (!rtnh_ok(nhp, nhlen)) return -EINVAL; nh->nh_flags = (r->rtm_flags&~0xFF) | nhp->rtnh_flags; nh->nh_oif = nhp->rtnh_ifindex; nh->nh_weight = nhp->rtnh_hops + 1; - if (attrlen) { + attrlen = rtnh_attrlen(nhp); + if (attrlen > 0) { struct nlattr *gw_attr; gw_attr = nla_find((struct nlattr *) (nhp + 1), attrlen, RTA_GATEWAY); nh->nh_gw = gw_attr ? nla_get_le16(gw_attr) : 0; } - nhp = RTNH_NEXT(nhp); + + nhp = rtnh_next(nhp, &nhlen); } endfor_nexthops(fi); return 0; diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c index c79b85eb4..8737412c7 100644 --- a/net/dns_resolver/dns_key.c +++ b/net/dns_resolver/dns_key.c @@ -281,7 +281,7 @@ static int __init init_dns_resolver(void) GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ, - KEY_ALLOC_NOT_IN_QUOTA, NULL); + KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto failed_put_cred; diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index c28c47463..eff5dfc2e 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -51,11 +51,12 @@ void unregister_switch_driver(struct dsa_switch_driver *drv) EXPORT_SYMBOL_GPL(unregister_switch_driver); static struct dsa_switch_driver * -dsa_switch_probe(struct device *host_dev, int sw_addr, char **_name) +dsa_switch_probe(struct device *parent, struct device *host_dev, int sw_addr, + const char **_name, void **priv) { struct dsa_switch_driver *ret; struct list_head *list; - char *name; + const char *name; ret = NULL; name = NULL; @@ -66,7 +67,7 @@ dsa_switch_probe(struct device *host_dev, int sw_addr, char **_name) drv = list_entry(list, struct dsa_switch_driver, list); - name = drv->probe(host_dev, sw_addr); + name = drv->probe(parent, host_dev, sw_addr, priv); if (name != NULL) { ret = drv; break; @@ -181,7 +182,7 @@ __ATTRIBUTE_GROUPS(dsa_hwmon); /* basic switch operations **************************************************/ static int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct net_device *master) { - struct dsa_chip_data *cd = ds->pd; + struct dsa_chip_data *cd = ds->cd; struct device_node *port_dn; struct phy_device *phydev; int ret, port, mode; @@ -218,7 +219,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) { struct dsa_switch_driver *drv = ds->drv; struct dsa_switch_tree *dst = ds->dst; - struct dsa_chip_data *pd = ds->pd; + struct dsa_chip_data *cd = ds->cd; bool valid_name_found = false; int index = ds->index; int i, ret; @@ -229,7 +230,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) for (i = 0; i < DSA_MAX_PORTS; i++) { char *name; - name = pd->port_names[i]; + name = cd->port_names[i]; if (name == NULL) continue; @@ -245,7 +246,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) } else if (!strcmp(name, "dsa")) { ds->dsa_port_mask |= 1 << i; } else { - ds->phys_port_mask |= 1 << i; + ds->enabled_port_mask |= 1 << i; } valid_name_found = true; } @@ -258,7 +259,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) /* Make the built-in MII bus mask match the number of ports, * switch drivers can override this later */ - ds->phys_mii_mask = ds->phys_port_mask; + ds->phys_mii_mask = ds->enabled_port_mask; /* * If the CPU connects to this switch, set the switch tree @@ -266,7 +267,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) * switch. */ if (dst->cpu_switch == index) { - switch (ds->tag_protocol) { + switch (drv->tag_protocol) { #ifdef CONFIG_NET_DSA_TAG_DSA case DSA_TAG_PROTO_DSA: dst->rcv = dsa_netdev_ops.rcv; @@ -294,7 +295,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) goto out; } - dst->tag_protocol = ds->tag_protocol; + dst->tag_protocol = drv->tag_protocol; } /* @@ -324,13 +325,13 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) * Create network devices for physical switch ports. */ for (i = 0; i < DSA_MAX_PORTS; i++) { - if (!(ds->phys_port_mask & (1 << i))) + if (!(ds->enabled_port_mask & (1 << i))) continue; - ret = dsa_slave_create(ds, parent, i, pd->port_names[i]); + ret = dsa_slave_create(ds, parent, i, cd->port_names[i]); if (ret < 0) { netdev_err(dst->master_netdev, "[%d]: can't create dsa slave device for port %d(%s): %d\n", - index, i, pd->port_names[i], ret); + index, i, cd->port_names[i], ret); ret = 0; } } @@ -378,16 +379,17 @@ static struct dsa_switch * dsa_switch_setup(struct dsa_switch_tree *dst, int index, struct device *parent, struct device *host_dev) { - struct dsa_chip_data *pd = dst->pd->chip + index; + struct dsa_chip_data *cd = dst->pd->chip + index; struct dsa_switch_driver *drv; struct dsa_switch *ds; int ret; - char *name; + const char *name; + void *priv; /* * Probe for switch model. */ - drv = dsa_switch_probe(host_dev, pd->sw_addr, &name); + drv = dsa_switch_probe(parent, host_dev, cd->sw_addr, &name, &priv); if (drv == NULL) { netdev_err(dst->master_netdev, "[%d]: could not detect attached switch\n", index); @@ -400,16 +402,16 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index, /* * Allocate and initialise switch state. */ - ds = devm_kzalloc(parent, sizeof(*ds) + drv->priv_size, GFP_KERNEL); + ds = devm_kzalloc(parent, sizeof(*ds), GFP_KERNEL); if (ds == NULL) return ERR_PTR(-ENOMEM); ds->dst = dst; ds->index = index; - ds->pd = pd; + ds->cd = cd; ds->drv = drv; - ds->tag_protocol = drv->tag_protocol; - ds->master_dev = host_dev; + ds->priv = priv; + ds->dev = parent; ret = dsa_switch_setup_one(ds, parent); if (ret) @@ -422,7 +424,7 @@ static void dsa_switch_destroy(struct dsa_switch *ds) { struct device_node *port_dn; struct phy_device *phydev; - struct dsa_chip_data *cd = ds->pd; + struct dsa_chip_data *cd = ds->cd; int port; #ifdef CONFIG_NET_DSA_HWMON @@ -432,7 +434,7 @@ static void dsa_switch_destroy(struct dsa_switch *ds) /* Destroy network devices for physical switch ports. */ for (port = 0; port < DSA_MAX_PORTS; port++) { - if (!(ds->phys_port_mask & (1 << port))) + if (!(ds->enabled_port_mask & (1 << port))) continue; if (!ds->ports[port]) @@ -657,9 +659,6 @@ static int dsa_of_probe(struct device *dev) const char *port_name; int chip_index, port_index; const unsigned int *sw_addr, *port_reg; - int gpio; - enum of_gpio_flags of_flags; - unsigned long flags; u32 eeprom_len; int ret; @@ -738,19 +737,6 @@ static int dsa_of_probe(struct device *dev) put_device(cd->host_dev); cd->host_dev = &mdio_bus_switch->dev; } - gpio = of_get_named_gpio_flags(child, "reset-gpios", 0, - &of_flags); - if (gpio_is_valid(gpio)) { - flags = (of_flags == OF_GPIO_ACTIVE_LOW ? - GPIOF_ACTIVE_LOW : 0); - ret = devm_gpio_request_one(dev, gpio, flags, - "switch_reset"); - if (ret) - goto out_free_chip; - - cd->reset = gpio_to_desc(gpio); - gpiod_direction_output(cd->reset, 0); - } for_each_available_child_of_node(child, port) { port_reg = of_get_property(port, "reg", NULL); diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 1d1a54687..dfa33779d 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -22,11 +22,6 @@ struct dsa_device_ops { }; struct dsa_slave_priv { - /* - * The linux network interface corresponding to this - * switch port. - */ - struct net_device *dev; struct sk_buff * (*xmit)(struct sk_buff *skb, struct net_device *dev); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index a575f0350..152436cda 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -50,8 +50,8 @@ void dsa_slave_mii_bus_init(struct dsa_switch *ds) ds->slave_mii_bus->read = dsa_slave_phy_read; ds->slave_mii_bus->write = dsa_slave_phy_write; snprintf(ds->slave_mii_bus->id, MII_BUS_ID_SIZE, "dsa-%d:%.2x", - ds->index, ds->pd->sw_addr); - ds->slave_mii_bus->parent = ds->master_dev; + ds->index, ds->cd->sw_addr); + ds->slave_mii_bus->parent = ds->dev; ds->slave_mii_bus->phy_mask = ~ds->phys_mii_mask; } @@ -104,8 +104,8 @@ static int dsa_slave_open(struct net_device *dev) goto clear_promisc; } - if (ds->drv->port_stp_update) - ds->drv->port_stp_update(ds, p->port, stp_state); + if (ds->drv->port_stp_state_set) + ds->drv->port_stp_state_set(ds, p->port, stp_state); if (p->phy) phy_start(p->phy); @@ -147,8 +147,8 @@ static int dsa_slave_close(struct net_device *dev) if (ds->drv->port_disable) ds->drv->port_disable(ds, p->port, p->phy); - if (ds->drv->port_stp_update) - ds->drv->port_stp_update(ds, p->port, BR_STATE_DISABLED); + if (ds->drv->port_stp_state_set) + ds->drv->port_stp_state_set(ds, p->port, BR_STATE_DISABLED); return 0; } @@ -207,21 +207,16 @@ static int dsa_slave_port_vlan_add(struct net_device *dev, { struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->parent; - int err; if (switchdev_trans_ph_prepare(trans)) { if (!ds->drv->port_vlan_prepare || !ds->drv->port_vlan_add) return -EOPNOTSUPP; - err = ds->drv->port_vlan_prepare(ds, p->port, vlan, trans); - if (err) - return err; - } else { - err = ds->drv->port_vlan_add(ds, p->port, vlan, trans); - if (err) - return err; + return ds->drv->port_vlan_prepare(ds, p->port, vlan, trans); } + ds->drv->port_vlan_add(ds, p->port, vlan, trans); + return 0; } @@ -256,17 +251,17 @@ static int dsa_slave_port_fdb_add(struct net_device *dev, { struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->parent; - int ret; - if (!ds->drv->port_fdb_prepare || !ds->drv->port_fdb_add) - return -EOPNOTSUPP; + if (switchdev_trans_ph_prepare(trans)) { + if (!ds->drv->port_fdb_prepare || !ds->drv->port_fdb_add) + return -EOPNOTSUPP; - if (switchdev_trans_ph_prepare(trans)) - ret = ds->drv->port_fdb_prepare(ds, p->port, fdb, trans); - else - ret = ds->drv->port_fdb_add(ds, p->port, fdb, trans); + return ds->drv->port_fdb_prepare(ds, p->port, fdb, trans); + } - return ret; + ds->drv->port_fdb_add(ds, p->port, fdb, trans); + + return 0; } static int dsa_slave_port_fdb_del(struct net_device *dev, @@ -305,16 +300,19 @@ static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) return -EOPNOTSUPP; } -static int dsa_slave_stp_update(struct net_device *dev, u8 state) +static int dsa_slave_stp_state_set(struct net_device *dev, + const struct switchdev_attr *attr, + struct switchdev_trans *trans) { struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->parent; - int ret = -EOPNOTSUPP; - if (ds->drv->port_stp_update) - ret = ds->drv->port_stp_update(ds, p->port, state); + if (switchdev_trans_ph_prepare(trans)) + return ds->drv->port_stp_state_set ? 0 : -EOPNOTSUPP; - return ret; + ds->drv->port_stp_state_set(ds, p->port, attr->u.stp_state); + + return 0; } static int dsa_slave_vlan_filtering(struct net_device *dev, @@ -339,17 +337,11 @@ static int dsa_slave_port_attr_set(struct net_device *dev, const struct switchdev_attr *attr, struct switchdev_trans *trans) { - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->parent; int ret; switch (attr->id) { case SWITCHDEV_ATTR_ID_PORT_STP_STATE: - if (switchdev_trans_ph_prepare(trans)) - ret = ds->drv->port_stp_update ? 0 : -EOPNOTSUPP; - else - ret = ds->drv->port_stp_update(ds, p->port, - attr->u.stp_state); + ret = dsa_slave_stp_state_set(dev, attr, trans); break; case SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING: ret = dsa_slave_vlan_filtering(dev, attr, trans); @@ -468,7 +460,8 @@ static void dsa_slave_bridge_port_leave(struct net_device *dev) /* Port left the bridge, put in BR_STATE_DISABLED by the bridge layer, * so allow it to be in BR_STATE_FORWARDING to be kept functional */ - dsa_slave_stp_update(dev, BR_STATE_FORWARDING); + if (ds->drv->port_stp_state_set) + ds->drv->port_stp_state_set(ds, p->port, BR_STATE_FORWARDING); } static int dsa_slave_port_attr_get(struct net_device *dev, @@ -622,8 +615,8 @@ static int dsa_slave_get_eeprom_len(struct net_device *dev) struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->parent; - if (ds->pd->eeprom_len) - return ds->pd->eeprom_len; + if (ds->cd->eeprom_len) + return ds->cd->eeprom_len; if (ds->drv->get_eeprom_len) return ds->drv->get_eeprom_len(ds); @@ -673,6 +666,78 @@ static void dsa_slave_get_strings(struct net_device *dev, } } +static void dsa_cpu_port_get_ethtool_stats(struct net_device *dev, + struct ethtool_stats *stats, + uint64_t *data) +{ + struct dsa_switch_tree *dst = dev->dsa_ptr; + struct dsa_switch *ds = dst->ds[0]; + s8 cpu_port = dst->cpu_port; + int count = 0; + + if (dst->master_ethtool_ops.get_sset_count) { + count = dst->master_ethtool_ops.get_sset_count(dev, + ETH_SS_STATS); + dst->master_ethtool_ops.get_ethtool_stats(dev, stats, data); + } + + if (ds->drv->get_ethtool_stats) + ds->drv->get_ethtool_stats(ds, cpu_port, data + count); +} + +static int dsa_cpu_port_get_sset_count(struct net_device *dev, int sset) +{ + struct dsa_switch_tree *dst = dev->dsa_ptr; + struct dsa_switch *ds = dst->ds[0]; + int count = 0; + + if (dst->master_ethtool_ops.get_sset_count) + count += dst->master_ethtool_ops.get_sset_count(dev, sset); + + if (sset == ETH_SS_STATS && ds->drv->get_sset_count) + count += ds->drv->get_sset_count(ds); + + return count; +} + +static void dsa_cpu_port_get_strings(struct net_device *dev, + uint32_t stringset, uint8_t *data) +{ + struct dsa_switch_tree *dst = dev->dsa_ptr; + struct dsa_switch *ds = dst->ds[0]; + s8 cpu_port = dst->cpu_port; + int len = ETH_GSTRING_LEN; + int mcount = 0, count; + unsigned int i; + uint8_t pfx[4]; + uint8_t *ndata; + + snprintf(pfx, sizeof(pfx), "p%.2d", cpu_port); + /* We do not want to be NULL-terminated, since this is a prefix */ + pfx[sizeof(pfx) - 1] = '_'; + + if (dst->master_ethtool_ops.get_sset_count) { + mcount = dst->master_ethtool_ops.get_sset_count(dev, + ETH_SS_STATS); + dst->master_ethtool_ops.get_strings(dev, stringset, data); + } + + if (stringset == ETH_SS_STATS && ds->drv->get_strings) { + ndata = data + mcount * len; + /* This function copies ETH_GSTRINGS_LEN bytes, we will mangle + * the output after to prepend our CPU port prefix we + * constructed earlier + */ + ds->drv->get_strings(ds, cpu_port, ndata); + count = ds->drv->get_sset_count(ds); + for (i = 0; i < count; i++) { + memmove(ndata + (i * len + sizeof(pfx)), + ndata + i * len, len - sizeof(pfx)); + memcpy(ndata + i * len, pfx, sizeof(pfx)); + } + } +} + static void dsa_slave_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *stats, uint64_t *data) @@ -680,10 +745,10 @@ static void dsa_slave_get_ethtool_stats(struct net_device *dev, struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->parent; - data[0] = p->dev->stats.tx_packets; - data[1] = p->dev->stats.tx_bytes; - data[2] = p->dev->stats.rx_packets; - data[3] = p->dev->stats.rx_bytes; + data[0] = dev->stats.tx_packets; + data[1] = dev->stats.tx_bytes; + data[2] = dev->stats.rx_packets; + data[3] = dev->stats.rx_bytes; if (ds->drv->get_ethtool_stats != NULL) ds->drv->get_ethtool_stats(ds, p->port, data + 4); } @@ -828,6 +893,8 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_eee = dsa_slave_get_eee, }; +static struct ethtool_ops dsa_cpu_port_ethtool_ops; + static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_open = dsa_slave_open, .ndo_stop = dsa_slave_close, @@ -932,7 +999,7 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p, struct net_device *slave_dev) { struct dsa_switch *ds = p->parent; - struct dsa_chip_data *cd = ds->pd; + struct dsa_chip_data *cd = ds->cd; struct device_node *phy_dn, *port_dn; bool phy_is_fixed = false; u32 phy_flags = 0; @@ -1045,6 +1112,7 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent, int port, char *name) { struct net_device *master = ds->dst->master_netdev; + struct dsa_switch_tree *dst = ds->dst; struct net_device *slave_dev; struct dsa_slave_priv *p; int ret; @@ -1056,6 +1124,19 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent, slave_dev->features = master->vlan_features; slave_dev->ethtool_ops = &dsa_slave_ethtool_ops; + if (master->ethtool_ops != &dsa_cpu_port_ethtool_ops) { + memcpy(&dst->master_ethtool_ops, master->ethtool_ops, + sizeof(struct ethtool_ops)); + memcpy(&dsa_cpu_port_ethtool_ops, &dst->master_ethtool_ops, + sizeof(struct ethtool_ops)); + dsa_cpu_port_ethtool_ops.get_sset_count = + dsa_cpu_port_get_sset_count; + dsa_cpu_port_ethtool_ops.get_ethtool_stats = + dsa_cpu_port_get_ethtool_stats; + dsa_cpu_port_ethtool_ops.get_strings = + dsa_cpu_port_get_strings; + master->ethtool_ops = &dsa_cpu_port_ethtool_ops; + } eth_hw_addr_inherit(slave_dev, master); slave_dev->priv_flags |= IFF_NO_QUEUE; slave_dev->netdev_ops = &dsa_slave_netdev_ops; @@ -1066,11 +1147,10 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent, NULL); SET_NETDEV_DEV(slave_dev, parent); - slave_dev->dev.of_node = ds->pd->port_dn[port]; + slave_dev->dev.of_node = ds->cd->port_dn[port]; slave_dev->vlan_features = master->vlan_features; p = netdev_priv(slave_dev); - p->dev = slave_dev; p->parent = ds; p->port = port; diff --git a/net/hsr/Kconfig b/net/hsr/Kconfig index 0d3d70905..4b683fd0a 100644 --- a/net/hsr/Kconfig +++ b/net/hsr/Kconfig @@ -18,8 +18,9 @@ config HSR earlier. This code is a "best effort" to comply with the HSR standard as - described in IEC 62439-3:2010 (HSRv0), but no compliancy tests have - been made. + described in IEC 62439-3:2010 (HSRv0) and IEC 62439-3:2012 (HSRv1), + but no compliancy tests have been made. Use iproute2 to select + the version you desire. You need to perform any and all necessary tests yourself before relying on this code in a safety critical system! diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index c7d1adca3..16737cd8d 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -90,7 +90,8 @@ static void hsr_check_announce(struct net_device *hsr_dev, hsr = netdev_priv(hsr_dev); - if ((hsr_dev->operstate == IF_OPER_UP) && (old_operstate != IF_OPER_UP)) { + if ((hsr_dev->operstate == IF_OPER_UP) + && (old_operstate != IF_OPER_UP)) { /* Went up */ hsr->announce_count = 0; hsr->announce_timer.expires = jiffies + @@ -250,31 +251,22 @@ static const struct header_ops hsr_header_ops = { .parse = eth_header_parse, }; - -/* HSR:2010 supervision frames should be padded so that the whole frame, - * including headers and FCS, is 64 bytes (without VLAN). - */ -static int hsr_pad(int size) -{ - const int min_size = ETH_ZLEN - HSR_HLEN - ETH_HLEN; - - if (size >= min_size) - return size; - return min_size; -} - -static void send_hsr_supervision_frame(struct hsr_port *master, u8 type) +static void send_hsr_supervision_frame(struct hsr_port *master, + u8 type, u8 hsrVer) { struct sk_buff *skb; int hlen, tlen; + struct hsr_tag *hsr_tag; struct hsr_sup_tag *hsr_stag; struct hsr_sup_payload *hsr_sp; unsigned long irqflags; hlen = LL_RESERVED_SPACE(master->dev); tlen = master->dev->needed_tailroom; - skb = alloc_skb(hsr_pad(sizeof(struct hsr_sup_payload)) + hlen + tlen, - GFP_ATOMIC); + skb = dev_alloc_skb( + sizeof(struct hsr_tag) + + sizeof(struct hsr_sup_tag) + + sizeof(struct hsr_sup_payload) + hlen + tlen); if (skb == NULL) return; @@ -282,32 +274,48 @@ static void send_hsr_supervision_frame(struct hsr_port *master, u8 type) skb_reserve(skb, hlen); skb->dev = master->dev; - skb->protocol = htons(ETH_P_PRP); + skb->protocol = htons(hsrVer ? ETH_P_HSR : ETH_P_PRP); skb->priority = TC_PRIO_CONTROL; - if (dev_hard_header(skb, skb->dev, ETH_P_PRP, + if (dev_hard_header(skb, skb->dev, (hsrVer ? ETH_P_HSR : ETH_P_PRP), master->hsr->sup_multicast_addr, skb->dev->dev_addr, skb->len) <= 0) goto out; skb_reset_mac_header(skb); - hsr_stag = (typeof(hsr_stag)) skb_put(skb, sizeof(*hsr_stag)); + if (hsrVer > 0) { + hsr_tag = (typeof(hsr_tag)) skb_put(skb, sizeof(struct hsr_tag)); + hsr_tag->encap_proto = htons(ETH_P_PRP); + set_hsr_tag_LSDU_size(hsr_tag, HSR_V1_SUP_LSDUSIZE); + } - set_hsr_stag_path(hsr_stag, 0xf); - set_hsr_stag_HSR_Ver(hsr_stag, 0); + hsr_stag = (typeof(hsr_stag)) skb_put(skb, sizeof(struct hsr_sup_tag)); + set_hsr_stag_path(hsr_stag, (hsrVer ? 0x0 : 0xf)); + set_hsr_stag_HSR_Ver(hsr_stag, hsrVer); + /* From HSRv1 on we have separate supervision sequence numbers. */ spin_lock_irqsave(&master->hsr->seqnr_lock, irqflags); - hsr_stag->sequence_nr = htons(master->hsr->sequence_nr); - master->hsr->sequence_nr++; + if (hsrVer > 0) { + hsr_stag->sequence_nr = htons(master->hsr->sup_sequence_nr); + hsr_tag->sequence_nr = htons(master->hsr->sequence_nr); + master->hsr->sup_sequence_nr++; + master->hsr->sequence_nr++; + } else { + hsr_stag->sequence_nr = htons(master->hsr->sequence_nr); + master->hsr->sequence_nr++; + } spin_unlock_irqrestore(&master->hsr->seqnr_lock, irqflags); hsr_stag->HSR_TLV_Type = type; - hsr_stag->HSR_TLV_Length = 12; + /* TODO: Why 12 in HSRv0? */ + hsr_stag->HSR_TLV_Length = hsrVer ? sizeof(struct hsr_sup_payload) : 12; /* Payload: MacAddressA */ - hsr_sp = (typeof(hsr_sp)) skb_put(skb, sizeof(*hsr_sp)); + hsr_sp = (typeof(hsr_sp)) skb_put(skb, sizeof(struct hsr_sup_payload)); ether_addr_copy(hsr_sp->MacAddressA, master->dev->dev_addr); + skb_put_padto(skb, ETH_ZLEN + HSR_HLEN); + hsr_forward_skb(skb, master); return; @@ -329,19 +337,20 @@ static void hsr_announce(unsigned long data) rcu_read_lock(); master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); - if (hsr->announce_count < 3) { - send_hsr_supervision_frame(master, HSR_TLV_ANNOUNCE); + if (hsr->announce_count < 3 && hsr->protVersion == 0) { + send_hsr_supervision_frame(master, HSR_TLV_ANNOUNCE, + hsr->protVersion); hsr->announce_count++; - } else { - send_hsr_supervision_frame(master, HSR_TLV_LIFE_CHECK); - } - if (hsr->announce_count < 3) hsr->announce_timer.expires = jiffies + msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL); - else + } else { + send_hsr_supervision_frame(master, HSR_TLV_LIFE_CHECK, + hsr->protVersion); + hsr->announce_timer.expires = jiffies + msecs_to_jiffies(HSR_LIFE_CHECK_INTERVAL); + } if (is_admin_up(master->dev)) add_timer(&hsr->announce_timer); @@ -428,7 +437,7 @@ static const unsigned char def_multicast_addr[ETH_ALEN] __aligned(2) = { }; int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], - unsigned char multicast_spec) + unsigned char multicast_spec, u8 protocol_version) { struct hsr_priv *hsr; struct hsr_port *port; @@ -450,18 +459,17 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], spin_lock_init(&hsr->seqnr_lock); /* Overflow soon to find bugs easier: */ hsr->sequence_nr = HSR_SEQNR_START; + hsr->sup_sequence_nr = HSR_SUP_SEQNR_START; - init_timer(&hsr->announce_timer); - hsr->announce_timer.function = hsr_announce; - hsr->announce_timer.data = (unsigned long) hsr; + setup_timer(&hsr->announce_timer, hsr_announce, (unsigned long)hsr); - init_timer(&hsr->prune_timer); - hsr->prune_timer.function = hsr_prune_nodes; - hsr->prune_timer.data = (unsigned long) hsr; + setup_timer(&hsr->prune_timer, hsr_prune_nodes, (unsigned long)hsr); ether_addr_copy(hsr->sup_multicast_addr, def_multicast_addr); hsr->sup_multicast_addr[ETH_ALEN - 1] = multicast_spec; + hsr->protVersion = protocol_version; + /* FIXME: should I modify the value of these? * * - hsr_dev->flags - i.e. @@ -490,8 +498,7 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], if (res) goto fail; - hsr->prune_timer.expires = jiffies + msecs_to_jiffies(PRUNE_PERIOD); - add_timer(&hsr->prune_timer); + mod_timer(&hsr->prune_timer, jiffies + msecs_to_jiffies(PRUNE_PERIOD)); return 0; diff --git a/net/hsr/hsr_device.h b/net/hsr/hsr_device.h index 108a5d59d..9975e31bb 100644 --- a/net/hsr/hsr_device.h +++ b/net/hsr/hsr_device.h @@ -17,7 +17,7 @@ void hsr_dev_setup(struct net_device *dev); int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], - unsigned char multicast_spec); + unsigned char multicast_spec, u8 protocol_version); void hsr_check_carrier_and_operstate(struct hsr_priv *hsr); bool is_hsr_master(struct net_device *dev); int hsr_get_max_mtu(struct hsr_priv *hsr); diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c index 7871ed6d3..5ee1d43f1 100644 --- a/net/hsr/hsr_forward.c +++ b/net/hsr/hsr_forward.c @@ -50,21 +50,40 @@ struct hsr_frame_info { */ static bool is_supervision_frame(struct hsr_priv *hsr, struct sk_buff *skb) { - struct hsr_ethhdr_sp *hdr; + struct ethhdr *ethHdr; + struct hsr_sup_tag *hsrSupTag; + struct hsrv1_ethhdr_sp *hsrV1Hdr; WARN_ON_ONCE(!skb_mac_header_was_set(skb)); - hdr = (struct hsr_ethhdr_sp *) skb_mac_header(skb); + ethHdr = (struct ethhdr *) skb_mac_header(skb); - if (!ether_addr_equal(hdr->ethhdr.h_dest, + /* Correct addr? */ + if (!ether_addr_equal(ethHdr->h_dest, hsr->sup_multicast_addr)) return false; - if (get_hsr_stag_path(&hdr->hsr_sup) != 0x0f) + /* Correct ether type?. */ + if (!(ethHdr->h_proto == htons(ETH_P_PRP) + || ethHdr->h_proto == htons(ETH_P_HSR))) return false; - if ((hdr->hsr_sup.HSR_TLV_Type != HSR_TLV_ANNOUNCE) && - (hdr->hsr_sup.HSR_TLV_Type != HSR_TLV_LIFE_CHECK)) + + /* Get the supervision header from correct location. */ + if (ethHdr->h_proto == htons(ETH_P_HSR)) { /* Okay HSRv1. */ + hsrV1Hdr = (struct hsrv1_ethhdr_sp *) skb_mac_header(skb); + if (hsrV1Hdr->hsr.encap_proto != htons(ETH_P_PRP)) + return false; + + hsrSupTag = &hsrV1Hdr->hsr_sup; + } else { + hsrSupTag = &((struct hsrv0_ethhdr_sp *) skb_mac_header(skb))->hsr_sup; + } + + if ((hsrSupTag->HSR_TLV_Type != HSR_TLV_ANNOUNCE) && + (hsrSupTag->HSR_TLV_Type != HSR_TLV_LIFE_CHECK)) return false; - if (hdr->hsr_sup.HSR_TLV_Length != 12) + if ((hsrSupTag->HSR_TLV_Length != 12) && + (hsrSupTag->HSR_TLV_Length != + sizeof(struct hsr_sup_payload))) return false; return true; @@ -110,7 +129,7 @@ static struct sk_buff *frame_get_stripped_skb(struct hsr_frame_info *frame, static void hsr_fill_tag(struct sk_buff *skb, struct hsr_frame_info *frame, - struct hsr_port *port) + struct hsr_port *port, u8 protoVersion) { struct hsr_ethhdr *hsr_ethhdr; int lane_id; @@ -131,7 +150,8 @@ static void hsr_fill_tag(struct sk_buff *skb, struct hsr_frame_info *frame, set_hsr_tag_LSDU_size(&hsr_ethhdr->hsr_tag, lsdu_size); hsr_ethhdr->hsr_tag.sequence_nr = htons(frame->sequence_nr); hsr_ethhdr->hsr_tag.encap_proto = hsr_ethhdr->ethhdr.h_proto; - hsr_ethhdr->ethhdr.h_proto = htons(ETH_P_PRP); + hsr_ethhdr->ethhdr.h_proto = htons(protoVersion ? + ETH_P_HSR : ETH_P_PRP); } static struct sk_buff *create_tagged_skb(struct sk_buff *skb_o, @@ -160,7 +180,7 @@ static struct sk_buff *create_tagged_skb(struct sk_buff *skb_o, memmove(dst, src, movelen); skb_reset_mac_header(skb); - hsr_fill_tag(skb, frame, port); + hsr_fill_tag(skb, frame, port, port->hsr->protVersion); return skb; } @@ -320,7 +340,8 @@ static int hsr_fill_frame_info(struct hsr_frame_info *frame, /* FIXME: */ WARN_ONCE(1, "HSR: VLAN not yet supported"); } - if (ethhdr->h_proto == htons(ETH_P_PRP)) { + if (ethhdr->h_proto == htons(ETH_P_PRP) + || ethhdr->h_proto == htons(ETH_P_HSR)) { frame->skb_std = NULL; frame->skb_hsr = skb; frame->sequence_nr = hsr_get_skb_sequence_nr(skb); diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index bace124d1..7ea925816 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -177,17 +177,17 @@ struct hsr_node *hsr_get_node(struct list_head *node_db, struct sk_buff *skb, return node; } - if (!is_sup) - return NULL; /* Only supervision frame may create node entry */ + /* Everyone may create a node entry, connected node to a HSR device. */ - if (ethhdr->h_proto == htons(ETH_P_PRP)) { + if (ethhdr->h_proto == htons(ETH_P_PRP) + || ethhdr->h_proto == htons(ETH_P_HSR)) { /* Use the existing sequence_nr from the tag as starting point * for filtering duplicate frames. */ seq_out = hsr_get_skb_sequence_nr(skb) - 1; } else { WARN_ONCE(1, "%s: Non-HSR frame\n", __func__); - seq_out = 0; + seq_out = HSR_SEQNR_START; } return hsr_add_node(node_db, ethhdr->h_source, seq_out); @@ -200,17 +200,25 @@ struct hsr_node *hsr_get_node(struct list_head *node_db, struct sk_buff *skb, void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr, struct hsr_port *port_rcv) { + struct ethhdr *ethhdr; struct hsr_node *node_real; struct hsr_sup_payload *hsr_sp; struct list_head *node_db; int i; - skb_pull(skb, sizeof(struct hsr_ethhdr_sp)); - hsr_sp = (struct hsr_sup_payload *) skb->data; + ethhdr = (struct ethhdr *) skb_mac_header(skb); - if (ether_addr_equal(eth_hdr(skb)->h_source, hsr_sp->MacAddressA)) - /* Not sent from MacAddressB of a PICS_SUBS capable node */ - goto done; + /* Leave the ethernet header. */ + skb_pull(skb, sizeof(struct ethhdr)); + + /* And leave the HSR tag. */ + if (ethhdr->h_proto == htons(ETH_P_HSR)) + skb_pull(skb, sizeof(struct hsr_tag)); + + /* And leave the HSR sup tag. */ + skb_pull(skb, sizeof(struct hsr_sup_tag)); + + hsr_sp = (struct hsr_sup_payload *) skb->data; /* Merge node_curr (registered on MacAddressB) into node_real */ node_db = &port_rcv->hsr->node_db; @@ -225,7 +233,7 @@ void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr, /* Node has already been merged */ goto done; - ether_addr_copy(node_real->MacAddressB, eth_hdr(skb)->h_source); + ether_addr_copy(node_real->MacAddressB, ethhdr->h_source); for (i = 0; i < HSR_PT_PORTS; i++) { if (!node_curr->time_in_stale[i] && time_after(node_curr->time_in[i], node_real->time_in[i])) { @@ -241,7 +249,7 @@ void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr, kfree_rcu(node_curr, rcu_head); done: - skb_push(skb, sizeof(struct hsr_ethhdr_sp)); + skb_push(skb, sizeof(struct hsrv1_ethhdr_sp)); } diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index 5a9c69962..9b9909e89 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -30,6 +30,7 @@ */ #define MAX_SLAVE_DIFF 3000 /* ms */ #define HSR_SEQNR_START (USHRT_MAX - 1024) +#define HSR_SUP_SEQNR_START (HSR_SEQNR_START / 2) /* How often shall we check for broken ring and remove node entries older than @@ -58,6 +59,8 @@ struct hsr_tag { #define HSR_HLEN 6 +#define HSR_V1_SUP_LSDUSIZE 52 + /* The helper functions below assumes that 'path' occupies the 4 most * significant bits of the 16-bit field shared by 'path' and 'LSDU_size' (or * equivalently, the 4 most significant bits of HSR tag byte 14). @@ -131,8 +134,14 @@ static inline void set_hsr_stag_HSR_Ver(struct hsr_sup_tag *hst, u16 HSR_Ver) set_hsr_tag_LSDU_size((struct hsr_tag *) hst, HSR_Ver); } -struct hsr_ethhdr_sp { +struct hsrv0_ethhdr_sp { + struct ethhdr ethhdr; + struct hsr_sup_tag hsr_sup; +} __packed; + +struct hsrv1_ethhdr_sp { struct ethhdr ethhdr; + struct hsr_tag hsr; struct hsr_sup_tag hsr_sup; } __packed; @@ -162,6 +171,8 @@ struct hsr_priv { struct timer_list prune_timer; int announce_count; u16 sequence_nr; + u16 sup_sequence_nr; /* For HSRv1 separate seq_nr for supervision */ + u8 protVersion; /* Indicate if HSRv0 or HSRv1. */ spinlock_t seqnr_lock; /* locking for sequence_nr */ unsigned char sup_multicast_addr[ETH_ALEN]; }; diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c index a2c7e4c0a..d4d1617f4 100644 --- a/net/hsr/hsr_netlink.c +++ b/net/hsr/hsr_netlink.c @@ -23,7 +23,8 @@ static const struct nla_policy hsr_policy[IFLA_HSR_MAX + 1] = { [IFLA_HSR_SLAVE1] = { .type = NLA_U32 }, [IFLA_HSR_SLAVE2] = { .type = NLA_U32 }, [IFLA_HSR_MULTICAST_SPEC] = { .type = NLA_U8 }, - [IFLA_HSR_SUPERVISION_ADDR] = { .type = NLA_BINARY, .len = ETH_ALEN }, + [IFLA_HSR_VERSION] = { .type = NLA_U8 }, + [IFLA_HSR_SUPERVISION_ADDR] = { .len = ETH_ALEN }, [IFLA_HSR_SEQ_NR] = { .type = NLA_U16 }, }; @@ -35,7 +36,7 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { struct net_device *link[2]; - unsigned char multicast_spec; + unsigned char multicast_spec, hsr_version; if (!data) { netdev_info(dev, "HSR: No slave devices specified\n"); @@ -62,7 +63,12 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev, else multicast_spec = nla_get_u8(data[IFLA_HSR_MULTICAST_SPEC]); - return hsr_dev_finalize(dev, link, multicast_spec); + if (!data[IFLA_HSR_VERSION]) + hsr_version = 0; + else + hsr_version = nla_get_u8(data[IFLA_HSR_VERSION]); + + return hsr_dev_finalize(dev, link, multicast_spec, hsr_version); } static int hsr_fill_info(struct sk_buff *skb, const struct net_device *dev) @@ -115,10 +121,9 @@ static struct rtnl_link_ops hsr_link_ops __read_mostly = { /* attribute policy */ -/* NLA_BINARY missing in libnl; use NLA_UNSPEC in userspace instead. */ static const struct nla_policy hsr_genl_policy[HSR_A_MAX + 1] = { - [HSR_A_NODE_ADDR] = { .type = NLA_BINARY, .len = ETH_ALEN }, - [HSR_A_NODE_ADDR_B] = { .type = NLA_BINARY, .len = ETH_ALEN }, + [HSR_A_NODE_ADDR] = { .len = ETH_ALEN }, + [HSR_A_NODE_ADDR_B] = { .len = ETH_ALEN }, [HSR_A_IFINDEX] = { .type = NLA_U32 }, [HSR_A_IF1_AGE] = { .type = NLA_U32 }, [HSR_A_IF2_AGE] = { .type = NLA_U32 }, diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c index 7d37366cc..f5b60388d 100644 --- a/net/hsr/hsr_slave.c +++ b/net/hsr/hsr_slave.c @@ -22,6 +22,7 @@ static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; struct hsr_port *port; + u16 protocol; if (!skb_mac_header_was_set(skb)) { WARN_ONCE(1, "%s: skb invalid", __func__); @@ -37,7 +38,8 @@ static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb) goto finish_consume; } - if (eth_hdr(skb)->h_proto != htons(ETH_P_PRP)) + protocol = eth_hdr(skb)->h_proto; + if (protocol != htons(ETH_P_PRP) && protocol != htons(ETH_P_HSR)) goto finish_pass; skb_push(skb, ETH_HLEN); diff --git a/net/ieee802154/6lowpan/6lowpan_i.h b/net/ieee802154/6lowpan/6lowpan_i.h index b4e17a7c0..5ac778962 100644 --- a/net/ieee802154/6lowpan/6lowpan_i.h +++ b/net/ieee802154/6lowpan/6lowpan_i.h @@ -41,24 +41,12 @@ static inline u32 ieee802154_addr_hash(const struct ieee802154_addr *a) return (((__force u64)a->extended_addr) >> 32) ^ (((__force u64)a->extended_addr) & 0xffffffff); case IEEE802154_ADDR_SHORT: - return (__force u32)(a->short_addr); + return (__force u32)(a->short_addr + (a->pan_id << 16)); default: return 0; } } -/* private device info */ -struct lowpan_dev_info { - struct net_device *wdev; /* wpan device ptr */ - u16 fragment_tag; -}; - -static inline struct -lowpan_dev_info *lowpan_dev_info(const struct net_device *dev) -{ - return (struct lowpan_dev_info *)lowpan_priv(dev)->priv; -} - int lowpan_frag_rcv(struct sk_buff *skb, const u8 frag_type); void lowpan_net_frag_exit(void); int lowpan_net_frag_init(void); diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index 0023c9048..dd085db85 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -148,7 +148,7 @@ static int lowpan_newlink(struct net *src_net, struct net_device *ldev, return -EBUSY; } - lowpan_dev_info(ldev)->wdev = wdev; + lowpan_802154_dev(ldev)->wdev = wdev; /* Set the lowpan hardware address to the wpan hardware address. */ memcpy(ldev->dev_addr, wdev->dev_addr, IEEE802154_ADDR_LEN); /* We need headroom for possible wpan_dev_hard_header call and tailroom @@ -173,7 +173,7 @@ static int lowpan_newlink(struct net *src_net, struct net_device *ldev, static void lowpan_dellink(struct net_device *ldev, struct list_head *head) { - struct net_device *wdev = lowpan_dev_info(ldev)->wdev; + struct net_device *wdev = lowpan_802154_dev(ldev)->wdev; ASSERT_RTNL(); @@ -184,7 +184,7 @@ static void lowpan_dellink(struct net_device *ldev, struct list_head *head) static struct rtnl_link_ops lowpan_link_ops __read_mostly = { .kind = "lowpan", - .priv_size = LOWPAN_PRIV_SIZE(sizeof(struct lowpan_dev_info)), + .priv_size = LOWPAN_PRIV_SIZE(sizeof(struct lowpan_802154_dev)), .setup = lowpan_setup, .newlink = lowpan_newlink, .dellink = lowpan_dellink, diff --git a/net/ieee802154/6lowpan/tx.c b/net/ieee802154/6lowpan/tx.c index d4353face..e459afd16 100644 --- a/net/ieee802154/6lowpan/tx.c +++ b/net/ieee802154/6lowpan/tx.c @@ -84,7 +84,7 @@ static struct sk_buff* lowpan_alloc_frag(struct sk_buff *skb, int size, const struct ieee802154_hdr *master_hdr, bool frag1) { - struct net_device *wdev = lowpan_dev_info(skb->dev)->wdev; + struct net_device *wdev = lowpan_802154_dev(skb->dev)->wdev; struct sk_buff *frag; int rc; @@ -148,8 +148,8 @@ lowpan_xmit_fragmented(struct sk_buff *skb, struct net_device *ldev, int frag_cap, frag_len, payload_cap, rc; int skb_unprocessed, skb_offset; - frag_tag = htons(lowpan_dev_info(ldev)->fragment_tag); - lowpan_dev_info(ldev)->fragment_tag++; + frag_tag = htons(lowpan_802154_dev(ldev)->fragment_tag); + lowpan_802154_dev(ldev)->fragment_tag++; frag_hdr[0] = LOWPAN_DISPATCH_FRAG1 | ((dgram_size >> 8) & 0x07); frag_hdr[1] = dgram_size & 0xff; @@ -208,7 +208,7 @@ err: static int lowpan_header(struct sk_buff *skb, struct net_device *ldev, u16 *dgram_size, u16 *dgram_offset) { - struct wpan_dev *wpan_dev = lowpan_dev_info(ldev)->wdev->ieee802154_ptr; + struct wpan_dev *wpan_dev = lowpan_802154_dev(ldev)->wdev->ieee802154_ptr; struct ieee802154_addr sa, da; struct ieee802154_mac_cb *cb = mac_cb_init(skb); struct lowpan_addr_info info; @@ -248,8 +248,8 @@ static int lowpan_header(struct sk_buff *skb, struct net_device *ldev, cb->ackreq = wpan_dev->ackreq; } - return wpan_dev_hard_header(skb, lowpan_dev_info(ldev)->wdev, &da, &sa, - 0); + return wpan_dev_hard_header(skb, lowpan_802154_dev(ldev)->wdev, &da, + &sa, 0); } netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *ldev) @@ -283,7 +283,7 @@ netdev_tx_t lowpan_xmit(struct sk_buff *skb, struct net_device *ldev) max_single = ieee802154_max_payload(&wpan_hdr); if (skb_tail_pointer(skb) - skb_network_header(skb) <= max_single) { - skb->dev = lowpan_dev_info(ldev)->wdev; + skb->dev = lowpan_802154_dev(ldev)->wdev; ldev->stats.tx_packets++; ldev->stats.tx_bytes += dgram_size; return dev_queue_xmit(skb); diff --git a/net/ieee802154/nl-mac.c b/net/ieee802154/nl-mac.c index 3503c3895..d3cbb3258 100644 --- a/net/ieee802154/nl-mac.c +++ b/net/ieee802154/nl-mac.c @@ -34,9 +34,11 @@ #include "ieee802154.h" -static int nla_put_hwaddr(struct sk_buff *msg, int type, __le64 hwaddr) +static int nla_put_hwaddr(struct sk_buff *msg, int type, __le64 hwaddr, + int padattr) { - return nla_put_u64(msg, type, swab64((__force u64)hwaddr)); + return nla_put_u64_64bit(msg, type, swab64((__force u64)hwaddr), + padattr); } static __le64 nla_get_hwaddr(const struct nlattr *nla) @@ -623,7 +625,8 @@ ieee802154_llsec_fill_key_id(struct sk_buff *msg, if (desc->device_addr.mode == IEEE802154_ADDR_LONG && nla_put_hwaddr(msg, IEEE802154_ATTR_HW_ADDR, - desc->device_addr.extended_addr)) + desc->device_addr.extended_addr, + IEEE802154_ATTR_PAD)) return -EMSGSIZE; } @@ -638,7 +641,7 @@ ieee802154_llsec_fill_key_id(struct sk_buff *msg, if (desc->mode == IEEE802154_SCF_KEY_HW_INDEX && nla_put_hwaddr(msg, IEEE802154_ATTR_LLSEC_KEY_SOURCE_EXTENDED, - desc->extended_source)) + desc->extended_source, IEEE802154_ATTR_PAD)) return -EMSGSIZE; return 0; @@ -1063,7 +1066,8 @@ ieee802154_nl_fill_dev(struct sk_buff *msg, u32 portid, u32 seq, nla_put_shortaddr(msg, IEEE802154_ATTR_PAN_ID, desc->pan_id) || nla_put_shortaddr(msg, IEEE802154_ATTR_SHORT_ADDR, desc->short_addr) || - nla_put_hwaddr(msg, IEEE802154_ATTR_HW_ADDR, desc->hwaddr) || + nla_put_hwaddr(msg, IEEE802154_ATTR_HW_ADDR, desc->hwaddr, + IEEE802154_ATTR_PAD) || nla_put_u32(msg, IEEE802154_ATTR_LLSEC_FRAME_COUNTER, desc->frame_counter) || nla_put_u8(msg, IEEE802154_ATTR_LLSEC_DEV_OVERRIDE, @@ -1167,7 +1171,8 @@ ieee802154_nl_fill_devkey(struct sk_buff *msg, u32 portid, u32 seq, if (nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name) || nla_put_u32(msg, IEEE802154_ATTR_DEV_INDEX, dev->ifindex) || - nla_put_hwaddr(msg, IEEE802154_ATTR_HW_ADDR, devaddr) || + nla_put_hwaddr(msg, IEEE802154_ATTR_HW_ADDR, devaddr, + IEEE802154_ATTR_PAD) || nla_put_u32(msg, IEEE802154_ATTR_LLSEC_FRAME_COUNTER, devkey->frame_counter) || ieee802154_llsec_fill_key_id(msg, &devkey->key_id)) diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index 16ef0d9f5..116187b5c 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -722,7 +722,8 @@ ieee802154_llsec_send_key_id(struct sk_buff *msg, break; case NL802154_DEV_ADDR_EXTENDED: if (nla_put_le64(msg, NL802154_DEV_ADDR_ATTR_EXTENDED, - desc->device_addr.extended_addr)) + desc->device_addr.extended_addr, + NL802154_DEV_ADDR_ATTR_PAD)) return -ENOBUFS; break; default: @@ -742,7 +743,8 @@ ieee802154_llsec_send_key_id(struct sk_buff *msg, break; case NL802154_KEY_ID_MODE_INDEX_EXTENDED: if (nla_put_le64(msg, NL802154_KEY_ID_ATTR_SOURCE_EXTENDED, - desc->extended_source)) + desc->extended_source, + NL802154_KEY_ID_ATTR_PAD)) return -ENOBUFS; break; default: @@ -811,7 +813,8 @@ nl802154_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flags, if (nla_put_u32(msg, NL802154_ATTR_WPAN_PHY, rdev->wpan_phy_idx) || nla_put_u32(msg, NL802154_ATTR_IFTYPE, wpan_dev->iftype) || - nla_put_u64(msg, NL802154_ATTR_WPAN_DEV, wpan_dev_id(wpan_dev)) || + nla_put_u64_64bit(msg, NL802154_ATTR_WPAN_DEV, + wpan_dev_id(wpan_dev), NL802154_ATTR_PAD) || nla_put_u32(msg, NL802154_ATTR_GENERATION, rdev->devlist_generation ^ (cfg802154_rdev_list_generation << 2))) @@ -819,7 +822,8 @@ nl802154_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flags, /* address settings */ if (nla_put_le64(msg, NL802154_ATTR_EXTENDED_ADDR, - wpan_dev->extended_addr) || + wpan_dev->extended_addr, + NL802154_ATTR_PAD) || nla_put_le16(msg, NL802154_ATTR_SHORT_ADDR, wpan_dev->short_addr) || nla_put_le16(msg, NL802154_ATTR_PAN_ID, wpan_dev->pan_id)) @@ -1074,6 +1078,11 @@ static int nl802154_set_pan_id(struct sk_buff *skb, struct genl_info *info) if (netif_running(dev)) return -EBUSY; + if (wpan_dev->lowpan_dev) { + if (netif_running(wpan_dev->lowpan_dev)) + return -EBUSY; + } + /* don't change address fields on monitor */ if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR || !info->attrs[NL802154_ATTR_PAN_ID]) @@ -1105,6 +1114,11 @@ static int nl802154_set_short_addr(struct sk_buff *skb, struct genl_info *info) if (netif_running(dev)) return -EBUSY; + if (wpan_dev->lowpan_dev) { + if (netif_running(wpan_dev->lowpan_dev)) + return -EBUSY; + } + /* don't change address fields on monitor */ if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR || !info->attrs[NL802154_ATTR_SHORT_ADDR]) @@ -1275,8 +1289,8 @@ ieee802154_llsec_parse_dev_addr(struct nlattr *nla, nl802154_dev_addr_policy)) return -EINVAL; - if (!attrs[NL802154_DEV_ADDR_ATTR_PAN_ID] && - !attrs[NL802154_DEV_ADDR_ATTR_MODE] && + if (!attrs[NL802154_DEV_ADDR_ATTR_PAN_ID] || + !attrs[NL802154_DEV_ADDR_ATTR_MODE] || !(attrs[NL802154_DEV_ADDR_ATTR_SHORT] || attrs[NL802154_DEV_ADDR_ATTR_EXTENDED])) return -EINVAL; @@ -1614,7 +1628,7 @@ static int nl802154_send_device(struct sk_buff *msg, u32 cmd, u32 portid, nla_put_le16(msg, NL802154_DEV_ATTR_SHORT_ADDR, dev_desc->short_addr) || nla_put_le64(msg, NL802154_DEV_ATTR_EXTENDED_ADDR, - dev_desc->hwaddr) || + dev_desc->hwaddr, NL802154_DEV_ATTR_PAD) || nla_put_u8(msg, NL802154_DEV_ATTR_SECLEVEL_EXEMPT, dev_desc->seclevel_exempt) || nla_put_u32(msg, NL802154_DEV_ATTR_KEY_MODE, dev_desc->key_mode)) @@ -1778,7 +1792,7 @@ static int nl802154_send_devkey(struct sk_buff *msg, u32 cmd, u32 portid, goto nla_put_failure; if (nla_put_le64(msg, NL802154_DEVKEY_ATTR_EXTENDED_ADDR, - extended_addr) || + extended_addr, NL802154_DEVKEY_ATTR_PAD) || nla_put_u32(msg, NL802154_DEVKEY_ATTR_FRAME_COUNTER, devkey->frame_counter)) goto nla_put_failure; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 7ad0e567c..d39e9e47a 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -948,6 +948,7 @@ const struct proto_ops inet_dgram_ops = { .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, .sendpage = inet_sendpage, + .set_peek_off = sk_set_peek_off, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, @@ -1106,7 +1107,7 @@ static int inet_sk_reselect_saddr(struct sock *sk) struct ip_options_rcu *inet_opt; inet_opt = rcu_dereference_protected(inet->inet_opt, - sock_owned_by_user(sk)); + lockdep_sock_is_held(sk)); if (inet_opt && inet_opt->opt.srr) daddr = inet_opt->opt.faddr; @@ -1191,35 +1192,19 @@ int inet_sk_rebuild_header(struct sock *sk) } EXPORT_SYMBOL(inet_sk_rebuild_header); -static struct sk_buff *inet_gso_segment(struct sk_buff *skb, - netdev_features_t features) +struct sk_buff *inet_gso_segment(struct sk_buff *skb, + netdev_features_t features) { + bool udpfrag = false, fixedid = false, encap; struct sk_buff *segs = ERR_PTR(-EINVAL); const struct net_offload *ops; unsigned int offset = 0; - bool udpfrag, encap; struct iphdr *iph; - int proto; + int proto, tot_len; int nhoff; int ihl; int id; - if (unlikely(skb_shinfo(skb)->gso_type & - ~(SKB_GSO_TCPV4 | - SKB_GSO_UDP | - SKB_GSO_DODGY | - SKB_GSO_TCP_ECN | - SKB_GSO_GRE | - SKB_GSO_GRE_CSUM | - SKB_GSO_IPIP | - SKB_GSO_SIT | - SKB_GSO_TCPV6 | - SKB_GSO_UDP_TUNNEL | - SKB_GSO_UDP_TUNNEL_CSUM | - SKB_GSO_TUNNEL_REMCSUM | - 0))) - goto out; - skb_reset_network_header(skb); nhoff = skb_network_header(skb) - skb_mac_header(skb); if (unlikely(!pskb_may_pull(skb, sizeof(*iph)))) @@ -1247,11 +1232,14 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, segs = ERR_PTR(-EPROTONOSUPPORT); - if (skb->encapsulation && - skb_shinfo(skb)->gso_type & (SKB_GSO_SIT|SKB_GSO_IPIP)) - udpfrag = proto == IPPROTO_UDP && encap; - else - udpfrag = proto == IPPROTO_UDP && !skb->encapsulation; + if (!skb->encapsulation || encap) { + udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); + fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID); + + /* fixed ID is invalid if DF bit is not set */ + if (fixedid && !(iph->frag_off & htons(IP_DF))) + goto out; + } ops = rcu_dereference(inet_offloads[proto]); if (likely(ops && ops->callbacks.gso_segment)) @@ -1264,15 +1252,25 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, do { iph = (struct iphdr *)(skb_mac_header(skb) + nhoff); if (udpfrag) { - iph->id = htons(id); iph->frag_off = htons(offset >> 3); if (skb->next) iph->frag_off |= htons(IP_MF); offset += skb->len - nhoff - ihl; + tot_len = skb->len - nhoff; + } else if (skb_is_gso(skb)) { + if (!fixedid) { + iph->id = htons(id); + id += skb_shinfo(skb)->gso_segs; + } + tot_len = skb_shinfo(skb)->gso_size + + SKB_GSO_CB(skb)->data_offset + + skb->head - (unsigned char *)iph; } else { - iph->id = htons(id++); + if (!fixedid) + iph->id = htons(id++); + tot_len = skb->len - nhoff; } - iph->tot_len = htons(skb->len - nhoff); + iph->tot_len = htons(tot_len); ip_send_check(iph); if (encap) skb_reset_inner_headers(skb); @@ -1282,9 +1280,9 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, out: return segs; } +EXPORT_SYMBOL(inet_gso_segment); -static struct sk_buff **inet_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff *skb) { const struct net_offload *ops; struct sk_buff **pp = NULL; @@ -1324,6 +1322,7 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, for (p = *head; p; p = p->next) { struct iphdr *iph2; + u16 flush_id; if (!NAPI_GRO_CB(p)->same_flow) continue; @@ -1347,16 +1346,36 @@ static struct sk_buff **inet_gro_receive(struct sk_buff **head, (iph->tos ^ iph2->tos) | ((iph->frag_off ^ iph2->frag_off) & htons(IP_DF)); - /* Save the IP ID check to be included later when we get to - * the transport layer so only the inner most IP ID is checked. - * This is because some GSO/TSO implementations do not - * correctly increment the IP ID for the outer hdrs. - */ - NAPI_GRO_CB(p)->flush_id = - ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id); NAPI_GRO_CB(p)->flush |= flush; + + /* We need to store of the IP ID check to be included later + * when we can verify that this packet does in fact belong + * to a given flow. + */ + flush_id = (u16)(id - ntohs(iph2->id)); + + /* This bit of code makes it much easier for us to identify + * the cases where we are doing atomic vs non-atomic IP ID + * checks. Specifically an atomic check can return IP ID + * values 0 - 0xFFFF, while a non-atomic check can only + * return 0 or 0xFFFF. + */ + if (!NAPI_GRO_CB(p)->is_atomic || + !(iph->frag_off & htons(IP_DF))) { + flush_id ^= NAPI_GRO_CB(p)->count; + flush_id = flush_id ? 0xFFFF : 0; + } + + /* If the previous IP ID value was based on an atomic + * datagram we can overwrite the value and ignore it. + */ + if (NAPI_GRO_CB(skb)->is_atomic) + NAPI_GRO_CB(p)->flush_id = flush_id; + else + NAPI_GRO_CB(p)->flush_id |= flush_id; } + NAPI_GRO_CB(skb)->is_atomic = !!(iph->frag_off & htons(IP_DF)); NAPI_GRO_CB(skb)->flush |= flush; skb_set_network_header(skb, off); /* The above will be needed by the transport layer if there is one @@ -1379,6 +1398,7 @@ out: return pp; } +EXPORT_SYMBOL(inet_gro_receive); static struct sk_buff **ipip_gro_receive(struct sk_buff **head, struct sk_buff *skb) @@ -1430,7 +1450,7 @@ int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) return -EINVAL; } -static int inet_gro_complete(struct sk_buff *skb, int nhoff) +int inet_gro_complete(struct sk_buff *skb, int nhoff) { __be16 newlen = htons(skb->len - nhoff); struct iphdr *iph = (struct iphdr *)(skb->data + nhoff); @@ -1460,11 +1480,12 @@ out_unlock: return err; } +EXPORT_SYMBOL(inet_gro_complete); static int ipip_gro_complete(struct sk_buff *skb, int nhoff) { skb->encapsulation = 1; - skb_shinfo(skb)->gso_type |= SKB_GSO_IPIP; + skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP4; return inet_gro_complete(skb, nhoff); } diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index c34c7544d..89a8cac47 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -436,7 +436,7 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) if (IS_ERR(rt)) return 1; if (rt->dst.dev != dev) { - NET_INC_STATS_BH(net, LINUX_MIB_ARPFILTER); + __NET_INC_STATS(net, LINUX_MIB_ARPFILTER); flag = 1; } ip_rt_put(rt); diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index bdb2a07ec..40d6b8771 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -1933,7 +1933,8 @@ int cipso_v4_sock_setattr(struct sock *sk, sk_inet = inet_sk(sk); - old = rcu_dereference_protected(sk_inet->inet_opt, sock_owned_by_user(sk)); + old = rcu_dereference_protected(sk_inet->inet_opt, + lockdep_sock_is_held(sk)); if (sk_inet->is_icsk) { sk_conn = inet_csk(sk); if (old) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 63566ec54..ef2ebeb89 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -110,6 +110,7 @@ struct fib_table *fib_new_table(struct net *net, u32 id) hlist_add_head_rcu(&tb->tb_hlist, &net->ipv4.fib_table_hash[h]); return tb; } +EXPORT_SYMBOL_GPL(fib_new_table); /* caller must hold either rtnl or rcu read lock */ struct fib_table *fib_get_table(struct net *net, u32 id) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 2b68418c7..539fa264e 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -479,6 +479,9 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, if (!rtnh_ok(rtnh, remaining)) return -EINVAL; + if (rtnh->rtnh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) + return -EINVAL; + nexthop_nh->nh_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags; nexthop_nh->nh_oif = rtnh->rtnh_ifindex; @@ -1003,6 +1006,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg) if (fib_props[cfg->fc_type].scope > cfg->fc_scope) goto err_inval; + if (cfg->fc_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) + goto err_inval; + #ifdef CONFIG_IP_ROUTE_MULTIPATH if (cfg->fc_mp) { nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len); @@ -1561,21 +1567,45 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags) } #ifdef CONFIG_IP_ROUTE_MULTIPATH +static bool fib_good_nh(const struct fib_nh *nh) +{ + int state = NUD_REACHABLE; + + if (nh->nh_scope == RT_SCOPE_LINK) { + struct neighbour *n; + + rcu_read_lock_bh(); + + n = __ipv4_neigh_lookup_noref(nh->nh_dev, nh->nh_gw); + if (n) + state = n->nud_state; + + rcu_read_unlock_bh(); + } + + return !!(state & NUD_VALID); +} void fib_select_multipath(struct fib_result *res, int hash) { struct fib_info *fi = res->fi; + struct net *net = fi->fib_net; + bool first = false; for_nexthops(fi) { if (hash > atomic_read(&nh->nh_upper_bound)) continue; - res->nh_sel = nhsel; - return; + if (!net->ipv4.sysctl_fib_multipath_use_neigh || + fib_good_nh(nh)) { + res->nh_sel = nhsel; + return; + } + if (!first) { + res->nh_sel = nhsel; + first = true; + } } endfor_nexthops(fi); - - /* Race condition: route has just become dead. */ - res->nh_sel = 0; } #endif diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index a6962ccad..5f9207c03 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -21,8 +21,8 @@ struct fou { u8 protocol; u8 flags; __be16 port; + u8 family; u16 type; - struct udp_offload udp_offloads; struct list_head list; struct rcu_head rcu; }; @@ -48,14 +48,17 @@ static inline struct fou *fou_from_sock(struct sock *sk) return sk->sk_user_data; } -static int fou_recv_pull(struct sk_buff *skb, size_t len) +static int fou_recv_pull(struct sk_buff *skb, struct fou *fou, size_t len) { - struct iphdr *iph = ip_hdr(skb); - /* Remove 'len' bytes from the packet (UDP header and * FOU header if present). */ - iph->tot_len = htons(ntohs(iph->tot_len) - len); + if (fou->family == AF_INET) + ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(skb)->tot_len) - len); + else + ipv6_hdr(skb)->payload_len = + htons(ntohs(ipv6_hdr(skb)->payload_len) - len); + __skb_pull(skb, len); skb_postpull_rcsum(skb, udp_hdr(skb), len); skb_reset_transport_header(skb); @@ -69,7 +72,7 @@ static int fou_udp_recv(struct sock *sk, struct sk_buff *skb) if (!fou) return 1; - if (fou_recv_pull(skb, sizeof(struct udphdr))) + if (fou_recv_pull(skb, fou, sizeof(struct udphdr))) goto drop; return -fou->protocol; @@ -142,7 +145,11 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb) hdrlen = sizeof(struct guehdr) + optlen; - ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(skb)->tot_len) - len); + if (fou->family == AF_INET) + ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(skb)->tot_len) - len); + else + ipv6_hdr(skb)->payload_len = + htons(ntohs(ipv6_hdr(skb)->payload_len) - len); /* Pull csum through the guehdr now . This can be used if * there is a remote checksum offload. @@ -186,13 +193,13 @@ drop: return 0; } -static struct sk_buff **fou_gro_receive(struct sk_buff **head, - struct sk_buff *skb, - struct udp_offload *uoff) +static struct sk_buff **fou_gro_receive(struct sock *sk, + struct sk_buff **head, + struct sk_buff *skb) { const struct net_offload *ops; struct sk_buff **pp = NULL; - u8 proto = NAPI_GRO_CB(skb)->proto; + u8 proto = fou_from_sock(sk)->protocol; const struct net_offload **offloads; /* We can clear the encap_mark for FOU as we are essentially doing @@ -220,11 +227,11 @@ out_unlock: return pp; } -static int fou_gro_complete(struct sk_buff *skb, int nhoff, - struct udp_offload *uoff) +static int fou_gro_complete(struct sock *sk, struct sk_buff *skb, + int nhoff) { const struct net_offload *ops; - u8 proto = NAPI_GRO_CB(skb)->proto; + u8 proto = fou_from_sock(sk)->protocol; int err = -ENOSYS; const struct net_offload **offloads; @@ -267,9 +274,9 @@ static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off, return guehdr; } -static struct sk_buff **gue_gro_receive(struct sk_buff **head, - struct sk_buff *skb, - struct udp_offload *uoff) +static struct sk_buff **gue_gro_receive(struct sock *sk, + struct sk_buff **head, + struct sk_buff *skb) { const struct net_offload **offloads; const struct net_offload *ops; @@ -280,7 +287,7 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head, void *data; u16 doffset = 0; int flush = 1; - struct fou *fou = container_of(uoff, struct fou, udp_offloads); + struct fou *fou = fou_from_sock(sk); struct gro_remcsum grc; skb_gro_remcsum_init(&grc); @@ -392,8 +399,7 @@ out: return pp; } -static int gue_gro_complete(struct sk_buff *skb, int nhoff, - struct udp_offload *uoff) +static int gue_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) { const struct net_offload **offloads; struct guehdr *guehdr = (struct guehdr *)(skb->data + nhoff); @@ -428,7 +434,8 @@ static int fou_add_to_port_list(struct net *net, struct fou *fou) mutex_lock(&fn->fou_lock); list_for_each_entry(fout, &fn->fou_list, list) { - if (fou->port == fout->port) { + if (fou->port == fout->port && + fou->family == fout->family) { mutex_unlock(&fn->fou_lock); return -EALREADY; } @@ -443,44 +450,20 @@ static int fou_add_to_port_list(struct net *net, struct fou *fou) static void fou_release(struct fou *fou) { struct socket *sock = fou->sock; - struct sock *sk = sock->sk; - if (sk->sk_family == AF_INET) - udp_del_offload(&fou->udp_offloads); list_del(&fou->list); udp_tunnel_sock_release(sock); kfree_rcu(fou, rcu); } -static int fou_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg) -{ - udp_sk(sk)->encap_rcv = fou_udp_recv; - fou->protocol = cfg->protocol; - fou->udp_offloads.callbacks.gro_receive = fou_gro_receive; - fou->udp_offloads.callbacks.gro_complete = fou_gro_complete; - fou->udp_offloads.port = cfg->udp_config.local_udp_port; - fou->udp_offloads.ipproto = cfg->protocol; - - return 0; -} - -static int gue_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg) -{ - udp_sk(sk)->encap_rcv = gue_udp_recv; - fou->udp_offloads.callbacks.gro_receive = gue_gro_receive; - fou->udp_offloads.callbacks.gro_complete = gue_gro_complete; - fou->udp_offloads.port = cfg->udp_config.local_udp_port; - - return 0; -} - static int fou_create(struct net *net, struct fou_cfg *cfg, struct socket **sockp) { struct socket *sock = NULL; struct fou *fou = NULL; struct sock *sk; + struct udp_tunnel_sock_cfg tunnel_cfg; int err; /* Open UDP socket */ @@ -497,44 +480,39 @@ static int fou_create(struct net *net, struct fou_cfg *cfg, sk = sock->sk; - fou->flags = cfg->flags; fou->port = cfg->udp_config.local_udp_port; + fou->family = cfg->udp_config.family; + fou->flags = cfg->flags; + fou->type = cfg->type; + fou->sock = sock; + + memset(&tunnel_cfg, 0, sizeof(tunnel_cfg)); + tunnel_cfg.encap_type = 1; + tunnel_cfg.sk_user_data = fou; + tunnel_cfg.encap_destroy = NULL; /* Initial for fou type */ switch (cfg->type) { case FOU_ENCAP_DIRECT: - err = fou_encap_init(sk, fou, cfg); - if (err) - goto error; + tunnel_cfg.encap_rcv = fou_udp_recv; + tunnel_cfg.gro_receive = fou_gro_receive; + tunnel_cfg.gro_complete = fou_gro_complete; + fou->protocol = cfg->protocol; break; case FOU_ENCAP_GUE: - err = gue_encap_init(sk, fou, cfg); - if (err) - goto error; + tunnel_cfg.encap_rcv = gue_udp_recv; + tunnel_cfg.gro_receive = gue_gro_receive; + tunnel_cfg.gro_complete = gue_gro_complete; break; default: err = -EINVAL; goto error; } - fou->type = cfg->type; - - udp_sk(sk)->encap_type = 1; - udp_encap_enable(); - - sk->sk_user_data = fou; - fou->sock = sock; - - inet_inc_convert_csum(sk); + setup_udp_tunnel_sock(net, sock, &tunnel_cfg); sk->sk_allocation = GFP_ATOMIC; - if (cfg->udp_config.family == AF_INET) { - err = udp_add_offload(net, &fou->udp_offloads); - if (err) - goto error; - } - err = fou_add_to_port_list(net, fou); if (err) goto error; @@ -556,12 +534,13 @@ static int fou_destroy(struct net *net, struct fou_cfg *cfg) { struct fou_net *fn = net_generic(net, fou_net_id); __be16 port = cfg->udp_config.local_udp_port; + u8 family = cfg->udp_config.family; int err = -EINVAL; struct fou *fou; mutex_lock(&fn->fou_lock); list_for_each_entry(fou, &fn->fou_list, list) { - if (fou->port == port) { + if (fou->port == port && fou->family == family) { fou_release(fou); err = 0; break; @@ -599,8 +578,15 @@ static int parse_nl_config(struct genl_info *info, if (info->attrs[FOU_ATTR_AF]) { u8 family = nla_get_u8(info->attrs[FOU_ATTR_AF]); - if (family != AF_INET) - return -EINVAL; + switch (family) { + case AF_INET: + break; + case AF_INET6: + cfg->udp_config.ipv6_v6only = 1; + break; + default: + return -EAFNOSUPPORT; + } cfg->udp_config.family = family; } @@ -691,6 +677,7 @@ static int fou_nl_cmd_get_port(struct sk_buff *skb, struct genl_info *info) struct fou_cfg cfg; struct fou *fout; __be16 port; + u8 family; int ret; ret = parse_nl_config(info, &cfg); @@ -700,6 +687,10 @@ static int fou_nl_cmd_get_port(struct sk_buff *skb, struct genl_info *info) if (port == 0) return -EINVAL; + family = cfg.udp_config.family; + if (family != AF_INET && family != AF_INET6) + return -EINVAL; + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; @@ -707,7 +698,7 @@ static int fou_nl_cmd_get_port(struct sk_buff *skb, struct genl_info *info) ret = -ESRCH; mutex_lock(&fn->fou_lock); list_for_each_entry(fout, &fn->fou_list, list) { - if (port == fout->port) { + if (port == fout->port && family == fout->family) { ret = fou_dump_info(fout, info->snd_portid, info->snd_seq, 0, msg, info->genlhdr->cmd); @@ -812,36 +803,48 @@ static void fou_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e, *protocol = IPPROTO_UDP; } +int __fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, + u8 *protocol, __be16 *sport, int type) +{ + int err; + + err = iptunnel_handle_offloads(skb, type); + if (err) + return err; + + *sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev), + skb, 0, 0, false); + + return 0; +} +EXPORT_SYMBOL(__fou_build_header); + int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, struct flowi4 *fl4) { int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; __be16 sport; + int err; - skb = iptunnel_handle_offloads(skb, type); - - if (IS_ERR(skb)) - return PTR_ERR(skb); + err = __fou_build_header(skb, e, protocol, &sport, type); + if (err) + return err; - sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev), - skb, 0, 0, false); fou_build_udp(skb, e, fl4, protocol, sport); return 0; } EXPORT_SYMBOL(fou_build_header); -int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, - u8 *protocol, struct flowi4 *fl4) +int __gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, + u8 *protocol, __be16 *sport, int type) { - int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? SKB_GSO_UDP_TUNNEL_CSUM : - SKB_GSO_UDP_TUNNEL; struct guehdr *guehdr; size_t hdrlen, optlen = 0; - __be16 sport; void *data; bool need_priv = false; + int err; if ((e->flags & TUNNEL_ENCAP_FLAG_REMCSUM) && skb->ip_summed == CHECKSUM_PARTIAL) { @@ -852,14 +855,13 @@ int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, optlen += need_priv ? GUE_LEN_PRIV : 0; - skb = iptunnel_handle_offloads(skb, type); - - if (IS_ERR(skb)) - return PTR_ERR(skb); + err = iptunnel_handle_offloads(skb, type); + if (err) + return err; /* Get source port (based on flow hash) before skb_push */ - sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev), - skb, 0, 0, false); + *sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev), + skb, 0, 0, false); hdrlen = sizeof(struct guehdr) + optlen; @@ -904,6 +906,22 @@ int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, } + return 0; +} +EXPORT_SYMBOL(__gue_build_header); + +int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, + u8 *protocol, struct flowi4 *fl4) +{ + int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? SKB_GSO_UDP_TUNNEL_CSUM : + SKB_GSO_UDP_TUNNEL; + __be16 sport; + int err; + + err = __gue_build_header(skb, e, protocol, &sport, type); + if (err) + return err; + fou_build_udp(skb, e, fl4, protocol, sport); return 0; diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index d9c552a72..de1d119a4 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -60,6 +60,67 @@ int gre_del_protocol(const struct gre_protocol *proto, u8 version) } EXPORT_SYMBOL_GPL(gre_del_protocol); +/* Fills in tpi and returns header length to be pulled. */ +int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, + bool *csum_err, __be16 proto, int nhs) +{ + const struct gre_base_hdr *greh; + __be32 *options; + int hdr_len; + + if (unlikely(!pskb_may_pull(skb, nhs + sizeof(struct gre_base_hdr)))) + return -EINVAL; + + greh = (struct gre_base_hdr *)(skb->data + nhs); + if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING))) + return -EINVAL; + + tpi->flags = gre_flags_to_tnl_flags(greh->flags); + hdr_len = gre_calc_hlen(tpi->flags); + + if (!pskb_may_pull(skb, nhs + hdr_len)) + return -EINVAL; + + greh = (struct gre_base_hdr *)(skb->data + nhs); + tpi->proto = greh->protocol; + + options = (__be32 *)(greh + 1); + if (greh->flags & GRE_CSUM) { + if (skb_checksum_simple_validate(skb)) { + *csum_err = true; + return -EINVAL; + } + + skb_checksum_try_convert(skb, IPPROTO_GRE, 0, + null_compute_pseudo); + options++; + } + + if (greh->flags & GRE_KEY) { + tpi->key = *options; + options++; + } else { + tpi->key = 0; + } + if (unlikely(greh->flags & GRE_SEQ)) { + tpi->seq = *options; + options++; + } else { + tpi->seq = 0; + } + /* WCCP version 1 and 2 protocol decoding. + * - Change protocol to IPv4/IPv6 + * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header + */ + if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) { + tpi->proto = proto; + if ((*(u8 *)options & 0xF0) != 0x40) + hdr_len += 4; + } + return hdr_len; +} +EXPORT_SYMBOL(gre_parse_header); + static int gre_rcv(struct sk_buff *skb) { const struct gre_protocol *proto; diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 6a5bd4317..ecd1e09db 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -26,18 +26,6 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, int gre_offset, outer_hlen; bool need_csum, ufo; - if (unlikely(skb_shinfo(skb)->gso_type & - ~(SKB_GSO_TCPV4 | - SKB_GSO_TCPV6 | - SKB_GSO_UDP | - SKB_GSO_DODGY | - SKB_GSO_TCP_ECN | - SKB_GSO_GRE | - SKB_GSO_GRE_CSUM | - SKB_GSO_IPIP | - SKB_GSO_SIT))) - goto out; - if (!skb->encapsulation) goto out; @@ -86,7 +74,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, skb = segs; do { struct gre_base_hdr *greh; - __be32 *pcsum; + __sum16 *pcsum; /* Set up inner headers if we are offloading inner checksum */ if (skb->ip_summed == CHECKSUM_PARTIAL) { @@ -106,10 +94,25 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, continue; greh = (struct gre_base_hdr *)skb_transport_header(skb); - pcsum = (__be32 *)(greh + 1); + pcsum = (__sum16 *)(greh + 1); + + if (skb_is_gso(skb)) { + unsigned int partial_adj; + + /* Adjust checksum to account for the fact that + * the partial checksum is based on actual size + * whereas headers should be based on MSS size. + */ + partial_adj = skb->len + skb_headroom(skb) - + SKB_GSO_CB(skb)->data_offset - + skb_shinfo(skb)->gso_size; + *pcsum = ~csum_fold((__force __wsum)htonl(partial_adj)); + } else { + *pcsum = 0; + } - *pcsum = 0; - *(__sum16 *)pcsum = gso_make_checksum(skb, 0); + *(pcsum + 1) = 0; + *pcsum = gso_make_checksum(skb, 0); } while ((skb = skb->next)); out: return segs; @@ -275,6 +278,18 @@ static const struct net_offload gre_offload = { static int __init gre_offload_init(void) { - return inet_add_offload(&gre_offload, IPPROTO_GRE); + int err; + + err = inet_add_offload(&gre_offload, IPPROTO_GRE); +#if IS_ENABLED(CONFIG_IPV6) + if (err) + return err; + + err = inet6_add_offload(&gre_offload, IPPROTO_GRE); + if (err) + inet_del_offload(&gre_offload, IPPROTO_GRE); +#endif + + return err; } device_initcall(gre_offload_init); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 633348977..38abe70e5 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -363,7 +363,7 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param, icmp_param->data_len+icmp_param->head_len, icmp_param->head_len, ipc, rt, MSG_DONTWAIT) < 0) { - ICMP_INC_STATS_BH(sock_net(sk), ICMP_MIB_OUTERRORS); + __ICMP_INC_STATS(sock_net(sk), ICMP_MIB_OUTERRORS); ip_flush_pending_frames(sk); } else if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { struct icmphdr *icmph = icmp_hdr(skb); @@ -744,7 +744,7 @@ static void icmp_socket_deliver(struct sk_buff *skb, u32 info) * avoid additional coding at protocol handlers. */ if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) { - ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS); + __ICMP_INC_STATS(dev_net(skb->dev), ICMP_MIB_INERRORS); return; } @@ -865,7 +865,7 @@ static bool icmp_unreach(struct sk_buff *skb) out: return true; out_err: - ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); + __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return false; } @@ -877,7 +877,7 @@ out_err: static bool icmp_redirect(struct sk_buff *skb) { if (skb->len < sizeof(struct iphdr)) { - ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS); + __ICMP_INC_STATS(dev_net(skb->dev), ICMP_MIB_INERRORS); return false; } @@ -956,7 +956,7 @@ static bool icmp_timestamp(struct sk_buff *skb) return true; out_err: - ICMP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), ICMP_MIB_INERRORS); + __ICMP_INC_STATS(dev_net(skb_dst(skb)->dev), ICMP_MIB_INERRORS); return false; } @@ -996,7 +996,7 @@ int icmp_rcv(struct sk_buff *skb) skb_set_network_header(skb, nh); } - ICMP_INC_STATS_BH(net, ICMP_MIB_INMSGS); + __ICMP_INC_STATS(net, ICMP_MIB_INMSGS); if (skb_checksum_simple_validate(skb)) goto csum_error; @@ -1006,7 +1006,7 @@ int icmp_rcv(struct sk_buff *skb) icmph = icmp_hdr(skb); - ICMPMSGIN_INC_STATS_BH(net, icmph->type); + ICMPMSGIN_INC_STATS(net, icmph->type); /* * 18 is the highest 'known' ICMP type. Anything else is a mystery * @@ -1052,9 +1052,9 @@ drop: kfree_skb(skb); return 0; csum_error: - ICMP_INC_STATS_BH(net, ICMP_MIB_CSUMERRORS); + __ICMP_INC_STATS(net, ICMP_MIB_CSUMERRORS); error: - ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); + __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); goto drop; } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index bc5196ea1..fa8c39804 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -427,7 +427,7 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk, route_err: ip_rt_put(rt); no_route: - IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); + __IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); return NULL; } EXPORT_SYMBOL_GPL(inet_csk_route_req); @@ -466,7 +466,7 @@ route_err: ip_rt_put(rt); no_route: rcu_read_unlock(); - IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); + __IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); return NULL; } EXPORT_SYMBOL_GPL(inet_csk_route_child_sock); @@ -661,6 +661,9 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num); newsk->sk_write_space = sk_stream_write_space; + /* listeners have SOCK_RCU_FREE, not the children */ + sock_reset_flag(newsk, SOCK_RCU_FREE); + newsk->sk_mark = inet_rsk(req)->ir_mark; atomic64_set(&newsk->sk_cookie, atomic64_read(&inet_rsk(req)->ir_cookie)); @@ -703,7 +706,9 @@ void inet_csk_destroy_sock(struct sock *sk) sk_refcnt_debug_release(sk); + local_bh_disable(); percpu_counter_dec(sk->sk_prot->orphan_count); + local_bh_enable(); sock_put(sk); } EXPORT_SYMBOL(inet_csk_destroy_sock); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 5fdb02f55..25af12436 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -66,7 +66,7 @@ static void inet_diag_unlock_handler(const struct inet_diag_handler *handler) mutex_unlock(&inet_diag_table_mutex); } -static void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk) +void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk) { r->idiag_family = sk->sk_family; @@ -89,6 +89,7 @@ static void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk) r->id.idiag_dst[0] = sk->sk_daddr; } } +EXPORT_SYMBOL_GPL(inet_diag_msg_common_fill); static size_t inet_sk_attr_size(void) { @@ -104,13 +105,50 @@ static size_t inet_sk_attr_size(void) + 64; } +int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, + struct inet_diag_msg *r, int ext, + struct user_namespace *user_ns) +{ + const struct inet_sock *inet = inet_sk(sk); + + if (nla_put_u8(skb, INET_DIAG_SHUTDOWN, sk->sk_shutdown)) + goto errout; + + /* IPv6 dual-stack sockets use inet->tos for IPv4 connections, + * hence this needs to be included regardless of socket family. + */ + if (ext & (1 << (INET_DIAG_TOS - 1))) + if (nla_put_u8(skb, INET_DIAG_TOS, inet->tos) < 0) + goto errout; + +#if IS_ENABLED(CONFIG_IPV6) + if (r->idiag_family == AF_INET6) { + if (ext & (1 << (INET_DIAG_TCLASS - 1))) + if (nla_put_u8(skb, INET_DIAG_TCLASS, + inet6_sk(sk)->tclass) < 0) + goto errout; + + if (((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) && + nla_put_u8(skb, INET_DIAG_SKV6ONLY, ipv6_only_sock(sk))) + goto errout; + } +#endif + + r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); + r->idiag_inode = sock_i_ino(sk); + + return 0; +errout: + return 1; +} +EXPORT_SYMBOL_GPL(inet_diag_msg_attrs_fill); + int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, struct sk_buff *skb, const struct inet_diag_req_v2 *req, struct user_namespace *user_ns, u32 portid, u32 seq, u16 nlmsg_flags, const struct nlmsghdr *unlh) { - const struct inet_sock *inet = inet_sk(sk); const struct tcp_congestion_ops *ca_ops; const struct inet_diag_handler *handler; int ext = req->idiag_ext; @@ -135,32 +173,9 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, r->idiag_timer = 0; r->idiag_retrans = 0; - if (nla_put_u8(skb, INET_DIAG_SHUTDOWN, sk->sk_shutdown)) + if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns)) goto errout; - /* IPv6 dual-stack sockets use inet->tos for IPv4 connections, - * hence this needs to be included regardless of socket family. - */ - if (ext & (1 << (INET_DIAG_TOS - 1))) - if (nla_put_u8(skb, INET_DIAG_TOS, inet->tos) < 0) - goto errout; - -#if IS_ENABLED(CONFIG_IPV6) - if (r->idiag_family == AF_INET6) { - if (ext & (1 << (INET_DIAG_TCLASS - 1))) - if (nla_put_u8(skb, INET_DIAG_TCLASS, - inet6_sk(sk)->tclass) < 0) - goto errout; - - if (((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) && - nla_put_u8(skb, INET_DIAG_SKV6ONLY, ipv6_only_sock(sk))) - goto errout; - } -#endif - - r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); - r->idiag_inode = sock_i_ino(sk); - if (ext & (1 << (INET_DIAG_MEMINFO - 1))) { struct inet_diag_meminfo minfo = { .idiag_rmem = sk_rmem_alloc_get(sk), @@ -182,31 +197,32 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, goto out; } -#define EXPIRES_IN_MS(tmo) DIV_ROUND_UP((tmo - jiffies) * 1000, HZ) - if (icsk->icsk_pending == ICSK_TIME_RETRANS || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { r->idiag_timer = 1; r->idiag_retrans = icsk->icsk_retransmits; - r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout); + r->idiag_expires = + jiffies_to_msecs(icsk->icsk_timeout - jiffies); } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { r->idiag_timer = 4; r->idiag_retrans = icsk->icsk_probes_out; - r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout); + r->idiag_expires = + jiffies_to_msecs(icsk->icsk_timeout - jiffies); } else if (timer_pending(&sk->sk_timer)) { r->idiag_timer = 2; r->idiag_retrans = icsk->icsk_probes_out; - r->idiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires); + r->idiag_expires = + jiffies_to_msecs(sk->sk_timer.expires - jiffies); } else { r->idiag_timer = 0; r->idiag_expires = 0; } -#undef EXPIRES_IN_MS if ((ext & (1 << (INET_DIAG_INFO - 1))) && handler->idiag_info_size) { - attr = nla_reserve(skb, INET_DIAG_INFO, - handler->idiag_info_size); + attr = nla_reserve_64bit(skb, INET_DIAG_INFO, + handler->idiag_info_size, + INET_DIAG_PAD); if (!attr) goto errout; @@ -356,6 +372,7 @@ struct sock *inet_diag_find_one_icsk(struct net *net, { struct sock *sk; + rcu_read_lock(); if (req->sdiag_family == AF_INET) sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[0], req->id.idiag_dport, req->id.idiag_src[0], @@ -376,9 +393,11 @@ struct sock *inet_diag_find_one_icsk(struct net *net, req->id.idiag_if); } #endif - else + else { + rcu_read_unlock(); return ERR_PTR(-EINVAL); - + } + rcu_read_unlock(); if (!sk) return ERR_PTR(-ENOENT); @@ -772,13 +791,12 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, for (i = s_i; i < INET_LHTABLE_SIZE; i++) { struct inet_listen_hashbucket *ilb; - struct hlist_nulls_node *node; struct sock *sk; num = 0; ilb = &hashinfo->listening_hash[i]; spin_lock_bh(&ilb->lock); - sk_nulls_for_each(sk, node, &ilb->head) { + sk_for_each(sk, &ilb->head) { struct inet_sock *inet = inet_sk(sk); if (!net_eq(sock_net(sk), net)) @@ -1061,7 +1079,9 @@ int inet_diag_handler_get_info(struct sk_buff *skb, struct sock *sk) } attr = handler->idiag_info_size - ? nla_reserve(skb, INET_DIAG_INFO, handler->idiag_info_size) + ? nla_reserve_64bit(skb, INET_DIAG_INFO, + handler->idiag_info_size, + INET_DIAG_PAD) : NULL; if (attr) info = nla_data(attr); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 0d9e9d7bb..77c20a489 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -198,13 +198,13 @@ static inline int compute_score(struct sock *sk, struct net *net, } /* - * Don't inline this cruft. Here are some nice properties to exploit here. The - * BSD API does not allow a listening sock to specify the remote port nor the + * Here are some nice properties to exploit here. The BSD API + * does not allow a listening sock to specify the remote port nor the * remote address for the connection. So always assume those are both * wildcarded during the search since they can never be otherwise. */ - +/* called with rcu_read_lock() : No refcount taken on the socket */ struct sock *__inet_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, @@ -212,38 +212,27 @@ struct sock *__inet_lookup_listener(struct net *net, const __be32 daddr, const unsigned short hnum, const int dif) { - struct sock *sk, *result; - struct hlist_nulls_node *node; unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; - int score, hiscore, matches = 0, reuseport = 0; - bool select_ok = true; + int score, hiscore = 0, matches = 0, reuseport = 0; + struct sock *sk, *result = NULL; u32 phash = 0; - rcu_read_lock(); -begin: - result = NULL; - hiscore = 0; - sk_nulls_for_each_rcu(sk, node, &ilb->head) { + sk_for_each_rcu(sk, &ilb->head) { score = compute_score(sk, net, hnum, daddr, dif); if (score > hiscore) { - result = sk; - hiscore = score; reuseport = sk->sk_reuseport; if (reuseport) { phash = inet_ehashfn(net, daddr, hnum, saddr, sport); - if (select_ok) { - struct sock *sk2; - sk2 = reuseport_select_sock(sk, phash, - skb, doff); - if (sk2) { - result = sk2; - goto found; - } - } + result = reuseport_select_sock(sk, phash, + skb, doff); + if (result) + return result; matches = 1; } + result = sk; + hiscore = score; } else if (score == hiscore && reuseport) { matches++; if (reciprocal_scale(phash, matches) == 0) @@ -251,25 +240,6 @@ begin: phash = next_pseudo_random32(phash); } } - /* - * if the nulls value we got at the end of this lookup is - * not the expected one, we must restart lookup. - * We probably met an item that was moved to another chain. - */ - if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE) - goto begin; - if (result) { -found: - if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) - result = NULL; - else if (unlikely(compute_score(result, net, hnum, daddr, - dif) < hiscore)) { - sock_put(result); - select_ok = false; - goto begin; - } - } - rcu_read_unlock(); return result; } EXPORT_SYMBOL_GPL(__inet_lookup_listener); @@ -312,7 +282,6 @@ struct sock *__inet_lookup_established(struct net *net, unsigned int slot = hash & hashinfo->ehash_mask; struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; - rcu_read_lock(); begin: sk_nulls_for_each_rcu(sk, node, &head->chain) { if (sk->sk_hash != hash) @@ -339,7 +308,6 @@ begin: out: sk = NULL; found: - rcu_read_unlock(); return sk; } EXPORT_SYMBOL_GPL(__inet_lookup_established); @@ -392,7 +360,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, __sk_nulls_add_node_rcu(sk, &head->chain); if (tw) { sk_nulls_del_node_init_rcu((struct sock *)tw); - NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED); + __NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED); } spin_unlock(lock); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); @@ -472,10 +440,9 @@ static int inet_reuseport_add_sock(struct sock *sk, { struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash; struct sock *sk2; - struct hlist_nulls_node *node; kuid_t uid = sock_i_uid(sk); - sk_nulls_for_each_rcu(sk2, node, &ilb->head) { + sk_for_each_rcu(sk2, &ilb->head) { if (sk2 != sk && sk2->sk_family == sk->sk_family && ipv6_only_sock(sk2) == ipv6_only_sock(sk) && @@ -514,7 +481,12 @@ int __inet_hash(struct sock *sk, struct sock *osk, if (err) goto unlock; } - __sk_nulls_add_node_rcu(sk, &ilb->head); + if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && + sk->sk_family == AF_INET6) + hlist_add_tail_rcu(&sk->sk_node, &ilb->head); + else + hlist_add_head_rcu(&sk->sk_node, &ilb->head); + sock_set_flag(sk, SOCK_RCU_FREE); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); unlock: spin_unlock(&ilb->lock); @@ -541,20 +513,25 @@ void inet_unhash(struct sock *sk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; spinlock_t *lock; + bool listener = false; int done; if (sk_unhashed(sk)) return; - if (sk->sk_state == TCP_LISTEN) + if (sk->sk_state == TCP_LISTEN) { lock = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)].lock; - else + listener = true; + } else { lock = inet_ehash_lockp(hashinfo, sk->sk_hash); - + } spin_lock_bh(lock); if (rcu_access_pointer(sk->sk_reuseport_cb)) reuseport_detach_sock(sk); - done = __sk_nulls_del_node_init_rcu(sk); + if (listener) + done = __sk_del_node_init(sk); + else + done = __sk_nulls_del_node_init_rcu(sk); if (done) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); spin_unlock_bh(lock); @@ -690,9 +667,8 @@ void inet_hashinfo_init(struct inet_hashinfo *h) for (i = 0; i < INET_LHTABLE_SIZE; i++) { spin_lock_init(&h->listening_hash[i].lock); - INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].head, - i + LISTENING_NULLS_BASE); - } + INIT_HLIST_HEAD(&h->listening_hash[i].head); + } } EXPORT_SYMBOL_GPL(inet_hashinfo_init); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index c67f9bd76..206581674 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -94,7 +94,7 @@ static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw, } /* - * Enter the time wait state. This is called with locally disabled BH. + * Enter the time wait state. * Essentially we whip up a timewait bucket, copy the relevant info into it * from the SK, and mess with hash chains and list linkage. */ @@ -112,7 +112,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, */ bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num, hashinfo->bhash_size)]; - spin_lock(&bhead->lock); + spin_lock_bh(&bhead->lock); tw->tw_tb = icsk->icsk_bind_hash; WARN_ON(!icsk->icsk_bind_hash); inet_twsk_add_bind_node(tw, &tw->tw_tb->owners); @@ -138,7 +138,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, if (__sk_nulls_del_node_init_rcu(sk)) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); - spin_unlock(lock); + spin_unlock_bh(lock); } EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); @@ -147,9 +147,9 @@ static void tw_timer_handler(unsigned long data) struct inet_timewait_sock *tw = (struct inet_timewait_sock *)data; if (tw->tw_kill) - NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED); + __NET_INC_STATS(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED); else - NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED); + __NET_INC_STATS(twsk_net(tw), LINUX_MIB_TIMEWAITED); inet_twsk_kill(tw); } diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index af18f1e48..cbfb1808f 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -65,8 +65,8 @@ static int ip_forward_finish(struct net *net, struct sock *sk, struct sk_buff *s { struct ip_options *opt = &(IPCB(skb)->opt); - IP_INC_STATS_BH(net, IPSTATS_MIB_OUTFORWDATAGRAMS); - IP_ADD_STATS_BH(net, IPSTATS_MIB_OUTOCTETS, skb->len); + __IP_INC_STATS(net, IPSTATS_MIB_OUTFORWDATAGRAMS); + __IP_ADD_STATS(net, IPSTATS_MIB_OUTOCTETS, skb->len); if (unlikely(opt->optlen)) ip_forward_options(skb); @@ -157,7 +157,7 @@ sr_failed: too_many_hops: /* Tell the sender its packet died... */ - IP_INC_STATS_BH(net, IPSTATS_MIB_INHDRERRORS); + __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS); icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); drop: kfree_skb(skb); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index efbd47d1a..bbe7f72db 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -204,14 +204,14 @@ static void ip_expire(unsigned long arg) goto out; ipq_kill(qp); - IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); + __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); if (!inet_frag_evicting(&qp->q)) { struct sk_buff *head = qp->q.fragments; const struct iphdr *iph; int err; - IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT); + __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT); if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !qp->q.fragments) goto out; @@ -291,7 +291,7 @@ static int ip_frag_too_far(struct ipq *qp) struct net *net; net = container_of(qp->q.net, struct net, ipv4.frags); - IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); + __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); } return rc; @@ -635,7 +635,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, ip_send_check(iph); - IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS); + __IP_INC_STATS(net, IPSTATS_MIB_REASMOKS); qp->q.fragments = NULL; qp->q.fragments_tail = NULL; return 0; @@ -647,7 +647,7 @@ out_nomem: out_oversize: net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr); out_fail: - IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); + __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); return err; } @@ -658,7 +658,7 @@ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user) int vif = l3mdev_master_ifindex_rcu(dev); struct ipq *qp; - IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS); + __IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS); skb_orphan(skb); /* Lookup (or create) queue header */ @@ -675,7 +675,7 @@ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user) return ret; } - IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); + __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); kfree_skb(skb); return -ENOMEM; } diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 4cc84212c..1d000af7f 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -49,12 +49,6 @@ #include <net/gre.h> #include <net/dst_metadata.h> -#if IS_ENABLED(CONFIG_IPV6) -#include <net/ipv6.h> -#include <net/ip6_fib.h> -#include <net/ip6_route.h> -#endif - /* Problems & solutions -------------------- @@ -122,126 +116,6 @@ static int ipgre_tunnel_init(struct net_device *dev); static int ipgre_net_id __read_mostly; static int gre_tap_net_id __read_mostly; -static int ip_gre_calc_hlen(__be16 o_flags) -{ - int addend = 4; - - if (o_flags & TUNNEL_CSUM) - addend += 4; - if (o_flags & TUNNEL_KEY) - addend += 4; - if (o_flags & TUNNEL_SEQ) - addend += 4; - return addend; -} - -static __be16 gre_flags_to_tnl_flags(__be16 flags) -{ - __be16 tflags = 0; - - if (flags & GRE_CSUM) - tflags |= TUNNEL_CSUM; - if (flags & GRE_ROUTING) - tflags |= TUNNEL_ROUTING; - if (flags & GRE_KEY) - tflags |= TUNNEL_KEY; - if (flags & GRE_SEQ) - tflags |= TUNNEL_SEQ; - if (flags & GRE_STRICT) - tflags |= TUNNEL_STRICT; - if (flags & GRE_REC) - tflags |= TUNNEL_REC; - if (flags & GRE_VERSION) - tflags |= TUNNEL_VERSION; - - return tflags; -} - -static __be16 tnl_flags_to_gre_flags(__be16 tflags) -{ - __be16 flags = 0; - - if (tflags & TUNNEL_CSUM) - flags |= GRE_CSUM; - if (tflags & TUNNEL_ROUTING) - flags |= GRE_ROUTING; - if (tflags & TUNNEL_KEY) - flags |= GRE_KEY; - if (tflags & TUNNEL_SEQ) - flags |= GRE_SEQ; - if (tflags & TUNNEL_STRICT) - flags |= GRE_STRICT; - if (tflags & TUNNEL_REC) - flags |= GRE_REC; - if (tflags & TUNNEL_VERSION) - flags |= GRE_VERSION; - - return flags; -} - -/* Fills in tpi and returns header length to be pulled. */ -static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, - bool *csum_err) -{ - const struct gre_base_hdr *greh; - __be32 *options; - int hdr_len; - - if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr)))) - return -EINVAL; - - greh = (struct gre_base_hdr *)skb_transport_header(skb); - if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING))) - return -EINVAL; - - tpi->flags = gre_flags_to_tnl_flags(greh->flags); - hdr_len = ip_gre_calc_hlen(tpi->flags); - - if (!pskb_may_pull(skb, hdr_len)) - return -EINVAL; - - greh = (struct gre_base_hdr *)skb_transport_header(skb); - tpi->proto = greh->protocol; - - options = (__be32 *)(greh + 1); - if (greh->flags & GRE_CSUM) { - if (skb_checksum_simple_validate(skb)) { - *csum_err = true; - return -EINVAL; - } - - skb_checksum_try_convert(skb, IPPROTO_GRE, 0, - null_compute_pseudo); - options++; - } - - if (greh->flags & GRE_KEY) { - tpi->key = *options; - options++; - } else { - tpi->key = 0; - } - if (unlikely(greh->flags & GRE_SEQ)) { - tpi->seq = *options; - options++; - } else { - tpi->seq = 0; - } - /* WCCP version 1 and 2 protocol decoding. - * - Change protocol to IP - * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header - */ - if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) { - tpi->proto = htons(ETH_P_IP); - if ((*(u8 *)options & 0xF0) != 0x40) { - hdr_len += 4; - if (!pskb_may_pull(skb, hdr_len)) - return -EINVAL; - } - } - return hdr_len; -} - static void ipgre_err(struct sk_buff *skb, u32 info, const struct tnl_ptk_info *tpi) { @@ -337,12 +211,14 @@ static void gre_err(struct sk_buff *skb, u32 info) * by themselves??? */ + const struct iphdr *iph = (struct iphdr *)skb->data; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; struct tnl_ptk_info tpi; bool csum_err = false; - if (parse_gre_header(skb, &tpi, &csum_err) < 0) { + if (gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP), + iph->ihl * 4) < 0) { if (!csum_err) /* ignore csum errors. */ return; } @@ -380,24 +256,22 @@ static __be32 tunnel_id_to_key(__be64 x) #endif } -static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) +static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi, + struct ip_tunnel_net *itn, int hdr_len, bool raw_proto) { - struct net *net = dev_net(skb->dev); struct metadata_dst *tun_dst = NULL; - struct ip_tunnel_net *itn; const struct iphdr *iph; struct ip_tunnel *tunnel; - if (tpi->proto == htons(ETH_P_TEB)) - itn = net_generic(net, gre_tap_net_id); - else - itn = net_generic(net, ipgre_net_id); - iph = ip_hdr(skb); tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags, iph->saddr, iph->daddr, tpi->key); if (tunnel) { + if (__iptunnel_pull_header(skb, hdr_len, tpi->proto, + raw_proto, false) < 0) + goto drop; + if (tunnel->dev->type != ARPHRD_NONE) skb_pop_mac_header(skb); else @@ -416,7 +290,34 @@ static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); return PACKET_RCVD; } - return PACKET_REJECT; + return PACKET_NEXT; + +drop: + kfree_skb(skb); + return PACKET_RCVD; +} + +static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi, + int hdr_len) +{ + struct net *net = dev_net(skb->dev); + struct ip_tunnel_net *itn; + int res; + + if (tpi->proto == htons(ETH_P_TEB)) + itn = net_generic(net, gre_tap_net_id); + else + itn = net_generic(net, ipgre_net_id); + + res = __ipgre_rcv(skb, tpi, itn, hdr_len, false); + if (res == PACKET_NEXT && tpi->proto == htons(ETH_P_TEB)) { + /* ipgre tunnels in collect metadata mode should receive + * also ETH_P_TEB traffic. + */ + itn = net_generic(net, ipgre_net_id); + res = __ipgre_rcv(skb, tpi, itn, hdr_len, true); + } + return res; } static int gre_rcv(struct sk_buff *skb) @@ -433,13 +334,11 @@ static int gre_rcv(struct sk_buff *skb) } #endif - hdr_len = parse_gre_header(skb, &tpi, &csum_err); + hdr_len = gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP), 0); if (hdr_len < 0) goto drop; - if (iptunnel_pull_header(skb, hdr_len, tpi.proto, false) < 0) - goto drop; - if (ipgre_rcv(skb, &tpi) == PACKET_RCVD) + if (ipgre_rcv(skb, &tpi, hdr_len) == PACKET_RCVD) return 0; icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); @@ -448,49 +347,6 @@ drop: return 0; } -static __sum16 gre_checksum(struct sk_buff *skb) -{ - __wsum csum; - - if (skb->ip_summed == CHECKSUM_PARTIAL) - csum = lco_csum(skb); - else - csum = skb_checksum(skb, 0, skb->len, 0); - return csum_fold(csum); -} - -static void build_header(struct sk_buff *skb, int hdr_len, __be16 flags, - __be16 proto, __be32 key, __be32 seq) -{ - struct gre_base_hdr *greh; - - skb_push(skb, hdr_len); - - skb_reset_transport_header(skb); - greh = (struct gre_base_hdr *)skb->data; - greh->flags = tnl_flags_to_gre_flags(flags); - greh->protocol = proto; - - if (flags & (TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_SEQ)) { - __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4); - - if (flags & TUNNEL_SEQ) { - *ptr = seq; - ptr--; - } - if (flags & TUNNEL_KEY) { - *ptr = key; - ptr--; - } - if (flags & TUNNEL_CSUM && - !(skb_shinfo(skb)->gso_type & - (SKB_GSO_GRE | SKB_GSO_GRE_CSUM))) { - *ptr = 0; - *(__sum16 *)ptr = gre_checksum(skb); - } - } -} - static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, __be16 proto) @@ -501,15 +357,15 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, tunnel->o_seqno++; /* Push GRE header. */ - build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags, - proto, tunnel->parms.o_key, htonl(tunnel->o_seqno)); + gre_build_header(skb, tunnel->tun_hlen, + tunnel->parms.o_flags, proto, tunnel->parms.o_key, + htonl(tunnel->o_seqno)); skb_set_inner_protocol(skb, proto); ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol); } -static struct sk_buff *gre_handle_offloads(struct sk_buff *skb, - bool csum) +static int gre_handle_offloads(struct sk_buff *skb, bool csum) { return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE); } @@ -562,7 +418,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, fl.saddr); } - tunnel_hlen = ip_gre_calc_hlen(key->tun_flags); + tunnel_hlen = gre_calc_hlen(key->tun_flags); min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len + tunnel_hlen + sizeof(struct iphdr); @@ -577,15 +433,12 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, } /* Push Tunnel header. */ - skb = gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM)); - if (IS_ERR(skb)) { - skb = NULL; + if (gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM))) goto err_free_rt; - } flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY); - build_header(skb, tunnel_hlen, flags, proto, - tunnel_id_to_key(tun_info->key.tun_id), 0); + gre_build_header(skb, tunnel_hlen, flags, proto, + tunnel_id_to_key(tun_info->key.tun_id), 0); df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; @@ -649,16 +502,14 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb, tnl_params = &tunnel->parms.iph; } - skb = gre_handle_offloads(skb, !!(tunnel->parms.o_flags&TUNNEL_CSUM)); - if (IS_ERR(skb)) - goto out; + if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM))) + goto free_skb; __gre_xmit(skb, dev, tnl_params, skb->protocol); return NETDEV_TX_OK; free_skb: kfree_skb(skb); -out: dev->stats.tx_dropped++; return NETDEV_TX_OK; } @@ -673,9 +524,8 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb, return NETDEV_TX_OK; } - skb = gre_handle_offloads(skb, !!(tunnel->parms.o_flags&TUNNEL_CSUM)); - if (IS_ERR(skb)) - goto out; + if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM))) + goto free_skb; if (skb_cow_head(skb, dev->needed_headroom)) goto free_skb; @@ -685,7 +535,6 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb, free_skb: kfree_skb(skb); -out: dev->stats.tx_dropped++; return NETDEV_TX_OK; } @@ -711,8 +560,8 @@ static int ipgre_tunnel_ioctl(struct net_device *dev, if (err) return err; - p.i_flags = tnl_flags_to_gre_flags(p.i_flags); - p.o_flags = tnl_flags_to_gre_flags(p.o_flags); + p.i_flags = gre_tnl_flags_to_gre_flags(p.i_flags); + p.o_flags = gre_tnl_flags_to_gre_flags(p.o_flags); if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) return -EFAULT; @@ -756,7 +605,7 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev, iph = (struct iphdr *)skb_push(skb, t->hlen + sizeof(*iph)); greh = (struct gre_base_hdr *)(iph+1); - greh->flags = tnl_flags_to_gre_flags(t->parms.o_flags); + greh->flags = gre_tnl_flags_to_gre_flags(t->parms.o_flags); greh->protocol = htons(type); memcpy(iph, &t->parms.iph, sizeof(struct iphdr)); @@ -857,7 +706,7 @@ static void __gre_tunnel_init(struct net_device *dev) int t_hlen; tunnel = netdev_priv(dev); - tunnel->tun_hlen = ip_gre_calc_hlen(tunnel->parms.o_flags); + tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags); tunnel->parms.iph.protocol = IPPROTO_GRE; tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen; @@ -1180,8 +1029,10 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) struct ip_tunnel_parm *p = &t->parms; if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) || - nla_put_be16(skb, IFLA_GRE_IFLAGS, tnl_flags_to_gre_flags(p->i_flags)) || - nla_put_be16(skb, IFLA_GRE_OFLAGS, tnl_flags_to_gre_flags(p->o_flags)) || + nla_put_be16(skb, IFLA_GRE_IFLAGS, + gre_tnl_flags_to_gre_flags(p->i_flags)) || + nla_put_be16(skb, IFLA_GRE_OFLAGS, + gre_tnl_flags_to_gre_flags(p->o_flags)) || nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) || nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) || nla_put_in_addr(skb, IFLA_GRE_LOCAL, p->iph.saddr) || @@ -1266,6 +1117,7 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name, { struct nlattr *tb[IFLA_MAX + 1]; struct net_device *dev; + LIST_HEAD(list_kill); struct ip_tunnel *t; int err; @@ -1281,8 +1133,10 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name, t->collect_md = true; err = ipgre_newlink(net, dev, tb, NULL); - if (err < 0) - goto out; + if (err < 0) { + free_netdev(dev); + return ERR_PTR(err); + } /* openvswitch users expect packet sizes to be unrestricted, * so set the largest MTU we can. @@ -1291,9 +1145,14 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name, if (err) goto out; + err = rtnl_configure_link(dev, NULL); + if (err < 0) + goto out; + return dev; out: - free_netdev(dev); + ip_tunnel_dellink(dev, &list_kill); + unregister_netdevice_many(&list_kill); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(gretap_fb_dev_create); diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index e3d782746..4b351af3e 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -218,17 +218,17 @@ static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_b protocol = -ret; goto resubmit; } - IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS); + __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS); } else { if (!raw) { if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { - IP_INC_STATS_BH(net, IPSTATS_MIB_INUNKNOWNPROTOS); + __IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0); } kfree_skb(skb); } else { - IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS); + __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS); consume_skb(skb); } } @@ -273,7 +273,7 @@ static inline bool ip_rcv_options(struct sk_buff *skb) --ANK (980813) */ if (skb_cow(skb, skb_headroom(skb))) { - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS); + __IP_INC_STATS(dev_net(dev), IPSTATS_MIB_INDISCARDS); goto drop; } @@ -282,7 +282,7 @@ static inline bool ip_rcv_options(struct sk_buff *skb) opt->optlen = iph->ihl*4 - sizeof(struct iphdr); if (ip_options_compile(dev_net(dev), opt, skb)) { - IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS); + __IP_INC_STATS(dev_net(dev), IPSTATS_MIB_INHDRERRORS); goto drop; } @@ -313,6 +313,13 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; + /* if ingress device is enslaved to an L3 master device pass the + * skb to its handler for processing + */ + skb = l3mdev_ip_rcv(skb); + if (!skb) + return NET_RX_SUCCESS; + if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && !skb->sk && @@ -337,7 +344,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) iph->tos, skb->dev); if (unlikely(err)) { if (err == -EXDEV) - NET_INC_STATS_BH(net, LINUX_MIB_IPRPFILTER); + __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER); goto drop; } } @@ -358,9 +365,9 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) rt = skb_rtable(skb); if (rt->rt_type == RTN_MULTICAST) { - IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INMCAST, skb->len); + __IP_UPD_PO_STATS(net, IPSTATS_MIB_INMCAST, skb->len); } else if (rt->rt_type == RTN_BROADCAST) { - IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INBCAST, skb->len); + __IP_UPD_PO_STATS(net, IPSTATS_MIB_INBCAST, skb->len); } else if (skb->pkt_type == PACKET_BROADCAST || skb->pkt_type == PACKET_MULTICAST) { struct in_device *in_dev = __in_dev_get_rcu(skb->dev); @@ -409,11 +416,11 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, net = dev_net(dev); - IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_IN, skb->len); + __IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len); skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) { - IP_INC_STATS_BH(net, IPSTATS_MIB_INDISCARDS); + __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS); goto out; } @@ -439,9 +446,9 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1); BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0); BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE); - IP_ADD_STATS_BH(net, - IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK), - max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs)); + __IP_ADD_STATS(net, + IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK), + max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs)); if (!pskb_may_pull(skb, iph->ihl*4)) goto inhdr_error; @@ -453,7 +460,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, len = ntohs(iph->tot_len); if (skb->len < len) { - IP_INC_STATS_BH(net, IPSTATS_MIB_INTRUNCATEDPKTS); + __IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; } else if (len < (iph->ihl*4)) goto inhdr_error; @@ -463,7 +470,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, * Note this now means skb->len holds ntohs(iph->tot_len). */ if (pskb_trim_rcsum(skb, len)) { - IP_INC_STATS_BH(net, IPSTATS_MIB_INDISCARDS); + __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS); goto drop; } @@ -471,6 +478,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, /* Remove any debris in the socket control block */ memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); + IPCB(skb)->iif = skb->skb_iif; /* Must drop socket now because of tproxy. */ skb_orphan(skb); @@ -480,9 +488,9 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, ip_rcv_finish); csum_error: - IP_INC_STATS_BH(net, IPSTATS_MIB_CSUMERRORS); + __IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS); inhdr_error: - IP_INC_STATS_BH(net, IPSTATS_MIB_INHDRERRORS); + __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS); drop: kfree_skb(skb); out: diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 124bf0a66..4bd492163 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -271,7 +271,7 @@ static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *sk return dst_output(net, sk, skb); } #endif - mtu = ip_skb_dst_mtu(skb); + mtu = ip_skb_dst_mtu(sk, skb); if (skb_is_gso(skb)) return ip_finish_output_gso(net, sk, skb, mtu); @@ -541,7 +541,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, iph = ip_hdr(skb); - mtu = ip_skb_dst_mtu(skb); + mtu = ip_skb_dst_mtu(sk, skb); if (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size < mtu) mtu = IPCB(skb)->frag_max_size; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 035ad645a..71a52f4d4 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -106,7 +106,8 @@ static void ip_cmsg_recv_checksum(struct msghdr *msg, struct sk_buff *skb, return; if (offset != 0) - csum = csum_sub(csum, csum_partial(skb->data, offset, 0)); + csum = csum_sub(csum, csum_partial(skb_transport_header(skb), + offset, 0)); put_cmsg(msg, SOL_IP, IP_CHECKSUM, sizeof(__wsum), &csum); } @@ -219,11 +220,12 @@ void ip_cmsg_recv_offset(struct msghdr *msg, struct sk_buff *skb, } EXPORT_SYMBOL(ip_cmsg_recv_offset); -int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc, +int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, bool allow_ipv6) { int err, val; struct cmsghdr *cmsg; + struct net *net = sock_net(sk); for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) @@ -244,6 +246,13 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc, continue; } #endif + if (cmsg->cmsg_level == SOL_SOCKET) { + err = __sock_cmsg_send(sk, msg, cmsg, &ipc->sockc); + if (err) + return err; + continue; + } + if (cmsg->cmsg_level != SOL_IP) continue; switch (cmsg->cmsg_type) { @@ -502,9 +511,10 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) copied = len; } err = skb_copy_datagram_msg(skb, 0, msg, copied); - if (err) - goto out_free_skb; - + if (unlikely(err)) { + kfree_skb(skb); + return err; + } sock_recv_timestamp(msg, sk, skb); serr = SKB_EXT_ERR(skb); @@ -536,8 +546,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) msg->msg_flags |= MSG_ERRQUEUE; err = copied; -out_free_skb: - kfree_skb(skb); + consume_skb(skb); out: return err; } @@ -635,7 +644,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, if (err) break; old = rcu_dereference_protected(inet->inet_opt, - sock_owned_by_user(sk)); + lockdep_sock_is_held(sk)); if (inet->is_icsk) { struct inet_connection_sock *icsk = inet_csk(sk); #if IS_ENABLED(CONFIG_IPV6) @@ -1185,7 +1194,12 @@ void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb) ipv6_sk_rxinfo(sk); if (prepare && skb_rtable(skb)) { - pktinfo->ipi_ifindex = inet_iif(skb); + /* skb->cb is overloaded: prior to this point it is IP{6}CB + * which has interface index (iif) as the first member of the + * underlying inet{6}_skb_parm struct. This code then overlays + * PKTINFO_SKB_CB and in_pktinfo also has iif as the first + * element so the iif is picked up from the prior IPCB + */ pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb); } else { pktinfo->ipi_ifindex = 0; @@ -1295,7 +1309,7 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, struct ip_options_rcu *inet_opt; inet_opt = rcu_dereference_protected(inet->inet_opt, - sock_owned_by_user(sk)); + lockdep_sock_is_held(sk)); opt->optlen = 0; if (inet_opt) memcpy(optbuf, &inet_opt->opt, diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index a69ed94bd..d8f5e0a26 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -443,29 +443,6 @@ drop: } EXPORT_SYMBOL_GPL(ip_tunnel_rcv); -static int ip_encap_hlen(struct ip_tunnel_encap *e) -{ - const struct ip_tunnel_encap_ops *ops; - int hlen = -EINVAL; - - if (e->type == TUNNEL_ENCAP_NONE) - return 0; - - if (e->type >= MAX_IPTUN_ENCAP_OPS) - return -EINVAL; - - rcu_read_lock(); - ops = rcu_dereference(iptun_encaps[e->type]); - if (likely(ops && ops->encap_hlen)) - hlen = ops->encap_hlen(e); - rcu_read_unlock(); - - return hlen; -} - -const struct ip_tunnel_encap_ops __rcu * - iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly; - int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops, unsigned int num) { @@ -519,28 +496,6 @@ int ip_tunnel_encap_setup(struct ip_tunnel *t, } EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup); -int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t, - u8 *protocol, struct flowi4 *fl4) -{ - const struct ip_tunnel_encap_ops *ops; - int ret = -EINVAL; - - if (t->encap.type == TUNNEL_ENCAP_NONE) - return 0; - - if (t->encap.type >= MAX_IPTUN_ENCAP_OPS) - return -EINVAL; - - rcu_read_lock(); - ops = rcu_dereference(iptun_encaps[t->encap.type]); - if (likely(ops && ops->build_header)) - ret = ops->build_header(skb, &t->encap, protocol, fl4); - rcu_read_unlock(); - - return ret; -} -EXPORT_SYMBOL(ip_tunnel_encap); - static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, struct rtable *rt, __be16 df, const struct iphdr *inner_iph) diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 6165f30c4..afd6b5968 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -37,6 +37,7 @@ #include <net/icmp.h> #include <net/protocol.h> #include <net/ip_tunnels.h> +#include <net/ip6_tunnel.h> #include <net/arp.h> #include <net/checksum.h> #include <net/dsfield.h> @@ -47,6 +48,14 @@ #include <net/rtnetlink.h> #include <net/dst_metadata.h> +const struct ip_tunnel_encap_ops __rcu * + iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly; +EXPORT_SYMBOL(iptun_encaps); + +const struct ip6_tnl_encap_ops __rcu * + ip6tun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly; +EXPORT_SYMBOL(ip6tun_encaps); + void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, __u8 proto, __u8 tos, __u8 ttl, __be16 df, bool xnet) @@ -86,15 +95,15 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, } EXPORT_SYMBOL_GPL(iptunnel_xmit); -int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto, - bool xnet) +int __iptunnel_pull_header(struct sk_buff *skb, int hdr_len, + __be16 inner_proto, bool raw_proto, bool xnet) { if (unlikely(!pskb_may_pull(skb, hdr_len))) return -ENOMEM; skb_pull_rcsum(skb, hdr_len); - if (inner_proto == htons(ETH_P_TEB)) { + if (!raw_proto && inner_proto == htons(ETH_P_TEB)) { struct ethhdr *eh; if (unlikely(!pskb_may_pull(skb, ETH_HLEN))) @@ -117,7 +126,7 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto, return iptunnel_pull_offloads(skb); } -EXPORT_SYMBOL_GPL(iptunnel_pull_header); +EXPORT_SYMBOL_GPL(__iptunnel_pull_header); struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, gfp_t flags) @@ -146,8 +155,8 @@ struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, } EXPORT_SYMBOL_GPL(iptunnel_metadata_reply); -struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, - int gso_type_mask) +int iptunnel_handle_offloads(struct sk_buff *skb, + int gso_type_mask) { int err; @@ -157,11 +166,11 @@ struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, } if (skb_is_gso(skb)) { - err = skb_unclone(skb, GFP_ATOMIC); + err = skb_header_unclone(skb, GFP_ATOMIC); if (unlikely(err)) - goto error; + return err; skb_shinfo(skb)->gso_type |= gso_type_mask; - return skb; + return 0; } if (skb->ip_summed != CHECKSUM_PARTIAL) { @@ -174,10 +183,7 @@ struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb, skb->encapsulation = 0; } - return skb; -error: - kfree_skb(skb); - return ERR_PTR(err); + return 0; } EXPORT_SYMBOL_GPL(iptunnel_handle_offloads); @@ -247,10 +253,10 @@ static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr, tun_info->key.tun_id = nla_get_be64(tb[LWTUNNEL_IP_ID]); if (tb[LWTUNNEL_IP_DST]) - tun_info->key.u.ipv4.dst = nla_get_be32(tb[LWTUNNEL_IP_DST]); + tun_info->key.u.ipv4.dst = nla_get_in_addr(tb[LWTUNNEL_IP_DST]); if (tb[LWTUNNEL_IP_SRC]) - tun_info->key.u.ipv4.src = nla_get_be32(tb[LWTUNNEL_IP_SRC]); + tun_info->key.u.ipv4.src = nla_get_in_addr(tb[LWTUNNEL_IP_SRC]); if (tb[LWTUNNEL_IP_TTL]) tun_info->key.ttl = nla_get_u8(tb[LWTUNNEL_IP_TTL]); @@ -274,9 +280,10 @@ static int ip_tun_fill_encap_info(struct sk_buff *skb, { struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate); - if (nla_put_be64(skb, LWTUNNEL_IP_ID, tun_info->key.tun_id) || - nla_put_be32(skb, LWTUNNEL_IP_DST, tun_info->key.u.ipv4.dst) || - nla_put_be32(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) || + if (nla_put_be64(skb, LWTUNNEL_IP_ID, tun_info->key.tun_id, + LWTUNNEL_IP_PAD) || + nla_put_in_addr(skb, LWTUNNEL_IP_DST, tun_info->key.u.ipv4.dst) || + nla_put_in_addr(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) || nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.tos) || nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ttl) || nla_put_be16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags)) @@ -287,7 +294,7 @@ static int ip_tun_fill_encap_info(struct sk_buff *skb, static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate) { - return nla_total_size(8) /* LWTUNNEL_IP_ID */ + return nla_total_size_64bit(8) /* LWTUNNEL_IP_ID */ + nla_total_size(4) /* LWTUNNEL_IP_DST */ + nla_total_size(4) /* LWTUNNEL_IP_SRC */ + nla_total_size(1) /* LWTUNNEL_IP_TOS */ @@ -369,7 +376,8 @@ static int ip6_tun_fill_encap_info(struct sk_buff *skb, { struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate); - if (nla_put_be64(skb, LWTUNNEL_IP6_ID, tun_info->key.tun_id) || + if (nla_put_be64(skb, LWTUNNEL_IP6_ID, tun_info->key.tun_id, + LWTUNNEL_IP6_PAD) || nla_put_in6_addr(skb, LWTUNNEL_IP6_DST, &tun_info->key.u.ipv6.dst) || nla_put_in6_addr(skb, LWTUNNEL_IP6_SRC, &tun_info->key.u.ipv6.src) || nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.tos) || @@ -382,7 +390,7 @@ static int ip6_tun_fill_encap_info(struct sk_buff *skb, static int ip6_tun_encap_nlsize(struct lwtunnel_state *lwtstate) { - return nla_total_size(8) /* LWTUNNEL_IP6_ID */ + return nla_total_size_64bit(8) /* LWTUNNEL_IP6_ID */ + nla_total_size(16) /* LWTUNNEL_IP6_DST */ + nla_total_size(16) /* LWTUNNEL_IP6_SRC */ + nla_total_size(1) /* LWTUNNEL_IP6_HOPLIMIT */ diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 2ed9dd2b5..1d71c40ea 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -127,7 +127,9 @@ __be32 ic_myaddr = NONE; /* My IP address */ static __be32 ic_netmask = NONE; /* Netmask for local subnet */ __be32 ic_gateway = NONE; /* Gateway IP address */ -__be32 ic_addrservaddr = NONE; /* IP Address of the IP addresses'server */ +#ifdef IPCONFIG_DYNAMIC +static __be32 ic_addrservaddr = NONE; /* IP Address of the IP addresses'server */ +#endif __be32 ic_servaddr = NONE; /* Boot server IP address */ diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index ec51d0216..978370132 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -219,9 +219,8 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) if (unlikely(skb->protocol != htons(ETH_P_IP))) goto tx_error; - skb = iptunnel_handle_offloads(skb, SKB_GSO_IPIP); - if (IS_ERR(skb)) - goto out; + if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4)) + goto tx_error; skb_set_inner_ipproto(skb, IPPROTO_IPIP); @@ -230,7 +229,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) tx_error: kfree_skb(skb); -out: + dev->stats.tx_errors++; return NETDEV_TX_OK; } diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index a42dd8021..5ad48ec77 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -2106,7 +2106,7 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, mfcs.mfcs_packets = c->mfc_un.res.pkt; mfcs.mfcs_bytes = c->mfc_un.res.bytes; mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if; - if (nla_put(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs) < 0) + if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) < 0) return -EMSGSIZE; rtm->rtm_type = RTN_MULTICAST; @@ -2239,7 +2239,7 @@ static size_t mroute_msgsize(bool unresolved, int maxvif) + nla_total_size(0) /* RTA_MULTIPATH */ + maxvif * NLA_ALIGN(sizeof(struct rtnexthop)) /* RTA_MFC_STATS */ - + nla_total_size(sizeof(struct rta_mfc_stats)) + + nla_total_size_64bit(sizeof(struct rta_mfc_stats)) ; return len; diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 85d60c69b..2033f929a 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -34,27 +34,6 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("David S. Miller <davem@redhat.com>"); MODULE_DESCRIPTION("arptables core"); -/*#define DEBUG_ARP_TABLES*/ -/*#define DEBUG_ARP_TABLES_USER*/ - -#ifdef DEBUG_ARP_TABLES -#define dprintf(format, args...) pr_debug(format, ## args) -#else -#define dprintf(format, args...) -#endif - -#ifdef DEBUG_ARP_TABLES_USER -#define duprintf(format, args...) pr_debug(format, ## args) -#else -#define duprintf(format, args...) -#endif - -#ifdef CONFIG_NETFILTER_DEBUG -#define ARP_NF_ASSERT(x) WARN_ON(!(x)) -#else -#define ARP_NF_ASSERT(x) -#endif - void *arpt_alloc_initial_table(const struct xt_table *info) { return xt_alloc_initial_table(arpt, ARPT); @@ -113,36 +92,20 @@ static inline int arp_packet_match(const struct arphdr *arphdr, #define FWINV(bool, invflg) ((bool) ^ !!(arpinfo->invflags & (invflg))) if (FWINV((arphdr->ar_op & arpinfo->arpop_mask) != arpinfo->arpop, - ARPT_INV_ARPOP)) { - dprintf("ARP operation field mismatch.\n"); - dprintf("ar_op: %04x info->arpop: %04x info->arpop_mask: %04x\n", - arphdr->ar_op, arpinfo->arpop, arpinfo->arpop_mask); + ARPT_INV_ARPOP)) return 0; - } if (FWINV((arphdr->ar_hrd & arpinfo->arhrd_mask) != arpinfo->arhrd, - ARPT_INV_ARPHRD)) { - dprintf("ARP hardware address format mismatch.\n"); - dprintf("ar_hrd: %04x info->arhrd: %04x info->arhrd_mask: %04x\n", - arphdr->ar_hrd, arpinfo->arhrd, arpinfo->arhrd_mask); + ARPT_INV_ARPHRD)) return 0; - } if (FWINV((arphdr->ar_pro & arpinfo->arpro_mask) != arpinfo->arpro, - ARPT_INV_ARPPRO)) { - dprintf("ARP protocol address format mismatch.\n"); - dprintf("ar_pro: %04x info->arpro: %04x info->arpro_mask: %04x\n", - arphdr->ar_pro, arpinfo->arpro, arpinfo->arpro_mask); + ARPT_INV_ARPPRO)) return 0; - } if (FWINV((arphdr->ar_hln & arpinfo->arhln_mask) != arpinfo->arhln, - ARPT_INV_ARPHLN)) { - dprintf("ARP hardware address length mismatch.\n"); - dprintf("ar_hln: %02x info->arhln: %02x info->arhln_mask: %02x\n", - arphdr->ar_hln, arpinfo->arhln, arpinfo->arhln_mask); + ARPT_INV_ARPHLN)) return 0; - } src_devaddr = arpptr; arpptr += dev->addr_len; @@ -155,49 +118,25 @@ static inline int arp_packet_match(const struct arphdr *arphdr, if (FWINV(arp_devaddr_compare(&arpinfo->src_devaddr, src_devaddr, dev->addr_len), ARPT_INV_SRCDEVADDR) || FWINV(arp_devaddr_compare(&arpinfo->tgt_devaddr, tgt_devaddr, dev->addr_len), - ARPT_INV_TGTDEVADDR)) { - dprintf("Source or target device address mismatch.\n"); - + ARPT_INV_TGTDEVADDR)) return 0; - } if (FWINV((src_ipaddr & arpinfo->smsk.s_addr) != arpinfo->src.s_addr, ARPT_INV_SRCIP) || FWINV(((tgt_ipaddr & arpinfo->tmsk.s_addr) != arpinfo->tgt.s_addr), - ARPT_INV_TGTIP)) { - dprintf("Source or target IP address mismatch.\n"); - - dprintf("SRC: %pI4. Mask: %pI4. Target: %pI4.%s\n", - &src_ipaddr, - &arpinfo->smsk.s_addr, - &arpinfo->src.s_addr, - arpinfo->invflags & ARPT_INV_SRCIP ? " (INV)" : ""); - dprintf("TGT: %pI4 Mask: %pI4 Target: %pI4.%s\n", - &tgt_ipaddr, - &arpinfo->tmsk.s_addr, - &arpinfo->tgt.s_addr, - arpinfo->invflags & ARPT_INV_TGTIP ? " (INV)" : ""); + ARPT_INV_TGTIP)) return 0; - } /* Look for ifname matches. */ ret = ifname_compare(indev, arpinfo->iniface, arpinfo->iniface_mask); - if (FWINV(ret != 0, ARPT_INV_VIA_IN)) { - dprintf("VIA in mismatch (%s vs %s).%s\n", - indev, arpinfo->iniface, - arpinfo->invflags & ARPT_INV_VIA_IN ? " (INV)" : ""); + if (FWINV(ret != 0, ARPT_INV_VIA_IN)) return 0; - } ret = ifname_compare(outdev, arpinfo->outiface, arpinfo->outiface_mask); - if (FWINV(ret != 0, ARPT_INV_VIA_OUT)) { - dprintf("VIA out mismatch (%s vs %s).%s\n", - outdev, arpinfo->outiface, - arpinfo->invflags & ARPT_INV_VIA_OUT ? " (INV)" : ""); + if (FWINV(ret != 0, ARPT_INV_VIA_OUT)) return 0; - } return 1; #undef FWINV @@ -205,16 +144,10 @@ static inline int arp_packet_match(const struct arphdr *arphdr, static inline int arp_checkentry(const struct arpt_arp *arp) { - if (arp->flags & ~ARPT_F_MASK) { - duprintf("Unknown flag bits set: %08X\n", - arp->flags & ~ARPT_F_MASK); + if (arp->flags & ~ARPT_F_MASK) return 0; - } - if (arp->invflags & ~ARPT_INV_MASK) { - duprintf("Unknown invflag bits set: %08X\n", - arp->invflags & ~ARPT_INV_MASK); + if (arp->invflags & ~ARPT_INV_MASK) return 0; - } return 1; } @@ -406,11 +339,9 @@ static int mark_source_chains(const struct xt_table_info *newinfo, = (void *)arpt_get_target_c(e); int visited = e->comefrom & (1 << hook); - if (e->comefrom & (1 << NF_ARP_NUMHOOKS)) { - pr_notice("arptables: loop hook %u pos %u %08X.\n", - hook, pos, e->comefrom); + if (e->comefrom & (1 << NF_ARP_NUMHOOKS)) return 0; - } + e->comefrom |= ((1 << hook) | (1 << NF_ARP_NUMHOOKS)); @@ -423,12 +354,8 @@ static int mark_source_chains(const struct xt_table_info *newinfo, if ((strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0) && - t->verdict < -NF_MAX_VERDICT - 1) { - duprintf("mark_source_chains: bad " - "negative verdict (%i)\n", - t->verdict); + t->verdict < -NF_MAX_VERDICT - 1) return 0; - } /* Return: backtrack through the last * big jump. @@ -461,17 +388,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo, if (strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0 && newpos >= 0) { - if (newpos > newinfo->size - - sizeof(struct arpt_entry)) { - duprintf("mark_source_chains: " - "bad verdict (%i)\n", - newpos); - return 0; - } - /* This a jump; chase it. */ - duprintf("Jump rule %u -> %u\n", - pos, newpos); e = (struct arpt_entry *) (entry0 + newpos); if (!find_jump_target(newinfo, e)) @@ -488,8 +405,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo, pos = newpos; } } -next: - duprintf("Finished chain %u\n", hook); +next: ; } return 1; } @@ -497,7 +413,6 @@ next: static inline int check_target(struct arpt_entry *e, const char *name) { struct xt_entry_target *t = arpt_get_target(e); - int ret; struct xt_tgchk_param par = { .table = name, .entryinfo = e, @@ -507,13 +422,7 @@ static inline int check_target(struct arpt_entry *e, const char *name) .family = NFPROTO_ARP, }; - ret = xt_check_target(&par, t->u.target_size - sizeof(*t), 0, false); - if (ret < 0) { - duprintf("arp_tables: check failed for `%s'.\n", - t->u.kernel.target->name); - return ret; - } - return 0; + return xt_check_target(&par, t->u.target_size - sizeof(*t), 0, false); } static inline int @@ -521,17 +430,18 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size) { struct xt_entry_target *t; struct xt_target *target; + unsigned long pcnt; int ret; - e->counters.pcnt = xt_percpu_counter_alloc(); - if (IS_ERR_VALUE(e->counters.pcnt)) + pcnt = xt_percpu_counter_alloc(); + if (IS_ERR_VALUE(pcnt)) return -ENOMEM; + e->counters.pcnt = pcnt; t = arpt_get_target(e); target = xt_request_find_target(NFPROTO_ARP, t->u.user.name, t->u.user.revision); if (IS_ERR(target)) { - duprintf("find_check_entry: `%s' not found\n", t->u.user.name); ret = PTR_ERR(target); goto out; } @@ -577,17 +487,12 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, if ((unsigned long)e % __alignof__(struct arpt_entry) != 0 || (unsigned char *)e + sizeof(struct arpt_entry) >= limit || - (unsigned char *)e + e->next_offset > limit) { - duprintf("Bad offset %p\n", e); + (unsigned char *)e + e->next_offset > limit) return -EINVAL; - } if (e->next_offset - < sizeof(struct arpt_entry) + sizeof(struct xt_entry_target)) { - duprintf("checking: element %p size %u\n", - e, e->next_offset); + < sizeof(struct arpt_entry) + sizeof(struct xt_entry_target)) return -EINVAL; - } if (!arp_checkentry(&e->arp)) return -EINVAL; @@ -604,12 +509,9 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, if ((unsigned char *)e - base == hook_entries[h]) newinfo->hook_entry[h] = hook_entries[h]; if ((unsigned char *)e - base == underflows[h]) { - if (!check_underflow(e)) { - pr_debug("Underflows must be unconditional and " - "use the STANDARD target with " - "ACCEPT/DROP\n"); + if (!check_underflow(e)) return -EINVAL; - } + newinfo->underflow[h] = underflows[h]; } } @@ -654,7 +556,6 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, newinfo->underflow[i] = 0xFFFFFFFF; } - duprintf("translate_table: size %u\n", newinfo->size); i = 0; /* Walk through entries, checking offsets. */ @@ -671,31 +572,21 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, XT_ERROR_TARGET) == 0) ++newinfo->stacksize; } - duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret); if (ret != 0) return ret; - if (i != repl->num_entries) { - duprintf("translate_table: %u not %u entries\n", - i, repl->num_entries); + if (i != repl->num_entries) return -EINVAL; - } /* Check hooks all assigned */ for (i = 0; i < NF_ARP_NUMHOOKS; i++) { /* Only hooks which are valid */ if (!(repl->valid_hooks & (1 << i))) continue; - if (newinfo->hook_entry[i] == 0xFFFFFFFF) { - duprintf("Invalid hook entry %u %u\n", - i, repl->hook_entry[i]); + if (newinfo->hook_entry[i] == 0xFFFFFFFF) return -EINVAL; - } - if (newinfo->underflow[i] == 0xFFFFFFFF) { - duprintf("Invalid underflow %u %u\n", - i, repl->underflow[i]); + if (newinfo->underflow[i] == 0xFFFFFFFF) return -EINVAL; - } } if (!mark_source_chains(newinfo, repl->valid_hooks, entry0)) @@ -903,11 +794,8 @@ static int get_info(struct net *net, void __user *user, struct xt_table *t; int ret; - if (*len != sizeof(struct arpt_getinfo)) { - duprintf("length %u != %Zu\n", *len, - sizeof(struct arpt_getinfo)); + if (*len != sizeof(struct arpt_getinfo)) return -EINVAL; - } if (copy_from_user(name, user, sizeof(name)) != 0) return -EFAULT; @@ -963,33 +851,25 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr, struct arpt_get_entries get; struct xt_table *t; - if (*len < sizeof(get)) { - duprintf("get_entries: %u < %Zu\n", *len, sizeof(get)); + if (*len < sizeof(get)) return -EINVAL; - } if (copy_from_user(&get, uptr, sizeof(get)) != 0) return -EFAULT; - if (*len != sizeof(struct arpt_get_entries) + get.size) { - duprintf("get_entries: %u != %Zu\n", *len, - sizeof(struct arpt_get_entries) + get.size); + if (*len != sizeof(struct arpt_get_entries) + get.size) return -EINVAL; - } + get.name[sizeof(get.name) - 1] = '\0'; t = xt_find_table_lock(net, NFPROTO_ARP, get.name); if (!IS_ERR_OR_NULL(t)) { const struct xt_table_info *private = t->private; - duprintf("t->private->number = %u\n", - private->number); if (get.size == private->size) ret = copy_entries_to_user(private->size, t, uptr->entrytable); - else { - duprintf("get_entries: I've got %u not %u!\n", - private->size, get.size); + else ret = -EAGAIN; - } + module_put(t->me); xt_table_unlock(t); } else @@ -1027,8 +907,6 @@ static int __do_replace(struct net *net, const char *name, /* You lied! */ if (valid_hooks != t->valid_hooks) { - duprintf("Valid hook crap: %08X vs %08X\n", - valid_hooks, t->valid_hooks); ret = -EINVAL; goto put_module; } @@ -1038,8 +916,6 @@ static int __do_replace(struct net *net, const char *name, goto put_module; /* Update module usage count based on number of rules */ - duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n", - oldinfo->number, oldinfo->initial_entries, newinfo->number); if ((oldinfo->number > oldinfo->initial_entries) || (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); @@ -1109,8 +985,6 @@ static int do_replace(struct net *net, const void __user *user, if (ret != 0) goto free_newinfo; - duprintf("arp_tables: Translated table\n"); - ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, tmp.num_counters, tmp.counters); if (ret) @@ -1208,20 +1082,14 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e, unsigned int entry_offset; int ret, off; - duprintf("check_compat_entry_size_and_hooks %p\n", e); if ((unsigned long)e % __alignof__(struct compat_arpt_entry) != 0 || (unsigned char *)e + sizeof(struct compat_arpt_entry) >= limit || - (unsigned char *)e + e->next_offset > limit) { - duprintf("Bad offset %p, limit = %p\n", e, limit); + (unsigned char *)e + e->next_offset > limit) return -EINVAL; - } if (e->next_offset < sizeof(struct compat_arpt_entry) + - sizeof(struct compat_xt_entry_target)) { - duprintf("checking: element %p size %u\n", - e, e->next_offset); + sizeof(struct compat_xt_entry_target)) return -EINVAL; - } if (!arp_checkentry(&e->arp)) return -EINVAL; @@ -1238,8 +1106,6 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e, target = xt_request_find_target(NFPROTO_ARP, t->u.user.name, t->u.user.revision); if (IS_ERR(target)) { - duprintf("check_compat_entry_size_and_hooks: `%s' not found\n", - t->u.user.name); ret = PTR_ERR(target); goto out; } @@ -1309,7 +1175,6 @@ static int translate_compat_table(struct xt_table_info **pinfo, size = compatr->size; info->number = compatr->num_entries; - duprintf("translate_compat_table: size %u\n", info->size); j = 0; xt_compat_lock(NFPROTO_ARP); xt_compat_init_offsets(NFPROTO_ARP, compatr->num_entries); @@ -1324,11 +1189,8 @@ static int translate_compat_table(struct xt_table_info **pinfo, } ret = -EINVAL; - if (j != compatr->num_entries) { - duprintf("translate_compat_table: %u not %u entries\n", - j, compatr->num_entries); + if (j != compatr->num_entries) goto out_unlock; - } ret = -ENOMEM; newinfo = xt_alloc_table_info(size); @@ -1398,8 +1260,6 @@ static int compat_do_replace(struct net *net, void __user *user, return -EFAULT; /* overflow check */ - if (tmp.size >= INT_MAX / num_possible_cpus()) - return -ENOMEM; if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; if (tmp.num_counters == 0) @@ -1421,8 +1281,6 @@ static int compat_do_replace(struct net *net, void __user *user, if (ret != 0) goto free_newinfo; - duprintf("compat_do_replace: Translated table\n"); - ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, tmp.num_counters, compat_ptr(tmp.counters)); if (ret) @@ -1455,7 +1313,6 @@ static int compat_do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, break; default: - duprintf("do_arpt_set_ctl: unknown request %i\n", cmd); ret = -EINVAL; } @@ -1538,17 +1395,13 @@ static int compat_get_entries(struct net *net, struct compat_arpt_get_entries get; struct xt_table *t; - if (*len < sizeof(get)) { - duprintf("compat_get_entries: %u < %zu\n", *len, sizeof(get)); + if (*len < sizeof(get)) return -EINVAL; - } if (copy_from_user(&get, uptr, sizeof(get)) != 0) return -EFAULT; - if (*len != sizeof(struct compat_arpt_get_entries) + get.size) { - duprintf("compat_get_entries: %u != %zu\n", - *len, sizeof(get) + get.size); + if (*len != sizeof(struct compat_arpt_get_entries) + get.size) return -EINVAL; - } + get.name[sizeof(get.name) - 1] = '\0'; xt_compat_lock(NFPROTO_ARP); @@ -1557,16 +1410,13 @@ static int compat_get_entries(struct net *net, const struct xt_table_info *private = t->private; struct xt_table_info info; - duprintf("t->private->number = %u\n", private->number); ret = compat_table_info(private, &info); if (!ret && get.size == info.size) { ret = compat_copy_entries_to_user(private->size, t, uptr->entrytable); - } else if (!ret) { - duprintf("compat_get_entries: I've got %u not %u!\n", - private->size, get.size); + } else if (!ret) ret = -EAGAIN; - } + xt_compat_flush_offsets(NFPROTO_ARP); module_put(t->me); xt_table_unlock(t); @@ -1618,7 +1468,6 @@ static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned break; default: - duprintf("do_arpt_set_ctl: unknown request %i\n", cmd); ret = -EINVAL; } @@ -1661,7 +1510,6 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len } default: - duprintf("do_arpt_get_ctl: unknown request %i\n", cmd); ret = -EINVAL; } @@ -1706,7 +1554,6 @@ int arpt_register_table(struct net *net, memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(newinfo, loc_cpu_entry, repl); - duprintf("arpt_register_table: translate table gives %d\n", ret); if (ret != 0) goto out_free; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 0984ea3fc..54906e0e8 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -35,34 +35,12 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("IPv4 packet filter"); -/*#define DEBUG_IP_FIREWALL*/ -/*#define DEBUG_ALLOW_ALL*/ /* Useful for remote debugging */ -/*#define DEBUG_IP_FIREWALL_USER*/ - -#ifdef DEBUG_IP_FIREWALL -#define dprintf(format, args...) pr_info(format , ## args) -#else -#define dprintf(format, args...) -#endif - -#ifdef DEBUG_IP_FIREWALL_USER -#define duprintf(format, args...) pr_info(format , ## args) -#else -#define duprintf(format, args...) -#endif - #ifdef CONFIG_NETFILTER_DEBUG #define IP_NF_ASSERT(x) WARN_ON(!(x)) #else #define IP_NF_ASSERT(x) #endif -#if 0 -/* All the better to debug you with... */ -#define static -#define inline -#endif - void *ipt_alloc_initial_table(const struct xt_table *info) { return xt_alloc_initial_table(ipt, IPT); @@ -85,52 +63,28 @@ ip_packet_match(const struct iphdr *ip, if (FWINV((ip->saddr&ipinfo->smsk.s_addr) != ipinfo->src.s_addr, IPT_INV_SRCIP) || FWINV((ip->daddr&ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr, - IPT_INV_DSTIP)) { - dprintf("Source or dest mismatch.\n"); - - dprintf("SRC: %pI4. Mask: %pI4. Target: %pI4.%s\n", - &ip->saddr, &ipinfo->smsk.s_addr, &ipinfo->src.s_addr, - ipinfo->invflags & IPT_INV_SRCIP ? " (INV)" : ""); - dprintf("DST: %pI4 Mask: %pI4 Target: %pI4.%s\n", - &ip->daddr, &ipinfo->dmsk.s_addr, &ipinfo->dst.s_addr, - ipinfo->invflags & IPT_INV_DSTIP ? " (INV)" : ""); + IPT_INV_DSTIP)) return false; - } ret = ifname_compare_aligned(indev, ipinfo->iniface, ipinfo->iniface_mask); - if (FWINV(ret != 0, IPT_INV_VIA_IN)) { - dprintf("VIA in mismatch (%s vs %s).%s\n", - indev, ipinfo->iniface, - ipinfo->invflags & IPT_INV_VIA_IN ? " (INV)" : ""); + if (FWINV(ret != 0, IPT_INV_VIA_IN)) return false; - } ret = ifname_compare_aligned(outdev, ipinfo->outiface, ipinfo->outiface_mask); - if (FWINV(ret != 0, IPT_INV_VIA_OUT)) { - dprintf("VIA out mismatch (%s vs %s).%s\n", - outdev, ipinfo->outiface, - ipinfo->invflags & IPT_INV_VIA_OUT ? " (INV)" : ""); + if (FWINV(ret != 0, IPT_INV_VIA_OUT)) return false; - } /* Check specific protocol */ if (ipinfo->proto && - FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) { - dprintf("Packet protocol %hi does not match %hi.%s\n", - ip->protocol, ipinfo->proto, - ipinfo->invflags & IPT_INV_PROTO ? " (INV)" : ""); + FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) return false; - } /* If we have a fragment rule but the packet is not a fragment * then we return zero */ - if (FWINV((ipinfo->flags&IPT_F_FRAG) && !isfrag, IPT_INV_FRAG)) { - dprintf("Fragment rule but not fragment.%s\n", - ipinfo->invflags & IPT_INV_FRAG ? " (INV)" : ""); + if (FWINV((ipinfo->flags&IPT_F_FRAG) && !isfrag, IPT_INV_FRAG)) return false; - } return true; } @@ -138,16 +92,10 @@ ip_packet_match(const struct iphdr *ip, static bool ip_checkentry(const struct ipt_ip *ip) { - if (ip->flags & ~IPT_F_MASK) { - duprintf("Unknown flag bits set: %08X\n", - ip->flags & ~IPT_F_MASK); + if (ip->flags & ~IPT_F_MASK) return false; - } - if (ip->invflags & ~IPT_INV_MASK) { - duprintf("Unknown invflag bits set: %08X\n", - ip->invflags & ~IPT_INV_MASK); + if (ip->invflags & ~IPT_INV_MASK) return false; - } return true; } @@ -346,10 +294,6 @@ ipt_do_table(struct sk_buff *skb, e = get_entry(table_base, private->hook_entry[hook]); - pr_debug("Entering %s(hook %u), UF %p\n", - table->name, hook, - get_entry(table_base, private->underflow[hook])); - do { const struct xt_entry_target *t; const struct xt_entry_match *ematch; @@ -396,22 +340,15 @@ ipt_do_table(struct sk_buff *skb, if (stackidx == 0) { e = get_entry(table_base, private->underflow[hook]); - pr_debug("Underflow (this is normal) " - "to %p\n", e); } else { e = jumpstack[--stackidx]; - pr_debug("Pulled %p out from pos %u\n", - e, stackidx); e = ipt_next_entry(e); } continue; } if (table_base + v != ipt_next_entry(e) && - !(e->ip.flags & IPT_F_GOTO)) { + !(e->ip.flags & IPT_F_GOTO)) jumpstack[stackidx++] = e; - pr_debug("Pushed %p into pos %u\n", - e, stackidx - 1); - } e = get_entry(table_base, v); continue; @@ -429,18 +366,13 @@ ipt_do_table(struct sk_buff *skb, /* Verdict */ break; } while (!acpar.hotdrop); - pr_debug("Exiting %s; sp at %u\n", __func__, stackidx); xt_write_recseq_end(addend); local_bh_enable(); -#ifdef DEBUG_ALLOW_ALL - return NF_ACCEPT; -#else if (acpar.hotdrop) return NF_DROP; else return verdict; -#endif } static bool find_jump_target(const struct xt_table_info *t, @@ -480,11 +412,9 @@ mark_source_chains(const struct xt_table_info *newinfo, = (void *)ipt_get_target_c(e); int visited = e->comefrom & (1 << hook); - if (e->comefrom & (1 << NF_INET_NUMHOOKS)) { - pr_err("iptables: loop hook %u pos %u %08X.\n", - hook, pos, e->comefrom); + if (e->comefrom & (1 << NF_INET_NUMHOOKS)) return 0; - } + e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS)); /* Unconditional return/END. */ @@ -496,26 +426,13 @@ mark_source_chains(const struct xt_table_info *newinfo, if ((strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0) && - t->verdict < -NF_MAX_VERDICT - 1) { - duprintf("mark_source_chains: bad " - "negative verdict (%i)\n", - t->verdict); + t->verdict < -NF_MAX_VERDICT - 1) return 0; - } /* Return: backtrack through the last big jump. */ do { e->comefrom ^= (1<<NF_INET_NUMHOOKS); -#ifdef DEBUG_IP_FIREWALL_USER - if (e->comefrom - & (1 << NF_INET_NUMHOOKS)) { - duprintf("Back unset " - "on hook %u " - "rule %u\n", - hook, pos); - } -#endif oldpos = pos; pos = e->counters.pcnt; e->counters.pcnt = 0; @@ -542,16 +459,7 @@ mark_source_chains(const struct xt_table_info *newinfo, if (strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0 && newpos >= 0) { - if (newpos > newinfo->size - - sizeof(struct ipt_entry)) { - duprintf("mark_source_chains: " - "bad verdict (%i)\n", - newpos); - return 0; - } /* This a jump; chase it. */ - duprintf("Jump rule %u -> %u\n", - pos, newpos); e = (struct ipt_entry *) (entry0 + newpos); if (!find_jump_target(newinfo, e)) @@ -568,8 +476,7 @@ mark_source_chains(const struct xt_table_info *newinfo, pos = newpos; } } -next: - duprintf("Finished chain %u\n", hook); +next: ; } return 1; } @@ -591,18 +498,12 @@ static int check_match(struct xt_entry_match *m, struct xt_mtchk_param *par) { const struct ipt_ip *ip = par->entryinfo; - int ret; par->match = m->u.kernel.match; par->matchinfo = m->data; - ret = xt_check_match(par, m->u.match_size - sizeof(*m), - ip->proto, ip->invflags & IPT_INV_PROTO); - if (ret < 0) { - duprintf("check failed for `%s'.\n", par->match->name); - return ret; - } - return 0; + return xt_check_match(par, m->u.match_size - sizeof(*m), + ip->proto, ip->invflags & IPT_INV_PROTO); } static int @@ -613,10 +514,8 @@ find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par) match = xt_request_find_match(NFPROTO_IPV4, m->u.user.name, m->u.user.revision); - if (IS_ERR(match)) { - duprintf("find_check_match: `%s' not found\n", m->u.user.name); + if (IS_ERR(match)) return PTR_ERR(match); - } m->u.kernel.match = match; ret = check_match(m, par); @@ -641,16 +540,9 @@ static int check_target(struct ipt_entry *e, struct net *net, const char *name) .hook_mask = e->comefrom, .family = NFPROTO_IPV4, }; - int ret; - ret = xt_check_target(&par, t->u.target_size - sizeof(*t), - e->ip.proto, e->ip.invflags & IPT_INV_PROTO); - if (ret < 0) { - duprintf("check failed for `%s'.\n", - t->u.kernel.target->name); - return ret; - } - return 0; + return xt_check_target(&par, t->u.target_size - sizeof(*t), + e->ip.proto, e->ip.invflags & IPT_INV_PROTO); } static int @@ -663,10 +555,12 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name, unsigned int j; struct xt_mtchk_param mtpar; struct xt_entry_match *ematch; + unsigned long pcnt; - e->counters.pcnt = xt_percpu_counter_alloc(); - if (IS_ERR_VALUE(e->counters.pcnt)) + pcnt = xt_percpu_counter_alloc(); + if (IS_ERR_VALUE(pcnt)) return -ENOMEM; + e->counters.pcnt = pcnt; j = 0; mtpar.net = net; @@ -685,7 +579,6 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name, target = xt_request_find_target(NFPROTO_IPV4, t->u.user.name, t->u.user.revision); if (IS_ERR(target)) { - duprintf("find_check_entry: `%s' not found\n", t->u.user.name); ret = PTR_ERR(target); goto cleanup_matches; } @@ -739,17 +632,12 @@ check_entry_size_and_hooks(struct ipt_entry *e, if ((unsigned long)e % __alignof__(struct ipt_entry) != 0 || (unsigned char *)e + sizeof(struct ipt_entry) >= limit || - (unsigned char *)e + e->next_offset > limit) { - duprintf("Bad offset %p\n", e); + (unsigned char *)e + e->next_offset > limit) return -EINVAL; - } if (e->next_offset - < sizeof(struct ipt_entry) + sizeof(struct xt_entry_target)) { - duprintf("checking: element %p size %u\n", - e, e->next_offset); + < sizeof(struct ipt_entry) + sizeof(struct xt_entry_target)) return -EINVAL; - } if (!ip_checkentry(&e->ip)) return -EINVAL; @@ -766,12 +654,9 @@ check_entry_size_and_hooks(struct ipt_entry *e, if ((unsigned char *)e - base == hook_entries[h]) newinfo->hook_entry[h] = hook_entries[h]; if ((unsigned char *)e - base == underflows[h]) { - if (!check_underflow(e)) { - pr_debug("Underflows must be unconditional and " - "use the STANDARD target with " - "ACCEPT/DROP\n"); + if (!check_underflow(e)) return -EINVAL; - } + newinfo->underflow[h] = underflows[h]; } } @@ -823,7 +708,6 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, newinfo->underflow[i] = 0xFFFFFFFF; } - duprintf("translate_table: size %u\n", newinfo->size); i = 0; /* Walk through entries, checking offsets. */ xt_entry_foreach(iter, entry0, newinfo->size) { @@ -840,27 +724,18 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, ++newinfo->stacksize; } - if (i != repl->num_entries) { - duprintf("translate_table: %u not %u entries\n", - i, repl->num_entries); + if (i != repl->num_entries) return -EINVAL; - } /* Check hooks all assigned */ for (i = 0; i < NF_INET_NUMHOOKS; i++) { /* Only hooks which are valid */ if (!(repl->valid_hooks & (1 << i))) continue; - if (newinfo->hook_entry[i] == 0xFFFFFFFF) { - duprintf("Invalid hook entry %u %u\n", - i, repl->hook_entry[i]); + if (newinfo->hook_entry[i] == 0xFFFFFFFF) return -EINVAL; - } - if (newinfo->underflow[i] == 0xFFFFFFFF) { - duprintf("Invalid underflow %u %u\n", - i, repl->underflow[i]); + if (newinfo->underflow[i] == 0xFFFFFFFF) return -EINVAL; - } } if (!mark_source_chains(newinfo, repl->valid_hooks, entry0)) @@ -1088,11 +963,8 @@ static int get_info(struct net *net, void __user *user, struct xt_table *t; int ret; - if (*len != sizeof(struct ipt_getinfo)) { - duprintf("length %u != %zu\n", *len, - sizeof(struct ipt_getinfo)); + if (*len != sizeof(struct ipt_getinfo)) return -EINVAL; - } if (copy_from_user(name, user, sizeof(name)) != 0) return -EFAULT; @@ -1150,31 +1022,23 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr, struct ipt_get_entries get; struct xt_table *t; - if (*len < sizeof(get)) { - duprintf("get_entries: %u < %zu\n", *len, sizeof(get)); + if (*len < sizeof(get)) return -EINVAL; - } if (copy_from_user(&get, uptr, sizeof(get)) != 0) return -EFAULT; - if (*len != sizeof(struct ipt_get_entries) + get.size) { - duprintf("get_entries: %u != %zu\n", - *len, sizeof(get) + get.size); + if (*len != sizeof(struct ipt_get_entries) + get.size) return -EINVAL; - } get.name[sizeof(get.name) - 1] = '\0'; t = xt_find_table_lock(net, AF_INET, get.name); if (!IS_ERR_OR_NULL(t)) { const struct xt_table_info *private = t->private; - duprintf("t->private->number = %u\n", private->number); if (get.size == private->size) ret = copy_entries_to_user(private->size, t, uptr->entrytable); - else { - duprintf("get_entries: I've got %u not %u!\n", - private->size, get.size); + else ret = -EAGAIN; - } + module_put(t->me); xt_table_unlock(t); } else @@ -1210,8 +1074,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, /* You lied! */ if (valid_hooks != t->valid_hooks) { - duprintf("Valid hook crap: %08X vs %08X\n", - valid_hooks, t->valid_hooks); ret = -EINVAL; goto put_module; } @@ -1221,8 +1083,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, goto put_module; /* Update module usage count based on number of rules */ - duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n", - oldinfo->number, oldinfo->initial_entries, newinfo->number); if ((oldinfo->number > oldinfo->initial_entries) || (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); @@ -1291,8 +1151,6 @@ do_replace(struct net *net, const void __user *user, unsigned int len) if (ret != 0) goto free_newinfo; - duprintf("Translated table\n"); - ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, tmp.num_counters, tmp.counters); if (ret) @@ -1418,11 +1276,9 @@ compat_find_calc_match(struct xt_entry_match *m, match = xt_request_find_match(NFPROTO_IPV4, m->u.user.name, m->u.user.revision); - if (IS_ERR(match)) { - duprintf("compat_check_calc_match: `%s' not found\n", - m->u.user.name); + if (IS_ERR(match)) return PTR_ERR(match); - } + m->u.kernel.match = match; *size += xt_compat_match_offset(match); return 0; @@ -1454,20 +1310,14 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e, unsigned int j; int ret, off; - duprintf("check_compat_entry_size_and_hooks %p\n", e); if ((unsigned long)e % __alignof__(struct compat_ipt_entry) != 0 || (unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit || - (unsigned char *)e + e->next_offset > limit) { - duprintf("Bad offset %p, limit = %p\n", e, limit); + (unsigned char *)e + e->next_offset > limit) return -EINVAL; - } if (e->next_offset < sizeof(struct compat_ipt_entry) + - sizeof(struct compat_xt_entry_target)) { - duprintf("checking: element %p size %u\n", - e, e->next_offset); + sizeof(struct compat_xt_entry_target)) return -EINVAL; - } if (!ip_checkentry(&e->ip)) return -EINVAL; @@ -1491,8 +1341,6 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e, target = xt_request_find_target(NFPROTO_IPV4, t->u.user.name, t->u.user.revision); if (IS_ERR(target)) { - duprintf("check_compat_entry_size_and_hooks: `%s' not found\n", - t->u.user.name); ret = PTR_ERR(target); goto release_matches; } @@ -1574,7 +1422,6 @@ translate_compat_table(struct net *net, size = compatr->size; info->number = compatr->num_entries; - duprintf("translate_compat_table: size %u\n", info->size); j = 0; xt_compat_lock(AF_INET); xt_compat_init_offsets(AF_INET, compatr->num_entries); @@ -1589,11 +1436,8 @@ translate_compat_table(struct net *net, } ret = -EINVAL; - if (j != compatr->num_entries) { - duprintf("translate_compat_table: %u not %u entries\n", - j, compatr->num_entries); + if (j != compatr->num_entries) goto out_unlock; - } ret = -ENOMEM; newinfo = xt_alloc_table_info(size); @@ -1668,8 +1512,6 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) return -EFAULT; /* overflow check */ - if (tmp.size >= INT_MAX / num_possible_cpus()) - return -ENOMEM; if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; if (tmp.num_counters == 0) @@ -1692,8 +1534,6 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) if (ret != 0) goto free_newinfo; - duprintf("compat_do_replace: Translated table\n"); - ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, tmp.num_counters, compat_ptr(tmp.counters)); if (ret) @@ -1727,7 +1567,6 @@ compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, break; default: - duprintf("do_ipt_set_ctl: unknown request %i\n", cmd); ret = -EINVAL; } @@ -1777,19 +1616,15 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr, struct compat_ipt_get_entries get; struct xt_table *t; - if (*len < sizeof(get)) { - duprintf("compat_get_entries: %u < %zu\n", *len, sizeof(get)); + if (*len < sizeof(get)) return -EINVAL; - } if (copy_from_user(&get, uptr, sizeof(get)) != 0) return -EFAULT; - if (*len != sizeof(struct compat_ipt_get_entries) + get.size) { - duprintf("compat_get_entries: %u != %zu\n", - *len, sizeof(get) + get.size); + if (*len != sizeof(struct compat_ipt_get_entries) + get.size) return -EINVAL; - } + get.name[sizeof(get.name) - 1] = '\0'; xt_compat_lock(AF_INET); @@ -1797,16 +1632,13 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr, if (!IS_ERR_OR_NULL(t)) { const struct xt_table_info *private = t->private; struct xt_table_info info; - duprintf("t->private->number = %u\n", private->number); ret = compat_table_info(private, &info); - if (!ret && get.size == info.size) { + if (!ret && get.size == info.size) ret = compat_copy_entries_to_user(private->size, t, uptr->entrytable); - } else if (!ret) { - duprintf("compat_get_entries: I've got %u not %u!\n", - private->size, get.size); + else if (!ret) ret = -EAGAIN; - } + xt_compat_flush_offsets(AF_INET); module_put(t->me); xt_table_unlock(t); @@ -1859,7 +1691,6 @@ do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) break; default: - duprintf("do_ipt_set_ctl: unknown request %i\n", cmd); ret = -EINVAL; } @@ -1911,7 +1742,6 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) } default: - duprintf("do_ipt_get_ctl: unknown request %i\n", cmd); ret = -EINVAL; } @@ -2013,7 +1843,6 @@ icmp_match(const struct sk_buff *skb, struct xt_action_param *par) /* We've been asked to examine this packet, and we * can't. Hence, no choice but to drop. */ - duprintf("Dropping evil ICMP tinygram.\n"); par->hotdrop = true; return false; } diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index e3c46e8e2..ae1a71a97 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -360,7 +360,7 @@ static int ipv4_init_net(struct net *net) in->ctl_table[0].data = &nf_conntrack_max; in->ctl_table[1].data = &net->ct.count; - in->ctl_table[2].data = &net->ct.htable_size; + in->ctl_table[2].data = &nf_conntrack_htable_size; in->ctl_table[3].data = &net->ct.sysctl_checksum; in->ctl_table[4].data = &net->ct.sysctl_log_invalid; #endif diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index f0dfe92a0..c6f3c406f 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -31,15 +31,14 @@ struct ct_iter_state { static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) { - struct net *net = seq_file_net(seq); struct ct_iter_state *st = seq->private; struct hlist_nulls_node *n; for (st->bucket = 0; - st->bucket < net->ct.htable_size; + st->bucket < nf_conntrack_htable_size; st->bucket++) { n = rcu_dereference( - hlist_nulls_first_rcu(&net->ct.hash[st->bucket])); + hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket])); if (!is_a_nulls(n)) return n; } @@ -49,17 +48,16 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) static struct hlist_nulls_node *ct_get_next(struct seq_file *seq, struct hlist_nulls_node *head) { - struct net *net = seq_file_net(seq); struct ct_iter_state *st = seq->private; head = rcu_dereference(hlist_nulls_next_rcu(head)); while (is_a_nulls(head)) { if (likely(get_nulls_value(head) == st->bucket)) { - if (++st->bucket >= net->ct.htable_size) + if (++st->bucket >= nf_conntrack_htable_size) return NULL; } head = rcu_dereference( - hlist_nulls_first_rcu(&net->ct.hash[st->bucket])); + hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket])); } return head; } @@ -114,6 +112,23 @@ static inline void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) } #endif +static bool ct_seq_should_skip(const struct nf_conn *ct, + const struct net *net, + const struct nf_conntrack_tuple_hash *hash) +{ + /* we only want to print DIR_ORIGINAL */ + if (NF_CT_DIRECTION(hash)) + return true; + + if (nf_ct_l3num(ct) != AF_INET) + return true; + + if (!net_eq(nf_ct_net(ct), net)) + return true; + + return false; +} + static int ct_seq_show(struct seq_file *s, void *v) { struct nf_conntrack_tuple_hash *hash = v; @@ -123,14 +138,15 @@ static int ct_seq_show(struct seq_file *s, void *v) int ret = 0; NF_CT_ASSERT(ct); - if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use))) + if (ct_seq_should_skip(ct, seq_file_net(s), hash)) return 0; + if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use))) + return 0; - /* we only want to print DIR_ORIGINAL */ - if (NF_CT_DIRECTION(hash)) - goto release; - if (nf_ct_l3num(ct) != AF_INET) + /* check if we raced w. object reuse */ + if (!nf_ct_is_confirmed(ct) || + ct_seq_should_skip(ct, seq_file_net(s), hash)) goto release; l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct)); @@ -220,13 +236,12 @@ struct ct_expect_iter_state { static struct hlist_node *ct_expect_get_first(struct seq_file *seq) { - struct net *net = seq_file_net(seq); struct ct_expect_iter_state *st = seq->private; struct hlist_node *n; for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { n = rcu_dereference( - hlist_first_rcu(&net->ct.expect_hash[st->bucket])); + hlist_first_rcu(&nf_ct_expect_hash[st->bucket])); if (n) return n; } @@ -236,7 +251,6 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq) static struct hlist_node *ct_expect_get_next(struct seq_file *seq, struct hlist_node *head) { - struct net *net = seq_file_net(seq); struct ct_expect_iter_state *st = seq->private; head = rcu_dereference(hlist_next_rcu(head)); @@ -244,7 +258,7 @@ static struct hlist_node *ct_expect_get_next(struct seq_file *seq, if (++st->bucket >= nf_ct_expect_hsize) return NULL; head = rcu_dereference( - hlist_first_rcu(&net->ct.expect_hash[st->bucket])); + hlist_first_rcu(&nf_ct_expect_hash[st->bucket])); } return head; } @@ -285,6 +299,9 @@ static int exp_seq_show(struct seq_file *s, void *v) exp = hlist_entry(n, struct nf_conntrack_expect, hnode); + if (!net_eq(nf_ct_net(exp->master), seq_file_net(s))) + return 0; + if (exp->tuple.src.l3num != AF_INET) return 0; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index cf9700b1a..66ddcb605 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -737,6 +737,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) /* no remote port */ } + ipc.sockc.tsflags = sk->sk_tsflags; ipc.addr = inet->inet_saddr; ipc.opt = NULL; ipc.oif = sk->sk_bound_dev_if; @@ -744,10 +745,8 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipc.ttl = 0; ipc.tos = -1; - sock_tx_timestamp(sk, &ipc.tx_flags); - if (msg->msg_controllen) { - err = ip_cmsg_send(sock_net(sk), msg, &ipc, false); + err = ip_cmsg_send(sk, msg, &ipc, false); if (unlikely(err)) { kfree(ipc.opt); return err; @@ -768,6 +767,8 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) rcu_read_unlock(); } + sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags); + saddr = ipc.addr; ipc.addr = faddr = daddr; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 8d22de740..438f50c1a 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -339,8 +339,8 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb) static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, struct msghdr *msg, size_t length, - struct rtable **rtp, - unsigned int flags) + struct rtable **rtp, unsigned int flags, + const struct sockcm_cookie *sockc) { struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); @@ -379,7 +379,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, skb->ip_summed = CHECKSUM_NONE; - sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags); + sock_tx_timestamp(sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags); skb->transport_header = skb->network_header; err = -EFAULT; @@ -540,6 +540,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) daddr = inet->inet_daddr; } + ipc.sockc.tsflags = sk->sk_tsflags; ipc.addr = inet->inet_saddr; ipc.opt = NULL; ipc.tx_flags = 0; @@ -548,7 +549,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipc.oif = sk->sk_bound_dev_if; if (msg->msg_controllen) { - err = ip_cmsg_send(net, msg, &ipc, false); + err = ip_cmsg_send(sk, msg, &ipc, false); if (unlikely(err)) { kfree(ipc.opt); goto out; @@ -638,10 +639,10 @@ back_from_confirm: if (inet->hdrincl) err = raw_send_hdrinc(sk, &fl4, msg, len, - &rt, msg->msg_flags); + &rt, msg->msg_flags, &ipc.sockc); else { - sock_tx_timestamp(sk, &ipc.tx_flags); + sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags); if (!ipc.addr) ipc.addr = fl4.daddr; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 60398a937..a1f2830d8 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -915,11 +915,11 @@ static int ip_error(struct sk_buff *skb) if (!IN_DEV_FORWARD(in_dev)) { switch (rt->dst.error) { case EHOSTUNREACH: - IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS); + __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS); break; case ENETUNREACH: - IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES); + __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES); break; } goto out; @@ -934,7 +934,7 @@ static int ip_error(struct sk_buff *skb) break; case ENETUNREACH: code = ICMP_NET_UNREACH; - IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES); + __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES); break; case EACCES: code = ICMP_PKT_FILTERED; @@ -2146,6 +2146,7 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, unsigned int flags = 0; struct fib_result res; struct rtable *rth; + int master_idx; int orig_oif; int err = -ENETUNREACH; @@ -2155,6 +2156,9 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, orig_oif = fl4->flowi4_oif; + master_idx = l3mdev_master_ifindex_by_index(net, fl4->flowi4_oif); + if (master_idx) + fl4->flowi4_oif = master_idx; fl4->flowi4_iif = LOOPBACK_IFINDEX; fl4->flowi4_tos = tos & IPTOS_RT_MASK; fl4->flowi4_scope = ((tos & RTO_ONLINK) ? diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 4c04f0933..e3c4043c2 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -312,11 +312,11 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) mss = __cookie_v4_check(ip_hdr(skb), th, cookie); if (mss == 0) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED); goto out; } - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESRECV); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_SYNCOOKIESRECV); /* check for timestamp cookie support */ memset(&tcp_opt, 0, sizeof(tcp_opt)); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 03112a310..1cb67de10 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -960,6 +960,17 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#ifdef CONFIG_IP_ROUTE_MULTIPATH + { + .procname = "fib_multipath_use_neigh", + .data = &init_net.ipv4.sysctl_fib_multipath_use_neigh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, +#endif { } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 29bb8a5b9..99fbc1479 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -429,13 +429,16 @@ void tcp_init_sock(struct sock *sk) } EXPORT_SYMBOL(tcp_init_sock); -static void tcp_tx_timestamp(struct sock *sk, struct sk_buff *skb) +static void tcp_tx_timestamp(struct sock *sk, u16 tsflags, struct sk_buff *skb) { - if (sk->sk_tsflags) { + if (tsflags) { struct skb_shared_info *shinfo = skb_shinfo(skb); + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); - sock_tx_timestamp(sk, &shinfo->tx_flags); - if (shinfo->tx_flags & SKBTX_ANY_TSTAMP) + sock_tx_timestamp(sk, tsflags, &shinfo->tx_flags); + if (tsflags & SOF_TIMESTAMPING_TX_ACK) + tcb->txstamp_ack = 1; + if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK) shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1; } } @@ -907,7 +910,8 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, int copy, i; bool can_coalesce; - if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) { + if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0 || + !tcp_skb_can_collapse_to(skb)) { new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; @@ -958,7 +962,7 @@ new_segment: offset += copy; size -= copy; if (!size) { - tcp_tx_timestamp(sk, skb); + tcp_tx_timestamp(sk, sk->sk_tsflags, skb); goto out; } @@ -1078,8 +1082,10 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; + struct sockcm_cookie sockc; int flags, err, copied = 0; int mss_now = 0, size_goal, copied_syn = 0; + bool process_backlog = false; bool sg; long timeo; @@ -1120,14 +1126,24 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) /* 'common' sending to sendq */ } + sockc.tsflags = sk->sk_tsflags; + if (msg->msg_controllen) { + err = sock_cmsg_send(sk, msg, &sockc); + if (unlikely(err)) { + err = -EINVAL; + goto out_err; + } + } + /* This should be in poll */ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); - mss_now = tcp_send_mss(sk, &size_goal, flags); - /* Ok commence sending. */ copied = 0; +restart: + mss_now = tcp_send_mss(sk, &size_goal, flags); + err = -EPIPE; if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) goto out_err; @@ -1145,7 +1161,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) copy = max - skb->len; } - if (copy <= 0) { + if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) { new_segment: /* Allocate new segment. If the interface is SG, * allocate skb fitting to single page. @@ -1153,6 +1169,10 @@ new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; + if (process_backlog && sk_flush_backlog(sk)) { + process_backlog = false; + goto restart; + } skb = sk_stream_alloc_skb(sk, select_size(sk, sg), sk->sk_allocation, @@ -1160,6 +1180,7 @@ new_segment: if (!skb) goto wait_for_memory; + process_backlog = true; /* * Check whether we can use HW checksum. */ @@ -1238,7 +1259,9 @@ new_segment: copied += copy; if (!msg_data_left(msg)) { - tcp_tx_timestamp(sk, skb); + tcp_tx_timestamp(sk, sockc.tsflags, skb); + if (unlikely(flags & MSG_EOR)) + TCP_SKB_CB(skb)->eor = 1; goto out; } @@ -1432,14 +1455,10 @@ static void tcp_prequeue_process(struct sock *sk) struct sk_buff *skb; struct tcp_sock *tp = tcp_sk(sk); - NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPPREQUEUED); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUED); - /* RX process wants to run with disabled BHs, though it is not - * necessary */ - local_bh_disable(); while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) sk_backlog_rcv(sk, skb); - local_bh_enable(); /* Clear memory counter. */ tp->ucopy.memory = 0; @@ -1766,7 +1785,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, chunk = len - tp->ucopy.len; if (chunk != 0) { - NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk); + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk); len -= chunk; copied += chunk; } @@ -1778,7 +1797,7 @@ do_prequeue: chunk = len - tp->ucopy.len; if (chunk != 0) { - NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); len -= chunk; copied += chunk; } @@ -1864,7 +1883,7 @@ skip_copy: tcp_prequeue_process(sk); if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) { - NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); len -= chunk; copied += chunk; } @@ -2054,13 +2073,13 @@ void tcp_close(struct sock *sk, long timeout) sk->sk_prot->disconnect(sk, 0); } else if (data_was_unread) { /* Unread data was tossed, zap the connection. */ - NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, sk->sk_allocation); } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { /* Check zero linger _after_ checking for unread data. */ sk->sk_prot->disconnect(sk, 0); - NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONDATA); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); } else if (tcp_close_state(sk)) { /* We FIN if the application ate all the data before * zapping the connection. @@ -2137,7 +2156,7 @@ adjudge_to_death: if (tp->linger2 < 0) { tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_ATOMIC); - NET_INC_STATS_BH(sock_net(sk), + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONLINGER); } else { const int tmo = tcp_fin_time(sk); @@ -2156,7 +2175,7 @@ adjudge_to_death: if (tcp_check_oom(sk, 0)) { tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_ATOMIC); - NET_INC_STATS_BH(sock_net(sk), + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); } } @@ -3195,7 +3214,7 @@ void tcp_done(struct sock *sk) struct request_sock *req = tcp_sk(sk)->fastopen_rsk; if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) - TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS); tcp_set_state(sk, TCP_CLOSE); tcp_clear_xmit_timers(sk); diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index fd1405d37..36087bca9 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -197,15 +197,15 @@ static void bictcp_state(struct sock *sk, u8 new_state) /* Track delayed acknowledgment ratio using sliding window * ratio = (15*ratio + sample) / 16 */ -static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt) +static void bictcp_acked(struct sock *sk, const struct ack_sample *sample) { const struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ca_state == TCP_CA_Open) { struct bictcp *ca = inet_csk_ca(sk); - cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; - ca->delayed_ack += cnt; + ca->delayed_ack += sample->pkts_acked - + (ca->delayed_ack >> ACK_RATIO_SHIFT); } } diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c index 167b6a3e1..03725b294 100644 --- a/net/ipv4/tcp_cdg.c +++ b/net/ipv4/tcp_cdg.c @@ -155,11 +155,11 @@ static void tcp_cdg_hystart_update(struct sock *sk) ca->last_ack = now_us; if (after(now_us, ca->round_start + base_owd)) { - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPHYSTARTTRAINDETECT); - NET_ADD_STATS_BH(sock_net(sk), - LINUX_MIB_TCPHYSTARTTRAINCWND, - tp->snd_cwnd); + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPHYSTARTTRAINDETECT); + NET_ADD_STATS(sock_net(sk), + LINUX_MIB_TCPHYSTARTTRAINCWND, + tp->snd_cwnd); tp->snd_ssthresh = tp->snd_cwnd; return; } @@ -174,11 +174,11 @@ static void tcp_cdg_hystart_update(struct sock *sk) 125U); if (ca->rtt.min > thresh) { - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPHYSTARTDELAYDETECT); - NET_ADD_STATS_BH(sock_net(sk), - LINUX_MIB_TCPHYSTARTDELAYCWND, - tp->snd_cwnd); + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPHYSTARTDELAYDETECT); + NET_ADD_STATS(sock_net(sk), + LINUX_MIB_TCPHYSTARTDELAYCWND, + tp->snd_cwnd); tp->snd_ssthresh = tp->snd_cwnd; } } @@ -294,12 +294,12 @@ static void tcp_cdg_cong_avoid(struct sock *sk, u32 ack, u32 acked) ca->shadow_wnd = max(ca->shadow_wnd, ca->shadow_wnd + incr); } -static void tcp_cdg_acked(struct sock *sk, u32 num_acked, s32 rtt_us) +static void tcp_cdg_acked(struct sock *sk, const struct ack_sample *sample) { struct cdg *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); - if (rtt_us <= 0) + if (sample->rtt_us <= 0) return; /* A heuristic for filtering delayed ACKs, adapted from: @@ -307,20 +307,20 @@ static void tcp_cdg_acked(struct sock *sk, u32 num_acked, s32 rtt_us) * delay and rate based TCP mechanisms." TR 100219A. CAIA, 2010. */ if (tp->sacked_out == 0) { - if (num_acked == 1 && ca->delack) { + if (sample->pkts_acked == 1 && ca->delack) { /* A delayed ACK is only used for the minimum if it is * provenly lower than an existing non-zero minimum. */ - ca->rtt.min = min(ca->rtt.min, rtt_us); + ca->rtt.min = min(ca->rtt.min, sample->rtt_us); ca->delack--; return; - } else if (num_acked > 1 && ca->delack < 5) { + } else if (sample->pkts_acked > 1 && ca->delack < 5) { ca->delack++; } } - ca->rtt.min = min_not_zero(ca->rtt.min, rtt_us); - ca->rtt.max = max(ca->rtt.max, rtt_us); + ca->rtt.min = min_not_zero(ca->rtt.min, sample->rtt_us); + ca->rtt.max = max(ca->rtt.max, sample->rtt_us); } static u32 tcp_cdg_ssthresh(struct sock *sk) diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 448c2615f..c99230efc 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -402,11 +402,11 @@ static void hystart_update(struct sock *sk, u32 delay) ca->last_ack = now; if ((s32)(now - ca->round_start) > ca->delay_min >> 4) { ca->found |= HYSTART_ACK_TRAIN; - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPHYSTARTTRAINDETECT); - NET_ADD_STATS_BH(sock_net(sk), - LINUX_MIB_TCPHYSTARTTRAINCWND, - tp->snd_cwnd); + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPHYSTARTTRAINDETECT); + NET_ADD_STATS(sock_net(sk), + LINUX_MIB_TCPHYSTARTTRAINCWND, + tp->snd_cwnd); tp->snd_ssthresh = tp->snd_cwnd; } } @@ -423,11 +423,11 @@ static void hystart_update(struct sock *sk, u32 delay) if (ca->curr_rtt > ca->delay_min + HYSTART_DELAY_THRESH(ca->delay_min >> 3)) { ca->found |= HYSTART_DELAY; - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPHYSTARTDELAYDETECT); - NET_ADD_STATS_BH(sock_net(sk), - LINUX_MIB_TCPHYSTARTDELAYCWND, - tp->snd_cwnd); + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPHYSTARTDELAYDETECT); + NET_ADD_STATS(sock_net(sk), + LINUX_MIB_TCPHYSTARTDELAYCWND, + tp->snd_cwnd); tp->snd_ssthresh = tp->snd_cwnd; } } @@ -437,21 +437,21 @@ static void hystart_update(struct sock *sk, u32 delay) /* Track delayed acknowledgment ratio using sliding window * ratio = (15*ratio + sample) / 16 */ -static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us) +static void bictcp_acked(struct sock *sk, const struct ack_sample *sample) { const struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); u32 delay; /* Some calls are for duplicates without timetamps */ - if (rtt_us < 0) + if (sample->rtt_us < 0) return; /* Discard delay samples right after fast recovery */ if (ca->epoch_start && (s32)(tcp_time_stamp - ca->epoch_start) < HZ) return; - delay = (rtt_us << 3) / USEC_PER_MSEC; + delay = (sample->rtt_us << 3) / USEC_PER_MSEC; if (delay == 0) delay = 1; diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index cffd8f9ed..54d9f9b01 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -255,9 +255,9 @@ static bool tcp_fastopen_queue_check(struct sock *sk) spin_lock(&fastopenq->lock); req1 = fastopenq->rskq_rst_head; if (!req1 || time_after(req1->rsk_timer.expires, jiffies)) { + __NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPFASTOPENLISTENOVERFLOW); spin_unlock(&fastopenq->lock); - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPFASTOPENLISTENOVERFLOW); return false; } fastopenq->rskq_rst_head = req1->dl_next; @@ -282,7 +282,7 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct sock *child; if (foc->len == 0) /* Client requests a cookie */ - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD); if (!((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) && (syn_data || foc->len >= 0) && @@ -311,13 +311,13 @@ fastopen: child = tcp_fastopen_create_child(sk, skb, dst, req); if (child) { foc->len = -1; - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPFASTOPENPASSIVE); + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPFASTOPENPASSIVE); return child; } - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL); } else if (foc->len > 0) /* Client presents an invalid cookie */ - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL); valid_foc.exp = foc->exp; *foc = valid_foc; diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 82f0d9ed6..4a4d8e767 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -99,7 +99,7 @@ static inline void measure_rtt(struct sock *sk, u32 srtt) } static void measure_achieved_throughput(struct sock *sk, - u32 pkts_acked, s32 rtt) + const struct ack_sample *sample) { const struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_sock *tp = tcp_sk(sk); @@ -107,10 +107,10 @@ static void measure_achieved_throughput(struct sock *sk, u32 now = tcp_time_stamp; if (icsk->icsk_ca_state == TCP_CA_Open) - ca->pkts_acked = pkts_acked; + ca->pkts_acked = sample->pkts_acked; - if (rtt > 0) - measure_rtt(sk, usecs_to_jiffies(rtt)); + if (sample->rtt_us > 0) + measure_rtt(sk, usecs_to_jiffies(sample->rtt_us)); if (!use_bandwidth_switch) return; @@ -122,7 +122,7 @@ static void measure_achieved_throughput(struct sock *sk, return; } - ca->packetcount += pkts_acked; + ca->packetcount += sample->pkts_acked; if (ca->packetcount >= tp->snd_cwnd - (ca->alpha >> 7 ? : 1) && now - ca->lasttime >= ca->minRTT && diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index 2ab9bbb6f..c8e6d86be 100644 --- a/net/ipv4/tcp_illinois.c +++ b/net/ipv4/tcp_illinois.c @@ -82,30 +82,31 @@ static void tcp_illinois_init(struct sock *sk) } /* Measure RTT for each ack. */ -static void tcp_illinois_acked(struct sock *sk, u32 pkts_acked, s32 rtt) +static void tcp_illinois_acked(struct sock *sk, const struct ack_sample *sample) { struct illinois *ca = inet_csk_ca(sk); + s32 rtt_us = sample->rtt_us; - ca->acked = pkts_acked; + ca->acked = sample->pkts_acked; /* dup ack, no rtt sample */ - if (rtt < 0) + if (rtt_us < 0) return; /* ignore bogus values, this prevents wraparound in alpha math */ - if (rtt > RTT_MAX) - rtt = RTT_MAX; + if (rtt_us > RTT_MAX) + rtt_us = RTT_MAX; /* keep track of minimum RTT seen so far */ - if (ca->base_rtt > rtt) - ca->base_rtt = rtt; + if (ca->base_rtt > rtt_us) + ca->base_rtt = rtt_us; /* and max */ - if (ca->max_rtt < rtt) - ca->max_rtt = rtt; + if (ca->max_rtt < rtt_us) + ca->max_rtt = rtt_us; ++ca->cnt_rtt; - ca->sum_rtt += rtt; + ca->sum_rtt += rtt_us; } /* Maximum queuing delay */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 131005f82..cd3c4cbbb 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -90,7 +90,7 @@ int sysctl_tcp_adv_win_scale __read_mostly = 1; EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); /* rfc5961 challenge ack rate limiting */ -int sysctl_tcp_challenge_ack_limit = 100; +int sysctl_tcp_challenge_ack_limit = 1000; int sysctl_tcp_stdurg __read_mostly; int sysctl_tcp_rfc1337 __read_mostly; @@ -872,7 +872,7 @@ static void tcp_update_reordering(struct sock *sk, const int metric, else mib_idx = LINUX_MIB_TCPSACKREORDER; - NET_INC_STATS_BH(sock_net(sk), mib_idx); + NET_INC_STATS(sock_net(sk), mib_idx); #if FASTRETRANS_DEBUG > 1 pr_debug("Disorder%d %d %u f%u s%u rr%d\n", tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state, @@ -1065,7 +1065,7 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) { dup_sack = true; tcp_dsack_seen(tp); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKRECV); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECV); } else if (num_sacks > 1) { u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq); u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq); @@ -1074,7 +1074,7 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, !before(start_seq_0, start_seq_1)) { dup_sack = true; tcp_dsack_seen(tp); - NET_INC_STATS_BH(sock_net(sk), + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKOFORECV); } } @@ -1292,7 +1292,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, if (skb->len > 0) { BUG_ON(!tcp_skb_pcount(skb)); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTED); + NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTED); return false; } @@ -1306,6 +1306,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, } TCP_SKB_CB(prev)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; + TCP_SKB_CB(prev)->eor = TCP_SKB_CB(skb)->eor; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) TCP_SKB_CB(prev)->end_seq++; @@ -1316,7 +1317,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, tcp_unlink_write_queue(skb, sk); sk_wmem_free_skb(sk, skb); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKMERGED); + NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKMERGED); return true; } @@ -1371,6 +1372,9 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) goto fallback; + if (!tcp_skb_can_collapse_to(prev)) + goto fallback; + in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) && !before(end_seq, TCP_SKB_CB(skb)->end_seq); @@ -1472,7 +1476,7 @@ noop: return skb; fallback: - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK); + NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK); return NULL; } @@ -1660,7 +1664,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, mib_idx = LINUX_MIB_TCPSACKDISCARD; } - NET_INC_STATS_BH(sock_net(sk), mib_idx); + NET_INC_STATS(sock_net(sk), mib_idx); if (i == 0) first_sack_index = -1; continue; @@ -1912,7 +1916,7 @@ void tcp_enter_loss(struct sock *sk) skb = tcp_write_queue_head(sk); is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED); if (is_reneg) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); tp->sacked_out = 0; tp->fackets_out = 0; } @@ -2256,16 +2260,6 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit) } } -/* CWND moderation, preventing bursts due to too big ACKs - * in dubious situations. - */ -static inline void tcp_moderate_cwnd(struct tcp_sock *tp) -{ - tp->snd_cwnd = min(tp->snd_cwnd, - tcp_packets_in_flight(tp) + tcp_max_burst(tp)); - tp->snd_cwnd_stamp = tcp_time_stamp; -} - static bool tcp_tsopt_ecr_before(const struct tcp_sock *tp, u32 when) { return tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && @@ -2408,13 +2402,12 @@ static bool tcp_try_undo_recovery(struct sock *sk) else mib_idx = LINUX_MIB_TCPFULLUNDO; - NET_INC_STATS_BH(sock_net(sk), mib_idx); + NET_INC_STATS(sock_net(sk), mib_idx); } if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { /* Hold old state until something *above* high_seq * is ACKed. For Reno it is MUST to prevent false * fast retransmits (RFC2582). SACK TCP is safe. */ - tcp_moderate_cwnd(tp); if (!tcp_any_retrans_done(sk)) tp->retrans_stamp = 0; return true; @@ -2431,7 +2424,7 @@ static bool tcp_try_undo_dsack(struct sock *sk) if (tp->undo_marker && !tp->undo_retrans) { DBGUNDO(sk, "D-SACK"); tcp_undo_cwnd_reduction(sk, false); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKUNDO); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKUNDO); return true; } return false; @@ -2446,10 +2439,10 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) tcp_undo_cwnd_reduction(sk, true); DBGUNDO(sk, "partial loss"); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSUNDO); if (frto_undo) - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPSPURIOUSRTOS); + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPSPURIOUSRTOS); inet_csk(sk)->icsk_retransmits = 0; if (frto_undo || tcp_is_sack(tp)) tcp_set_ca_state(sk, TCP_CA_Open); @@ -2573,7 +2566,7 @@ static void tcp_mtup_probe_failed(struct sock *sk) icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1; icsk->icsk_mtup.probe_size = 0; - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMTUPFAIL); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPFAIL); } static void tcp_mtup_probe_success(struct sock *sk) @@ -2593,7 +2586,7 @@ static void tcp_mtup_probe_success(struct sock *sk) icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size; icsk->icsk_mtup.probe_size = 0; tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS); } /* Do a simple retransmit without using the backoff mechanisms in @@ -2657,7 +2650,7 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack) else mib_idx = LINUX_MIB_TCPSACKRECOVERY; - NET_INC_STATS_BH(sock_net(sk), mib_idx); + NET_INC_STATS(sock_net(sk), mib_idx); tp->prior_ssthresh = 0; tcp_init_undo(tp); @@ -2750,7 +2743,7 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked) DBGUNDO(sk, "partial recovery"); tcp_undo_cwnd_reduction(sk, true); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO); tcp_try_keep_open(sk); return true; } @@ -3097,12 +3090,11 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb, const struct skb_shared_info *shinfo; /* Avoid cache line misses to get skb_shinfo() and shinfo->tx_flags */ - if (likely(!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK))) + if (likely(!TCP_SKB_CB(skb)->txstamp_ack)) return; shinfo = skb_shinfo(skb); - if ((shinfo->tx_flags & SKBTX_ACK_TSTAMP) && - !before(shinfo->tskey, prior_snd_una) && + if (!before(shinfo->tskey, prior_snd_una) && before(shinfo->tskey, tcp_sk(sk)->snd_una)) __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); } @@ -3259,8 +3251,12 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tcp_rearm_rto(sk); } - if (icsk->icsk_ca_ops->pkts_acked) - icsk->icsk_ca_ops->pkts_acked(sk, pkts_acked, ca_rtt_us); + if (icsk->icsk_ca_ops->pkts_acked) { + struct ack_sample sample = { .pkts_acked = pkts_acked, + .rtt_us = ca_rtt_us }; + + icsk->icsk_ca_ops->pkts_acked(sk, &sample); + } #if FASTRETRANS_DEBUG > 0 WARN_ON((int)tp->sacked_out < 0); @@ -3366,9 +3362,10 @@ static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack) { u32 delta = ack - tp->snd_una; - u64_stats_update_begin(&tp->syncp); + sock_owned_by_me((struct sock *)tp); + u64_stats_update_begin_raw(&tp->syncp); tp->bytes_acked += delta; - u64_stats_update_end(&tp->syncp); + u64_stats_update_end_raw(&tp->syncp); tp->snd_una = ack; } @@ -3377,9 +3374,10 @@ static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq) { u32 delta = seq - tp->rcv_nxt; - u64_stats_update_begin(&tp->syncp); + sock_owned_by_me((struct sock *)tp); + u64_stats_update_begin_raw(&tp->syncp); tp->bytes_received += delta; - u64_stats_update_end(&tp->syncp); + u64_stats_update_end_raw(&tp->syncp); tp->rcv_nxt = seq; } @@ -3426,6 +3424,23 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 return flag; } +static bool __tcp_oow_rate_limited(struct net *net, int mib_idx, + u32 *last_oow_ack_time) +{ + if (*last_oow_ack_time) { + s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time); + + if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) { + NET_INC_STATS(net, mib_idx); + return true; /* rate-limited: don't send yet! */ + } + } + + *last_oow_ack_time = tcp_time_stamp; + + return false; /* not rate-limited: go ahead, send dupack now! */ +} + /* Return true if we're currently rate-limiting out-of-window ACKs and * thus shouldn't send a dupack right now. We rate-limit dupacks in * response to out-of-window SYNs or ACKs to mitigate ACK loops or DoS @@ -3439,21 +3454,9 @@ bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb, /* Data packets without SYNs are not likely part of an ACK loop. */ if ((TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) && !tcp_hdr(skb)->syn) - goto not_rate_limited; - - if (*last_oow_ack_time) { - s32 elapsed = (s32)(tcp_time_stamp - *last_oow_ack_time); - - if (0 <= elapsed && elapsed < sysctl_tcp_invalid_ratelimit) { - NET_INC_STATS_BH(net, mib_idx); - return true; /* rate-limited: don't send yet! */ - } - } - - *last_oow_ack_time = tcp_time_stamp; + return false; -not_rate_limited: - return false; /* not rate-limited: go ahead, send dupack now! */ + return __tcp_oow_rate_limited(net, mib_idx, last_oow_ack_time); } /* RFC 5961 7 [ACK Throttling] */ @@ -3463,22 +3466,27 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb) static u32 challenge_timestamp; static unsigned int challenge_count; struct tcp_sock *tp = tcp_sk(sk); - u32 now; + u32 count, now; /* First check our per-socket dupack rate limit. */ - if (tcp_oow_rate_limited(sock_net(sk), skb, - LINUX_MIB_TCPACKSKIPPEDCHALLENGE, - &tp->last_oow_ack_time)) + if (__tcp_oow_rate_limited(sock_net(sk), + LINUX_MIB_TCPACKSKIPPEDCHALLENGE, + &tp->last_oow_ack_time)) return; - /* Then check the check host-wide RFC 5961 rate limit. */ + /* Then check host-wide RFC 5961 rate limit. */ now = jiffies / HZ; if (now != challenge_timestamp) { + u32 half = (sysctl_tcp_challenge_ack_limit + 1) >> 1; + challenge_timestamp = now; - challenge_count = 0; + WRITE_ONCE(challenge_count, half + + prandom_u32_max(sysctl_tcp_challenge_ack_limit)); } - if (++challenge_count <= sysctl_tcp_challenge_ack_limit) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK); + count = READ_ONCE(challenge_count); + if (count > 0) { + WRITE_ONCE(challenge_count, count - 1); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPCHALLENGEACK); tcp_send_ack(sk); } } @@ -3527,8 +3535,8 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) tcp_set_ca_state(sk, TCP_CA_CWR); tcp_end_cwnd_reduction(sk); tcp_try_keep_open(sk); - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPLOSSPROBERECOVERY); + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPLOSSPROBERECOVERY); } else if (!(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP | FLAG_DATA_SACKED))) { /* Pure dupack: original and TLP probe arrived; no loss */ @@ -3632,14 +3640,14 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS); } else { u32 ack_ev_flags = CA_ACK_SLOWPATH; if (ack_seq != TCP_SKB_CB(skb)->end_seq) flag |= FLAG_DATA; else - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPUREACKS); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPUREACKS); flag |= tcp_ack_update_window(sk, skb, ack, ack_seq); @@ -4183,7 +4191,7 @@ static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq) else mib_idx = LINUX_MIB_TCPDSACKOFOSENT; - NET_INC_STATS_BH(sock_net(sk), mib_idx); + NET_INC_STATS(sock_net(sk), mib_idx); tp->rx_opt.dsack = 1; tp->duplicate_sack[0].start_seq = seq; @@ -4207,7 +4215,7 @@ static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb) if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); + NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); tcp_enter_quickack_mode(sk); if (tcp_is_sack(tp) && sysctl_tcp_dsack) { @@ -4357,13 +4365,19 @@ static bool tcp_try_coalesce(struct sock *sk, atomic_add(delta, &sk->sk_rmem_alloc); sk_mem_charge(sk, delta); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE); TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq; TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq; TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags; return true; } +static void tcp_drop(struct sock *sk, struct sk_buff *skb) +{ + sk_drops_add(sk, skb); + __kfree_skb(skb); +} + /* This one checks to see if we can put data from the * out_of_order queue into the receive_queue. */ @@ -4388,7 +4402,7 @@ static void tcp_ofo_queue(struct sock *sk) __skb_unlink(skb, &tp->out_of_order_queue); if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { SOCK_DEBUG(sk, "ofo packet was already received\n"); - __kfree_skb(skb); + tcp_drop(sk, skb); continue; } SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n", @@ -4439,8 +4453,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) tcp_ecn_check_ce(tp, skb); if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP); - __kfree_skb(skb); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP); + tcp_drop(sk, skb); return; } @@ -4448,7 +4462,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) tp->pred_flags = 0; inet_csk_schedule_ack(sk); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOQUEUE); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE); SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); @@ -4503,8 +4517,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) { if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { /* All the bits are present. Drop. */ - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE); - __kfree_skb(skb); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); + tcp_drop(sk, skb); skb = NULL; tcp_dsack_set(sk, seq, end_seq); goto add_sack; @@ -4542,8 +4556,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) __skb_unlink(skb1, &tp->out_of_order_queue); tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOMERGE); - __kfree_skb(skb1); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE); + tcp_drop(sk, skb1); } add_sack: @@ -4651,11 +4665,13 @@ static int __tcp_stealth_integrity_check(struct sock *sk, struct sk_buff *skb) static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); - int eaten = -1; bool fragstolen = false; + int eaten = -1; - if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) - goto drop; + if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { + __kfree_skb(skb); + return; + } #ifdef CONFIG_TCP_STEALTH if (unlikely(tp->stealth.mode & TCP_STEALTH_MODE_INTEGRITY_LEN) && @@ -4689,14 +4705,12 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) __set_current_state(TASK_RUNNING); - local_bh_enable(); if (!skb_copy_datagram_msg(skb, 0, tp->ucopy.msg, chunk)) { tp->ucopy.len -= chunk; tp->copied_seq += chunk; eaten = (chunk == skb->len); tcp_rcv_space_adjust(sk); } - local_bh_disable(); } if (eaten <= 0) { @@ -4739,14 +4753,14 @@ queue_and_out: if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { /* A retransmit, 2nd most common case. Force an immediate ack. */ - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); + NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); out_of_window: tcp_enter_quickack_mode(sk); inet_csk_schedule_ack(sk); drop: - __kfree_skb(skb); + tcp_drop(sk, skb); return; } @@ -4785,7 +4799,7 @@ static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, __skb_unlink(skb, list); __kfree_skb(skb); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED); return next; } @@ -4944,7 +4958,7 @@ static bool tcp_prune_ofo_queue(struct sock *sk) bool res = false; if (!skb_queue_empty(&tp->out_of_order_queue)) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED); + NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED); __skb_queue_purge(&tp->out_of_order_queue); /* Reset SACK state. A conforming SACK implementation will @@ -4973,7 +4987,7 @@ static int tcp_prune_queue(struct sock *sk) SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PRUNECALLED); + NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED); if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) tcp_clamp_window(sk); @@ -5003,7 +5017,7 @@ static int tcp_prune_queue(struct sock *sk) * drop receive data on the floor. It will get retransmitted * and hopefully then we'll have sufficient space. */ - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_RCVPRUNED); + NET_INC_STATS(sock_net(sk), LINUX_MIB_RCVPRUNED); /* Massive buffer overcommit. */ tp->pred_flags = 0; @@ -5212,7 +5226,6 @@ static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen) int chunk = skb->len - hlen; int err; - local_bh_enable(); if (skb_csum_unnecessary(skb)) err = skb_copy_datagram_msg(skb, hlen, tp->ucopy.msg, chunk); else @@ -5224,32 +5237,9 @@ static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen) tcp_rcv_space_adjust(sk); } - local_bh_disable(); return err; } -static __sum16 __tcp_checksum_complete_user(struct sock *sk, - struct sk_buff *skb) -{ - __sum16 result; - - if (sock_owned_by_user(sk)) { - local_bh_enable(); - result = __tcp_checksum_complete(skb); - local_bh_disable(); - } else { - result = __tcp_checksum_complete(skb); - } - return result; -} - -static inline bool tcp_checksum_complete_user(struct sock *sk, - struct sk_buff *skb) -{ - return !skb_csum_unnecessary(skb) && - __tcp_checksum_complete_user(sk, skb); -} - /* Does PAWS and seqno based validation of an incoming segment, flags will * play significant role here. */ @@ -5262,7 +5252,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && tcp_paws_discard(sk, skb)) { if (!th->rst) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); + NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); if (!tcp_oow_rate_limited(sock_net(sk), skb, LINUX_MIB_TCPACKSKIPPEDPAWS, &tp->last_oow_ack_time)) @@ -5314,8 +5304,8 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, if (th->syn) { syn_challenge: if (syn_inerr) - TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE); + TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE); tcp_send_challenge_ack(sk, skb); goto discard; } @@ -5323,7 +5313,7 @@ syn_challenge: return true; discard: - __kfree_skb(skb); + tcp_drop(sk, skb); return false; } @@ -5430,7 +5420,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tcp_data_snd_check(sk); return; } else { /* Header too small */ - TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); goto discard; } } else { @@ -5467,12 +5457,13 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, __skb_pull(skb, tcp_header_len); tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER); + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPHPHITSTOUSER); eaten = 1; } } if (!eaten) { - if (tcp_checksum_complete_user(sk, skb)) + if (tcp_checksum_complete(skb)) goto csum_error; if ((int)skb->truesize > sk->sk_forward_alloc) @@ -5489,7 +5480,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tcp_rcv_rtt_measure_ts(sk, skb); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS); /* Bulk data transfer: receiver */ eaten = tcp_queue_rcv(sk, skb, tcp_header_len, @@ -5516,7 +5507,7 @@ no_ack: } slow_path: - if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb)) + if (len < (th->doff << 2) || tcp_checksum_complete(skb)) goto csum_error; if (!th->ack && !th->rst && !th->syn) @@ -5546,11 +5537,11 @@ step5: return; csum_error: - TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS); - TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); discard: - __kfree_skb(skb); + tcp_drop(sk, skb); } EXPORT_SYMBOL(tcp_rcv_established); @@ -5635,16 +5626,18 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, if (data) { /* Retransmit unacked data in SYN */ tcp_for_write_queue_from(data, sk) { if (data == tcp_send_head(sk) || - __tcp_retransmit_skb(sk, data)) + __tcp_retransmit_skb(sk, data, 1)) break; } tcp_rearm_rto(sk); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL); + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPFASTOPENACTIVEFAIL); return true; } tp->syn_data_acked = tp->syn_data; if (tp->syn_data_acked) - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE); + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPFASTOPENACTIVE); tcp_fastopen_add_skb(sk, synack); @@ -5679,7 +5672,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp, tcp_time_stamp)) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSACTIVEREJECTED); + NET_INC_STATS(sock_net(sk), + LINUX_MIB_PAWSACTIVEREJECTED); goto reset_and_undo; } @@ -5781,7 +5775,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, TCP_DELACK_MAX, TCP_RTO_MAX); discard: - __kfree_skb(skb); + tcp_drop(sk, skb); return 0; } else { tcp_send_ack(sk); @@ -5888,8 +5882,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) int queued = 0; bool acceptable; - tp->rx_opt.saw_tstamp = 0; - switch (sk->sk_state) { case TCP_CLOSE: goto discard; @@ -5907,29 +5899,13 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (icsk->icsk_af_ops->conn_request(sk, skb) < 0) return 1; - /* Now we have several options: In theory there is - * nothing else in the frame. KA9Q has an option to - * send data with the syn, BSD accepts data with the - * syn up to the [to be] advertised window and - * Solaris 2.1 gives you a protocol error. For now - * we just ignore it, that fits the spec precisely - * and avoids incompatibilities. It would be nice in - * future to drop through and process the data. - * - * Now that TTCP is starting to be used we ought to - * queue this data. - * But, this leaves one open to an easy denial of - * service attack, and SYN cookies can't defend - * against this problem. So, we drop the data - * in the interest of security over speed unless - * it's still in use. - */ - kfree_skb(skb); + consume_skb(skb); return 0; } goto discard; case TCP_SYN_SENT: + tp->rx_opt.saw_tstamp = 0; queued = tcp_rcv_synsent_state_process(sk, skb, th); if (queued >= 0) return queued; @@ -5941,6 +5917,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) return 0; } + tp->rx_opt.saw_tstamp = 0; req = tp->fastopen_rsk; if (req) { WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV && @@ -6065,7 +6042,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) { tcp_done(sk); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); return 1; } @@ -6122,7 +6099,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (sk->sk_shutdown & RCV_SHUTDOWN) { if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); tcp_reset(sk); return 1; } @@ -6142,7 +6119,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) if (!queued) { discard: - __kfree_skb(skb); + tcp_drop(sk, skb); } return 0; } @@ -6260,10 +6237,10 @@ static bool tcp_syn_flood_action(const struct sock *sk, if (net->ipv4.sysctl_tcp_syncookies) { msg = "Sending cookies"; want_cookie = true; - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES); } else #endif - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); if (!queue->synflood_warned && net->ipv4.sysctl_tcp_syncookies != 2 && @@ -6324,7 +6301,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, * timeout. */ if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); + NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); goto drop; } @@ -6371,7 +6348,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, if (dst && strict && !tcp_peer_is_proven(req, dst, true, tmp_opt.saw_tstamp)) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); + NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); goto drop_and_release; } } @@ -6419,7 +6396,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, } if (fastopen_sk) { af_ops->send_synack(fastopen_sk, dst, &fl, req, - &foc, false); + &foc, TCP_SYNACK_FASTOPEN); /* Add the child socket directly into the accept queue */ inet_csk_reqsk_queue_add(sk, req, fastopen_sk); sk->sk_data_ready(sk); @@ -6429,10 +6406,13 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_rsk(req)->tfo_listener = false; if (!want_cookie) inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); - af_ops->send_synack(sk, dst, &fl, req, - &foc, !want_cookie); - if (want_cookie) - goto drop_and_free; + af_ops->send_synack(sk, dst, &fl, req, &foc, + !want_cookie ? TCP_SYNACK_NORMAL : + TCP_SYNACK_COOKIE); + if (want_cookie) { + reqsk_free(req); + return 0; + } } reqsk_put(req); return 0; @@ -6442,7 +6422,7 @@ drop_and_release: drop_and_free: reqsk_free(req); drop: - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); + tcp_listendrop(sk); return 0; } EXPORT_SYMBOL(tcp_conn_request); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index dfd153fbd..469942798 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -158,7 +158,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) nexthop = daddr = usin->sin_addr.s_addr; inet_opt = rcu_dereference_protected(inet->inet_opt, - sock_owned_by_user(sk)); + lockdep_sock_is_held(sk)); if (inet_opt && inet_opt->opt.srr) { if (!daddr) return -EINVAL; @@ -336,7 +336,7 @@ void tcp_req_err(struct sock *sk, u32 seq, bool abort) * an established socket here. */ if (seq != tcp_rsk(req)->snt_isn) { - NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); + __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); } else if (abort) { /* * Still in SYN_RECV, just remove it silently. @@ -345,7 +345,7 @@ void tcp_req_err(struct sock *sk, u32 seq, bool abort) * errors returned from accept(). */ inet_csk_reqsk_queue_drop(req->rsk_listener, req); - NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS); + tcp_listendrop(req->rsk_listener); } reqsk_put(req); } @@ -388,7 +388,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) th->dest, iph->saddr, ntohs(th->source), inet_iif(icmp_skb)); if (!sk) { - ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); + __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return; } if (sk->sk_state == TCP_TIME_WAIT) { @@ -412,13 +412,13 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) */ if (sock_owned_by_user(sk)) { if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)) - NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); + __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); } if (sk->sk_state == TCP_CLOSE) goto out; if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { - NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); + __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); goto out; } @@ -429,7 +429,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; if (sk->sk_state != TCP_LISTEN && !between(seq, snd_una, tp->snd_nxt)) { - NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); + __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); goto out; } @@ -644,6 +644,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); #ifdef CONFIG_TCP_MD5SIG + rcu_read_lock(); hash_location = tcp_parse_md5sig_option(th); if (sk && sk_fullsock(sk)) { key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *) @@ -662,16 +663,18 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) ntohs(th->source), inet_iif(skb)); /* don't send rst if it can't find key */ if (!sk1) - return; - rcu_read_lock(); + goto out; + key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *) &ip_hdr(skb)->saddr, AF_INET); if (!key) - goto release_sk1; + goto out; + genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb); if (genhash || memcmp(hash_location, newhash, 16) != 0) - goto release_sk1; + goto out; + } if (key) { @@ -705,20 +708,19 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) offsetof(struct inet_timewait_sock, tw_bound_dev_if)); arg.tos = ip_hdr(skb)->tos; + local_bh_disable(); ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); - TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); - TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS); + __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); + __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); + local_bh_enable(); #ifdef CONFIG_TCP_MD5SIG -release_sk1: - if (sk1) { - rcu_read_unlock(); - sock_put(sk1); - } +out: + rcu_read_unlock(); #endif } @@ -790,12 +792,14 @@ static void tcp_v4_send_ack(struct net *net, if (oif) arg.bound_dev_if = oif; arg.tos = tos; + local_bh_disable(); ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); - TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); + __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); + local_bh_enable(); } static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) @@ -846,7 +850,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, struct tcp_fastopen_cookie *foc, - bool attach_req) + enum tcp_synack_type synack_type) { const struct inet_request_sock *ireq = inet_rsk(req); struct flowi4 fl4; @@ -857,7 +861,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) return -1; - skb = tcp_make_synack(sk, dst, req, foc, attach_req); + skb = tcp_make_synack(sk, dst, req, foc, synack_type); if (skb) { __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); @@ -898,8 +902,7 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk, /* caller either holds rcu_read_lock() or socket lock */ md5sig = rcu_dereference_check(tp->md5sig_info, - sock_owned_by_user(sk) || - lockdep_is_held((spinlock_t *)&sk->sk_lock.slock)); + lockdep_sock_is_held(sk)); if (!md5sig) return NULL; #if IS_ENABLED(CONFIG_IPV6) @@ -944,8 +947,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, } md5sig = rcu_dereference_protected(tp->md5sig_info, - sock_owned_by_user(sk) || - lockdep_is_held(&sk->sk_lock.slock)); + lockdep_sock_is_held(sk)); if (!md5sig) { md5sig = kmalloc(sizeof(*md5sig), gfp); if (!md5sig) @@ -1169,12 +1171,12 @@ static bool tcp_v4_inbound_md5_hash(const struct sock *sk, return false; if (hash_expected && !hash_location) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); return true; } if (!hash_expected && hash_location) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); return true; } @@ -1262,7 +1264,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) &tcp_request_sock_ipv4_ops, sk, skb); drop: - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); + tcp_listendrop(sk); return 0; } EXPORT_SYMBOL(tcp_v4_conn_request); @@ -1360,11 +1362,11 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, return newsk; exit_overflow: - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); + NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); exit_nonewsk: dst_release(dst); exit: - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); + tcp_listendrop(sk); return NULL; put_and_exit: inet_csk_prepare_forced_close(newsk); @@ -1461,8 +1463,8 @@ discard: return 0; csum_err: - TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS); - TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); goto discard; } EXPORT_SYMBOL(tcp_v4_do_rcv); @@ -1535,16 +1537,16 @@ bool tcp_prequeue(struct sock *sk, struct sk_buff *skb) __skb_queue_tail(&tp->ucopy.prequeue, skb); tp->ucopy.memory += skb->truesize; - if (tp->ucopy.memory > sk->sk_rcvbuf) { + if (skb_queue_len(&tp->ucopy.prequeue) >= 32 || + tp->ucopy.memory + atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) { struct sk_buff *skb1; BUG_ON(sock_owned_by_user(sk)); + __NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUEDROPPED, + skb_queue_len(&tp->ucopy.prequeue)); - while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) { + while ((skb1 = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) sk_backlog_rcv(sk, skb1); - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPPREQUEUEDROPPED); - } tp->ucopy.memory = 0; } else if (skb_queue_len(&tp->ucopy.prequeue) == 1) { @@ -1565,24 +1567,25 @@ EXPORT_SYMBOL(tcp_prequeue); int tcp_v4_rcv(struct sk_buff *skb) { + struct net *net = dev_net(skb->dev); const struct iphdr *iph; const struct tcphdr *th; + bool refcounted; struct sock *sk; int ret; - struct net *net = dev_net(skb->dev); if (skb->pkt_type != PACKET_HOST) goto discard_it; /* Count it even if it's bad */ - TCP_INC_STATS_BH(net, TCP_MIB_INSEGS); + __TCP_INC_STATS(net, TCP_MIB_INSEGS); if (!pskb_may_pull(skb, sizeof(struct tcphdr))) goto discard_it; - th = tcp_hdr(skb); + th = (const struct tcphdr *)skb->data; - if (th->doff < sizeof(struct tcphdr) / 4) + if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) goto bad_packet; if (!pskb_may_pull(skb, th->doff * 4)) goto discard_it; @@ -1595,7 +1598,7 @@ int tcp_v4_rcv(struct sk_buff *skb) if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo)) goto csum_error; - th = tcp_hdr(skb); + th = (const struct tcphdr *)skb->data; iph = ip_hdr(skb); /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB() * barrier() makes sure compiler wont play fool^Waliasing games. @@ -1615,7 +1618,7 @@ int tcp_v4_rcv(struct sk_buff *skb) lookup: sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source, - th->dest); + th->dest, &refcounted); if (!sk) goto no_tcp_socket; @@ -1636,7 +1639,11 @@ process: inet_csk_reqsk_queue_drop_and_put(sk, req); goto lookup; } + /* We own a reference on the listener, increase it again + * as we might lose it too soon. + */ sock_hold(sk); + refcounted = true; nsk = tcp_check_req(sk, skb, req, false); if (!nsk) { reqsk_put(req); @@ -1653,7 +1660,7 @@ process: } } if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { - NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); + __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); goto discard_and_relse; } @@ -1686,13 +1693,14 @@ process: } else if (unlikely(sk_add_backlog(sk, skb, sk->sk_rcvbuf + sk->sk_sndbuf))) { bh_unlock_sock(sk); - NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP); + __NET_INC_STATS(net, LINUX_MIB_TCPBACKLOGDROP); goto discard_and_relse; } bh_unlock_sock(sk); put_and_return: - sock_put(sk); + if (refcounted) + sock_put(sk); return ret; @@ -1702,9 +1710,9 @@ no_tcp_socket: if (tcp_checksum_complete(skb)) { csum_error: - TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS); + __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); bad_packet: - TCP_INC_STATS_BH(net, TCP_MIB_INERRS); + __TCP_INC_STATS(net, TCP_MIB_INERRS); } else { tcp_v4_send_reset(NULL, skb); } @@ -1715,7 +1723,9 @@ discard_it: return 0; discard_and_relse: - sock_put(sk); + sk_drops_add(sk, skb); + if (refcounted) + sock_put(sk); goto discard_it; do_time_wait: @@ -1739,6 +1749,7 @@ do_time_wait: if (sk2) { inet_twsk_deschedule_put(inet_twsk(sk)); sk = sk2; + refcounted = false; goto process; } /* Fall through to ACK */ @@ -1855,7 +1866,9 @@ void tcp_v4_destroy_sock(struct sock *sk) tcp_free_fastopen_req(tp); tcp_saved_syn_free(tp); + local_bh_disable(); sk_sockets_allocated_dec(sk); + local_bh_enable(); if (mem_cgroup_sockets_enabled && sk->sk_memcg) sock_release_memcg(sk); @@ -1872,17 +1885,17 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock); */ static void *listening_get_next(struct seq_file *seq, void *cur) { - struct inet_connection_sock *icsk; - struct hlist_nulls_node *node; - struct sock *sk = cur; - struct inet_listen_hashbucket *ilb; struct tcp_iter_state *st = seq->private; struct net *net = seq_file_net(seq); + struct inet_listen_hashbucket *ilb; + struct inet_connection_sock *icsk; + struct sock *sk = cur; if (!sk) { +get_head: ilb = &tcp_hashinfo.listening_hash[st->bucket]; spin_lock_bh(&ilb->lock); - sk = sk_nulls_head(&ilb->head); + sk = sk_head(&ilb->head); st->offset = 0; goto get_sk; } @@ -1890,28 +1903,20 @@ static void *listening_get_next(struct seq_file *seq, void *cur) ++st->num; ++st->offset; - sk = sk_nulls_next(sk); + sk = sk_next(sk); get_sk: - sk_nulls_for_each_from(sk, node) { + sk_for_each_from(sk) { if (!net_eq(sock_net(sk), net)) continue; - if (sk->sk_family == st->family) { - cur = sk; - goto out; - } + if (sk->sk_family == st->family) + return sk; icsk = inet_csk(sk); } spin_unlock_bh(&ilb->lock); st->offset = 0; - if (++st->bucket < INET_LHTABLE_SIZE) { - ilb = &tcp_hashinfo.listening_hash[st->bucket]; - spin_lock_bh(&ilb->lock); - sk = sk_nulls_head(&ilb->head); - goto get_sk; - } - cur = NULL; -out: - return cur; + if (++st->bucket < INET_LHTABLE_SIZE) + goto get_head; + return NULL; } static void *listening_get_idx(struct seq_file *seq, loff_t *pos) @@ -2410,6 +2415,7 @@ static int __net_init tcp_sk_init(struct net *net) IPPROTO_TCP, net); if (res) goto fail; + sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk; } diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c index 1e70fa8fa..c67ece139 100644 --- a/net/ipv4/tcp_lp.c +++ b/net/ipv4/tcp_lp.c @@ -260,13 +260,13 @@ static void tcp_lp_rtt_sample(struct sock *sk, u32 rtt) * newReno in increase case. * We work it out by following the idea from TCP-LP's paper directly */ -static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked, s32 rtt_us) +static void tcp_lp_pkts_acked(struct sock *sk, const struct ack_sample *sample) { struct tcp_sock *tp = tcp_sk(sk); struct lp *lp = inet_csk_ca(sk); - if (rtt_us > 0) - tcp_lp_rtt_sample(sk, rtt_us); + if (sample->rtt_us > 0) + tcp_lp_rtt_sample(sk, sample->rtt_us); /* calc inference */ if (tcp_time_stamp > tp->rx_opt.rcv_tsecr) diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 7b7eec439..b617826e2 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -800,7 +800,8 @@ static int tcp_metrics_fill_info(struct sk_buff *msg, } if (nla_put_msecs(msg, TCP_METRICS_ATTR_AGE, - jiffies - tm->tcpm_stamp) < 0) + jiffies - tm->tcpm_stamp, + TCP_METRICS_ATTR_PAD) < 0) goto nla_put_failure; if (tm->tcpm_ts_stamp) { if (nla_put_s32(msg, TCP_METRICS_ATTR_TW_TS_STAMP, @@ -864,7 +865,8 @@ static int tcp_metrics_fill_info(struct sk_buff *msg, (nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROPS, tfom->syn_loss) < 0 || nla_put_msecs(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS, - jiffies - tfom->last_syn_loss) < 0)) + jiffies - tfom->last_syn_loss, + TCP_METRICS_ATTR_PAD) < 0)) goto nla_put_failure; if (tfom->cookie.len > 0 && nla_put(msg, TCP_METRICS_ATTR_FOPEN_COOKIE, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index acb366dd6..4b95ec4ed 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -235,7 +235,7 @@ kill: } if (paws_reject) - NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED); + __NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED); if (!th->rst) { /* In this case we must reset the TIMEWAIT timer. @@ -337,7 +337,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) * socket up. We've got bigger problems than * non-graceful socket closings. */ - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW); } tcp_update_metrics(sk); @@ -545,7 +545,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->rack.mstamp.v64 = 0; newtp->rack.advanced = 0; - TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS); + __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); } return newsk; } @@ -704,10 +704,13 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rsk_rcv_wnd)) { /* Out of window: send ACK and drop. */ - if (!(flg & TCP_FLAG_RST)) + if (!(flg & TCP_FLAG_RST) && + !tcp_oow_rate_limited(sock_net(sk), skb, + LINUX_MIB_TCPACKSKIPPEDSYNRECV, + &tcp_rsk(req)->last_oow_ack_time)) req->rsk_ops->send_ack(sk, skb, req); if (paws_reject) - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); return NULL; } @@ -726,7 +729,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, * "fourth, check the SYN bit" */ if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) { - TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS); + __TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS); goto embryonic_reset; } @@ -749,7 +752,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, if (req->num_timeout < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { inet_rsk(req)->acked = 1; - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP); return NULL; } @@ -788,7 +791,7 @@ embryonic_reset: } if (!fastopen) { inet_csk_reqsk_queue_drop(sk, req); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); } return NULL; } diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 773083b7f..5c5964962 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -83,23 +83,6 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { /* Packet is from an untrusted source, reset gso_segs. */ - int type = skb_shinfo(skb)->gso_type; - - if (unlikely(type & - ~(SKB_GSO_TCPV4 | - SKB_GSO_DODGY | - SKB_GSO_TCP_ECN | - SKB_GSO_TCPV6 | - SKB_GSO_GRE | - SKB_GSO_GRE_CSUM | - SKB_GSO_IPIP | - SKB_GSO_SIT | - SKB_GSO_UDP_TUNNEL | - SKB_GSO_UDP_TUNNEL_CSUM | - SKB_GSO_TUNNEL_REMCSUM | - 0) || - !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))) - goto out; skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss); @@ -107,6 +90,12 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, goto out; } + /* GSO partial only requires splitting the frame into an MSS + * multiple and possibly a remainder. So update the mss now. + */ + if (features & NETIF_F_GSO_PARTIAL) + mss = skb->len - (skb->len % mss); + copy_destructor = gso_skb->destructor == tcp_wfree; ooo_okay = gso_skb->ooo_okay; /* All segments but the first should have ooo_okay cleared */ @@ -131,7 +120,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, newcheck = ~csum_fold((__force __wsum)((__force u32)th->check + (__force u32)delta)); - do { + while (skb->next) { th->fin = th->psh = 0; th->check = newcheck; @@ -151,7 +140,7 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, th->seq = htonl(seq); th->cwr = 0; - } while (skb->next); + } /* Following permits TCP Small Queues to work well with GSO : * The callback to TCP stack will be called at the time last frag @@ -237,7 +226,7 @@ struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) found: /* Include the IP ID check below from the inner most IP hdr */ - flush = NAPI_GRO_CB(p)->flush | NAPI_GRO_CB(p)->flush_id; + flush = NAPI_GRO_CB(p)->flush; flush |= (__force int)(flags & TCP_FLAG_CWR); flush |= (__force int)((flags ^ tcp_flag_word(th2)) & ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH)); @@ -246,6 +235,17 @@ found: flush |= *(u32 *)((u8 *)th + i) ^ *(u32 *)((u8 *)th2 + i); + /* When we receive our second frame we can made a decision on if we + * continue this flow as an atomic flow with a fixed ID or if we use + * an incrementing ID. + */ + if (NAPI_GRO_CB(p)->flush_id != 1 || + NAPI_GRO_CB(p)->count != 1 || + !NAPI_GRO_CB(p)->is_atomic) + flush |= NAPI_GRO_CB(p)->flush_id; + else + NAPI_GRO_CB(p)->is_atomic = false; + mss = skb_shinfo(p)->gso_size; flush |= (len - 1) >= mss; @@ -314,6 +314,9 @@ static int tcp4_gro_complete(struct sk_buff *skb, int thoff) iph->daddr, 0); skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4; + if (NAPI_GRO_CB(skb)->is_atomic) + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_FIXEDID; + return tcp_gro_complete(skb); } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index c0c1dac81..30a7dd4f6 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -236,7 +236,8 @@ void tcp_select_initial_window(int __space, __u32 mss, /* Set window scaling on max possible window * See RFC1323 for an explanation of the limit to 14 */ - space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max); + space = max_t(u32, space, sysctl_tcp_rmem[2]); + space = max_t(u32, space, sysctl_rmem_max); space = min_t(u32, space, *window_clamp); while (space > 65535 && (*rcv_wscale) < 14) { space >>= 1; @@ -364,7 +365,7 @@ tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th) * be sent. */ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, - int tcp_header_len) + struct tcphdr *th, int tcp_header_len) { struct tcp_sock *tp = tcp_sk(sk); @@ -375,7 +376,7 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, INET_ECN_xmit(sk); if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) { tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; - tcp_hdr(skb)->cwr = 1; + th->cwr = 1; skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; } } else if (!tcp_ca_needs_ecn(sk)) { @@ -383,7 +384,7 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, INET_ECN_dontxmit(sk); } if (tp->ecn_flags & TCP_ECN_DEMAND_CWR) - tcp_hdr(skb)->ece = 1; + th->ece = 1; } } @@ -956,12 +957,12 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, skb_orphan(skb); skb->sk = sk; - skb->destructor = skb_is_tcp_pure_ack(skb) ? sock_wfree : tcp_wfree; + skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree; skb_set_hash_from_sk(skb, sk); atomic_add(skb->truesize, &sk->sk_wmem_alloc); /* Build TCP header and checksum it. */ - th = tcp_hdr(skb); + th = (struct tcphdr *)skb->data; th->source = inet->inet_sport; th->dest = inet->inet_dport; th->seq = htonl(tcb->seq); @@ -969,14 +970,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | tcb->tcp_flags); - if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) { - /* RFC1323: The window in SYN & SYN/ACK segments - * is never scaled. - */ - th->window = htons(min(tp->rcv_wnd, 65535U)); - } else { - th->window = htons(tcp_select_window(sk)); - } th->check = 0; th->urg_ptr = 0; @@ -993,9 +986,15 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, tcp_options_write((__be32 *)(th + 1), tp, &opts); skb_shinfo(skb)->gso_type = sk->sk_gso_type; - if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0)) - tcp_ecn_send(sk, skb, tcp_header_size); - + if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) { + th->window = htons(tcp_select_window(sk)); + tcp_ecn_send(sk, skb, th, tcp_header_size); + } else { + /* RFC1323: The window in SYN & SYN/ACK segments + * is never scaled. + */ + th->window = htons(min(tp->rcv_wnd, 65535U)); + } #ifdef CONFIG_TCP_MD5SIG /* Calculate the MD5 hash, as we have all we need now */ if (md5) { @@ -1118,11 +1117,17 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de tcp_verify_left_out(tp); } +static bool tcp_has_tx_tstamp(const struct sk_buff *skb) +{ + return TCP_SKB_CB(skb)->txstamp_ack || + (skb_shinfo(skb)->tx_flags & SKBTX_ANY_TSTAMP); +} + static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2) { struct skb_shared_info *shinfo = skb_shinfo(skb); - if (unlikely(shinfo->tx_flags & SKBTX_ANY_TSTAMP) && + if (unlikely(tcp_has_tx_tstamp(skb)) && !before(shinfo->tskey, TCP_SKB_CB(skb2)->seq)) { struct skb_shared_info *shinfo2 = skb_shinfo(skb2); u8 tsflags = shinfo->tx_flags & SKBTX_ANY_TSTAMP; @@ -1130,9 +1135,17 @@ static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2) shinfo->tx_flags &= ~tsflags; shinfo2->tx_flags |= tsflags; swap(shinfo->tskey, shinfo2->tskey); + TCP_SKB_CB(skb2)->txstamp_ack = TCP_SKB_CB(skb)->txstamp_ack; + TCP_SKB_CB(skb)->txstamp_ack = 0; } } +static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2) +{ + TCP_SKB_CB(skb2)->eor = TCP_SKB_CB(skb)->eor; + TCP_SKB_CB(skb)->eor = 0; +} + /* Function to create two new TCP segments. Shrinks the given segment * to the specified size and appends a new segment with the rest of the * packet to the list. This won't be called frequently, I hope. @@ -1178,6 +1191,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); TCP_SKB_CB(buff)->tcp_flags = flags; TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked; + tcp_skb_fragment_eor(skb, buff); if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) { /* Copy and checksum data tail into the new buffer. */ @@ -1738,6 +1752,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, /* This packet was never sent out yet, so no SACK bits. */ TCP_SKB_CB(buff)->sacked = 0; + tcp_skb_fragment_eor(skb, buff); + buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL; skb_split(skb, buff, len); tcp_fragment_tstamp(skb, buff); @@ -2211,14 +2227,13 @@ bool tcp_schedule_loss_probe(struct sock *sk) /* Thanks to skb fast clones, we can detect if a prior transmit of * a packet is still in a qdisc or driver queue. * In this case, there is very little point doing a retransmit ! - * Note: This is called from BH context only. */ static bool skb_still_in_host_queue(const struct sock *sk, const struct sk_buff *skb) { if (unlikely(skb_fclone_busy(sk, skb))) { - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES); return true; } return false; @@ -2273,14 +2288,14 @@ void tcp_send_loss_probe(struct sock *sk) if (WARN_ON(!skb || !tcp_skb_pcount(skb))) goto rearm_timer; - if (__tcp_retransmit_skb(sk, skb)) + if (__tcp_retransmit_skb(sk, skb, 1)) goto rearm_timer; /* Record snd_nxt for loss detection. */ tp->tlp_high_seq = tp->snd_nxt; probe_sent: - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSPROBES); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBES); /* Reset s.t. tcp_rearm_rto will restart timer from now */ inet_csk(sk)->icsk_pending = 0; rearm_timer: @@ -2451,14 +2466,15 @@ u32 __tcp_select_window(struct sock *sk) void tcp_skb_collapse_tstamp(struct sk_buff *skb, const struct sk_buff *next_skb) { - const struct skb_shared_info *next_shinfo = skb_shinfo(next_skb); - u8 tsflags = next_shinfo->tx_flags & SKBTX_ANY_TSTAMP; - - if (unlikely(tsflags)) { + if (unlikely(tcp_has_tx_tstamp(next_skb))) { + const struct skb_shared_info *next_shinfo = + skb_shinfo(next_skb); struct skb_shared_info *shinfo = skb_shinfo(skb); - shinfo->tx_flags |= tsflags; + shinfo->tx_flags |= next_shinfo->tx_flags & SKBTX_ANY_TSTAMP; shinfo->tskey = next_shinfo->tskey; + TCP_SKB_CB(skb)->txstamp_ack |= + TCP_SKB_CB(next_skb)->txstamp_ack; } } @@ -2497,6 +2513,7 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) * packet counting does not break. */ TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS; + TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor; /* changed transmit queue under us so clear hints */ tcp_clear_retrans_hints_partial(tp); @@ -2548,6 +2565,9 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, if (!tcp_can_collapse(sk, skb)) break; + if (!tcp_skb_can_collapse_to(to)) + break; + space -= skb->len; if (first) { @@ -2574,17 +2594,17 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, * state updates are done by the caller. Returns non-zero if an * error occurred which prevented the send. */ -int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) +int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) { - struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); unsigned int cur_mss; - int err; + int diff, len, err; - /* Inconslusive MTU probe */ - if (icsk->icsk_mtup.probe_size) { + + /* Inconclusive MTU probe */ + if (icsk->icsk_mtup.probe_size) icsk->icsk_mtup.probe_size = 0; - } /* Do not sent more than we queued. 1/4 is reserved for possible * copying overhead: fragmentation, tunneling, mangling etc. @@ -2617,30 +2637,27 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) TCP_SKB_CB(skb)->seq != tp->snd_una) return -EAGAIN; - if (skb->len > cur_mss) { - if (tcp_fragment(sk, skb, cur_mss, cur_mss, GFP_ATOMIC)) + len = cur_mss * segs; + if (skb->len > len) { + if (tcp_fragment(sk, skb, len, cur_mss, GFP_ATOMIC)) return -ENOMEM; /* We'll try again later. */ } else { - int oldpcount = tcp_skb_pcount(skb); + if (skb_unclone(skb, GFP_ATOMIC)) + return -ENOMEM; - if (unlikely(oldpcount > 1)) { - if (skb_unclone(skb, GFP_ATOMIC)) - return -ENOMEM; - tcp_init_tso_segs(skb, cur_mss); - tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb)); - } + diff = tcp_skb_pcount(skb); + tcp_set_skb_tso_segs(skb, cur_mss); + diff -= tcp_skb_pcount(skb); + if (diff) + tcp_adjust_pcount(sk, skb, diff); + if (skb->len < cur_mss) + tcp_retrans_try_collapse(sk, skb, cur_mss); } /* RFC3168, section 6.1.1.1. ECN fallback */ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN) tcp_ecn_clear_syn(sk, skb); - tcp_retrans_try_collapse(sk, skb, cur_mss); - - /* Make a copy, if the first transmission SKB clone we made - * is still in somebody's hands, else make a clone. - */ - /* make sure skb->data is aligned on arches that require it * and check if ack-trimming & collapsing extended the headroom * beyond what csum_start can cover. @@ -2658,20 +2675,22 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) } if (likely(!err)) { + segs = tcp_skb_pcount(skb); + TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; /* Update global TCP statistics. */ - TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); + TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs); if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); - tp->total_retrans++; + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); + tp->total_retrans += segs; } return err; } -int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) +int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) { struct tcp_sock *tp = tcp_sk(sk); - int err = __tcp_retransmit_skb(sk, skb); + int err = __tcp_retransmit_skb(sk, skb, segs); if (err == 0) { #if FASTRETRANS_DEBUG > 0 @@ -2687,7 +2706,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) tp->retrans_stamp = tcp_skb_timestamp(skb); } else if (err != -EBUSY) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL); } if (tp->undo_retrans < 0) @@ -2740,7 +2759,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; struct sk_buff *hole = NULL; - u32 last_lost; + u32 max_segs, last_lost; int mib_idx; int fwd_rexmitting = 0; @@ -2760,8 +2779,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk) last_lost = tp->snd_una; } + max_segs = tcp_tso_autosize(sk, tcp_current_mss(sk)); tcp_for_write_queue_from(skb, sk) { __u8 sacked = TCP_SKB_CB(skb)->sacked; + int segs; if (skb == tcp_send_head(sk)) break; @@ -2769,15 +2790,13 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (!hole) tp->retransmit_skb_hint = skb; - /* Assume this retransmit will generate - * only one packet for congestion window - * calculation purposes. This works because - * tcp_retransmit_skb() will chop up the - * packet to be MSS sized and all the - * packet counting works out. - */ - if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) + segs = tp->snd_cwnd - tcp_packets_in_flight(tp); + if (segs <= 0) return; + /* In case tcp_shift_skb_data() have aggregated large skbs, + * we need to make sure not sending too bigs TSO packets + */ + segs = min_t(int, segs, max_segs); if (fwd_rexmitting) { begin_fwd: @@ -2813,10 +2832,10 @@ begin_fwd: if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS)) continue; - if (tcp_retransmit_skb(sk, skb)) + if (tcp_retransmit_skb(sk, skb, segs)) return; - NET_INC_STATS_BH(sock_net(sk), mib_idx); + NET_INC_STATS(sock_net(sk), mib_idx); if (tcp_in_cwnd_reduction(sk)) tp->prr_out += tcp_skb_pcount(skb); @@ -2969,7 +2988,7 @@ int tcp_send_synack(struct sock *sk) struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, struct request_sock *req, struct tcp_fastopen_cookie *foc, - bool attach_req) + enum tcp_synack_type synack_type) { struct inet_request_sock *ireq = inet_rsk(req); const struct tcp_sock *tp = tcp_sk(sk); @@ -2989,14 +3008,22 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, /* Reserve space for headers. */ skb_reserve(skb, MAX_TCP_HEADER); - if (attach_req) { + switch (synack_type) { + case TCP_SYNACK_NORMAL: skb_set_owner_w(skb, req_to_sk(req)); - } else { + break; + case TCP_SYNACK_COOKIE: + /* Under synflood, we do not attach skb to a socket, + * to avoid false sharing. + */ + break; + case TCP_SYNACK_FASTOPEN: /* sk is a const pointer, because we want to express multiple * cpu might call us concurrently. * sk->sk_wmem_alloc in an atomic, we can promote to rw. */ skb_set_owner_w(skb, (struct sock *)sk); + break; } skb_dst_set(skb, dst); @@ -3024,7 +3051,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); - th = tcp_hdr(skb); + th = (struct tcphdr *)skb->data; memset(th, 0, sizeof(struct tcphdr)); th->syn = 1; th->ack = 1; @@ -3045,7 +3072,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, th->window = htons(min(req->rsk_rcv_wnd, 65535U)); tcp_options_write((__be32 *)(th + 1), NULL, &opts); th->doff = (tcp_header_size >> 2); - TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS); + __TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); #ifdef CONFIG_TCP_MD5SIG /* Okay, we have all we need - do the md5 hash if needed */ @@ -3549,10 +3576,10 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) int res; tcp_rsk(req)->txhash = net_tx_rndhash(); - res = af_ops->send_synack(sk, NULL, &fl, req, NULL, true); + res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL); if (!res) { - TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); + __TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); } return res; } diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index 5353085fd..e36df4fcf 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -65,8 +65,8 @@ int tcp_rack_mark_lost(struct sock *sk) if (scb->sacked & TCPCB_SACKED_RETRANS) { scb->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPLOSTRETRANSMIT); + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPLOSTRETRANSMIT); } } else if (!(scb->sacked & TCPCB_RETRANS)) { /* Original data are sent sequentially so stop early diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 49bc474f8..debdd8b33 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -30,7 +30,7 @@ static void tcp_write_err(struct sock *sk) sk->sk_error_report(sk); tcp_done(sk); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT); } /* Do not allow orphaned sockets to eat all our resources. @@ -68,7 +68,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) if (do_reset) tcp_send_active_reset(sk, GFP_ATOMIC); tcp_done(sk); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); return 1; } return 0; @@ -162,8 +162,8 @@ static int tcp_write_timeout(struct sock *sk) if (tp->syn_fastopen || tp->syn_data) tcp_fastopen_cache_set(sk, 0, NULL, true, 0); if (tp->syn_data && icsk->icsk_retransmits == 1) - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPFASTOPENACTIVEFAIL); + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPFASTOPENACTIVEFAIL); } retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries; syn_set = true; @@ -178,8 +178,8 @@ static int tcp_write_timeout(struct sock *sk) tp->bytes_acked <= tp->rx_opt.mss_clamp) { tcp_fastopen_cache_set(sk, 0, NULL, true, 0); if (icsk->icsk_retransmits == net->ipv4.sysctl_tcp_retries1) - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPFASTOPENACTIVEFAIL); + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPFASTOPENACTIVEFAIL); } /* Black hole detection */ tcp_mtu_probing(icsk, sk); @@ -209,6 +209,7 @@ static int tcp_write_timeout(struct sock *sk) return 0; } +/* Called with BH disabled */ void tcp_delack_timer_handler(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -228,7 +229,7 @@ void tcp_delack_timer_handler(struct sock *sk) if (!skb_queue_empty(&tp->ucopy.prequeue)) { struct sk_buff *skb; - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSCHEDULERFAILED); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSCHEDULERFAILED); while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) sk_backlog_rcv(sk, skb); @@ -248,7 +249,7 @@ void tcp_delack_timer_handler(struct sock *sk) icsk->icsk_ack.ato = TCP_ATO_MIN; } tcp_send_ack(sk); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKS); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS); } out: @@ -265,7 +266,7 @@ static void tcp_delack_timer(unsigned long data) tcp_delack_timer_handler(sk); } else { inet_csk(sk)->icsk_ack.blocked = 1; - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); /* deleguate our work to tcp_release_cb() */ if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags)) sock_hold(sk); @@ -404,7 +405,7 @@ void tcp_retransmit_timer(struct sock *sk) goto out; } tcp_enter_loss(sk); - tcp_retransmit_skb(sk, tcp_write_queue_head(sk)); + tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1); __sk_dst_reset(sk); goto out_reset_timer; } @@ -431,12 +432,12 @@ void tcp_retransmit_timer(struct sock *sk) } else { mib_idx = LINUX_MIB_TCPTIMEOUTS; } - NET_INC_STATS_BH(sock_net(sk), mib_idx); + __NET_INC_STATS(sock_net(sk), mib_idx); } tcp_enter_loss(sk); - if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) { + if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1) > 0) { /* Retransmission failed because of local congestion, * do not backoff. */ @@ -493,6 +494,7 @@ out_reset_timer: out:; } +/* Called with BH disabled */ void tcp_write_timer_handler(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -549,7 +551,7 @@ void tcp_syn_ack_timeout(const struct request_sock *req) { struct net *net = read_pnet(&inet_rsk(req)->ireq_net); - NET_INC_STATS_BH(net, LINUX_MIB_TCPTIMEOUTS); + __NET_INC_STATS(net, LINUX_MIB_TCPTIMEOUTS); } EXPORT_SYMBOL(tcp_syn_ack_timeout); diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 13951c408..4c4bac1b5 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -107,16 +107,16 @@ EXPORT_SYMBOL_GPL(tcp_vegas_init); * o min-filter RTT samples from a much longer window (forever for now) * to find the propagation delay (baseRTT) */ -void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us) +void tcp_vegas_pkts_acked(struct sock *sk, const struct ack_sample *sample) { struct vegas *vegas = inet_csk_ca(sk); u32 vrtt; - if (rtt_us < 0) + if (sample->rtt_us < 0) return; /* Never allow zero rtt or baseRTT */ - vrtt = rtt_us + 1; + vrtt = sample->rtt_us + 1; /* Filter to find propagation delay: */ if (vrtt < vegas->baseRTT) diff --git a/net/ipv4/tcp_vegas.h b/net/ipv4/tcp_vegas.h index ef9da5306..248cfc0ff 100644 --- a/net/ipv4/tcp_vegas.h +++ b/net/ipv4/tcp_vegas.h @@ -17,7 +17,7 @@ struct vegas { void tcp_vegas_init(struct sock *sk); void tcp_vegas_state(struct sock *sk, u8 ca_state); -void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us); +void tcp_vegas_pkts_acked(struct sock *sk, const struct ack_sample *sample); void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event); size_t tcp_vegas_get_info(struct sock *sk, u32 ext, int *attr, union tcp_cc_info *info); diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index 0d094b995..40171e163 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -69,16 +69,17 @@ static void tcp_veno_init(struct sock *sk) } /* Do rtt sampling needed for Veno. */ -static void tcp_veno_pkts_acked(struct sock *sk, u32 cnt, s32 rtt_us) +static void tcp_veno_pkts_acked(struct sock *sk, + const struct ack_sample *sample) { struct veno *veno = inet_csk_ca(sk); u32 vrtt; - if (rtt_us < 0) + if (sample->rtt_us < 0) return; /* Never allow zero rtt or baseRTT */ - vrtt = rtt_us + 1; + vrtt = sample->rtt_us + 1; /* Filter to find propagation delay: */ if (vrtt < veno->basertt) diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index c10732e39..4b03a2e2a 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -99,12 +99,13 @@ static void westwood_filter(struct westwood *w, u32 delta) * Called after processing group of packets. * but all westwood needs is the last sample of srtt. */ -static void tcp_westwood_pkts_acked(struct sock *sk, u32 cnt, s32 rtt) +static void tcp_westwood_pkts_acked(struct sock *sk, + const struct ack_sample *sample) { struct westwood *w = inet_csk_ca(sk); - if (rtt > 0) - w->rtt = usecs_to_jiffies(rtt); + if (sample->rtt_us > 0) + w->rtt = usecs_to_jiffies(sample->rtt_us); } /* diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index 3e6a472e6..028eb046e 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -56,15 +56,16 @@ static void tcp_yeah_init(struct sock *sk) tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128); } -static void tcp_yeah_pkts_acked(struct sock *sk, u32 pkts_acked, s32 rtt_us) +static void tcp_yeah_pkts_acked(struct sock *sk, + const struct ack_sample *sample) { const struct inet_connection_sock *icsk = inet_csk(sk); struct yeah *yeah = inet_csk_ca(sk); if (icsk->icsk_ca_state == TCP_CA_Open) - yeah->pkts_acked = pkts_acked; + yeah->pkts_acked = sample->pkts_acked; - tcp_vegas_pkts_acked(sk, pkts_acked, rtt_us); + tcp_vegas_pkts_acked(sk, sample); } static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index e9853dff7..e61f7cd65 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -143,10 +143,9 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num, unsigned int log) { struct sock *sk2; - struct hlist_nulls_node *node; kuid_t uid = sock_i_uid(sk); - sk_nulls_for_each(sk2, node, &hslot->head) { + sk_for_each(sk2, &hslot->head) { if (net_eq(sock_net(sk2), net) && sk2 != sk && (bitmap || udp_sk(sk2)->udp_port_hash == num) && @@ -177,12 +176,11 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num, bool match_wildcard)) { struct sock *sk2; - struct hlist_nulls_node *node; kuid_t uid = sock_i_uid(sk); int res = 0; spin_lock(&hslot2->lock); - udp_portaddr_for_each_entry(sk2, node, &hslot2->head) { + udp_portaddr_for_each_entry(sk2, &hslot2->head) { if (net_eq(sock_net(sk2), net) && sk2 != sk && (udp_sk(sk2)->udp_port_hash == num) && @@ -207,11 +205,10 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot, bool match_wildcard)) { struct net *net = sock_net(sk); - struct hlist_nulls_node *node; kuid_t uid = sock_i_uid(sk); struct sock *sk2; - sk_nulls_for_each(sk2, node, &hslot->head) { + sk_for_each(sk2, &hslot->head) { if (net_eq(sock_net(sk2), net) && sk2 != sk && sk2->sk_family == sk->sk_family && @@ -333,22 +330,23 @@ found: goto fail_unlock; } - sk_nulls_add_node_rcu(sk, &hslot->head); + sk_add_node_rcu(sk, &hslot->head); hslot->count++; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); spin_lock(&hslot2->lock); if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && - sk->sk_family == AF_INET6) - hlist_nulls_add_tail_rcu(&udp_sk(sk)->udp_portaddr_node, - &hslot2->head); + sk->sk_family == AF_INET6) + hlist_add_tail_rcu(&udp_sk(sk)->udp_portaddr_node, + &hslot2->head); else - hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, - &hslot2->head); + hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, + &hslot2->head); hslot2->count++; spin_unlock(&hslot2->lock); } + sock_set_flag(sk, SOCK_RCU_FREE); error = 0; fail_unlock: spin_unlock_bh(&hslot->lock); @@ -393,9 +391,9 @@ int udp_v4_get_port(struct sock *sk, unsigned short snum) return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal, hash2_nulladdr); } -static inline int compute_score(struct sock *sk, struct net *net, - __be32 saddr, unsigned short hnum, __be16 sport, - __be32 daddr, __be16 dport, int dif) +static int compute_score(struct sock *sk, struct net *net, + __be32 saddr, __be16 sport, + __be32 daddr, unsigned short hnum, int dif) { int score; struct inet_sock *inet; @@ -436,52 +434,6 @@ static inline int compute_score(struct sock *sk, struct net *net, return score; } -/* - * In this second variant, we check (daddr, dport) matches (inet_rcv_sadd, inet_num) - */ -static inline int compute_score2(struct sock *sk, struct net *net, - __be32 saddr, __be16 sport, - __be32 daddr, unsigned int hnum, int dif) -{ - int score; - struct inet_sock *inet; - - if (!net_eq(sock_net(sk), net) || - ipv6_only_sock(sk)) - return -1; - - inet = inet_sk(sk); - - if (inet->inet_rcv_saddr != daddr || - inet->inet_num != hnum) - return -1; - - score = (sk->sk_family == PF_INET) ? 2 : 1; - - if (inet->inet_daddr) { - if (inet->inet_daddr != saddr) - return -1; - score += 4; - } - - if (inet->inet_dport) { - if (inet->inet_dport != sport) - return -1; - score += 4; - } - - if (sk->sk_bound_dev_if) { - if (sk->sk_bound_dev_if != dif) - return -1; - score += 4; - } - - if (sk->sk_incoming_cpu == raw_smp_processor_id()) - score++; - - return score; -} - static u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, const __be32 faddr, const __be16 fport) @@ -494,45 +446,35 @@ static u32 udp_ehashfn(const struct net *net, const __be32 laddr, udp_ehash_secret + net_hash_mix(net)); } -/* called with read_rcu_lock() */ +/* called with rcu_read_lock() */ static struct sock *udp4_lib_lookup2(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, unsigned int hnum, int dif, - struct udp_hslot *hslot2, unsigned int slot2, + struct udp_hslot *hslot2, struct sk_buff *skb) { struct sock *sk, *result; - struct hlist_nulls_node *node; int score, badness, matches = 0, reuseport = 0; - bool select_ok = true; u32 hash = 0; -begin: result = NULL; badness = 0; - udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) { - score = compute_score2(sk, net, saddr, sport, + udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { + score = compute_score(sk, net, saddr, sport, daddr, hnum, dif); if (score > badness) { - result = sk; - badness = score; reuseport = sk->sk_reuseport; if (reuseport) { hash = udp_ehashfn(net, daddr, hnum, saddr, sport); - if (select_ok) { - struct sock *sk2; - - sk2 = reuseport_select_sock(sk, hash, skb, + result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); - if (sk2) { - result = sk2; - select_ok = false; - goto found; - } - } + if (result) + return result; matches = 1; } + badness = score; + result = sk; } else if (score == badness && reuseport) { matches++; if (reciprocal_scale(hash, matches) == 0) @@ -540,23 +482,6 @@ begin: hash = next_pseudo_random32(hash); } } - /* - * if the nulls value we got at the end of this lookup is - * not the expected one, we must restart lookup. - * We probably met an item that was moved to another chain. - */ - if (get_nulls_value(node) != slot2) - goto begin; - if (result) { -found: - if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) - result = NULL; - else if (unlikely(compute_score2(result, net, saddr, sport, - daddr, hnum, dif) < badness)) { - sock_put(result); - goto begin; - } - } return result; } @@ -568,15 +493,12 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, int dif, struct udp_table *udptable, struct sk_buff *skb) { struct sock *sk, *result; - struct hlist_nulls_node *node; unsigned short hnum = ntohs(dport); unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; int score, badness, matches = 0, reuseport = 0; - bool select_ok = true; u32 hash = 0; - rcu_read_lock(); if (hslot->count > 10) { hash2 = udp4_portaddr_hash(net, daddr, hnum); slot2 = hash2 & udptable->mask; @@ -586,47 +508,44 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, result = udp4_lib_lookup2(net, saddr, sport, daddr, hnum, dif, - hslot2, slot2, skb); + hslot2, skb); if (!result) { + unsigned int old_slot2 = slot2; hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum); slot2 = hash2 & udptable->mask; + /* avoid searching the same slot again. */ + if (unlikely(slot2 == old_slot2)) + return result; + hslot2 = &udptable->hash2[slot2]; if (hslot->count < hslot2->count) goto begin; result = udp4_lib_lookup2(net, saddr, sport, - htonl(INADDR_ANY), hnum, dif, - hslot2, slot2, skb); + daddr, hnum, dif, + hslot2, skb); } - rcu_read_unlock(); return result; } begin: result = NULL; badness = 0; - sk_nulls_for_each_rcu(sk, node, &hslot->head) { - score = compute_score(sk, net, saddr, hnum, sport, - daddr, dport, dif); + sk_for_each_rcu(sk, &hslot->head) { + score = compute_score(sk, net, saddr, sport, + daddr, hnum, dif); if (score > badness) { - result = sk; - badness = score; reuseport = sk->sk_reuseport; if (reuseport) { hash = udp_ehashfn(net, daddr, hnum, saddr, sport); - if (select_ok) { - struct sock *sk2; - - sk2 = reuseport_select_sock(sk, hash, skb, + result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); - if (sk2) { - result = sk2; - select_ok = false; - goto found; - } - } + if (result) + return result; matches = 1; } + result = sk; + badness = score; } else if (score == badness && reuseport) { matches++; if (reciprocal_scale(hash, matches) == 0) @@ -634,25 +553,6 @@ begin: hash = next_pseudo_random32(hash); } } - /* - * if the nulls value we got at the end of this lookup is - * not the expected one, we must restart lookup. - * We probably met an item that was moved to another chain. - */ - if (get_nulls_value(node) != slot) - goto begin; - - if (result) { -found: - if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) - result = NULL; - else if (unlikely(compute_score(result, net, saddr, hnum, sport, - daddr, dport, dif) < badness)) { - sock_put(result); - goto begin; - } - } - rcu_read_unlock(); return result; } EXPORT_SYMBOL_GPL(__udp4_lib_lookup); @@ -663,18 +563,36 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb, { const struct iphdr *iph = ip_hdr(skb); - return __udp4_lib_lookup(dev_net(skb_dst(skb)->dev), iph->saddr, sport, + return __udp4_lib_lookup(dev_net(skb->dev), iph->saddr, sport, iph->daddr, dport, inet_iif(skb), udptable, skb); } +struct sock *udp4_lib_lookup_skb(struct sk_buff *skb, + __be16 sport, __be16 dport) +{ + return __udp4_lib_lookup_skb(skb, sport, dport, &udp_table); +} +EXPORT_SYMBOL_GPL(udp4_lib_lookup_skb); + +/* Must be called under rcu_read_lock(). + * Does increment socket refcount. + */ +#if IS_ENABLED(CONFIG_NETFILTER_XT_MATCH_SOCKET) || \ + IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY) struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif) { - return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif, - &udp_table, NULL); + struct sock *sk; + + sk = __udp4_lib_lookup(net, saddr, sport, daddr, dport, + dif, &udp_table, NULL); + if (sk && !atomic_inc_not_zero(&sk->sk_refcnt)) + sk = NULL; + return sk; } EXPORT_SYMBOL_GPL(udp4_lib_lookup); +#endif static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk, __be16 loc_port, __be32 loc_addr, @@ -723,7 +641,7 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) iph->saddr, uh->source, skb->dev->ifindex, udptable, NULL); if (!sk) { - ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); + __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return; /* No socket for error */ } @@ -776,7 +694,7 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) sk->sk_err = err; sk->sk_error_report(sk); out: - sock_put(sk); + return; } void udp_err(struct sk_buff *skb, u32 info) @@ -917,13 +835,13 @@ send: err = ip_send_skb(sock_net(sk), skb); if (err) { if (err == -ENOBUFS && !inet->recverr) { - UDP_INC_STATS_USER(sock_net(sk), - UDP_MIB_SNDBUFERRORS, is_udplite); + UDP_INC_STATS(sock_net(sk), + UDP_MIB_SNDBUFERRORS, is_udplite); err = 0; } } else - UDP_INC_STATS_USER(sock_net(sk), - UDP_MIB_OUTDATAGRAMS, is_udplite); + UDP_INC_STATS(sock_net(sk), + UDP_MIB_OUTDATAGRAMS, is_udplite); return err; } @@ -1032,15 +950,13 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) */ connected = 1; } - ipc.addr = inet->inet_saddr; + ipc.sockc.tsflags = sk->sk_tsflags; + ipc.addr = inet->inet_saddr; ipc.oif = sk->sk_bound_dev_if; - sock_tx_timestamp(sk, &ipc.tx_flags); - if (msg->msg_controllen) { - err = ip_cmsg_send(sock_net(sk), msg, &ipc, - sk->sk_family == AF_INET6); + err = ip_cmsg_send(sk, msg, &ipc, sk->sk_family == AF_INET6); if (unlikely(err)) { kfree(ipc.opt); return err; @@ -1065,6 +981,8 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) saddr = ipc.addr; ipc.addr = faddr = daddr; + sock_tx_timestamp(sk, ipc.sockc.tsflags, &ipc.tx_flags); + if (ipc.opt && ipc.opt->opt.srr) { if (!daddr) return -EINVAL; @@ -1192,8 +1110,8 @@ out: * seems like overkill. */ if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { - UDP_INC_STATS_USER(sock_net(sk), - UDP_MIB_SNDBUFERRORS, is_udplite); + UDP_INC_STATS(sock_net(sk), + UDP_MIB_SNDBUFERRORS, is_udplite); } return err; @@ -1277,10 +1195,10 @@ static unsigned int first_packet_length(struct sock *sk) spin_lock_bh(&rcvq->lock); while ((skb = skb_peek(rcvq)) != NULL && udp_lib_checksum_complete(skb)) { - UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_CSUMERRORS, - IS_UDPLITE(sk)); - UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, - IS_UDPLITE(sk)); + __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, + IS_UDPLITE(sk)); + __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, + IS_UDPLITE(sk)); atomic_inc(&sk->sk_drops); __skb_unlink(skb, rcvq); __skb_queue_tail(&list_kill, skb); @@ -1316,14 +1234,6 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) { unsigned int amount = first_packet_length(sk); - if (amount) - /* - * We will only return the amount - * of this packet since that is all - * that will be read. - */ - amount -= sizeof(struct udphdr); - return put_user(amount, (int __user *)arg); } @@ -1347,7 +1257,7 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); struct sk_buff *skb; unsigned int ulen, copied; - int peeked, off = 0; + int peeked, peeking, off; int err; int is_udplite = IS_UDPLITE(sk); bool checksum_valid = false; @@ -1357,15 +1267,16 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, return ip_recv_error(sk, msg, len, addr_len); try_again: + peeking = off = sk_peek_offset(sk, flags); skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0), &peeked, &off, &err); if (!skb) - goto out; + return err; - ulen = skb->len - sizeof(struct udphdr); + ulen = skb->len; copied = len; - if (copied > ulen) - copied = ulen; + if (copied > ulen - off) + copied = ulen - off; else if (copied < ulen) msg->msg_flags |= MSG_TRUNC; @@ -1375,18 +1286,16 @@ try_again: * coverage checksum (UDP-Lite), do it before the copy. */ - if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) { + if (copied < ulen || UDP_SKB_CB(skb)->partial_cov || peeking) { checksum_valid = !udp_lib_checksum_complete(skb); if (!checksum_valid) goto csum_copy_err; } if (checksum_valid || skb_csum_unnecessary(skb)) - err = skb_copy_datagram_msg(skb, sizeof(struct udphdr), - msg, copied); + err = skb_copy_datagram_msg(skb, off, msg, copied); else { - err = skb_copy_and_csum_datagram_msg(skb, sizeof(struct udphdr), - msg); + err = skb_copy_and_csum_datagram_msg(skb, off, msg); if (err == -EINVAL) goto csum_copy_err; @@ -1396,15 +1305,16 @@ try_again: trace_kfree_skb(skb, udp_recvmsg); if (!peeked) { atomic_inc(&sk->sk_drops); - UDP_INC_STATS_USER(sock_net(sk), - UDP_MIB_INERRORS, is_udplite); + UDP_INC_STATS(sock_net(sk), + UDP_MIB_INERRORS, is_udplite); } - goto out_free; + skb_free_datagram_locked(sk, skb); + return err; } if (!peeked) - UDP_INC_STATS_USER(sock_net(sk), - UDP_MIB_INDATAGRAMS, is_udplite); + UDP_INC_STATS(sock_net(sk), + UDP_MIB_INDATAGRAMS, is_udplite); sock_recv_ts_and_drops(msg, sk, skb); @@ -1417,22 +1327,20 @@ try_again: *addr_len = sizeof(*sin); } if (inet->cmsg_flags) - ip_cmsg_recv_offset(msg, skb, sizeof(struct udphdr)); + ip_cmsg_recv_offset(msg, skb, sizeof(struct udphdr) + off); err = copied; if (flags & MSG_TRUNC) err = ulen; -out_free: - skb_free_datagram_locked(sk, skb); -out: + __skb_free_datagram_locked(sk, skb, peeking ? -err : err); return err; csum_copy_err: slow = lock_sock_fast(sk); if (!skb_kill_datagram(sk, skb, flags)) { - UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); - UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_INERRORS, is_udplite); + UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); + UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); } unlock_sock_fast(sk, slow); @@ -1479,13 +1387,13 @@ void udp_lib_unhash(struct sock *sk) spin_lock_bh(&hslot->lock); if (rcu_access_pointer(sk->sk_reuseport_cb)) reuseport_detach_sock(sk); - if (sk_nulls_del_node_init_rcu(sk)) { + if (sk_del_node_init_rcu(sk)) { hslot->count--; inet_sk(sk)->inet_num = 0; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); spin_lock(&hslot2->lock); - hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node); + hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node); hslot2->count--; spin_unlock(&hslot2->lock); } @@ -1518,12 +1426,12 @@ void udp_lib_rehash(struct sock *sk, u16 newhash) if (hslot2 != nhslot2) { spin_lock(&hslot2->lock); - hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node); + hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node); hslot2->count--; spin_unlock(&hslot2->lock); spin_lock(&nhslot2->lock); - hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, + hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, &nhslot2->head); nhslot2->count++; spin_unlock(&nhslot2->lock); @@ -1553,15 +1461,15 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) sk_incoming_cpu_update(sk); } - rc = sock_queue_rcv_skb(sk, skb); + rc = __sock_queue_rcv_skb(sk, skb); if (rc < 0) { int is_udplite = IS_UDPLITE(sk); /* Note that an ENOMEM error is charged twice */ if (rc == -ENOMEM) - UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, - is_udplite); - UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite); + UDP_INC_STATS(sock_net(sk), UDP_MIB_RCVBUFERRORS, + is_udplite); + UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); kfree_skb(skb); trace_udp_fail_queue_rcv_skb(rc, sk); return -1; @@ -1625,9 +1533,9 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) ret = encap_rcv(sk, skb); if (ret <= 0) { - UDP_INC_STATS_BH(sock_net(sk), - UDP_MIB_INDATAGRAMS, - is_udplite); + __UDP_INC_STATS(sock_net(sk), + UDP_MIB_INDATAGRAMS, + is_udplite); return -ret; } } @@ -1671,11 +1579,15 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) if (rcu_access_pointer(sk->sk_filter) && udp_lib_checksum_complete(skb)) - goto csum_error; + goto csum_error; + if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr))) + goto drop; + + udp_csum_pull_header(skb); if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { - UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, - is_udplite); + __UDP_INC_STATS(sock_net(sk), UDP_MIB_RCVBUFERRORS, + is_udplite); goto drop; } @@ -1694,43 +1606,14 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) return rc; csum_error: - UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); + __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); drop: - UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite); + __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); atomic_inc(&sk->sk_drops); kfree_skb(skb); return -1; } -static void flush_stack(struct sock **stack, unsigned int count, - struct sk_buff *skb, unsigned int final) -{ - unsigned int i; - struct sk_buff *skb1 = NULL; - struct sock *sk; - - for (i = 0; i < count; i++) { - sk = stack[i]; - if (likely(!skb1)) - skb1 = (i == final) ? skb : skb_clone(skb, GFP_ATOMIC); - - if (!skb1) { - atomic_inc(&sk->sk_drops); - UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, - IS_UDPLITE(sk)); - UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, - IS_UDPLITE(sk)); - } - - if (skb1 && udp_queue_rcv_skb(sk, skb1) <= 0) - skb1 = NULL; - - sock_put(sk); - } - if (unlikely(skb1)) - kfree_skb(skb1); -} - /* For TCP sockets, sk_rx_dst is protected by socket lock * For UDP, we use xchg() to guard against concurrent changes. */ @@ -1754,14 +1637,14 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, struct udp_table *udptable, int proto) { - struct sock *sk, *stack[256 / sizeof(struct sock *)]; - struct hlist_nulls_node *node; + struct sock *sk, *first = NULL; unsigned short hnum = ntohs(uh->dest); struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum); - int dif = skb->dev->ifindex; - unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node); unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10); - bool inner_flushed = false; + unsigned int offset = offsetof(typeof(*sk), sk_node); + int dif = skb->dev->ifindex; + struct hlist_node *node; + struct sk_buff *nskb; if (use_hash2) { hash2_any = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum) & @@ -1772,23 +1655,28 @@ start_lookup: offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node); } - spin_lock(&hslot->lock); - sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) { - if (__udp_is_mcast_sock(net, sk, - uh->dest, daddr, - uh->source, saddr, - dif, hnum)) { - if (unlikely(count == ARRAY_SIZE(stack))) { - flush_stack(stack, count, skb, ~0); - inner_flushed = true; - count = 0; - } - stack[count++] = sk; - sock_hold(sk); + sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) { + if (!__udp_is_mcast_sock(net, sk, uh->dest, daddr, + uh->source, saddr, dif, hnum)) + continue; + + if (!first) { + first = sk; + continue; } - } + nskb = skb_clone(skb, GFP_ATOMIC); - spin_unlock(&hslot->lock); + if (unlikely(!nskb)) { + atomic_inc(&sk->sk_drops); + __UDP_INC_STATS(net, UDP_MIB_RCVBUFERRORS, + IS_UDPLITE(sk)); + __UDP_INC_STATS(net, UDP_MIB_INERRORS, + IS_UDPLITE(sk)); + continue; + } + if (udp_queue_rcv_skb(sk, nskb) > 0) + consume_skb(nskb); + } /* Also lookup *:port if we are using hash2 and haven't done so yet. */ if (use_hash2 && hash2 != hash2_any) { @@ -1796,16 +1684,13 @@ start_lookup: goto start_lookup; } - /* - * do the slow work with no lock held - */ - if (count) { - flush_stack(stack, count, skb, count - 1); + if (first) { + if (udp_queue_rcv_skb(first, skb) > 0) + consume_skb(skb); } else { - if (!inner_flushed) - UDP_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI, - proto == IPPROTO_UDPLITE); - consume_skb(skb); + kfree_skb(skb); + __UDP_INC_STATS(net, UDP_MIB_IGNOREDMULTI, + proto == IPPROTO_UDPLITE); } return 0; } @@ -1829,8 +1714,11 @@ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh, return err; } - return skb_checksum_init_zero_check(skb, proto, uh->check, - inet_compute_pseudo); + /* Note, we are only interested in != 0 or == 0, thus the + * force to int. + */ + return (__force int)skb_checksum_init_zero_check(skb, proto, uh->check, + inet_compute_pseudo); } /* @@ -1902,7 +1790,6 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, inet_compute_pseudo); ret = udp_queue_rcv_skb(sk, skb); - sock_put(sk); /* a return value > 0 means to resubmit the input, but * it wants the return to be -protocol, or 0 @@ -1920,7 +1807,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (udp_lib_checksum_complete(skb)) goto csum_error; - UDP_INC_STATS_BH(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE); + __UDP_INC_STATS(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); /* @@ -1947,9 +1834,9 @@ csum_error: proto == IPPROTO_UDPLITE ? "Lite" : "", &saddr, ntohs(uh->source), &daddr, ntohs(uh->dest), ulen); - UDP_INC_STATS_BH(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE); + __UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE); drop: - UDP_INC_STATS_BH(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE); + __UDP_INC_STATS(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE); kfree_skb(skb); return 0; } @@ -1963,49 +1850,24 @@ static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net, int dif) { struct sock *sk, *result; - struct hlist_nulls_node *node; unsigned short hnum = ntohs(loc_port); - unsigned int count, slot = udp_hashfn(net, hnum, udp_table.mask); + unsigned int slot = udp_hashfn(net, hnum, udp_table.mask); struct udp_hslot *hslot = &udp_table.hash[slot]; /* Do not bother scanning a too big list */ if (hslot->count > 10) return NULL; - rcu_read_lock(); -begin: - count = 0; result = NULL; - sk_nulls_for_each_rcu(sk, node, &hslot->head) { - if (__udp_is_mcast_sock(net, sk, - loc_port, loc_addr, - rmt_port, rmt_addr, - dif, hnum)) { + sk_for_each_rcu(sk, &hslot->head) { + if (__udp_is_mcast_sock(net, sk, loc_port, loc_addr, + rmt_port, rmt_addr, dif, hnum)) { + if (result) + return NULL; result = sk; - ++count; } } - /* - * if the nulls value we got at the end of this lookup is - * not the expected one, we must restart lookup. - * We probably met an item that was moved to another chain. - */ - if (get_nulls_value(node) != slot) - goto begin; - - if (result) { - if (count != 1 || - unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) - result = NULL; - else if (unlikely(!__udp_is_mcast_sock(net, result, - loc_port, loc_addr, - rmt_port, rmt_addr, - dif, hnum))) { - sock_put(result); - result = NULL; - } - } - rcu_read_unlock(); + return result; } @@ -2018,37 +1880,22 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net, __be16 rmt_port, __be32 rmt_addr, int dif) { - struct sock *sk, *result; - struct hlist_nulls_node *node; unsigned short hnum = ntohs(loc_port); unsigned int hash2 = udp4_portaddr_hash(net, loc_addr, hnum); unsigned int slot2 = hash2 & udp_table.mask; struct udp_hslot *hslot2 = &udp_table.hash2[slot2]; INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr); const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum); + struct sock *sk; - rcu_read_lock(); - result = NULL; - udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) { - if (INET_MATCH(sk, net, acookie, - rmt_addr, loc_addr, ports, dif)) - result = sk; + udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { + if (INET_MATCH(sk, net, acookie, rmt_addr, + loc_addr, ports, dif)) + return sk; /* Only check first socket in chain */ break; } - - if (result) { - if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) - result = NULL; - else if (unlikely(!INET_MATCH(sk, net, acookie, - rmt_addr, loc_addr, - ports, dif))) { - sock_put(result); - result = NULL; - } - } - rcu_read_unlock(); - return result; + return NULL; } void udp_v4_early_demux(struct sk_buff *skb) @@ -2056,7 +1903,7 @@ void udp_v4_early_demux(struct sk_buff *skb) struct net *net = dev_net(skb->dev); const struct iphdr *iph; const struct udphdr *uh; - struct sock *sk; + struct sock *sk = NULL; struct dst_entry *dst; int dif = skb->dev->ifindex; int ours; @@ -2088,11 +1935,9 @@ void udp_v4_early_demux(struct sk_buff *skb) } else if (skb->pkt_type == PACKET_HOST) { sk = __udp4_lib_demux_lookup(net, uh->dest, iph->daddr, uh->source, iph->saddr, dif); - } else { - return; } - if (!sk) + if (!sk || !atomic_inc_not_zero_hint(&sk->sk_refcnt, 2)) return; skb->sk = sk; @@ -2392,14 +2237,13 @@ static struct sock *udp_get_first(struct seq_file *seq, int start) for (state->bucket = start; state->bucket <= state->udp_table->mask; ++state->bucket) { - struct hlist_nulls_node *node; struct udp_hslot *hslot = &state->udp_table->hash[state->bucket]; - if (hlist_nulls_empty(&hslot->head)) + if (hlist_empty(&hslot->head)) continue; spin_lock_bh(&hslot->lock); - sk_nulls_for_each(sk, node, &hslot->head) { + sk_for_each(sk, &hslot->head) { if (!net_eq(sock_net(sk), net)) continue; if (sk->sk_family == state->family) @@ -2418,7 +2262,7 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk) struct net *net = seq_file_net(seq); do { - sk = sk_nulls_next(sk); + sk = sk_next(sk); } while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family)); if (!sk) { @@ -2627,12 +2471,12 @@ void __init udp_table_init(struct udp_table *table, const char *name) table->hash2 = table->hash + (table->mask + 1); for (i = 0; i <= table->mask; i++) { - INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i); + INIT_HLIST_HEAD(&table->hash[i].head); table->hash[i].count = 0; spin_lock_init(&table->hash[i].lock); } for (i = 0; i <= table->mask; i++) { - INIT_HLIST_NULLS_HEAD(&table->hash2[i].head, i); + INIT_HLIST_HEAD(&table->hash2[i].head); table->hash2[i].count = 0; spin_lock_init(&table->hash2[i].lock); } diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index df1966f3b..3d5ccf4b1 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -36,10 +36,11 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, const struct inet_diag_req_v2 *req) { int err = -EINVAL; - struct sock *sk; + struct sock *sk = NULL; struct sk_buff *rep; struct net *net = sock_net(in_skb->sk); + rcu_read_lock(); if (req->sdiag_family == AF_INET) sk = __udp4_lib_lookup(net, req->id.idiag_src[0], req->id.idiag_sport, @@ -54,9 +55,9 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, req->id.idiag_dport, req->id.idiag_if, tbl, NULL); #endif - else - goto out_nosk; - + if (sk && !atomic_inc_not_zero(&sk->sk_refcnt)) + sk = NULL; + rcu_read_unlock(); err = -ENOENT; if (!sk) goto out_nosk; @@ -96,24 +97,23 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, struct nlattr *bc) { - int num, s_num, slot, s_slot; struct net *net = sock_net(skb->sk); + int num, s_num, slot, s_slot; s_slot = cb->args[0]; num = s_num = cb->args[1]; for (slot = s_slot; slot <= table->mask; s_num = 0, slot++) { - struct sock *sk; - struct hlist_nulls_node *node; struct udp_hslot *hslot = &table->hash[slot]; + struct sock *sk; num = 0; - if (hlist_nulls_empty(&hslot->head)) + if (hlist_empty(&hslot->head)) continue; spin_lock_bh(&hslot->lock); - sk_nulls_for_each(sk, node, &hslot->head) { + sk_for_each(sk, &hslot->head) { struct inet_sock *inet = inet_sk(sk); if (!net_eq(sock_net(sk), net)) diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index e330c0e56..81f253b6f 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -14,18 +14,6 @@ #include <net/udp.h> #include <net/protocol.h> -static DEFINE_SPINLOCK(udp_offload_lock); -static struct udp_offload_priv __rcu *udp_offload_base __read_mostly; - -#define udp_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&udp_offload_lock)) - -struct udp_offload_priv { - struct udp_offload *offload; - possible_net_t net; - struct rcu_head rcu; - struct udp_offload_priv __rcu *next; -}; - static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, netdev_features_t features, struct sk_buff *(*gso_inner_segment)(struct sk_buff *skb, @@ -51,8 +39,11 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, * 16 bit length field due to the header being added outside of an * IP or IPv6 frame that was already limited to 64K - 1. */ - partial = csum_sub(csum_unfold(uh->check), - (__force __wsum)htonl(skb->len)); + if (skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL) + partial = (__force __wsum)uh->len; + else + partial = (__force __wsum)htonl(skb->len); + partial = csum_sub(csum_unfold(uh->check), partial); /* setup inner skb. */ skb->encapsulation = 0; @@ -101,7 +92,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, udp_offset = outer_hlen - tnl_hlen; skb = segs; do { - __be16 len; + unsigned int len; if (remcsum) skb->ip_summed = CHECKSUM_NONE; @@ -119,14 +110,26 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, skb_reset_mac_header(skb); skb_set_network_header(skb, mac_len); skb_set_transport_header(skb, udp_offset); - len = htons(skb->len - udp_offset); + len = skb->len - udp_offset; uh = udp_hdr(skb); - uh->len = len; + + /* If we are only performing partial GSO the inner header + * will be using a length value equal to only one MSS sized + * segment instead of the entire frame. + */ + if (skb_is_gso(skb)) { + uh->len = htons(skb_shinfo(skb)->gso_size + + SKB_GSO_CB(skb)->data_offset + + skb->head - (unsigned char *)uh); + } else { + uh->len = htons(len); + } if (!need_csum) continue; - uh->check = ~csum_fold(csum_add(partial, (__force __wsum)len)); + uh->check = ~csum_fold(csum_add(partial, + (__force __wsum)htonl(len))); if (skb->encapsulation || !offload_csum) { uh->check = gso_make_checksum(skb, ~uh->check); @@ -179,6 +182,7 @@ out_unlock: return segs; } +EXPORT_SYMBOL(skb_udp_tunnel_segment); static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, netdev_features_t features) @@ -205,16 +209,6 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { /* Packet is from an untrusted source, reset gso_segs. */ - int type = skb_shinfo(skb)->gso_type; - - if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY | - SKB_GSO_UDP_TUNNEL | - SKB_GSO_UDP_TUNNEL_CSUM | - SKB_GSO_TUNNEL_REMCSUM | - SKB_GSO_IPIP | - SKB_GSO_GRE | SKB_GSO_GRE_CSUM) || - !(type & (SKB_GSO_UDP)))) - goto out; skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss); @@ -253,64 +247,14 @@ out: return segs; } -int udp_add_offload(struct net *net, struct udp_offload *uo) -{ - struct udp_offload_priv *new_offload = kzalloc(sizeof(*new_offload), GFP_ATOMIC); - - if (!new_offload) - return -ENOMEM; - - write_pnet(&new_offload->net, net); - new_offload->offload = uo; - - spin_lock(&udp_offload_lock); - new_offload->next = udp_offload_base; - rcu_assign_pointer(udp_offload_base, new_offload); - spin_unlock(&udp_offload_lock); - - return 0; -} -EXPORT_SYMBOL(udp_add_offload); - -static void udp_offload_free_routine(struct rcu_head *head) -{ - struct udp_offload_priv *ou_priv = container_of(head, struct udp_offload_priv, rcu); - kfree(ou_priv); -} - -void udp_del_offload(struct udp_offload *uo) -{ - struct udp_offload_priv __rcu **head = &udp_offload_base; - struct udp_offload_priv *uo_priv; - - spin_lock(&udp_offload_lock); - - uo_priv = udp_deref_protected(*head); - for (; uo_priv != NULL; - uo_priv = udp_deref_protected(*head)) { - if (uo_priv->offload == uo) { - rcu_assign_pointer(*head, - udp_deref_protected(uo_priv->next)); - goto unlock; - } - head = &uo_priv->next; - } - pr_warn("udp_del_offload: didn't find offload for port %d\n", ntohs(uo->port)); -unlock: - spin_unlock(&udp_offload_lock); - if (uo_priv) - call_rcu(&uo_priv->rcu, udp_offload_free_routine); -} -EXPORT_SYMBOL(udp_del_offload); - struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb, - struct udphdr *uh) + struct udphdr *uh, udp_lookup_t lookup) { - struct udp_offload_priv *uo_priv; struct sk_buff *p, **pp = NULL; struct udphdr *uh2; unsigned int off = skb_gro_offset(skb); int flush = 1; + struct sock *sk; if (NAPI_GRO_CB(skb)->encap_mark || (skb->ip_summed != CHECKSUM_PARTIAL && @@ -322,13 +266,10 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb, NAPI_GRO_CB(skb)->encap_mark = 1; rcu_read_lock(); - uo_priv = rcu_dereference(udp_offload_base); - for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) { - if (net_eq(read_pnet(&uo_priv->net), dev_net(skb->dev)) && - uo_priv->offload->port == uh->dest && - uo_priv->offload->callbacks.gro_receive) - goto unflush; - } + sk = (*lookup)(skb, uh->source, uh->dest); + + if (sk && udp_sk(sk)->gro_receive) + goto unflush; goto out_unlock; unflush: @@ -352,9 +293,7 @@ unflush: skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */ skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); - NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto; - pp = uo_priv->offload->callbacks.gro_receive(head, skb, - uo_priv->offload); + pp = udp_sk(sk)->gro_receive(sk, head, skb); out_unlock: rcu_read_unlock(); @@ -362,6 +301,7 @@ out: NAPI_GRO_CB(skb)->flush |= flush; return pp; } +EXPORT_SYMBOL(udp_gro_receive); static struct sk_buff **udp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) @@ -383,19 +323,20 @@ static struct sk_buff **udp4_gro_receive(struct sk_buff **head, inet_gro_compute_pseudo); skip: NAPI_GRO_CB(skb)->is_ipv6 = 0; - return udp_gro_receive(head, skb, uh); + return udp_gro_receive(head, skb, uh, udp4_lib_lookup_skb); flush: NAPI_GRO_CB(skb)->flush = 1; return NULL; } -int udp_gro_complete(struct sk_buff *skb, int nhoff) +int udp_gro_complete(struct sk_buff *skb, int nhoff, + udp_lookup_t lookup) { - struct udp_offload_priv *uo_priv; __be16 newlen = htons(skb->len - nhoff); struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); int err = -ENOSYS; + struct sock *sk; uh->len = newlen; @@ -405,22 +346,10 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff) skb->encapsulation = 1; rcu_read_lock(); - - uo_priv = rcu_dereference(udp_offload_base); - for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) { - if (net_eq(read_pnet(&uo_priv->net), dev_net(skb->dev)) && - uo_priv->offload->port == uh->dest && - uo_priv->offload->callbacks.gro_complete) - break; - } - - if (uo_priv) { - NAPI_GRO_CB(skb)->proto = uo_priv->offload->ipproto; - err = uo_priv->offload->callbacks.gro_complete(skb, - nhoff + sizeof(struct udphdr), - uo_priv->offload); - } - + sk = (*lookup)(skb, uh->source, uh->dest); + if (sk && udp_sk(sk)->gro_complete) + err = udp_sk(sk)->gro_complete(sk, skb, + nhoff + sizeof(struct udphdr)); rcu_read_unlock(); if (skb->remcsum_offload) @@ -428,6 +357,7 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff) return err; } +EXPORT_SYMBOL(udp_gro_complete); static int udp4_gro_complete(struct sk_buff *skb, int nhoff) { @@ -442,7 +372,7 @@ static int udp4_gro_complete(struct sk_buff *skb, int nhoff) skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL; } - return udp_gro_complete(skb, nhoff); + return udp_gro_complete(skb, nhoff, udp4_lib_lookup_skb); } static const struct net_offload udpv4_offload = { diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 96599d1a1..47f12c73d 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -69,6 +69,8 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock, udp_sk(sk)->encap_type = cfg->encap_type; udp_sk(sk)->encap_rcv = cfg->encap_rcv; udp_sk(sk)->encap_destroy = cfg->encap_destroy; + udp_sk(sk)->gro_receive = cfg->gro_receive; + udp_sk(sk)->gro_complete = cfg->gro_complete; udp_tunnel_encap_enable(sock); } diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 11e875ffd..2343e4f2e 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -218,6 +218,7 @@ config IPV6_GRE tristate "IPv6: GRE tunnel" select IPV6_TUNNEL select NET_IP_TUNNEL + depends on NET_IPGRE_DEMUX ---help--- Tunneling means encapsulating data of one protocol type within another protocol and sending it over a channel that understands the @@ -231,6 +232,15 @@ config IPV6_GRE Saying M here will produce a module called ip6_gre. If unsure, say N. +config IPV6_FOU + tristate + default NET_FOU && IPV6 + +config IPV6_FOU_TUNNEL + tristate + default NET_FOU_IP_TUNNELS && IPV6_FOU + select IPV6_TUNNEL + config IPV6_MULTIPLE_TABLES bool "IPv6: Multiple Routing Tables" select FIB_RULES diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index 2fbd90bf8..6d8ea0992 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -8,9 +8,10 @@ ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \ addrlabel.o \ route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \ raw.o icmp.o mcast.o reassembly.o tcp_ipv6.o ping.o \ - exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o + exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o \ + udp_offload.o -ipv6-offload := ip6_offload.o tcpv6_offload.o udp_offload.o exthdrs_offload.o +ipv6-offload := ip6_offload.o tcpv6_offload.o exthdrs_offload.o ipv6-$(CONFIG_SYSCTL) = sysctl_net_ipv6.o ipv6-$(CONFIG_IPV6_MROUTE) += ip6mr.o @@ -41,6 +42,7 @@ obj-$(CONFIG_IPV6_VTI) += ip6_vti.o obj-$(CONFIG_IPV6_SIT) += sit.o obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o obj-$(CONFIG_IPV6_GRE) += ip6_gre.o +obj-$(CONFIG_IPV6_FOU) += fou6.o obj-y += addrconf_core.o exthdrs_core.o ip6_checksum.o ip6_icmp.o obj-$(CONFIG_INET) += output_core.o protocol.o $(ipv6-offload) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 8ec4b3089..047c75a79 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -359,7 +359,6 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) ndev->addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64; ndev->cnf.mtu6 = dev->mtu; - ndev->cnf.sysctl = NULL; ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl); if (!ndev->nd_parms) { kfree(ndev); @@ -3563,6 +3562,10 @@ restart: if (state != INET6_IFADDR_STATE_DEAD) { __ipv6_ifa_notify(RTM_DELADDR, ifa); inet6addr_notifier_call_chain(NETDEV_DOWN, ifa); + } else { + if (idev->cnf.forwarding) + addrconf_leave_anycast(ifa); + addrconf_leave_solict(ifa->idev, &ifa->addr); } write_lock_bh(&idev->lock); @@ -4995,15 +4998,13 @@ static int inet6_set_iftoken(struct inet6_dev *idev, struct in6_addr *token) { struct inet6_ifaddr *ifp; struct net_device *dev = idev->dev; - bool update_rs = false; + bool clear_token, update_rs = false; struct in6_addr ll_addr; ASSERT_RTNL(); if (!token) return -EINVAL; - if (ipv6_addr_any(token)) - return -EINVAL; if (dev->flags & (IFF_LOOPBACK | IFF_NOARP)) return -EINVAL; if (!ipv6_accept_ra(idev)) @@ -5018,10 +5019,13 @@ static int inet6_set_iftoken(struct inet6_dev *idev, struct in6_addr *token) write_unlock_bh(&idev->lock); + clear_token = ipv6_addr_any(token); + if (clear_token) + goto update_lft; + if (!idev->dead && (idev->if_flags & IF_READY) && !ipv6_get_lladdr(dev, &ll_addr, IFA_F_TENTATIVE | IFA_F_OPTIMISTIC)) { - /* If we're not ready, then normal ifup will take care * of this. Otherwise, we need to request our rs here. */ @@ -5029,6 +5033,7 @@ static int inet6_set_iftoken(struct inet6_dev *idev, struct in6_addr *token) update_rs = true; } +update_lft: write_lock_bh(&idev->lock); if (update_rs) { @@ -5618,376 +5623,366 @@ int addrconf_sysctl_ignore_routes_with_linkdown(struct ctl_table *ctl, return ret; } -static struct addrconf_sysctl_table -{ - struct ctl_table_header *sysctl_header; - struct ctl_table addrconf_vars[DEVCONF_MAX+1]; -} addrconf_sysctl __read_mostly = { - .sysctl_header = NULL, - .addrconf_vars = { - { - .procname = "forwarding", - .data = &ipv6_devconf.forwarding, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = addrconf_sysctl_forward, - }, - { - .procname = "hop_limit", - .data = &ipv6_devconf.hop_limit, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = addrconf_sysctl_hop_limit, - }, - { - .procname = "mtu", - .data = &ipv6_devconf.mtu6, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = addrconf_sysctl_mtu, - }, - { - .procname = "accept_ra", - .data = &ipv6_devconf.accept_ra, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "accept_redirects", - .data = &ipv6_devconf.accept_redirects, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "autoconf", - .data = &ipv6_devconf.autoconf, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "dad_transmits", - .data = &ipv6_devconf.dad_transmits, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "router_solicitations", - .data = &ipv6_devconf.rtr_solicits, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "router_solicitation_interval", - .data = &ipv6_devconf.rtr_solicit_interval, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "router_solicitation_delay", - .data = &ipv6_devconf.rtr_solicit_delay, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - { - .procname = "force_mld_version", - .data = &ipv6_devconf.force_mld_version, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "mldv1_unsolicited_report_interval", - .data = - &ipv6_devconf.mldv1_unsolicited_report_interval, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_ms_jiffies, - }, - { - .procname = "mldv2_unsolicited_report_interval", - .data = - &ipv6_devconf.mldv2_unsolicited_report_interval, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_ms_jiffies, - }, - { - .procname = "use_tempaddr", - .data = &ipv6_devconf.use_tempaddr, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "temp_valid_lft", - .data = &ipv6_devconf.temp_valid_lft, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "temp_prefered_lft", - .data = &ipv6_devconf.temp_prefered_lft, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "regen_max_retry", - .data = &ipv6_devconf.regen_max_retry, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "max_desync_factor", - .data = &ipv6_devconf.max_desync_factor, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "max_addresses", - .data = &ipv6_devconf.max_addresses, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "accept_ra_defrtr", - .data = &ipv6_devconf.accept_ra_defrtr, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "accept_ra_min_hop_limit", - .data = &ipv6_devconf.accept_ra_min_hop_limit, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "accept_ra_pinfo", - .data = &ipv6_devconf.accept_ra_pinfo, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, +static const struct ctl_table addrconf_sysctl[] = { + { + .procname = "forwarding", + .data = &ipv6_devconf.forwarding, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = addrconf_sysctl_forward, + }, + { + .procname = "hop_limit", + .data = &ipv6_devconf.hop_limit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = addrconf_sysctl_hop_limit, + }, + { + .procname = "mtu", + .data = &ipv6_devconf.mtu6, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = addrconf_sysctl_mtu, + }, + { + .procname = "accept_ra", + .data = &ipv6_devconf.accept_ra, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "accept_redirects", + .data = &ipv6_devconf.accept_redirects, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "autoconf", + .data = &ipv6_devconf.autoconf, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "dad_transmits", + .data = &ipv6_devconf.dad_transmits, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "router_solicitations", + .data = &ipv6_devconf.rtr_solicits, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "router_solicitation_interval", + .data = &ipv6_devconf.rtr_solicit_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "router_solicitation_delay", + .data = &ipv6_devconf.rtr_solicit_delay, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "force_mld_version", + .data = &ipv6_devconf.force_mld_version, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "mldv1_unsolicited_report_interval", + .data = + &ipv6_devconf.mldv1_unsolicited_report_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, + }, + { + .procname = "mldv2_unsolicited_report_interval", + .data = + &ipv6_devconf.mldv2_unsolicited_report_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, + }, + { + .procname = "use_tempaddr", + .data = &ipv6_devconf.use_tempaddr, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "temp_valid_lft", + .data = &ipv6_devconf.temp_valid_lft, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "temp_prefered_lft", + .data = &ipv6_devconf.temp_prefered_lft, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "regen_max_retry", + .data = &ipv6_devconf.regen_max_retry, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "max_desync_factor", + .data = &ipv6_devconf.max_desync_factor, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "max_addresses", + .data = &ipv6_devconf.max_addresses, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "accept_ra_defrtr", + .data = &ipv6_devconf.accept_ra_defrtr, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "accept_ra_min_hop_limit", + .data = &ipv6_devconf.accept_ra_min_hop_limit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "accept_ra_pinfo", + .data = &ipv6_devconf.accept_ra_pinfo, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #ifdef CONFIG_IPV6_ROUTER_PREF - { - .procname = "accept_ra_rtr_pref", - .data = &ipv6_devconf.accept_ra_rtr_pref, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "router_probe_interval", - .data = &ipv6_devconf.rtr_probe_interval, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, + { + .procname = "accept_ra_rtr_pref", + .data = &ipv6_devconf.accept_ra_rtr_pref, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "router_probe_interval", + .data = &ipv6_devconf.rtr_probe_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, #ifdef CONFIG_IPV6_ROUTE_INFO - { - .procname = "accept_ra_rt_info_max_plen", - .data = &ipv6_devconf.accept_ra_rt_info_max_plen, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, + { + .procname = "accept_ra_rt_info_max_plen", + .data = &ipv6_devconf.accept_ra_rt_info_max_plen, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #endif #endif - { - .procname = "proxy_ndp", - .data = &ipv6_devconf.proxy_ndp, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = addrconf_sysctl_proxy_ndp, - }, - { - .procname = "accept_source_route", - .data = &ipv6_devconf.accept_source_route, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, + { + .procname = "proxy_ndp", + .data = &ipv6_devconf.proxy_ndp, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = addrconf_sysctl_proxy_ndp, + }, + { + .procname = "accept_source_route", + .data = &ipv6_devconf.accept_source_route, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #ifdef CONFIG_IPV6_OPTIMISTIC_DAD - { - .procname = "optimistic_dad", - .data = &ipv6_devconf.optimistic_dad, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - - }, - { - .procname = "use_optimistic", - .data = &ipv6_devconf.use_optimistic, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - - }, + { + .procname = "optimistic_dad", + .data = &ipv6_devconf.optimistic_dad, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "use_optimistic", + .data = &ipv6_devconf.use_optimistic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #endif #ifdef CONFIG_IPV6_MROUTE - { - .procname = "mc_forwarding", - .data = &ipv6_devconf.mc_forwarding, - .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = proc_dointvec, - }, + { + .procname = "mc_forwarding", + .data = &ipv6_devconf.mc_forwarding, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, #endif - { - .procname = "disable_ipv6", - .data = &ipv6_devconf.disable_ipv6, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = addrconf_sysctl_disable, - }, - { - .procname = "accept_dad", - .data = &ipv6_devconf.accept_dad, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "force_tllao", - .data = &ipv6_devconf.force_tllao, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "ndisc_notify", - .data = &ipv6_devconf.ndisc_notify, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "suppress_frag_ndisc", - .data = &ipv6_devconf.suppress_frag_ndisc, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "accept_ra_from_local", - .data = &ipv6_devconf.accept_ra_from_local, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "accept_ra_mtu", - .data = &ipv6_devconf.accept_ra_mtu, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "stable_secret", - .data = &ipv6_devconf.stable_secret, - .maxlen = IPV6_MAX_STRLEN, - .mode = 0600, - .proc_handler = addrconf_sysctl_stable_secret, - }, - { - .procname = "use_oif_addrs_only", - .data = &ipv6_devconf.use_oif_addrs_only, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "ignore_routes_with_linkdown", - .data = &ipv6_devconf.ignore_routes_with_linkdown, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = addrconf_sysctl_ignore_routes_with_linkdown, - }, - { - .procname = "drop_unicast_in_l2_multicast", - .data = &ipv6_devconf.drop_unicast_in_l2_multicast, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "drop_unsolicited_na", - .data = &ipv6_devconf.drop_unsolicited_na, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "keep_addr_on_down", - .data = &ipv6_devconf.keep_addr_on_down, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - - }, - { - /* sentinel */ - } + { + .procname = "disable_ipv6", + .data = &ipv6_devconf.disable_ipv6, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = addrconf_sysctl_disable, + }, + { + .procname = "accept_dad", + .data = &ipv6_devconf.accept_dad, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "force_tllao", + .data = &ipv6_devconf.force_tllao, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "ndisc_notify", + .data = &ipv6_devconf.ndisc_notify, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec }, + { + .procname = "suppress_frag_ndisc", + .data = &ipv6_devconf.suppress_frag_ndisc, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "accept_ra_from_local", + .data = &ipv6_devconf.accept_ra_from_local, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "accept_ra_mtu", + .data = &ipv6_devconf.accept_ra_mtu, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "stable_secret", + .data = &ipv6_devconf.stable_secret, + .maxlen = IPV6_MAX_STRLEN, + .mode = 0600, + .proc_handler = addrconf_sysctl_stable_secret, + }, + { + .procname = "use_oif_addrs_only", + .data = &ipv6_devconf.use_oif_addrs_only, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "ignore_routes_with_linkdown", + .data = &ipv6_devconf.ignore_routes_with_linkdown, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = addrconf_sysctl_ignore_routes_with_linkdown, + }, + { + .procname = "drop_unicast_in_l2_multicast", + .data = &ipv6_devconf.drop_unicast_in_l2_multicast, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "drop_unsolicited_na", + .data = &ipv6_devconf.drop_unsolicited_na, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "keep_addr_on_down", + .data = &ipv6_devconf.keep_addr_on_down, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + + }, + { + /* sentinel */ + } }; static int __addrconf_sysctl_register(struct net *net, char *dev_name, struct inet6_dev *idev, struct ipv6_devconf *p) { int i; - struct addrconf_sysctl_table *t; + struct ctl_table *table; char path[sizeof("net/ipv6/conf/") + IFNAMSIZ]; - t = kmemdup(&addrconf_sysctl, sizeof(*t), GFP_KERNEL); - if (!t) + table = kmemdup(addrconf_sysctl, sizeof(addrconf_sysctl), GFP_KERNEL); + if (!table) goto out; - for (i = 0; t->addrconf_vars[i].data; i++) { - t->addrconf_vars[i].data += (char *)p - (char *)&ipv6_devconf; - t->addrconf_vars[i].extra1 = idev; /* embedded; no ref */ - t->addrconf_vars[i].extra2 = net; + for (i = 0; table[i].data; i++) { + table[i].data += (char *)p - (char *)&ipv6_devconf; + table[i].extra1 = idev; /* embedded; no ref */ + table[i].extra2 = net; } snprintf(path, sizeof(path), "net/ipv6/conf/%s", dev_name); - t->sysctl_header = register_net_sysctl(net, path, t->addrconf_vars); - if (!t->sysctl_header) + p->sysctl_header = register_net_sysctl(net, path, table); + if (!p->sysctl_header) goto free; - p->sysctl = t; return 0; free: - kfree(t); + kfree(table); out: return -ENOBUFS; } static void __addrconf_sysctl_unregister(struct ipv6_devconf *p) { - struct addrconf_sysctl_table *t; + struct ctl_table *table; - if (!p->sysctl) + if (!p->sysctl_header) return; - t = p->sysctl; - p->sysctl = NULL; - unregister_net_sysctl_table(t->sysctl_header); - kfree(t); + table = p->sysctl_header->ctl_table_arg; + unregister_net_sysctl_table(p->sysctl_header); + p->sysctl_header = NULL; + kfree(table); } static int addrconf_sysctl_register(struct inet6_dev *idev) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index b11c37cfd..bfa86f040 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -64,6 +64,8 @@ #include <asm/uaccess.h> #include <linux/mroute6.h> +#include "ip6_offload.h" + MODULE_AUTHOR("Cast of dozens"); MODULE_DESCRIPTION("IPv6 protocol stack for Linux"); MODULE_LICENSE("GPL"); @@ -561,6 +563,7 @@ const struct proto_ops inet6_dgram_ops = { .recvmsg = inet_recvmsg, /* ok */ .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, + .set_peek_off = sk_set_peek_off, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, @@ -958,6 +961,10 @@ static int __init inet6_init(void) if (err) goto udplitev6_fail; + err = udpv6_offload_init(); + if (err) + goto udpv6_offload_fail; + err = tcpv6_init(); if (err) goto tcpv6_fail; @@ -987,6 +994,8 @@ pingv6_fail: ipv6_packet_fail: tcpv6_exit(); tcpv6_fail: + udpv6_offload_exit(); +udpv6_offload_fail: udplitev6_exit(); udplitev6_fail: udpv6_exit(); diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 9dd3882fe..37874e2f3 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -450,9 +450,10 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) copied = len; } err = skb_copy_datagram_msg(skb, 0, msg, copied); - if (err) - goto out_free_skb; - + if (unlikely(err)) { + kfree_skb(skb); + return err; + } sock_recv_timestamp(msg, sk, skb); serr = SKB_EXT_ERR(skb); @@ -509,8 +510,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) msg->msg_flags |= MSG_ERRQUEUE; err = copied; -out_free_skb: - kfree_skb(skb); + consume_skb(skb); out: return err; } @@ -727,13 +727,13 @@ EXPORT_SYMBOL_GPL(ip6_datagram_recv_ctl); int ip6_datagram_send_ctl(struct net *net, struct sock *sk, struct msghdr *msg, struct flowi6 *fl6, - struct ipv6_txoptions *opt, - int *hlimit, int *tclass, int *dontfrag) + struct ipcm6_cookie *ipc6, struct sockcm_cookie *sockc) { struct in6_pktinfo *src_info; struct cmsghdr *cmsg; struct ipv6_rt_hdr *rthdr; struct ipv6_opt_hdr *hdr; + struct ipv6_txoptions *opt = ipc6->opt; int len; int err = 0; @@ -745,6 +745,13 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk, goto exit_f; } + if (cmsg->cmsg_level == SOL_SOCKET) { + err = __sock_cmsg_send(sk, msg, cmsg, sockc); + if (err) + return err; + continue; + } + if (cmsg->cmsg_level != SOL_IPV6) continue; @@ -946,8 +953,8 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk, goto exit_f; } - *hlimit = *(int *)CMSG_DATA(cmsg); - if (*hlimit < -1 || *hlimit > 0xff) { + ipc6->hlimit = *(int *)CMSG_DATA(cmsg); + if (ipc6->hlimit < -1 || ipc6->hlimit > 0xff) { err = -EINVAL; goto exit_f; } @@ -967,7 +974,7 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk, goto exit_f; err = 0; - *tclass = tc; + ipc6->tclass = tc; break; } @@ -985,7 +992,7 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk, goto exit_f; err = 0; - *dontfrag = df; + ipc6->dontfrag = df; break; } diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index ea7c4d64a..8de5dd7aa 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -258,8 +258,8 @@ static int ipv6_destopt_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) || !pskb_may_pull(skb, (skb_transport_offset(skb) + ((skb_transport_header(skb)[1] + 1) << 3)))) { - IP6_INC_STATS_BH(dev_net(dst->dev), ip6_dst_idev(dst), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), + IPSTATS_MIB_INHDRERRORS); kfree_skb(skb); return -1; } @@ -280,8 +280,8 @@ static int ipv6_destopt_rcv(struct sk_buff *skb) return 1; } - IP6_INC_STATS_BH(dev_net(dst->dev), - ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(dev_net(dst->dev), + ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS); return -1; } @@ -309,8 +309,8 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) || !pskb_may_pull(skb, (skb_transport_offset(skb) + ((skb_transport_header(skb)[1] + 1) << 3)))) { - IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INHDRERRORS); kfree_skb(skb); return -1; } @@ -319,8 +319,8 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb) if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr) || skb->pkt_type != PACKET_HOST) { - IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INADDRERRORS); + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } @@ -334,8 +334,8 @@ looped_back: * processed by own */ if (!addr) { - IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INADDRERRORS); + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } @@ -360,8 +360,8 @@ looped_back: goto unknown_rh; /* Silently discard invalid RTH type 2 */ if (hdr->hdrlen != 2 || hdr->segments_left != 1) { - IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INHDRERRORS); kfree_skb(skb); return -1; } @@ -379,8 +379,8 @@ looped_back: n = hdr->hdrlen >> 1; if (hdr->segments_left > n) { - IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, ((&hdr->segments_left) - skb_network_header(skb))); @@ -393,8 +393,8 @@ looped_back: if (skb_cloned(skb)) { /* the copy is a forwarded packet */ if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) { - IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_OUTDISCARDS); + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_OUTDISCARDS); kfree_skb(skb); return -1; } @@ -416,14 +416,14 @@ looped_back: if (xfrm6_input_addr(skb, (xfrm_address_t *)addr, (xfrm_address_t *)&ipv6_hdr(skb)->saddr, IPPROTO_ROUTING) < 0) { - IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INADDRERRORS); + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } if (!ipv6_chk_home_addr(dev_net(skb_dst(skb)->dev), addr)) { - IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INADDRERRORS); + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } @@ -434,8 +434,8 @@ looped_back: } if (ipv6_addr_is_multicast(addr)) { - IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INADDRERRORS); + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; } @@ -454,8 +454,8 @@ looped_back: if (skb_dst(skb)->dev->flags&IFF_LOOPBACK) { if (ipv6_hdr(skb)->hop_limit <= 1) { - IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INHDRERRORS); icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); kfree_skb(skb); @@ -470,7 +470,7 @@ looped_back: return -1; unknown_rh: - IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, (&hdr->type) - skb_network_header(skb)); return -1; @@ -568,28 +568,28 @@ static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff) if (nh[optoff + 1] != 4 || (optoff & 3) != 2) { net_dbg_ratelimited("ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n", nh[optoff+1]); - IP6_INC_STATS_BH(net, ipv6_skb_idev(skb), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, ipv6_skb_idev(skb), + IPSTATS_MIB_INHDRERRORS); goto drop; } pkt_len = ntohl(*(__be32 *)(nh + optoff + 2)); if (pkt_len <= IPV6_MAXPLEN) { - IP6_INC_STATS_BH(net, ipv6_skb_idev(skb), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, ipv6_skb_idev(skb), + IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff+2); return false; } if (ipv6_hdr(skb)->payload_len) { - IP6_INC_STATS_BH(net, ipv6_skb_idev(skb), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, ipv6_skb_idev(skb), + IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff); return false; } if (pkt_len > skb->len - sizeof(struct ipv6hdr)) { - IP6_INC_STATS_BH(net, ipv6_skb_idev(skb), - IPSTATS_MIB_INTRUNCATEDPKTS); + __IP6_INC_STATS(net, ipv6_skb_idev(skb), + IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; } diff --git a/net/ipv6/fou6.c b/net/ipv6/fou6.c new file mode 100644 index 000000000..9ea249b94 --- /dev/null +++ b/net/ipv6/fou6.c @@ -0,0 +1,140 @@ +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/socket.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/udp.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <net/fou.h> +#include <net/ip.h> +#include <net/ip6_tunnel.h> +#include <net/ip6_checksum.h> +#include <net/protocol.h> +#include <net/udp.h> +#include <net/udp_tunnel.h> + +static void fou6_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e, + struct flowi6 *fl6, u8 *protocol, __be16 sport) +{ + struct udphdr *uh; + + skb_push(skb, sizeof(struct udphdr)); + skb_reset_transport_header(skb); + + uh = udp_hdr(skb); + + uh->dest = e->dport; + uh->source = sport; + uh->len = htons(skb->len); + udp6_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM6), skb, + &fl6->saddr, &fl6->daddr, skb->len); + + *protocol = IPPROTO_UDP; +} + +int fou6_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, + u8 *protocol, struct flowi6 *fl6) +{ + __be16 sport; + int err; + int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM6 ? + SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; + + err = __fou_build_header(skb, e, protocol, &sport, type); + if (err) + return err; + + fou6_build_udp(skb, e, fl6, protocol, sport); + + return 0; +} +EXPORT_SYMBOL(fou6_build_header); + +int gue6_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, + u8 *protocol, struct flowi6 *fl6) +{ + __be16 sport; + int err; + int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM6 ? + SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; + + err = __gue_build_header(skb, e, protocol, &sport, type); + if (err) + return err; + + fou6_build_udp(skb, e, fl6, protocol, sport); + + return 0; +} +EXPORT_SYMBOL(gue6_build_header); + +#if IS_ENABLED(CONFIG_IPV6_FOU_TUNNEL) + +static const struct ip6_tnl_encap_ops fou_ip6tun_ops = { + .encap_hlen = fou_encap_hlen, + .build_header = fou6_build_header, +}; + +static const struct ip6_tnl_encap_ops gue_ip6tun_ops = { + .encap_hlen = gue_encap_hlen, + .build_header = gue6_build_header, +}; + +static int ip6_tnl_encap_add_fou_ops(void) +{ + int ret; + + ret = ip6_tnl_encap_add_ops(&fou_ip6tun_ops, TUNNEL_ENCAP_FOU); + if (ret < 0) { + pr_err("can't add fou6 ops\n"); + return ret; + } + + ret = ip6_tnl_encap_add_ops(&gue_ip6tun_ops, TUNNEL_ENCAP_GUE); + if (ret < 0) { + pr_err("can't add gue6 ops\n"); + ip6_tnl_encap_del_ops(&fou_ip6tun_ops, TUNNEL_ENCAP_FOU); + return ret; + } + + return 0; +} + +static void ip6_tnl_encap_del_fou_ops(void) +{ + ip6_tnl_encap_del_ops(&fou_ip6tun_ops, TUNNEL_ENCAP_FOU); + ip6_tnl_encap_del_ops(&gue_ip6tun_ops, TUNNEL_ENCAP_GUE); +} + +#else + +static int ip6_tnl_encap_add_fou_ops(void) +{ + return 0; +} + +static void ip6_tnl_encap_del_fou_ops(void) +{ +} + +#endif + +static int __init fou6_init(void) +{ + int ret; + + ret = ip6_tnl_encap_add_fou_ops(); + + return ret; +} + +static void __exit fou6_fini(void) +{ + ip6_tnl_encap_del_fou_ops(); +} + +module_init(fou6_init); +module_exit(fou6_fini); +MODULE_AUTHOR("Tom Herbert <therbert@google.com>"); +MODULE_LICENSE("GPL"); diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 0013cacf7..a4fa84076 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -98,7 +98,7 @@ static void icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (!(type & ICMPV6_INFOMSG_MASK)) if (icmp6->icmp6_type == ICMPV6_ECHO_REQUEST) - ping_err(skb, offset, info); + ping_err(skb, offset, ntohl(info)); } static int icmpv6_rcv(struct sk_buff *skb); @@ -400,10 +400,11 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) struct icmp6hdr tmp_hdr; struct flowi6 fl6; struct icmpv6_msg msg; + struct sockcm_cookie sockc_unused = {0}; + struct ipcm6_cookie ipc6; int iif = 0; int addr_type = 0; int len; - int hlimit; int err = 0; u32 mark = IP6_REPLY_MARK(net, skb->mark); @@ -505,7 +506,10 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) if (IS_ERR(dst)) goto out; - hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); + ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); + ipc6.tclass = np->tclass; + ipc6.dontfrag = np->dontfrag; + ipc6.opt = NULL; msg.skb = skb; msg.offset = skb_network_offset(skb); @@ -524,9 +528,9 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) err = ip6_append_data(sk, icmpv6_getfrag, &msg, len + sizeof(struct icmp6hdr), - sizeof(struct icmp6hdr), hlimit, - np->tclass, NULL, &fl6, (struct rt6_info *)dst, - MSG_DONTWAIT, np->dontfrag); + sizeof(struct icmp6hdr), + &ipc6, &fl6, (struct rt6_info *)dst, + MSG_DONTWAIT, &sockc_unused); if (err) { ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); @@ -561,10 +565,10 @@ static void icmpv6_echo_reply(struct sk_buff *skb) struct flowi6 fl6; struct icmpv6_msg msg; struct dst_entry *dst; + struct ipcm6_cookie ipc6; int err = 0; - int hlimit; - u8 tclass; u32 mark = IP6_REPLY_MARK(net, skb->mark); + struct sockcm_cookie sockc_unused = {0}; saddr = &ipv6_hdr(skb)->daddr; @@ -604,22 +608,24 @@ static void icmpv6_echo_reply(struct sk_buff *skb) if (IS_ERR(dst)) goto out; - hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); - idev = __in6_dev_get(skb->dev); msg.skb = skb; msg.offset = 0; msg.type = ICMPV6_ECHO_REPLY; - tclass = ipv6_get_dsfield(ipv6_hdr(skb)); + ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); + ipc6.tclass = ipv6_get_dsfield(ipv6_hdr(skb)); + ipc6.dontfrag = np->dontfrag; + ipc6.opt = NULL; + err = ip6_append_data(sk, icmpv6_getfrag, &msg, skb->len + sizeof(struct icmp6hdr), - sizeof(struct icmp6hdr), hlimit, tclass, NULL, &fl6, + sizeof(struct icmp6hdr), &ipc6, &fl6, (struct rt6_info *)dst, MSG_DONTWAIT, - np->dontfrag); + &sockc_unused); if (err) { - ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTERRORS); + __ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); } else { err = icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, @@ -671,7 +677,7 @@ void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info) return; out: - ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); + __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); } /* @@ -707,7 +713,7 @@ static int icmpv6_rcv(struct sk_buff *skb) skb_set_network_header(skb, nh); } - ICMP6_INC_STATS_BH(dev_net(dev), idev, ICMP6_MIB_INMSGS); + __ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_INMSGS); saddr = &ipv6_hdr(skb)->saddr; daddr = &ipv6_hdr(skb)->daddr; @@ -725,7 +731,7 @@ static int icmpv6_rcv(struct sk_buff *skb) type = hdr->icmp6_type; - ICMP6MSGIN_INC_STATS_BH(dev_net(dev), idev, type); + ICMP6MSGIN_INC_STATS(dev_net(dev), idev, type); switch (type) { case ICMPV6_ECHO_REQUEST: @@ -809,9 +815,9 @@ static int icmpv6_rcv(struct sk_buff *skb) return 0; csum_error: - ICMP6_INC_STATS_BH(dev_net(dev), idev, ICMP6_MIB_CSUMERRORS); + __ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_CSUMERRORS); discard_it: - ICMP6_INC_STATS_BH(dev_net(dev), idev, ICMP6_MIB_INERRORS); + __ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_INERRORS); drop_no_count: kfree_skb(skb); return 0; diff --git a/net/ipv6/ila/ila.h b/net/ipv6/ila/ila.h index 28542cb2b..d08fd2d48 100644 --- a/net/ipv6/ila/ila.h +++ b/net/ipv6/ila/ila.h @@ -23,10 +23,76 @@ #include <net/protocol.h> #include <uapi/linux/ila.h> +struct ila_locator { + union { + __u8 v8[8]; + __be16 v16[4]; + __be32 v32[2]; + __be64 v64; + }; +}; + +struct ila_identifier { + union { + struct { +#if defined(__LITTLE_ENDIAN_BITFIELD) + u8 __space:4; + u8 csum_neutral:1; + u8 type:3; +#elif defined(__BIG_ENDIAN_BITFIELD) + u8 type:3; + u8 csum_neutral:1; + u8 __space:4; +#else +#error "Adjust your <asm/byteorder.h> defines" +#endif + u8 __space2[7]; + }; + __u8 v8[8]; + __be16 v16[4]; + __be32 v32[2]; + __be64 v64; + }; +}; + +enum { + ILA_ATYPE_IID = 0, + ILA_ATYPE_LUID, + ILA_ATYPE_VIRT_V4, + ILA_ATYPE_VIRT_UNI_V6, + ILA_ATYPE_VIRT_MULTI_V6, + ILA_ATYPE_RSVD_1, + ILA_ATYPE_RSVD_2, + ILA_ATYPE_RSVD_3, +}; + +#define CSUM_NEUTRAL_FLAG htonl(0x10000000) + +struct ila_addr { + union { + struct in6_addr addr; + struct { + struct ila_locator loc; + struct ila_identifier ident; + }; + }; +}; + +static inline struct ila_addr *ila_a2i(struct in6_addr *addr) +{ + return (struct ila_addr *)addr; +} + +static inline bool ila_addr_is_ila(struct ila_addr *iaddr) +{ + return (iaddr->ident.type != ILA_ATYPE_IID); +} + struct ila_params { - __be64 locator; - __be64 locator_match; + struct ila_locator locator; + struct ila_locator locator_match; __wsum csum_diff; + u8 csum_mode; }; static inline __wsum compute_csum_diff8(const __be32 *from, const __be32 *to) @@ -38,7 +104,14 @@ static inline __wsum compute_csum_diff8(const __be32 *from, const __be32 *to) return csum_partial(diff, sizeof(diff), 0); } -void update_ipv6_locator(struct sk_buff *skb, struct ila_params *p); +static inline bool ila_csum_neutral_set(struct ila_identifier ident) +{ + return !!(ident.csum_neutral); +} + +void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p); + +void ila_init_saved_csum(struct ila_params *p); int ila_lwt_init(void); void ila_lwt_fini(void); diff --git a/net/ipv6/ila/ila_common.c b/net/ipv6/ila/ila_common.c index 30613050e..0e94042d1 100644 --- a/net/ipv6/ila/ila_common.c +++ b/net/ipv6/ila/ila_common.c @@ -15,20 +15,52 @@ static __wsum get_csum_diff(struct ipv6hdr *ip6h, struct ila_params *p) { - if (*(__be64 *)&ip6h->daddr == p->locator_match) + struct ila_addr *iaddr = ila_a2i(&ip6h->daddr); + + if (p->locator_match.v64) return p->csum_diff; else - return compute_csum_diff8((__be32 *)&ip6h->daddr, + return compute_csum_diff8((__be32 *)&iaddr->loc, + (__be32 *)&p->locator); +} + +static void ila_csum_do_neutral(struct ila_addr *iaddr, + struct ila_params *p) +{ + __sum16 *adjust = (__force __sum16 *)&iaddr->ident.v16[3]; + __wsum diff, fval; + + /* Check if checksum adjust value has been cached */ + if (p->locator_match.v64) { + diff = p->csum_diff; + } else { + diff = compute_csum_diff8((__be32 *)iaddr, (__be32 *)&p->locator); + } + + fval = (__force __wsum)(ila_csum_neutral_set(iaddr->ident) ? + ~CSUM_NEUTRAL_FLAG : CSUM_NEUTRAL_FLAG); + + diff = csum_add(diff, fval); + + *adjust = ~csum_fold(csum_add(diff, csum_unfold(*adjust))); + + /* Flip the csum-neutral bit. Either we are doing a SIR->ILA + * translation with ILA_CSUM_NEUTRAL_MAP as the csum_method + * and the C-bit is not set, or we are doing an ILA-SIR + * tranlsation and the C-bit is set. + */ + iaddr->ident.csum_neutral ^= 1; } -void update_ipv6_locator(struct sk_buff *skb, struct ila_params *p) +static void ila_csum_adjust_transport(struct sk_buff *skb, + struct ila_params *p) { __wsum diff; struct ipv6hdr *ip6h = ipv6_hdr(skb); + struct ila_addr *iaddr = ila_a2i(&ip6h->daddr); size_t nhoff = sizeof(struct ipv6hdr); - /* First update checksum */ switch (ip6h->nexthdr) { case NEXTHDR_TCP: if (likely(pskb_may_pull(skb, nhoff + sizeof(struct tcphdr)))) { @@ -68,7 +100,46 @@ void update_ipv6_locator(struct sk_buff *skb, struct ila_params *p) } /* Now change destination address */ - *(__be64 *)&ip6h->daddr = p->locator; + iaddr->loc = p->locator; +} + +void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p) +{ + struct ipv6hdr *ip6h = ipv6_hdr(skb); + struct ila_addr *iaddr = ila_a2i(&ip6h->daddr); + + /* First deal with the transport checksum */ + if (ila_csum_neutral_set(iaddr->ident)) { + /* C-bit is set in the locator indicating that this + * is a locator being translated to a SIR address. + * Perform (receiver) checksum-neutral translation. + */ + ila_csum_do_neutral(iaddr, p); + } else { + switch (p->csum_mode) { + case ILA_CSUM_ADJUST_TRANSPORT: + ila_csum_adjust_transport(skb, p); + break; + case ILA_CSUM_NEUTRAL_MAP: + ila_csum_do_neutral(iaddr, p); + break; + case ILA_CSUM_NO_ACTION: + break; + } + } + + /* Now change destination address */ + iaddr->loc = p->locator; +} + +void ila_init_saved_csum(struct ila_params *p) +{ + if (!p->locator_match.v64) + return; + + p->csum_diff = compute_csum_diff8( + (__be32 *)&p->locator_match, + (__be32 *)&p->locator); } static int __init ila_init(void) diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c index 41f18de5d..1dfb64166 100644 --- a/net/ipv6/ila/ila_lwt.c +++ b/net/ipv6/ila/ila_lwt.c @@ -26,7 +26,7 @@ static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb) if (skb->protocol != htons(ETH_P_IPV6)) goto drop; - update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate)); + ila_update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate)); return dst->lwtstate->orig_output(net, sk, skb); @@ -42,7 +42,7 @@ static int ila_input(struct sk_buff *skb) if (skb->protocol != htons(ETH_P_IPV6)) goto drop; - update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate)); + ila_update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate)); return dst->lwtstate->orig_input(skb); @@ -53,6 +53,7 @@ drop: static struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = { [ILA_ATTR_LOCATOR] = { .type = NLA_U64, }, + [ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, }, }; static int ila_build_state(struct net_device *dev, struct nlattr *nla, @@ -64,11 +65,28 @@ static int ila_build_state(struct net_device *dev, struct nlattr *nla, size_t encap_len = sizeof(*p); struct lwtunnel_state *newts; const struct fib6_config *cfg6 = cfg; + struct ila_addr *iaddr; int ret; if (family != AF_INET6) return -EINVAL; + if (cfg6->fc_dst_len < sizeof(struct ila_locator) + 1) { + /* Need to have full locator and at least type field + * included in destination + */ + return -EINVAL; + } + + iaddr = (struct ila_addr *)&cfg6->fc_dst; + + if (!ila_addr_is_ila(iaddr) || ila_csum_neutral_set(iaddr->ident)) { + /* Don't allow translation for a non-ILA address or checksum + * neutral flag to be set. + */ + return -EINVAL; + } + ret = nla_parse_nested(tb, ILA_ATTR_MAX, nla, ila_nl_policy); if (ret < 0) @@ -84,16 +102,19 @@ static int ila_build_state(struct net_device *dev, struct nlattr *nla, newts->len = encap_len; p = ila_params_lwtunnel(newts); - p->locator = (__force __be64)nla_get_u64(tb[ILA_ATTR_LOCATOR]); + p->locator.v64 = (__force __be64)nla_get_u64(tb[ILA_ATTR_LOCATOR]); - if (cfg6->fc_dst_len > sizeof(__be64)) { - /* Precompute checksum difference for translation since we - * know both the old locator and the new one. - */ - p->locator_match = *(__be64 *)&cfg6->fc_dst; - p->csum_diff = compute_csum_diff8( - (__be32 *)&p->locator_match, (__be32 *)&p->locator); - } + /* Precompute checksum difference for translation since we + * know both the old locator and the new one. + */ + p->locator_match = iaddr->loc; + p->csum_diff = compute_csum_diff8( + (__be32 *)&p->locator_match, (__be32 *)&p->locator); + + if (tb[ILA_ATTR_CSUM_MODE]) + p->csum_mode = nla_get_u8(tb[ILA_ATTR_CSUM_MODE]); + + ila_init_saved_csum(p); newts->type = LWTUNNEL_ENCAP_ILA; newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT | @@ -109,7 +130,10 @@ static int ila_fill_encap_info(struct sk_buff *skb, { struct ila_params *p = ila_params_lwtunnel(lwtstate); - if (nla_put_u64(skb, ILA_ATTR_LOCATOR, (__force u64)p->locator)) + if (nla_put_u64_64bit(skb, ILA_ATTR_LOCATOR, (__force u64)p->locator.v64, + ILA_ATTR_PAD)) + goto nla_put_failure; + if (nla_put_u8(skb, ILA_ATTR_CSUM_MODE, (__force u8)p->csum_mode)) goto nla_put_failure; return 0; @@ -120,7 +144,9 @@ nla_put_failure: static int ila_encap_nlsize(struct lwtunnel_state *lwtstate) { - return nla_total_size(sizeof(u64)); /* ILA_ATTR_LOCATOR */ + return nla_total_size_64bit(sizeof(u64)) + /* ILA_ATTR_LOCATOR */ + nla_total_size(sizeof(u8)) + /* ILA_ATTR_CSUM_MODE */ + 0; } static int ila_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) @@ -128,7 +154,7 @@ static int ila_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) struct ila_params *a_p = ila_params_lwtunnel(a); struct ila_params *b_p = ila_params_lwtunnel(b); - return (a_p->locator != b_p->locator); + return (a_p->locator.v64 != b_p->locator.v64); } static const struct lwtunnel_encap_ops ila_encap_ops = { diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index 295ca29a2..a90e57229 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -11,13 +11,11 @@ struct ila_xlat_params { struct ila_params ip; - __be64 identifier; int ifindex; - unsigned int dir; }; struct ila_map { - struct ila_xlat_params p; + struct ila_xlat_params xp; struct rhash_head node; struct ila_map __rcu *next; struct rcu_head rcu; @@ -66,31 +64,29 @@ static __always_inline void __ila_hash_secret_init(void) net_get_random_once(&hashrnd, sizeof(hashrnd)); } -static inline u32 ila_identifier_hash(__be64 identifier) +static inline u32 ila_locator_hash(struct ila_locator loc) { - u32 *v = (u32 *)&identifier; + u32 *v = (u32 *)loc.v32; return jhash_2words(v[0], v[1], hashrnd); } -static inline spinlock_t *ila_get_lock(struct ila_net *ilan, __be64 identifier) +static inline spinlock_t *ila_get_lock(struct ila_net *ilan, + struct ila_locator loc) { - return &ilan->locks[ila_identifier_hash(identifier) & ilan->locks_mask]; + return &ilan->locks[ila_locator_hash(loc) & ilan->locks_mask]; } -static inline int ila_cmp_wildcards(struct ila_map *ila, __be64 loc, - int ifindex, unsigned int dir) +static inline int ila_cmp_wildcards(struct ila_map *ila, + struct ila_addr *iaddr, int ifindex) { - return (ila->p.ip.locator_match && ila->p.ip.locator_match != loc) || - (ila->p.ifindex && ila->p.ifindex != ifindex) || - !(ila->p.dir & dir); + return (ila->xp.ifindex && ila->xp.ifindex != ifindex); } -static inline int ila_cmp_params(struct ila_map *ila, struct ila_xlat_params *p) +static inline int ila_cmp_params(struct ila_map *ila, + struct ila_xlat_params *xp) { - return (ila->p.ip.locator_match != p->ip.locator_match) || - (ila->p.ifindex != p->ifindex) || - (ila->p.dir != p->dir); + return (ila->xp.ifindex != xp->ifindex); } static int ila_cmpfn(struct rhashtable_compare_arg *arg, @@ -98,17 +94,14 @@ static int ila_cmpfn(struct rhashtable_compare_arg *arg, { const struct ila_map *ila = obj; - return (ila->p.identifier != *(__be64 *)arg->key); + return (ila->xp.ip.locator_match.v64 != *(__be64 *)arg->key); } static inline int ila_order(struct ila_map *ila) { int score = 0; - if (ila->p.ip.locator_match) - score += 1 << 0; - - if (ila->p.ifindex) + if (ila->xp.ifindex) score += 1 << 1; return score; @@ -117,7 +110,7 @@ static inline int ila_order(struct ila_map *ila) static const struct rhashtable_params rht_params = { .nelem_hint = 1024, .head_offset = offsetof(struct ila_map, node), - .key_offset = offsetof(struct ila_map, p.identifier), + .key_offset = offsetof(struct ila_map, xp.ip.locator_match), .key_len = sizeof(u64), /* identifier */ .max_size = 1048576, .min_size = 256, @@ -136,50 +129,45 @@ static struct genl_family ila_nl_family = { }; static struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = { - [ILA_ATTR_IDENTIFIER] = { .type = NLA_U64, }, [ILA_ATTR_LOCATOR] = { .type = NLA_U64, }, [ILA_ATTR_LOCATOR_MATCH] = { .type = NLA_U64, }, [ILA_ATTR_IFINDEX] = { .type = NLA_U32, }, - [ILA_ATTR_DIR] = { .type = NLA_U32, }, + [ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, }, }; static int parse_nl_config(struct genl_info *info, - struct ila_xlat_params *p) + struct ila_xlat_params *xp) { - memset(p, 0, sizeof(*p)); - - if (info->attrs[ILA_ATTR_IDENTIFIER]) - p->identifier = (__force __be64)nla_get_u64( - info->attrs[ILA_ATTR_IDENTIFIER]); + memset(xp, 0, sizeof(*xp)); if (info->attrs[ILA_ATTR_LOCATOR]) - p->ip.locator = (__force __be64)nla_get_u64( + xp->ip.locator.v64 = (__force __be64)nla_get_u64( info->attrs[ILA_ATTR_LOCATOR]); if (info->attrs[ILA_ATTR_LOCATOR_MATCH]) - p->ip.locator_match = (__force __be64)nla_get_u64( + xp->ip.locator_match.v64 = (__force __be64)nla_get_u64( info->attrs[ILA_ATTR_LOCATOR_MATCH]); - if (info->attrs[ILA_ATTR_IFINDEX]) - p->ifindex = nla_get_s32(info->attrs[ILA_ATTR_IFINDEX]); + if (info->attrs[ILA_ATTR_CSUM_MODE]) + xp->ip.csum_mode = nla_get_u8(info->attrs[ILA_ATTR_CSUM_MODE]); - if (info->attrs[ILA_ATTR_DIR]) - p->dir = nla_get_u32(info->attrs[ILA_ATTR_DIR]); + if (info->attrs[ILA_ATTR_IFINDEX]) + xp->ifindex = nla_get_s32(info->attrs[ILA_ATTR_IFINDEX]); return 0; } /* Must be called with rcu readlock */ -static inline struct ila_map *ila_lookup_wildcards(__be64 id, __be64 loc, +static inline struct ila_map *ila_lookup_wildcards(struct ila_addr *iaddr, int ifindex, - unsigned int dir, struct ila_net *ilan) { struct ila_map *ila; - ila = rhashtable_lookup_fast(&ilan->rhash_table, &id, rht_params); + ila = rhashtable_lookup_fast(&ilan->rhash_table, &iaddr->loc, + rht_params); while (ila) { - if (!ila_cmp_wildcards(ila, loc, ifindex, dir)) + if (!ila_cmp_wildcards(ila, iaddr, ifindex)) return ila; ila = rcu_access_pointer(ila->next); } @@ -188,15 +176,16 @@ static inline struct ila_map *ila_lookup_wildcards(__be64 id, __be64 loc, } /* Must be called with rcu readlock */ -static inline struct ila_map *ila_lookup_by_params(struct ila_xlat_params *p, +static inline struct ila_map *ila_lookup_by_params(struct ila_xlat_params *xp, struct ila_net *ilan) { struct ila_map *ila; - ila = rhashtable_lookup_fast(&ilan->rhash_table, &p->identifier, + ila = rhashtable_lookup_fast(&ilan->rhash_table, + &xp->ip.locator_match, rht_params); while (ila) { - if (!ila_cmp_params(ila, p)) + if (!ila_cmp_params(ila, xp)) return ila; ila = rcu_access_pointer(ila->next); } @@ -221,14 +210,14 @@ static void ila_free_cb(void *ptr, void *arg) } } -static int ila_xlat_addr(struct sk_buff *skb, int dir); +static int ila_xlat_addr(struct sk_buff *skb); static unsigned int ila_nf_input(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - ila_xlat_addr(skb, ILA_DIR_IN); + ila_xlat_addr(skb); return NF_ACCEPT; } @@ -241,11 +230,11 @@ static struct nf_hook_ops ila_nf_hook_ops[] __read_mostly = { }, }; -static int ila_add_mapping(struct net *net, struct ila_xlat_params *p) +static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp) { struct ila_net *ilan = net_generic(net, ila_net_id); struct ila_map *ila, *head; - spinlock_t *lock = ila_get_lock(ilan, p->identifier); + spinlock_t *lock = ila_get_lock(ilan, xp->ip.locator_match); int err = 0, order; if (!ilan->hooks_registered) { @@ -264,22 +253,16 @@ static int ila_add_mapping(struct net *net, struct ila_xlat_params *p) if (!ila) return -ENOMEM; - ila->p = *p; + ila_init_saved_csum(&xp->ip); - if (p->ip.locator_match) { - /* Precompute checksum difference for translation since we - * know both the old identifier and the new one. - */ - ila->p.ip.csum_diff = compute_csum_diff8( - (__be32 *)&p->ip.locator_match, - (__be32 *)&p->ip.locator); - } + ila->xp = *xp; order = ila_order(ila); spin_lock(lock); - head = rhashtable_lookup_fast(&ilan->rhash_table, &p->identifier, + head = rhashtable_lookup_fast(&ilan->rhash_table, + &xp->ip.locator_match, rht_params); if (!head) { /* New entry for the rhash_table */ @@ -289,7 +272,7 @@ static int ila_add_mapping(struct net *net, struct ila_xlat_params *p) struct ila_map *tila = head, *prev = NULL; do { - if (!ila_cmp_params(tila, p)) { + if (!ila_cmp_params(tila, xp)) { err = -EEXIST; goto out; } @@ -326,23 +309,23 @@ out: return err; } -static int ila_del_mapping(struct net *net, struct ila_xlat_params *p) +static int ila_del_mapping(struct net *net, struct ila_xlat_params *xp) { struct ila_net *ilan = net_generic(net, ila_net_id); struct ila_map *ila, *head, *prev; - spinlock_t *lock = ila_get_lock(ilan, p->identifier); + spinlock_t *lock = ila_get_lock(ilan, xp->ip.locator_match); int err = -ENOENT; spin_lock(lock); head = rhashtable_lookup_fast(&ilan->rhash_table, - &p->identifier, rht_params); + &xp->ip.locator_match, rht_params); ila = head; prev = NULL; while (ila) { - if (ila_cmp_params(ila, p)) { + if (ila_cmp_params(ila, xp)) { prev = ila; ila = rcu_dereference_protected(ila->next, lockdep_is_held(lock)); @@ -404,28 +387,28 @@ static int ila_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info) static int ila_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); - struct ila_xlat_params p; + struct ila_xlat_params xp; int err; - err = parse_nl_config(info, &p); + err = parse_nl_config(info, &xp); if (err) return err; - ila_del_mapping(net, &p); + ila_del_mapping(net, &xp); return 0; } static int ila_fill_info(struct ila_map *ila, struct sk_buff *msg) { - if (nla_put_u64(msg, ILA_ATTR_IDENTIFIER, - (__force u64)ila->p.identifier) || - nla_put_u64(msg, ILA_ATTR_LOCATOR, - (__force u64)ila->p.ip.locator) || - nla_put_u64(msg, ILA_ATTR_LOCATOR_MATCH, - (__force u64)ila->p.ip.locator_match) || - nla_put_s32(msg, ILA_ATTR_IFINDEX, ila->p.ifindex) || - nla_put_u32(msg, ILA_ATTR_DIR, ila->p.dir)) + if (nla_put_u64_64bit(msg, ILA_ATTR_LOCATOR, + (__force u64)ila->xp.ip.locator.v64, + ILA_ATTR_PAD) || + nla_put_u64_64bit(msg, ILA_ATTR_LOCATOR_MATCH, + (__force u64)ila->xp.ip.locator_match.v64, + ILA_ATTR_PAD) || + nla_put_s32(msg, ILA_ATTR_IFINDEX, ila->xp.ifindex) || + nla_put_u32(msg, ILA_ATTR_CSUM_MODE, ila->xp.ip.csum_mode)) return -1; return 0; @@ -457,11 +440,11 @@ static int ila_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info) struct net *net = genl_info_net(info); struct ila_net *ilan = net_generic(net, ila_net_id); struct sk_buff *msg; - struct ila_xlat_params p; + struct ila_xlat_params xp; struct ila_map *ila; int ret; - ret = parse_nl_config(info, &p); + ret = parse_nl_config(info, &xp); if (ret) return ret; @@ -471,7 +454,7 @@ static int ila_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info) rcu_read_lock(); - ila = ila_lookup_by_params(&p, ilan); + ila = ila_lookup_by_params(&xp, ilan); if (ila) { ret = ila_dump_info(ila, info->snd_portid, @@ -501,7 +484,8 @@ static int ila_nl_dump_start(struct netlink_callback *cb) struct ila_net *ilan = net_generic(net, ila_net_id); struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args; - return rhashtable_walk_init(&ilan->rhash_table, &iter->rhiter); + return rhashtable_walk_init(&ilan->rhash_table, &iter->rhiter, + GFP_KERNEL); } static int ila_nl_dump_done(struct netlink_callback *cb) @@ -613,45 +597,32 @@ static struct pernet_operations ila_net_ops = { .size = sizeof(struct ila_net), }; -static int ila_xlat_addr(struct sk_buff *skb, int dir) +static int ila_xlat_addr(struct sk_buff *skb) { struct ila_map *ila; struct ipv6hdr *ip6h = ipv6_hdr(skb); struct net *net = dev_net(skb->dev); struct ila_net *ilan = net_generic(net, ila_net_id); - __be64 identifier, locator_match; - size_t nhoff; + struct ila_addr *iaddr = ila_a2i(&ip6h->daddr); /* Assumes skb contains a valid IPv6 header that is pulled */ - identifier = *(__be64 *)&ip6h->daddr.in6_u.u6_addr8[8]; - locator_match = *(__be64 *)&ip6h->daddr.in6_u.u6_addr8[0]; - nhoff = sizeof(struct ipv6hdr); + if (!ila_addr_is_ila(iaddr)) { + /* Type indicates this is not an ILA address */ + return 0; + } rcu_read_lock(); - ila = ila_lookup_wildcards(identifier, locator_match, - skb->dev->ifindex, dir, ilan); + ila = ila_lookup_wildcards(iaddr, skb->dev->ifindex, ilan); if (ila) - update_ipv6_locator(skb, &ila->p.ip); + ila_update_ipv6_locator(skb, &ila->xp.ip); rcu_read_unlock(); return 0; } -int ila_xlat_incoming(struct sk_buff *skb) -{ - return ila_xlat_addr(skb, ILA_DIR_IN); -} -EXPORT_SYMBOL(ila_xlat_incoming); - -int ila_xlat_outgoing(struct sk_buff *skb) -{ - return ila_xlat_addr(skb, ILA_DIR_OUT); -} -EXPORT_SYMBOL(ila_xlat_outgoing); - int ila_xlat_init(void) { int ret; diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 70f2628be..00cf28ad4 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -69,7 +69,6 @@ struct sock *__inet6_lookup_established(struct net *net, struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; - rcu_read_lock(); begin: sk_nulls_for_each_rcu(sk, node, &head->chain) { if (sk->sk_hash != hash) @@ -90,7 +89,6 @@ begin: out: sk = NULL; found: - rcu_read_unlock(); return sk; } EXPORT_SYMBOL(__inet6_lookup_established); @@ -122,6 +120,7 @@ static inline int compute_score(struct sock *sk, struct net *net, return score; } +/* called with rcu_read_lock() */ struct sock *inet6_lookup_listener(struct net *net, struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, @@ -129,39 +128,27 @@ struct sock *inet6_lookup_listener(struct net *net, const __be16 sport, const struct in6_addr *daddr, const unsigned short hnum, const int dif) { - struct sock *sk; - const struct hlist_nulls_node *node; - struct sock *result; - int score, hiscore, matches = 0, reuseport = 0; - bool select_ok = true; - u32 phash = 0; unsigned int hash = inet_lhashfn(net, hnum); struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; + int score, hiscore = 0, matches = 0, reuseport = 0; + struct sock *sk, *result = NULL; + u32 phash = 0; - rcu_read_lock(); -begin: - result = NULL; - hiscore = 0; - sk_nulls_for_each(sk, node, &ilb->head) { + sk_for_each(sk, &ilb->head) { score = compute_score(sk, net, hnum, daddr, dif); if (score > hiscore) { - hiscore = score; - result = sk; reuseport = sk->sk_reuseport; if (reuseport) { phash = inet6_ehashfn(net, daddr, hnum, saddr, sport); - if (select_ok) { - struct sock *sk2; - sk2 = reuseport_select_sock(sk, phash, - skb, doff); - if (sk2) { - result = sk2; - goto found; - } - } + result = reuseport_select_sock(sk, phash, + skb, doff); + if (result) + return result; matches = 1; } + result = sk; + hiscore = score; } else if (score == hiscore && reuseport) { matches++; if (reciprocal_scale(phash, matches) == 0) @@ -169,25 +156,6 @@ begin: phash = next_pseudo_random32(phash); } } - /* - * if the nulls value we got at the end of this lookup is - * not the expected one, we must restart lookup. - * We probably met an item that was moved to another chain. - */ - if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE) - goto begin; - if (result) { -found: - if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) - result = NULL; - else if (unlikely(compute_score(result, net, hnum, daddr, - dif) < hiscore)) { - sock_put(result); - select_ok = false; - goto begin; - } - } - rcu_read_unlock(); return result; } EXPORT_SYMBOL_GPL(inet6_lookup_listener); @@ -199,12 +167,12 @@ struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo, const int dif) { struct sock *sk; + bool refcounted; - local_bh_disable(); sk = __inet6_lookup(net, hashinfo, skb, doff, saddr, sport, daddr, - ntohs(dport), dif); - local_bh_enable(); - + ntohs(dport), dif, &refcounted); + if (sk && !refcounted && !atomic_inc_not_zero(&sk->sk_refcnt)) + sk = NULL; return sk; } EXPORT_SYMBOL_GPL(inet6_lookup); @@ -254,7 +222,7 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, __sk_nulls_add_node_rcu(sk, &head->chain); if (tw) { sk_nulls_del_node_init_rcu((struct sock *)tw); - NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED); + __NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED); } spin_unlock(lock); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); diff --git a/net/ipv6/ip6_checksum.c b/net/ipv6/ip6_checksum.c index b2025bf3d..c0cbcb259 100644 --- a/net/ipv6/ip6_checksum.c +++ b/net/ipv6/ip6_checksum.c @@ -78,9 +78,12 @@ int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto) * we accept a checksum of zero here. When we find the socket * for the UDP packet we'll check if that socket allows zero checksum * for IPv6 (set by socket option). + * + * Note, we are only interested in != 0 or == 0, thus the + * force to int. */ - return skb_checksum_init_zero_check(skb, proto, uh->check, - ip6_compute_pseudo); + return (__force int)skb_checksum_init_zero_check(skb, proto, uh->check, + ip6_compute_pseudo); } EXPORT_SYMBOL(udp6_csum_init); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index c26fac26b..771be1fa4 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -241,6 +241,7 @@ struct fib6_table *fib6_new_table(struct net *net, u32 id) return tb; } +EXPORT_SYMBOL_GPL(fib6_new_table); struct fib6_table *fib6_get_table(struct net *net, u32 id) { diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index dc2db4f7b..b912f0dba 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -372,7 +372,8 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, if (olen > 0) { struct msghdr msg; struct flowi6 flowi6; - int junk; + struct sockcm_cookie sockc_junk; + struct ipcm6_cookie ipc6; err = -ENOMEM; fl->opt = kmalloc(sizeof(*fl->opt) + olen, GFP_KERNEL); @@ -389,8 +390,8 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq, msg.msg_control = (void *)(fl->opt+1); memset(&flowi6, 0, sizeof(flowi6)); - err = ip6_datagram_send_ctl(net, sk, &msg, &flowi6, fl->opt, - &junk, &junk, &junk); + ipc6.opt = fl->opt; + err = ip6_datagram_send_ctl(net, sk, &msg, &flowi6, &ipc6, &sockc_junk); if (err) goto done; err = -EINVAL; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 4e636e60a..776d14511 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -54,6 +54,7 @@ #include <net/ip6_fib.h> #include <net/ip6_route.h> #include <net/ip6_tunnel.h> +#include <net/gre.h> static bool log_ecn_error = true; @@ -342,7 +343,7 @@ static struct ip6_tnl *ip6gre_tunnel_locate(struct net *net, goto failed_free; /* Can use a lockless transmit, unless we generate output sequences */ - if (!(nt->parms.o_flags & GRE_SEQ)) + if (!(nt->parms.o_flags & TUNNEL_SEQ)) dev->features |= NETIF_F_LLTX; dev_hold(dev); @@ -443,137 +444,41 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, t->err_time = jiffies; } -static int ip6gre_rcv(struct sk_buff *skb) +static int ip6gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) { const struct ipv6hdr *ipv6h; - u8 *h; - __be16 flags; - __sum16 csum = 0; - __be32 key = 0; - u32 seqno = 0; struct ip6_tnl *tunnel; - int offset = 4; - __be16 gre_proto; - int err; - - if (!pskb_may_pull(skb, sizeof(struct in6_addr))) - goto drop; ipv6h = ipv6_hdr(skb); - h = skb->data; - flags = *(__be16 *)h; - - if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) { - /* - Version must be 0. - - We do not support routing headers. - */ - if (flags&(GRE_VERSION|GRE_ROUTING)) - goto drop; - - if (flags&GRE_CSUM) { - csum = skb_checksum_simple_validate(skb); - offset += 4; - } - if (flags&GRE_KEY) { - key = *(__be32 *)(h + offset); - offset += 4; - } - if (flags&GRE_SEQ) { - seqno = ntohl(*(__be32 *)(h + offset)); - offset += 4; - } - } - - gre_proto = *(__be16 *)(h + 2); - tunnel = ip6gre_tunnel_lookup(skb->dev, - &ipv6h->saddr, &ipv6h->daddr, key, - gre_proto); + &ipv6h->saddr, &ipv6h->daddr, tpi->key, + tpi->proto); if (tunnel) { - struct pcpu_sw_netstats *tstats; - - if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) - goto drop; - - if (!ip6_tnl_rcv_ctl(tunnel, &ipv6h->daddr, &ipv6h->saddr)) { - tunnel->dev->stats.rx_dropped++; - goto drop; - } - - skb->protocol = gre_proto; - /* WCCP version 1 and 2 protocol decoding. - * - Change protocol to IPv6 - * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header - */ - if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) { - skb->protocol = htons(ETH_P_IPV6); - if ((*(h + offset) & 0xF0) != 0x40) - offset += 4; - } - - skb->mac_header = skb->network_header; - __pskb_pull(skb, offset); - skb_postpull_rcsum(skb, skb_transport_header(skb), offset); - - if (((flags&GRE_CSUM) && csum) || - (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) { - tunnel->dev->stats.rx_crc_errors++; - tunnel->dev->stats.rx_errors++; - goto drop; - } - if (tunnel->parms.i_flags&GRE_SEQ) { - if (!(flags&GRE_SEQ) || - (tunnel->i_seqno && - (s32)(seqno - tunnel->i_seqno) < 0)) { - tunnel->dev->stats.rx_fifo_errors++; - tunnel->dev->stats.rx_errors++; - goto drop; - } - tunnel->i_seqno = seqno + 1; - } - - /* Warning: All skb pointers will be invalidated! */ - if (tunnel->dev->type == ARPHRD_ETHER) { - if (!pskb_may_pull(skb, ETH_HLEN)) { - tunnel->dev->stats.rx_length_errors++; - tunnel->dev->stats.rx_errors++; - goto drop; - } + ip6_tnl_rcv(tunnel, skb, tpi, NULL, false); - ipv6h = ipv6_hdr(skb); - skb->protocol = eth_type_trans(skb, tunnel->dev); - skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); - } - - __skb_tunnel_rx(skb, tunnel->dev, tunnel->net); + return PACKET_RCVD; + } - skb_reset_network_header(skb); + return PACKET_REJECT; +} - err = IP6_ECN_decapsulate(ipv6h, skb); - if (unlikely(err)) { - if (log_ecn_error) - net_info_ratelimited("non-ECT from %pI6 with dsfield=%#x\n", - &ipv6h->saddr, - ipv6_get_dsfield(ipv6h)); - if (err > 1) { - ++tunnel->dev->stats.rx_frame_errors; - ++tunnel->dev->stats.rx_errors; - goto drop; - } - } +static int gre_rcv(struct sk_buff *skb) +{ + struct tnl_ptk_info tpi; + bool csum_err = false; + int hdr_len; - tstats = this_cpu_ptr(tunnel->dev->tstats); - u64_stats_update_begin(&tstats->syncp); - tstats->rx_packets++; - tstats->rx_bytes += skb->len; - u64_stats_update_end(&tstats->syncp); + hdr_len = gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IPV6), 0); + if (hdr_len < 0) + goto drop; - netif_rx(skb); + if (iptunnel_pull_header(skb, hdr_len, tpi.proto, false)) + goto drop; + if (ip6gre_rcv(skb, &tpi) == PACKET_RCVD) return 0; - } - icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); drop: kfree_skb(skb); return 0; @@ -584,187 +489,40 @@ struct ipv6_tel_txoption { __u8 dst_opt[8]; }; -static void init_tel_txopt(struct ipv6_tel_txoption *opt, __u8 encap_limit) +static int gre_handle_offloads(struct sk_buff *skb, bool csum) { - memset(opt, 0, sizeof(struct ipv6_tel_txoption)); - - opt->dst_opt[2] = IPV6_TLV_TNL_ENCAP_LIMIT; - opt->dst_opt[3] = 1; - opt->dst_opt[4] = encap_limit; - opt->dst_opt[5] = IPV6_TLV_PADN; - opt->dst_opt[6] = 1; - - opt->ops.dst0opt = (struct ipv6_opt_hdr *) opt->dst_opt; - opt->ops.opt_nflen = 8; + return iptunnel_handle_offloads(skb, + csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE); } -static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, - struct net_device *dev, - __u8 dsfield, - struct flowi6 *fl6, - int encap_limit, - __u32 *pmtu) +static netdev_tx_t __gre6_xmit(struct sk_buff *skb, + struct net_device *dev, __u8 dsfield, + struct flowi6 *fl6, int encap_limit, + __u32 *pmtu, __be16 proto) { struct ip6_tnl *tunnel = netdev_priv(dev); - struct net *net = tunnel->net; - struct net_device *tdev; /* Device to other host */ - struct ipv6hdr *ipv6h; /* Our new IP header */ - unsigned int max_headroom = 0; /* The extra header space needed */ - int gre_hlen; - struct ipv6_tel_txoption opt; - int mtu; - struct dst_entry *dst = NULL, *ndst = NULL; - struct net_device_stats *stats = &tunnel->dev->stats; - int err = -1; - u8 proto; - struct sk_buff *new_skb; - __be16 protocol; + __be16 protocol = (dev->type == ARPHRD_ETHER) ? + htons(ETH_P_TEB) : proto; if (dev->type == ARPHRD_ETHER) IPCB(skb)->flags = 0; - if (dev->header_ops && dev->type == ARPHRD_IP6GRE) { - gre_hlen = 0; - ipv6h = (struct ipv6hdr *)skb->data; - fl6->daddr = ipv6h->daddr; - } else { - gre_hlen = tunnel->hlen; + if (dev->header_ops && dev->type == ARPHRD_IP6GRE) + fl6->daddr = ((struct ipv6hdr *)skb->data)->daddr; + else fl6->daddr = tunnel->parms.raddr; - } - - if (!fl6->flowi6_mark) - dst = dst_cache_get(&tunnel->dst_cache); - - if (!dst) { - dst = ip6_route_output(net, NULL, fl6); - - if (dst->error) - goto tx_err_link_failure; - dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), NULL, 0); - if (IS_ERR(dst)) { - err = PTR_ERR(dst); - dst = NULL; - goto tx_err_link_failure; - } - ndst = dst; - } - - tdev = dst->dev; - - if (tdev == dev) { - stats->collisions++; - net_warn_ratelimited("%s: Local routing loop detected!\n", - tunnel->parms.name); - goto tx_err_dst_release; - } - - mtu = dst_mtu(dst) - sizeof(*ipv6h); - if (encap_limit >= 0) { - max_headroom += 8; - mtu -= 8; - } - if (mtu < IPV6_MIN_MTU) - mtu = IPV6_MIN_MTU; - if (skb_dst(skb)) - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); - if (skb->len > mtu) { - *pmtu = mtu; - err = -EMSGSIZE; - goto tx_err_dst_release; - } - - if (tunnel->err_count > 0) { - if (time_before(jiffies, - tunnel->err_time + IP6TUNNEL_ERR_TIMEO)) { - tunnel->err_count--; - - dst_link_failure(skb); - } else - tunnel->err_count = 0; - } - - skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(dev))); - - max_headroom += LL_RESERVED_SPACE(tdev) + gre_hlen + dst->header_len; - - if (skb_headroom(skb) < max_headroom || skb_shared(skb) || - (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { - new_skb = skb_realloc_headroom(skb, max_headroom); - if (max_headroom > dev->needed_headroom) - dev->needed_headroom = max_headroom; - if (!new_skb) - goto tx_err_dst_release; - - if (skb->sk) - skb_set_owner_w(new_skb, skb->sk); - consume_skb(skb); - skb = new_skb; - } - - if (!fl6->flowi6_mark && ndst) - dst_cache_set_ip6(&tunnel->dst_cache, ndst, &fl6->saddr); - skb_dst_set(skb, dst); - proto = NEXTHDR_GRE; - if (encap_limit >= 0) { - init_tel_txopt(&opt, encap_limit); - ipv6_push_nfrag_opts(skb, &opt.ops, &proto, NULL); - } - - if (likely(!skb->encapsulation)) { - skb_reset_inner_headers(skb); - skb->encapsulation = 1; - } + if (tunnel->parms.o_flags & TUNNEL_SEQ) + tunnel->o_seqno++; - skb_push(skb, gre_hlen); - skb_reset_network_header(skb); - skb_set_transport_header(skb, sizeof(*ipv6h)); - - /* - * Push down and install the IP header. - */ - ipv6h = ipv6_hdr(skb); - ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield), - ip6_make_flowlabel(net, skb, fl6->flowlabel, true, fl6)); - ipv6h->hop_limit = tunnel->parms.hop_limit; - ipv6h->nexthdr = proto; - ipv6h->saddr = fl6->saddr; - ipv6h->daddr = fl6->daddr; - - ((__be16 *)(ipv6h + 1))[0] = tunnel->parms.o_flags; - protocol = (dev->type == ARPHRD_ETHER) ? - htons(ETH_P_TEB) : skb->protocol; - ((__be16 *)(ipv6h + 1))[1] = protocol; - - if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) { - __be32 *ptr = (__be32 *)(((u8 *)ipv6h) + tunnel->hlen - 4); - - if (tunnel->parms.o_flags&GRE_SEQ) { - ++tunnel->o_seqno; - *ptr = htonl(tunnel->o_seqno); - ptr--; - } - if (tunnel->parms.o_flags&GRE_KEY) { - *ptr = tunnel->parms.o_key; - ptr--; - } - if (tunnel->parms.o_flags&GRE_CSUM) { - *ptr = 0; - *(__sum16 *)ptr = ip_compute_csum((void *)(ipv6h+1), - skb->len - sizeof(struct ipv6hdr)); - } - } + /* Push GRE header. */ + gre_build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags, + protocol, tunnel->parms.o_key, htonl(tunnel->o_seqno)); skb_set_inner_protocol(skb, protocol); - ip6tunnel_xmit(NULL, skb, dev); - return 0; -tx_err_link_failure: - stats->tx_carrier_errors++; - dst_link_failure(skb); -tx_err_dst_release: - dst_release(dst); - return err; + return ip6_tnl_xmit(skb, dev, dsfield, fl6, encap_limit, pmtu, + NEXTHDR_GRE); } static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev) @@ -783,7 +541,6 @@ static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev) encap_limit = t->parms.encap_limit; memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_GRE; dsfield = ipv4_get_dsfield(iph); @@ -793,7 +550,12 @@ static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev) if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6.flowi6_mark = skb->mark; - err = ip6gre_xmit2(skb, dev, dsfield, &fl6, encap_limit, &mtu); + err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM)); + if (err) + return -1; + + err = __gre6_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, + skb->protocol); if (err != 0) { /* XXX: send ICMP error even if DF is not set. */ if (err == -EMSGSIZE) @@ -833,7 +595,6 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev) encap_limit = t->parms.encap_limit; memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_GRE; dsfield = ipv6_get_dsfield(ipv6h); if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) @@ -843,7 +604,11 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev) if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6.flowi6_mark = skb->mark; - err = ip6gre_xmit2(skb, dev, dsfield, &fl6, encap_limit, &mtu); + if (gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM))) + return -1; + + err = __gre6_xmit(skb, dev, dsfield, &fl6, encap_limit, + &mtu, skb->protocol); if (err != 0) { if (err == -EMSGSIZE) icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); @@ -887,7 +652,11 @@ static int ip6gre_xmit_other(struct sk_buff *skb, struct net_device *dev) memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); fl6.flowi6_proto = skb->protocol; - err = ip6gre_xmit2(skb, dev, 0, &fl6, encap_limit, &mtu); + err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM)); + if (err) + return err; + + err = __gre6_xmit(skb, dev, 0, &fl6, encap_limit, &mtu, skb->protocol); return err; } @@ -931,7 +700,7 @@ static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu) struct net_device *dev = t->dev; struct __ip6_tnl_parm *p = &t->parms; struct flowi6 *fl6 = &t->fl.u.ip6; - int addend = sizeof(struct ipv6hdr) + 4; + int t_hlen; if (dev->type != ARPHRD_ETHER) { memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr)); @@ -943,6 +712,7 @@ static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu) fl6->daddr = p->raddr; fl6->flowi6_oif = p->link; fl6->flowlabel = 0; + fl6->flowi6_proto = IPPROTO_GRE; if (!(p->flags&IP6_TNL_F_USE_ORIG_TCLASS)) fl6->flowlabel |= IPV6_TCLASS_MASK & p->flowinfo; @@ -958,16 +728,11 @@ static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu) else dev->flags &= ~IFF_POINTOPOINT; - /* Precalculate GRE options length */ - if (t->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) { - if (t->parms.o_flags&GRE_CSUM) - addend += 4; - if (t->parms.o_flags&GRE_KEY) - addend += 4; - if (t->parms.o_flags&GRE_SEQ) - addend += 4; - } - t->hlen = addend; + t->tun_hlen = gre_calc_hlen(t->parms.o_flags); + + t->hlen = t->encap_hlen + t->tun_hlen; + + t_hlen = t->hlen + sizeof(struct ipv6hdr); if (p->flags & IP6_TNL_F_CAP_XMIT) { int strict = (ipv6_addr_type(&p->raddr) & @@ -981,12 +746,15 @@ static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu) return; if (rt->dst.dev) { - dev->hard_header_len = rt->dst.dev->hard_header_len + addend; + dev->hard_header_len = rt->dst.dev->hard_header_len + + t_hlen; if (set_mtu) { - dev->mtu = rt->dst.dev->mtu - addend; + dev->mtu = rt->dst.dev->mtu - t_hlen; if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) dev->mtu -= 8; + if (dev->type == ARPHRD_ETHER) + dev->mtu -= ETH_HLEN; if (dev->mtu < IPV6_MIN_MTU) dev->mtu = IPV6_MIN_MTU; @@ -1028,8 +796,8 @@ static void ip6gre_tnl_parm_from_user(struct __ip6_tnl_parm *p, p->link = u->link; p->i_key = u->i_key; p->o_key = u->o_key; - p->i_flags = u->i_flags; - p->o_flags = u->o_flags; + p->i_flags = gre_flags_to_tnl_flags(u->i_flags); + p->o_flags = gre_flags_to_tnl_flags(u->o_flags); memcpy(p->name, u->name, sizeof(u->name)); } @@ -1046,8 +814,8 @@ static void ip6gre_tnl_parm_to_user(struct ip6_tnl_parm2 *u, u->link = p->link; u->i_key = p->i_key; u->o_key = p->o_key; - u->i_flags = p->i_flags; - u->o_flags = p->o_flags; + u->i_flags = gre_tnl_flags_to_gre_flags(p->i_flags); + u->o_flags = gre_tnl_flags_to_gre_flags(p->o_flags); memcpy(u->name, p->name, sizeof(u->name)); } @@ -1061,6 +829,8 @@ static int ip6gre_tunnel_ioctl(struct net_device *dev, struct net *net = t->net; struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); + memset(&p1, 0, sizeof(p1)); + switch (cmd) { case SIOCGETTUNNEL: if (dev == ign->fb_tunnel_dev) { @@ -1160,15 +930,6 @@ done: return err; } -static int ip6gre_tunnel_change_mtu(struct net_device *dev, int new_mtu) -{ - if (new_mtu < 68 || - new_mtu > 0xFFF8 - dev->hard_header_len) - return -EINVAL; - dev->mtu = new_mtu; - return 0; -} - static int ip6gre_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned int len) @@ -1212,7 +973,7 @@ static const struct net_device_ops ip6gre_netdev_ops = { .ndo_uninit = ip6gre_tunnel_uninit, .ndo_start_xmit = ip6gre_tunnel_xmit, .ndo_do_ioctl = ip6gre_tunnel_ioctl, - .ndo_change_mtu = ip6gre_tunnel_change_mtu, + .ndo_change_mtu = ip6_tnl_change_mtu, .ndo_get_stats64 = ip_tunnel_get_stats64, .ndo_get_iflink = ip6_tnl_get_iflink, }; @@ -1228,17 +989,11 @@ static void ip6gre_dev_free(struct net_device *dev) static void ip6gre_tunnel_setup(struct net_device *dev) { - struct ip6_tnl *t; - dev->netdev_ops = &ip6gre_netdev_ops; dev->destructor = ip6gre_dev_free; dev->type = ARPHRD_IP6GRE; - dev->hard_header_len = LL_MAX_HEADER + sizeof(struct ipv6hdr) + 4; - dev->mtu = ETH_DATA_LEN - sizeof(struct ipv6hdr) - 4; - t = netdev_priv(dev); - if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) - dev->mtu -= 8; + dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); netif_keep_dst(dev); @@ -1248,6 +1003,7 @@ static int ip6gre_tunnel_init_common(struct net_device *dev) { struct ip6_tnl *tunnel; int ret; + int t_hlen; tunnel = netdev_priv(dev); @@ -1266,6 +1022,17 @@ static int ip6gre_tunnel_init_common(struct net_device *dev) return ret; } + tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags); + tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen; + t_hlen = tunnel->hlen + sizeof(struct ipv6hdr); + + dev->hard_header_len = LL_MAX_HEADER + t_hlen; + dev->mtu = ETH_DATA_LEN - t_hlen; + if (dev->type == ARPHRD_ETHER) + dev->mtu -= ETH_HLEN; + if (!(tunnel->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) + dev->mtu -= 8; + return 0; } @@ -1304,7 +1071,7 @@ static void ip6gre_fb_tunnel_init(struct net_device *dev) static struct inet6_protocol ip6gre_protocol __read_mostly = { - .handler = ip6gre_rcv, + .handler = gre_rcv, .err_handler = ip6gre_err, .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, }; @@ -1448,10 +1215,12 @@ static void ip6gre_netlink_parms(struct nlattr *data[], parms->link = nla_get_u32(data[IFLA_GRE_LINK]); if (data[IFLA_GRE_IFLAGS]) - parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]); + parms->i_flags = gre_flags_to_tnl_flags( + nla_get_be16(data[IFLA_GRE_IFLAGS])); if (data[IFLA_GRE_OFLAGS]) - parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]); + parms->o_flags = gre_flags_to_tnl_flags( + nla_get_be16(data[IFLA_GRE_OFLAGS])); if (data[IFLA_GRE_IKEY]) parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]); @@ -1487,6 +1256,8 @@ static int ip6gre_tap_init(struct net_device *dev) if (ret) return ret; + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + tunnel = netdev_priv(dev); ip6gre_tnl_link_config(tunnel, 1); @@ -1500,11 +1271,16 @@ static const struct net_device_ops ip6gre_tap_netdev_ops = { .ndo_start_xmit = ip6gre_tunnel_xmit, .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, - .ndo_change_mtu = ip6gre_tunnel_change_mtu, + .ndo_change_mtu = ip6_tnl_change_mtu, .ndo_get_stats64 = ip_tunnel_get_stats64, .ndo_get_iflink = ip6_tnl_get_iflink, }; +#define GRE6_FEATURES (NETIF_F_SG | \ + NETIF_F_FRAGLIST | \ + NETIF_F_HIGHDMA | \ + NETIF_F_HW_CSUM) + static void ip6gre_tap_setup(struct net_device *dev) { @@ -1515,6 +1291,40 @@ static void ip6gre_tap_setup(struct net_device *dev) dev->features |= NETIF_F_NETNS_LOCAL; dev->priv_flags &= ~IFF_TX_SKB_SHARING; + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; +} + +static bool ip6gre_netlink_encap_parms(struct nlattr *data[], + struct ip_tunnel_encap *ipencap) +{ + bool ret = false; + + memset(ipencap, 0, sizeof(*ipencap)); + + if (!data) + return ret; + + if (data[IFLA_GRE_ENCAP_TYPE]) { + ret = true; + ipencap->type = nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]); + } + + if (data[IFLA_GRE_ENCAP_FLAGS]) { + ret = true; + ipencap->flags = nla_get_u16(data[IFLA_GRE_ENCAP_FLAGS]); + } + + if (data[IFLA_GRE_ENCAP_SPORT]) { + ret = true; + ipencap->sport = nla_get_be16(data[IFLA_GRE_ENCAP_SPORT]); + } + + if (data[IFLA_GRE_ENCAP_DPORT]) { + ret = true; + ipencap->dport = nla_get_be16(data[IFLA_GRE_ENCAP_DPORT]); + } + + return ret; } static int ip6gre_newlink(struct net *src_net, struct net_device *dev, @@ -1523,9 +1333,18 @@ static int ip6gre_newlink(struct net *src_net, struct net_device *dev, struct ip6_tnl *nt; struct net *net = dev_net(dev); struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); + struct ip_tunnel_encap ipencap; int err; nt = netdev_priv(dev); + + if (ip6gre_netlink_encap_parms(data, &ipencap)) { + int err = ip6_tnl_encap_setup(nt, &ipencap); + + if (err < 0) + return err; + } + ip6gre_netlink_parms(data, &nt->parms); if (ip6gre_tunnel_find(net, &nt->parms, dev->type)) @@ -1538,9 +1357,25 @@ static int ip6gre_newlink(struct net *src_net, struct net_device *dev, nt->net = dev_net(dev); ip6gre_tnl_link_config(nt, !tb[IFLA_MTU]); - /* Can use a lockless transmit, unless we generate output sequences */ - if (!(nt->parms.o_flags & GRE_SEQ)) + dev->features |= GRE6_FEATURES; + dev->hw_features |= GRE6_FEATURES; + + if (!(nt->parms.o_flags & TUNNEL_SEQ)) { + /* TCP offload with GRE SEQ is not supported, nor + * can we support 2 levels of outer headers requiring + * an update. + */ + if (!(nt->parms.o_flags & TUNNEL_CSUM) || + (nt->encap.type == TUNNEL_ENCAP_NONE)) { + dev->features |= NETIF_F_GSO_SOFTWARE; + dev->hw_features |= NETIF_F_GSO_SOFTWARE; + } + + /* Can use a lockless transmit, unless we generate + * output sequences + */ dev->features |= NETIF_F_LLTX; + } err = register_netdevice(dev); if (err) @@ -1560,10 +1395,18 @@ static int ip6gre_changelink(struct net_device *dev, struct nlattr *tb[], struct net *net = nt->net; struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); struct __ip6_tnl_parm p; + struct ip_tunnel_encap ipencap; if (dev == ign->fb_tunnel_dev) return -EINVAL; + if (ip6gre_netlink_encap_parms(data, &ipencap)) { + int err = ip6_tnl_encap_setup(nt, &ipencap); + + if (err < 0) + return err; + } + ip6gre_netlink_parms(data, &p); t = ip6gre_tunnel_locate(net, &p, 0); @@ -1609,14 +1452,20 @@ static size_t ip6gre_get_size(const struct net_device *dev) nla_total_size(sizeof(struct in6_addr)) + /* IFLA_GRE_TTL */ nla_total_size(1) + - /* IFLA_GRE_TOS */ - nla_total_size(1) + /* IFLA_GRE_ENCAP_LIMIT */ nla_total_size(1) + /* IFLA_GRE_FLOWINFO */ nla_total_size(4) + /* IFLA_GRE_FLAGS */ nla_total_size(4) + + /* IFLA_GRE_ENCAP_TYPE */ + nla_total_size(2) + + /* IFLA_GRE_ENCAP_FLAGS */ + nla_total_size(2) + + /* IFLA_GRE_ENCAP_SPORT */ + nla_total_size(2) + + /* IFLA_GRE_ENCAP_DPORT */ + nla_total_size(2) + 0; } @@ -1626,18 +1475,30 @@ static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev) struct __ip6_tnl_parm *p = &t->parms; if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) || - nla_put_be16(skb, IFLA_GRE_IFLAGS, p->i_flags) || - nla_put_be16(skb, IFLA_GRE_OFLAGS, p->o_flags) || + nla_put_be16(skb, IFLA_GRE_IFLAGS, + gre_tnl_flags_to_gre_flags(p->i_flags)) || + nla_put_be16(skb, IFLA_GRE_OFLAGS, + gre_tnl_flags_to_gre_flags(p->o_flags)) || nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) || nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) || nla_put_in6_addr(skb, IFLA_GRE_LOCAL, &p->laddr) || nla_put_in6_addr(skb, IFLA_GRE_REMOTE, &p->raddr) || nla_put_u8(skb, IFLA_GRE_TTL, p->hop_limit) || - /*nla_put_u8(skb, IFLA_GRE_TOS, t->priority) ||*/ nla_put_u8(skb, IFLA_GRE_ENCAP_LIMIT, p->encap_limit) || nla_put_be32(skb, IFLA_GRE_FLOWINFO, p->flowinfo) || nla_put_u32(skb, IFLA_GRE_FLAGS, p->flags)) goto nla_put_failure; + + if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE, + t->encap.type) || + nla_put_be16(skb, IFLA_GRE_ENCAP_SPORT, + t->encap.sport) || + nla_put_be16(skb, IFLA_GRE_ENCAP_DPORT, + t->encap.dport) || + nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS, + t->encap.flags)) + goto nla_put_failure; + return 0; nla_put_failure: @@ -1656,6 +1517,10 @@ static const struct nla_policy ip6gre_policy[IFLA_GRE_MAX + 1] = { [IFLA_GRE_ENCAP_LIMIT] = { .type = NLA_U8 }, [IFLA_GRE_FLOWINFO] = { .type = NLA_U32 }, [IFLA_GRE_FLAGS] = { .type = NLA_U32 }, + [IFLA_GRE_ENCAP_TYPE] = { .type = NLA_U16 }, + [IFLA_GRE_ENCAP_FLAGS] = { .type = NLA_U16 }, + [IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 }, + [IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 }, }; static struct rtnl_link_ops ip6gre_link_ops __read_mostly = { diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index c05c425c2..94611e450 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -49,6 +49,13 @@ int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { + /* if ingress device is enslaved to an L3 master device pass the + * skb to its handler for processing + */ + skb = l3mdev_ip6_rcv(skb); + if (!skb) + return NET_RX_SUCCESS; + if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) { const struct inet6_protocol *ipprot; @@ -78,11 +85,11 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt idev = __in6_dev_get(skb->dev); - IP6_UPD_PO_STATS_BH(net, idev, IPSTATS_MIB_IN, skb->len); + __IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_IN, skb->len); if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL || !idev || unlikely(idev->cnf.disable_ipv6)) { - IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INDISCARDS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); goto drop; } @@ -109,10 +116,10 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt if (hdr->version != 6) goto err; - IP6_ADD_STATS_BH(net, idev, - IPSTATS_MIB_NOECTPKTS + + __IP6_ADD_STATS(net, idev, + IPSTATS_MIB_NOECTPKTS + (ipv6_get_dsfield(hdr) & INET_ECN_MASK), - max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs)); + max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs)); /* * RFC4291 2.5.3 * A packet received on an interface with a destination address @@ -169,12 +176,12 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt /* pkt_len may be zero if Jumbo payload option is present */ if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) { if (pkt_len + sizeof(struct ipv6hdr) > skb->len) { - IP6_INC_STATS_BH(net, - idev, IPSTATS_MIB_INTRUNCATEDPKTS); + __IP6_INC_STATS(net, + idev, IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; } if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr))) { - IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); goto drop; } hdr = ipv6_hdr(skb); @@ -182,7 +189,7 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt if (hdr->nexthdr == NEXTHDR_HOP) { if (ipv6_parse_hopopts(skb) < 0) { - IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); rcu_read_unlock(); return NET_RX_DROP; } @@ -197,7 +204,7 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt net, NULL, skb, dev, NULL, ip6_rcv_finish); err: - IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); drop: rcu_read_unlock(); kfree_skb(skb); @@ -216,6 +223,7 @@ static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *sk unsigned int nhoff; int nexthdr; bool raw; + bool have_final = false; /* * Parse extension headers @@ -229,14 +237,27 @@ resubmit: nhoff = IP6CB(skb)->nhoff; nexthdr = skb_network_header(skb)[nhoff]; +resubmit_final: raw = raw6_local_deliver(skb, nexthdr); ipprot = rcu_dereference(inet6_protos[nexthdr]); if (ipprot) { int ret; - if (ipprot->flags & INET6_PROTO_FINAL) { + if (have_final) { + if (!(ipprot->flags & INET6_PROTO_FINAL)) { + /* Once we've seen a final protocol don't + * allow encapsulation on any non-final + * ones. This allows foo in UDP encapsulation + * to work. + */ + goto discard; + } + } else if (ipprot->flags & INET6_PROTO_FINAL) { const struct ipv6hdr *hdr; + /* Only do this once for first final protocol */ + have_final = true; + /* Free reference early: we don't need it any more, and it may hold ip_conntrack module loaded indefinitely. */ @@ -256,21 +277,32 @@ resubmit: goto discard; ret = ipprot->handler(skb); - if (ret > 0) - goto resubmit; - else if (ret == 0) - IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INDELIVERS); + if (ret > 0) { + if (ipprot->flags & INET6_PROTO_FINAL) { + /* Not an extension header, most likely UDP + * encapsulation. Use return value as nexthdr + * protocol not nhoff (which presumably is + * not set by handler). + */ + nexthdr = ret; + goto resubmit_final; + } else { + goto resubmit; + } + } else if (ret == 0) { + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDELIVERS); + } } else { if (!raw) { if (xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { - IP6_INC_STATS_BH(net, idev, - IPSTATS_MIB_INUNKNOWNPROTOS); + __IP6_INC_STATS(net, idev, + IPSTATS_MIB_INUNKNOWNPROTOS); icmpv6_send(skb, ICMPV6_PARAMPROB, ICMPV6_UNK_NEXTHDR, nhoff); } kfree_skb(skb); } else { - IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INDELIVERS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDELIVERS); consume_skb(skb); } } @@ -278,7 +310,7 @@ resubmit: return 0; discard: - IP6_INC_STATS_BH(net, idev, IPSTATS_MIB_INDISCARDS); + __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); rcu_read_unlock(); kfree_skb(skb); return 0; @@ -297,7 +329,7 @@ int ip6_mc_input(struct sk_buff *skb) const struct ipv6hdr *hdr; bool deliver; - IP6_UPD_PO_STATS_BH(dev_net(skb_dst(skb)->dev), + __IP6_UPD_PO_STATS(dev_net(skb_dst(skb)->dev), ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INMCAST, skb->len); diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 82e9f3076..22e90e56b 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -16,6 +16,7 @@ #include <net/protocol.h> #include <net/ipv6.h> +#include <net/inet_common.h> #include "ip6_offload.h" @@ -63,27 +64,12 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, int proto; struct frag_hdr *fptr; unsigned int unfrag_ip6hlen; + unsigned int payload_len; u8 *prevhdr; int offset = 0; bool encap, udpfrag; int nhoff; - if (unlikely(skb_shinfo(skb)->gso_type & - ~(SKB_GSO_TCPV4 | - SKB_GSO_UDP | - SKB_GSO_DODGY | - SKB_GSO_TCP_ECN | - SKB_GSO_GRE | - SKB_GSO_GRE_CSUM | - SKB_GSO_IPIP | - SKB_GSO_SIT | - SKB_GSO_UDP_TUNNEL | - SKB_GSO_UDP_TUNNEL_CSUM | - SKB_GSO_TUNNEL_REMCSUM | - SKB_GSO_TCPV6 | - 0))) - goto out; - skb_reset_network_header(skb); nhoff = skb_network_header(skb) - skb_mac_header(skb); if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h)))) @@ -101,7 +87,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, proto = ipv6_gso_pull_exthdrs(skb, ipv6h->nexthdr); if (skb->encapsulation && - skb_shinfo(skb)->gso_type & (SKB_GSO_SIT|SKB_GSO_IPIP)) + skb_shinfo(skb)->gso_type & (SKB_GSO_IPXIP4 | SKB_GSO_IPXIP6)) udpfrag = proto == IPPROTO_UDP && encap; else udpfrag = proto == IPPROTO_UDP && !skb->encapsulation; @@ -117,7 +103,13 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, for (skb = segs; skb; skb = skb->next) { ipv6h = (struct ipv6hdr *)(skb_mac_header(skb) + nhoff); - ipv6h->payload_len = htons(skb->len - nhoff - sizeof(*ipv6h)); + if (skb_is_gso(skb)) + payload_len = skb_shinfo(skb)->gso_size + + SKB_GSO_CB(skb)->data_offset + + skb->head - (unsigned char *)(ipv6h + 1); + else + payload_len = skb->len - nhoff - sizeof(*ipv6h); + ipv6h->payload_len = htons(payload_len); skb->network_header = (u8 *)ipv6h - skb->head; if (udpfrag) { @@ -239,10 +231,14 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, NAPI_GRO_CB(p)->flush |= !!(first_word & htonl(0x0FF00000)); NAPI_GRO_CB(p)->flush |= flush; - /* Clear flush_id, there's really no concept of ID in IPv6. */ - NAPI_GRO_CB(p)->flush_id = 0; + /* If the previous IP ID value was based on an atomic + * datagram we can overwrite the value and ignore it. + */ + if (NAPI_GRO_CB(skb)->is_atomic) + NAPI_GRO_CB(p)->flush_id = 0; } + NAPI_GRO_CB(skb)->is_atomic = true; NAPI_GRO_CB(skb)->flush |= flush; skb_gro_postpull_rcsum(skb, iph, nlen); @@ -258,9 +254,11 @@ out: return pp; } -static struct sk_buff **sit_gro_receive(struct sk_buff **head, - struct sk_buff *skb) +static struct sk_buff **sit_ip6ip6_gro_receive(struct sk_buff **head, + struct sk_buff *skb) { + /* Common GRO receive for SIT and IP6IP6 */ + if (NAPI_GRO_CB(skb)->encap_mark) { NAPI_GRO_CB(skb)->flush = 1; return NULL; @@ -271,6 +269,21 @@ static struct sk_buff **sit_gro_receive(struct sk_buff **head, return ipv6_gro_receive(head, skb); } +static struct sk_buff **ip4ip6_gro_receive(struct sk_buff **head, + struct sk_buff *skb) +{ + /* Common GRO receive for SIT and IP6IP6 */ + + if (NAPI_GRO_CB(skb)->encap_mark) { + NAPI_GRO_CB(skb)->flush = 1; + return NULL; + } + + NAPI_GRO_CB(skb)->encap_mark = 1; + + return inet_gro_receive(head, skb); +} + static int ipv6_gro_complete(struct sk_buff *skb, int nhoff) { const struct net_offload *ops; @@ -299,10 +312,24 @@ out_unlock: static int sit_gro_complete(struct sk_buff *skb, int nhoff) { skb->encapsulation = 1; - skb_shinfo(skb)->gso_type |= SKB_GSO_SIT; + skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP4; + return ipv6_gro_complete(skb, nhoff); +} + +static int ip6ip6_gro_complete(struct sk_buff *skb, int nhoff) +{ + skb->encapsulation = 1; + skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP6; return ipv6_gro_complete(skb, nhoff); } +static int ip4ip6_gro_complete(struct sk_buff *skb, int nhoff) +{ + skb->encapsulation = 1; + skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP6; + return inet_gro_complete(skb, nhoff); +} + static struct packet_offload ipv6_packet_offload __read_mostly = { .type = cpu_to_be16(ETH_P_IPV6), .callbacks = { @@ -315,24 +342,39 @@ static struct packet_offload ipv6_packet_offload __read_mostly = { static const struct net_offload sit_offload = { .callbacks = { .gso_segment = ipv6_gso_segment, - .gro_receive = sit_gro_receive, + .gro_receive = sit_ip6ip6_gro_receive, .gro_complete = sit_gro_complete, }, }; +static const struct net_offload ip4ip6_offload = { + .callbacks = { + .gso_segment = inet_gso_segment, + .gro_receive = ip4ip6_gro_receive, + .gro_complete = ip4ip6_gro_complete, + }, +}; + +static const struct net_offload ip6ip6_offload = { + .callbacks = { + .gso_segment = ipv6_gso_segment, + .gro_receive = sit_ip6ip6_gro_receive, + .gro_complete = ip6ip6_gro_complete, + }, +}; static int __init ipv6_offload_init(void) { if (tcpv6_offload_init() < 0) pr_crit("%s: Cannot add TCP protocol offload\n", __func__); - if (udp_offload_init() < 0) - pr_crit("%s: Cannot add UDP protocol offload\n", __func__); if (ipv6_exthdrs_offload_init() < 0) pr_crit("%s: Cannot add EXTHDRS protocol offload\n", __func__); dev_add_offload(&ipv6_packet_offload); inet_add_offload(&sit_offload, IPPROTO_IPV6); + inet6_add_offload(&ip6ip6_offload, IPPROTO_IPV6); + inet6_add_offload(&ip4ip6_offload, IPPROTO_IPIP); return 0; } diff --git a/net/ipv6/ip6_offload.h b/net/ipv6/ip6_offload.h index 2e155c651..96b40e41a 100644 --- a/net/ipv6/ip6_offload.h +++ b/net/ipv6/ip6_offload.h @@ -12,7 +12,8 @@ #define __ip6_offload_h int ipv6_exthdrs_offload_init(void); -int udp_offload_init(void); +int udpv6_offload_init(void); +int udpv6_offload_exit(void); int tcpv6_offload_init(void); #endif diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index da88de82b..635b8d340 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -395,8 +395,8 @@ int ip6_forward(struct sk_buff *skb) goto drop; if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) { - IP6_INC_STATS_BH(net, ip6_dst_idev(dst), - IPSTATS_MIB_INDISCARDS); + __IP6_INC_STATS(net, ip6_dst_idev(dst), + IPSTATS_MIB_INDISCARDS); goto drop; } @@ -427,8 +427,8 @@ int ip6_forward(struct sk_buff *skb) /* Force OUTPUT device used as source address */ skb->dev = dst->dev; icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); - IP6_INC_STATS_BH(net, ip6_dst_idev(dst), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, ip6_dst_idev(dst), + IPSTATS_MIB_INHDRERRORS); kfree_skb(skb); return -ETIMEDOUT; @@ -441,15 +441,15 @@ int ip6_forward(struct sk_buff *skb) if (proxied > 0) return ip6_input(skb); else if (proxied < 0) { - IP6_INC_STATS_BH(net, ip6_dst_idev(dst), - IPSTATS_MIB_INDISCARDS); + __IP6_INC_STATS(net, ip6_dst_idev(dst), + IPSTATS_MIB_INDISCARDS); goto drop; } } if (!xfrm6_route_forward(skb)) { - IP6_INC_STATS_BH(net, ip6_dst_idev(dst), - IPSTATS_MIB_INDISCARDS); + __IP6_INC_STATS(net, ip6_dst_idev(dst), + IPSTATS_MIB_INDISCARDS); goto drop; } dst = skb_dst(skb); @@ -505,17 +505,17 @@ int ip6_forward(struct sk_buff *skb) /* Again, force OUTPUT device used as source address */ skb->dev = dst->dev; icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - IP6_INC_STATS_BH(net, ip6_dst_idev(dst), - IPSTATS_MIB_INTOOBIGERRORS); - IP6_INC_STATS_BH(net, ip6_dst_idev(dst), - IPSTATS_MIB_FRAGFAILS); + __IP6_INC_STATS(net, ip6_dst_idev(dst), + IPSTATS_MIB_INTOOBIGERRORS); + __IP6_INC_STATS(net, ip6_dst_idev(dst), + IPSTATS_MIB_FRAGFAILS); kfree_skb(skb); return -EMSGSIZE; } if (skb_cow(skb, dst->dev->hard_header_len)) { - IP6_INC_STATS_BH(net, ip6_dst_idev(dst), - IPSTATS_MIB_OUTDISCARDS); + __IP6_INC_STATS(net, ip6_dst_idev(dst), + IPSTATS_MIB_OUTDISCARDS); goto drop; } @@ -525,14 +525,14 @@ int ip6_forward(struct sk_buff *skb) hdr->hop_limit--; - IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS); - IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len); + __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS); + __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len); return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, net, NULL, skb, skb->dev, dst->dev, ip6_forward_finish); error: - IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS); + __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS); drop: kfree_skb(skb); return -EINVAL; @@ -1177,12 +1177,12 @@ static void ip6_append_data_mtu(unsigned int *mtu, } static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, - struct inet6_cork *v6_cork, - int hlimit, int tclass, struct ipv6_txoptions *opt, + struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6, struct rt6_info *rt, struct flowi6 *fl6) { struct ipv6_pinfo *np = inet6_sk(sk); unsigned int mtu; + struct ipv6_txoptions *opt = ipc6->opt; /* * setup for corking @@ -1224,8 +1224,8 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, dst_hold(&rt->dst); cork->base.dst = &rt->dst; cork->fl.u.ip6 = *fl6; - v6_cork->hop_limit = hlimit; - v6_cork->tclass = tclass; + v6_cork->hop_limit = ipc6->hlimit; + v6_cork->tclass = ipc6->tclass; if (rt->dst.flags & DST_XFRM_TUNNEL) mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? rt->dst.dev->mtu : dst_mtu(&rt->dst); @@ -1253,7 +1253,8 @@ static int __ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, - unsigned int flags, int dontfrag) + unsigned int flags, struct ipcm6_cookie *ipc6, + const struct sockcm_cookie *sockc) { struct sk_buff *skb, *skb_prev = NULL; unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu; @@ -1292,7 +1293,7 @@ static int __ip6_append_data(struct sock *sk, sizeof(struct frag_hdr) : 0) + rt->rt6i_nfheader_len; - if (cork->length + length > mtu - headersize && dontfrag && + if (cork->length + length > mtu - headersize && ipc6->dontfrag && (sk->sk_protocol == IPPROTO_UDP || sk->sk_protocol == IPPROTO_RAW)) { ipv6_local_rxpmtu(sk, fl6, mtu - headersize + @@ -1324,7 +1325,7 @@ emsgsize: csummode = CHECKSUM_PARTIAL; if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) { - sock_tx_timestamp(sk, &tx_flags); + sock_tx_timestamp(sk, sockc->tsflags, &tx_flags); if (tx_flags & SKBTX_ANY_SW_TSTAMP && sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) tskey = sk->sk_tskey++; @@ -1558,9 +1559,10 @@ error: int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), - void *from, int length, int transhdrlen, int hlimit, - int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6, - struct rt6_info *rt, unsigned int flags, int dontfrag) + void *from, int length, int transhdrlen, + struct ipcm6_cookie *ipc6, struct flowi6 *fl6, + struct rt6_info *rt, unsigned int flags, + const struct sockcm_cookie *sockc) { struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); @@ -1573,12 +1575,12 @@ int ip6_append_data(struct sock *sk, /* * setup for corking */ - err = ip6_setup_cork(sk, &inet->cork, &np->cork, hlimit, - tclass, opt, rt, fl6); + err = ip6_setup_cork(sk, &inet->cork, &np->cork, + ipc6, rt, fl6); if (err) return err; - exthdrlen = (opt ? opt->opt_flen : 0); + exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0); length += exthdrlen; transhdrlen += exthdrlen; } else { @@ -1588,7 +1590,7 @@ int ip6_append_data(struct sock *sk, return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base, &np->cork, sk_page_frag(sk), getfrag, - from, length, transhdrlen, flags, dontfrag); + from, length, transhdrlen, flags, ipc6, sockc); } EXPORT_SYMBOL_GPL(ip6_append_data); @@ -1744,15 +1746,14 @@ struct sk_buff *ip6_make_skb(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, - int hlimit, int tclass, - struct ipv6_txoptions *opt, struct flowi6 *fl6, + struct ipcm6_cookie *ipc6, struct flowi6 *fl6, struct rt6_info *rt, unsigned int flags, - int dontfrag) + const struct sockcm_cookie *sockc) { struct inet_cork_full cork; struct inet6_cork v6_cork; struct sk_buff_head queue; - int exthdrlen = (opt ? opt->opt_flen : 0); + int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0); int err; if (flags & MSG_PROBE) @@ -1764,17 +1765,17 @@ struct sk_buff *ip6_make_skb(struct sock *sk, cork.base.addr = 0; cork.base.opt = NULL; v6_cork.opt = NULL; - err = ip6_setup_cork(sk, &cork, &v6_cork, hlimit, tclass, opt, rt, fl6); + err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6); if (err) return ERR_PTR(err); - if (dontfrag < 0) - dontfrag = inet6_sk(sk)->dontfrag; + if (ipc6->dontfrag < 0) + ipc6->dontfrag = inet6_sk(sk)->dontfrag; err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork, ¤t->task_frag, getfrag, from, length + exthdrlen, transhdrlen + exthdrlen, - flags, dontfrag); + flags, ipc6, sockc); if (err) { __ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork); return ERR_PTR(err); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 1f20345cb..7b0481e37 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -238,6 +238,7 @@ static void ip6_dev_free(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); + gro_cells_destroy(&t->gro_cells); dst_cache_destroy(&t->dst_cache); free_percpu(dev->tstats); free_netdev(dev); @@ -753,97 +754,157 @@ int ip6_tnl_rcv_ctl(struct ip6_tnl *t, } EXPORT_SYMBOL_GPL(ip6_tnl_rcv_ctl); -/** - * ip6_tnl_rcv - decapsulate IPv6 packet and retransmit it locally - * @skb: received socket buffer - * @protocol: ethernet protocol ID - * @dscp_ecn_decapsulate: the function to decapsulate DSCP code and ECN - * - * Return: 0 - **/ - -static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol, - __u8 ipproto, - int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t, - const struct ipv6hdr *ipv6h, - struct sk_buff *skb)) +static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb, + const struct tnl_ptk_info *tpi, + struct metadata_dst *tun_dst, + int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t, + const struct ipv6hdr *ipv6h, + struct sk_buff *skb), + bool log_ecn_err) { - struct ip6_tnl *t; + struct pcpu_sw_netstats *tstats; const struct ipv6hdr *ipv6h = ipv6_hdr(skb); - u8 tproto; int err; - rcu_read_lock(); - t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr, &ipv6h->daddr); - if (t) { - struct pcpu_sw_netstats *tstats; + if ((!(tpi->flags & TUNNEL_CSUM) && + (tunnel->parms.i_flags & TUNNEL_CSUM)) || + ((tpi->flags & TUNNEL_CSUM) && + !(tunnel->parms.i_flags & TUNNEL_CSUM))) { + tunnel->dev->stats.rx_crc_errors++; + tunnel->dev->stats.rx_errors++; + goto drop; + } - tproto = ACCESS_ONCE(t->parms.proto); - if (tproto != ipproto && tproto != 0) { - rcu_read_unlock(); - goto discard; + if (tunnel->parms.i_flags & TUNNEL_SEQ) { + if (!(tpi->flags & TUNNEL_SEQ) || + (tunnel->i_seqno && + (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) { + tunnel->dev->stats.rx_fifo_errors++; + tunnel->dev->stats.rx_errors++; + goto drop; } + tunnel->i_seqno = ntohl(tpi->seq) + 1; + } - if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { - rcu_read_unlock(); - goto discard; - } + skb->protocol = tpi->proto; - if (!ip6_tnl_rcv_ctl(t, &ipv6h->daddr, &ipv6h->saddr)) { - t->dev->stats.rx_dropped++; - rcu_read_unlock(); - goto discard; + /* Warning: All skb pointers will be invalidated! */ + if (tunnel->dev->type == ARPHRD_ETHER) { + if (!pskb_may_pull(skb, ETH_HLEN)) { + tunnel->dev->stats.rx_length_errors++; + tunnel->dev->stats.rx_errors++; + goto drop; } - skb->mac_header = skb->network_header; - skb_reset_network_header(skb); - skb->protocol = htons(protocol); - memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); - - __skb_tunnel_rx(skb, t->dev, t->net); - - err = dscp_ecn_decapsulate(t, ipv6h, skb); - if (unlikely(err)) { - if (log_ecn_error) - net_info_ratelimited("non-ECT from %pI6 with dsfield=%#x\n", - &ipv6h->saddr, - ipv6_get_dsfield(ipv6h)); - if (err > 1) { - ++t->dev->stats.rx_frame_errors; - ++t->dev->stats.rx_errors; - rcu_read_unlock(); - goto discard; - } + + ipv6h = ipv6_hdr(skb); + skb->protocol = eth_type_trans(skb, tunnel->dev); + skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); + } else { + skb->dev = tunnel->dev; + } + + skb_reset_network_header(skb); + memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); + + __skb_tunnel_rx(skb, tunnel->dev, tunnel->net); + + err = dscp_ecn_decapsulate(tunnel, ipv6h, skb); + if (unlikely(err)) { + if (log_ecn_err) + net_info_ratelimited("non-ECT from %pI6 with DS=%#x\n", + &ipv6h->saddr, + ipv6_get_dsfield(ipv6h)); + if (err > 1) { + ++tunnel->dev->stats.rx_frame_errors; + ++tunnel->dev->stats.rx_errors; + goto drop; } + } - tstats = this_cpu_ptr(t->dev->tstats); - u64_stats_update_begin(&tstats->syncp); - tstats->rx_packets++; - tstats->rx_bytes += skb->len; - u64_stats_update_end(&tstats->syncp); + tstats = this_cpu_ptr(tunnel->dev->tstats); + u64_stats_update_begin(&tstats->syncp); + tstats->rx_packets++; + tstats->rx_bytes += skb->len; + u64_stats_update_end(&tstats->syncp); - netif_rx(skb); + skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev))); - rcu_read_unlock(); - return 0; + gro_cells_receive(&tunnel->gro_cells, skb); + return 0; + +drop: + kfree_skb(skb); + return 0; +} + +int ip6_tnl_rcv(struct ip6_tnl *t, struct sk_buff *skb, + const struct tnl_ptk_info *tpi, + struct metadata_dst *tun_dst, + bool log_ecn_err) +{ + return __ip6_tnl_rcv(t, skb, tpi, NULL, ip6ip6_dscp_ecn_decapsulate, + log_ecn_err); +} +EXPORT_SYMBOL(ip6_tnl_rcv); + +static const struct tnl_ptk_info tpi_v6 = { + /* no tunnel info required for ipxip6. */ + .proto = htons(ETH_P_IPV6), +}; + +static const struct tnl_ptk_info tpi_v4 = { + /* no tunnel info required for ipxip6. */ + .proto = htons(ETH_P_IP), +}; + +static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto, + const struct tnl_ptk_info *tpi, + int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t, + const struct ipv6hdr *ipv6h, + struct sk_buff *skb)) +{ + struct ip6_tnl *t; + const struct ipv6hdr *ipv6h = ipv6_hdr(skb); + int ret = -1; + + rcu_read_lock(); + t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr, &ipv6h->daddr); + + if (t) { + u8 tproto = ACCESS_ONCE(t->parms.proto); + + if (tproto != ipproto && tproto != 0) + goto drop; + if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto drop; + if (!ip6_tnl_rcv_ctl(t, &ipv6h->daddr, &ipv6h->saddr)) + goto drop; + if (iptunnel_pull_header(skb, 0, tpi->proto, false)) + goto drop; + ret = __ip6_tnl_rcv(t, skb, tpi, NULL, dscp_ecn_decapsulate, + log_ecn_error); } + rcu_read_unlock(); - return 1; -discard: + return ret; + +drop: + rcu_read_unlock(); kfree_skb(skb); return 0; } static int ip4ip6_rcv(struct sk_buff *skb) { - return ip6_tnl_rcv(skb, ETH_P_IP, IPPROTO_IPIP, - ip4ip6_dscp_ecn_decapsulate); + return ipxip6_rcv(skb, IPPROTO_IPIP, &tpi_v4, + ip4ip6_dscp_ecn_decapsulate); } static int ip6ip6_rcv(struct sk_buff *skb) { - return ip6_tnl_rcv(skb, ETH_P_IPV6, IPPROTO_IPV6, - ip6ip6_dscp_ecn_decapsulate); + return ipxip6_rcv(skb, IPPROTO_IPV6, &tpi_v6, + ip6ip6_dscp_ecn_decapsulate); } struct ipv6_tel_txoption { @@ -918,13 +979,14 @@ int ip6_tnl_xmit_ctl(struct ip6_tnl *t, EXPORT_SYMBOL_GPL(ip6_tnl_xmit_ctl); /** - * ip6_tnl_xmit2 - encapsulate packet and send + * ip6_tnl_xmit - encapsulate packet and send * @skb: the outgoing socket buffer * @dev: the outgoing tunnel device * @dsfield: dscp code for outer header - * @fl: flow of tunneled packet + * @fl6: flow of tunneled packet * @encap_limit: encapsulation limit * @pmtu: Path MTU is stored if packet is too big + * @proto: next header value * * Description: * Build new header and do some sanity checks on the packet before sending @@ -936,12 +998,9 @@ EXPORT_SYMBOL_GPL(ip6_tnl_xmit_ctl); * %-EMSGSIZE message too big. return mtu in this case. **/ -static int ip6_tnl_xmit2(struct sk_buff *skb, - struct net_device *dev, - __u8 dsfield, - struct flowi6 *fl6, - int encap_limit, - __u32 *pmtu) +int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, + struct flowi6 *fl6, int encap_limit, __u32 *pmtu, + __u8 proto) { struct ip6_tnl *t = netdev_priv(dev); struct net *net = t->net; @@ -951,8 +1010,8 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, struct dst_entry *dst = NULL, *ndst = NULL; struct net_device *tdev; int mtu; - unsigned int max_headroom = sizeof(struct ipv6hdr); - u8 proto; + unsigned int psh_hlen = sizeof(struct ipv6hdr) + t->encap_hlen; + unsigned int max_headroom = psh_hlen; int err = -1; /* NBMA tunnel */ @@ -1005,7 +1064,7 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, t->parms.name); goto tx_err_dst_release; } - mtu = dst_mtu(dst) - sizeof(*ipv6h); + mtu = dst_mtu(dst) - psh_hlen; if (encap_limit >= 0) { max_headroom += 8; mtu -= 8; @@ -1014,12 +1073,23 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, mtu = IPV6_MIN_MTU; if (skb_dst(skb)) skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); - if (skb->len > mtu) { + if (skb->len > mtu && !skb_is_gso(skb)) { *pmtu = mtu; err = -EMSGSIZE; goto tx_err_dst_release; } + if (t->err_count > 0) { + if (time_before(jiffies, + t->err_time + IP6TUNNEL_ERR_TIMEO)) { + t->err_count--; + + dst_link_failure(skb); + } else { + t->err_count = 0; + } + } + skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev))); /* @@ -1045,18 +1115,22 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr); skb_dst_set(skb, dst); - skb->transport_header = skb->network_header; - - proto = fl6->flowi6_proto; if (encap_limit >= 0) { init_tel_txopt(&opt, encap_limit); ipv6_push_nfrag_opts(skb, &opt.ops, &proto, NULL); } - if (likely(!skb->encapsulation)) { - skb_reset_inner_headers(skb); - skb->encapsulation = 1; - } + /* Calculate max headroom for all the headers and adjust + * needed_headroom if necessary. + */ + max_headroom = LL_RESERVED_SPACE(dst->dev) + sizeof(struct ipv6hdr) + + dst->header_len + t->hlen; + if (max_headroom > dev->needed_headroom) + dev->needed_headroom = max_headroom; + + err = ip6_tnl_encap(skb, t, &proto, fl6); + if (err) + return err; skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); @@ -1076,6 +1150,7 @@ tx_err_dst_release: dst_release(dst); return err; } +EXPORT_SYMBOL(ip6_tnl_xmit); static inline int ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) @@ -1099,7 +1174,6 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) encap_limit = t->parms.encap_limit; memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_IPIP; dsfield = ipv4_get_dsfield(iph); @@ -1109,7 +1183,13 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6.flowi6_mark = skb->mark; - err = ip6_tnl_xmit2(skb, dev, dsfield, &fl6, encap_limit, &mtu); + if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6)) + return -1; + + skb_set_inner_ipproto(skb, IPPROTO_IPIP); + + err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, + IPPROTO_IPIP); if (err != 0) { /* XXX: send ICMP error even if DF is not set. */ if (err == -EMSGSIZE) @@ -1153,7 +1233,6 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) encap_limit = t->parms.encap_limit; memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_IPV6; dsfield = ipv6_get_dsfield(ipv6h); if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) @@ -1163,7 +1242,13 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6.flowi6_mark = skb->mark; - err = ip6_tnl_xmit2(skb, dev, dsfield, &fl6, encap_limit, &mtu); + if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6)) + return -1; + + skb_set_inner_ipproto(skb, IPPROTO_IPV6); + + err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, + IPPROTO_IPV6); if (err != 0) { if (err == -EMSGSIZE) icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); @@ -1174,7 +1259,7 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) } static netdev_tx_t -ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) +ip6_tnl_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct net_device_stats *stats = &t->dev->stats; @@ -1208,6 +1293,7 @@ static void ip6_tnl_link_config(struct ip6_tnl *t) struct net_device *dev = t->dev; struct __ip6_tnl_parm *p = &t->parms; struct flowi6 *fl6 = &t->fl.u.ip6; + int t_hlen; memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr)); memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr)); @@ -1231,6 +1317,10 @@ static void ip6_tnl_link_config(struct ip6_tnl *t) else dev->flags &= ~IFF_POINTOPOINT; + t->tun_hlen = 0; + t->hlen = t->encap_hlen + t->tun_hlen; + t_hlen = t->hlen + sizeof(struct ipv6hdr); + if (p->flags & IP6_TNL_F_CAP_XMIT) { int strict = (ipv6_addr_type(&p->raddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL)); @@ -1244,9 +1334,9 @@ static void ip6_tnl_link_config(struct ip6_tnl *t) if (rt->dst.dev) { dev->hard_header_len = rt->dst.dev->hard_header_len + - sizeof(struct ipv6hdr); + t_hlen; - dev->mtu = rt->dst.dev->mtu - sizeof(struct ipv6hdr); + dev->mtu = rt->dst.dev->mtu - t_hlen; if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) dev->mtu -= 8; @@ -1370,6 +1460,8 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + memset(&p1, 0, sizeof(p1)); + switch (cmd) { case SIOCGETTUNNEL: if (dev == ip6n->fb_tnl_dev) { @@ -1464,8 +1556,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) * %-EINVAL if mtu too small **/ -static int -ip6_tnl_change_mtu(struct net_device *dev, int new_mtu) +int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu) { struct ip6_tnl *tnl = netdev_priv(dev); @@ -1481,6 +1572,7 @@ ip6_tnl_change_mtu(struct net_device *dev, int new_mtu) dev->mtu = new_mtu; return 0; } +EXPORT_SYMBOL(ip6_tnl_change_mtu); int ip6_tnl_get_iflink(const struct net_device *dev) { @@ -1490,16 +1582,74 @@ int ip6_tnl_get_iflink(const struct net_device *dev) } EXPORT_SYMBOL(ip6_tnl_get_iflink); +int ip6_tnl_encap_add_ops(const struct ip6_tnl_encap_ops *ops, + unsigned int num) +{ + if (num >= MAX_IPTUN_ENCAP_OPS) + return -ERANGE; + + return !cmpxchg((const struct ip6_tnl_encap_ops **) + &ip6tun_encaps[num], + NULL, ops) ? 0 : -1; +} +EXPORT_SYMBOL(ip6_tnl_encap_add_ops); + +int ip6_tnl_encap_del_ops(const struct ip6_tnl_encap_ops *ops, + unsigned int num) +{ + int ret; + + if (num >= MAX_IPTUN_ENCAP_OPS) + return -ERANGE; + + ret = (cmpxchg((const struct ip6_tnl_encap_ops **) + &ip6tun_encaps[num], + ops, NULL) == ops) ? 0 : -1; + + synchronize_net(); + + return ret; +} +EXPORT_SYMBOL(ip6_tnl_encap_del_ops); + +int ip6_tnl_encap_setup(struct ip6_tnl *t, + struct ip_tunnel_encap *ipencap) +{ + int hlen; + + memset(&t->encap, 0, sizeof(t->encap)); + + hlen = ip6_encap_hlen(ipencap); + if (hlen < 0) + return hlen; + + t->encap.type = ipencap->type; + t->encap.sport = ipencap->sport; + t->encap.dport = ipencap->dport; + t->encap.flags = ipencap->flags; + + t->encap_hlen = hlen; + t->hlen = t->encap_hlen + t->tun_hlen; + + return 0; +} +EXPORT_SYMBOL_GPL(ip6_tnl_encap_setup); + static const struct net_device_ops ip6_tnl_netdev_ops = { .ndo_init = ip6_tnl_dev_init, .ndo_uninit = ip6_tnl_dev_uninit, - .ndo_start_xmit = ip6_tnl_xmit, + .ndo_start_xmit = ip6_tnl_start_xmit, .ndo_do_ioctl = ip6_tnl_ioctl, .ndo_change_mtu = ip6_tnl_change_mtu, .ndo_get_stats = ip6_get_stats, .ndo_get_iflink = ip6_tnl_get_iflink, }; +#define IPXIPX_FEATURES (NETIF_F_SG | \ + NETIF_F_FRAGLIST | \ + NETIF_F_HIGHDMA | \ + NETIF_F_GSO_SOFTWARE | \ + NETIF_F_HW_CSUM) /** * ip6_tnl_dev_setup - setup virtual tunnel device @@ -1511,20 +1661,18 @@ static const struct net_device_ops ip6_tnl_netdev_ops = { static void ip6_tnl_dev_setup(struct net_device *dev) { - struct ip6_tnl *t; - dev->netdev_ops = &ip6_tnl_netdev_ops; dev->destructor = ip6_dev_free; dev->type = ARPHRD_TUNNEL6; - dev->hard_header_len = LL_MAX_HEADER + sizeof(struct ipv6hdr); - dev->mtu = ETH_DATA_LEN - sizeof(struct ipv6hdr); - t = netdev_priv(dev); - if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) - dev->mtu -= 8; dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); + dev->features |= NETIF_F_LLTX; netif_keep_dst(dev); + + dev->features |= IPXIPX_FEATURES; + dev->hw_features |= IPXIPX_FEATURES; + /* This perm addr will be used as interface identifier by IPv6 */ dev->addr_assign_type = NET_ADDR_RANDOM; eth_random_addr(dev->perm_addr); @@ -1541,6 +1689,7 @@ ip6_tnl_dev_init_gen(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); int ret; + int t_hlen; t->dev = dev; t->net = dev_net(dev); @@ -1549,13 +1698,32 @@ ip6_tnl_dev_init_gen(struct net_device *dev) return -ENOMEM; ret = dst_cache_init(&t->dst_cache, GFP_KERNEL); - if (ret) { - free_percpu(dev->tstats); - dev->tstats = NULL; - return ret; - } + if (ret) + goto free_stats; + + ret = gro_cells_init(&t->gro_cells, dev); + if (ret) + goto destroy_dst; + + t->tun_hlen = 0; + t->hlen = t->encap_hlen + t->tun_hlen; + t_hlen = t->hlen + sizeof(struct ipv6hdr); + + dev->type = ARPHRD_TUNNEL6; + dev->hard_header_len = LL_MAX_HEADER + t_hlen; + dev->mtu = ETH_DATA_LEN - t_hlen; + if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) + dev->mtu -= 8; return 0; + +destroy_dst: + dst_cache_destroy(&t->dst_cache); +free_stats: + free_percpu(dev->tstats); + dev->tstats = NULL; + + return ret; } /** @@ -1643,13 +1811,55 @@ static void ip6_tnl_netlink_parms(struct nlattr *data[], parms->proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); } +static bool ip6_tnl_netlink_encap_parms(struct nlattr *data[], + struct ip_tunnel_encap *ipencap) +{ + bool ret = false; + + memset(ipencap, 0, sizeof(*ipencap)); + + if (!data) + return ret; + + if (data[IFLA_IPTUN_ENCAP_TYPE]) { + ret = true; + ipencap->type = nla_get_u16(data[IFLA_IPTUN_ENCAP_TYPE]); + } + + if (data[IFLA_IPTUN_ENCAP_FLAGS]) { + ret = true; + ipencap->flags = nla_get_u16(data[IFLA_IPTUN_ENCAP_FLAGS]); + } + + if (data[IFLA_IPTUN_ENCAP_SPORT]) { + ret = true; + ipencap->sport = nla_get_be16(data[IFLA_IPTUN_ENCAP_SPORT]); + } + + if (data[IFLA_IPTUN_ENCAP_DPORT]) { + ret = true; + ipencap->dport = nla_get_be16(data[IFLA_IPTUN_ENCAP_DPORT]); + } + + return ret; +} + static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { struct net *net = dev_net(dev); struct ip6_tnl *nt, *t; + struct ip_tunnel_encap ipencap; nt = netdev_priv(dev); + + if (ip6_tnl_netlink_encap_parms(data, &ipencap)) { + int err = ip6_tnl_encap_setup(nt, &ipencap); + + if (err < 0) + return err; + } + ip6_tnl_netlink_parms(data, &nt->parms); t = ip6_tnl_locate(net, &nt->parms, 0); @@ -1666,10 +1876,17 @@ static int ip6_tnl_changelink(struct net_device *dev, struct nlattr *tb[], struct __ip6_tnl_parm p; struct net *net = t->net; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + struct ip_tunnel_encap ipencap; if (dev == ip6n->fb_tnl_dev) return -EINVAL; + if (ip6_tnl_netlink_encap_parms(data, &ipencap)) { + int err = ip6_tnl_encap_setup(t, &ipencap); + + if (err < 0) + return err; + } ip6_tnl_netlink_parms(data, &p); t = ip6_tnl_locate(net, &p, 0); @@ -1710,6 +1927,14 @@ static size_t ip6_tnl_get_size(const struct net_device *dev) nla_total_size(4) + /* IFLA_IPTUN_PROTO */ nla_total_size(1) + + /* IFLA_IPTUN_ENCAP_TYPE */ + nla_total_size(2) + + /* IFLA_IPTUN_ENCAP_FLAGS */ + nla_total_size(2) + + /* IFLA_IPTUN_ENCAP_SPORT */ + nla_total_size(2) + + /* IFLA_IPTUN_ENCAP_DPORT */ + nla_total_size(2) + 0; } @@ -1727,6 +1952,17 @@ static int ip6_tnl_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u32(skb, IFLA_IPTUN_FLAGS, parm->flags) || nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->proto)) goto nla_put_failure; + + if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, + tunnel->encap.type) || + nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT, + tunnel->encap.sport) || + nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT, + tunnel->encap.dport) || + nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, + tunnel->encap.flags)) + goto nla_put_failure; + return 0; nla_put_failure: @@ -1750,6 +1986,10 @@ static const struct nla_policy ip6_tnl_policy[IFLA_IPTUN_MAX + 1] = { [IFLA_IPTUN_FLOWINFO] = { .type = NLA_U32 }, [IFLA_IPTUN_FLAGS] = { .type = NLA_U32 }, [IFLA_IPTUN_PROTO] = { .type = NLA_U8 }, + [IFLA_IPTUN_ENCAP_TYPE] = { .type = NLA_U16 }, + [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 }, + [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 }, + [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 }, }; static struct rtnl_link_ops ip6_link_ops __read_mostly = { diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index e207cb246..487ef3bc7 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1985,10 +1985,10 @@ int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) static inline int ip6mr_forward2_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { - IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_OUTFORWDATAGRAMS); - IP6_ADD_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_OUTOCTETS, skb->len); + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_OUTFORWDATAGRAMS); + __IP6_ADD_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_OUTOCTETS, skb->len); return dst_output(net, sk, skb); } @@ -2269,7 +2269,7 @@ static int __ip6mr_fill_mroute(struct mr6_table *mrt, struct sk_buff *skb, mfcs.mfcs_packets = c->mfc_un.res.pkt; mfcs.mfcs_bytes = c->mfc_un.res.bytes; mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if; - if (nla_put(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs) < 0) + if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) < 0) return -EMSGSIZE; rtm->rtm_type = RTN_MULTICAST; @@ -2412,7 +2412,7 @@ static int mr6_msgsize(bool unresolved, int maxvif) + nla_total_size(0) /* RTA_MULTIPATH */ + maxvif * NLA_ALIGN(sizeof(struct rtnexthop)) /* RTA_MFC_STATS */ - + nla_total_size(sizeof(struct rta_mfc_stats)) + + nla_total_size_64bit(sizeof(struct rta_mfc_stats)) ; return len; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 4449ad1f8..a9895e15e 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -407,7 +407,8 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, if (optname != IPV6_RTHDR && !ns_capable(net->user_ns, CAP_NET_RAW)) break; - opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk)); + opt = rcu_dereference_protected(np->opt, + lockdep_sock_is_held(sk)); opt = ipv6_renew_options(sk, opt, optname, (struct ipv6_opt_hdr __user *)optval, optlen); @@ -471,7 +472,8 @@ sticky_done: struct ipv6_txoptions *opt = NULL; struct msghdr msg; struct flowi6 fl6; - int junk; + struct sockcm_cookie sockc_junk; + struct ipcm6_cookie ipc6; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_oif = sk->sk_bound_dev_if; @@ -501,9 +503,9 @@ sticky_done: msg.msg_controllen = optlen; msg.msg_control = (void *)(opt+1); + ipc6.opt = opt; - retv = ip6_datagram_send_ctl(net, sk, &msg, &fl6, opt, &junk, - &junk, &junk); + retv = ip6_datagram_send_ctl(net, sk, &msg, &fl6, &ipc6, &sockc_junk); if (retv) goto done; update: @@ -1123,7 +1125,8 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, struct ipv6_txoptions *opt; lock_sock(sk); - opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk)); + opt = rcu_dereference_protected(np->opt, + lockdep_sock_is_held(sk)); len = ipv6_getsockopt_sticky(sk, opt, optname, optval, len); release_sock(sk); /* check if ipv6_getsockopt_sticky() returns err code */ diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 9021b4355..63e06c3dd 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -39,34 +39,12 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("IPv6 packet filter"); -/*#define DEBUG_IP_FIREWALL*/ -/*#define DEBUG_ALLOW_ALL*/ /* Useful for remote debugging */ -/*#define DEBUG_IP_FIREWALL_USER*/ - -#ifdef DEBUG_IP_FIREWALL -#define dprintf(format, args...) pr_info(format , ## args) -#else -#define dprintf(format, args...) -#endif - -#ifdef DEBUG_IP_FIREWALL_USER -#define duprintf(format, args...) pr_info(format , ## args) -#else -#define duprintf(format, args...) -#endif - #ifdef CONFIG_NETFILTER_DEBUG #define IP_NF_ASSERT(x) WARN_ON(!(x)) #else #define IP_NF_ASSERT(x) #endif -#if 0 -/* All the better to debug you with... */ -#define static -#define inline -#endif - void *ip6t_alloc_initial_table(const struct xt_table *info) { return xt_alloc_initial_table(ip6t, IP6T); @@ -100,35 +78,18 @@ ip6_packet_match(const struct sk_buff *skb, if (FWINV(ipv6_masked_addr_cmp(&ipv6->saddr, &ip6info->smsk, &ip6info->src), IP6T_INV_SRCIP) || FWINV(ipv6_masked_addr_cmp(&ipv6->daddr, &ip6info->dmsk, - &ip6info->dst), IP6T_INV_DSTIP)) { - dprintf("Source or dest mismatch.\n"); -/* - dprintf("SRC: %u. Mask: %u. Target: %u.%s\n", ip->saddr, - ipinfo->smsk.s_addr, ipinfo->src.s_addr, - ipinfo->invflags & IP6T_INV_SRCIP ? " (INV)" : ""); - dprintf("DST: %u. Mask: %u. Target: %u.%s\n", ip->daddr, - ipinfo->dmsk.s_addr, ipinfo->dst.s_addr, - ipinfo->invflags & IP6T_INV_DSTIP ? " (INV)" : "");*/ + &ip6info->dst), IP6T_INV_DSTIP)) return false; - } ret = ifname_compare_aligned(indev, ip6info->iniface, ip6info->iniface_mask); - if (FWINV(ret != 0, IP6T_INV_VIA_IN)) { - dprintf("VIA in mismatch (%s vs %s).%s\n", - indev, ip6info->iniface, - ip6info->invflags & IP6T_INV_VIA_IN ? " (INV)" : ""); + if (FWINV(ret != 0, IP6T_INV_VIA_IN)) return false; - } ret = ifname_compare_aligned(outdev, ip6info->outiface, ip6info->outiface_mask); - if (FWINV(ret != 0, IP6T_INV_VIA_OUT)) { - dprintf("VIA out mismatch (%s vs %s).%s\n", - outdev, ip6info->outiface, - ip6info->invflags & IP6T_INV_VIA_OUT ? " (INV)" : ""); + if (FWINV(ret != 0, IP6T_INV_VIA_OUT)) return false; - } /* ... might want to do something with class and flowlabel here ... */ @@ -145,11 +106,6 @@ ip6_packet_match(const struct sk_buff *skb, } *fragoff = _frag_off; - dprintf("Packet protocol %hi ?= %s%hi.\n", - protohdr, - ip6info->invflags & IP6T_INV_PROTO ? "!":"", - ip6info->proto); - if (ip6info->proto == protohdr) { if (ip6info->invflags & IP6T_INV_PROTO) return false; @@ -169,16 +125,11 @@ ip6_packet_match(const struct sk_buff *skb, static bool ip6_checkentry(const struct ip6t_ip6 *ipv6) { - if (ipv6->flags & ~IP6T_F_MASK) { - duprintf("Unknown flag bits set: %08X\n", - ipv6->flags & ~IP6T_F_MASK); + if (ipv6->flags & ~IP6T_F_MASK) return false; - } - if (ipv6->invflags & ~IP6T_INV_MASK) { - duprintf("Unknown invflag bits set: %08X\n", - ipv6->invflags & ~IP6T_INV_MASK); + if (ipv6->invflags & ~IP6T_INV_MASK) return false; - } + return true; } @@ -446,13 +397,9 @@ ip6t_do_table(struct sk_buff *skb, xt_write_recseq_end(addend); local_bh_enable(); -#ifdef DEBUG_ALLOW_ALL - return NF_ACCEPT; -#else if (acpar.hotdrop) return NF_DROP; else return verdict; -#endif } static bool find_jump_target(const struct xt_table_info *t, @@ -492,11 +439,9 @@ mark_source_chains(const struct xt_table_info *newinfo, = (void *)ip6t_get_target_c(e); int visited = e->comefrom & (1 << hook); - if (e->comefrom & (1 << NF_INET_NUMHOOKS)) { - pr_err("iptables: loop hook %u pos %u %08X.\n", - hook, pos, e->comefrom); + if (e->comefrom & (1 << NF_INET_NUMHOOKS)) return 0; - } + e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS)); /* Unconditional return/END. */ @@ -508,26 +453,13 @@ mark_source_chains(const struct xt_table_info *newinfo, if ((strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0) && - t->verdict < -NF_MAX_VERDICT - 1) { - duprintf("mark_source_chains: bad " - "negative verdict (%i)\n", - t->verdict); + t->verdict < -NF_MAX_VERDICT - 1) return 0; - } /* Return: backtrack through the last big jump. */ do { e->comefrom ^= (1<<NF_INET_NUMHOOKS); -#ifdef DEBUG_IP_FIREWALL_USER - if (e->comefrom - & (1 << NF_INET_NUMHOOKS)) { - duprintf("Back unset " - "on hook %u " - "rule %u\n", - hook, pos); - } -#endif oldpos = pos; pos = e->counters.pcnt; e->counters.pcnt = 0; @@ -554,16 +486,7 @@ mark_source_chains(const struct xt_table_info *newinfo, if (strcmp(t->target.u.user.name, XT_STANDARD_TARGET) == 0 && newpos >= 0) { - if (newpos > newinfo->size - - sizeof(struct ip6t_entry)) { - duprintf("mark_source_chains: " - "bad verdict (%i)\n", - newpos); - return 0; - } /* This a jump; chase it. */ - duprintf("Jump rule %u -> %u\n", - pos, newpos); e = (struct ip6t_entry *) (entry0 + newpos); if (!find_jump_target(newinfo, e)) @@ -580,8 +503,7 @@ mark_source_chains(const struct xt_table_info *newinfo, pos = newpos; } } -next: - duprintf("Finished chain %u\n", hook); +next: ; } return 1; } @@ -602,19 +524,12 @@ static void cleanup_match(struct xt_entry_match *m, struct net *net) static int check_match(struct xt_entry_match *m, struct xt_mtchk_param *par) { const struct ip6t_ip6 *ipv6 = par->entryinfo; - int ret; par->match = m->u.kernel.match; par->matchinfo = m->data; - ret = xt_check_match(par, m->u.match_size - sizeof(*m), - ipv6->proto, ipv6->invflags & IP6T_INV_PROTO); - if (ret < 0) { - duprintf("ip_tables: check failed for `%s'.\n", - par.match->name); - return ret; - } - return 0; + return xt_check_match(par, m->u.match_size - sizeof(*m), + ipv6->proto, ipv6->invflags & IP6T_INV_PROTO); } static int @@ -625,10 +540,9 @@ find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par) match = xt_request_find_match(NFPROTO_IPV6, m->u.user.name, m->u.user.revision); - if (IS_ERR(match)) { - duprintf("find_check_match: `%s' not found\n", m->u.user.name); + if (IS_ERR(match)) return PTR_ERR(match); - } + m->u.kernel.match = match; ret = check_match(m, par); @@ -653,17 +567,11 @@ static int check_target(struct ip6t_entry *e, struct net *net, const char *name) .hook_mask = e->comefrom, .family = NFPROTO_IPV6, }; - int ret; t = ip6t_get_target(e); - ret = xt_check_target(&par, t->u.target_size - sizeof(*t), - e->ipv6.proto, e->ipv6.invflags & IP6T_INV_PROTO); - if (ret < 0) { - duprintf("ip_tables: check failed for `%s'.\n", - t->u.kernel.target->name); - return ret; - } - return 0; + return xt_check_target(&par, t->u.target_size - sizeof(*t), + e->ipv6.proto, + e->ipv6.invflags & IP6T_INV_PROTO); } static int @@ -676,10 +584,12 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name, unsigned int j; struct xt_mtchk_param mtpar; struct xt_entry_match *ematch; + unsigned long pcnt; - e->counters.pcnt = xt_percpu_counter_alloc(); - if (IS_ERR_VALUE(e->counters.pcnt)) + pcnt = xt_percpu_counter_alloc(); + if (IS_ERR_VALUE(pcnt)) return -ENOMEM; + e->counters.pcnt = pcnt; j = 0; mtpar.net = net; @@ -698,7 +608,6 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name, target = xt_request_find_target(NFPROTO_IPV6, t->u.user.name, t->u.user.revision); if (IS_ERR(target)) { - duprintf("find_check_entry: `%s' not found\n", t->u.user.name); ret = PTR_ERR(target); goto cleanup_matches; } @@ -751,17 +660,12 @@ check_entry_size_and_hooks(struct ip6t_entry *e, if ((unsigned long)e % __alignof__(struct ip6t_entry) != 0 || (unsigned char *)e + sizeof(struct ip6t_entry) >= limit || - (unsigned char *)e + e->next_offset > limit) { - duprintf("Bad offset %p\n", e); + (unsigned char *)e + e->next_offset > limit) return -EINVAL; - } if (e->next_offset - < sizeof(struct ip6t_entry) + sizeof(struct xt_entry_target)) { - duprintf("checking: element %p size %u\n", - e, e->next_offset); + < sizeof(struct ip6t_entry) + sizeof(struct xt_entry_target)) return -EINVAL; - } if (!ip6_checkentry(&e->ipv6)) return -EINVAL; @@ -778,12 +682,9 @@ check_entry_size_and_hooks(struct ip6t_entry *e, if ((unsigned char *)e - base == hook_entries[h]) newinfo->hook_entry[h] = hook_entries[h]; if ((unsigned char *)e - base == underflows[h]) { - if (!check_underflow(e)) { - pr_debug("Underflows must be unconditional and " - "use the STANDARD target with " - "ACCEPT/DROP\n"); + if (!check_underflow(e)) return -EINVAL; - } + newinfo->underflow[h] = underflows[h]; } } @@ -835,7 +736,6 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, newinfo->underflow[i] = 0xFFFFFFFF; } - duprintf("translate_table: size %u\n", newinfo->size); i = 0; /* Walk through entries, checking offsets. */ xt_entry_foreach(iter, entry0, newinfo->size) { @@ -852,27 +752,18 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, ++newinfo->stacksize; } - if (i != repl->num_entries) { - duprintf("translate_table: %u not %u entries\n", - i, repl->num_entries); + if (i != repl->num_entries) return -EINVAL; - } /* Check hooks all assigned */ for (i = 0; i < NF_INET_NUMHOOKS; i++) { /* Only hooks which are valid */ if (!(repl->valid_hooks & (1 << i))) continue; - if (newinfo->hook_entry[i] == 0xFFFFFFFF) { - duprintf("Invalid hook entry %u %u\n", - i, repl->hook_entry[i]); + if (newinfo->hook_entry[i] == 0xFFFFFFFF) return -EINVAL; - } - if (newinfo->underflow[i] == 0xFFFFFFFF) { - duprintf("Invalid underflow %u %u\n", - i, repl->underflow[i]); + if (newinfo->underflow[i] == 0xFFFFFFFF) return -EINVAL; - } } if (!mark_source_chains(newinfo, repl->valid_hooks, entry0)) @@ -1100,11 +991,8 @@ static int get_info(struct net *net, void __user *user, struct xt_table *t; int ret; - if (*len != sizeof(struct ip6t_getinfo)) { - duprintf("length %u != %zu\n", *len, - sizeof(struct ip6t_getinfo)); + if (*len != sizeof(struct ip6t_getinfo)) return -EINVAL; - } if (copy_from_user(name, user, sizeof(name)) != 0) return -EFAULT; @@ -1162,31 +1050,24 @@ get_entries(struct net *net, struct ip6t_get_entries __user *uptr, struct ip6t_get_entries get; struct xt_table *t; - if (*len < sizeof(get)) { - duprintf("get_entries: %u < %zu\n", *len, sizeof(get)); + if (*len < sizeof(get)) return -EINVAL; - } if (copy_from_user(&get, uptr, sizeof(get)) != 0) return -EFAULT; - if (*len != sizeof(struct ip6t_get_entries) + get.size) { - duprintf("get_entries: %u != %zu\n", - *len, sizeof(get) + get.size); + if (*len != sizeof(struct ip6t_get_entries) + get.size) return -EINVAL; - } + get.name[sizeof(get.name) - 1] = '\0'; t = xt_find_table_lock(net, AF_INET6, get.name); if (!IS_ERR_OR_NULL(t)) { struct xt_table_info *private = t->private; - duprintf("t->private->number = %u\n", private->number); if (get.size == private->size) ret = copy_entries_to_user(private->size, t, uptr->entrytable); - else { - duprintf("get_entries: I've got %u not %u!\n", - private->size, get.size); + else ret = -EAGAIN; - } + module_put(t->me); xt_table_unlock(t); } else @@ -1222,8 +1103,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, /* You lied! */ if (valid_hooks != t->valid_hooks) { - duprintf("Valid hook crap: %08X vs %08X\n", - valid_hooks, t->valid_hooks); ret = -EINVAL; goto put_module; } @@ -1233,8 +1112,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, goto put_module; /* Update module usage count based on number of rules */ - duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n", - oldinfo->number, oldinfo->initial_entries, newinfo->number); if ((oldinfo->number > oldinfo->initial_entries) || (newinfo->number <= oldinfo->initial_entries)) module_put(t->me); @@ -1303,8 +1180,6 @@ do_replace(struct net *net, const void __user *user, unsigned int len) if (ret != 0) goto free_newinfo; - duprintf("ip_tables: Translated table\n"); - ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, tmp.num_counters, tmp.counters); if (ret) @@ -1429,11 +1304,9 @@ compat_find_calc_match(struct xt_entry_match *m, match = xt_request_find_match(NFPROTO_IPV6, m->u.user.name, m->u.user.revision); - if (IS_ERR(match)) { - duprintf("compat_check_calc_match: `%s' not found\n", - m->u.user.name); + if (IS_ERR(match)) return PTR_ERR(match); - } + m->u.kernel.match = match; *size += xt_compat_match_offset(match); return 0; @@ -1465,20 +1338,14 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e, unsigned int j; int ret, off; - duprintf("check_compat_entry_size_and_hooks %p\n", e); if ((unsigned long)e % __alignof__(struct compat_ip6t_entry) != 0 || (unsigned char *)e + sizeof(struct compat_ip6t_entry) >= limit || - (unsigned char *)e + e->next_offset > limit) { - duprintf("Bad offset %p, limit = %p\n", e, limit); + (unsigned char *)e + e->next_offset > limit) return -EINVAL; - } if (e->next_offset < sizeof(struct compat_ip6t_entry) + - sizeof(struct compat_xt_entry_target)) { - duprintf("checking: element %p size %u\n", - e, e->next_offset); + sizeof(struct compat_xt_entry_target)) return -EINVAL; - } if (!ip6_checkentry(&e->ipv6)) return -EINVAL; @@ -1502,8 +1369,6 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e, target = xt_request_find_target(NFPROTO_IPV6, t->u.user.name, t->u.user.revision); if (IS_ERR(target)) { - duprintf("check_compat_entry_size_and_hooks: `%s' not found\n", - t->u.user.name); ret = PTR_ERR(target); goto release_matches; } @@ -1582,7 +1447,6 @@ translate_compat_table(struct net *net, size = compatr->size; info->number = compatr->num_entries; - duprintf("translate_compat_table: size %u\n", info->size); j = 0; xt_compat_lock(AF_INET6); xt_compat_init_offsets(AF_INET6, compatr->num_entries); @@ -1597,11 +1461,8 @@ translate_compat_table(struct net *net, } ret = -EINVAL; - if (j != compatr->num_entries) { - duprintf("translate_compat_table: %u not %u entries\n", - j, compatr->num_entries); + if (j != compatr->num_entries) goto out_unlock; - } ret = -ENOMEM; newinfo = xt_alloc_table_info(size); @@ -1670,8 +1531,6 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) return -EFAULT; /* overflow check */ - if (tmp.size >= INT_MAX / num_possible_cpus()) - return -ENOMEM; if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters)) return -ENOMEM; if (tmp.num_counters == 0) @@ -1694,8 +1553,6 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) if (ret != 0) goto free_newinfo; - duprintf("compat_do_replace: Translated table\n"); - ret = __do_replace(net, tmp.name, tmp.valid_hooks, newinfo, tmp.num_counters, compat_ptr(tmp.counters)); if (ret) @@ -1729,7 +1586,6 @@ compat_do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, break; default: - duprintf("do_ip6t_set_ctl: unknown request %i\n", cmd); ret = -EINVAL; } @@ -1779,19 +1635,15 @@ compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr, struct compat_ip6t_get_entries get; struct xt_table *t; - if (*len < sizeof(get)) { - duprintf("compat_get_entries: %u < %zu\n", *len, sizeof(get)); + if (*len < sizeof(get)) return -EINVAL; - } if (copy_from_user(&get, uptr, sizeof(get)) != 0) return -EFAULT; - if (*len != sizeof(struct compat_ip6t_get_entries) + get.size) { - duprintf("compat_get_entries: %u != %zu\n", - *len, sizeof(get) + get.size); + if (*len != sizeof(struct compat_ip6t_get_entries) + get.size) return -EINVAL; - } + get.name[sizeof(get.name) - 1] = '\0'; xt_compat_lock(AF_INET6); @@ -1799,16 +1651,13 @@ compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr, if (!IS_ERR_OR_NULL(t)) { const struct xt_table_info *private = t->private; struct xt_table_info info; - duprintf("t->private->number = %u\n", private->number); ret = compat_table_info(private, &info); - if (!ret && get.size == info.size) { + if (!ret && get.size == info.size) ret = compat_copy_entries_to_user(private->size, t, uptr->entrytable); - } else if (!ret) { - duprintf("compat_get_entries: I've got %u not %u!\n", - private->size, get.size); + else if (!ret) ret = -EAGAIN; - } + xt_compat_flush_offsets(AF_INET6); module_put(t->me); xt_table_unlock(t); @@ -1861,7 +1710,6 @@ do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) break; default: - duprintf("do_ip6t_set_ctl: unknown request %i\n", cmd); ret = -EINVAL; } @@ -1913,7 +1761,6 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) } default: - duprintf("do_ip6t_get_ctl: unknown request %i\n", cmd); ret = -EINVAL; } @@ -2015,7 +1862,6 @@ icmp6_match(const struct sk_buff *skb, struct xt_action_param *par) /* We've been asked to examine this packet, and we * can't. Hence, no choice but to drop. */ - duprintf("Dropping evil ICMP tinygram.\n"); par->hotdrop = true; return false; } diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c index 3deed5860..06bed74cf 100644 --- a/net/ipv6/netfilter/ip6t_SYNPROXY.c +++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c @@ -20,15 +20,16 @@ #include <net/netfilter/nf_conntrack_synproxy.h> static struct ipv6hdr * -synproxy_build_ip(struct sk_buff *skb, const struct in6_addr *saddr, - const struct in6_addr *daddr) +synproxy_build_ip(struct net *net, struct sk_buff *skb, + const struct in6_addr *saddr, + const struct in6_addr *daddr) { struct ipv6hdr *iph; skb_reset_network_header(skb); iph = (struct ipv6hdr *)skb_put(skb, sizeof(*iph)); ip6_flow_hdr(iph, 0, 0); - iph->hop_limit = 64; //XXX + iph->hop_limit = net->ipv6.devconf_all->hop_limit; iph->nexthdr = IPPROTO_TCP; iph->saddr = *saddr; iph->daddr = *daddr; @@ -37,13 +38,12 @@ synproxy_build_ip(struct sk_buff *skb, const struct in6_addr *saddr, } static void -synproxy_send_tcp(const struct synproxy_net *snet, +synproxy_send_tcp(struct net *net, const struct sk_buff *skb, struct sk_buff *nskb, struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo, struct ipv6hdr *niph, struct tcphdr *nth, unsigned int tcp_hdr_size) { - struct net *net = nf_ct_net(snet->tmpl); struct dst_entry *dst; struct flowi6 fl6; @@ -60,7 +60,7 @@ synproxy_send_tcp(const struct synproxy_net *snet, fl6.fl6_dport = nth->dest; security_skb_classify_flow((struct sk_buff *)skb, flowi6_to_flowi(&fl6)); dst = ip6_route_output(net, NULL, &fl6); - if (dst == NULL || dst->error) { + if (dst->error) { dst_release(dst); goto free_nskb; } @@ -84,7 +84,7 @@ free_nskb: } static void -synproxy_send_client_synack(const struct synproxy_net *snet, +synproxy_send_client_synack(struct net *net, const struct sk_buff *skb, const struct tcphdr *th, const struct synproxy_options *opts) { @@ -103,7 +103,7 @@ synproxy_send_client_synack(const struct synproxy_net *snet, return; skb_reserve(nskb, MAX_TCP_HEADER); - niph = synproxy_build_ip(nskb, &iph->daddr, &iph->saddr); + niph = synproxy_build_ip(net, nskb, &iph->daddr, &iph->saddr); skb_reset_transport_header(nskb); nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); @@ -121,15 +121,16 @@ synproxy_send_client_synack(const struct synproxy_net *snet, synproxy_build_options(nth, opts); - synproxy_send_tcp(snet, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, + synproxy_send_tcp(net, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); } static void -synproxy_send_server_syn(const struct synproxy_net *snet, +synproxy_send_server_syn(struct net *net, const struct sk_buff *skb, const struct tcphdr *th, const struct synproxy_options *opts, u32 recv_seq) { + struct synproxy_net *snet = synproxy_pernet(net); struct sk_buff *nskb; struct ipv6hdr *iph, *niph; struct tcphdr *nth; @@ -144,7 +145,7 @@ synproxy_send_server_syn(const struct synproxy_net *snet, return; skb_reserve(nskb, MAX_TCP_HEADER); - niph = synproxy_build_ip(nskb, &iph->saddr, &iph->daddr); + niph = synproxy_build_ip(net, nskb, &iph->saddr, &iph->daddr); skb_reset_transport_header(nskb); nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); @@ -165,12 +166,12 @@ synproxy_send_server_syn(const struct synproxy_net *snet, synproxy_build_options(nth, opts); - synproxy_send_tcp(snet, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, + synproxy_send_tcp(net, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW, niph, nth, tcp_hdr_size); } static void -synproxy_send_server_ack(const struct synproxy_net *snet, +synproxy_send_server_ack(struct net *net, const struct ip_ct_tcp *state, const struct sk_buff *skb, const struct tcphdr *th, const struct synproxy_options *opts) @@ -189,7 +190,7 @@ synproxy_send_server_ack(const struct synproxy_net *snet, return; skb_reserve(nskb, MAX_TCP_HEADER); - niph = synproxy_build_ip(nskb, &iph->daddr, &iph->saddr); + niph = synproxy_build_ip(net, nskb, &iph->daddr, &iph->saddr); skb_reset_transport_header(nskb); nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); @@ -205,11 +206,11 @@ synproxy_send_server_ack(const struct synproxy_net *snet, synproxy_build_options(nth, opts); - synproxy_send_tcp(snet, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); + synproxy_send_tcp(net, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); } static void -synproxy_send_client_ack(const struct synproxy_net *snet, +synproxy_send_client_ack(struct net *net, const struct sk_buff *skb, const struct tcphdr *th, const struct synproxy_options *opts) { @@ -227,7 +228,7 @@ synproxy_send_client_ack(const struct synproxy_net *snet, return; skb_reserve(nskb, MAX_TCP_HEADER); - niph = synproxy_build_ip(nskb, &iph->saddr, &iph->daddr); + niph = synproxy_build_ip(net, nskb, &iph->saddr, &iph->daddr); skb_reset_transport_header(nskb); nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size); @@ -243,15 +244,16 @@ synproxy_send_client_ack(const struct synproxy_net *snet, synproxy_build_options(nth, opts); - synproxy_send_tcp(snet, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, + synproxy_send_tcp(net, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size); } static bool -synproxy_recv_client_ack(const struct synproxy_net *snet, +synproxy_recv_client_ack(struct net *net, const struct sk_buff *skb, const struct tcphdr *th, struct synproxy_options *opts, u32 recv_seq) { + struct synproxy_net *snet = synproxy_pernet(net); int mss; mss = __cookie_v6_check(ipv6_hdr(skb), th, ntohl(th->ack_seq) - 1); @@ -267,7 +269,7 @@ synproxy_recv_client_ack(const struct synproxy_net *snet, if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP) synproxy_check_timestamp_cookie(opts); - synproxy_send_server_syn(snet, skb, th, opts, recv_seq); + synproxy_send_server_syn(net, skb, th, opts, recv_seq); return true; } @@ -275,7 +277,8 @@ static unsigned int synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_synproxy_info *info = par->targinfo; - struct synproxy_net *snet = synproxy_pernet(par->net); + struct net *net = par->net; + struct synproxy_net *snet = synproxy_pernet(net); struct synproxy_options opts = {}; struct tcphdr *th, _th; @@ -304,12 +307,12 @@ synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par) XT_SYNPROXY_OPT_SACK_PERM | XT_SYNPROXY_OPT_ECN); - synproxy_send_client_synack(snet, skb, th, &opts); + synproxy_send_client_synack(net, skb, th, &opts); return NF_DROP; } else if (th->ack && !(th->fin || th->rst || th->syn)) { /* ACK from client */ - synproxy_recv_client_ack(snet, skb, th, &opts, ntohl(th->seq)); + synproxy_recv_client_ack(net, skb, th, &opts, ntohl(th->seq)); return NF_DROP; } @@ -320,7 +323,8 @@ static unsigned int ipv6_synproxy_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *nhs) { - struct synproxy_net *snet = synproxy_pernet(nhs->net); + struct net *net = nhs->net; + struct synproxy_net *snet = synproxy_pernet(net); enum ip_conntrack_info ctinfo; struct nf_conn *ct; struct nf_conn_synproxy *synproxy; @@ -384,7 +388,7 @@ static unsigned int ipv6_synproxy_hook(void *priv, * therefore we need to add 1 to make the SYN sequence * number match the one of first SYN. */ - if (synproxy_recv_client_ack(snet, skb, th, &opts, + if (synproxy_recv_client_ack(net, skb, th, &opts, ntohl(th->seq) + 1)) this_cpu_inc(snet->stats->cookie_retrans); @@ -410,12 +414,12 @@ static unsigned int ipv6_synproxy_hook(void *priv, XT_SYNPROXY_OPT_SACK_PERM); swap(opts.tsval, opts.tsecr); - synproxy_send_server_ack(snet, state, skb, th, &opts); + synproxy_send_server_ack(net, state, skb, th, &opts); nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq)); swap(opts.tsval, opts.tsecr); - synproxy_send_client_ack(snet, skb, th, &opts); + synproxy_send_client_ack(net, skb, th, &opts); consume_skb(skb); return NF_STOLEN; diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c index 6989c70ae..4a84b5ad9 100644 --- a/net/ipv6/netfilter/nf_dup_ipv6.c +++ b/net/ipv6/netfilter/nf_dup_ipv6.c @@ -33,6 +33,7 @@ static bool nf_dup_ipv6_route(struct net *net, struct sk_buff *skb, fl6.daddr = *gw; fl6.flowlabel = (__force __be32)(((iph->flow_lbl[0] & 0xF) << 16) | (iph->flow_lbl[1] << 8) | iph->flow_lbl[2]); + fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH; dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { dst_release(dst); diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index 4709f657b..a5400223f 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -158,7 +158,7 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook) fl6.fl6_dport = otcph->source; security_skb_classify_flow(oldskb, flowi6_to_flowi(&fl6)); dst = ip6_route_output(net, NULL, &fl6); - if (dst == NULL || dst->error) { + if (dst->error) { dst_release(dst); return; } diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index c382db7a2..3ee3e444a 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -58,10 +58,11 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int iif = 0; struct flowi6 fl6; int err; - int hlimit; struct dst_entry *dst; struct rt6_info *rt; struct pingfakehdr pfh; + struct sockcm_cookie junk = {0}; + struct ipcm6_cookie ipc6; pr_debug("ping_v6_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num); @@ -138,13 +139,15 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) pfh.wcheck = 0; pfh.family = AF_INET6; - hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); + ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); + ipc6.tclass = np->tclass; + ipc6.dontfrag = np->dontfrag; + ipc6.opt = NULL; lock_sock(sk); err = ip6_append_data(sk, ping_getfrag, &pfh, len, - 0, hlimit, - np->tclass, NULL, &fl6, rt, - MSG_DONTWAIT, np->dontfrag); + 0, &ipc6, &fl6, rt, + MSG_DONTWAIT, &junk); if (err) { ICMP6_INC_STATS(sock_net(sk), rt->rt6i_idev, diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index fa59dd7a4..896350df6 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -745,10 +745,9 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct dst_entry *dst = NULL; struct raw6_frag_vec rfv; struct flowi6 fl6; + struct sockcm_cookie sockc; + struct ipcm6_cookie ipc6; int addr_len = msg->msg_namelen; - int hlimit = -1; - int tclass = -1; - int dontfrag = -1; u16 proto; int err; @@ -769,6 +768,11 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowi6_mark = sk->sk_mark; + ipc6.hlimit = -1; + ipc6.tclass = -1; + ipc6.dontfrag = -1; + ipc6.opt = NULL; + if (sin6) { if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; @@ -821,13 +825,14 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (fl6.flowi6_oif == 0) fl6.flowi6_oif = sk->sk_bound_dev_if; + sockc.tsflags = sk->sk_tsflags; if (msg->msg_controllen) { opt = &opt_space; memset(opt, 0, sizeof(struct ipv6_txoptions)); opt->tot_len = sizeof(struct ipv6_txoptions); + ipc6.opt = opt; - err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, opt, - &hlimit, &tclass, &dontfrag); + err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6, &sockc); if (err < 0) { fl6_sock_release(flowlabel); return err; @@ -843,7 +848,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (!opt) { opt = txopt_get(np); opt_to_free = opt; - } + } if (flowlabel) opt = fl6_merge_options(&opt_space, flowlabel, opt); opt = ipv6_fixup_options(&opt_space, opt); @@ -878,14 +883,14 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) err = PTR_ERR(dst); goto out; } - if (hlimit < 0) - hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); + if (ipc6.hlimit < 0) + ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); - if (tclass < 0) - tclass = np->tclass; + if (ipc6.tclass < 0) + ipc6.tclass = np->tclass; - if (dontfrag < 0) - dontfrag = np->dontfrag; + if (ipc6.dontfrag < 0) + ipc6.dontfrag = np->dontfrag; if (msg->msg_flags&MSG_CONFIRM) goto do_confirm; @@ -894,10 +899,11 @@ back_from_confirm: if (inet->hdrincl) err = rawv6_send_hdrinc(sk, msg, len, &fl6, &dst, msg->msg_flags); else { + ipc6.opt = opt; lock_sock(sk); err = ip6_append_data(sk, raw6_getfrag, &rfv, - len, 0, hlimit, tclass, opt, &fl6, (struct rt6_info *)dst, - msg->msg_flags, dontfrag); + len, 0, &ipc6, &fl6, (struct rt6_info *)dst, + msg->msg_flags, &sockc); if (err) ip6_flush_pending_frames(sk); diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index e2ea31175..2160d5d00 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -145,12 +145,12 @@ void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq, if (!dev) goto out_rcu_unlock; - IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); + __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); if (inet_frag_evicting(&fq->q)) goto out_rcu_unlock; - IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT); + __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT); /* Don't send error if the first segment did not arrive. */ if (!(fq->q.flags & INET_FRAG_FIRST_IN) || !fq->q.fragments) @@ -223,8 +223,8 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, ((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1))); if ((unsigned int)end > IPV6_MAXPLEN) { - IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, ((u8 *)&fhdr->frag_off - skb_network_header(skb))); @@ -258,8 +258,8 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, /* RFC2460 says always send parameter problem in * this case. -DaveM */ - IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, offsetof(struct ipv6hdr, payload_len)); return -1; @@ -361,8 +361,8 @@ found: discard_fq: inet_frag_kill(&fq->q, &ip6_frags); err: - IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_REASMFAILS); + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_REASMFAILS); kfree_skb(skb); return -1; } @@ -500,7 +500,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, skb_network_header_len(head)); rcu_read_lock(); - IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS); + __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS); rcu_read_unlock(); fq->q.fragments = NULL; fq->q.fragments_tail = NULL; @@ -513,7 +513,7 @@ out_oom: net_dbg_ratelimited("ip6_frag_reasm: no memory for reassembly\n"); out_fail: rcu_read_lock(); - IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); + __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); rcu_read_unlock(); return -1; } @@ -528,7 +528,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED) goto fail_hdr; - IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMREQDS); + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMREQDS); /* Jumbo payload inhibits frag. header */ if (hdr->payload_len == 0) @@ -544,8 +544,8 @@ static int ipv6_frag_rcv(struct sk_buff *skb) if (!(fhdr->frag_off & htons(0xFFF9))) { /* It is not a fragmented frame */ skb->transport_header += sizeof(struct frag_hdr); - IP6_INC_STATS_BH(net, - ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMOKS); + __IP6_INC_STATS(net, + ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMOKS); IP6CB(skb)->nhoff = (u8 *)fhdr - skb_network_header(skb); IP6CB(skb)->flags |= IP6SKB_FRAGMENTED; @@ -566,13 +566,13 @@ static int ipv6_frag_rcv(struct sk_buff *skb) return ret; } - IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMFAILS); + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMFAILS); kfree_skb(skb); return -1; fail_hdr: - IP6_INC_STATS_BH(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INHDRERRORS); + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INHDRERRORS); icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb_network_header_len(skb)); return -1; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 6f32944e0..520b7884d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1190,7 +1190,7 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, struct dst_entry *dst; bool any_src; - dst = l3mdev_rt6_dst_by_oif(net, fl6); + dst = l3mdev_get_rt6_dst(net, fl6); if (dst) return dst; @@ -1771,6 +1771,37 @@ static int ip6_convert_metrics(struct mx6_config *mxc, return -EINVAL; } +static struct rt6_info *ip6_nh_lookup_table(struct net *net, + struct fib6_config *cfg, + const struct in6_addr *gw_addr) +{ + struct flowi6 fl6 = { + .flowi6_oif = cfg->fc_ifindex, + .daddr = *gw_addr, + .saddr = cfg->fc_prefsrc, + }; + struct fib6_table *table; + struct rt6_info *rt; + int flags = RT6_LOOKUP_F_IFACE; + + table = fib6_get_table(net, cfg->fc_table); + if (!table) + return NULL; + + if (!ipv6_addr_any(&cfg->fc_prefsrc)) + flags |= RT6_LOOKUP_F_HAS_SADDR; + + rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags); + + /* if table lookup failed, fall back to full lookup */ + if (rt == net->ipv6.ip6_null_entry) { + ip6_rt_put(rt); + rt = NULL; + } + + return rt; +} + static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg) { struct net *net = cfg->fc_nlinfo.nl_net; @@ -1942,7 +1973,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg) rt->rt6i_gateway = *gw_addr; if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) { - struct rt6_info *grt; + struct rt6_info *grt = NULL; /* IPv6 strictly inhibits using not link-local addresses as nexthop address. @@ -1954,7 +1985,12 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg) if (!(gwa_type & IPV6_ADDR_UNICAST)) goto out; - grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1); + if (cfg->fc_table) + grt = ip6_nh_lookup_table(net, cfg, gw_addr); + + if (!grt) + grt = rt6_lookup(net, gw_addr, NULL, + cfg->fc_ifindex, 1); err = -EHOSTUNREACH; if (!grt) diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 6c53e4eb0..0619ac708 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -913,10 +913,9 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, goto tx_error; } - skb = iptunnel_handle_offloads(skb, SKB_GSO_SIT); - if (IS_ERR(skb)) { + if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4)) { ip_rt_put(rt); - goto out; + goto tx_error; } if (df) { @@ -992,7 +991,6 @@ tx_error_icmp: dst_link_failure(skb); tx_error: kfree_skb(skb); -out: dev->stats.tx_errors++; return NETDEV_TX_OK; } @@ -1002,15 +1000,15 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *tiph = &tunnel->parms.iph; - skb = iptunnel_handle_offloads(skb, SKB_GSO_IPIP); - if (IS_ERR(skb)) - goto out; + if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4)) + goto tx_error; skb_set_inner_ipproto(skb, IPPROTO_IPIP); ip_tunnel_xmit(skb, dev, tiph, IPPROTO_IPIP); return NETDEV_TX_OK; -out: +tx_error: + kfree_skb(skb); dev->stats.tx_errors++; return NETDEV_TX_OK; } diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index aab91fa86..59c483937 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -155,11 +155,11 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) mss = __cookie_v6_check(ipv6_hdr(skb), th, cookie); if (mss == 0) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED); goto out; } - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESRECV); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_SYNCOOKIESRECV); /* check for timestamp cookie support */ memset(&tcp_opt, 0, sizeof(tcp_opt)); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 07c456fe0..90ea274a7 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -235,7 +235,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, fl6.fl6_dport = usin->sin6_port; fl6.fl6_sport = inet->inet_sport; - opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk)); + opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); final_p = fl6_update_dst(&fl6, opt, &final); security_sk_classify_flow(sk, flowi6_to_flowi(&fl6)); @@ -352,8 +352,8 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, skb->dev->ifindex); if (!sk) { - ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev), - ICMP6_MIB_INERRORS); + __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), + ICMP6_MIB_INERRORS); return; } @@ -368,13 +368,13 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, bh_lock_sock(sk); if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG) - NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); + __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); if (sk->sk_state == TCP_CLOSE) goto out; if (ipv6_hdr(skb)->hop_limit < inet6_sk(sk)->min_hopcount) { - NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); + __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); goto out; } @@ -384,7 +384,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; if (sk->sk_state != TCP_LISTEN && !between(seq, snd_una, tp->snd_nxt)) { - NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); + __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); goto out; } @@ -455,7 +455,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, struct tcp_fastopen_cookie *foc, - bool attach_req) + enum tcp_synack_type synack_type) { struct inet_request_sock *ireq = inet_rsk(req); struct ipv6_pinfo *np = inet6_sk(sk); @@ -468,7 +468,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, IPPROTO_TCP)) == NULL) goto done; - skb = tcp_make_synack(sk, dst, req, foc, attach_req); + skb = tcp_make_synack(sk, dst, req, foc, synack_type); if (skb) { __tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr, @@ -665,12 +665,12 @@ static bool tcp_v6_inbound_md5_hash(const struct sock *sk, return false; if (hash_expected && !hash_location) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND); return true; } if (!hash_expected && hash_location) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); return true; } @@ -754,7 +754,7 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, struct tcp_md5sig_key *key, int rst, - u8 tclass, u32 label) + u8 tclass, __be32 label) { const struct tcphdr *th = tcp_hdr(skb); struct tcphdr *t1; @@ -846,9 +846,9 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 if (!IS_ERR(dst)) { skb_dst_set(buff, dst); ip6_xmit(ctl_sk, buff, &fl6, NULL, tclass); - TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); + TCP_INC_STATS(net, TCP_MIB_OUTSEGS); if (rst) - TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS); + TCP_INC_STATS(net, TCP_MIB_OUTRSTS); return; } @@ -879,6 +879,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) return; #ifdef CONFIG_TCP_MD5SIG + rcu_read_lock(); hash_location = tcp_parse_md5sig_option(th); if (sk && sk_fullsock(sk)) { key = tcp_v6_md5_do_lookup(sk, &ipv6h->saddr); @@ -896,16 +897,15 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) th->source, &ipv6h->daddr, ntohs(th->source), tcp_v6_iif(skb)); if (!sk1) - return; + goto out; - rcu_read_lock(); key = tcp_v6_md5_do_lookup(sk1, &ipv6h->saddr); if (!key) - goto release_sk1; + goto out; genhash = tcp_v6_md5_hash_skb(newhash, key, NULL, skb); if (genhash || memcmp(hash_location, newhash, 16) != 0) - goto release_sk1; + goto out; } #endif @@ -919,18 +919,15 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, 0); #ifdef CONFIG_TCP_MD5SIG -release_sk1: - if (sk1) { - rcu_read_unlock(); - sock_put(sk1); - } +out: + rcu_read_unlock(); #endif } static void tcp_v6_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, struct tcp_md5sig_key *key, u8 tclass, - u32 label) + __be32 label) { tcp_v6_send_response(sk, skb, seq, ack, win, tsval, tsecr, oif, key, 0, tclass, label); @@ -988,7 +985,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) &tcp_request_sock_ipv6_ops, sk, skb); drop: - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); + tcp_listendrop(sk); return 0; /* don't send reset */ } @@ -1189,11 +1186,11 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * return newsk; out_overflow: - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); + __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); out_nonewsk: dst_release(dst); out: - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); + tcp_listendrop(sk); return NULL; } @@ -1308,8 +1305,8 @@ discard: kfree_skb(skb); return 0; csum_err: - TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS); - TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); goto discard; @@ -1380,6 +1377,7 @@ static int tcp_v6_rcv(struct sk_buff *skb) { const struct tcphdr *th; const struct ipv6hdr *hdr; + bool refcounted; struct sock *sk; int ret; struct net *net = dev_net(skb->dev); @@ -1390,14 +1388,14 @@ static int tcp_v6_rcv(struct sk_buff *skb) /* * Count it even if it's bad. */ - TCP_INC_STATS_BH(net, TCP_MIB_INSEGS); + __TCP_INC_STATS(net, TCP_MIB_INSEGS); if (!pskb_may_pull(skb, sizeof(struct tcphdr))) goto discard_it; - th = tcp_hdr(skb); + th = (const struct tcphdr *)skb->data; - if (th->doff < sizeof(struct tcphdr)/4) + if (unlikely(th->doff < sizeof(struct tcphdr)/4)) goto bad_packet; if (!pskb_may_pull(skb, th->doff*4)) goto discard_it; @@ -1405,12 +1403,13 @@ static int tcp_v6_rcv(struct sk_buff *skb) if (skb_checksum_init(skb, IPPROTO_TCP, ip6_compute_pseudo)) goto csum_error; - th = tcp_hdr(skb); + th = (const struct tcphdr *)skb->data; hdr = ipv6_hdr(skb); lookup: sk = __inet6_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), - th->source, th->dest, inet6_iif(skb)); + th->source, th->dest, inet6_iif(skb), + &refcounted); if (!sk) goto no_tcp_socket; @@ -1433,6 +1432,7 @@ process: goto lookup; } sock_hold(sk); + refcounted = true; nsk = tcp_check_req(sk, skb, req, false); if (!nsk) { reqsk_put(req); @@ -1450,7 +1450,7 @@ process: } } if (hdr->hop_limit < inet6_sk(sk)->min_hopcount) { - NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); + __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); goto discard_and_relse; } @@ -1483,13 +1483,14 @@ process: } else if (unlikely(sk_add_backlog(sk, skb, sk->sk_rcvbuf + sk->sk_sndbuf))) { bh_unlock_sock(sk); - NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP); + __NET_INC_STATS(net, LINUX_MIB_TCPBACKLOGDROP); goto discard_and_relse; } bh_unlock_sock(sk); put_and_return: - sock_put(sk); + if (refcounted) + sock_put(sk); return ret ? -1 : 0; no_tcp_socket: @@ -1500,9 +1501,9 @@ no_tcp_socket: if (tcp_checksum_complete(skb)) { csum_error: - TCP_INC_STATS_BH(net, TCP_MIB_CSUMERRORS); + __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); bad_packet: - TCP_INC_STATS_BH(net, TCP_MIB_INERRS); + __TCP_INC_STATS(net, TCP_MIB_INERRS); } else { tcp_v6_send_reset(NULL, skb); } @@ -1512,7 +1513,9 @@ discard_it: return 0; discard_and_relse: - sock_put(sk); + sk_drops_add(sk, skb); + if (refcounted) + sock_put(sk); goto discard_it; do_time_wait: @@ -1543,6 +1546,7 @@ do_time_wait: inet_twsk_deschedule_put(tw); sk = sk2; tcp_v6_restore_cb(skb); + refcounted = false; goto process; } /* Fall through to ACK */ diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index f96831d9d..42a2edf7c 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -115,11 +115,10 @@ static void udp_v6_rehash(struct sock *sk) udp_lib_rehash(sk, new_hash); } -static inline int compute_score(struct sock *sk, struct net *net, - unsigned short hnum, - const struct in6_addr *saddr, __be16 sport, - const struct in6_addr *daddr, __be16 dport, - int dif) +static int compute_score(struct sock *sk, struct net *net, + const struct in6_addr *saddr, __be16 sport, + const struct in6_addr *daddr, unsigned short hnum, + int dif) { int score; struct inet_sock *inet; @@ -162,88 +161,36 @@ static inline int compute_score(struct sock *sk, struct net *net, return score; } -static inline int compute_score2(struct sock *sk, struct net *net, - const struct in6_addr *saddr, __be16 sport, - const struct in6_addr *daddr, - unsigned short hnum, int dif) -{ - int score; - struct inet_sock *inet; - - if (!net_eq(sock_net(sk), net) || - udp_sk(sk)->udp_port_hash != hnum || - sk->sk_family != PF_INET6) - return -1; - - if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr)) - return -1; - - score = 0; - inet = inet_sk(sk); - - if (inet->inet_dport) { - if (inet->inet_dport != sport) - return -1; - score++; - } - - if (!ipv6_addr_any(&sk->sk_v6_daddr)) { - if (!ipv6_addr_equal(&sk->sk_v6_daddr, saddr)) - return -1; - score++; - } - - if (sk->sk_bound_dev_if) { - if (sk->sk_bound_dev_if != dif) - return -1; - score++; - } - - if (sk->sk_incoming_cpu == raw_smp_processor_id()) - score++; - - return score; -} - -/* called with read_rcu_lock() */ +/* called with rcu_read_lock() */ static struct sock *udp6_lib_lookup2(struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, unsigned int hnum, int dif, - struct udp_hslot *hslot2, unsigned int slot2, + struct udp_hslot *hslot2, struct sk_buff *skb) { struct sock *sk, *result; - struct hlist_nulls_node *node; int score, badness, matches = 0, reuseport = 0; - bool select_ok = true; u32 hash = 0; -begin: result = NULL; badness = -1; - udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) { - score = compute_score2(sk, net, saddr, sport, + udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { + score = compute_score(sk, net, saddr, sport, daddr, hnum, dif); if (score > badness) { - result = sk; - badness = score; reuseport = sk->sk_reuseport; if (reuseport) { hash = udp6_ehashfn(net, daddr, hnum, saddr, sport); - if (select_ok) { - struct sock *sk2; - sk2 = reuseport_select_sock(sk, hash, skb, + result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); - if (sk2) { - result = sk2; - select_ok = false; - goto found; - } - } + if (result) + return result; matches = 1; } + result = sk; + badness = score; } else if (score == badness && reuseport) { matches++; if (reciprocal_scale(hash, matches) == 0) @@ -251,27 +198,10 @@ begin: hash = next_pseudo_random32(hash); } } - /* - * if the nulls value we got at the end of this lookup is - * not the expected one, we must restart lookup. - * We probably met an item that was moved to another chain. - */ - if (get_nulls_value(node) != slot2) - goto begin; - - if (result) { -found: - if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) - result = NULL; - else if (unlikely(compute_score2(result, net, saddr, sport, - daddr, hnum, dif) < badness)) { - sock_put(result); - goto begin; - } - } return result; } +/* rcu_read_lock() must be held */ struct sock *__udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, @@ -279,15 +209,12 @@ struct sock *__udp6_lib_lookup(struct net *net, struct sk_buff *skb) { struct sock *sk, *result; - struct hlist_nulls_node *node; unsigned short hnum = ntohs(dport); unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; int score, badness, matches = 0, reuseport = 0; - bool select_ok = true; u32 hash = 0; - rcu_read_lock(); if (hslot->count > 10) { hash2 = udp6_portaddr_hash(net, daddr, hnum); slot2 = hash2 & udptable->mask; @@ -297,46 +224,43 @@ struct sock *__udp6_lib_lookup(struct net *net, result = udp6_lib_lookup2(net, saddr, sport, daddr, hnum, dif, - hslot2, slot2, skb); + hslot2, skb); if (!result) { + unsigned int old_slot2 = slot2; hash2 = udp6_portaddr_hash(net, &in6addr_any, hnum); slot2 = hash2 & udptable->mask; + /* avoid searching the same slot again. */ + if (unlikely(slot2 == old_slot2)) + return result; + hslot2 = &udptable->hash2[slot2]; if (hslot->count < hslot2->count) goto begin; result = udp6_lib_lookup2(net, saddr, sport, - &in6addr_any, hnum, dif, - hslot2, slot2, skb); + daddr, hnum, dif, + hslot2, skb); } - rcu_read_unlock(); return result; } begin: result = NULL; badness = -1; - sk_nulls_for_each_rcu(sk, node, &hslot->head) { - score = compute_score(sk, net, hnum, saddr, sport, daddr, dport, dif); + sk_for_each_rcu(sk, &hslot->head) { + score = compute_score(sk, net, saddr, sport, daddr, hnum, dif); if (score > badness) { - result = sk; - badness = score; reuseport = sk->sk_reuseport; if (reuseport) { hash = udp6_ehashfn(net, daddr, hnum, saddr, sport); - if (select_ok) { - struct sock *sk2; - - sk2 = reuseport_select_sock(sk, hash, skb, + result = reuseport_select_sock(sk, hash, skb, sizeof(struct udphdr)); - if (sk2) { - result = sk2; - select_ok = false; - goto found; - } - } + if (result) + return result; matches = 1; } + result = sk; + badness = score; } else if (score == badness && reuseport) { matches++; if (reciprocal_scale(hash, matches) == 0) @@ -344,25 +268,6 @@ begin: hash = next_pseudo_random32(hash); } } - /* - * if the nulls value we got at the end of this lookup is - * not the expected one, we must restart lookup. - * We probably met an item that was moved to another chain. - */ - if (get_nulls_value(node) != slot) - goto begin; - - if (result) { -found: - if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2))) - result = NULL; - else if (unlikely(compute_score(result, net, hnum, saddr, sport, - daddr, dport, dif) < badness)) { - sock_put(result); - goto begin; - } - } - rcu_read_unlock(); return result; } EXPORT_SYMBOL_GPL(__udp6_lib_lookup); @@ -371,23 +276,46 @@ static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb, __be16 sport, __be16 dport, struct udp_table *udptable) { - struct sock *sk; const struct ipv6hdr *iph = ipv6_hdr(skb); + struct sock *sk; sk = skb_steal_sock(skb); if (unlikely(sk)) return sk; - return __udp6_lib_lookup(dev_net(skb_dst(skb)->dev), &iph->saddr, sport, + return __udp6_lib_lookup(dev_net(skb->dev), &iph->saddr, sport, &iph->daddr, dport, inet6_iif(skb), udptable, skb); } +struct sock *udp6_lib_lookup_skb(struct sk_buff *skb, + __be16 sport, __be16 dport) +{ + const struct ipv6hdr *iph = ipv6_hdr(skb); + + return __udp6_lib_lookup(dev_net(skb->dev), &iph->saddr, sport, + &iph->daddr, dport, inet6_iif(skb), + &udp_table, skb); +} +EXPORT_SYMBOL_GPL(udp6_lib_lookup_skb); + +/* Must be called under rcu_read_lock(). + * Does increment socket refcount. + */ +#if IS_ENABLED(CONFIG_NETFILTER_XT_MATCH_SOCKET) || \ + IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY) struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, int dif) { - return __udp6_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table, NULL); + struct sock *sk; + + sk = __udp6_lib_lookup(net, saddr, sport, daddr, dport, + dif, &udp_table, NULL); + if (sk && !atomic_inc_not_zero(&sk->sk_refcnt)) + sk = NULL; + return sk; } EXPORT_SYMBOL_GPL(udp6_lib_lookup); +#endif /* * This should be easy, if there is something there we @@ -401,7 +329,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, struct inet_sock *inet = inet_sk(sk); struct sk_buff *skb; unsigned int ulen, copied; - int peeked, off = 0; + int peeked, peeking, off; int err; int is_udplite = IS_UDPLITE(sk); bool checksum_valid = false; @@ -415,15 +343,16 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, return ipv6_recv_rxpmtu(sk, msg, len, addr_len); try_again: + peeking = off = sk_peek_offset(sk, flags); skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0), &peeked, &off, &err); if (!skb) - goto out; + return err; - ulen = skb->len - sizeof(struct udphdr); + ulen = skb->len; copied = len; - if (copied > ulen) - copied = ulen; + if (copied > ulen - off) + copied = ulen - off; else if (copied < ulen) msg->msg_flags |= MSG_TRUNC; @@ -435,17 +364,16 @@ try_again: * coverage checksum (UDP-Lite), do it before the copy. */ - if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) { + if (copied < ulen || UDP_SKB_CB(skb)->partial_cov || peeking) { checksum_valid = !udp_lib_checksum_complete(skb); if (!checksum_valid) goto csum_copy_err; } if (checksum_valid || skb_csum_unnecessary(skb)) - err = skb_copy_datagram_msg(skb, sizeof(struct udphdr), - msg, copied); + err = skb_copy_datagram_msg(skb, off, msg, copied); else { - err = skb_copy_and_csum_datagram_msg(skb, sizeof(struct udphdr), msg); + err = skb_copy_and_csum_datagram_msg(skb, off, msg); if (err == -EINVAL) goto csum_copy_err; } @@ -454,23 +382,22 @@ try_again: if (!peeked) { atomic_inc(&sk->sk_drops); if (is_udp4) - UDP_INC_STATS_USER(sock_net(sk), - UDP_MIB_INERRORS, - is_udplite); + UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, + is_udplite); else - UDP6_INC_STATS_USER(sock_net(sk), - UDP_MIB_INERRORS, - is_udplite); + UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, + is_udplite); } - goto out_free; + skb_free_datagram_locked(sk, skb); + return err; } if (!peeked) { if (is_udp4) - UDP_INC_STATS_USER(sock_net(sk), - UDP_MIB_INDATAGRAMS, is_udplite); + UDP_INC_STATS(sock_net(sk), UDP_MIB_INDATAGRAMS, + is_udplite); else - UDP6_INC_STATS_USER(sock_net(sk), - UDP_MIB_INDATAGRAMS, is_udplite); + UDP6_INC_STATS(sock_net(sk), UDP_MIB_INDATAGRAMS, + is_udplite); } sock_recv_ts_and_drops(msg, sk, skb); @@ -510,24 +437,22 @@ try_again: if (flags & MSG_TRUNC) err = ulen; -out_free: - skb_free_datagram_locked(sk, skb); -out: + __skb_free_datagram_locked(sk, skb, peeking ? -err : err); return err; csum_copy_err: slow = lock_sock_fast(sk); if (!skb_kill_datagram(sk, skb, flags)) { if (is_udp4) { - UDP_INC_STATS_USER(sock_net(sk), - UDP_MIB_CSUMERRORS, is_udplite); - UDP_INC_STATS_USER(sock_net(sk), - UDP_MIB_INERRORS, is_udplite); + UDP_INC_STATS(sock_net(sk), + UDP_MIB_CSUMERRORS, is_udplite); + UDP_INC_STATS(sock_net(sk), + UDP_MIB_INERRORS, is_udplite); } else { - UDP6_INC_STATS_USER(sock_net(sk), - UDP_MIB_CSUMERRORS, is_udplite); - UDP6_INC_STATS_USER(sock_net(sk), - UDP_MIB_INERRORS, is_udplite); + UDP6_INC_STATS(sock_net(sk), + UDP_MIB_CSUMERRORS, is_udplite); + UDP6_INC_STATS(sock_net(sk), + UDP_MIB_INERRORS, is_udplite); } } unlock_sock_fast(sk, slow); @@ -555,8 +480,8 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source, inet6_iif(skb), udptable, skb); if (!sk) { - ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev), - ICMP6_MIB_INERRORS); + __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), + ICMP6_MIB_INERRORS); return; } @@ -585,7 +510,7 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, sk->sk_err = err; sk->sk_error_report(sk); out: - sock_put(sk); + return; } static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) @@ -598,15 +523,15 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) sk_incoming_cpu_update(sk); } - rc = sock_queue_rcv_skb(sk, skb); + rc = __sock_queue_rcv_skb(sk, skb); if (rc < 0) { int is_udplite = IS_UDPLITE(sk); /* Note that an ENOMEM error is charged twice */ if (rc == -ENOMEM) - UDP6_INC_STATS_BH(sock_net(sk), - UDP_MIB_RCVBUFERRORS, is_udplite); - UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite); + UDP6_INC_STATS(sock_net(sk), + UDP_MIB_RCVBUFERRORS, is_udplite); + UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); kfree_skb(skb); return -1; } @@ -662,9 +587,9 @@ int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) ret = encap_rcv(sk, skb); if (ret <= 0) { - UDP_INC_STATS_BH(sock_net(sk), - UDP_MIB_INDATAGRAMS, - is_udplite); + __UDP_INC_STATS(sock_net(sk), + UDP_MIB_INDATAGRAMS, + is_udplite); return -ret; } } @@ -689,14 +614,17 @@ int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) } } - if (rcu_access_pointer(sk->sk_filter)) { - if (udp_lib_checksum_complete(skb)) - goto csum_error; - } + if (rcu_access_pointer(sk->sk_filter) && + udp_lib_checksum_complete(skb)) + goto csum_error; + + if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr))) + goto drop; + udp_csum_pull_header(skb); if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) { - UDP6_INC_STATS_BH(sock_net(sk), - UDP_MIB_RCVBUFERRORS, is_udplite); + __UDP6_INC_STATS(sock_net(sk), + UDP_MIB_RCVBUFERRORS, is_udplite); goto drop; } @@ -715,9 +643,9 @@ int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) return rc; csum_error: - UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); + __UDP6_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); drop: - UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite); + __UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); atomic_inc(&sk->sk_drops); kfree_skb(skb); return -1; @@ -747,33 +675,6 @@ static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk, return true; } -static void flush_stack(struct sock **stack, unsigned int count, - struct sk_buff *skb, unsigned int final) -{ - struct sk_buff *skb1 = NULL; - struct sock *sk; - unsigned int i; - - for (i = 0; i < count; i++) { - sk = stack[i]; - if (likely(!skb1)) - skb1 = (i == final) ? skb : skb_clone(skb, GFP_ATOMIC); - if (!skb1) { - atomic_inc(&sk->sk_drops); - UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, - IS_UDPLITE(sk)); - UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, - IS_UDPLITE(sk)); - } - - if (skb1 && udpv6_queue_rcv_skb(sk, skb1) <= 0) - skb1 = NULL; - sock_put(sk); - } - if (unlikely(skb1)) - kfree_skb(skb1); -} - static void udp6_csum_zero_error(struct sk_buff *skb) { /* RFC 2460 section 8.1 says that we SHOULD log @@ -792,15 +693,15 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, const struct in6_addr *saddr, const struct in6_addr *daddr, struct udp_table *udptable, int proto) { - struct sock *sk, *stack[256 / sizeof(struct sock *)]; + struct sock *sk, *first = NULL; const struct udphdr *uh = udp_hdr(skb); - struct hlist_nulls_node *node; unsigned short hnum = ntohs(uh->dest); struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum); - int dif = inet6_iif(skb); - unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node); + unsigned int offset = offsetof(typeof(*sk), sk_node); unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10); - bool inner_flushed = false; + int dif = inet6_iif(skb); + struct hlist_node *node; + struct sk_buff *nskb; if (use_hash2) { hash2_any = udp6_portaddr_hash(net, &in6addr_any, hnum) & @@ -811,27 +712,32 @@ start_lookup: offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node); } - spin_lock(&hslot->lock); - sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) { - if (__udp_v6_is_mcast_sock(net, sk, - uh->dest, daddr, - uh->source, saddr, - dif, hnum) && - /* If zero checksum and no_check is not on for - * the socket then skip it. - */ - (uh->check || udp_sk(sk)->no_check6_rx)) { - if (unlikely(count == ARRAY_SIZE(stack))) { - flush_stack(stack, count, skb, ~0); - inner_flushed = true; - count = 0; - } - stack[count++] = sk; - sock_hold(sk); + sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) { + if (!__udp_v6_is_mcast_sock(net, sk, uh->dest, daddr, + uh->source, saddr, dif, hnum)) + continue; + /* If zero checksum and no_check is not on for + * the socket then skip it. + */ + if (!uh->check && !udp_sk(sk)->no_check6_rx) + continue; + if (!first) { + first = sk; + continue; + } + nskb = skb_clone(skb, GFP_ATOMIC); + if (unlikely(!nskb)) { + atomic_inc(&sk->sk_drops); + __UDP6_INC_STATS(net, UDP_MIB_RCVBUFERRORS, + IS_UDPLITE(sk)); + __UDP6_INC_STATS(net, UDP_MIB_INERRORS, + IS_UDPLITE(sk)); + continue; } - } - spin_unlock(&hslot->lock); + if (udpv6_queue_rcv_skb(sk, nskb) > 0) + consume_skb(nskb); + } /* Also lookup *:port if we are using hash2 and haven't done so yet. */ if (use_hash2 && hash2 != hash2_any) { @@ -839,13 +745,13 @@ start_lookup: goto start_lookup; } - if (count) { - flush_stack(stack, count, skb, count - 1); + if (first) { + if (udpv6_queue_rcv_skb(first, skb) > 0) + consume_skb(skb); } else { - if (!inner_flushed) - UDP6_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI, - proto == IPPROTO_UDPLITE); - consume_skb(skb); + kfree_skb(skb); + __UDP6_INC_STATS(net, UDP_MIB_IGNOREDMULTI, + proto == IPPROTO_UDPLITE); } return 0; } @@ -853,10 +759,10 @@ start_lookup: int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, int proto) { + const struct in6_addr *saddr, *daddr; struct net *net = dev_net(skb->dev); - struct sock *sk; struct udphdr *uh; - const struct in6_addr *saddr, *daddr; + struct sock *sk; u32 ulen = 0; if (!pskb_may_pull(skb, sizeof(struct udphdr))) @@ -910,7 +816,6 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, int ret; if (!uh->check && !udp_sk(sk)->no_check6_rx) { - sock_put(sk); udp6_csum_zero_error(skb); goto csum_error; } @@ -920,7 +825,6 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, ip6_compute_pseudo); ret = udpv6_queue_rcv_skb(sk, skb); - sock_put(sk); /* a return value > 0 means to resubmit the input */ if (ret > 0) @@ -940,7 +844,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, if (udp_lib_checksum_complete(skb)) goto csum_error; - UDP6_INC_STATS_BH(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE); + __UDP6_INC_STATS(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE); icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); kfree_skb(skb); @@ -954,9 +858,9 @@ short_packet: daddr, ntohs(uh->dest)); goto discard; csum_error: - UDP6_INC_STATS_BH(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE); + __UDP6_INC_STATS(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE); discard: - UDP6_INC_STATS_BH(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE); + __UDP6_INC_STATS(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE); kfree_skb(skb); return 0; } @@ -1068,13 +972,14 @@ send: err = ip6_send_skb(skb); if (err) { if (err == -ENOBUFS && !inet6_sk(sk)->recverr) { - UDP6_INC_STATS_USER(sock_net(sk), - UDP_MIB_SNDBUFERRORS, is_udplite); + UDP6_INC_STATS(sock_net(sk), + UDP_MIB_SNDBUFERRORS, is_udplite); err = 0; } - } else - UDP6_INC_STATS_USER(sock_net(sk), - UDP_MIB_OUTDATAGRAMS, is_udplite); + } else { + UDP6_INC_STATS(sock_net(sk), + UDP_MIB_OUTDATAGRAMS, is_udplite); + } return err; } @@ -1118,16 +1023,19 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct ip6_flowlabel *flowlabel = NULL; struct flowi6 fl6; struct dst_entry *dst; + struct ipcm6_cookie ipc6; int addr_len = msg->msg_namelen; int ulen = len; - int hlimit = -1; - int tclass = -1; - int dontfrag = -1; int corkreq = up->corkflag || msg->msg_flags&MSG_MORE; int err; int connected = 0; int is_udplite = IS_UDPLITE(sk); int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); + struct sockcm_cookie sockc; + + ipc6.hlimit = -1; + ipc6.tclass = -1; + ipc6.dontfrag = -1; /* destination address check */ if (sin6) { @@ -1247,14 +1155,15 @@ do_udp_sendmsg: fl6.flowi6_oif = np->sticky_pktinfo.ipi6_ifindex; fl6.flowi6_mark = sk->sk_mark; + sockc.tsflags = sk->sk_tsflags; if (msg->msg_controllen) { opt = &opt_space; memset(opt, 0, sizeof(struct ipv6_txoptions)); opt->tot_len = sizeof(*opt); + ipc6.opt = opt; - err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, opt, - &hlimit, &tclass, &dontfrag); + err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6, &sockc); if (err < 0) { fl6_sock_release(flowlabel); return err; @@ -1275,6 +1184,7 @@ do_udp_sendmsg: if (flowlabel) opt = fl6_merge_options(&opt_space, flowlabel, opt); opt = ipv6_fixup_options(&opt_space, opt); + ipc6.opt = opt; fl6.flowi6_proto = sk->sk_protocol; if (!ipv6_addr_any(daddr)) @@ -1304,11 +1214,11 @@ do_udp_sendmsg: goto out; } - if (hlimit < 0) - hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); + if (ipc6.hlimit < 0) + ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); - if (tclass < 0) - tclass = np->tclass; + if (ipc6.tclass < 0) + ipc6.tclass = np->tclass; if (msg->msg_flags&MSG_CONFIRM) goto do_confirm; @@ -1319,9 +1229,9 @@ back_from_confirm: struct sk_buff *skb; skb = ip6_make_skb(sk, getfrag, msg, ulen, - sizeof(struct udphdr), hlimit, tclass, opt, + sizeof(struct udphdr), &ipc6, &fl6, (struct rt6_info *)dst, - msg->msg_flags, dontfrag); + msg->msg_flags, &sockc); err = PTR_ERR(skb); if (!IS_ERR_OR_NULL(skb)) err = udp_v6_send_skb(skb, &fl6); @@ -1342,13 +1252,12 @@ back_from_confirm: up->pending = AF_INET6; do_append_data: - if (dontfrag < 0) - dontfrag = np->dontfrag; + if (ipc6.dontfrag < 0) + ipc6.dontfrag = np->dontfrag; up->len += ulen; - err = ip6_append_data(sk, getfrag, msg, ulen, - sizeof(struct udphdr), hlimit, tclass, opt, &fl6, - (struct rt6_info *)dst, - corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags, dontfrag); + err = ip6_append_data(sk, getfrag, msg, ulen, sizeof(struct udphdr), + &ipc6, &fl6, (struct rt6_info *)dst, + corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags, &sockc); if (err) udp_v6_flush_pending_frames(sk); else if (!corkreq) @@ -1391,8 +1300,8 @@ out: * seems like overkill. */ if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { - UDP6_INC_STATS_USER(sock_net(sk), - UDP_MIB_SNDBUFERRORS, is_udplite); + UDP6_INC_STATS(sock_net(sk), + UDP_MIB_SNDBUFERRORS, is_udplite); } return err; diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 2b0fbe692..ac858c480 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -36,19 +36,6 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { /* Packet is from an untrusted source, reset gso_segs. */ - int type = skb_shinfo(skb)->gso_type; - - if (unlikely(type & ~(SKB_GSO_UDP | - SKB_GSO_DODGY | - SKB_GSO_UDP_TUNNEL | - SKB_GSO_UDP_TUNNEL_CSUM | - SKB_GSO_TUNNEL_REMCSUM | - SKB_GSO_GRE | - SKB_GSO_GRE_CSUM | - SKB_GSO_IPIP | - SKB_GSO_SIT) || - !(type & (SKB_GSO_UDP)))) - goto out; skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss); @@ -153,7 +140,7 @@ static struct sk_buff **udp6_gro_receive(struct sk_buff **head, skip: NAPI_GRO_CB(skb)->is_ipv6 = 1; - return udp_gro_receive(head, skb, uh); + return udp_gro_receive(head, skb, uh, udp6_lib_lookup_skb); flush: NAPI_GRO_CB(skb)->flush = 1; @@ -173,7 +160,7 @@ static int udp6_gro_complete(struct sk_buff *skb, int nhoff) skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL; } - return udp_gro_complete(skb, nhoff); + return udp_gro_complete(skb, nhoff, udp6_lib_lookup_skb); } static const struct net_offload udpv6_offload = { @@ -184,7 +171,12 @@ static const struct net_offload udpv6_offload = { }, }; -int __init udp_offload_init(void) +int udpv6_offload_init(void) { return inet6_add_offload(&udpv6_offload, IPPROTO_UDP); } + +int udpv6_offload_exit(void) +{ + return inet6_del_offload(&udpv6_offload, IPPROTO_UDP); +} diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index 923abd6b3..8d2f7c9b4 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -1024,8 +1024,11 @@ static int irda_connect(struct socket *sock, struct sockaddr *uaddr, } /* Check if we have opened a local TSAP */ - if (!self->tsap) - irda_open_tsap(self, LSAP_ANY, addr->sir_name); + if (!self->tsap) { + err = irda_open_tsap(self, LSAP_ANY, addr->sir_name); + if (err) + goto out; + } /* Move to connecting socket, start sending Connect Requests */ sock->state = SS_CONNECTING; diff --git a/net/irda/ircomm/ircomm_tty.c b/net/irda/ircomm/ircomm_tty.c index da126ee6d..873c4b707 100644 --- a/net/irda/ircomm/ircomm_tty.c +++ b/net/irda/ircomm/ircomm_tty.c @@ -220,10 +220,11 @@ static int ircomm_tty_startup(struct ircomm_tty_cb *self) IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;); /* Check if already open */ - if (test_and_set_bit(ASYNCB_INITIALIZED, &self->port.flags)) { + if (tty_port_initialized(&self->port)) { pr_debug("%s(), already open so break out!\n", __func__); return 0; } + tty_port_set_initialized(&self->port, 1); /* Register with IrCOMM */ irda_notify_init(¬ify); @@ -257,7 +258,7 @@ static int ircomm_tty_startup(struct ircomm_tty_cb *self) return 0; err: - clear_bit(ASYNCB_INITIALIZED, &self->port.flags); + tty_port_set_initialized(&self->port, 0); return ret; } @@ -280,8 +281,8 @@ static int ircomm_tty_block_til_ready(struct ircomm_tty_cb *self, * If non-blocking mode is set, or the port is not enabled, * then make the check up front and then exit. */ - if (test_bit(TTY_IO_ERROR, &tty->flags)) { - port->flags |= ASYNC_NORMAL_ACTIVE; + if (tty_io_error(tty)) { + tty_port_set_active(port, 1); return 0; } @@ -289,7 +290,7 @@ static int ircomm_tty_block_til_ready(struct ircomm_tty_cb *self, /* nonblock mode is set */ if (C_BAUD(tty)) tty_port_raise_dtr_rts(port); - port->flags |= ASYNC_NORMAL_ACTIVE; + tty_port_set_active(port, 1); pr_debug("%s(), O_NONBLOCK requested!\n", __func__); return 0; } @@ -318,13 +319,12 @@ static int ircomm_tty_block_til_ready(struct ircomm_tty_cb *self, spin_unlock_irqrestore(&port->lock, flags); while (1) { - if (C_BAUD(tty) && test_bit(ASYNCB_INITIALIZED, &port->flags)) + if (C_BAUD(tty) && tty_port_initialized(port)) tty_port_raise_dtr_rts(port); set_current_state(TASK_INTERRUPTIBLE); - if (tty_hung_up_p(filp) || - !test_bit(ASYNCB_INITIALIZED, &port->flags)) { + if (tty_hung_up_p(filp) || !tty_port_initialized(port)) { retval = (port->flags & ASYNC_HUP_NOTIFY) ? -EAGAIN : -ERESTARTSYS; break; @@ -365,7 +365,7 @@ static int ircomm_tty_block_til_ready(struct ircomm_tty_cb *self, __FILE__, __LINE__, tty->driver->name, port->count); if (!retval) - port->flags |= ASYNC_NORMAL_ACTIVE; + tty_port_set_active(port, 1); return retval; } @@ -876,8 +876,9 @@ static void ircomm_tty_shutdown(struct ircomm_tty_cb *self) IRDA_ASSERT(self != NULL, return;); IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;); - if (!test_and_clear_bit(ASYNCB_INITIALIZED, &self->port.flags)) + if (!tty_port_initialized(&self->port)) return; + tty_port_set_initialized(&self->port, 0); ircomm_tty_detach_cable(self); @@ -925,7 +926,6 @@ static void ircomm_tty_hangup(struct tty_struct *tty) ircomm_tty_shutdown(self); spin_lock_irqsave(&port->lock, flags); - port->flags &= ~ASYNC_NORMAL_ACTIVE; if (port->tty) { set_bit(TTY_IO_ERROR, &port->tty->flags); tty_kref_put(port->tty); @@ -933,6 +933,7 @@ static void ircomm_tty_hangup(struct tty_struct *tty) port->tty = NULL; port->count = 0; spin_unlock_irqrestore(&port->lock, flags); + tty_port_set_active(port, 0); wake_up_interruptible(&port->open_wait); } @@ -999,7 +1000,7 @@ void ircomm_tty_check_modem_status(struct ircomm_tty_cb *self) if (status & IRCOMM_DCE_DELTA_ANY) { /*wake_up_interruptible(&self->delta_msr_wait);*/ } - if ((self->port.flags & ASYNC_CHECK_CD) && (status & IRCOMM_DELTA_CD)) { + if (tty_port_check_carrier(&self->port) && (status & IRCOMM_DELTA_CD)) { pr_debug("%s(), ircomm%d CD now %s...\n", __func__ , self->line, (status & IRCOMM_CD) ? "on" : "off"); @@ -1255,11 +1256,11 @@ static void ircomm_tty_line_info(struct ircomm_tty_cb *self, struct seq_file *m) seq_printf(m, "%cASYNC_CTS_FLOW", sep); sep = '|'; } - if (self->port.flags & ASYNC_CHECK_CD) { + if (tty_port_check_carrier(&self->port)) { seq_printf(m, "%cASYNC_CHECK_CD", sep); sep = '|'; } - if (self->port.flags & ASYNC_INITIALIZED) { + if (tty_port_initialized(&self->port)) { seq_printf(m, "%cASYNC_INITIALIZED", sep); sep = '|'; } @@ -1267,7 +1268,7 @@ static void ircomm_tty_line_info(struct ircomm_tty_cb *self, struct seq_file *m) seq_printf(m, "%cASYNC_LOW_LATENCY", sep); sep = '|'; } - if (self->port.flags & ASYNC_NORMAL_ACTIVE) { + if (tty_port_active(&self->port)) { seq_printf(m, "%cASYNC_NORMAL_ACTIVE", sep); sep = '|'; } diff --git a/net/irda/ircomm/ircomm_tty_attach.c b/net/irda/ircomm/ircomm_tty_attach.c index 61137f8b5..0a411019c 100644 --- a/net/irda/ircomm/ircomm_tty_attach.c +++ b/net/irda/ircomm/ircomm_tty_attach.c @@ -968,7 +968,7 @@ static int ircomm_tty_state_ready(struct ircomm_tty_cb *self, ircomm_tty_next_state(self, IRCOMM_TTY_SEARCH); ircomm_tty_start_watchdog_timer(self, 3*HZ); - if (self->port.flags & ASYNC_CHECK_CD) { + if (tty_port_check_carrier(&self->port)) { /* Drop carrier */ self->settings.dce = IRCOMM_DELTA_CD; ircomm_tty_check_modem_status(self); diff --git a/net/irda/ircomm/ircomm_tty_ioctl.c b/net/irda/ircomm/ircomm_tty_ioctl.c index d3687aaa2..d4fdf8f7b 100644 --- a/net/irda/ircomm/ircomm_tty_ioctl.c +++ b/net/irda/ircomm/ircomm_tty_ioctl.c @@ -86,21 +86,17 @@ static void ircomm_tty_change_speed(struct ircomm_tty_cb *self, ircomm_param_request(self, IRCOMM_DATA_RATE, FALSE); /* CTS flow control flag and modem status interrupts */ + tty_port_set_cts_flow(&self->port, cflag & CRTSCTS); if (cflag & CRTSCTS) { - self->port.flags |= ASYNC_CTS_FLOW; self->settings.flow_control |= IRCOMM_RTS_CTS_IN; /* This got me. Bummer. Jean II */ if (self->service_type == IRCOMM_3_WIRE_RAW) net_warn_ratelimited("%s(), enabling RTS/CTS on link that doesn't support it (3-wire-raw)\n", __func__); } else { - self->port.flags &= ~ASYNC_CTS_FLOW; self->settings.flow_control &= ~IRCOMM_RTS_CTS_IN; } - if (cflag & CLOCAL) - self->port.flags &= ~ASYNC_CHECK_CD; - else - self->port.flags |= ASYNC_CHECK_CD; + tty_port_set_check_carrier(&self->port, ~cflag & CLOCAL); #if 0 /* * Set up parity check flag @@ -166,7 +162,7 @@ void ircomm_tty_set_termios(struct tty_struct *tty, /* Handle transition away from B0 status */ if (!(old_termios->c_cflag & CBAUD) && (cflag & CBAUD)) { self->settings.dte |= IRCOMM_DTR; - if (!C_CRTSCTS(tty) || !test_bit(TTY_THROTTLED, &tty->flags)) + if (!C_CRTSCTS(tty) || !tty_throttled(tty)) self->settings.dte |= IRCOMM_RTS; ircomm_param_request(self, IRCOMM_DTE, TRUE); } @@ -190,7 +186,7 @@ int ircomm_tty_tiocmget(struct tty_struct *tty) struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data; unsigned int result; - if (tty->flags & (1 << TTY_IO_ERROR)) + if (tty_io_error(tty)) return -EIO; result = ((self->settings.dte & IRCOMM_RTS) ? TIOCM_RTS : 0) @@ -213,7 +209,7 @@ int ircomm_tty_tiocmset(struct tty_struct *tty, { struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data; - if (tty->flags & (1 << TTY_IO_ERROR)) + if (tty_io_error(tty)) return -EIO; IRDA_ASSERT(self != NULL, return -1;); @@ -328,7 +324,7 @@ static int ircomm_tty_set_serial_info(struct ircomm_tty_cb *self, check_and_exit: - if (self->flags & ASYNC_INITIALIZED) { + if (tty_port_initialized(self)) { if (((old_state.flags & ASYNC_SPD_MASK) != (self->flags & ASYNC_SPD_MASK)) || (old_driver.custom_divisor != driver->custom_divisor)) { @@ -362,7 +358,7 @@ int ircomm_tty_ioctl(struct tty_struct *tty, if ((cmd != TIOCGSERIAL) && (cmd != TIOCSSERIAL) && (cmd != TIOCSERCONFIG) && (cmd != TIOCSERGSTRUCT) && (cmd != TIOCMIWAIT) && (cmd != TIOCGICOUNT)) { - if (tty->flags & (1 << TTY_IO_ERROR)) + if (tty_io_error(tty)) return -EIO; } diff --git a/net/irda/irlan/irlan_eth.c b/net/irda/irlan/irlan_eth.c index fcfbe5794..d8b726728 100644 --- a/net/irda/irlan/irlan_eth.c +++ b/net/irda/irlan/irlan_eth.c @@ -181,7 +181,7 @@ static netdev_tx_t irlan_eth_xmit(struct sk_buff *skb, skb = new_skb; } - dev->trans_start = jiffies; + netif_trans_update(dev); len = skb->len; /* Now queue the packet in the transport layer */ diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 40662d732..0b68ba730 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1483,7 +1483,7 @@ static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos, long timeo; struct kcm_rx_msg *rxm; int err = 0; - size_t copied; + ssize_t copied; struct sk_buff *skb; /* Only support splice for SOCKSEQPACKET */ diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index cd479903d..6c54e03fe 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -128,6 +128,7 @@ static inline struct sock *l2tp_ip6_bind_lookup(struct net *net, */ static int l2tp_ip6_recv(struct sk_buff *skb) { + struct net *net = dev_net(skb->dev); struct sock *sk; u32 session_id; u32 tunnel_id; @@ -154,7 +155,7 @@ static int l2tp_ip6_recv(struct sk_buff *skb) } /* Ok, this is a data packet. Lookup the session. */ - session = l2tp_session_find(&init_net, NULL, session_id); + session = l2tp_session_find(net, NULL, session_id); if (session == NULL) goto discard; @@ -188,14 +189,14 @@ pass_up: goto discard; tunnel_id = ntohl(*(__be32 *) &skb->data[4]); - tunnel = l2tp_tunnel_find(&init_net, tunnel_id); + tunnel = l2tp_tunnel_find(net, tunnel_id); if (tunnel != NULL) sk = tunnel->sock; else { struct ipv6hdr *iph = ipv6_hdr(skb); read_lock_bh(&l2tp_ip6_lock); - sk = __l2tp_ip6_bind_lookup(&init_net, &iph->daddr, + sk = __l2tp_ip6_bind_lookup(net, &iph->daddr, 0, tunnel_id); read_unlock_bh(&l2tp_ip6_lock); } @@ -263,6 +264,7 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct sockaddr_l2tpip6 *addr = (struct sockaddr_l2tpip6 *) uaddr; + struct net *net = sock_net(sk); __be32 v4addr = 0; int addr_type; int err; @@ -286,7 +288,7 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) err = -EADDRINUSE; read_lock_bh(&l2tp_ip6_lock); - if (__l2tp_ip6_bind_lookup(&init_net, &addr->l2tp_addr, + if (__l2tp_ip6_bind_lookup(net, &addr->l2tp_addr, sk->sk_bound_dev_if, addr->l2tp_conn_id)) goto out_in_use; read_unlock_bh(&l2tp_ip6_lock); @@ -456,7 +458,7 @@ static int l2tp_ip6_backlog_recv(struct sock *sk, struct sk_buff *skb) return 0; drop: - IP_INC_STATS(&init_net, IPSTATS_MIB_INDISCARDS); + IP_INC_STATS(sock_net(sk), IPSTATS_MIB_INDISCARDS); kfree_skb(skb); return -1; } @@ -494,10 +496,9 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct ip6_flowlabel *flowlabel = NULL; struct dst_entry *dst = NULL; struct flowi6 fl6; + struct sockcm_cookie sockc_unused = {0}; + struct ipcm6_cookie ipc6; int addr_len = msg->msg_namelen; - int hlimit = -1; - int tclass = -1; - int dontfrag = -1; int transhdrlen = 4; /* zero session-id */ int ulen = len + transhdrlen; int err; @@ -519,6 +520,10 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowi6_mark = sk->sk_mark; + ipc6.hlimit = -1; + ipc6.tclass = -1; + ipc6.dontfrag = -1; + if (lsa) { if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; @@ -563,9 +568,10 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) opt = &opt_space; memset(opt, 0, sizeof(struct ipv6_txoptions)); opt->tot_len = sizeof(struct ipv6_txoptions); + ipc6.opt = opt; - err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, opt, - &hlimit, &tclass, &dontfrag); + err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6, + &sockc_unused); if (err < 0) { fl6_sock_release(flowlabel); return err; @@ -586,6 +592,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (flowlabel) opt = fl6_merge_options(&opt_space, flowlabel, opt); opt = ipv6_fixup_options(&opt_space, opt); + ipc6.opt = opt; fl6.flowi6_proto = sk->sk_protocol; if (!ipv6_addr_any(daddr)) @@ -610,14 +617,14 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) goto out; } - if (hlimit < 0) - hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); + if (ipc6.hlimit < 0) + ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); - if (tclass < 0) - tclass = np->tclass; + if (ipc6.tclass < 0) + ipc6.tclass = np->tclass; - if (dontfrag < 0) - dontfrag = np->dontfrag; + if (ipc6.dontfrag < 0) + ipc6.dontfrag = np->dontfrag; if (msg->msg_flags & MSG_CONFIRM) goto do_confirm; @@ -625,9 +632,9 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) back_from_confirm: lock_sock(sk); err = ip6_append_data(sk, ip_generic_getfrag, msg, - ulen, transhdrlen, hlimit, tclass, opt, + ulen, transhdrlen, &ipc6, &fl6, (struct rt6_info *)dst, - msg->msg_flags, dontfrag); + msg->msg_flags, &sockc_unused); if (err) ip6_flush_pending_frames(sk); else if (!(msg->msg_flags & MSG_MORE)) diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index 2caaa84ce..1d02e8d20 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -346,22 +346,30 @@ static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 portid, u32 seq, int fla if (nest == NULL) goto nla_put_failure; - if (nla_put_u64(skb, L2TP_ATTR_TX_PACKETS, - atomic_long_read(&tunnel->stats.tx_packets)) || - nla_put_u64(skb, L2TP_ATTR_TX_BYTES, - atomic_long_read(&tunnel->stats.tx_bytes)) || - nla_put_u64(skb, L2TP_ATTR_TX_ERRORS, - atomic_long_read(&tunnel->stats.tx_errors)) || - nla_put_u64(skb, L2TP_ATTR_RX_PACKETS, - atomic_long_read(&tunnel->stats.rx_packets)) || - nla_put_u64(skb, L2TP_ATTR_RX_BYTES, - atomic_long_read(&tunnel->stats.rx_bytes)) || - nla_put_u64(skb, L2TP_ATTR_RX_SEQ_DISCARDS, - atomic_long_read(&tunnel->stats.rx_seq_discards)) || - nla_put_u64(skb, L2TP_ATTR_RX_OOS_PACKETS, - atomic_long_read(&tunnel->stats.rx_oos_packets)) || - nla_put_u64(skb, L2TP_ATTR_RX_ERRORS, - atomic_long_read(&tunnel->stats.rx_errors))) + if (nla_put_u64_64bit(skb, L2TP_ATTR_TX_PACKETS, + atomic_long_read(&tunnel->stats.tx_packets), + L2TP_ATTR_STATS_PAD) || + nla_put_u64_64bit(skb, L2TP_ATTR_TX_BYTES, + atomic_long_read(&tunnel->stats.tx_bytes), + L2TP_ATTR_STATS_PAD) || + nla_put_u64_64bit(skb, L2TP_ATTR_TX_ERRORS, + atomic_long_read(&tunnel->stats.tx_errors), + L2TP_ATTR_STATS_PAD) || + nla_put_u64_64bit(skb, L2TP_ATTR_RX_PACKETS, + atomic_long_read(&tunnel->stats.rx_packets), + L2TP_ATTR_STATS_PAD) || + nla_put_u64_64bit(skb, L2TP_ATTR_RX_BYTES, + atomic_long_read(&tunnel->stats.rx_bytes), + L2TP_ATTR_STATS_PAD) || + nla_put_u64_64bit(skb, L2TP_ATTR_RX_SEQ_DISCARDS, + atomic_long_read(&tunnel->stats.rx_seq_discards), + L2TP_ATTR_STATS_PAD) || + nla_put_u64_64bit(skb, L2TP_ATTR_RX_OOS_PACKETS, + atomic_long_read(&tunnel->stats.rx_oos_packets), + L2TP_ATTR_STATS_PAD) || + nla_put_u64_64bit(skb, L2TP_ATTR_RX_ERRORS, + atomic_long_read(&tunnel->stats.rx_errors), + L2TP_ATTR_STATS_PAD)) goto nla_put_failure; nla_nest_end(skb, nest); @@ -746,29 +754,38 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl nla_put_u8(skb, L2TP_ATTR_USING_IPSEC, 1)) || #endif (session->reorder_timeout && - nla_put_msecs(skb, L2TP_ATTR_RECV_TIMEOUT, session->reorder_timeout))) + nla_put_msecs(skb, L2TP_ATTR_RECV_TIMEOUT, + session->reorder_timeout, L2TP_ATTR_PAD))) goto nla_put_failure; nest = nla_nest_start(skb, L2TP_ATTR_STATS); if (nest == NULL) goto nla_put_failure; - if (nla_put_u64(skb, L2TP_ATTR_TX_PACKETS, - atomic_long_read(&session->stats.tx_packets)) || - nla_put_u64(skb, L2TP_ATTR_TX_BYTES, - atomic_long_read(&session->stats.tx_bytes)) || - nla_put_u64(skb, L2TP_ATTR_TX_ERRORS, - atomic_long_read(&session->stats.tx_errors)) || - nla_put_u64(skb, L2TP_ATTR_RX_PACKETS, - atomic_long_read(&session->stats.rx_packets)) || - nla_put_u64(skb, L2TP_ATTR_RX_BYTES, - atomic_long_read(&session->stats.rx_bytes)) || - nla_put_u64(skb, L2TP_ATTR_RX_SEQ_DISCARDS, - atomic_long_read(&session->stats.rx_seq_discards)) || - nla_put_u64(skb, L2TP_ATTR_RX_OOS_PACKETS, - atomic_long_read(&session->stats.rx_oos_packets)) || - nla_put_u64(skb, L2TP_ATTR_RX_ERRORS, - atomic_long_read(&session->stats.rx_errors))) + if (nla_put_u64_64bit(skb, L2TP_ATTR_TX_PACKETS, + atomic_long_read(&session->stats.tx_packets), + L2TP_ATTR_STATS_PAD) || + nla_put_u64_64bit(skb, L2TP_ATTR_TX_BYTES, + atomic_long_read(&session->stats.tx_bytes), + L2TP_ATTR_STATS_PAD) || + nla_put_u64_64bit(skb, L2TP_ATTR_TX_ERRORS, + atomic_long_read(&session->stats.tx_errors), + L2TP_ATTR_STATS_PAD) || + nla_put_u64_64bit(skb, L2TP_ATTR_RX_PACKETS, + atomic_long_read(&session->stats.rx_packets), + L2TP_ATTR_STATS_PAD) || + nla_put_u64_64bit(skb, L2TP_ATTR_RX_BYTES, + atomic_long_read(&session->stats.rx_bytes), + L2TP_ATTR_STATS_PAD) || + nla_put_u64_64bit(skb, L2TP_ATTR_RX_SEQ_DISCARDS, + atomic_long_read(&session->stats.rx_seq_discards), + L2TP_ATTR_STATS_PAD) || + nla_put_u64_64bit(skb, L2TP_ATTR_RX_OOS_PACKETS, + atomic_long_read(&session->stats.rx_oos_packets), + L2TP_ATTR_STATS_PAD) || + nla_put_u64_64bit(skb, L2TP_ATTR_RX_ERRORS, + atomic_long_read(&session->stats.rx_errors), + L2TP_ATTR_STATS_PAD)) goto nla_put_failure; nla_nest_end(skb, nest); diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index e925037fa..6651a78e1 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -97,3 +97,66 @@ u32 l3mdev_fib_table_by_index(struct net *net, int ifindex) return tb_id; } EXPORT_SYMBOL_GPL(l3mdev_fib_table_by_index); + +/** + * l3mdev_get_rt6_dst - IPv6 route lookup based on flow. Returns + * cached route for L3 master device if relevant + * to flow + * @net: network namespace for device index lookup + * @fl6: IPv6 flow struct for lookup + */ + +struct dst_entry *l3mdev_get_rt6_dst(struct net *net, + const struct flowi6 *fl6) +{ + struct dst_entry *dst = NULL; + struct net_device *dev; + + if (fl6->flowi6_oif) { + rcu_read_lock(); + + dev = dev_get_by_index_rcu(net, fl6->flowi6_oif); + if (dev && netif_is_l3_slave(dev)) + dev = netdev_master_upper_dev_get_rcu(dev); + + if (dev && netif_is_l3_master(dev) && + dev->l3mdev_ops->l3mdev_get_rt6_dst) + dst = dev->l3mdev_ops->l3mdev_get_rt6_dst(dev, fl6); + + rcu_read_unlock(); + } + + return dst; +} +EXPORT_SYMBOL_GPL(l3mdev_get_rt6_dst); + +/** + * l3mdev_get_saddr - get source address for a flow based on an interface + * enslaved to an L3 master device + * @net: network namespace for device index lookup + * @ifindex: Interface index + * @fl4: IPv4 flow struct + */ + +int l3mdev_get_saddr(struct net *net, int ifindex, struct flowi4 *fl4) +{ + struct net_device *dev; + int rc = 0; + + if (ifindex) { + rcu_read_lock(); + + dev = dev_get_by_index_rcu(net, ifindex); + if (dev && netif_is_l3_slave(dev)) + dev = netdev_master_upper_dev_get_rcu(dev); + + if (dev && netif_is_l3_master(dev) && + dev->l3mdev_ops->l3mdev_get_saddr) + rc = dev->l3mdev_ops->l3mdev_get_saddr(dev, fl4); + + rcu_read_unlock(); + } + + return rc; +} +EXPORT_SYMBOL_GPL(l3mdev_get_saddr); diff --git a/net/lapb/lapb_in.c b/net/lapb/lapb_in.c index 5dba89913..182470847 100644 --- a/net/lapb/lapb_in.c +++ b/net/lapb/lapb_in.c @@ -444,10 +444,9 @@ static void lapb_state3_machine(struct lapb_cb *lapb, struct sk_buff *skb, break; case LAPB_FRMR: - lapb_dbg(1, "(%p) S3 RX FRMR(%d) %02X %02X %02X %02X %02X\n", + lapb_dbg(1, "(%p) S3 RX FRMR(%d) %5ph\n", lapb->dev, frame->pf, - skb->data[0], skb->data[1], skb->data[2], - skb->data[3], skb->data[4]); + skb->data); lapb_establish_data_link(lapb); lapb_dbg(0, "(%p) S3 -> S1\n", lapb->dev); lapb_requeue_frames(lapb); diff --git a/net/lapb/lapb_out.c b/net/lapb/lapb_out.c index ba4d015bd..482c94d9d 100644 --- a/net/lapb/lapb_out.c +++ b/net/lapb/lapb_out.c @@ -148,9 +148,7 @@ void lapb_transmit_buffer(struct lapb_cb *lapb, struct sk_buff *skb, int type) } } - lapb_dbg(2, "(%p) S%d TX %02X %02X %02X\n", - lapb->dev, lapb->state, - skb->data[0], skb->data[1], skb->data[2]); + lapb_dbg(2, "(%p) S%d TX %3ph\n", lapb->dev, lapb->state, skb->data); if (!lapb_data_transmit(lapb, skb)) kfree_skb(skb); diff --git a/net/lapb/lapb_subr.c b/net/lapb/lapb_subr.c index 9d0a426ec..3c1914df6 100644 --- a/net/lapb/lapb_subr.c +++ b/net/lapb/lapb_subr.c @@ -113,9 +113,7 @@ int lapb_decode(struct lapb_cb *lapb, struct sk_buff *skb, { frame->type = LAPB_ILLEGAL; - lapb_dbg(2, "(%p) S%d RX %02X %02X %02X\n", - lapb->dev, lapb->state, - skb->data[0], skb->data[1], skb->data[2]); + lapb_dbg(2, "(%p) S%d RX %3ph\n", lapb->dev, lapb->state, skb->data); /* We always need to look at 2 bytes, sometimes we need * to look at 3 and those cases are handled below. @@ -284,10 +282,9 @@ void lapb_transmit_frmr(struct lapb_cb *lapb) dptr++; *dptr++ = lapb->frmr_type; - lapb_dbg(1, "(%p) S%d TX FRMR %02X %02X %02X %02X %02X\n", + lapb_dbg(1, "(%p) S%d TX FRMR %5ph\n", lapb->dev, lapb->state, - skb->data[1], skb->data[2], skb->data[3], - skb->data[4], skb->data[5]); + &skb->data[1]); } else { dptr = skb_put(skb, 4); *dptr++ = LAPB_FRMR; @@ -299,9 +296,8 @@ void lapb_transmit_frmr(struct lapb_cb *lapb) dptr++; *dptr++ = lapb->frmr_type; - lapb_dbg(1, "(%p) S%d TX FRMR %02X %02X %02X\n", - lapb->dev, lapb->state, skb->data[1], - skb->data[2], skb->data[3]); + lapb_dbg(1, "(%p) S%d TX FRMR %3ph\n", + lapb->dev, lapb->state, &skb->data[1]); } lapb_transmit_buffer(lapb, skb, LAPB_RESPONSE); diff --git a/net/llc/llc_proc.c b/net/llc/llc_proc.c index 1a3c7e0f5..29c509c54 100644 --- a/net/llc/llc_proc.c +++ b/net/llc/llc_proc.c @@ -195,7 +195,7 @@ static int llc_seq_core_show(struct seq_file *seq, void *v) timer_pending(&llc->pf_cycle_timer.timer), timer_pending(&llc->rej_sent_timer.timer), timer_pending(&llc->busy_state_timer.timer), - !!sk->sk_backlog.tail, !!sock_owned_by_user(sk)); + !!sk->sk_backlog.tail, !!sk->sk_lock.owned); out: return 0; } diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index 4932e9f24..42fa81031 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -935,6 +935,7 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local, size_t len) { struct tid_ampdu_tx *tid_tx; + struct ieee80211_txq *txq; u16 capab, tid; u8 buf_size; bool amsdu; @@ -945,6 +946,10 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local, buf_size = (capab & IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK) >> 6; buf_size = min(buf_size, local->hw.max_tx_aggregation_subframes); + txq = sta->sta.txq[tid]; + if (!amsdu && txq) + set_bit(IEEE80211_TXQ_NO_AMSDU, &to_txq_info(txq)->flags); + mutex_lock(&sta->ampdu_mlme.mtx); tid_tx = rcu_dereference_protected_tid_tx(sta, tid); diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index fe1704c4e..0c12e4001 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -65,11 +65,13 @@ static int ieee80211_change_iface(struct wiphy *wiphy, return ret; if (type == NL80211_IFTYPE_AP_VLAN && - params && params->use_4addr == 0) + params && params->use_4addr == 0) { RCU_INIT_POINTER(sdata->u.vlan.sta, NULL); - else if (type == NL80211_IFTYPE_STATION && - params && params->use_4addr >= 0) + ieee80211_check_fast_rx_iface(sdata); + } else if (type == NL80211_IFTYPE_STATION && + params && params->use_4addr >= 0) { sdata->u.mgd.use_4addr = params->use_4addr; + } if (sdata->vif.type == NL80211_IFTYPE_MONITOR && flags) { struct ieee80211_local *local = sdata->local; @@ -732,6 +734,7 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, sdata->vif.bss_conf.beacon_int = params->beacon_interval; sdata->vif.bss_conf.dtim_period = params->dtim_period; sdata->vif.bss_conf.enable_beacon = true; + sdata->vif.bss_conf.allow_p2p_go_ps = sdata->vif.p2p; sdata->vif.bss_conf.ssid_len = params->ssid_len; if (params->ssid_len) @@ -1046,7 +1049,7 @@ static int sta_apply_parameters(struct ieee80211_local *local, int ret = 0; struct ieee80211_supported_band *sband; struct ieee80211_sub_if_data *sdata = sta->sdata; - enum ieee80211_band band = ieee80211_get_sdata_band(sdata); + enum nl80211_band band = ieee80211_get_sdata_band(sdata); u32 mask, set; sband = local->hw.wiphy->bands[band]; @@ -1202,6 +1205,9 @@ static int sta_apply_parameters(struct ieee80211_local *local, params->opmode_notif, band); } + if (params->support_p2p_ps >= 0) + sta->sta.support_p2p_ps = params->support_p2p_ps; + if (ieee80211_vif_is_mesh(&sdata->vif)) sta_apply_mesh_params(local, sta, params); @@ -1363,6 +1369,7 @@ static int ieee80211_change_station(struct wiphy *wiphy, rcu_assign_pointer(vlansdata->u.vlan.sta, sta); new_4addr = true; + __ieee80211_check_fast_rx_iface(vlansdata); } if (sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN && @@ -1499,7 +1506,7 @@ static void mpath_set_pinfo(struct mesh_path *mpath, u8 *next_hop, memset(pinfo, 0, sizeof(*pinfo)); - pinfo->generation = mesh_paths_generation; + pinfo->generation = mpath->sdata->u.mesh.mesh_paths_generation; pinfo->filled = MPATH_INFO_FRAME_QLEN | MPATH_INFO_SN | @@ -1577,7 +1584,7 @@ static void mpp_set_pinfo(struct mesh_path *mpath, u8 *mpp, memset(pinfo, 0, sizeof(*pinfo)); memcpy(mpp, mpath->mpp, ETH_ALEN); - pinfo->generation = mpp_paths_generation; + pinfo->generation = mpath->sdata->u.mesh.mpp_paths_generation; } static int ieee80211_get_mpp(struct wiphy *wiphy, struct net_device *dev, @@ -1841,7 +1848,7 @@ static int ieee80211_change_bss(struct wiphy *wiphy, struct bss_parameters *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); - enum ieee80211_band band; + enum nl80211_band band; u32 changed = 0; if (!sdata_dereference(sdata->u.ap.beacon, sdata)) @@ -1860,7 +1867,7 @@ static int ieee80211_change_bss(struct wiphy *wiphy, } if (!sdata->vif.bss_conf.use_short_slot && - band == IEEE80211_BAND_5GHZ) { + band == NL80211_BAND_5GHZ) { sdata->vif.bss_conf.use_short_slot = true; changed |= BSS_CHANGED_ERP_SLOT; } @@ -1885,6 +1892,7 @@ static int ieee80211_change_bss(struct wiphy *wiphy, sdata->flags |= IEEE80211_SDATA_DONT_BRIDGE_PACKETS; else sdata->flags &= ~IEEE80211_SDATA_DONT_BRIDGE_PACKETS; + ieee80211_check_fast_rx_iface(sdata); } if (params->ht_opmode >= 0) { @@ -2089,12 +2097,12 @@ static int ieee80211_leave_ocb(struct wiphy *wiphy, struct net_device *dev) } static int ieee80211_set_mcast_rate(struct wiphy *wiphy, struct net_device *dev, - int rate[IEEE80211_NUM_BANDS]) + int rate[NUM_NL80211_BANDS]) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); memcpy(sdata->vif.bss_conf.mcast_rate, rate, - sizeof(int) * IEEE80211_NUM_BANDS); + sizeof(int) * NUM_NL80211_BANDS); return 0; } @@ -2499,7 +2507,7 @@ static int ieee80211_set_bitrate_mask(struct wiphy *wiphy, return ret; } - for (i = 0; i < IEEE80211_NUM_BANDS; i++) { + for (i = 0; i < NUM_NL80211_BANDS; i++) { struct ieee80211_supported_band *sband = wiphy->bands[i]; int j; @@ -3127,7 +3135,7 @@ static int ieee80211_probe_client(struct wiphy *wiphy, struct net_device *dev, struct ieee80211_tx_info *info; struct sta_info *sta; struct ieee80211_chanctx_conf *chanctx_conf; - enum ieee80211_band band; + enum nl80211_band band; int ret; /* the lock is needed to assign the cookie later */ diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 4ab5c522c..b251b2f7f 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -127,6 +127,9 @@ static const char *hw_flag_names[] = { FLAG(BEACON_TX_STATUS), FLAG(NEEDS_UNIQUE_STA_ADDR), FLAG(SUPPORTS_REORDERING_BUFFER), + FLAG(USES_RSS), + FLAG(TX_AMSDU), + FLAG(TX_FRAG_LIST), #undef FLAG }; diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index 37ea30e07..a5ba739cd 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -169,21 +169,21 @@ static ssize_t ieee80211_if_write_##name(struct file *file, \ IEEE80211_IF_FILE_R(name) /* common attributes */ -IEEE80211_IF_FILE(rc_rateidx_mask_2ghz, rc_rateidx_mask[IEEE80211_BAND_2GHZ], +IEEE80211_IF_FILE(rc_rateidx_mask_2ghz, rc_rateidx_mask[NL80211_BAND_2GHZ], HEX); -IEEE80211_IF_FILE(rc_rateidx_mask_5ghz, rc_rateidx_mask[IEEE80211_BAND_5GHZ], +IEEE80211_IF_FILE(rc_rateidx_mask_5ghz, rc_rateidx_mask[NL80211_BAND_5GHZ], HEX); IEEE80211_IF_FILE(rc_rateidx_mcs_mask_2ghz, - rc_rateidx_mcs_mask[IEEE80211_BAND_2GHZ], HEXARRAY); + rc_rateidx_mcs_mask[NL80211_BAND_2GHZ], HEXARRAY); IEEE80211_IF_FILE(rc_rateidx_mcs_mask_5ghz, - rc_rateidx_mcs_mask[IEEE80211_BAND_5GHZ], HEXARRAY); + rc_rateidx_mcs_mask[NL80211_BAND_5GHZ], HEXARRAY); static ssize_t ieee80211_if_fmt_rc_rateidx_vht_mcs_mask_2ghz( const struct ieee80211_sub_if_data *sdata, char *buf, int buflen) { int i, len = 0; - const u16 *mask = sdata->rc_rateidx_vht_mcs_mask[IEEE80211_BAND_2GHZ]; + const u16 *mask = sdata->rc_rateidx_vht_mcs_mask[NL80211_BAND_2GHZ]; for (i = 0; i < NL80211_VHT_NSS_MAX; i++) len += scnprintf(buf + len, buflen - len, "%04x ", mask[i]); @@ -199,7 +199,7 @@ static ssize_t ieee80211_if_fmt_rc_rateidx_vht_mcs_mask_5ghz( char *buf, int buflen) { int i, len = 0; - const u16 *mask = sdata->rc_rateidx_vht_mcs_mask[IEEE80211_BAND_5GHZ]; + const u16 *mask = sdata->rc_rateidx_vht_mcs_mask[NL80211_BAND_5GHZ]; for (i = 0; i < NL80211_VHT_NSS_MAX; i++) len += scnprintf(buf + len, buflen - len, "%04x ", mask[i]); diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index a39512f09..33dfcbc2b 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -3,6 +3,7 @@ * Copyright (c) 2006 Jiri Benc <jbenc@suse.cz> * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH + * Copyright(c) 2016 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -51,31 +52,54 @@ static const struct file_operations sta_ ##name## _ops = { \ STA_FILE(aid, sta.aid, D); +static const char * const sta_flag_names[] = { +#define FLAG(F) [WLAN_STA_##F] = #F + FLAG(AUTH), + FLAG(ASSOC), + FLAG(PS_STA), + FLAG(AUTHORIZED), + FLAG(SHORT_PREAMBLE), + FLAG(WDS), + FLAG(CLEAR_PS_FILT), + FLAG(MFP), + FLAG(BLOCK_BA), + FLAG(PS_DRIVER), + FLAG(PSPOLL), + FLAG(TDLS_PEER), + FLAG(TDLS_PEER_AUTH), + FLAG(TDLS_INITIATOR), + FLAG(TDLS_CHAN_SWITCH), + FLAG(TDLS_OFF_CHANNEL), + FLAG(TDLS_WIDER_BW), + FLAG(UAPSD), + FLAG(SP), + FLAG(4ADDR_EVENT), + FLAG(INSERTED), + FLAG(RATE_CONTROL), + FLAG(TOFFSET_KNOWN), + FLAG(MPSP_OWNER), + FLAG(MPSP_RECIPIENT), + FLAG(PS_DELIVER), +#undef FLAG +}; + static ssize_t sta_flags_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { - char buf[121]; + char buf[16 * NUM_WLAN_STA_FLAGS], *pos = buf; + char *end = buf + sizeof(buf) - 1; struct sta_info *sta = file->private_data; + unsigned int flg; + + BUILD_BUG_ON(ARRAY_SIZE(sta_flag_names) != NUM_WLAN_STA_FLAGS); + + for (flg = 0; flg < NUM_WLAN_STA_FLAGS; flg++) { + if (test_sta_flag(sta, flg)) + pos += scnprintf(pos, end - pos, "%s\n", + sta_flag_names[flg]); + } -#define TEST(flg) \ - test_sta_flag(sta, WLAN_STA_##flg) ? #flg "\n" : "" - - int res = scnprintf(buf, sizeof(buf), - "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", - TEST(AUTH), TEST(ASSOC), TEST(PS_STA), - TEST(PS_DRIVER), TEST(AUTHORIZED), - TEST(SHORT_PREAMBLE), - sta->sta.wme ? "WME\n" : "", - TEST(WDS), TEST(CLEAR_PS_FILT), - TEST(MFP), TEST(BLOCK_BA), TEST(PSPOLL), - TEST(UAPSD), TEST(SP), TEST(TDLS_PEER), - TEST(TDLS_PEER_AUTH), TEST(TDLS_INITIATOR), - TEST(TDLS_CHAN_SWITCH), TEST(TDLS_OFF_CHANNEL), - TEST(4ADDR_EVENT), TEST(INSERTED), - TEST(RATE_CONTROL), TEST(TOFFSET_KNOWN), - TEST(MPSP_OWNER), TEST(MPSP_RECIPIENT)); -#undef TEST - return simple_read_from_buffer(userbuf, count, ppos, buf, res); + return simple_read_from_buffer(userbuf, count, ppos, buf, strlen(buf)); } STA_OPS(flags); @@ -151,11 +175,12 @@ static ssize_t sta_agg_status_read(struct file *file, char __user *userbuf, static ssize_t sta_agg_status_write(struct file *file, const char __user *userbuf, size_t count, loff_t *ppos) { - char _buf[12] = {}, *buf = _buf; + char _buf[25] = {}, *buf = _buf; struct sta_info *sta = file->private_data; bool start, tx; unsigned long tid; - int ret; + char *pos; + int ret, timeout = 5000; if (count > sizeof(_buf)) return -EINVAL; @@ -164,37 +189,48 @@ static ssize_t sta_agg_status_write(struct file *file, const char __user *userbu return -EFAULT; buf[sizeof(_buf) - 1] = '\0'; + pos = buf; + buf = strsep(&pos, " "); + if (!buf) + return -EINVAL; - if (strncmp(buf, "tx ", 3) == 0) { - buf += 3; + if (!strcmp(buf, "tx")) tx = true; - } else if (strncmp(buf, "rx ", 3) == 0) { - buf += 3; + else if (!strcmp(buf, "rx")) tx = false; - } else + else return -EINVAL; - if (strncmp(buf, "start ", 6) == 0) { - buf += 6; + buf = strsep(&pos, " "); + if (!buf) + return -EINVAL; + if (!strcmp(buf, "start")) { start = true; if (!tx) return -EINVAL; - } else if (strncmp(buf, "stop ", 5) == 0) { - buf += 5; + } else if (!strcmp(buf, "stop")) { start = false; - } else + } else { return -EINVAL; + } - ret = kstrtoul(buf, 0, &tid); - if (ret) - return ret; + buf = strsep(&pos, " "); + if (!buf) + return -EINVAL; + if (sscanf(buf, "timeout=%d", &timeout) == 1) { + buf = strsep(&pos, " "); + if (!buf || !tx || !start) + return -EINVAL; + } - if (tid >= IEEE80211_NUM_TIDS) + ret = kstrtoul(buf, 0, &tid); + if (ret || tid >= IEEE80211_NUM_TIDS) return -EINVAL; if (tx) { if (start) - ret = ieee80211_start_tx_ba_session(&sta->sta, tid, 5000); + ret = ieee80211_start_tx_ba_session(&sta->sta, tid, + timeout); else ret = ieee80211_stop_tx_ba_session(&sta->sta, tid); } else { @@ -322,14 +358,14 @@ STA_OPS(vht_capa); #define DEBUGFS_ADD(name) \ debugfs_create_file(#name, 0400, \ - sta->debugfs.dir, sta, &sta_ ##name## _ops); + sta->debugfs_dir, sta, &sta_ ##name## _ops); #define DEBUGFS_ADD_COUNTER(name, field) \ if (sizeof(sta->field) == sizeof(u32)) \ - debugfs_create_u32(#name, 0400, sta->debugfs.dir, \ + debugfs_create_u32(#name, 0400, sta->debugfs_dir, \ (u32 *) &sta->field); \ else \ - debugfs_create_u64(#name, 0400, sta->debugfs.dir, \ + debugfs_create_u64(#name, 0400, sta->debugfs_dir, \ (u64 *) &sta->field); void ieee80211_sta_debugfs_add(struct sta_info *sta) @@ -339,8 +375,6 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta) struct dentry *stations_dir = sta->sdata->debugfs.subdir_stations; u8 mac[3*ETH_ALEN]; - sta->debugfs.add_has_run = true; - if (!stations_dir) return; @@ -355,8 +389,8 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta) * destroyed quickly enough the old station's debugfs * dir might still be around. */ - sta->debugfs.dir = debugfs_create_dir(mac, stations_dir); - if (!sta->debugfs.dir) + sta->debugfs_dir = debugfs_create_dir(mac, stations_dir); + if (!sta->debugfs_dir) return; DEBUGFS_ADD(flags); @@ -372,14 +406,14 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta) if (sizeof(sta->driver_buffered_tids) == sizeof(u32)) debugfs_create_x32("driver_buffered_tids", 0400, - sta->debugfs.dir, + sta->debugfs_dir, (u32 *)&sta->driver_buffered_tids); else debugfs_create_x64("driver_buffered_tids", 0400, - sta->debugfs.dir, + sta->debugfs_dir, (u64 *)&sta->driver_buffered_tids); - drv_sta_add_debugfs(local, sdata, &sta->sta, sta->debugfs.dir); + drv_sta_add_debugfs(local, sdata, &sta->sta, sta->debugfs_dir); } void ieee80211_sta_debugfs_remove(struct sta_info *sta) @@ -387,7 +421,7 @@ void ieee80211_sta_debugfs_remove(struct sta_info *sta) struct ieee80211_local *local = sta->local; struct ieee80211_sub_if_data *sdata = sta->sdata; - drv_sta_remove_debugfs(local, sdata, &sta->sta, sta->debugfs.dir); - debugfs_remove_recursive(sta->debugfs.dir); - sta->debugfs.dir = NULL; + drv_sta_remove_debugfs(local, sdata, &sta->sta, sta->debugfs_dir); + debugfs_remove_recursive(sta->debugfs_dir); + sta->debugfs_dir = NULL; } diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 18b0d65ba..184473c25 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -1,3 +1,8 @@ +/* +* Portions of this file +* Copyright(c) 2016 Intel Deutschland GmbH +*/ + #ifndef __MAC80211_DRIVER_OPS #define __MAC80211_DRIVER_OPS @@ -29,6 +34,16 @@ static inline void drv_tx(struct ieee80211_local *local, local->ops->tx(&local->hw, control, skb); } +static inline void drv_sync_rx_queues(struct ieee80211_local *local, + struct sta_info *sta) +{ + if (local->ops->sync_rx_queues) { + trace_drv_sync_rx_queues(local, sta->sdata, &sta->sta); + local->ops->sync_rx_queues(&local->hw); + trace_drv_return_void(local); + } +} + static inline void drv_get_et_strings(struct ieee80211_sub_if_data *sdata, u32 sset, u8 *data) { diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index fc3238376..a31d30713 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -126,7 +126,7 @@ ieee80211_ibss_build_presp(struct ieee80211_sub_if_data *sdata, } } - if (sband->band == IEEE80211_BAND_2GHZ) { + if (sband->band == NL80211_BAND_2GHZ) { *pos++ = WLAN_EID_DS_PARAMS; *pos++ = 1; *pos++ = ieee80211_frequency_to_channel( @@ -348,11 +348,11 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, * * HT follows these specifications (IEEE 802.11-2012 20.3.18) */ - sdata->vif.bss_conf.use_short_slot = chan->band == IEEE80211_BAND_5GHZ; + sdata->vif.bss_conf.use_short_slot = chan->band == NL80211_BAND_5GHZ; bss_change |= BSS_CHANGED_ERP_SLOT; /* cf. IEEE 802.11 9.2.12 */ - if (chan->band == IEEE80211_BAND_2GHZ && have_higher_than_11mbit) + if (chan->band == NL80211_BAND_2GHZ && have_higher_than_11mbit) sdata->flags |= IEEE80211_SDATA_OPERATING_GMODE; else sdata->flags &= ~IEEE80211_SDATA_OPERATING_GMODE; @@ -649,8 +649,6 @@ ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata, const u8 *bssid, return NULL; } - sta->rx_stats.last_rx = jiffies; - /* make sure mandatory rates are always added */ sband = local->hw.wiphy->bands[band]; sta->sta.supp_rates[band] = supp_rates | @@ -670,10 +668,11 @@ static int ieee80211_sta_active_ibss(struct ieee80211_sub_if_data *sdata) rcu_read_lock(); list_for_each_entry_rcu(sta, &local->sta_list, list) { + unsigned long last_active = ieee80211_sta_last_active(sta); + if (sta->sdata == sdata && - time_after(sta->rx_stats.last_rx + - IEEE80211_IBSS_MERGE_INTERVAL, - jiffies)) { + time_is_after_jiffies(last_active + + IEEE80211_IBSS_MERGE_INTERVAL)) { active++; break; } @@ -990,7 +989,7 @@ static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *channel) { struct sta_info *sta; - enum ieee80211_band band = rx_status->band; + enum nl80211_band band = rx_status->band; enum nl80211_bss_scan_width scan_width; struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband = local->hw.wiphy->bands[band]; @@ -1110,7 +1109,7 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *channel; u64 beacon_timestamp, rx_timestamp; u32 supp_rates = 0; - enum ieee80211_band band = rx_status->band; + enum nl80211_band band = rx_status->band; channel = ieee80211_get_channel(local->hw.wiphy, rx_status->freq); if (!channel) @@ -1236,8 +1235,6 @@ void ieee80211_ibss_rx_no_sta(struct ieee80211_sub_if_data *sdata, if (!sta) return; - sta->rx_stats.last_rx = jiffies; - /* make sure mandatory rates are always added */ sband = local->hw.wiphy->bands[band]; sta->sta.supp_rates[band] = supp_rates | @@ -1259,11 +1256,13 @@ static void ieee80211_ibss_sta_expire(struct ieee80211_sub_if_data *sdata) mutex_lock(&local->sta_mtx); list_for_each_entry_safe(sta, tmp, &local->sta_list, list) { + unsigned long last_active = ieee80211_sta_last_active(sta); + if (sdata != sta->sdata) continue; - if (time_after(jiffies, sta->rx_stats.last_rx + exp_time) || - (time_after(jiffies, sta->rx_stats.last_rx + exp_rsn) && + if (time_is_before_jiffies(last_active + exp_time) || + (time_is_before_jiffies(last_active + exp_rsn) && sta->sta_state != IEEE80211_STA_AUTHORIZED)) { sta_dbg(sta->sdata, "expiring inactive %sSTA %pM\n", sta->sta_state != IEEE80211_STA_AUTHORIZED ? diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 422003540..9438c9406 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -696,6 +696,11 @@ struct ieee80211_if_mesh { /* offset from skb->data while building IE */ int meshconf_offset; + + struct mesh_table *mesh_paths; + struct mesh_table *mpp_paths; /* Store paths for MPP&MAP */ + int mesh_paths_generation; + int mpp_paths_generation; }; #ifdef CONFIG_MAC80211_MESH @@ -797,6 +802,7 @@ struct mac80211_qos_map { enum txq_info_flags { IEEE80211_TXQ_STOP, IEEE80211_TXQ_AMPDU, + IEEE80211_TXQ_NO_AMSDU, }; struct txq_info { @@ -890,13 +896,13 @@ struct ieee80211_sub_if_data { struct ieee80211_if_ap *bss; /* bitmap of allowed (non-MCS) rate indexes for rate control */ - u32 rc_rateidx_mask[IEEE80211_NUM_BANDS]; + u32 rc_rateidx_mask[NUM_NL80211_BANDS]; - bool rc_has_mcs_mask[IEEE80211_NUM_BANDS]; - u8 rc_rateidx_mcs_mask[IEEE80211_NUM_BANDS][IEEE80211_HT_MCS_MASK_LEN]; + bool rc_has_mcs_mask[NUM_NL80211_BANDS]; + u8 rc_rateidx_mcs_mask[NUM_NL80211_BANDS][IEEE80211_HT_MCS_MASK_LEN]; - bool rc_has_vht_mcs_mask[IEEE80211_NUM_BANDS]; - u16 rc_rateidx_vht_mcs_mask[IEEE80211_NUM_BANDS][NL80211_VHT_NSS_MAX]; + bool rc_has_vht_mcs_mask[NUM_NL80211_BANDS]; + u16 rc_rateidx_vht_mcs_mask[NUM_NL80211_BANDS][NL80211_VHT_NSS_MAX]; union { struct ieee80211_if_ap ap; @@ -951,10 +957,10 @@ sdata_assert_lock(struct ieee80211_sub_if_data *sdata) lockdep_assert_held(&sdata->wdev.mtx); } -static inline enum ieee80211_band +static inline enum nl80211_band ieee80211_get_sdata_band(struct ieee80211_sub_if_data *sdata) { - enum ieee80211_band band = IEEE80211_BAND_2GHZ; + enum nl80211_band band = NL80211_BAND_2GHZ; struct ieee80211_chanctx_conf *chanctx_conf; rcu_read_lock(); @@ -1225,7 +1231,7 @@ struct ieee80211_local { struct cfg80211_scan_request __rcu *scan_req; struct ieee80211_scan_request *hw_scan_req; struct cfg80211_chan_def scan_chandef; - enum ieee80211_band hw_scan_band; + enum nl80211_band hw_scan_band; int scan_channel_idx; int scan_ies_len; int hw_scan_ies_bufsize; @@ -1489,6 +1495,11 @@ u64 ieee80211_mgmt_tx_cookie(struct ieee80211_local *local); int ieee80211_attach_ack_skb(struct ieee80211_local *local, struct sk_buff *skb, u64 *cookie, gfp_t gfp); +void ieee80211_check_fast_rx(struct sta_info *sta); +void __ieee80211_check_fast_rx_iface(struct ieee80211_sub_if_data *sdata); +void ieee80211_check_fast_rx_iface(struct ieee80211_sub_if_data *sdata); +void ieee80211_clear_fast_rx(struct sta_info *sta); + /* STA code */ void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata); int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, @@ -1727,10 +1738,10 @@ void ieee80211_process_mu_groups(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt); u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, u8 opmode, - enum ieee80211_band band); + enum nl80211_band band); void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, u8 opmode, - enum ieee80211_band band); + enum nl80211_band band); void ieee80211_apply_vhtcap_overrides(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta_vht_cap *vht_cap); void ieee80211_get_vht_mask_from_cap(__le16 vht_cap, @@ -1758,7 +1769,7 @@ void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, */ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems, - enum ieee80211_band current_band, + enum nl80211_band current_band, u32 sta_flags, u8 *bssid, struct ieee80211_csa_ie *csa_ie); @@ -1783,7 +1794,7 @@ static inline int __ieee80211_resume(struct ieee80211_hw *hw) /* utility functions/constants */ extern const void *const mac80211_wiphy_privid; /* for wiphy privid */ -int ieee80211_frame_duration(enum ieee80211_band band, size_t len, +int ieee80211_frame_duration(enum nl80211_band band, size_t len, int rate, int erp, int short_preamble, int shift); void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata, @@ -1793,12 +1804,12 @@ void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int tid, - enum ieee80211_band band); + enum nl80211_band band); static inline void ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int tid, - enum ieee80211_band band) + enum nl80211_band band) { rcu_read_lock(); __ieee80211_tx_skb_tid_band(sdata, skb, tid, band); @@ -1953,7 +1964,7 @@ void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems, - enum ieee80211_band band, u32 *basic_rates); + enum nl80211_band band, u32 *basic_rates); int __ieee80211_request_smps_mgd(struct ieee80211_sub_if_data *sdata, enum ieee80211_smps_mode smps_mode); int __ieee80211_request_smps_ap(struct ieee80211_sub_if_data *sdata, @@ -1976,10 +1987,10 @@ int ieee80211_parse_bitrates(struct cfg80211_chan_def *chandef, const u8 *srates, int srates_len, u32 *rates); int ieee80211_add_srates_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, bool need_basic, - enum ieee80211_band band); + enum nl80211_band band); int ieee80211_add_ext_srates_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, bool need_basic, - enum ieee80211_band band); + enum nl80211_band band); u8 *ieee80211_add_wmm_info_ie(u8 *buf, u8 qosinfo); /* channel management */ diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index e1cb22c16..c59af3eb9 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1093,7 +1093,7 @@ static void ieee80211_teardown_sdata(struct ieee80211_sub_if_data *sdata) sdata->fragment_next = 0; if (ieee80211_vif_is_mesh(&sdata->vif)) - mesh_rmc_free(sdata); + ieee80211_mesh_teardown_sdata(sdata); } static void ieee80211_uninit(struct net_device *dev) @@ -1800,7 +1800,7 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, INIT_DELAYED_WORK(&sdata->dec_tailroom_needed_wk, ieee80211_delayed_tailroom_dec); - for (i = 0; i < IEEE80211_NUM_BANDS; i++) { + for (i = 0; i < NUM_NL80211_BANDS; i++) { struct ieee80211_supported_band *sband; sband = local->hw.wiphy->bands[i]; sdata->rc_rateidx_mask[i] = diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 3df7b0392..edd6f2945 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -338,6 +338,7 @@ static void ieee80211_key_replace(struct ieee80211_sub_if_data *sdata, } else { rcu_assign_pointer(sta->gtk[idx], new); } + ieee80211_check_fast_rx(sta); } else { defunikey = old && old == key_mtx_dereference(sdata->local, diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 8190bf27e..7ee91d615 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -558,6 +558,8 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, if (!ops->set_key) wiphy->flags |= WIPHY_FLAG_IBSS_RSN; + wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_RRM); + wiphy->bss_priv_size = sizeof(struct ieee80211_bss); local = wiphy_priv(wiphy); @@ -799,7 +801,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); int result, i; - enum ieee80211_band band; + enum nl80211_band band; int channels, max_bitrates; bool supp_ht, supp_vht; netdev_features_t feature_whitelist; @@ -854,7 +856,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) /* Only HW csum features are currently compatible with mac80211 */ feature_whitelist = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM | NETIF_F_SG | NETIF_F_HIGHDMA | - NETIF_F_GSO_SOFTWARE; + NETIF_F_GSO_SOFTWARE | NETIF_F_RXCSUM; if (WARN_ON(hw->netdev_features & ~feature_whitelist)) return -EINVAL; @@ -872,7 +874,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) max_bitrates = 0; supp_ht = false; supp_vht = false; - for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + for (band = 0; band < NUM_NL80211_BANDS; band++) { struct ieee80211_supported_band *sband; sband = local->hw.wiphy->bands[band]; @@ -934,7 +936,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) if (!local->int_scan_req) return -ENOMEM; - for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + for (band = 0; band < NUM_NL80211_BANDS; band++) { if (!local->hw.wiphy->bands[band]) continue; local->int_scan_req->rates[band] = (u32) -1; diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 34a5712d4..6a1603bcd 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -25,7 +25,6 @@ bool mesh_action_is_path_sel(struct ieee80211_mgmt *mgmt) void ieee80211s_init(void) { - mesh_pathtbl_init(); mesh_allocated = 1; rm_cache = kmem_cache_create("mesh_rmc", sizeof(struct rmc_entry), 0, 0, NULL); @@ -35,7 +34,6 @@ void ieee80211s_stop(void) { if (!mesh_allocated) return; - mesh_pathtbl_unregister(); kmem_cache_destroy(rm_cache); } @@ -183,22 +181,23 @@ int mesh_rmc_init(struct ieee80211_sub_if_data *sdata) return -ENOMEM; sdata->u.mesh.rmc->idx_mask = RMC_BUCKETS - 1; for (i = 0; i < RMC_BUCKETS; i++) - INIT_LIST_HEAD(&sdata->u.mesh.rmc->bucket[i]); + INIT_HLIST_HEAD(&sdata->u.mesh.rmc->bucket[i]); return 0; } void mesh_rmc_free(struct ieee80211_sub_if_data *sdata) { struct mesh_rmc *rmc = sdata->u.mesh.rmc; - struct rmc_entry *p, *n; + struct rmc_entry *p; + struct hlist_node *n; int i; if (!sdata->u.mesh.rmc) return; for (i = 0; i < RMC_BUCKETS; i++) { - list_for_each_entry_safe(p, n, &rmc->bucket[i], list) { - list_del(&p->list); + hlist_for_each_entry_safe(p, n, &rmc->bucket[i], list) { + hlist_del(&p->list); kmem_cache_free(rm_cache, p); } } @@ -227,16 +226,20 @@ int mesh_rmc_check(struct ieee80211_sub_if_data *sdata, u32 seqnum = 0; int entries = 0; u8 idx; - struct rmc_entry *p, *n; + struct rmc_entry *p; + struct hlist_node *n; + + if (!rmc) + return -1; /* Don't care about endianness since only match matters */ memcpy(&seqnum, &mesh_hdr->seqnum, sizeof(mesh_hdr->seqnum)); idx = le32_to_cpu(mesh_hdr->seqnum) & rmc->idx_mask; - list_for_each_entry_safe(p, n, &rmc->bucket[idx], list) { + hlist_for_each_entry_safe(p, n, &rmc->bucket[idx], list) { ++entries; if (time_after(jiffies, p->exp_time) || entries == RMC_QUEUE_MAX_LEN) { - list_del(&p->list); + hlist_del(&p->list); kmem_cache_free(rm_cache, p); --entries; } else if ((seqnum == p->seqnum) && ether_addr_equal(sa, p->sa)) @@ -250,7 +253,7 @@ int mesh_rmc_check(struct ieee80211_sub_if_data *sdata, p->seqnum = seqnum; p->exp_time = jiffies + RMC_TIMEOUT; memcpy(p->sa, sa, ETH_ALEN); - list_add(&p->list, &rmc->bucket[idx]); + hlist_add_head(&p->list, &rmc->bucket[idx]); return 0; } @@ -419,7 +422,7 @@ int mesh_add_ht_cap_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { struct ieee80211_local *local = sdata->local; - enum ieee80211_band band = ieee80211_get_sdata_band(sdata); + enum nl80211_band band = ieee80211_get_sdata_band(sdata); struct ieee80211_supported_band *sband; u8 *pos; @@ -482,7 +485,7 @@ int mesh_add_vht_cap_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { struct ieee80211_local *local = sdata->local; - enum ieee80211_band band = ieee80211_get_sdata_band(sdata); + enum nl80211_band band = ieee80211_get_sdata_band(sdata); struct ieee80211_supported_band *sband; u8 *pos; @@ -684,7 +687,7 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh) struct ieee80211_mgmt *mgmt; struct ieee80211_chanctx_conf *chanctx_conf; struct mesh_csa_settings *csa; - enum ieee80211_band band; + enum nl80211_band band; u8 *pos; struct ieee80211_sub_if_data *sdata; int hdr_len = offsetof(struct ieee80211_mgmt, u.beacon) + @@ -934,7 +937,7 @@ ieee80211_mesh_process_chnswitch(struct ieee80211_sub_if_data *sdata, struct cfg80211_csa_settings params; struct ieee80211_csa_ie csa_ie; struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; - enum ieee80211_band band = ieee80211_get_sdata_band(sdata); + enum nl80211_band band = ieee80211_get_sdata_band(sdata); int err; u32 sta_flags; @@ -1088,7 +1091,7 @@ static void ieee80211_mesh_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *channel; size_t baselen; int freq; - enum ieee80211_band band = rx_status->band; + enum nl80211_band band = rx_status->band; /* ignore ProbeResp to foreign address */ if (stype == IEEE80211_STYPE_PROBE_RESP && @@ -1355,12 +1358,6 @@ void ieee80211_mesh_work(struct ieee80211_sub_if_data *sdata) ifmsh->last_preq + msecs_to_jiffies(ifmsh->mshcfg.dot11MeshHWMPpreqMinInterval))) mesh_path_start_discovery(sdata); - if (test_and_clear_bit(MESH_WORK_GROW_MPATH_TABLE, &ifmsh->wrkq_flags)) - mesh_mpath_table_grow(); - - if (test_and_clear_bit(MESH_WORK_GROW_MPP_TABLE, &ifmsh->wrkq_flags)) - mesh_mpp_table_grow(); - if (test_and_clear_bit(MESH_WORK_HOUSEKEEPING, &ifmsh->wrkq_flags)) ieee80211_mesh_housekeeping(sdata); @@ -1395,6 +1392,9 @@ void ieee80211_mesh_init_sdata(struct ieee80211_sub_if_data *sdata) /* Allocate all mesh structures when creating the first mesh interface. */ if (!mesh_allocated) ieee80211s_init(); + + mesh_pathtbl_init(sdata); + setup_timer(&ifmsh->mesh_path_timer, ieee80211_mesh_path_timer, (unsigned long) sdata); @@ -1409,3 +1409,9 @@ void ieee80211_mesh_init_sdata(struct ieee80211_sub_if_data *sdata) sdata->vif.bss_conf.bssid = zero_addr; } + +void ieee80211_mesh_teardown_sdata(struct ieee80211_sub_if_data *sdata) +{ + mesh_rmc_free(sdata); + mesh_pathtbl_unregister(sdata); +} diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h index 87c017a3b..26b9ccbe1 100644 --- a/net/mac80211/mesh.h +++ b/net/mac80211/mesh.h @@ -21,8 +21,6 @@ /** * enum mesh_path_flags - mac80211 mesh path flags * - * - * * @MESH_PATH_ACTIVE: the mesh path can be used for forwarding * @MESH_PATH_RESOLVING: the discovery process is running for this mesh path * @MESH_PATH_SN_VALID: the mesh path contains a valid destination sequence @@ -32,6 +30,8 @@ * @MESH_PATH_RESOLVED: the mesh path can has been resolved * @MESH_PATH_REQ_QUEUED: there is an unsent path request for this destination * already queued up, waiting for the discovery process to start. + * @MESH_PATH_DELETED: the mesh path has been deleted and should no longer + * be used * * MESH_PATH_RESOLVED is used by the mesh path timer to * decide when to stop or cancel the mesh path discovery. @@ -43,6 +43,7 @@ enum mesh_path_flags { MESH_PATH_FIXED = BIT(3), MESH_PATH_RESOLVED = BIT(4), MESH_PATH_REQ_QUEUED = BIT(5), + MESH_PATH_DELETED = BIT(6), }; /** @@ -51,10 +52,6 @@ enum mesh_path_flags { * * * @MESH_WORK_HOUSEKEEPING: run the periodic mesh housekeeping tasks - * @MESH_WORK_GROW_MPATH_TABLE: the mesh path table is full and needs - * to grow. - * @MESH_WORK_GROW_MPP_TABLE: the mesh portals table is full and needs to - * grow * @MESH_WORK_ROOT: the mesh root station needs to send a frame * @MESH_WORK_DRIFT_ADJUST: time to compensate for clock drift relative to other * mesh nodes @@ -62,8 +59,6 @@ enum mesh_path_flags { */ enum mesh_deferred_task_flags { MESH_WORK_HOUSEKEEPING, - MESH_WORK_GROW_MPATH_TABLE, - MESH_WORK_GROW_MPP_TABLE, MESH_WORK_ROOT, MESH_WORK_DRIFT_ADJUST, MESH_WORK_MBSS_CHANGED, @@ -73,12 +68,16 @@ enum mesh_deferred_task_flags { * struct mesh_path - mac80211 mesh path structure * * @dst: mesh path destination mac address + * @mpp: mesh proxy mac address + * @rhash: rhashtable list pointer + * @gate_list: list pointer for known gates list * @sdata: mesh subif * @next_hop: mesh neighbor to which frames for this destination will be * forwarded * @timer: mesh path discovery timer * @frame_queue: pending queue for frames sent to this destination while the * path is unresolved + * @rcu: rcu head for freeing mesh path * @sn: target sequence number * @metric: current metric to this destination * @hop_count: hops to destination @@ -97,14 +96,16 @@ enum mesh_deferred_task_flags { * @is_gate: the destination station of this path is a mesh gate * * - * The combination of dst and sdata is unique in the mesh path table. Since the - * next_hop STA is only protected by RCU as well, deleting the STA must also - * remove/substitute the mesh_path structure and wait until that is no longer - * reachable before destroying the STA completely. + * The dst address is unique in the mesh path table. Since the mesh_path is + * protected by RCU, deleting the next_hop STA must remove / substitute the + * mesh_path structure and wait until that is no longer reachable before + * destroying the STA completely. */ struct mesh_path { u8 dst[ETH_ALEN]; u8 mpp[ETH_ALEN]; /* used for MPP or MAP */ + struct rhash_head rhash; + struct hlist_node gate_list; struct ieee80211_sub_if_data *sdata; struct sta_info __rcu *next_hop; struct timer_list timer; @@ -128,34 +129,17 @@ struct mesh_path { /** * struct mesh_table * - * @hash_buckets: array of hash buckets of the table - * @hashwlock: array of locks to protect write operations, one per bucket - * @hash_mask: 2^size_order - 1, used to compute hash idx - * @hash_rnd: random value used for hash computations - * @entries: number of entries in the table - * @free_node: function to free nodes of the table - * @copy_node: function to copy nodes of the table - * @size_order: determines size of the table, there will be 2^size_order hash - * buckets * @known_gates: list of known mesh gates and their mpaths by the station. The * gate's mpath may or may not be resolved and active. - * - * rcu_head: RCU head to free the table + * @gates_lock: protects updates to known_gates + * @rhead: the rhashtable containing struct mesh_paths, keyed by dest addr + * @entries: number of entries in the table */ struct mesh_table { - /* Number of buckets will be 2^N */ - struct hlist_head *hash_buckets; - spinlock_t *hashwlock; /* One per bucket, for add/del */ - unsigned int hash_mask; /* (2^size_order) - 1 */ - __u32 hash_rnd; /* Used for hash generation */ - atomic_t entries; /* Up to MAX_MESH_NEIGHBOURS */ - void (*free_node) (struct hlist_node *p, bool free_leafs); - int (*copy_node) (struct hlist_node *p, struct mesh_table *newtbl); - int size_order; - struct hlist_head *known_gates; + struct hlist_head known_gates; spinlock_t gates_lock; - - struct rcu_head rcu_head; + struct rhashtable rhead; + atomic_t entries; /* Up to MAX_MESH_NEIGHBOURS */ }; /* Recent multicast cache */ @@ -170,20 +154,21 @@ struct mesh_table { * @seqnum: mesh sequence number of the frame * @exp_time: expiration time of the entry, in jiffies * @sa: source address of the frame + * @list: hashtable list pointer * * The Recent Multicast Cache keeps track of the latest multicast frames that * have been received by a mesh interface and discards received multicast frames * that are found in the cache. */ struct rmc_entry { - struct list_head list; - u32 seqnum; + struct hlist_node list; unsigned long exp_time; + u32 seqnum; u8 sa[ETH_ALEN]; }; struct mesh_rmc { - struct list_head bucket[RMC_BUCKETS]; + struct hlist_head bucket[RMC_BUCKETS]; u32 idx_mask; }; @@ -234,6 +219,7 @@ void ieee80211s_init(void); void ieee80211s_update_metric(struct ieee80211_local *local, struct sta_info *sta, struct sk_buff *skb); void ieee80211_mesh_init_sdata(struct ieee80211_sub_if_data *sdata); +void ieee80211_mesh_teardown_sdata(struct ieee80211_sub_if_data *sdata); int ieee80211_start_mesh(struct ieee80211_sub_if_data *sdata); void ieee80211_stop_mesh(struct ieee80211_sub_if_data *sdata); void ieee80211_mesh_root_setup(struct ieee80211_if_mesh *ifmsh); @@ -299,9 +285,6 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, void mesh_sta_cleanup(struct sta_info *sta); /* Private interfaces */ -/* Mesh tables */ -void mesh_mpath_table_grow(void); -void mesh_mpp_table_grow(void); /* Mesh paths */ int mesh_path_error_tx(struct ieee80211_sub_if_data *sdata, u8 ttl, const u8 *target, u32 target_sn, @@ -309,8 +292,8 @@ int mesh_path_error_tx(struct ieee80211_sub_if_data *sdata, void mesh_path_assign_nexthop(struct mesh_path *mpath, struct sta_info *sta); void mesh_path_flush_pending(struct mesh_path *mpath); void mesh_path_tx_pending(struct mesh_path *mpath); -int mesh_pathtbl_init(void); -void mesh_pathtbl_unregister(void); +int mesh_pathtbl_init(struct ieee80211_sub_if_data *sdata); +void mesh_pathtbl_unregister(struct ieee80211_sub_if_data *sdata); int mesh_path_del(struct ieee80211_sub_if_data *sdata, const u8 *addr); void mesh_path_timer(unsigned long data); void mesh_path_flush_by_nexthop(struct sta_info *sta); @@ -319,8 +302,6 @@ void mesh_path_discard_frame(struct ieee80211_sub_if_data *sdata, void mesh_path_tx_root_frame(struct ieee80211_sub_if_data *sdata); bool mesh_action_is_path_sel(struct ieee80211_mgmt *mgmt); -extern int mesh_paths_generation; -extern int mpp_paths_generation; #ifdef CONFIG_MAC80211_MESH static inline diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index 002244bca..8f9c3bde8 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -1012,6 +1012,10 @@ void mesh_path_start_discovery(struct ieee80211_sub_if_data *sdata) goto enddiscovery; spin_lock_bh(&mpath->state_lock); + if (mpath->flags & MESH_PATH_DELETED) { + spin_unlock_bh(&mpath->state_lock); + goto enddiscovery; + } mpath->flags &= ~MESH_PATH_REQ_QUEUED; if (preq_node->flags & PREQ_Q_F_START) { if (mpath->flags & MESH_PATH_RESOLVING) { diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c index 2ba7aa56b..6db2ddfa0 100644 --- a/net/mac80211/mesh_pathtbl.c +++ b/net/mac80211/mesh_pathtbl.c @@ -18,11 +18,22 @@ #include "ieee80211_i.h" #include "mesh.h" -/* There will be initially 2^INIT_PATHS_SIZE_ORDER buckets */ -#define INIT_PATHS_SIZE_ORDER 2 +static void mesh_path_free_rcu(struct mesh_table *tbl, struct mesh_path *mpath); -/* Keep the mean chain length below this constant */ -#define MEAN_CHAIN_LEN 2 +static u32 mesh_table_hash(const void *addr, u32 len, u32 seed) +{ + /* Use last four bytes of hw addr as hash index */ + return jhash_1word(*(u32 *)(addr+2), seed); +} + +static const struct rhashtable_params mesh_rht_params = { + .nelem_hint = 2, + .automatic_shrinking = true, + .key_len = ETH_ALEN, + .key_offset = offsetof(struct mesh_path, dst), + .head_offset = offsetof(struct mesh_path, rhash), + .hashfn = mesh_table_hash, +}; static inline bool mpath_expired(struct mesh_path *mpath) { @@ -31,173 +42,36 @@ static inline bool mpath_expired(struct mesh_path *mpath) !(mpath->flags & MESH_PATH_FIXED); } -struct mpath_node { - struct hlist_node list; - struct rcu_head rcu; - /* This indirection allows two different tables to point to the same - * mesh_path structure, useful when resizing - */ - struct mesh_path *mpath; -}; - -static struct mesh_table __rcu *mesh_paths; -static struct mesh_table __rcu *mpp_paths; /* Store paths for MPP&MAP */ - -int mesh_paths_generation; -int mpp_paths_generation; - -/* This lock will have the grow table function as writer and add / delete nodes - * as readers. RCU provides sufficient protection only when reading the table - * (i.e. doing lookups). Adding or adding or removing nodes requires we take - * the read lock or we risk operating on an old table. The write lock is only - * needed when modifying the number of buckets a table. - */ -static DEFINE_RWLOCK(pathtbl_resize_lock); - - -static inline struct mesh_table *resize_dereference_paths( - struct mesh_table __rcu *table) +static void mesh_path_rht_free(void *ptr, void *tblptr) { - return rcu_dereference_protected(table, - lockdep_is_held(&pathtbl_resize_lock)); -} + struct mesh_path *mpath = ptr; + struct mesh_table *tbl = tblptr; -static inline struct mesh_table *resize_dereference_mesh_paths(void) -{ - return resize_dereference_paths(mesh_paths); + mesh_path_free_rcu(tbl, mpath); } -static inline struct mesh_table *resize_dereference_mpp_paths(void) +static struct mesh_table *mesh_table_alloc(void) { - return resize_dereference_paths(mpp_paths); -} - -/* - * CAREFUL -- "tbl" must not be an expression, - * in particular not an rcu_dereference(), since - * it's used twice. So it is illegal to do - * for_each_mesh_entry(rcu_dereference(...), ...) - */ -#define for_each_mesh_entry(tbl, node, i) \ - for (i = 0; i <= tbl->hash_mask; i++) \ - hlist_for_each_entry_rcu(node, &tbl->hash_buckets[i], list) - - -static struct mesh_table *mesh_table_alloc(int size_order) -{ - int i; struct mesh_table *newtbl; newtbl = kmalloc(sizeof(struct mesh_table), GFP_ATOMIC); if (!newtbl) return NULL; - newtbl->hash_buckets = kzalloc(sizeof(struct hlist_head) * - (1 << size_order), GFP_ATOMIC); - - if (!newtbl->hash_buckets) { - kfree(newtbl); - return NULL; - } - - newtbl->hashwlock = kmalloc(sizeof(spinlock_t) * - (1 << size_order), GFP_ATOMIC); - if (!newtbl->hashwlock) { - kfree(newtbl->hash_buckets); - kfree(newtbl); - return NULL; - } - - newtbl->size_order = size_order; - newtbl->hash_mask = (1 << size_order) - 1; + INIT_HLIST_HEAD(&newtbl->known_gates); atomic_set(&newtbl->entries, 0); - get_random_bytes(&newtbl->hash_rnd, - sizeof(newtbl->hash_rnd)); - for (i = 0; i <= newtbl->hash_mask; i++) - spin_lock_init(&newtbl->hashwlock[i]); spin_lock_init(&newtbl->gates_lock); return newtbl; } -static void __mesh_table_free(struct mesh_table *tbl) +static void mesh_table_free(struct mesh_table *tbl) { - kfree(tbl->hash_buckets); - kfree(tbl->hashwlock); + rhashtable_free_and_destroy(&tbl->rhead, + mesh_path_rht_free, tbl); kfree(tbl); } -static void mesh_table_free(struct mesh_table *tbl, bool free_leafs) -{ - struct hlist_head *mesh_hash; - struct hlist_node *p, *q; - struct mpath_node *gate; - int i; - - mesh_hash = tbl->hash_buckets; - for (i = 0; i <= tbl->hash_mask; i++) { - spin_lock_bh(&tbl->hashwlock[i]); - hlist_for_each_safe(p, q, &mesh_hash[i]) { - tbl->free_node(p, free_leafs); - atomic_dec(&tbl->entries); - } - spin_unlock_bh(&tbl->hashwlock[i]); - } - if (free_leafs) { - spin_lock_bh(&tbl->gates_lock); - hlist_for_each_entry_safe(gate, q, - tbl->known_gates, list) { - hlist_del(&gate->list); - kfree(gate); - } - kfree(tbl->known_gates); - spin_unlock_bh(&tbl->gates_lock); - } - - __mesh_table_free(tbl); -} - -static int mesh_table_grow(struct mesh_table *oldtbl, - struct mesh_table *newtbl) -{ - struct hlist_head *oldhash; - struct hlist_node *p, *q; - int i; - - if (atomic_read(&oldtbl->entries) - < MEAN_CHAIN_LEN * (oldtbl->hash_mask + 1)) - return -EAGAIN; - - newtbl->free_node = oldtbl->free_node; - newtbl->copy_node = oldtbl->copy_node; - newtbl->known_gates = oldtbl->known_gates; - atomic_set(&newtbl->entries, atomic_read(&oldtbl->entries)); - - oldhash = oldtbl->hash_buckets; - for (i = 0; i <= oldtbl->hash_mask; i++) - hlist_for_each(p, &oldhash[i]) - if (oldtbl->copy_node(p, newtbl) < 0) - goto errcopy; - - return 0; - -errcopy: - for (i = 0; i <= newtbl->hash_mask; i++) { - hlist_for_each_safe(p, q, &newtbl->hash_buckets[i]) - oldtbl->free_node(p, 0); - } - return -ENOMEM; -} - -static u32 mesh_table_hash(const u8 *addr, struct ieee80211_sub_if_data *sdata, - struct mesh_table *tbl) -{ - /* Use last four bytes of hw addr and interface index as hash index */ - return jhash_2words(*(u32 *)(addr+2), sdata->dev->ifindex, - tbl->hash_rnd) & tbl->hash_mask; -} - - /** * * mesh_path_assign_nexthop - update mesh path next hop @@ -340,23 +214,15 @@ static struct mesh_path *mpath_lookup(struct mesh_table *tbl, const u8 *dst, struct ieee80211_sub_if_data *sdata) { struct mesh_path *mpath; - struct hlist_head *bucket; - struct mpath_node *node; - - bucket = &tbl->hash_buckets[mesh_table_hash(dst, sdata, tbl)]; - hlist_for_each_entry_rcu(node, bucket, list) { - mpath = node->mpath; - if (mpath->sdata == sdata && - ether_addr_equal(dst, mpath->dst)) { - if (mpath_expired(mpath)) { - spin_lock_bh(&mpath->state_lock); - mpath->flags &= ~MESH_PATH_ACTIVE; - spin_unlock_bh(&mpath->state_lock); - } - return mpath; - } + + mpath = rhashtable_lookup_fast(&tbl->rhead, dst, mesh_rht_params); + + if (mpath && mpath_expired(mpath)) { + spin_lock_bh(&mpath->state_lock); + mpath->flags &= ~MESH_PATH_ACTIVE; + spin_unlock_bh(&mpath->state_lock); } - return NULL; + return mpath; } /** @@ -371,15 +237,52 @@ static struct mesh_path *mpath_lookup(struct mesh_table *tbl, const u8 *dst, struct mesh_path * mesh_path_lookup(struct ieee80211_sub_if_data *sdata, const u8 *dst) { - return mpath_lookup(rcu_dereference(mesh_paths), dst, sdata); + return mpath_lookup(sdata->u.mesh.mesh_paths, dst, sdata); } struct mesh_path * mpp_path_lookup(struct ieee80211_sub_if_data *sdata, const u8 *dst) { - return mpath_lookup(rcu_dereference(mpp_paths), dst, sdata); + return mpath_lookup(sdata->u.mesh.mpp_paths, dst, sdata); } +static struct mesh_path * +__mesh_path_lookup_by_idx(struct mesh_table *tbl, int idx) +{ + int i = 0, ret; + struct mesh_path *mpath = NULL; + struct rhashtable_iter iter; + + ret = rhashtable_walk_init(&tbl->rhead, &iter, GFP_ATOMIC); + if (ret) + return NULL; + + ret = rhashtable_walk_start(&iter); + if (ret && ret != -EAGAIN) + goto err; + + while ((mpath = rhashtable_walk_next(&iter))) { + if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN) + continue; + if (IS_ERR(mpath)) + break; + if (i++ == idx) + break; + } +err: + rhashtable_walk_stop(&iter); + rhashtable_walk_exit(&iter); + + if (IS_ERR(mpath) || !mpath) + return NULL; + + if (mpath_expired(mpath)) { + spin_lock_bh(&mpath->state_lock); + mpath->flags &= ~MESH_PATH_ACTIVE; + spin_unlock_bh(&mpath->state_lock); + } + return mpath; +} /** * mesh_path_lookup_by_idx - look up a path in the mesh path table by its index @@ -393,25 +296,7 @@ mpp_path_lookup(struct ieee80211_sub_if_data *sdata, const u8 *dst) struct mesh_path * mesh_path_lookup_by_idx(struct ieee80211_sub_if_data *sdata, int idx) { - struct mesh_table *tbl = rcu_dereference(mesh_paths); - struct mpath_node *node; - int i; - int j = 0; - - for_each_mesh_entry(tbl, node, i) { - if (sdata && node->mpath->sdata != sdata) - continue; - if (j++ == idx) { - if (mpath_expired(node->mpath)) { - spin_lock_bh(&node->mpath->state_lock); - node->mpath->flags &= ~MESH_PATH_ACTIVE; - spin_unlock_bh(&node->mpath->state_lock); - } - return node->mpath; - } - } - - return NULL; + return __mesh_path_lookup_by_idx(sdata->u.mesh.mesh_paths, idx); } /** @@ -426,19 +311,7 @@ mesh_path_lookup_by_idx(struct ieee80211_sub_if_data *sdata, int idx) struct mesh_path * mpp_path_lookup_by_idx(struct ieee80211_sub_if_data *sdata, int idx) { - struct mesh_table *tbl = rcu_dereference(mpp_paths); - struct mpath_node *node; - int i; - int j = 0; - - for_each_mesh_entry(tbl, node, i) { - if (sdata && node->mpath->sdata != sdata) - continue; - if (j++ == idx) - return node->mpath; - } - - return NULL; + return __mesh_path_lookup_by_idx(sdata->u.mesh.mpp_paths, idx); } /** @@ -448,30 +321,26 @@ mpp_path_lookup_by_idx(struct ieee80211_sub_if_data *sdata, int idx) int mesh_path_add_gate(struct mesh_path *mpath) { struct mesh_table *tbl; - struct mpath_node *gate, *new_gate; int err; rcu_read_lock(); - tbl = rcu_dereference(mesh_paths); - - hlist_for_each_entry_rcu(gate, tbl->known_gates, list) - if (gate->mpath == mpath) { - err = -EEXIST; - goto err_rcu; - } + tbl = mpath->sdata->u.mesh.mesh_paths; - new_gate = kzalloc(sizeof(struct mpath_node), GFP_ATOMIC); - if (!new_gate) { - err = -ENOMEM; + spin_lock_bh(&mpath->state_lock); + if (mpath->is_gate) { + err = -EEXIST; + spin_unlock_bh(&mpath->state_lock); goto err_rcu; } - mpath->is_gate = true; mpath->sdata->u.mesh.num_gates++; - new_gate->mpath = mpath; - spin_lock_bh(&tbl->gates_lock); - hlist_add_head_rcu(&new_gate->list, tbl->known_gates); - spin_unlock_bh(&tbl->gates_lock); + + spin_lock(&tbl->gates_lock); + hlist_add_head_rcu(&mpath->gate_list, &tbl->known_gates); + spin_unlock(&tbl->gates_lock); + + spin_unlock_bh(&mpath->state_lock); + mpath_dbg(mpath->sdata, "Mesh path: Recorded new gate: %pM. %d known gates\n", mpath->dst, mpath->sdata->u.mesh.num_gates); @@ -485,28 +354,22 @@ err_rcu: * mesh_gate_del - remove a mesh gate from the list of known gates * @tbl: table which holds our list of known gates * @mpath: gate mpath - * - * Locking: must be called inside rcu_read_lock() section */ static void mesh_gate_del(struct mesh_table *tbl, struct mesh_path *mpath) { - struct mpath_node *gate; - struct hlist_node *q; + lockdep_assert_held(&mpath->state_lock); + if (!mpath->is_gate) + return; - hlist_for_each_entry_safe(gate, q, tbl->known_gates, list) { - if (gate->mpath != mpath) - continue; - spin_lock_bh(&tbl->gates_lock); - hlist_del_rcu(&gate->list); - kfree_rcu(gate, rcu); - spin_unlock_bh(&tbl->gates_lock); - mpath->sdata->u.mesh.num_gates--; - mpath->is_gate = false; - mpath_dbg(mpath->sdata, - "Mesh path: Deleted gate: %pM. %d known gates\n", - mpath->dst, mpath->sdata->u.mesh.num_gates); - break; - } + mpath->is_gate = false; + spin_lock_bh(&tbl->gates_lock); + hlist_del_rcu(&mpath->gate_list); + mpath->sdata->u.mesh.num_gates--; + spin_unlock_bh(&tbl->gates_lock); + + mpath_dbg(mpath->sdata, + "Mesh path: Deleted gate: %pM. %d known gates\n", + mpath->dst, mpath->sdata->u.mesh.num_gates); } /** @@ -518,6 +381,31 @@ int mesh_gate_num(struct ieee80211_sub_if_data *sdata) return sdata->u.mesh.num_gates; } +static +struct mesh_path *mesh_path_new(struct ieee80211_sub_if_data *sdata, + const u8 *dst, gfp_t gfp_flags) +{ + struct mesh_path *new_mpath; + + new_mpath = kzalloc(sizeof(struct mesh_path), gfp_flags); + if (!new_mpath) + return NULL; + + memcpy(new_mpath->dst, dst, ETH_ALEN); + eth_broadcast_addr(new_mpath->rann_snd_addr); + new_mpath->is_root = false; + new_mpath->sdata = sdata; + new_mpath->flags = 0; + skb_queue_head_init(&new_mpath->frame_queue); + new_mpath->timer.data = (unsigned long) new_mpath; + new_mpath->timer.function = mesh_path_timer; + new_mpath->exp_time = jiffies; + spin_lock_init(&new_mpath->state_lock); + init_timer(&new_mpath->timer); + + return new_mpath; +} + /** * mesh_path_add - allocate and add a new path to the mesh path table * @dst: destination address of the path (ETH_ALEN length) @@ -530,15 +418,9 @@ int mesh_gate_num(struct ieee80211_sub_if_data *sdata) struct mesh_path *mesh_path_add(struct ieee80211_sub_if_data *sdata, const u8 *dst) { - struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; - struct ieee80211_local *local = sdata->local; struct mesh_table *tbl; struct mesh_path *mpath, *new_mpath; - struct mpath_node *node, *new_node; - struct hlist_head *bucket; - int grow = 0; - int err; - u32 hash_idx; + int ret; if (ether_addr_equal(dst, sdata->vif.addr)) /* never add ourselves as neighbours */ @@ -550,129 +432,44 @@ struct mesh_path *mesh_path_add(struct ieee80211_sub_if_data *sdata, if (atomic_add_unless(&sdata->u.mesh.mpaths, 1, MESH_MAX_MPATHS) == 0) return ERR_PTR(-ENOSPC); - read_lock_bh(&pathtbl_resize_lock); - tbl = resize_dereference_mesh_paths(); - - hash_idx = mesh_table_hash(dst, sdata, tbl); - bucket = &tbl->hash_buckets[hash_idx]; - - spin_lock(&tbl->hashwlock[hash_idx]); - - hlist_for_each_entry(node, bucket, list) { - mpath = node->mpath; - if (mpath->sdata == sdata && - ether_addr_equal(dst, mpath->dst)) - goto found; - } - - err = -ENOMEM; - new_mpath = kzalloc(sizeof(struct mesh_path), GFP_ATOMIC); + new_mpath = mesh_path_new(sdata, dst, GFP_ATOMIC); if (!new_mpath) - goto err_path_alloc; - - new_node = kmalloc(sizeof(struct mpath_node), GFP_ATOMIC); - if (!new_node) - goto err_node_alloc; - - memcpy(new_mpath->dst, dst, ETH_ALEN); - eth_broadcast_addr(new_mpath->rann_snd_addr); - new_mpath->is_root = false; - new_mpath->sdata = sdata; - new_mpath->flags = 0; - skb_queue_head_init(&new_mpath->frame_queue); - new_node->mpath = new_mpath; - new_mpath->timer.data = (unsigned long) new_mpath; - new_mpath->timer.function = mesh_path_timer; - new_mpath->exp_time = jiffies; - spin_lock_init(&new_mpath->state_lock); - init_timer(&new_mpath->timer); - - hlist_add_head_rcu(&new_node->list, bucket); - if (atomic_inc_return(&tbl->entries) >= - MEAN_CHAIN_LEN * (tbl->hash_mask + 1)) - grow = 1; - - mesh_paths_generation++; - - if (grow) { - set_bit(MESH_WORK_GROW_MPATH_TABLE, &ifmsh->wrkq_flags); - ieee80211_queue_work(&local->hw, &sdata->work); - } - mpath = new_mpath; -found: - spin_unlock(&tbl->hashwlock[hash_idx]); - read_unlock_bh(&pathtbl_resize_lock); - return mpath; + return ERR_PTR(-ENOMEM); -err_node_alloc: - kfree(new_mpath); -err_path_alloc: - atomic_dec(&sdata->u.mesh.mpaths); - spin_unlock(&tbl->hashwlock[hash_idx]); - read_unlock_bh(&pathtbl_resize_lock); - return ERR_PTR(err); -} + tbl = sdata->u.mesh.mesh_paths; + do { + ret = rhashtable_lookup_insert_fast(&tbl->rhead, + &new_mpath->rhash, + mesh_rht_params); -static void mesh_table_free_rcu(struct rcu_head *rcu) -{ - struct mesh_table *tbl = container_of(rcu, struct mesh_table, rcu_head); + if (ret == -EEXIST) + mpath = rhashtable_lookup_fast(&tbl->rhead, + dst, + mesh_rht_params); - mesh_table_free(tbl, false); -} + } while (unlikely(ret == -EEXIST && !mpath)); -void mesh_mpath_table_grow(void) -{ - struct mesh_table *oldtbl, *newtbl; - - write_lock_bh(&pathtbl_resize_lock); - oldtbl = resize_dereference_mesh_paths(); - newtbl = mesh_table_alloc(oldtbl->size_order + 1); - if (!newtbl) - goto out; - if (mesh_table_grow(oldtbl, newtbl) < 0) { - __mesh_table_free(newtbl); - goto out; - } - rcu_assign_pointer(mesh_paths, newtbl); - - call_rcu(&oldtbl->rcu_head, mesh_table_free_rcu); - - out: - write_unlock_bh(&pathtbl_resize_lock); -} + if (ret && ret != -EEXIST) + return ERR_PTR(ret); -void mesh_mpp_table_grow(void) -{ - struct mesh_table *oldtbl, *newtbl; - - write_lock_bh(&pathtbl_resize_lock); - oldtbl = resize_dereference_mpp_paths(); - newtbl = mesh_table_alloc(oldtbl->size_order + 1); - if (!newtbl) - goto out; - if (mesh_table_grow(oldtbl, newtbl) < 0) { - __mesh_table_free(newtbl); - goto out; + /* At this point either new_mpath was added, or we found a + * matching entry already in the table; in the latter case + * free the unnecessary new entry. + */ + if (ret == -EEXIST) { + kfree(new_mpath); + new_mpath = mpath; } - rcu_assign_pointer(mpp_paths, newtbl); - call_rcu(&oldtbl->rcu_head, mesh_table_free_rcu); - - out: - write_unlock_bh(&pathtbl_resize_lock); + sdata->u.mesh.mesh_paths_generation++; + return new_mpath; } int mpp_path_add(struct ieee80211_sub_if_data *sdata, const u8 *dst, const u8 *mpp) { - struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; - struct ieee80211_local *local = sdata->local; struct mesh_table *tbl; - struct mesh_path *mpath, *new_mpath; - struct mpath_node *node, *new_node; - struct hlist_head *bucket; - int grow = 0; - int err = 0; - u32 hash_idx; + struct mesh_path *new_mpath; + int ret; if (ether_addr_equal(dst, sdata->vif.addr)) /* never add ourselves as neighbours */ @@ -681,65 +478,19 @@ int mpp_path_add(struct ieee80211_sub_if_data *sdata, if (is_multicast_ether_addr(dst)) return -ENOTSUPP; - err = -ENOMEM; - new_mpath = kzalloc(sizeof(struct mesh_path), GFP_ATOMIC); - if (!new_mpath) - goto err_path_alloc; + new_mpath = mesh_path_new(sdata, dst, GFP_ATOMIC); - new_node = kmalloc(sizeof(struct mpath_node), GFP_ATOMIC); - if (!new_node) - goto err_node_alloc; + if (!new_mpath) + return -ENOMEM; - read_lock_bh(&pathtbl_resize_lock); - memcpy(new_mpath->dst, dst, ETH_ALEN); memcpy(new_mpath->mpp, mpp, ETH_ALEN); - new_mpath->sdata = sdata; - new_mpath->flags = 0; - skb_queue_head_init(&new_mpath->frame_queue); - new_node->mpath = new_mpath; - init_timer(&new_mpath->timer); - new_mpath->exp_time = jiffies; - spin_lock_init(&new_mpath->state_lock); - - tbl = resize_dereference_mpp_paths(); + tbl = sdata->u.mesh.mpp_paths; + ret = rhashtable_lookup_insert_fast(&tbl->rhead, + &new_mpath->rhash, + mesh_rht_params); - hash_idx = mesh_table_hash(dst, sdata, tbl); - bucket = &tbl->hash_buckets[hash_idx]; - - spin_lock(&tbl->hashwlock[hash_idx]); - - err = -EEXIST; - hlist_for_each_entry(node, bucket, list) { - mpath = node->mpath; - if (mpath->sdata == sdata && - ether_addr_equal(dst, mpath->dst)) - goto err_exists; - } - - hlist_add_head_rcu(&new_node->list, bucket); - if (atomic_inc_return(&tbl->entries) >= - MEAN_CHAIN_LEN * (tbl->hash_mask + 1)) - grow = 1; - - spin_unlock(&tbl->hashwlock[hash_idx]); - read_unlock_bh(&pathtbl_resize_lock); - - mpp_paths_generation++; - - if (grow) { - set_bit(MESH_WORK_GROW_MPP_TABLE, &ifmsh->wrkq_flags); - ieee80211_queue_work(&local->hw, &sdata->work); - } - return 0; - -err_exists: - spin_unlock(&tbl->hashwlock[hash_idx]); - read_unlock_bh(&pathtbl_resize_lock); - kfree(new_node); -err_node_alloc: - kfree(new_mpath); -err_path_alloc: - return err; + sdata->u.mesh.mpp_paths_generation++; + return ret; } @@ -753,17 +504,26 @@ err_path_alloc: */ void mesh_plink_broken(struct sta_info *sta) { - struct mesh_table *tbl; + struct ieee80211_sub_if_data *sdata = sta->sdata; + struct mesh_table *tbl = sdata->u.mesh.mesh_paths; static const u8 bcast[ETH_ALEN] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; struct mesh_path *mpath; - struct mpath_node *node; - struct ieee80211_sub_if_data *sdata = sta->sdata; - int i; + struct rhashtable_iter iter; + int ret; - rcu_read_lock(); - tbl = rcu_dereference(mesh_paths); - for_each_mesh_entry(tbl, node, i) { - mpath = node->mpath; + ret = rhashtable_walk_init(&tbl->rhead, &iter, GFP_ATOMIC); + if (ret) + return; + + ret = rhashtable_walk_start(&iter); + if (ret && ret != -EAGAIN) + goto out; + + while ((mpath = rhashtable_walk_next(&iter))) { + if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN) + continue; + if (IS_ERR(mpath)) + break; if (rcu_access_pointer(mpath->next_hop) == sta && mpath->flags & MESH_PATH_ACTIVE && !(mpath->flags & MESH_PATH_FIXED)) { @@ -777,33 +537,30 @@ void mesh_plink_broken(struct sta_info *sta) WLAN_REASON_MESH_PATH_DEST_UNREACHABLE, bcast); } } - rcu_read_unlock(); +out: + rhashtable_walk_stop(&iter); + rhashtable_walk_exit(&iter); } -static void mesh_path_node_reclaim(struct rcu_head *rp) +static void mesh_path_free_rcu(struct mesh_table *tbl, + struct mesh_path *mpath) { - struct mpath_node *node = container_of(rp, struct mpath_node, rcu); + struct ieee80211_sub_if_data *sdata = mpath->sdata; - del_timer_sync(&node->mpath->timer); - kfree(node->mpath); - kfree(node); + spin_lock_bh(&mpath->state_lock); + mpath->flags |= MESH_PATH_RESOLVING | MESH_PATH_DELETED; + mesh_gate_del(tbl, mpath); + spin_unlock_bh(&mpath->state_lock); + del_timer_sync(&mpath->timer); + atomic_dec(&sdata->u.mesh.mpaths); + atomic_dec(&tbl->entries); + kfree_rcu(mpath, rcu); } -/* needs to be called with the corresponding hashwlock taken */ -static void __mesh_path_del(struct mesh_table *tbl, struct mpath_node *node) +static void __mesh_path_del(struct mesh_table *tbl, struct mesh_path *mpath) { - struct mesh_path *mpath = node->mpath; - struct ieee80211_sub_if_data *sdata = node->mpath->sdata; - - spin_lock(&mpath->state_lock); - mpath->flags |= MESH_PATH_RESOLVING; - if (mpath->is_gate) - mesh_gate_del(tbl, mpath); - hlist_del_rcu(&node->list); - call_rcu(&node->rcu, mesh_path_node_reclaim); - spin_unlock(&mpath->state_lock); - atomic_dec(&sdata->u.mesh.mpaths); - atomic_dec(&tbl->entries); + rhashtable_remove_fast(&tbl->rhead, &mpath->rhash, mesh_rht_params); + mesh_path_free_rcu(tbl, mpath); } /** @@ -819,65 +576,88 @@ static void __mesh_path_del(struct mesh_table *tbl, struct mpath_node *node) */ void mesh_path_flush_by_nexthop(struct sta_info *sta) { - struct mesh_table *tbl; + struct ieee80211_sub_if_data *sdata = sta->sdata; + struct mesh_table *tbl = sdata->u.mesh.mesh_paths; struct mesh_path *mpath; - struct mpath_node *node; - int i; + struct rhashtable_iter iter; + int ret; - rcu_read_lock(); - read_lock_bh(&pathtbl_resize_lock); - tbl = resize_dereference_mesh_paths(); - for_each_mesh_entry(tbl, node, i) { - mpath = node->mpath; - if (rcu_access_pointer(mpath->next_hop) == sta) { - spin_lock(&tbl->hashwlock[i]); - __mesh_path_del(tbl, node); - spin_unlock(&tbl->hashwlock[i]); - } + ret = rhashtable_walk_init(&tbl->rhead, &iter, GFP_ATOMIC); + if (ret) + return; + + ret = rhashtable_walk_start(&iter); + if (ret && ret != -EAGAIN) + goto out; + + while ((mpath = rhashtable_walk_next(&iter))) { + if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN) + continue; + if (IS_ERR(mpath)) + break; + + if (rcu_access_pointer(mpath->next_hop) == sta) + __mesh_path_del(tbl, mpath); } - read_unlock_bh(&pathtbl_resize_lock); - rcu_read_unlock(); +out: + rhashtable_walk_stop(&iter); + rhashtable_walk_exit(&iter); } static void mpp_flush_by_proxy(struct ieee80211_sub_if_data *sdata, const u8 *proxy) { - struct mesh_table *tbl; - struct mesh_path *mpp; - struct mpath_node *node; - int i; + struct mesh_table *tbl = sdata->u.mesh.mpp_paths; + struct mesh_path *mpath; + struct rhashtable_iter iter; + int ret; - rcu_read_lock(); - read_lock_bh(&pathtbl_resize_lock); - tbl = resize_dereference_mpp_paths(); - for_each_mesh_entry(tbl, node, i) { - mpp = node->mpath; - if (ether_addr_equal(mpp->mpp, proxy)) { - spin_lock(&tbl->hashwlock[i]); - __mesh_path_del(tbl, node); - spin_unlock(&tbl->hashwlock[i]); - } + ret = rhashtable_walk_init(&tbl->rhead, &iter, GFP_ATOMIC); + if (ret) + return; + + ret = rhashtable_walk_start(&iter); + if (ret && ret != -EAGAIN) + goto out; + + while ((mpath = rhashtable_walk_next(&iter))) { + if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN) + continue; + if (IS_ERR(mpath)) + break; + + if (ether_addr_equal(mpath->mpp, proxy)) + __mesh_path_del(tbl, mpath); } - read_unlock_bh(&pathtbl_resize_lock); - rcu_read_unlock(); +out: + rhashtable_walk_stop(&iter); + rhashtable_walk_exit(&iter); } -static void table_flush_by_iface(struct mesh_table *tbl, - struct ieee80211_sub_if_data *sdata) +static void table_flush_by_iface(struct mesh_table *tbl) { struct mesh_path *mpath; - struct mpath_node *node; - int i; + struct rhashtable_iter iter; + int ret; + + ret = rhashtable_walk_init(&tbl->rhead, &iter, GFP_ATOMIC); + if (ret) + return; + + ret = rhashtable_walk_start(&iter); + if (ret && ret != -EAGAIN) + goto out; - WARN_ON(!rcu_read_lock_held()); - for_each_mesh_entry(tbl, node, i) { - mpath = node->mpath; - if (mpath->sdata != sdata) + while ((mpath = rhashtable_walk_next(&iter))) { + if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN) continue; - spin_lock_bh(&tbl->hashwlock[i]); - __mesh_path_del(tbl, node); - spin_unlock_bh(&tbl->hashwlock[i]); + if (IS_ERR(mpath)) + break; + __mesh_path_del(tbl, mpath); } +out: + rhashtable_walk_stop(&iter); + rhashtable_walk_exit(&iter); } /** @@ -890,16 +670,8 @@ static void table_flush_by_iface(struct mesh_table *tbl, */ void mesh_path_flush_by_iface(struct ieee80211_sub_if_data *sdata) { - struct mesh_table *tbl; - - rcu_read_lock(); - read_lock_bh(&pathtbl_resize_lock); - tbl = resize_dereference_mesh_paths(); - table_flush_by_iface(tbl, sdata); - tbl = resize_dereference_mpp_paths(); - table_flush_by_iface(tbl, sdata); - read_unlock_bh(&pathtbl_resize_lock); - rcu_read_unlock(); + table_flush_by_iface(sdata->u.mesh.mesh_paths); + table_flush_by_iface(sdata->u.mesh.mpp_paths); } /** @@ -911,37 +683,25 @@ void mesh_path_flush_by_iface(struct ieee80211_sub_if_data *sdata) * * Returns: 0 if successful */ -static int table_path_del(struct mesh_table __rcu *rcu_tbl, +static int table_path_del(struct mesh_table *tbl, struct ieee80211_sub_if_data *sdata, const u8 *addr) { - struct mesh_table *tbl; struct mesh_path *mpath; - struct mpath_node *node; - struct hlist_head *bucket; - int hash_idx; - int err = 0; - - tbl = resize_dereference_paths(rcu_tbl); - hash_idx = mesh_table_hash(addr, sdata, tbl); - bucket = &tbl->hash_buckets[hash_idx]; - - spin_lock(&tbl->hashwlock[hash_idx]); - hlist_for_each_entry(node, bucket, list) { - mpath = node->mpath; - if (mpath->sdata == sdata && - ether_addr_equal(addr, mpath->dst)) { - __mesh_path_del(tbl, node); - goto enddel; - } + + rcu_read_lock(); + mpath = rhashtable_lookup_fast(&tbl->rhead, addr, mesh_rht_params); + if (!mpath) { + rcu_read_unlock(); + return -ENXIO; } - err = -ENXIO; -enddel: - spin_unlock(&tbl->hashwlock[hash_idx]); - return err; + __mesh_path_del(tbl, mpath); + rcu_read_unlock(); + return 0; } + /** * mesh_path_del - delete a mesh path from the table * @@ -952,36 +712,13 @@ enddel: */ int mesh_path_del(struct ieee80211_sub_if_data *sdata, const u8 *addr) { - int err = 0; + int err; /* flush relevant mpp entries first */ mpp_flush_by_proxy(sdata, addr); - read_lock_bh(&pathtbl_resize_lock); - err = table_path_del(mesh_paths, sdata, addr); - mesh_paths_generation++; - read_unlock_bh(&pathtbl_resize_lock); - - return err; -} - -/** - * mpp_path_del - delete a mesh proxy path from the table - * - * @addr: addr address (ETH_ALEN length) - * @sdata: local subif - * - * Returns: 0 if successful - */ -static int mpp_path_del(struct ieee80211_sub_if_data *sdata, const u8 *addr) -{ - int err = 0; - - read_lock_bh(&pathtbl_resize_lock); - err = table_path_del(mpp_paths, sdata, addr); - mpp_paths_generation++; - read_unlock_bh(&pathtbl_resize_lock); - + err = table_path_del(sdata->u.mesh.mesh_paths, sdata, addr); + sdata->u.mesh.mesh_paths_generation++; return err; } @@ -1015,39 +752,30 @@ int mesh_path_send_to_gates(struct mesh_path *mpath) struct ieee80211_sub_if_data *sdata = mpath->sdata; struct mesh_table *tbl; struct mesh_path *from_mpath = mpath; - struct mpath_node *gate = NULL; + struct mesh_path *gate; bool copy = false; - struct hlist_head *known_gates; - - rcu_read_lock(); - tbl = rcu_dereference(mesh_paths); - known_gates = tbl->known_gates; - rcu_read_unlock(); - if (!known_gates) - return -EHOSTUNREACH; + tbl = sdata->u.mesh.mesh_paths; - hlist_for_each_entry_rcu(gate, known_gates, list) { - if (gate->mpath->sdata != sdata) - continue; - - if (gate->mpath->flags & MESH_PATH_ACTIVE) { - mpath_dbg(sdata, "Forwarding to %pM\n", gate->mpath->dst); - mesh_path_move_to_queue(gate->mpath, from_mpath, copy); - from_mpath = gate->mpath; + rcu_read_lock(); + hlist_for_each_entry_rcu(gate, &tbl->known_gates, gate_list) { + if (gate->flags & MESH_PATH_ACTIVE) { + mpath_dbg(sdata, "Forwarding to %pM\n", gate->dst); + mesh_path_move_to_queue(gate, from_mpath, copy); + from_mpath = gate; copy = true; } else { mpath_dbg(sdata, "Not forwarding to %pM (flags %#x)\n", - gate->mpath->dst, gate->mpath->flags); + gate->dst, gate->flags); } } - hlist_for_each_entry_rcu(gate, known_gates, list) - if (gate->mpath->sdata == sdata) { - mpath_dbg(sdata, "Sending to %pM\n", gate->mpath->dst); - mesh_path_tx_pending(gate->mpath); - } + hlist_for_each_entry_rcu(gate, &tbl->known_gates, gate_list) { + mpath_dbg(sdata, "Sending to %pM\n", gate->dst); + mesh_path_tx_pending(gate); + } + rcu_read_unlock(); return (from_mpath == mpath) ? -EHOSTUNREACH : 0; } @@ -1104,118 +832,73 @@ void mesh_path_fix_nexthop(struct mesh_path *mpath, struct sta_info *next_hop) mesh_path_tx_pending(mpath); } -static void mesh_path_node_free(struct hlist_node *p, bool free_leafs) -{ - struct mesh_path *mpath; - struct mpath_node *node = hlist_entry(p, struct mpath_node, list); - mpath = node->mpath; - hlist_del_rcu(p); - if (free_leafs) { - del_timer_sync(&mpath->timer); - kfree(mpath); - } - kfree(node); -} - -static int mesh_path_node_copy(struct hlist_node *p, struct mesh_table *newtbl) -{ - struct mesh_path *mpath; - struct mpath_node *node, *new_node; - u32 hash_idx; - - new_node = kmalloc(sizeof(struct mpath_node), GFP_ATOMIC); - if (new_node == NULL) - return -ENOMEM; - - node = hlist_entry(p, struct mpath_node, list); - mpath = node->mpath; - new_node->mpath = mpath; - hash_idx = mesh_table_hash(mpath->dst, mpath->sdata, newtbl); - hlist_add_head(&new_node->list, - &newtbl->hash_buckets[hash_idx]); - return 0; -} - -int mesh_pathtbl_init(void) +int mesh_pathtbl_init(struct ieee80211_sub_if_data *sdata) { struct mesh_table *tbl_path, *tbl_mpp; int ret; - tbl_path = mesh_table_alloc(INIT_PATHS_SIZE_ORDER); + tbl_path = mesh_table_alloc(); if (!tbl_path) return -ENOMEM; - tbl_path->free_node = &mesh_path_node_free; - tbl_path->copy_node = &mesh_path_node_copy; - tbl_path->known_gates = kzalloc(sizeof(struct hlist_head), GFP_ATOMIC); - if (!tbl_path->known_gates) { - ret = -ENOMEM; - goto free_path; - } - INIT_HLIST_HEAD(tbl_path->known_gates); - - tbl_mpp = mesh_table_alloc(INIT_PATHS_SIZE_ORDER); + tbl_mpp = mesh_table_alloc(); if (!tbl_mpp) { ret = -ENOMEM; goto free_path; } - tbl_mpp->free_node = &mesh_path_node_free; - tbl_mpp->copy_node = &mesh_path_node_copy; - tbl_mpp->known_gates = kzalloc(sizeof(struct hlist_head), GFP_ATOMIC); - if (!tbl_mpp->known_gates) { - ret = -ENOMEM; - goto free_mpp; - } - INIT_HLIST_HEAD(tbl_mpp->known_gates); - /* Need no locking since this is during init */ - RCU_INIT_POINTER(mesh_paths, tbl_path); - RCU_INIT_POINTER(mpp_paths, tbl_mpp); + rhashtable_init(&tbl_path->rhead, &mesh_rht_params); + rhashtable_init(&tbl_mpp->rhead, &mesh_rht_params); + + sdata->u.mesh.mesh_paths = tbl_path; + sdata->u.mesh.mpp_paths = tbl_mpp; return 0; -free_mpp: - mesh_table_free(tbl_mpp, true); free_path: - mesh_table_free(tbl_path, true); + mesh_table_free(tbl_path); return ret; } -void mesh_path_expire(struct ieee80211_sub_if_data *sdata) +static +void mesh_path_tbl_expire(struct ieee80211_sub_if_data *sdata, + struct mesh_table *tbl) { - struct mesh_table *tbl; struct mesh_path *mpath; - struct mpath_node *node; - int i; + struct rhashtable_iter iter; + int ret; - rcu_read_lock(); - tbl = rcu_dereference(mesh_paths); - for_each_mesh_entry(tbl, node, i) { - if (node->mpath->sdata != sdata) + ret = rhashtable_walk_init(&tbl->rhead, &iter, GFP_KERNEL); + if (ret) + return; + + ret = rhashtable_walk_start(&iter); + if (ret && ret != -EAGAIN) + goto out; + + while ((mpath = rhashtable_walk_next(&iter))) { + if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN) continue; - mpath = node->mpath; + if (IS_ERR(mpath)) + break; if ((!(mpath->flags & MESH_PATH_RESOLVING)) && (!(mpath->flags & MESH_PATH_FIXED)) && time_after(jiffies, mpath->exp_time + MESH_PATH_EXPIRE)) - mesh_path_del(mpath->sdata, mpath->dst); - } - - tbl = rcu_dereference(mpp_paths); - for_each_mesh_entry(tbl, node, i) { - if (node->mpath->sdata != sdata) - continue; - mpath = node->mpath; - if ((!(mpath->flags & MESH_PATH_FIXED)) && - time_after(jiffies, mpath->exp_time + MESH_PATH_EXPIRE)) - mpp_path_del(mpath->sdata, mpath->dst); + __mesh_path_del(tbl, mpath); } +out: + rhashtable_walk_stop(&iter); + rhashtable_walk_exit(&iter); +} - rcu_read_unlock(); +void mesh_path_expire(struct ieee80211_sub_if_data *sdata) +{ + mesh_path_tbl_expire(sdata, sdata->u.mesh.mesh_paths); + mesh_path_tbl_expire(sdata, sdata->u.mesh.mpp_paths); } -void mesh_pathtbl_unregister(void) +void mesh_pathtbl_unregister(struct ieee80211_sub_if_data *sdata) { - /* no need for locking during exit path */ - mesh_table_free(rcu_dereference_protected(mesh_paths, 1), true); - mesh_table_free(rcu_dereference_protected(mpp_paths, 1), true); + mesh_table_free(sdata->u.mesh.mesh_paths); + mesh_table_free(sdata->u.mesh.mpp_paths); } diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index a07e93c21..79f2a0a13 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -61,7 +61,7 @@ static bool rssi_threshold_check(struct ieee80211_sub_if_data *sdata, s32 rssi_threshold = sdata->u.mesh.mshcfg.rssi_threshold; return rssi_threshold == 0 || (sta && - (s8)-ewma_signal_read(&sta->rx_stats.avg_signal) > + (s8)-ewma_signal_read(&sta->rx_stats_avg.signal) > rssi_threshold); } @@ -93,18 +93,18 @@ static inline void mesh_plink_fsm_restart(struct sta_info *sta) static u32 mesh_set_short_slot_time(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; - enum ieee80211_band band = ieee80211_get_sdata_band(sdata); + enum nl80211_band band = ieee80211_get_sdata_band(sdata); struct ieee80211_supported_band *sband = local->hw.wiphy->bands[band]; struct sta_info *sta; u32 erp_rates = 0, changed = 0; int i; bool short_slot = false; - if (band == IEEE80211_BAND_5GHZ) { + if (band == NL80211_BAND_5GHZ) { /* (IEEE 802.11-2012 19.4.5) */ short_slot = true; goto out; - } else if (band != IEEE80211_BAND_2GHZ) + } else if (band != NL80211_BAND_2GHZ) goto out; for (i = 0; i < sband->n_bitrates; i++) @@ -247,7 +247,7 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, mgmt->u.action.u.self_prot.action_code = action; if (action != WLAN_SP_MESH_PEERING_CLOSE) { - enum ieee80211_band band = ieee80211_get_sdata_band(sdata); + enum nl80211_band band = ieee80211_get_sdata_band(sdata); /* capability info */ pos = skb_put(skb, 2); @@ -331,7 +331,9 @@ free: * * @sta: mesh peer link to deactivate * - * All mesh paths with this peer as next hop will be flushed + * Mesh paths with this peer as next hop should be flushed + * by the caller outside of plink_lock. + * * Returns beacon changed flag if the beacon content changed. * * Locking: the caller must hold sta->mesh->plink_lock @@ -346,7 +348,6 @@ static u32 __mesh_plink_deactivate(struct sta_info *sta) if (sta->mesh->plink_state == NL80211_PLINK_ESTAB) changed = mesh_plink_dec_estab_count(sdata); sta->mesh->plink_state = NL80211_PLINK_BLOCKED; - mesh_path_flush_by_nexthop(sta); ieee80211_mps_sta_status_update(sta); changed |= ieee80211_mps_set_sta_local_pm(sta, @@ -374,6 +375,7 @@ u32 mesh_plink_deactivate(struct sta_info *sta) sta->sta.addr, sta->mesh->llid, sta->mesh->plid, sta->mesh->reason); spin_unlock_bh(&sta->mesh->plink_lock); + mesh_path_flush_by_nexthop(sta); return changed; } @@ -383,7 +385,7 @@ static void mesh_sta_info_init(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems, bool insert) { struct ieee80211_local *local = sdata->local; - enum ieee80211_band band = ieee80211_get_sdata_band(sdata); + enum nl80211_band band = ieee80211_get_sdata_band(sdata); struct ieee80211_supported_band *sband; u32 rates, basic_rates = 0, changed = 0; enum ieee80211_sta_rx_bandwidth bw = sta->sta.bandwidth; @@ -748,6 +750,7 @@ u32 mesh_plink_block(struct sta_info *sta) changed = __mesh_plink_deactivate(sta); sta->mesh->plink_state = NL80211_PLINK_BLOCKED; spin_unlock_bh(&sta->mesh->plink_lock); + mesh_path_flush_by_nexthop(sta); return changed; } @@ -797,6 +800,7 @@ static u32 mesh_plink_fsm(struct ieee80211_sub_if_data *sdata, struct mesh_config *mshcfg = &sdata->u.mesh.mshcfg; enum ieee80211_self_protected_actioncode action = 0; u32 changed = 0; + bool flush = false; mpl_dbg(sdata, "peer %pM in state %s got event %s\n", sta->sta.addr, mplstates[sta->mesh->plink_state], mplevents[event]); @@ -885,6 +889,7 @@ static u32 mesh_plink_fsm(struct ieee80211_sub_if_data *sdata, changed |= mesh_set_short_slot_time(sdata); mesh_plink_close(sdata, sta, event); action = WLAN_SP_MESH_PEERING_CLOSE; + flush = true; break; case OPN_ACPT: action = WLAN_SP_MESH_PEERING_CONFIRM; @@ -916,6 +921,8 @@ static u32 mesh_plink_fsm(struct ieee80211_sub_if_data *sdata, break; } spin_unlock_bh(&sta->mesh->plink_lock); + if (flush) + mesh_path_flush_by_nexthop(sta); if (action) { mesh_plink_frame_tx(sdata, sta, action, sta->sta.addr, sta->mesh->llid, sta->mesh->plid, diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 281b8d6e5..8d426f637 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -122,15 +122,16 @@ void ieee80211_sta_reset_conn_monitor(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; - if (unlikely(!sdata->u.mgd.associated)) + if (unlikely(!ifmgd->associated)) return; - ifmgd->probe_send_count = 0; + if (ifmgd->probe_send_count) + ifmgd->probe_send_count = 0; if (ieee80211_hw_check(&sdata->local->hw, CONNECTION_MONITOR)) return; - mod_timer(&sdata->u.mgd.conn_mon_timer, + mod_timer(&ifmgd->conn_mon_timer, round_jiffies_up(jiffies + IEEE80211_CONNECTION_IDLE_TIME)); } @@ -660,7 +661,7 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) capab = WLAN_CAPABILITY_ESS; - if (sband->band == IEEE80211_BAND_2GHZ) { + if (sband->band == NL80211_BAND_2GHZ) { capab |= WLAN_CAPABILITY_SHORT_SLOT_TIME; capab |= WLAN_CAPABILITY_SHORT_PREAMBLE; } @@ -1099,7 +1100,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, struct cfg80211_bss *cbss = ifmgd->associated; struct ieee80211_chanctx_conf *conf; struct ieee80211_chanctx *chanctx; - enum ieee80211_band current_band; + enum nl80211_band current_band; struct ieee80211_csa_ie csa_ie; struct ieee80211_channel_switch ch_switch; int res; @@ -1256,11 +1257,11 @@ ieee80211_find_80211h_pwr_constr(struct ieee80211_sub_if_data *sdata, default: WARN_ON_ONCE(1); /* fall through */ - case IEEE80211_BAND_2GHZ: - case IEEE80211_BAND_60GHZ: + case NL80211_BAND_2GHZ: + case NL80211_BAND_60GHZ: chan_increment = 1; break; - case IEEE80211_BAND_5GHZ: + case NL80211_BAND_5GHZ: chan_increment = 4; break; } @@ -1860,7 +1861,7 @@ static u32 ieee80211_handle_bss_capability(struct ieee80211_sub_if_data *sdata, } use_short_slot = !!(capab & WLAN_CAPABILITY_SHORT_SLOT_TIME); - if (ieee80211_get_sdata_band(sdata) == IEEE80211_BAND_5GHZ) + if (ieee80211_get_sdata_band(sdata) == NL80211_BAND_5GHZ) use_short_slot = true; if (use_protection != bss_conf->use_cts_prot) { @@ -2216,6 +2217,7 @@ static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata) const u8 *ssid; u8 *dst = ifmgd->associated->bssid; u8 unicast_limit = max(1, max_probe_tries - 3); + struct sta_info *sta; /* * Try sending broadcast probe requests for the last three @@ -2234,6 +2236,14 @@ static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata) */ ifmgd->probe_send_count++; + if (dst) { + mutex_lock(&sdata->local->sta_mtx); + sta = sta_info_get(sdata, dst); + if (!WARN_ON(!sta)) + ieee80211_check_fast_rx(sta); + mutex_unlock(&sdata->local->sta_mtx); + } + if (ieee80211_hw_check(&sdata->local->hw, REPORTS_TX_ACK_STATUS)) { ifmgd->nullfunc_failed = false; ieee80211_send_nullfunc(sdata->local, sdata, false); @@ -2389,6 +2399,11 @@ static void __ieee80211_disconnect(struct ieee80211_sub_if_data *sdata) return; } + /* AP is probably out of range (or not reachable for another reason) so + * remove the bss struct for that AP. + */ + cfg80211_unlink_bss(local->hw.wiphy, ifmgd->associated); + ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH, WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY, true, frame_buf); @@ -4365,7 +4380,7 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, sdata->vif.bss_conf.basic_rates = basic_rates; /* cf. IEEE 802.11 9.2.12 */ - if (cbss->channel->band == IEEE80211_BAND_2GHZ && + if (cbss->channel->band == NL80211_BAND_2GHZ && have_higher_than_11mbit) sdata->flags |= IEEE80211_SDATA_OPERATING_GMODE; else diff --git a/net/mac80211/ocb.c b/net/mac80211/ocb.c index 0be0aadfc..88e6ebbbe 100644 --- a/net/mac80211/ocb.c +++ b/net/mac80211/ocb.c @@ -75,8 +75,6 @@ void ieee80211_ocb_rx_no_sta(struct ieee80211_sub_if_data *sdata, if (!sta) return; - sta->rx_stats.last_rx = jiffies; - /* Add only mandatory rates for now */ sband = local->hw.wiphy->bands[band]; sta->sta.supp_rates[band] = diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index a4e2f4e67..206698bc9 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -287,7 +287,7 @@ static void __rate_control_send_low(struct ieee80211_hw *hw, u32 rate_flags = ieee80211_chandef_rate_flags(&hw->conf.chandef); - if ((sband->band == IEEE80211_BAND_2GHZ) && + if ((sband->band == NL80211_BAND_2GHZ) && (info->flags & IEEE80211_TX_CTL_NO_CCK_RATE)) rate_flags |= IEEE80211_RATE_ERP_G; diff --git a/net/mac80211/rate.h b/net/mac80211/rate.h index 624fe5b81..8d3260785 100644 --- a/net/mac80211/rate.h +++ b/net/mac80211/rate.h @@ -96,9 +96,9 @@ static inline void rate_control_add_sta_debugfs(struct sta_info *sta) { #ifdef CONFIG_MAC80211_DEBUGFS struct rate_control_ref *ref = sta->rate_ctrl; - if (ref && sta->debugfs.dir && ref->ops->add_sta_debugfs) + if (ref && sta->debugfs_dir && ref->ops->add_sta_debugfs) ref->ops->add_sta_debugfs(ref->priv, sta->rate_ctrl_priv, - sta->debugfs.dir); + sta->debugfs_dir); #endif } diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c index b54f398cd..14c5ba3a1 100644 --- a/net/mac80211/rc80211_minstrel.c +++ b/net/mac80211/rc80211_minstrel.c @@ -436,7 +436,7 @@ minstrel_get_rate(void *priv, struct ieee80211_sta *sta, static void -calc_rate_durations(enum ieee80211_band band, +calc_rate_durations(enum nl80211_band band, struct minstrel_rate *d, struct ieee80211_rate *rate, struct cfg80211_chan_def *chandef) @@ -579,7 +579,7 @@ minstrel_alloc_sta(void *priv, struct ieee80211_sta *sta, gfp_t gfp) if (!mi) return NULL; - for (i = 0; i < IEEE80211_NUM_BANDS; i++) { + for (i = 0; i < NUM_NL80211_BANDS; i++) { sband = hw->wiphy->bands[i]; if (sband && sband->n_bitrates > max_rates) max_rates = sband->n_bitrates; @@ -621,7 +621,7 @@ minstrel_init_cck_rates(struct minstrel_priv *mp) u32 rate_flags = ieee80211_chandef_rate_flags(&mp->hw->conf.chandef); int i, j; - sband = mp->hw->wiphy->bands[IEEE80211_BAND_2GHZ]; + sband = mp->hw->wiphy->bands[NL80211_BAND_2GHZ]; if (!sband) return; diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index 370d677b5..30fbabf4b 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -883,6 +883,59 @@ minstrel_ht_set_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, ratetbl->rate[offset].flags = flags; } +static inline int +minstrel_ht_get_prob_ewma(struct minstrel_ht_sta *mi, int rate) +{ + int group = rate / MCS_GROUP_RATES; + rate %= MCS_GROUP_RATES; + return mi->groups[group].rates[rate].prob_ewma; +} + +static int +minstrel_ht_get_max_amsdu_len(struct minstrel_ht_sta *mi) +{ + int group = mi->max_prob_rate / MCS_GROUP_RATES; + const struct mcs_group *g = &minstrel_mcs_groups[group]; + int rate = mi->max_prob_rate % MCS_GROUP_RATES; + + /* Disable A-MSDU if max_prob_rate is bad */ + if (mi->groups[group].rates[rate].prob_ewma < MINSTREL_FRAC(50, 100)) + return 1; + + /* If the rate is slower than single-stream MCS1, make A-MSDU limit small */ + if (g->duration[rate] > MCS_DURATION(1, 0, 52)) + return 500; + + /* + * If the rate is slower than single-stream MCS4, limit A-MSDU to usual + * data packet size + */ + if (g->duration[rate] > MCS_DURATION(1, 0, 104)) + return 1600; + + /* + * If the rate is slower than single-stream MCS7, or if the max throughput + * rate success probability is less than 75%, limit A-MSDU to twice the usual + * data packet size + */ + if (g->duration[rate] > MCS_DURATION(1, 0, 260) || + (minstrel_ht_get_prob_ewma(mi, mi->max_tp_rate[0]) < + MINSTREL_FRAC(75, 100))) + return 3200; + + /* + * HT A-MPDU limits maximum MPDU size under BA agreement to 4095 bytes. + * Since aggregation sessions are started/stopped without txq flush, use + * the limit here to avoid the complexity of having to de-aggregate + * packets in the queue. + */ + if (!mi->sta->vht_cap.vht_supported) + return IEEE80211_MAX_MPDU_LEN_HT_BA; + + /* unlimited */ + return 0; +} + static void minstrel_ht_update_rates(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) { @@ -907,6 +960,7 @@ minstrel_ht_update_rates(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) minstrel_ht_set_rate(mp, mi, rates, i++, mi->max_prob_rate); } + mi->sta->max_rc_amsdu_len = minstrel_ht_get_max_amsdu_len(mi); rates->rate[i].idx = -1; rate_control_set_rates(mp->hw, mi->sta, rates); } @@ -924,6 +978,7 @@ minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) struct minstrel_rate_stats *mrs; struct minstrel_mcs_group_data *mg; unsigned int sample_dur, sample_group, cur_max_tp_streams; + int tp_rate1, tp_rate2; int sample_idx = 0; if (mi->sample_wait > 0) { @@ -945,14 +1000,22 @@ minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) mrs = &mg->rates[sample_idx]; sample_idx += sample_group * MCS_GROUP_RATES; + /* Set tp_rate1, tp_rate2 to the highest / second highest max_tp_rate */ + if (minstrel_get_duration(mi->max_tp_rate[0]) > + minstrel_get_duration(mi->max_tp_rate[1])) { + tp_rate1 = mi->max_tp_rate[1]; + tp_rate2 = mi->max_tp_rate[0]; + } else { + tp_rate1 = mi->max_tp_rate[0]; + tp_rate2 = mi->max_tp_rate[1]; + } + /* * Sampling might add some overhead (RTS, no aggregation) - * to the frame. Hence, don't use sampling for the currently - * used rates. + * to the frame. Hence, don't use sampling for the highest currently + * used highest throughput or probability rate. */ - if (sample_idx == mi->max_tp_rate[0] || - sample_idx == mi->max_tp_rate[1] || - sample_idx == mi->max_prob_rate) + if (sample_idx == mi->max_tp_rate[0] || sample_idx == mi->max_prob_rate) return -1; /* @@ -967,10 +1030,10 @@ minstrel_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) * if the link is working perfectly. */ - cur_max_tp_streams = minstrel_mcs_groups[mi->max_tp_rate[0] / + cur_max_tp_streams = minstrel_mcs_groups[tp_rate1 / MCS_GROUP_RATES].streams; sample_dur = minstrel_get_duration(sample_idx); - if (sample_dur >= minstrel_get_duration(mi->max_tp_rate[1]) && + if (sample_dur >= minstrel_get_duration(tp_rate2) && (cur_max_tp_streams - 1 < minstrel_mcs_groups[sample_group].streams || sample_dur >= minstrel_get_duration(mi->max_prob_rate))) { @@ -1074,7 +1137,7 @@ minstrel_ht_update_cck(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, { int i; - if (sband->band != IEEE80211_BAND_2GHZ) + if (sband->band != NL80211_BAND_2GHZ) return; if (!ieee80211_hw_check(mp->hw, SUPPORTS_HT_CCK_RATES)) @@ -1272,7 +1335,7 @@ minstrel_ht_alloc_sta(void *priv, struct ieee80211_sta *sta, gfp_t gfp) int max_rates = 0; int i; - for (i = 0; i < IEEE80211_NUM_BANDS; i++) { + for (i = 0; i < NUM_NL80211_BANDS; i++) { sband = hw->wiphy->bands[i]; if (sband && sband->n_bitrates > max_rates) max_rates = sband->n_bitrates; diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index dc27becb9..5e65e8389 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -322,7 +322,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, else if (status->flag & RX_FLAG_5MHZ) channel_flags |= IEEE80211_CHAN_QUARTER; - if (status->band == IEEE80211_BAND_5GHZ) + if (status->band == NL80211_BAND_5GHZ) channel_flags |= IEEE80211_CHAN_OFDM | IEEE80211_CHAN_5GHZ; else if (status->flag & (RX_FLAG_HT | RX_FLAG_VHT)) channel_flags |= IEEE80211_CHAN_DYN | IEEE80211_CHAN_2GHZ; @@ -722,8 +722,8 @@ static int ieee80211_get_mmie_keyidx(struct sk_buff *skb) return -1; } -static int iwl80211_get_cs_keyid(const struct ieee80211_cipher_scheme *cs, - struct sk_buff *skb) +static int ieee80211_get_cs_keyid(const struct ieee80211_cipher_scheme *cs, + struct sk_buff *skb) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; __le16 fc; @@ -1319,13 +1319,52 @@ int ieee80211_sta_ps_transition(struct ieee80211_sta *pubsta, bool start) } EXPORT_SYMBOL(ieee80211_sta_ps_transition); +void ieee80211_sta_pspoll(struct ieee80211_sta *pubsta) +{ + struct sta_info *sta = container_of(pubsta, struct sta_info, sta); + + if (test_sta_flag(sta, WLAN_STA_SP)) + return; + + if (!test_sta_flag(sta, WLAN_STA_PS_DRIVER)) + ieee80211_sta_ps_deliver_poll_response(sta); + else + set_sta_flag(sta, WLAN_STA_PSPOLL); +} +EXPORT_SYMBOL(ieee80211_sta_pspoll); + +void ieee80211_sta_uapsd_trigger(struct ieee80211_sta *pubsta, u8 tid) +{ + struct sta_info *sta = container_of(pubsta, struct sta_info, sta); + u8 ac = ieee802_1d_to_ac[tid & 7]; + + /* + * If this AC is not trigger-enabled do nothing. + * + * NB: This could/should check a separate bitmap of trigger- + * enabled queues, but for now we only implement uAPSD w/o + * TSPEC changes to the ACs, so they're always the same. + */ + if (!(sta->sta.uapsd_queues & BIT(ac))) + return; + + /* if we are in a service period, do nothing */ + if (test_sta_flag(sta, WLAN_STA_SP)) + return; + + if (!test_sta_flag(sta, WLAN_STA_PS_DRIVER)) + ieee80211_sta_ps_deliver_uapsd(sta); + else + set_sta_flag(sta, WLAN_STA_UAPSD); +} +EXPORT_SYMBOL(ieee80211_sta_uapsd_trigger); + static ieee80211_rx_result debug_noinline ieee80211_rx_h_uapsd_and_pspoll(struct ieee80211_rx_data *rx) { struct ieee80211_sub_if_data *sdata = rx->sdata; struct ieee80211_hdr *hdr = (void *)rx->skb->data; struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb); - int tid, ac; if (!rx->sta) return RX_CONTINUE; @@ -1351,12 +1390,7 @@ ieee80211_rx_h_uapsd_and_pspoll(struct ieee80211_rx_data *rx) return RX_CONTINUE; if (unlikely(ieee80211_is_pspoll(hdr->frame_control))) { - if (!test_sta_flag(rx->sta, WLAN_STA_SP)) { - if (!test_sta_flag(rx->sta, WLAN_STA_PS_DRIVER)) - ieee80211_sta_ps_deliver_poll_response(rx->sta); - else - set_sta_flag(rx->sta, WLAN_STA_PSPOLL); - } + ieee80211_sta_pspoll(&rx->sta->sta); /* Free PS Poll skb here instead of returning RX_DROP that would * count as an dropped frame. */ @@ -1368,27 +1402,11 @@ ieee80211_rx_h_uapsd_and_pspoll(struct ieee80211_rx_data *rx) ieee80211_has_pm(hdr->frame_control) && (ieee80211_is_data_qos(hdr->frame_control) || ieee80211_is_qos_nullfunc(hdr->frame_control))) { - tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK; - ac = ieee802_1d_to_ac[tid & 7]; + u8 tid; - /* - * If this AC is not trigger-enabled do nothing. - * - * NB: This could/should check a separate bitmap of trigger- - * enabled queues, but for now we only implement uAPSD w/o - * TSPEC changes to the ACs, so they're always the same. - */ - if (!(rx->sta->sta.uapsd_queues & BIT(ac))) - return RX_CONTINUE; - - /* if we are in a service period, do nothing */ - if (test_sta_flag(rx->sta, WLAN_STA_SP)) - return RX_CONTINUE; + tid = *ieee80211_get_qos_ctl(hdr) & IEEE80211_QOS_CTL_TID_MASK; - if (!test_sta_flag(rx->sta, WLAN_STA_PS_DRIVER)) - ieee80211_sta_ps_deliver_uapsd(rx->sta); - else - set_sta_flag(rx->sta, WLAN_STA_UAPSD); + ieee80211_sta_uapsd_trigger(&rx->sta->sta, tid); } return RX_CONTINUE; @@ -1421,16 +1439,9 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx) test_sta_flag(sta, WLAN_STA_AUTHORIZED)) { sta->rx_stats.last_rx = jiffies; if (ieee80211_is_data(hdr->frame_control) && - !is_multicast_ether_addr(hdr->addr1)) { - sta->rx_stats.last_rate_idx = - status->rate_idx; - sta->rx_stats.last_rate_flag = - status->flag; - sta->rx_stats.last_rate_vht_flag = - status->vht_flag; - sta->rx_stats.last_rate_vht_nss = - status->vht_nss; - } + !is_multicast_ether_addr(hdr->addr1)) + sta->rx_stats.last_rate = + sta_stats_encode_rate(status); } } else if (rx->sdata->vif.type == NL80211_IFTYPE_OCB) { sta->rx_stats.last_rx = jiffies; @@ -1440,22 +1451,22 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx) * match the current local configuration when processed. */ sta->rx_stats.last_rx = jiffies; - if (ieee80211_is_data(hdr->frame_control)) { - sta->rx_stats.last_rate_idx = status->rate_idx; - sta->rx_stats.last_rate_flag = status->flag; - sta->rx_stats.last_rate_vht_flag = status->vht_flag; - sta->rx_stats.last_rate_vht_nss = status->vht_nss; - } + if (ieee80211_is_data(hdr->frame_control)) + sta->rx_stats.last_rate = sta_stats_encode_rate(status); } if (rx->sdata->vif.type == NL80211_IFTYPE_STATION) ieee80211_sta_rx_notify(rx->sdata, hdr); sta->rx_stats.fragments++; + + u64_stats_update_begin(&rx->sta->rx_stats.syncp); sta->rx_stats.bytes += rx->skb->len; + u64_stats_update_end(&rx->sta->rx_stats.syncp); + if (!(status->flag & RX_FLAG_NO_SIGNAL_VAL)) { sta->rx_stats.last_signal = status->signal; - ewma_signal_add(&sta->rx_stats.avg_signal, -status->signal); + ewma_signal_add(&sta->rx_stats_avg.signal, -status->signal); } if (status->chains) { @@ -1467,7 +1478,7 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx) continue; sta->rx_stats.chain_signal_last[i] = signal; - ewma_signal_add(&sta->rx_stats.chain_signal_avg[i], + ewma_signal_add(&sta->rx_stats_avg.chain_signal[i], -signal); } } @@ -1586,7 +1597,7 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) if (ieee80211_has_protected(fc) && rx->sta->cipher_scheme) { cs = rx->sta->cipher_scheme; - keyid = iwl80211_get_cs_keyid(cs, rx->skb); + keyid = ieee80211_get_cs_keyid(cs, rx->skb); if (unlikely(keyid < 0)) return RX_DROP_UNUSABLE; } @@ -1670,7 +1681,7 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) hdrlen = ieee80211_hdrlen(fc); if (cs) { - keyidx = iwl80211_get_cs_keyid(cs, rx->skb); + keyidx = ieee80211_get_cs_keyid(cs, rx->skb); if (unlikely(keyidx < 0)) return RX_DROP_UNUSABLE; @@ -2129,6 +2140,17 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx) ieee80211_rx_stats(dev, skb->len); + if (rx->sta) { + /* The seqno index has the same property as needed + * for the rx_msdu field, i.e. it is IEEE80211_NUM_TIDS + * for non-QoS-data frames. Here we know it's a data + * frame, so count MSDUs. + */ + u64_stats_update_begin(&rx->sta->rx_stats.syncp); + rx->sta->rx_stats.msdu[rx->seqno_idx]++; + u64_stats_update_end(&rx->sta->rx_stats.syncp); + } + if ((sdata->vif.type == NL80211_IFTYPE_AP || sdata->vif.type == NL80211_IFTYPE_AP_VLAN) && !(sdata->flags & IEEE80211_SDATA_DONT_BRIDGE_PACKETS) && @@ -2415,15 +2437,6 @@ ieee80211_rx_h_data(struct ieee80211_rx_data *rx) if (unlikely(!ieee80211_is_data_present(hdr->frame_control))) return RX_DROP_MONITOR; - if (rx->sta) { - /* The seqno index has the same property as needed - * for the rx_msdu field, i.e. it is IEEE80211_NUM_TIDS - * for non-QoS-data frames. Here we know it's a data - * frame, so count MSDUs. - */ - rx->sta->rx_stats.msdu[rx->seqno_idx]++; - } - /* * Send unexpected-4addr-frame event to hostapd. For older versions, * also drop the frame to cooked monitor interfaces. @@ -2474,14 +2487,14 @@ ieee80211_rx_h_data(struct ieee80211_rx_data *rx) rx->skb->dev = dev; - if (local->ps_sdata && local->hw.conf.dynamic_ps_timeout > 0 && + if (!ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS) && + local->ps_sdata && local->hw.conf.dynamic_ps_timeout > 0 && !is_multicast_ether_addr( ((struct ethhdr *)rx->skb->data)->h_dest) && (!local->scanning && - !test_bit(SDATA_STATE_OFFCHANNEL, &sdata->state))) { - mod_timer(&local->dynamic_ps_timer, jiffies + - msecs_to_jiffies(local->hw.conf.dynamic_ps_timeout)); - } + !test_bit(SDATA_STATE_OFFCHANNEL, &sdata->state))) + mod_timer(&local->dynamic_ps_timer, jiffies + + msecs_to_jiffies(local->hw.conf.dynamic_ps_timeout)); ieee80211_deliver_skb(rx); @@ -2828,7 +2841,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) switch (mgmt->u.action.u.measurement.action_code) { case WLAN_ACTION_SPCT_MSR_REQ: - if (status->band != IEEE80211_BAND_5GHZ) + if (status->band != NL80211_BAND_5GHZ) break; if (len < (IEEE80211_MIN_ACTION_SIZE + @@ -3201,7 +3214,7 @@ static void ieee80211_rx_handlers(struct ieee80211_rx_data *rx, res = rxh(rx); \ if (res != RX_CONTINUE) \ goto rxh_next; \ - } while (0); + } while (0) /* Lock here to avoid hitting all of the data used in the RX * path (e.g. key data, station data, ...) concurrently when @@ -3219,30 +3232,30 @@ static void ieee80211_rx_handlers(struct ieee80211_rx_data *rx, */ rx->skb = skb; - CALL_RXH(ieee80211_rx_h_check_more_data) - CALL_RXH(ieee80211_rx_h_uapsd_and_pspoll) - CALL_RXH(ieee80211_rx_h_sta_process) - CALL_RXH(ieee80211_rx_h_decrypt) - CALL_RXH(ieee80211_rx_h_defragment) - CALL_RXH(ieee80211_rx_h_michael_mic_verify) + CALL_RXH(ieee80211_rx_h_check_more_data); + CALL_RXH(ieee80211_rx_h_uapsd_and_pspoll); + CALL_RXH(ieee80211_rx_h_sta_process); + CALL_RXH(ieee80211_rx_h_decrypt); + CALL_RXH(ieee80211_rx_h_defragment); + CALL_RXH(ieee80211_rx_h_michael_mic_verify); /* must be after MMIC verify so header is counted in MPDU mic */ #ifdef CONFIG_MAC80211_MESH if (ieee80211_vif_is_mesh(&rx->sdata->vif)) CALL_RXH(ieee80211_rx_h_mesh_fwding); #endif - CALL_RXH(ieee80211_rx_h_amsdu) - CALL_RXH(ieee80211_rx_h_data) + CALL_RXH(ieee80211_rx_h_amsdu); + CALL_RXH(ieee80211_rx_h_data); /* special treatment -- needs the queue */ res = ieee80211_rx_h_ctrl(rx, frames); if (res != RX_CONTINUE) goto rxh_next; - CALL_RXH(ieee80211_rx_h_mgmt_check) - CALL_RXH(ieee80211_rx_h_action) - CALL_RXH(ieee80211_rx_h_userspace_mgmt) - CALL_RXH(ieee80211_rx_h_action_return) - CALL_RXH(ieee80211_rx_h_mgmt) + CALL_RXH(ieee80211_rx_h_mgmt_check); + CALL_RXH(ieee80211_rx_h_action); + CALL_RXH(ieee80211_rx_h_userspace_mgmt); + CALL_RXH(ieee80211_rx_h_action_return); + CALL_RXH(ieee80211_rx_h_mgmt); rxh_next: ieee80211_rx_handlers_result(rx, res); @@ -3265,10 +3278,10 @@ static void ieee80211_invoke_rx_handlers(struct ieee80211_rx_data *rx) res = rxh(rx); \ if (res != RX_CONTINUE) \ goto rxh_next; \ - } while (0); + } while (0) - CALL_RXH(ieee80211_rx_h_check_dup) - CALL_RXH(ieee80211_rx_h_check) + CALL_RXH(ieee80211_rx_h_check_dup); + CALL_RXH(ieee80211_rx_h_check); ieee80211_rx_reorder_ampdu(rx, &reorder_release); @@ -3513,6 +3526,351 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx) return false; } +void ieee80211_check_fast_rx(struct sta_info *sta) +{ + struct ieee80211_sub_if_data *sdata = sta->sdata; + struct ieee80211_local *local = sdata->local; + struct ieee80211_key *key; + struct ieee80211_fast_rx fastrx = { + .dev = sdata->dev, + .vif_type = sdata->vif.type, + .control_port_protocol = sdata->control_port_protocol, + }, *old, *new = NULL; + bool assign = false; + + /* use sparse to check that we don't return without updating */ + __acquire(check_fast_rx); + + BUILD_BUG_ON(sizeof(fastrx.rfc1042_hdr) != sizeof(rfc1042_header)); + BUILD_BUG_ON(sizeof(fastrx.rfc1042_hdr) != ETH_ALEN); + ether_addr_copy(fastrx.rfc1042_hdr, rfc1042_header); + ether_addr_copy(fastrx.vif_addr, sdata->vif.addr); + + fastrx.uses_rss = ieee80211_hw_check(&local->hw, USES_RSS); + + /* fast-rx doesn't do reordering */ + if (ieee80211_hw_check(&local->hw, AMPDU_AGGREGATION) && + !ieee80211_hw_check(&local->hw, SUPPORTS_REORDERING_BUFFER)) + goto clear; + + switch (sdata->vif.type) { + case NL80211_IFTYPE_STATION: + /* 4-addr is harder to deal with, later maybe */ + if (sdata->u.mgd.use_4addr) + goto clear; + /* software powersave is a huge mess, avoid all of it */ + if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK)) + goto clear; + if (ieee80211_hw_check(&local->hw, SUPPORTS_PS) && + !ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS)) + goto clear; + if (sta->sta.tdls) { + fastrx.da_offs = offsetof(struct ieee80211_hdr, addr1); + fastrx.sa_offs = offsetof(struct ieee80211_hdr, addr2); + fastrx.expected_ds_bits = 0; + } else { + fastrx.sta_notify = sdata->u.mgd.probe_send_count > 0; + fastrx.da_offs = offsetof(struct ieee80211_hdr, addr1); + fastrx.sa_offs = offsetof(struct ieee80211_hdr, addr3); + fastrx.expected_ds_bits = + cpu_to_le16(IEEE80211_FCTL_FROMDS); + } + break; + case NL80211_IFTYPE_AP_VLAN: + case NL80211_IFTYPE_AP: + /* parallel-rx requires this, at least with calls to + * ieee80211_sta_ps_transition() + */ + if (!ieee80211_hw_check(&local->hw, AP_LINK_PS)) + goto clear; + fastrx.da_offs = offsetof(struct ieee80211_hdr, addr3); + fastrx.sa_offs = offsetof(struct ieee80211_hdr, addr2); + fastrx.expected_ds_bits = cpu_to_le16(IEEE80211_FCTL_TODS); + + fastrx.internal_forward = + !(sdata->flags & IEEE80211_SDATA_DONT_BRIDGE_PACKETS) && + (sdata->vif.type != NL80211_IFTYPE_AP_VLAN || + !sdata->u.vlan.sta); + break; + default: + goto clear; + } + + if (!test_sta_flag(sta, WLAN_STA_AUTHORIZED)) + goto clear; + + rcu_read_lock(); + key = rcu_dereference(sta->ptk[sta->ptk_idx]); + if (key) { + switch (key->conf.cipher) { + case WLAN_CIPHER_SUITE_TKIP: + /* we don't want to deal with MMIC in fast-rx */ + goto clear_rcu; + case WLAN_CIPHER_SUITE_CCMP: + case WLAN_CIPHER_SUITE_CCMP_256: + case WLAN_CIPHER_SUITE_GCMP: + case WLAN_CIPHER_SUITE_GCMP_256: + break; + default: + /* we also don't want to deal with WEP or cipher scheme + * since those require looking up the key idx in the + * frame, rather than assuming the PTK is used + * (we need to revisit this once we implement the real + * PTK index, which is now valid in the spec, but we + * haven't implemented that part yet) + */ + goto clear_rcu; + } + + fastrx.key = true; + fastrx.icv_len = key->conf.icv_len; + } + + assign = true; + clear_rcu: + rcu_read_unlock(); + clear: + __release(check_fast_rx); + + if (assign) + new = kmemdup(&fastrx, sizeof(fastrx), GFP_KERNEL); + + spin_lock_bh(&sta->lock); + old = rcu_dereference_protected(sta->fast_rx, true); + rcu_assign_pointer(sta->fast_rx, new); + spin_unlock_bh(&sta->lock); + + if (old) + kfree_rcu(old, rcu_head); +} + +void ieee80211_clear_fast_rx(struct sta_info *sta) +{ + struct ieee80211_fast_rx *old; + + spin_lock_bh(&sta->lock); + old = rcu_dereference_protected(sta->fast_rx, true); + RCU_INIT_POINTER(sta->fast_rx, NULL); + spin_unlock_bh(&sta->lock); + + if (old) + kfree_rcu(old, rcu_head); +} + +void __ieee80211_check_fast_rx_iface(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_local *local = sdata->local; + struct sta_info *sta; + + lockdep_assert_held(&local->sta_mtx); + + list_for_each_entry_rcu(sta, &local->sta_list, list) { + if (sdata != sta->sdata && + (!sta->sdata->bss || sta->sdata->bss != sdata->bss)) + continue; + ieee80211_check_fast_rx(sta); + } +} + +void ieee80211_check_fast_rx_iface(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_local *local = sdata->local; + + mutex_lock(&local->sta_mtx); + __ieee80211_check_fast_rx_iface(sdata); + mutex_unlock(&local->sta_mtx); +} + +static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, + struct ieee80211_fast_rx *fast_rx) +{ + struct sk_buff *skb = rx->skb; + struct ieee80211_hdr *hdr = (void *)skb->data; + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); + struct sta_info *sta = rx->sta; + int orig_len = skb->len; + int snap_offs = ieee80211_hdrlen(hdr->frame_control); + struct { + u8 snap[sizeof(rfc1042_header)]; + __be16 proto; + } *payload __aligned(2); + struct { + u8 da[ETH_ALEN]; + u8 sa[ETH_ALEN]; + } addrs __aligned(2); + struct ieee80211_sta_rx_stats *stats = &sta->rx_stats; + + if (fast_rx->uses_rss) + stats = this_cpu_ptr(sta->pcpu_rx_stats); + + /* for parallel-rx, we need to have DUP_VALIDATED, otherwise we write + * to a common data structure; drivers can implement that per queue + * but we don't have that information in mac80211 + */ + if (!(status->flag & RX_FLAG_DUP_VALIDATED)) + return false; + +#define FAST_RX_CRYPT_FLAGS (RX_FLAG_PN_VALIDATED | RX_FLAG_DECRYPTED) + + /* If using encryption, we also need to have: + * - PN_VALIDATED: similar, but the implementation is tricky + * - DECRYPTED: necessary for PN_VALIDATED + */ + if (fast_rx->key && + (status->flag & FAST_RX_CRYPT_FLAGS) != FAST_RX_CRYPT_FLAGS) + return false; + + /* we don't deal with A-MSDU deaggregation here */ + if (status->rx_flags & IEEE80211_RX_AMSDU) + return false; + + if (unlikely(!ieee80211_is_data_present(hdr->frame_control))) + return false; + + if (unlikely(ieee80211_is_frag(hdr))) + return false; + + /* Since our interface address cannot be multicast, this + * implicitly also rejects multicast frames without the + * explicit check. + * + * We shouldn't get any *data* frames not addressed to us + * (AP mode will accept multicast *management* frames), but + * punting here will make it go through the full checks in + * ieee80211_accept_frame(). + */ + if (!ether_addr_equal(fast_rx->vif_addr, hdr->addr1)) + return false; + + if ((hdr->frame_control & cpu_to_le16(IEEE80211_FCTL_FROMDS | + IEEE80211_FCTL_TODS)) != + fast_rx->expected_ds_bits) + goto drop; + + /* assign the key to drop unencrypted frames (later) + * and strip the IV/MIC if necessary + */ + if (fast_rx->key && !(status->flag & RX_FLAG_IV_STRIPPED)) { + /* GCMP header length is the same */ + snap_offs += IEEE80211_CCMP_HDR_LEN; + } + + if (!pskb_may_pull(skb, snap_offs + sizeof(*payload))) + goto drop; + payload = (void *)(skb->data + snap_offs); + + if (!ether_addr_equal(payload->snap, fast_rx->rfc1042_hdr)) + return false; + + /* Don't handle these here since they require special code. + * Accept AARP and IPX even though they should come with a + * bridge-tunnel header - but if we get them this way then + * there's little point in discarding them. + */ + if (unlikely(payload->proto == cpu_to_be16(ETH_P_TDLS) || + payload->proto == fast_rx->control_port_protocol)) + return false; + + /* after this point, don't punt to the slowpath! */ + + if (rx->key && !(status->flag & RX_FLAG_MIC_STRIPPED) && + pskb_trim(skb, skb->len - fast_rx->icv_len)) + goto drop; + + if (unlikely(fast_rx->sta_notify)) { + ieee80211_sta_rx_notify(rx->sdata, hdr); + fast_rx->sta_notify = false; + } + + /* statistics part of ieee80211_rx_h_sta_process() */ + stats->last_rx = jiffies; + stats->last_rate = sta_stats_encode_rate(status); + + stats->fragments++; + + if (!(status->flag & RX_FLAG_NO_SIGNAL_VAL)) { + stats->last_signal = status->signal; + if (!fast_rx->uses_rss) + ewma_signal_add(&sta->rx_stats_avg.signal, + -status->signal); + } + + if (status->chains) { + int i; + + stats->chains = status->chains; + for (i = 0; i < ARRAY_SIZE(status->chain_signal); i++) { + int signal = status->chain_signal[i]; + + if (!(status->chains & BIT(i))) + continue; + + stats->chain_signal_last[i] = signal; + if (!fast_rx->uses_rss) + ewma_signal_add(&sta->rx_stats_avg.chain_signal[i], + -signal); + } + } + /* end of statistics */ + + if (rx->key && !ieee80211_has_protected(hdr->frame_control)) + goto drop; + + /* do the header conversion - first grab the addresses */ + ether_addr_copy(addrs.da, skb->data + fast_rx->da_offs); + ether_addr_copy(addrs.sa, skb->data + fast_rx->sa_offs); + /* remove the SNAP but leave the ethertype */ + skb_pull(skb, snap_offs + sizeof(rfc1042_header)); + /* push the addresses in front */ + memcpy(skb_push(skb, sizeof(addrs)), &addrs, sizeof(addrs)); + + skb->dev = fast_rx->dev; + + ieee80211_rx_stats(fast_rx->dev, skb->len); + + /* The seqno index has the same property as needed + * for the rx_msdu field, i.e. it is IEEE80211_NUM_TIDS + * for non-QoS-data frames. Here we know it's a data + * frame, so count MSDUs. + */ + u64_stats_update_begin(&stats->syncp); + stats->msdu[rx->seqno_idx]++; + stats->bytes += orig_len; + u64_stats_update_end(&stats->syncp); + + if (fast_rx->internal_forward) { + struct sta_info *dsta = sta_info_get(rx->sdata, skb->data); + + if (dsta) { + /* + * Send to wireless media and increase priority by 256 + * to keep the received priority instead of + * reclassifying the frame (see cfg80211_classify8021d). + */ + skb->priority += 256; + skb->protocol = htons(ETH_P_802_3); + skb_reset_network_header(skb); + skb_reset_mac_header(skb); + dev_queue_xmit(skb); + return true; + } + } + + /* deliver to local stack */ + skb->protocol = eth_type_trans(skb, fast_rx->dev); + memset(skb->cb, 0, sizeof(skb->cb)); + if (rx->napi) + napi_gro_receive(rx->napi, skb); + else + netif_receive_skb(skb); + + return true; + drop: + dev_kfree_skb(skb); + stats->dropped++; + return true; +} + /* * This function returns whether or not the SKB * was destined for RX processing or not, which, @@ -3527,6 +3885,21 @@ static bool ieee80211_prepare_and_rx_handle(struct ieee80211_rx_data *rx, rx->skb = skb; + /* See if we can do fast-rx; if we have to copy we already lost, + * so punt in that case. We should never have to deliver a data + * frame to multiple interfaces anyway. + * + * We skip the ieee80211_accept_frame() call and do the necessary + * checking inside ieee80211_invoke_fast_rx(). + */ + if (consume && rx->sta) { + struct ieee80211_fast_rx *fast_rx; + + fast_rx = rcu_dereference(rx->sta->fast_rx); + if (fast_rx && ieee80211_invoke_fast_rx(rx, fast_rx)) + return true; + } + if (!ieee80211_accept_frame(rx)) return false; @@ -3552,6 +3925,7 @@ static bool ieee80211_prepare_and_rx_handle(struct ieee80211_rx_data *rx, * be called with rcu_read_lock protection. */ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, + struct ieee80211_sta *pubsta, struct sk_buff *skb, struct napi_struct *napi) { @@ -3561,7 +3935,6 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, __le16 fc; struct ieee80211_rx_data rx; struct ieee80211_sub_if_data *prev; - struct sta_info *sta, *prev_sta; struct rhash_head *tmp; int err = 0; @@ -3597,7 +3970,14 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, ieee80211_is_beacon(hdr->frame_control))) ieee80211_scan_rx(local, skb); - if (ieee80211_is_data(fc)) { + if (pubsta) { + rx.sta = container_of(pubsta, struct sta_info, sta); + rx.sdata = rx.sta->sdata; + if (ieee80211_prepare_and_rx_handle(&rx, skb, true)) + return; + goto out; + } else if (ieee80211_is_data(fc)) { + struct sta_info *sta, *prev_sta; const struct bucket_table *tbl; prev_sta = NULL; @@ -3671,8 +4051,8 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, * This is the receive path handler. It is called by a low level driver when an * 802.11 MPDU is received from the hardware. */ -void ieee80211_rx_napi(struct ieee80211_hw *hw, struct sk_buff *skb, - struct napi_struct *napi) +void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, + struct sk_buff *skb, struct napi_struct *napi) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_rate *rate = NULL; @@ -3681,7 +4061,7 @@ void ieee80211_rx_napi(struct ieee80211_hw *hw, struct sk_buff *skb, WARN_ON_ONCE(softirq_count() == 0); - if (WARN_ON(status->band >= IEEE80211_NUM_BANDS)) + if (WARN_ON(status->band >= NUM_NL80211_BANDS)) goto drop; sband = local->hw.wiphy->bands[status->band]; @@ -3771,7 +4151,8 @@ void ieee80211_rx_napi(struct ieee80211_hw *hw, struct sk_buff *skb, ieee80211_tpt_led_trig_rx(local, ((struct ieee80211_hdr *)skb->data)->frame_control, skb->len); - __ieee80211_rx_handle_packet(hw, skb, napi); + + __ieee80211_rx_handle_packet(hw, pubsta, skb, napi); rcu_read_unlock(); diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index ae980ce8d..f9648ef9e 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -66,7 +66,9 @@ ieee80211_bss_info_update(struct ieee80211_local *local, struct cfg80211_bss *cbss; struct ieee80211_bss *bss; int clen, srlen; - struct cfg80211_inform_bss bss_meta = {}; + struct cfg80211_inform_bss bss_meta = { + .boottime_ns = rx_status->boottime_ns, + }; bool signal_valid; if (ieee80211_hw_check(&local->hw, SIGNAL_DBM)) @@ -270,7 +272,7 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_local *local) n_chans = req->n_channels; } else { do { - if (local->hw_scan_band == IEEE80211_NUM_BANDS) + if (local->hw_scan_band == NUM_NL80211_BANDS) return false; n_chans = 0; @@ -303,6 +305,7 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_local *local) ether_addr_copy(local->hw_scan_req->req.mac_addr, req->mac_addr); ether_addr_copy(local->hw_scan_req->req.mac_addr_mask, req->mac_addr_mask); + ether_addr_copy(local->hw_scan_req->req.bssid, req->bssid); return true; } @@ -482,7 +485,7 @@ static void ieee80211_scan_state_send_probe(struct ieee80211_local *local, int i; struct ieee80211_sub_if_data *sdata; struct cfg80211_scan_request *scan_req; - enum ieee80211_band band = local->hw.conf.chandef.chan->band; + enum nl80211_band band = local->hw.conf.chandef.chan->band; u32 tx_flags; scan_req = rcu_dereference_protected(local->scan_req, @@ -497,7 +500,7 @@ static void ieee80211_scan_state_send_probe(struct ieee80211_local *local, for (i = 0; i < scan_req->n_ssids; i++) ieee80211_send_probe_req( - sdata, local->scan_addr, NULL, + sdata, local->scan_addr, scan_req->bssid, scan_req->ssids[i].ssid, scan_req->ssids[i].ssid_len, scan_req->ie, scan_req->ie_len, scan_req->rates[band], false, @@ -562,6 +565,7 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, req->n_channels * sizeof(req->channels[0]); local->hw_scan_req->req.ie = ies; local->hw_scan_req->req.flags = req->flags; + eth_broadcast_addr(local->hw_scan_req->req.bssid); local->hw_scan_band = 0; @@ -949,7 +953,7 @@ int ieee80211_request_ibss_scan(struct ieee80211_sub_if_data *sdata, { struct ieee80211_local *local = sdata->local; int ret = -EBUSY, i, n_ch = 0; - enum ieee80211_band band; + enum nl80211_band band; mutex_lock(&local->mtx); @@ -961,7 +965,7 @@ int ieee80211_request_ibss_scan(struct ieee80211_sub_if_data *sdata, if (!channels) { int max_n; - for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + for (band = 0; band < NUM_NL80211_BANDS; band++) { if (!local->hw.wiphy->bands[band]) continue; @@ -1081,7 +1085,7 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, struct ieee80211_scan_ies sched_scan_ies = {}; struct cfg80211_chan_def chandef; int ret, i, iebufsz, num_bands = 0; - u32 rate_masks[IEEE80211_NUM_BANDS] = {}; + u32 rate_masks[NUM_NL80211_BANDS] = {}; u8 bands_used = 0; u8 *ie; size_t len; @@ -1093,7 +1097,7 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, if (!local->ops->sched_scan_start) return -ENOTSUPP; - for (i = 0; i < IEEE80211_NUM_BANDS; i++) { + for (i = 0; i < NUM_NL80211_BANDS; i++) { if (local->hw.wiphy->bands[i]) { bands_used |= BIT(i); rate_masks[i] = (u32) -1; diff --git a/net/mac80211/spectmgmt.c b/net/mac80211/spectmgmt.c index 06e6ac8cc..2ddc661f0 100644 --- a/net/mac80211/spectmgmt.c +++ b/net/mac80211/spectmgmt.c @@ -23,11 +23,11 @@ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems, - enum ieee80211_band current_band, + enum nl80211_band current_band, u32 sta_flags, u8 *bssid, struct ieee80211_csa_ie *csa_ie) { - enum ieee80211_band new_band; + enum nl80211_band new_band; int new_freq; u8 new_chan_no; struct ieee80211_channel *new_chan; diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 861b93ffb..5ccfdbd40 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -2,7 +2,7 @@ * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright (C) 2015 Intel Deutschland GmbH + * Copyright (C) 2015 - 2016 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -255,6 +255,7 @@ void sta_info_free(struct ieee80211_local *local, struct sta_info *sta) #ifdef CONFIG_MAC80211_MESH kfree(sta->mesh); #endif + free_percpu(sta->pcpu_rx_stats); kfree(sta); } @@ -312,6 +313,13 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, if (!sta) return NULL; + if (ieee80211_hw_check(hw, USES_RSS)) { + sta->pcpu_rx_stats = + alloc_percpu(struct ieee80211_sta_rx_stats); + if (!sta->pcpu_rx_stats) + goto free; + } + spin_lock_init(&sta->lock); spin_lock_init(&sta->ps_lock); INIT_WORK(&sta->drv_deliver_wk, sta_deliver_ps_frames); @@ -336,15 +344,17 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, sta->sdata = sdata; sta->rx_stats.last_rx = jiffies; + u64_stats_init(&sta->rx_stats.syncp); + sta->sta_state = IEEE80211_STA_NONE; /* Mark TID as unreserved */ sta->reserved_tid = IEEE80211_TID_UNRESERVED; sta->last_connected = ktime_get_seconds(); - ewma_signal_init(&sta->rx_stats.avg_signal); - for (i = 0; i < ARRAY_SIZE(sta->rx_stats.chain_signal_avg); i++) - ewma_signal_init(&sta->rx_stats.chain_signal_avg[i]); + ewma_signal_init(&sta->rx_stats_avg.signal); + for (i = 0; i < ARRAY_SIZE(sta->rx_stats_avg.chain_signal); i++) + ewma_signal_init(&sta->rx_stats_avg.chain_signal[i]); if (local->ops->wake_tx_queue) { void *txq_data; @@ -407,6 +417,8 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, } } + sta->sta.max_rc_amsdu_len = IEEE80211_MAX_MPDU_LEN_HT_BA; + sta_dbg(sdata, "Allocated STA %pM\n", sta->sta.addr); return sta; @@ -879,6 +891,13 @@ static int __must_check __sta_info_destroy_part1(struct sta_info *sta) set_sta_flag(sta, WLAN_STA_BLOCK_BA); ieee80211_sta_tear_down_BA_sessions(sta, AGG_STOP_DESTROY_STA); + /* + * Before removing the station from the driver there might be pending + * rx frames on RSS queues sent prior to the disassociation - wait for + * all such frames to be processed. + */ + drv_sync_rx_queues(local, sta); + ret = sta_info_hash_del(local, sta); if (WARN_ON(ret)) return ret; @@ -1091,10 +1110,12 @@ void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata, mutex_lock(&local->sta_mtx); list_for_each_entry_safe(sta, tmp, &local->sta_list, list) { + unsigned long last_active = ieee80211_sta_last_active(sta); + if (sdata != sta->sdata) continue; - if (time_after(jiffies, sta->rx_stats.last_rx + exp_time)) { + if (time_is_before_jiffies(last_active + exp_time)) { sta_dbg(sta->sdata, "expiring inactive STA %pM\n", sta->sta.addr); @@ -1764,6 +1785,31 @@ void ieee80211_sta_set_buffered(struct ieee80211_sta *pubsta, } EXPORT_SYMBOL(ieee80211_sta_set_buffered); +static void +ieee80211_recalc_p2p_go_ps_allowed(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_local *local = sdata->local; + bool allow_p2p_go_ps = sdata->vif.p2p; + struct sta_info *sta; + + rcu_read_lock(); + list_for_each_entry_rcu(sta, &local->sta_list, list) { + if (sdata != sta->sdata || + !test_sta_flag(sta, WLAN_STA_ASSOC)) + continue; + if (!sta->sta.support_p2p_ps) { + allow_p2p_go_ps = false; + break; + } + } + rcu_read_unlock(); + + if (allow_p2p_go_ps != sdata->vif.bss_conf.allow_p2p_go_ps) { + sdata->vif.bss_conf.allow_p2p_go_ps = allow_p2p_go_ps; + ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_P2P_PS); + } +} + int sta_info_move_state(struct sta_info *sta, enum ieee80211_sta_state new_state) { @@ -1825,12 +1871,16 @@ int sta_info_move_state(struct sta_info *sta, } else if (sta->sta_state == IEEE80211_STA_ASSOC) { clear_bit(WLAN_STA_ASSOC, &sta->_flags); ieee80211_recalc_min_chandef(sta->sdata); + if (!sta->sta.support_p2p_ps) + ieee80211_recalc_p2p_go_ps_allowed(sta->sdata); } break; case IEEE80211_STA_ASSOC: if (sta->sta_state == IEEE80211_STA_AUTH) { set_bit(WLAN_STA_ASSOC, &sta->_flags); ieee80211_recalc_min_chandef(sta->sdata); + if (!sta->sta.support_p2p_ps) + ieee80211_recalc_p2p_go_ps_allowed(sta->sdata); } else if (sta->sta_state == IEEE80211_STA_AUTHORIZED) { if (sta->sdata->vif.type == NL80211_IFTYPE_AP || (sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN && @@ -1838,6 +1888,7 @@ int sta_info_move_state(struct sta_info *sta, atomic_dec(&sta->sdata->bss->num_mcast_sta); clear_bit(WLAN_STA_AUTHORIZED, &sta->_flags); ieee80211_clear_fast_xmit(sta); + ieee80211_clear_fast_rx(sta); } break; case IEEE80211_STA_AUTHORIZED: @@ -1848,6 +1899,7 @@ int sta_info_move_state(struct sta_info *sta, atomic_inc(&sta->sdata->bss->num_mcast_sta); set_bit(WLAN_STA_AUTHORIZED, &sta->_flags); ieee80211_check_fast_xmit(sta); + ieee80211_check_fast_rx(sta); } break; default: @@ -1894,43 +1946,117 @@ u8 sta_info_tx_streams(struct sta_info *sta) >> IEEE80211_HT_MCS_TX_MAX_STREAMS_SHIFT) + 1; } -static void sta_set_rate_info_rx(struct sta_info *sta, struct rate_info *rinfo) +static struct ieee80211_sta_rx_stats * +sta_get_last_rx_stats(struct sta_info *sta) { - rinfo->flags = 0; - - if (sta->rx_stats.last_rate_flag & RX_FLAG_HT) { - rinfo->flags |= RATE_INFO_FLAGS_MCS; - rinfo->mcs = sta->rx_stats.last_rate_idx; - } else if (sta->rx_stats.last_rate_flag & RX_FLAG_VHT) { - rinfo->flags |= RATE_INFO_FLAGS_VHT_MCS; - rinfo->nss = sta->rx_stats.last_rate_vht_nss; - rinfo->mcs = sta->rx_stats.last_rate_idx; - } else { + struct ieee80211_sta_rx_stats *stats = &sta->rx_stats; + struct ieee80211_local *local = sta->local; + int cpu; + + if (!ieee80211_hw_check(&local->hw, USES_RSS)) + return stats; + + for_each_possible_cpu(cpu) { + struct ieee80211_sta_rx_stats *cpustats; + + cpustats = per_cpu_ptr(sta->pcpu_rx_stats, cpu); + + if (time_after(cpustats->last_rx, stats->last_rx)) + stats = cpustats; + } + + return stats; +} + +static void sta_stats_decode_rate(struct ieee80211_local *local, u16 rate, + struct rate_info *rinfo) +{ + rinfo->bw = (rate & STA_STATS_RATE_BW_MASK) >> + STA_STATS_RATE_BW_SHIFT; + + if (rate & STA_STATS_RATE_VHT) { + rinfo->flags = RATE_INFO_FLAGS_VHT_MCS; + rinfo->mcs = rate & 0xf; + rinfo->nss = (rate & 0xf0) >> 4; + } else if (rate & STA_STATS_RATE_HT) { + rinfo->flags = RATE_INFO_FLAGS_MCS; + rinfo->mcs = rate & 0xff; + } else if (rate & STA_STATS_RATE_LEGACY) { struct ieee80211_supported_band *sband; - int shift = ieee80211_vif_get_shift(&sta->sdata->vif); u16 brate; - - sband = sta->local->hw.wiphy->bands[ - ieee80211_get_sdata_band(sta->sdata)]; - brate = sband->bitrates[sta->rx_stats.last_rate_idx].bitrate; + unsigned int shift; + + sband = local->hw.wiphy->bands[(rate >> 4) & 0xf]; + brate = sband->bitrates[rate & 0xf].bitrate; + if (rinfo->bw == RATE_INFO_BW_5) + shift = 2; + else if (rinfo->bw == RATE_INFO_BW_10) + shift = 1; + else + shift = 0; rinfo->legacy = DIV_ROUND_UP(brate, 1 << shift); } - if (sta->rx_stats.last_rate_flag & RX_FLAG_SHORT_GI) + if (rate & STA_STATS_RATE_SGI) rinfo->flags |= RATE_INFO_FLAGS_SHORT_GI; +} - if (sta->rx_stats.last_rate_flag & RX_FLAG_5MHZ) - rinfo->bw = RATE_INFO_BW_5; - else if (sta->rx_stats.last_rate_flag & RX_FLAG_10MHZ) - rinfo->bw = RATE_INFO_BW_10; - else if (sta->rx_stats.last_rate_flag & RX_FLAG_40MHZ) - rinfo->bw = RATE_INFO_BW_40; - else if (sta->rx_stats.last_rate_vht_flag & RX_VHT_FLAG_80MHZ) - rinfo->bw = RATE_INFO_BW_80; - else if (sta->rx_stats.last_rate_vht_flag & RX_VHT_FLAG_160MHZ) - rinfo->bw = RATE_INFO_BW_160; +static void sta_set_rate_info_rx(struct sta_info *sta, struct rate_info *rinfo) +{ + u16 rate = ACCESS_ONCE(sta_get_last_rx_stats(sta)->last_rate); + + if (rate == STA_STATS_RATE_INVALID) + rinfo->flags = 0; else - rinfo->bw = RATE_INFO_BW_20; + sta_stats_decode_rate(sta->local, rate, rinfo); +} + +static void sta_set_tidstats(struct sta_info *sta, + struct cfg80211_tid_stats *tidstats, + int tid) +{ + struct ieee80211_local *local = sta->local; + + if (!(tidstats->filled & BIT(NL80211_TID_STATS_RX_MSDU))) { + unsigned int start; + + do { + start = u64_stats_fetch_begin(&sta->rx_stats.syncp); + tidstats->rx_msdu = sta->rx_stats.msdu[tid]; + } while (u64_stats_fetch_retry(&sta->rx_stats.syncp, start)); + + tidstats->filled |= BIT(NL80211_TID_STATS_RX_MSDU); + } + + if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU))) { + tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU); + tidstats->tx_msdu = sta->tx_stats.msdu[tid]; + } + + if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU_RETRIES)) && + ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { + tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU_RETRIES); + tidstats->tx_msdu_retries = sta->status_stats.msdu_retries[tid]; + } + + if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU_FAILED)) && + ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { + tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU_FAILED); + tidstats->tx_msdu_failed = sta->status_stats.msdu_failed[tid]; + } +} + +static inline u64 sta_get_stats_bytes(struct ieee80211_sta_rx_stats *rxstats) +{ + unsigned int start; + u64 value; + + do { + start = u64_stats_fetch_begin(&rxstats->syncp); + value = rxstats->bytes; + } while (u64_stats_fetch_retry(&rxstats->syncp, start)); + + return value; } void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) @@ -1939,7 +2065,10 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) struct ieee80211_local *local = sdata->local; struct rate_control_ref *ref = NULL; u32 thr = 0; - int i, ac; + int i, ac, cpu; + struct ieee80211_sta_rx_stats *last_rxstats; + + last_rxstats = sta_get_last_rx_stats(sta); if (test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) ref = local->rate_ctrl; @@ -1968,7 +2097,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) sinfo->connected_time = ktime_get_seconds() - sta->last_connected; sinfo->inactive_time = - jiffies_to_msecs(jiffies - sta->rx_stats.last_rx); + jiffies_to_msecs(jiffies - ieee80211_sta_last_active(sta)); if (!(sinfo->filled & (BIT(NL80211_STA_INFO_TX_BYTES64) | BIT(NL80211_STA_INFO_TX_BYTES)))) { @@ -1987,12 +2116,30 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) if (!(sinfo->filled & (BIT(NL80211_STA_INFO_RX_BYTES64) | BIT(NL80211_STA_INFO_RX_BYTES)))) { - sinfo->rx_bytes = sta->rx_stats.bytes; + sinfo->rx_bytes += sta_get_stats_bytes(&sta->rx_stats); + + if (sta->pcpu_rx_stats) { + for_each_possible_cpu(cpu) { + struct ieee80211_sta_rx_stats *cpurxs; + + cpurxs = per_cpu_ptr(sta->pcpu_rx_stats, cpu); + sinfo->rx_bytes += sta_get_stats_bytes(cpurxs); + } + } + sinfo->filled |= BIT(NL80211_STA_INFO_RX_BYTES64); } if (!(sinfo->filled & BIT(NL80211_STA_INFO_RX_PACKETS))) { sinfo->rx_packets = sta->rx_stats.packets; + if (sta->pcpu_rx_stats) { + for_each_possible_cpu(cpu) { + struct ieee80211_sta_rx_stats *cpurxs; + + cpurxs = per_cpu_ptr(sta->pcpu_rx_stats, cpu); + sinfo->rx_packets += cpurxs->packets; + } + } sinfo->filled |= BIT(NL80211_STA_INFO_RX_PACKETS); } @@ -2007,6 +2154,14 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) } sinfo->rx_dropped_misc = sta->rx_stats.dropped; + if (sta->pcpu_rx_stats) { + for_each_possible_cpu(cpu) { + struct ieee80211_sta_rx_stats *cpurxs; + + cpurxs = per_cpu_ptr(sta->pcpu_rx_stats, cpu); + sinfo->rx_packets += cpurxs->dropped; + } + } if (sdata->vif.type == NL80211_IFTYPE_STATION && !(sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER)) { @@ -2018,29 +2173,36 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) if (ieee80211_hw_check(&sta->local->hw, SIGNAL_DBM) || ieee80211_hw_check(&sta->local->hw, SIGNAL_UNSPEC)) { if (!(sinfo->filled & BIT(NL80211_STA_INFO_SIGNAL))) { - sinfo->signal = (s8)sta->rx_stats.last_signal; + sinfo->signal = (s8)last_rxstats->last_signal; sinfo->filled |= BIT(NL80211_STA_INFO_SIGNAL); } - if (!(sinfo->filled & BIT(NL80211_STA_INFO_SIGNAL_AVG))) { + if (!sta->pcpu_rx_stats && + !(sinfo->filled & BIT(NL80211_STA_INFO_SIGNAL_AVG))) { sinfo->signal_avg = - -ewma_signal_read(&sta->rx_stats.avg_signal); + -ewma_signal_read(&sta->rx_stats_avg.signal); sinfo->filled |= BIT(NL80211_STA_INFO_SIGNAL_AVG); } } - if (sta->rx_stats.chains && + /* for the average - if pcpu_rx_stats isn't set - rxstats must point to + * the sta->rx_stats struct, so the check here is fine with and without + * pcpu statistics + */ + if (last_rxstats->chains && !(sinfo->filled & (BIT(NL80211_STA_INFO_CHAIN_SIGNAL) | BIT(NL80211_STA_INFO_CHAIN_SIGNAL_AVG)))) { - sinfo->filled |= BIT(NL80211_STA_INFO_CHAIN_SIGNAL) | - BIT(NL80211_STA_INFO_CHAIN_SIGNAL_AVG); + sinfo->filled |= BIT(NL80211_STA_INFO_CHAIN_SIGNAL); + if (!sta->pcpu_rx_stats) + sinfo->filled |= BIT(NL80211_STA_INFO_CHAIN_SIGNAL_AVG); + + sinfo->chains = last_rxstats->chains; - sinfo->chains = sta->rx_stats.chains; for (i = 0; i < ARRAY_SIZE(sinfo->chain_signal); i++) { sinfo->chain_signal[i] = - sta->rx_stats.chain_signal_last[i]; + last_rxstats->chain_signal_last[i]; sinfo->chain_signal_avg[i] = - -ewma_signal_read(&sta->rx_stats.chain_signal_avg[i]); + -ewma_signal_read(&sta->rx_stats_avg.chain_signal[i]); } } @@ -2059,33 +2221,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++) { struct cfg80211_tid_stats *tidstats = &sinfo->pertid[i]; - if (!(tidstats->filled & BIT(NL80211_TID_STATS_RX_MSDU))) { - tidstats->filled |= BIT(NL80211_TID_STATS_RX_MSDU); - tidstats->rx_msdu = sta->rx_stats.msdu[i]; - } - - if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU))) { - tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU); - tidstats->tx_msdu = sta->tx_stats.msdu[i]; - } - - if (!(tidstats->filled & - BIT(NL80211_TID_STATS_TX_MSDU_RETRIES)) && - ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { - tidstats->filled |= - BIT(NL80211_TID_STATS_TX_MSDU_RETRIES); - tidstats->tx_msdu_retries = - sta->status_stats.msdu_retries[i]; - } - - if (!(tidstats->filled & - BIT(NL80211_TID_STATS_TX_MSDU_FAILED)) && - ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { - tidstats->filled |= - BIT(NL80211_TID_STATS_TX_MSDU_FAILED); - tidstats->tx_msdu_failed = - sta->status_stats.msdu_failed[i]; - } + sta_set_tidstats(sta, tidstats, i); } if (ieee80211_vif_is_mesh(&sdata->vif)) { @@ -2154,3 +2290,12 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) sinfo->expected_throughput = thr; } } + +unsigned long ieee80211_sta_last_active(struct sta_info *sta) +{ + struct ieee80211_sta_rx_stats *stats = sta_get_last_rx_stats(sta); + + if (time_after(stats->last_rx, sta->status_stats.last_ack)) + return stats->last_rx; + return sta->status_stats.last_ack; +} diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index ba7ce53ec..78b0ef32d 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -1,7 +1,7 @@ /* * Copyright 2002-2005, Devicescape Software, Inc. * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright(c) 2015 Intel Deutschland GmbH + * Copyright(c) 2015-2016 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -18,6 +18,7 @@ #include <linux/average.h> #include <linux/etherdevice.h> #include <linux/rhashtable.h> +#include <linux/u64_stats_sync.h> #include "key.h" /** @@ -69,6 +70,8 @@ * @WLAN_STA_MPSP_RECIPIENT: local STA is recipient of a MPSP. * @WLAN_STA_PS_DELIVER: station woke up, but we're still blocking TX * until pending frames are delivered + * + * @NUM_WLAN_STA_FLAGS: number of defined flags */ enum ieee80211_sta_info_flags { WLAN_STA_AUTH, @@ -97,6 +100,8 @@ enum ieee80211_sta_info_flags { WLAN_STA_MPSP_OWNER, WLAN_STA_MPSP_RECIPIENT, WLAN_STA_PS_DELIVER, + + NUM_WLAN_STA_FLAGS, }; #define ADDBA_RESP_INTERVAL HZ @@ -281,6 +286,40 @@ struct ieee80211_fast_tx { }; /** + * struct ieee80211_fast_rx - RX fastpath information + * @dev: netdevice for reporting the SKB + * @vif_type: (P2P-less) interface type of the original sdata (sdata->vif.type) + * @vif_addr: interface address + * @rfc1042_hdr: copy of the RFC 1042 SNAP header (to have in cache) + * @control_port_protocol: control port protocol copied from sdata + * @expected_ds_bits: from/to DS bits expected + * @icv_len: length of the MIC if present + * @key: bool indicating encryption is expected (key is set) + * @sta_notify: notify the MLME code (once) + * @internal_forward: forward froms internally on AP/VLAN type interfaces + * @uses_rss: copy of USES_RSS hw flag + * @da_offs: offset of the DA in the header (for header conversion) + * @sa_offs: offset of the SA in the header (for header conversion) + * @rcu_head: RCU head for freeing this structure + */ +struct ieee80211_fast_rx { + struct net_device *dev; + enum nl80211_iftype vif_type; + u8 vif_addr[ETH_ALEN] __aligned(2); + u8 rfc1042_hdr[6] __aligned(2); + __be16 control_port_protocol; + __le16 expected_ds_bits; + u8 icv_len; + u8 key:1, + sta_notify:1, + internal_forward:1, + uses_rss:1; + u8 da_offs, sa_offs; + + struct rcu_head rcu_head; +}; + +/** * struct mesh_sta - mesh STA information * @plink_lock: serialize access to plink fields * @llid: Local link ID @@ -330,6 +369,21 @@ struct mesh_sta { DECLARE_EWMA(signal, 1024, 8) +struct ieee80211_sta_rx_stats { + unsigned long packets; + unsigned long last_rx; + unsigned long num_duplicates; + unsigned long fragments; + unsigned long dropped; + int last_signal; + u8 chains; + s8 chain_signal_last[IEEE80211_MAX_CHAINS]; + u16 last_rate; + struct u64_stats_sync syncp; + u64 bytes; + u64 msdu[IEEE80211_NUM_TIDS + 1]; +}; + /** * struct sta_info - STA information * @@ -371,7 +425,7 @@ DECLARE_EWMA(signal, 1024, 8) * @ampdu_mlme: A-MPDU state machine state * @timer_to_tid: identity mapping to ID timers * @mesh: mesh STA information - * @debugfs: debug filesystem info + * @debugfs_dir: debug filesystem directory dentry * @dead: set to true when sta is unlinked * @removed: set to true when sta is being removed from sta_list * @uploaded: set to true when sta is uploaded to the driver @@ -385,10 +439,13 @@ DECLARE_EWMA(signal, 1024, 8) * @cipher_scheme: optional cipher scheme for this station * @reserved_tid: reserved TID (if any, otherwise IEEE80211_TID_UNRESERVED) * @fast_tx: TX fastpath information + * @fast_rx: RX fastpath information * @tdls_chandef: a TDLS peer can have a wider chandef that is compatible to * the BSS one. * @tx_stats: TX statistics * @rx_stats: RX statistics + * @pcpu_rx_stats: per-CPU RX statistics, assigned only if the driver needs + * this (by advertising the USES_RSS hw flag) * @status_stats: TX status statistics */ struct sta_info { @@ -408,6 +465,8 @@ struct sta_info { spinlock_t lock; struct ieee80211_fast_tx __rcu *fast_tx; + struct ieee80211_fast_rx __rcu *fast_rx; + struct ieee80211_sta_rx_stats __percpu *pcpu_rx_stats; #ifdef CONFIG_MAC80211_MESH struct mesh_sta *mesh; @@ -437,24 +496,11 @@ struct sta_info { long last_connected; /* Updated from RX path only, no locking requirements */ + struct ieee80211_sta_rx_stats rx_stats; struct { - unsigned long packets; - u64 bytes; - unsigned long last_rx; - unsigned long num_duplicates; - unsigned long fragments; - unsigned long dropped; - int last_signal; - struct ewma_signal avg_signal; - u8 chains; - s8 chain_signal_last[IEEE80211_MAX_CHAINS]; - struct ewma_signal chain_signal_avg[IEEE80211_MAX_CHAINS]; - int last_rate_idx; - u32 last_rate_flag; - u32 last_rate_vht_flag; - u8 last_rate_vht_nss; - u64 msdu[IEEE80211_NUM_TIDS + 1]; - } rx_stats; + struct ewma_signal signal; + struct ewma_signal chain_signal[IEEE80211_MAX_CHAINS]; + } rx_stats_avg; /* Plus 1 for non-QoS frames */ __le16 last_seq_ctrl[IEEE80211_NUM_TIDS + 1]; @@ -467,6 +513,7 @@ struct sta_info { unsigned long last_tdls_pkt_time; u64 msdu_retries[IEEE80211_NUM_TIDS + 1]; u64 msdu_failed[IEEE80211_NUM_TIDS + 1]; + unsigned long last_ack; } status_stats; /* Updated from TX path only, no locking requirements */ @@ -485,10 +532,7 @@ struct sta_info { u8 timer_to_tid[IEEE80211_NUM_TIDS]; #ifdef CONFIG_MAC80211_DEBUGFS - struct sta_info_debugfsdentries { - struct dentry *dir; - bool add_has_run; - } debugfs; + struct dentry *debugfs_dir; #endif enum ieee80211_sta_rx_bandwidth cur_max_bandwidth; @@ -676,4 +720,44 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta); void ieee80211_sta_ps_deliver_poll_response(struct sta_info *sta); void ieee80211_sta_ps_deliver_uapsd(struct sta_info *sta); +unsigned long ieee80211_sta_last_active(struct sta_info *sta); + +#define STA_STATS_RATE_INVALID 0 +#define STA_STATS_RATE_VHT 0x8000 +#define STA_STATS_RATE_HT 0x4000 +#define STA_STATS_RATE_LEGACY 0x2000 +#define STA_STATS_RATE_SGI 0x1000 +#define STA_STATS_RATE_BW_SHIFT 9 +#define STA_STATS_RATE_BW_MASK (0x7 << STA_STATS_RATE_BW_SHIFT) + +static inline u16 sta_stats_encode_rate(struct ieee80211_rx_status *s) +{ + u16 r = s->rate_idx; + + if (s->vht_flag & RX_VHT_FLAG_80MHZ) + r |= RATE_INFO_BW_80 << STA_STATS_RATE_BW_SHIFT; + else if (s->vht_flag & RX_VHT_FLAG_160MHZ) + r |= RATE_INFO_BW_160 << STA_STATS_RATE_BW_SHIFT; + else if (s->flag & RX_FLAG_40MHZ) + r |= RATE_INFO_BW_40 << STA_STATS_RATE_BW_SHIFT; + else if (s->flag & RX_FLAG_10MHZ) + r |= RATE_INFO_BW_10 << STA_STATS_RATE_BW_SHIFT; + else if (s->flag & RX_FLAG_5MHZ) + r |= RATE_INFO_BW_5 << STA_STATS_RATE_BW_SHIFT; + else + r |= RATE_INFO_BW_20 << STA_STATS_RATE_BW_SHIFT; + + if (s->flag & RX_FLAG_SHORT_GI) + r |= STA_STATS_RATE_SGI; + + if (s->flag & RX_FLAG_VHT) + r |= STA_STATS_RATE_VHT | (s->vht_nss << 4); + else if (s->flag & RX_FLAG_HT) + r |= STA_STATS_RATE_HT; + else + r |= STA_STATS_RATE_LEGACY | (s->band << 4); + + return r; +} + #endif /* STA_INFO_H */ diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 8b1b2ea03..c6d5c724e 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -188,7 +188,7 @@ static void ieee80211_frame_acked(struct sta_info *sta, struct sk_buff *skb) struct ieee80211_sub_if_data *sdata = sta->sdata; if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) - sta->rx_stats.last_rx = jiffies; + sta->status_stats.last_ack = jiffies; if (ieee80211_is_data_qos(mgmt->frame_control)) { struct ieee80211_hdr *hdr = (void *) skb->data; @@ -647,7 +647,7 @@ void ieee80211_tx_status_noskb(struct ieee80211_hw *hw, sta->status_stats.retry_count += retry_count; if (acked) { - sta->rx_stats.last_rx = jiffies; + sta->status_stats.last_ack = jiffies; if (sta->status_stats.lost_packets) sta->status_stats.lost_packets = 0; diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index a29ea813b..1c7d45a6d 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -47,7 +47,7 @@ static void ieee80211_tdls_add_ext_capab(struct ieee80211_sub_if_data *sdata, NL80211_FEATURE_TDLS_CHANNEL_SWITCH; bool wider_band = ieee80211_hw_check(&local->hw, TDLS_WIDER_BW) && !ifmgd->tdls_wider_bw_prohibited; - enum ieee80211_band band = ieee80211_get_sdata_band(sdata); + enum nl80211_band band = ieee80211_get_sdata_band(sdata); struct ieee80211_supported_band *sband = local->hw.wiphy->bands[band]; bool vht = sband && sband->vht_cap.vht_supported; u8 *pos = (void *)skb_put(skb, 10); @@ -184,7 +184,7 @@ static u16 ieee80211_get_tdls_sta_capab(struct ieee80211_sub_if_data *sdata, if (status_code != 0) return 0; - if (ieee80211_get_sdata_band(sdata) == IEEE80211_BAND_2GHZ) { + if (ieee80211_get_sdata_band(sdata) == NL80211_BAND_2GHZ) { return WLAN_CAPABILITY_SHORT_SLOT_TIME | WLAN_CAPABILITY_SHORT_PREAMBLE; } @@ -357,7 +357,7 @@ ieee80211_tdls_add_setup_start_ies(struct ieee80211_sub_if_data *sdata, u8 action_code, bool initiator, const u8 *extra_ies, size_t extra_ies_len) { - enum ieee80211_band band = ieee80211_get_sdata_band(sdata); + enum nl80211_band band = ieee80211_get_sdata_band(sdata); struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband; struct ieee80211_sta_ht_cap ht_cap; @@ -544,7 +544,7 @@ ieee80211_tdls_add_setup_cfm_ies(struct ieee80211_sub_if_data *sdata, struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; size_t offset = 0, noffset; struct sta_info *sta, *ap_sta; - enum ieee80211_band band = ieee80211_get_sdata_band(sdata); + enum nl80211_band band = ieee80211_get_sdata_band(sdata); u8 *pos; mutex_lock(&local->sta_mtx); @@ -611,7 +611,7 @@ ieee80211_tdls_add_setup_cfm_ies(struct ieee80211_sub_if_data *sdata, ieee80211_tdls_add_link_ie(sdata, skb, peer, initiator); /* only include VHT-operation if not on the 2.4GHz band */ - if (band != IEEE80211_BAND_2GHZ && sta->sta.vht_cap.vht_supported) { + if (band != NL80211_BAND_2GHZ && sta->sta.vht_cap.vht_supported) { /* * if both peers support WIDER_BW, we can expand the chandef to * a wider compatible one, up to 80MHz @@ -1773,7 +1773,7 @@ ieee80211_process_tdls_channel_switch_req(struct ieee80211_sub_if_data *sdata, u8 target_channel, oper_class; bool local_initiator; struct sta_info *sta; - enum ieee80211_band band; + enum nl80211_band band; struct ieee80211_tdls_data *tf = (void *)skb->data; struct ieee80211_rx_status *rx_status = IEEE80211_SKB_RXCB(skb); int baselen = offsetof(typeof(*tf), u.chan_switch_req.variable); @@ -1805,10 +1805,10 @@ ieee80211_process_tdls_channel_switch_req(struct ieee80211_sub_if_data *sdata, if ((oper_class == 112 || oper_class == 2 || oper_class == 3 || oper_class == 4 || oper_class == 5 || oper_class == 6) && target_channel < 14) - band = IEEE80211_BAND_5GHZ; + band = NL80211_BAND_5GHZ; else - band = target_channel < 14 ? IEEE80211_BAND_2GHZ : - IEEE80211_BAND_5GHZ; + band = target_channel < 14 ? NL80211_BAND_2GHZ : + NL80211_BAND_5GHZ; freq = ieee80211_channel_to_frequency(target_channel, band); if (freq == 0) { diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index 2b0a17ee9..77e4c53ba 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -1,3 +1,8 @@ +/* +* Portions of this file +* Copyright(c) 2016 Intel Deutschland GmbH +*/ + #if !defined(__MAC80211_DRIVER_TRACE) || defined(TRACE_HEADER_MULTI_READ) #define __MAC80211_DRIVER_TRACE @@ -396,7 +401,7 @@ TRACE_EVENT(drv_bss_info_changed, __field(u32, sync_device_ts) __field(u8, sync_dtim_count) __field(u32, basic_rates) - __array(int, mcast_rate, IEEE80211_NUM_BANDS) + __array(int, mcast_rate, NUM_NL80211_BANDS) __field(u16, ht_operation_mode) __field(s32, cqm_rssi_thold); __field(s32, cqm_rssi_hyst); @@ -899,6 +904,13 @@ DEFINE_EVENT(sta_event, drv_sta_pre_rcu_remove, TP_ARGS(local, sdata, sta) ); +DEFINE_EVENT(sta_event, drv_sync_rx_queues, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_sta *sta), + TP_ARGS(local, sdata, sta) +); + DEFINE_EVENT(sta_event, drv_sta_rate_tbl_update, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, @@ -1253,8 +1265,8 @@ TRACE_EVENT(drv_set_bitrate_mask, TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; - __entry->legacy_2g = mask->control[IEEE80211_BAND_2GHZ].legacy; - __entry->legacy_5g = mask->control[IEEE80211_BAND_5GHZ].legacy; + __entry->legacy_2g = mask->control[NL80211_BAND_2GHZ].legacy; + __entry->legacy_5g = mask->control[NL80211_BAND_5GHZ].legacy; ), TP_printk( diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 21f660239..203044379 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -150,7 +150,7 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx, rate = DIV_ROUND_UP(r->bitrate, 1 << shift); switch (sband->band) { - case IEEE80211_BAND_2GHZ: { + case NL80211_BAND_2GHZ: { u32 flag; if (tx->sdata->flags & IEEE80211_SDATA_OPERATING_GMODE) flag = IEEE80211_RATE_MANDATORY_G; @@ -160,13 +160,13 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx, mrate = r->bitrate; break; } - case IEEE80211_BAND_5GHZ: + case NL80211_BAND_5GHZ: if (r->flags & IEEE80211_RATE_MANDATORY_A) mrate = r->bitrate; break; - case IEEE80211_BAND_60GHZ: + case NL80211_BAND_60GHZ: /* TODO, for now fall through */ - case IEEE80211_NUM_BANDS: + case NUM_NL80211_BANDS: WARN_ON(1); break; } @@ -1329,6 +1329,10 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw, out: spin_unlock_bh(&txqi->queue.lock); + if (skb && skb_has_frag_list(skb) && + !ieee80211_hw_check(&local->hw, TX_FRAG_LIST)) + skb_linearize(skb); + return skb; } EXPORT_SYMBOL(ieee80211_tx_dequeue); @@ -1696,7 +1700,9 @@ static bool ieee80211_parse_tx_radiotap(struct ieee80211_local *local, bool rate_found = false; u8 rate_retries = 0; u16 rate_flags = 0; - u8 mcs_known, mcs_flags; + u8 mcs_known, mcs_flags, mcs_bw; + u16 vht_known; + u8 vht_mcs = 0, vht_nss = 0; int i; info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT | @@ -1772,11 +1778,38 @@ static bool ieee80211_parse_tx_radiotap(struct ieee80211_local *local, mcs_flags & IEEE80211_RADIOTAP_MCS_SGI) rate_flags |= IEEE80211_TX_RC_SHORT_GI; + mcs_bw = mcs_flags & IEEE80211_RADIOTAP_MCS_BW_MASK; if (mcs_known & IEEE80211_RADIOTAP_MCS_HAVE_BW && - mcs_flags & IEEE80211_RADIOTAP_MCS_BW_40) + mcs_bw == IEEE80211_RADIOTAP_MCS_BW_40) rate_flags |= IEEE80211_TX_RC_40_MHZ_WIDTH; break; + case IEEE80211_RADIOTAP_VHT: + vht_known = get_unaligned_le16(iterator.this_arg); + rate_found = true; + + rate_flags = IEEE80211_TX_RC_VHT_MCS; + if ((vht_known & IEEE80211_RADIOTAP_VHT_KNOWN_GI) && + (iterator.this_arg[2] & + IEEE80211_RADIOTAP_VHT_FLAG_SGI)) + rate_flags |= IEEE80211_TX_RC_SHORT_GI; + if (vht_known & + IEEE80211_RADIOTAP_VHT_KNOWN_BANDWIDTH) { + if (iterator.this_arg[3] == 1) + rate_flags |= + IEEE80211_TX_RC_40_MHZ_WIDTH; + else if (iterator.this_arg[3] == 4) + rate_flags |= + IEEE80211_TX_RC_80_MHZ_WIDTH; + else if (iterator.this_arg[3] == 11) + rate_flags |= + IEEE80211_TX_RC_160_MHZ_WIDTH; + } + + vht_mcs = iterator.this_arg[4] >> 4; + vht_nss = iterator.this_arg[4] & 0xF; + break; + /* * Please update the file * Documentation/networking/mac80211-injection.txt @@ -1802,6 +1835,9 @@ static bool ieee80211_parse_tx_radiotap(struct ieee80211_local *local, if (rate_flags & IEEE80211_TX_RC_MCS) { info->control.rates[0].idx = rate; + } else if (rate_flags & IEEE80211_TX_RC_VHT_MCS) { + ieee80211_rate_set_vht(info->control.rates, vht_mcs, + vht_nss); } else { for (i = 0; i < sband->n_bitrates; i++) { if (rate * 5 != sband->bitrates[i].bitrate) @@ -1812,6 +1848,9 @@ static bool ieee80211_parse_tx_radiotap(struct ieee80211_local *local, } } + if (info->control.rates[0].idx < 0) + info->control.flags &= ~IEEE80211_TX_CTRL_RATE_INJECT; + info->control.rates[0].flags = rate_flags; info->control.rates[0].count = min_t(u8, rate_retries + 1, local->hw.max_rate_tries); @@ -2099,7 +2138,7 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, u16 info_id = 0; struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_sub_if_data *ap_sdata; - enum ieee80211_band band; + enum nl80211_band band; int ret; if (IS_ERR(sta)) @@ -2186,7 +2225,7 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, } if (mppath && mpath) - mesh_path_del(mpath->sdata, mpath->dst); + mesh_path_del(sdata, mpath->dst); } /* @@ -2772,6 +2811,154 @@ void ieee80211_clear_fast_xmit(struct sta_info *sta) kfree_rcu(fast_tx, rcu_head); } +static bool ieee80211_amsdu_realloc_pad(struct ieee80211_local *local, + struct sk_buff *skb, int headroom, + int *subframe_len) +{ + int amsdu_len = *subframe_len + sizeof(struct ethhdr); + int padding = (4 - amsdu_len) & 3; + + if (skb_headroom(skb) < headroom || skb_tailroom(skb) < padding) { + I802_DEBUG_INC(local->tx_expand_skb_head); + + if (pskb_expand_head(skb, headroom, padding, GFP_ATOMIC)) { + wiphy_debug(local->hw.wiphy, + "failed to reallocate TX buffer\n"); + return false; + } + } + + if (padding) { + *subframe_len += padding; + memset(skb_put(skb, padding), 0, padding); + } + + return true; +} + +static bool ieee80211_amsdu_prepare_head(struct ieee80211_sub_if_data *sdata, + struct ieee80211_fast_tx *fast_tx, + struct sk_buff *skb) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + struct ieee80211_hdr *hdr; + struct ethhdr amsdu_hdr; + int hdr_len = fast_tx->hdr_len - sizeof(rfc1042_header); + int subframe_len = skb->len - hdr_len; + void *data; + u8 *qc; + + if (info->flags & IEEE80211_TX_CTL_RATE_CTRL_PROBE) + return false; + + if (info->control.flags & IEEE80211_TX_CTRL_AMSDU) + return true; + + if (!ieee80211_amsdu_realloc_pad(local, skb, sizeof(amsdu_hdr), + &subframe_len)) + return false; + + amsdu_hdr.h_proto = cpu_to_be16(subframe_len); + memcpy(amsdu_hdr.h_source, skb->data + fast_tx->sa_offs, ETH_ALEN); + memcpy(amsdu_hdr.h_dest, skb->data + fast_tx->da_offs, ETH_ALEN); + + data = skb_push(skb, sizeof(amsdu_hdr)); + memmove(data, data + sizeof(amsdu_hdr), hdr_len); + memcpy(data + hdr_len, &amsdu_hdr, sizeof(amsdu_hdr)); + + hdr = data; + qc = ieee80211_get_qos_ctl(hdr); + *qc |= IEEE80211_QOS_CTL_A_MSDU_PRESENT; + + info->control.flags |= IEEE80211_TX_CTRL_AMSDU; + + return true; +} + +static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, + struct ieee80211_fast_tx *fast_tx, + struct sk_buff *skb) +{ + struct ieee80211_local *local = sdata->local; + u8 tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK; + struct ieee80211_txq *txq = sta->sta.txq[tid]; + struct txq_info *txqi; + struct sk_buff **frag_tail, *head; + int subframe_len = skb->len - ETH_ALEN; + u8 max_subframes = sta->sta.max_amsdu_subframes; + int max_frags = local->hw.max_tx_fragments; + int max_amsdu_len = sta->sta.max_amsdu_len; + __be16 len; + void *data; + bool ret = false; + int n = 1, nfrags; + + if (!ieee80211_hw_check(&local->hw, TX_AMSDU)) + return false; + + if (!txq) + return false; + + txqi = to_txq_info(txq); + if (test_bit(IEEE80211_TXQ_NO_AMSDU, &txqi->flags)) + return false; + + if (sta->sta.max_rc_amsdu_len) + max_amsdu_len = min_t(int, max_amsdu_len, + sta->sta.max_rc_amsdu_len); + + spin_lock_bh(&txqi->queue.lock); + + head = skb_peek_tail(&txqi->queue); + if (!head) + goto out; + + if (skb->len + head->len > max_amsdu_len) + goto out; + + if (!ieee80211_amsdu_prepare_head(sdata, fast_tx, head)) + goto out; + + nfrags = 1 + skb_shinfo(skb)->nr_frags; + nfrags += 1 + skb_shinfo(head)->nr_frags; + frag_tail = &skb_shinfo(head)->frag_list; + while (*frag_tail) { + nfrags += 1 + skb_shinfo(*frag_tail)->nr_frags; + frag_tail = &(*frag_tail)->next; + n++; + } + + if (max_subframes && n > max_subframes) + goto out; + + if (max_frags && nfrags > max_frags) + goto out; + + if (!ieee80211_amsdu_realloc_pad(local, skb, sizeof(rfc1042_header) + 2, + &subframe_len)) + goto out; + + ret = true; + data = skb_push(skb, ETH_ALEN + 2); + memmove(data, data + ETH_ALEN + 2, 2 * ETH_ALEN); + + data += 2 * ETH_ALEN; + len = cpu_to_be16(subframe_len); + memcpy(data, &len, 2); + memcpy(data + 2, rfc1042_header, sizeof(rfc1042_header)); + + head->len += skb->len; + head->data_len += skb->len; + *frag_tail = skb; + +out: + spin_unlock_bh(&txqi->queue.lock); + + return ret; +} + static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, struct net_device *dev, struct sta_info *sta, struct ieee80211_fast_tx *fast_tx, @@ -2826,6 +3013,10 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, ieee80211_tx_stats(dev, skb->len + extra_head); + if ((hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) && + ieee80211_amsdu_aggregate(sdata, sta, fast_tx, skb)) + return true; + /* will not be crypto-handled beyond what we do here, so use false * as the may-encrypt argument for the resize to not account for * more room than we already have in 'extra_head' @@ -3406,7 +3597,7 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw, struct sk_buff *skb = NULL; struct ieee80211_tx_info *info; struct ieee80211_sub_if_data *sdata = NULL; - enum ieee80211_band band; + enum nl80211_band band; struct ieee80211_tx_rate_control txrc; struct ieee80211_chanctx_conf *chanctx_conf; int csa_off_base = 0; @@ -3974,7 +4165,7 @@ EXPORT_SYMBOL(ieee80211_unreserve_tid); void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int tid, - enum ieee80211_band band) + enum nl80211_band band) { int ac = ieee802_1d_to_ac[tid & 7]; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 7390de494..905003f75 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -59,7 +59,7 @@ void ieee80211_tx_set_protected(struct ieee80211_tx_data *tx) } } -int ieee80211_frame_duration(enum ieee80211_band band, size_t len, +int ieee80211_frame_duration(enum nl80211_band band, size_t len, int rate, int erp, int short_preamble, int shift) { @@ -77,7 +77,7 @@ int ieee80211_frame_duration(enum ieee80211_band band, size_t len, * is assumed to be 0 otherwise. */ - if (band == IEEE80211_BAND_5GHZ || erp) { + if (band == NL80211_BAND_5GHZ || erp) { /* * OFDM: * @@ -129,7 +129,7 @@ int ieee80211_frame_duration(enum ieee80211_band band, size_t len, /* Exported duration function for driver use */ __le16 ieee80211_generic_frame_duration(struct ieee80211_hw *hw, struct ieee80211_vif *vif, - enum ieee80211_band band, + enum nl80211_band band, size_t frame_len, struct ieee80211_rate *rate) { @@ -1129,7 +1129,7 @@ void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata, rcu_read_lock(); chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); use_11b = (chanctx_conf && - chanctx_conf->def.chan->band == IEEE80211_BAND_2GHZ) && + chanctx_conf->def.chan->band == NL80211_BAND_2GHZ) && !(sdata->flags & IEEE80211_SDATA_OPERATING_GMODE); rcu_read_unlock(); @@ -1301,7 +1301,7 @@ void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata, static int ieee80211_build_preq_ies_band(struct ieee80211_local *local, u8 *buffer, size_t buffer_len, const u8 *ie, size_t ie_len, - enum ieee80211_band band, + enum nl80211_band band, u32 rate_mask, struct cfg80211_chan_def *chandef, size_t *offset) @@ -1375,7 +1375,7 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local, pos += ext_rates_len; } - if (chandef->chan && sband->band == IEEE80211_BAND_2GHZ) { + if (chandef->chan && sband->band == NL80211_BAND_2GHZ) { if (end - pos < 3) goto out_err; *pos++ = WLAN_EID_DS_PARAMS; @@ -1479,7 +1479,7 @@ int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer, memset(ie_desc, 0, sizeof(*ie_desc)); - for (i = 0; i < IEEE80211_NUM_BANDS; i++) { + for (i = 0; i < NUM_NL80211_BANDS; i++) { if (bands_used & BIT(i)) { pos += ieee80211_build_preq_ies_band(local, buffer + pos, @@ -1522,7 +1522,7 @@ struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb; struct ieee80211_mgmt *mgmt; int ies_len; - u32 rate_masks[IEEE80211_NUM_BANDS] = {}; + u32 rate_masks[NUM_NL80211_BANDS] = {}; struct ieee80211_scan_ies dummy_ie_desc; /* @@ -1582,7 +1582,7 @@ void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems, - enum ieee80211_band band, u32 *basic_rates) + enum nl80211_band band, u32 *basic_rates) { struct ieee80211_supported_band *sband; size_t num_rates; @@ -2520,7 +2520,7 @@ int ieee80211_parse_bitrates(struct cfg80211_chan_def *chandef, int ieee80211_add_srates_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, bool need_basic, - enum ieee80211_band band) + enum nl80211_band band) { struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband; @@ -2565,7 +2565,7 @@ int ieee80211_add_srates_ie(struct ieee80211_sub_if_data *sdata, int ieee80211_add_ext_srates_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, bool need_basic, - enum ieee80211_band band) + enum nl80211_band band) { struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband; @@ -2711,7 +2711,7 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local, if (status->flag & RX_FLAG_MACTIME_PLCP_START) { /* TODO: handle HT/VHT preambles */ - if (status->band == IEEE80211_BAND_5GHZ) { + if (status->band == NL80211_BAND_5GHZ) { ts += 20 << shift; mpdu_offset += 2; } else if (status->flag & RX_FLAG_SHORTPRE) { @@ -2724,8 +2724,9 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local, rate = cfg80211_calculate_bitrate(&ri); if (WARN_ONCE(!rate, - "Invalid bitrate: flags=0x%x, idx=%d, vht_nss=%d\n", - status->flag, status->rate_idx, status->vht_nss)) + "Invalid bitrate: flags=0x%llx, idx=%d, vht_nss=%d\n", + (unsigned long long)status->flag, status->rate_idx, + status->vht_nss)) return 0; /* rewind from end of MPDU */ diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index e590e2ef9..ee715764a 100644 --- a/net/mac80211/vht.c +++ b/net/mac80211/vht.c @@ -418,7 +418,7 @@ void ieee80211_sta_set_rx_nss(struct sta_info *sta) u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, u8 opmode, - enum ieee80211_band band) + enum nl80211_band band) { struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband; @@ -504,7 +504,7 @@ EXPORT_SYMBOL_GPL(ieee80211_update_mu_groups); void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, u8 opmode, - enum ieee80211_band band) + enum nl80211_band band) { struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband = local->hw.wiphy->bands[band]; diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index 18848258a..b48c1e13e 100644 --- a/net/mac80211/wpa.c +++ b/net/mac80211/wpa.c @@ -504,25 +504,31 @@ ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx, !ieee80211_is_robust_mgmt_frame(skb)) return RX_CONTINUE; - data_len = skb->len - hdrlen - IEEE80211_CCMP_HDR_LEN - mic_len; - if (!rx->sta || data_len < 0) - return RX_DROP_UNUSABLE; - if (status->flag & RX_FLAG_DECRYPTED) { if (!pskb_may_pull(rx->skb, hdrlen + IEEE80211_CCMP_HDR_LEN)) return RX_DROP_UNUSABLE; + if (status->flag & RX_FLAG_MIC_STRIPPED) + mic_len = 0; } else { if (skb_linearize(rx->skb)) return RX_DROP_UNUSABLE; } + data_len = skb->len - hdrlen - IEEE80211_CCMP_HDR_LEN - mic_len; + if (!rx->sta || data_len < 0) + return RX_DROP_UNUSABLE; + if (!(status->flag & RX_FLAG_PN_VALIDATED)) { + int res; + ccmp_hdr2pn(pn, skb->data + hdrlen); queue = rx->security_idx; - if (memcmp(pn, key->u.ccmp.rx_pn[queue], - IEEE80211_CCMP_PN_LEN) <= 0) { + res = memcmp(pn, key->u.ccmp.rx_pn[queue], + IEEE80211_CCMP_PN_LEN); + if (res < 0 || + (!res && !(status->flag & RX_FLAG_ALLOW_SAME_PN))) { key->u.ccmp.replays++; return RX_DROP_UNUSABLE; } @@ -720,8 +726,7 @@ ieee80211_crypto_gcmp_decrypt(struct ieee80211_rx_data *rx) struct sk_buff *skb = rx->skb; struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); u8 pn[IEEE80211_GCMP_PN_LEN]; - int data_len; - int queue; + int data_len, queue, mic_len = IEEE80211_GCMP_MIC_LEN; hdrlen = ieee80211_hdrlen(hdr->frame_control); @@ -729,26 +734,31 @@ ieee80211_crypto_gcmp_decrypt(struct ieee80211_rx_data *rx) !ieee80211_is_robust_mgmt_frame(skb)) return RX_CONTINUE; - data_len = skb->len - hdrlen - IEEE80211_GCMP_HDR_LEN - - IEEE80211_GCMP_MIC_LEN; - if (!rx->sta || data_len < 0) - return RX_DROP_UNUSABLE; - if (status->flag & RX_FLAG_DECRYPTED) { if (!pskb_may_pull(rx->skb, hdrlen + IEEE80211_GCMP_HDR_LEN)) return RX_DROP_UNUSABLE; + if (status->flag & RX_FLAG_MIC_STRIPPED) + mic_len = 0; } else { if (skb_linearize(rx->skb)) return RX_DROP_UNUSABLE; } + data_len = skb->len - hdrlen - IEEE80211_GCMP_HDR_LEN - mic_len; + if (!rx->sta || data_len < 0) + return RX_DROP_UNUSABLE; + if (!(status->flag & RX_FLAG_PN_VALIDATED)) { + int res; + gcmp_hdr2pn(pn, skb->data + hdrlen); queue = rx->security_idx; - if (memcmp(pn, key->u.gcmp.rx_pn[queue], - IEEE80211_GCMP_PN_LEN) <= 0) { + res = memcmp(pn, key->u.gcmp.rx_pn[queue], + IEEE80211_GCMP_PN_LEN); + if (res < 0 || + (!res && !(status->flag & RX_FLAG_ALLOW_SAME_PN))) { key->u.gcmp.replays++; return RX_DROP_UNUSABLE; } @@ -772,7 +782,7 @@ ieee80211_crypto_gcmp_decrypt(struct ieee80211_rx_data *rx) } /* Remove GCMP header and MIC */ - if (pskb_trim(skb, skb->len - IEEE80211_GCMP_MIC_LEN)) + if (pskb_trim(skb, skb->len - mic_len)) return RX_DROP_UNUSABLE; memmove(skb->data + IEEE80211_GCMP_HDR_LEN, skb->data, hdrlen); skb_pull(skb, IEEE80211_GCMP_HDR_LEN); diff --git a/net/mpls/mpls_gso.c b/net/mpls/mpls_gso.c index 0183b32da..2055e57ed 100644 --- a/net/mpls/mpls_gso.c +++ b/net/mpls/mpls_gso.c @@ -26,14 +26,6 @@ static struct sk_buff *mpls_gso_segment(struct sk_buff *skb, netdev_features_t mpls_features; __be16 mpls_protocol; - if (unlikely(skb_shinfo(skb)->gso_type & - ~(SKB_GSO_TCPV4 | - SKB_GSO_TCPV6 | - SKB_GSO_UDP | - SKB_GSO_DODGY | - SKB_GSO_TCP_ECN))) - goto out; - /* Setup inner SKB. */ mpls_protocol = skb->protocol; skb->protocol = skb->inner_protocol; @@ -56,7 +48,7 @@ static struct sk_buff *mpls_gso_segment(struct sk_buff *skb, * skb_mac_gso_segment(), an indirect caller of this function. */ __skb_pull(skb, skb->data - skb_mac_header(skb)); -out: + return segs; } diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 85ca189bd..096a45103 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -104,6 +104,7 @@ static inline void ct_write_unlock_bh(unsigned int key) spin_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); } +static void ip_vs_conn_expire(unsigned long data); /* * Returns hash value for IPVS connection entry @@ -453,10 +454,16 @@ ip_vs_conn_out_get_proto(struct netns_ipvs *ipvs, int af, } EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto); +static void __ip_vs_conn_put_notimer(struct ip_vs_conn *cp) +{ + __ip_vs_conn_put(cp); + ip_vs_conn_expire((unsigned long)cp); +} + /* * Put back the conn and restart its timer with its timeout */ -void ip_vs_conn_put(struct ip_vs_conn *cp) +static void __ip_vs_conn_put_timer(struct ip_vs_conn *cp) { unsigned long t = (cp->flags & IP_VS_CONN_F_ONE_PACKET) ? 0 : cp->timeout; @@ -465,6 +472,16 @@ void ip_vs_conn_put(struct ip_vs_conn *cp) __ip_vs_conn_put(cp); } +void ip_vs_conn_put(struct ip_vs_conn *cp) +{ + if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && + (atomic_read(&cp->refcnt) == 1) && + !timer_pending(&cp->timer)) + /* expire connection immediately */ + __ip_vs_conn_put_notimer(cp); + else + __ip_vs_conn_put_timer(cp); +} /* * Fill a no_client_port connection with a client port number @@ -745,7 +762,7 @@ static int expire_quiescent_template(struct netns_ipvs *ipvs, * If available, return 1, otherwise invalidate this connection * template and return 0. */ -int ip_vs_check_template(struct ip_vs_conn *ct) +int ip_vs_check_template(struct ip_vs_conn *ct, struct ip_vs_dest *cdest) { struct ip_vs_dest *dest = ct->dest; struct netns_ipvs *ipvs = ct->ipvs; @@ -755,7 +772,8 @@ int ip_vs_check_template(struct ip_vs_conn *ct) */ if ((dest == NULL) || !(dest->flags & IP_VS_DEST_F_AVAILABLE) || - expire_quiescent_template(ipvs, dest)) { + expire_quiescent_template(ipvs, dest) || + (cdest && (dest != cdest))) { IP_VS_DBG_BUF(9, "check_template: dest not available for " "protocol %s s:%s:%d v:%s:%d " "-> d:%s:%d\n", @@ -819,7 +837,8 @@ static void ip_vs_conn_expire(unsigned long data) if (cp->control) ip_vs_control_del(cp); - if (cp->flags & IP_VS_CONN_F_NFCT) { + if ((cp->flags & IP_VS_CONN_F_NFCT) && + !(cp->flags & IP_VS_CONN_F_ONE_PACKET)) { /* Do not access conntracks during subsys cleanup * because nf_conntrack_find_get can not be used after * conntrack cleanup for the net. @@ -834,7 +853,10 @@ static void ip_vs_conn_expire(unsigned long data) ip_vs_unbind_dest(cp); if (cp->flags & IP_VS_CONN_F_NO_CPORT) atomic_dec(&ip_vs_conn_no_cport_cnt); - call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free); + if (cp->flags & IP_VS_CONN_F_ONE_PACKET) + ip_vs_conn_rcu_free(&cp->rcu_head); + else + call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free); atomic_dec(&ipvs->conn_count); return; } @@ -850,7 +872,7 @@ static void ip_vs_conn_expire(unsigned long data) if (ipvs->sync_state & IP_VS_STATE_MASTER) ip_vs_sync_conn(ipvs, cp, sysctl_sync_threshold(ipvs)); - ip_vs_conn_put(cp); + __ip_vs_conn_put_timer(cp); } /* Modify timer, so that it expires as soon as possible. @@ -1240,6 +1262,16 @@ static inline int todrop_entry(struct ip_vs_conn *cp) return 1; } +static inline bool ip_vs_conn_ops_mode(struct ip_vs_conn *cp) +{ + struct ip_vs_service *svc; + + if (!cp->dest) + return false; + svc = rcu_dereference(cp->dest->svc); + return svc && (svc->flags & IP_VS_SVC_F_ONEPACKET); +} + /* Called from keventd and must protect itself from softirqs */ void ip_vs_random_dropentry(struct netns_ipvs *ipvs) { @@ -1254,11 +1286,16 @@ void ip_vs_random_dropentry(struct netns_ipvs *ipvs) unsigned int hash = prandom_u32() & ip_vs_conn_tab_mask; hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { - if (cp->flags & IP_VS_CONN_F_TEMPLATE) - /* connection template */ - continue; if (cp->ipvs != ipvs) continue; + if (cp->flags & IP_VS_CONN_F_TEMPLATE) { + if (atomic_read(&cp->n_control) || + !ip_vs_conn_ops_mode(cp)) + continue; + else + /* connection template of OPS */ + goto try_drop; + } if (cp->protocol == IPPROTO_TCP) { switch(cp->state) { case IP_VS_TCP_S_SYN_RECV: @@ -1286,6 +1323,7 @@ void ip_vs_random_dropentry(struct netns_ipvs *ipvs) continue; } } else { +try_drop: if (!todrop_entry(cp)) continue; } diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index b9a4082af..2c1b498a7 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -68,6 +68,7 @@ EXPORT_SYMBOL(ip_vs_conn_put); #ifdef CONFIG_IP_VS_DEBUG EXPORT_SYMBOL(ip_vs_get_debug_level); #endif +EXPORT_SYMBOL(ip_vs_new_conn_out); static int ip_vs_net_id __read_mostly; /* netns cnt used for uniqueness */ @@ -320,7 +321,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc, /* Check if a template already exists */ ct = ip_vs_ct_in_get(¶m); - if (!ct || !ip_vs_check_template(ct)) { + if (!ct || !ip_vs_check_template(ct, NULL)) { struct ip_vs_scheduler *sched; /* @@ -611,7 +612,10 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, ret = cp->packet_xmit(skb, cp, pd->pp, iph); /* do not touch skb anymore */ - atomic_inc(&cp->in_pkts); + if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control) + atomic_inc(&cp->control->in_pkts); + else + atomic_inc(&cp->in_pkts); ip_vs_conn_put(cp); return ret; } @@ -1100,6 +1104,144 @@ static inline bool is_new_conn_expected(const struct ip_vs_conn *cp, } } +/* Generic function to create new connections for outgoing RS packets + * + * Pre-requisites for successful connection creation: + * 1) Virtual Service is NOT fwmark based: + * In fwmark-VS actual vaddr and vport are unknown to IPVS + * 2) Real Server and Virtual Service were NOT configured without port: + * This is to allow match of different VS to the same RS ip-addr + */ +struct ip_vs_conn *ip_vs_new_conn_out(struct ip_vs_service *svc, + struct ip_vs_dest *dest, + struct sk_buff *skb, + const struct ip_vs_iphdr *iph, + __be16 dport, + __be16 cport) +{ + struct ip_vs_conn_param param; + struct ip_vs_conn *ct = NULL, *cp = NULL; + const union nf_inet_addr *vaddr, *daddr, *caddr; + union nf_inet_addr snet; + __be16 vport; + unsigned int flags; + + EnterFunction(12); + vaddr = &svc->addr; + vport = svc->port; + daddr = &iph->saddr; + caddr = &iph->daddr; + + /* check pre-requisites are satisfied */ + if (svc->fwmark) + return NULL; + if (!vport || !dport) + return NULL; + + /* for persistent service first create connection template */ + if (svc->flags & IP_VS_SVC_F_PERSISTENT) { + /* apply netmask the same way ingress-side does */ +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) + ipv6_addr_prefix(&snet.in6, &caddr->in6, + (__force __u32)svc->netmask); + else +#endif + snet.ip = caddr->ip & svc->netmask; + /* fill params and create template if not existent */ + if (ip_vs_conn_fill_param_persist(svc, skb, iph->protocol, + &snet, 0, vaddr, + vport, ¶m) < 0) + return NULL; + ct = ip_vs_ct_in_get(¶m); + /* check if template exists and points to the same dest */ + if (!ct || !ip_vs_check_template(ct, dest)) { + ct = ip_vs_conn_new(¶m, dest->af, daddr, dport, + IP_VS_CONN_F_TEMPLATE, dest, 0); + if (!ct) { + kfree(param.pe_data); + return NULL; + } + ct->timeout = svc->timeout; + } else { + kfree(param.pe_data); + } + } + + /* connection flags */ + flags = ((svc->flags & IP_VS_SVC_F_ONEPACKET) && + iph->protocol == IPPROTO_UDP) ? IP_VS_CONN_F_ONE_PACKET : 0; + /* create connection */ + ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, + caddr, cport, vaddr, vport, ¶m); + cp = ip_vs_conn_new(¶m, dest->af, daddr, dport, flags, dest, 0); + if (!cp) { + if (ct) + ip_vs_conn_put(ct); + return NULL; + } + if (ct) { + ip_vs_control_add(cp, ct); + ip_vs_conn_put(ct); + } + ip_vs_conn_stats(cp, svc); + + /* return connection (will be used to handle outgoing packet) */ + IP_VS_DBG_BUF(6, "New connection RS-initiated:%c c:%s:%u v:%s:%u " + "d:%s:%u conn->flags:%X conn->refcnt:%d\n", + ip_vs_fwd_tag(cp), + IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), + IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), + cp->flags, atomic_read(&cp->refcnt)); + LeaveFunction(12); + return cp; +} + +/* Handle outgoing packets which are considered requests initiated by + * real servers, so that subsequent responses from external client can be + * routed to the right real server. + * Used also for outgoing responses in OPS mode. + * + * Connection management is handled by persistent-engine specific callback. + */ +static struct ip_vs_conn *__ip_vs_rs_conn_out(unsigned int hooknum, + struct netns_ipvs *ipvs, + int af, struct sk_buff *skb, + const struct ip_vs_iphdr *iph) +{ + struct ip_vs_dest *dest; + struct ip_vs_conn *cp = NULL; + __be16 _ports[2], *pptr; + + if (hooknum == NF_INET_LOCAL_IN) + return NULL; + + pptr = frag_safe_skb_hp(skb, iph->len, + sizeof(_ports), _ports, iph); + if (!pptr) + return NULL; + + rcu_read_lock(); + dest = ip_vs_find_real_service(ipvs, af, iph->protocol, + &iph->saddr, pptr[0]); + if (dest) { + struct ip_vs_service *svc; + struct ip_vs_pe *pe; + + svc = rcu_dereference(dest->svc); + if (svc) { + pe = rcu_dereference(svc->pe); + if (pe && pe->conn_out) + cp = pe->conn_out(svc, dest, skb, iph, + pptr[0], pptr[1]); + } + } + rcu_read_unlock(); + + return cp; +} + /* Handle response packets: rewrite addresses and send away... */ static unsigned int @@ -1245,6 +1387,22 @@ ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, in if (likely(cp)) return handle_response(af, skb, pd, cp, &iph, hooknum); + + /* Check for real-server-started requests */ + if (atomic_read(&ipvs->conn_out_counter)) { + /* Currently only for UDP: + * connection oriented protocols typically use + * ephemeral ports for outgoing connections, so + * related incoming responses would not match any VS + */ + if (pp->protocol == IPPROTO_UDP) { + cp = __ip_vs_rs_conn_out(hooknum, ipvs, af, skb, &iph); + if (likely(cp)) + return handle_response(af, skb, pd, cp, &iph, + hooknum); + } + } + if (sysctl_nat_icmp_send(ipvs) && (pp->protocol == IPPROTO_TCP || pp->protocol == IPPROTO_UDP || @@ -1837,6 +1995,9 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int if (ipvs->sync_state & IP_VS_STATE_MASTER) ip_vs_sync_conn(ipvs, cp, pkts); + else if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control) + /* increment is done inside ip_vs_sync_conn too */ + atomic_inc(&cp->control->in_pkts); ip_vs_conn_put(cp); return ret; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 404b2a4f4..c3c809b2e 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -567,6 +567,36 @@ bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol, return false; } +/* Find real service record by <proto,addr,port>. + * In case of multiple records with the same <proto,addr,port>, only + * the first found record is returned. + * + * To be called under RCU lock. + */ +struct ip_vs_dest *ip_vs_find_real_service(struct netns_ipvs *ipvs, int af, + __u16 protocol, + const union nf_inet_addr *daddr, + __be16 dport) +{ + unsigned int hash; + struct ip_vs_dest *dest; + + /* Check for "full" addressed entries */ + hash = ip_vs_rs_hashkey(af, daddr, dport); + + hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) { + if (dest->port == dport && + dest->af == af && + ip_vs_addr_equal(af, &dest->addr, daddr) && + (dest->protocol == protocol || dest->vfwmark)) { + /* HIT */ + return dest; + } + } + + return NULL; +} + /* Lookup destination by {addr,port} in the given service * Called under RCU lock. */ @@ -1253,6 +1283,8 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, atomic_inc(&ipvs->ftpsvc_counter); else if (svc->port == 0) atomic_inc(&ipvs->nullsvc_counter); + if (svc->pe && svc->pe->conn_out) + atomic_inc(&ipvs->conn_out_counter); ip_vs_start_estimator(ipvs, &svc->stats); @@ -1293,6 +1325,7 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) struct ip_vs_scheduler *sched = NULL, *old_sched; struct ip_vs_pe *pe = NULL, *old_pe = NULL; int ret = 0; + bool new_pe_conn_out, old_pe_conn_out; /* * Lookup the scheduler, by 'u->sched_name' @@ -1355,8 +1388,16 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) svc->netmask = u->netmask; old_pe = rcu_dereference_protected(svc->pe, 1); - if (pe != old_pe) + if (pe != old_pe) { rcu_assign_pointer(svc->pe, pe); + /* check for optional methods in new pe */ + new_pe_conn_out = (pe && pe->conn_out) ? true : false; + old_pe_conn_out = (old_pe && old_pe->conn_out) ? true : false; + if (new_pe_conn_out && !old_pe_conn_out) + atomic_inc(&svc->ipvs->conn_out_counter); + if (old_pe_conn_out && !new_pe_conn_out) + atomic_dec(&svc->ipvs->conn_out_counter); + } out: ip_vs_scheduler_put(old_sched); @@ -1389,6 +1430,8 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) /* Unbind persistence engine, keep svc->pe */ old_pe = rcu_dereference_protected(svc->pe, 1); + if (old_pe && old_pe->conn_out) + atomic_dec(&ipvs->conn_out_counter); ip_vs_pe_put(old_pe); /* @@ -2875,8 +2918,10 @@ static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, if (nla_put_u32(skb, IPVS_STATS_ATTR_CONNS, (u32)kstats->conns) || nla_put_u32(skb, IPVS_STATS_ATTR_INPKTS, (u32)kstats->inpkts) || nla_put_u32(skb, IPVS_STATS_ATTR_OUTPKTS, (u32)kstats->outpkts) || - nla_put_u64(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes) || - nla_put_u64(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes) || + nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes, + IPVS_STATS_ATTR_PAD) || + nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes, + IPVS_STATS_ATTR_PAD) || nla_put_u32(skb, IPVS_STATS_ATTR_CPS, (u32)kstats->cps) || nla_put_u32(skb, IPVS_STATS_ATTR_INPPS, (u32)kstats->inpps) || nla_put_u32(skb, IPVS_STATS_ATTR_OUTPPS, (u32)kstats->outpps) || @@ -2900,16 +2945,26 @@ static int ip_vs_genl_fill_stats64(struct sk_buff *skb, int container_type, if (!nl_stats) return -EMSGSIZE; - if (nla_put_u64(skb, IPVS_STATS_ATTR_CONNS, kstats->conns) || - nla_put_u64(skb, IPVS_STATS_ATTR_INPKTS, kstats->inpkts) || - nla_put_u64(skb, IPVS_STATS_ATTR_OUTPKTS, kstats->outpkts) || - nla_put_u64(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes) || - nla_put_u64(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes) || - nla_put_u64(skb, IPVS_STATS_ATTR_CPS, kstats->cps) || - nla_put_u64(skb, IPVS_STATS_ATTR_INPPS, kstats->inpps) || - nla_put_u64(skb, IPVS_STATS_ATTR_OUTPPS, kstats->outpps) || - nla_put_u64(skb, IPVS_STATS_ATTR_INBPS, kstats->inbps) || - nla_put_u64(skb, IPVS_STATS_ATTR_OUTBPS, kstats->outbps)) + if (nla_put_u64_64bit(skb, IPVS_STATS_ATTR_CONNS, kstats->conns, + IPVS_STATS_ATTR_PAD) || + nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INPKTS, kstats->inpkts, + IPVS_STATS_ATTR_PAD) || + nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTPKTS, kstats->outpkts, + IPVS_STATS_ATTR_PAD) || + nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes, + IPVS_STATS_ATTR_PAD) || + nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes, + IPVS_STATS_ATTR_PAD) || + nla_put_u64_64bit(skb, IPVS_STATS_ATTR_CPS, kstats->cps, + IPVS_STATS_ATTR_PAD) || + nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INPPS, kstats->inpps, + IPVS_STATS_ATTR_PAD) || + nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTPPS, kstats->outpps, + IPVS_STATS_ATTR_PAD) || + nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBPS, kstats->inbps, + IPVS_STATS_ATTR_PAD) || + nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBPS, kstats->outbps, + IPVS_STATS_ATTR_PAD)) goto nla_put_failure; nla_nest_end(skb, nl_stats); @@ -3957,6 +4012,7 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) (unsigned long) ipvs); atomic_set(&ipvs->ftpsvc_counter, 0); atomic_set(&ipvs->nullsvc_counter, 0); + atomic_set(&ipvs->conn_out_counter, 0); /* procfs stats */ ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c index 30434fb13..f04fd8df2 100644 --- a/net/netfilter/ipvs/ip_vs_nfct.c +++ b/net/netfilter/ipvs/ip_vs_nfct.c @@ -93,6 +93,10 @@ ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin) if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) return; + /* Never alter conntrack for OPS conns (no reply is expected) */ + if (cp->flags & IP_VS_CONN_F_ONE_PACKET) + return; + /* Alter reply only in original direction */ if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) return; diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c index 0a6eb5c0d..d07ef9e31 100644 --- a/net/netfilter/ipvs/ip_vs_pe_sip.c +++ b/net/netfilter/ipvs/ip_vs_pe_sip.c @@ -143,6 +143,20 @@ static int ip_vs_sip_show_pe_data(const struct ip_vs_conn *cp, char *buf) return cp->pe_data_len; } +static struct ip_vs_conn * +ip_vs_sip_conn_out(struct ip_vs_service *svc, + struct ip_vs_dest *dest, + struct sk_buff *skb, + const struct ip_vs_iphdr *iph, + __be16 dport, + __be16 cport) +{ + if (likely(iph->protocol == IPPROTO_UDP)) + return ip_vs_new_conn_out(svc, dest, skb, iph, dport, cport); + /* currently no need to handle other than UDP */ + return NULL; +} + static struct ip_vs_pe ip_vs_sip_pe = { .name = "sip", @@ -153,6 +167,7 @@ static struct ip_vs_pe ip_vs_sip_pe = .ct_match = ip_vs_sip_ct_match, .hashkey_raw = ip_vs_sip_hashkey_raw, .show_pe_data = ip_vs_sip_show_pe_data, + .conn_out = ip_vs_sip_conn_out, }; static int __init ip_vs_sip_init(void) diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 803001a45..1b07578be 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1545,7 +1545,8 @@ error: /* * Set up receiving multicast socket over UDP */ -static struct socket *make_receive_sock(struct netns_ipvs *ipvs, int id) +static struct socket *make_receive_sock(struct netns_ipvs *ipvs, int id, + int ifindex) { /* multicast addr */ union ipvs_sockaddr mcast_addr; @@ -1566,6 +1567,7 @@ static struct socket *make_receive_sock(struct netns_ipvs *ipvs, int id) set_sock_size(sock->sk, 0, result); get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->bcfg, id); + sock->sk->sk_bound_dev_if = ifindex; result = sock->ops->bind(sock, (struct sockaddr *)&mcast_addr, salen); if (result < 0) { pr_err("Error binding to the multicast addr\n"); @@ -1868,7 +1870,7 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, if (state == IP_VS_STATE_MASTER) sock = make_send_sock(ipvs, id); else - sock = make_receive_sock(ipvs, id); + sock = make_receive_sock(ipvs, id, dev->ifindex); if (IS_ERR(sock)) { result = PTR_ERR(sock); goto outtinfo; diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index dc196a0f5..01d3d894d 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -932,17 +932,14 @@ error: static inline int __tun_gso_type_mask(int encaps_af, int orig_af) { - if (encaps_af == AF_INET) { - if (orig_af == AF_INET) - return SKB_GSO_IPIP; - - return SKB_GSO_SIT; + switch (encaps_af) { + case AF_INET: + return SKB_GSO_IPXIP4; + case AF_INET6: + return SKB_GSO_IPXIP6; + default: + return 0; } - - /* GSO: we need to provide proper SKB_GSO_ value for IPv6: - * SKB_GSO_SIT/IPV6 - */ - return 0; } /* @@ -1013,8 +1010,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, if (IS_ERR(skb)) goto tx_error; - skb = iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET, cp->af)); - if (IS_ERR(skb)) + if (iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET, cp->af))) goto tx_error; skb->transport_header = skb->network_header; @@ -1105,8 +1101,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, if (IS_ERR(skb)) goto tx_error; - skb = iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET6, cp->af)); - if (IS_ERR(skb)) + if (iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET6, cp->af))) goto tx_error; skb->transport_header = skb->network_header; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index e27fd17c6..9f530adad 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -12,6 +12,8 @@ * published by the Free Software Foundation. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/types.h> #include <linux/netfilter.h> #include <linux/module.h> @@ -52,6 +54,7 @@ #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_core.h> #include <net/netfilter/nf_nat_helper.h> +#include <net/netns/hash.h> #define NF_CONNTRACK_VERSION "0.5.0" @@ -66,6 +69,12 @@ EXPORT_SYMBOL_GPL(nf_conntrack_locks); __cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock); EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock); +struct hlist_nulls_head *nf_conntrack_hash __read_mostly; +EXPORT_SYMBOL_GPL(nf_conntrack_hash); + +static __read_mostly struct kmem_cache *nf_conntrack_cachep; +static __read_mostly spinlock_t nf_conntrack_locks_all_lock; +static __read_mostly seqcount_t nf_conntrack_generation; static __read_mostly DEFINE_SPINLOCK(nf_conntrack_locks_all_lock); static __read_mostly bool nf_conntrack_locks_all; @@ -105,7 +114,7 @@ static bool nf_conntrack_double_lock(struct net *net, unsigned int h1, spin_lock_nested(&nf_conntrack_locks[h1], SINGLE_DEPTH_NESTING); } - if (read_seqcount_retry(&net->ct.generation, sequence)) { + if (read_seqcount_retry(&nf_conntrack_generation, sequence)) { nf_conntrack_double_unlock(h1, h2); return true; } @@ -139,43 +148,43 @@ EXPORT_SYMBOL_GPL(nf_conntrack_max); DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked); EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked); -unsigned int nf_conntrack_hash_rnd __read_mostly; -EXPORT_SYMBOL_GPL(nf_conntrack_hash_rnd); +static unsigned int nf_conntrack_hash_rnd __read_mostly; -static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple) +static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, + const struct net *net) { unsigned int n; + u32 seed; + + get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd)); /* The direction must be ignored, so we hash everything up to the * destination ports (which is a multiple of 4) and treat the last * three bytes manually. */ + seed = nf_conntrack_hash_rnd ^ net_hash_mix(net); n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32); - return jhash2((u32 *)tuple, n, nf_conntrack_hash_rnd ^ + return jhash2((u32 *)tuple, n, seed ^ (((__force __u16)tuple->dst.u.all << 16) | tuple->dst.protonum)); } -static u32 __hash_bucket(u32 hash, unsigned int size) -{ - return reciprocal_scale(hash, size); -} - -static u32 hash_bucket(u32 hash, const struct net *net) +static u32 scale_hash(u32 hash) { - return __hash_bucket(hash, net->ct.htable_size); + return reciprocal_scale(hash, nf_conntrack_htable_size); } -static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple, - unsigned int size) +static u32 __hash_conntrack(const struct net *net, + const struct nf_conntrack_tuple *tuple, + unsigned int size) { - return __hash_bucket(hash_conntrack_raw(tuple), size); + return reciprocal_scale(hash_conntrack_raw(tuple, net), size); } -static inline u_int32_t hash_conntrack(const struct net *net, - const struct nf_conntrack_tuple *tuple) +static u32 hash_conntrack(const struct net *net, + const struct nf_conntrack_tuple *tuple) { - return __hash_conntrack(tuple, net->ct.htable_size); + return scale_hash(hash_conntrack_raw(tuple, net)); } bool @@ -356,7 +365,7 @@ destroy_conntrack(struct nf_conntrack *nfct) } rcu_read_lock(); l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); - if (l4proto && l4proto->destroy) + if (l4proto->destroy) l4proto->destroy(ct); rcu_read_unlock(); @@ -391,7 +400,7 @@ static void nf_ct_delete_from_lists(struct nf_conn *ct) local_bh_disable(); do { - sequence = read_seqcount_begin(&net->ct.generation); + sequence = read_seqcount_begin(&nf_conntrack_generation); hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); reply_hash = hash_conntrack(net, @@ -443,7 +452,8 @@ static void death_by_timeout(unsigned long ul_conntrack) static inline bool nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) + const struct nf_conntrack_zone *zone, + const struct net *net) { struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); @@ -452,7 +462,8 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, */ return nf_ct_tuple_equal(tuple, &h->tuple) && nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) && - nf_ct_is_confirmed(ct); + nf_ct_is_confirmed(ct) && + net_eq(net, nf_ct_net(ct)); } /* @@ -465,21 +476,23 @@ ____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple, u32 hash) { struct nf_conntrack_tuple_hash *h; + struct hlist_nulls_head *ct_hash; struct hlist_nulls_node *n; - unsigned int bucket = hash_bucket(hash, net); + unsigned int bucket, sequence; - /* Disable BHs the entire time since we normally need to disable them - * at least once for the stats anyway. - */ - local_bh_disable(); begin: - hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[bucket], hnnode) { - if (nf_ct_key_equal(h, tuple, zone)) { - NF_CT_STAT_INC(net, found); - local_bh_enable(); + do { + sequence = read_seqcount_begin(&nf_conntrack_generation); + bucket = scale_hash(hash); + ct_hash = nf_conntrack_hash; + } while (read_seqcount_retry(&nf_conntrack_generation, sequence)); + + hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) { + if (nf_ct_key_equal(h, tuple, zone, net)) { + NF_CT_STAT_INC_ATOMIC(net, found); return h; } - NF_CT_STAT_INC(net, searched); + NF_CT_STAT_INC_ATOMIC(net, searched); } /* * if the nulls value we got at the end of this lookup is @@ -487,10 +500,9 @@ begin: * We probably met an item that was moved to another chain. */ if (get_nulls_value(n) != bucket) { - NF_CT_STAT_INC(net, search_restart); + NF_CT_STAT_INC_ATOMIC(net, search_restart); goto begin; } - local_bh_enable(); return NULL; } @@ -512,7 +524,7 @@ begin: !atomic_inc_not_zero(&ct->ct_general.use))) h = NULL; else { - if (unlikely(!nf_ct_key_equal(h, tuple, zone))) { + if (unlikely(!nf_ct_key_equal(h, tuple, zone, net))) { nf_ct_put(ct); goto begin; } @@ -528,7 +540,7 @@ nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { return __nf_conntrack_find_get(net, zone, tuple, - hash_conntrack_raw(tuple)); + hash_conntrack_raw(tuple, net)); } EXPORT_SYMBOL_GPL(nf_conntrack_find_get); @@ -536,12 +548,10 @@ static void __nf_conntrack_hash_insert(struct nf_conn *ct, unsigned int hash, unsigned int reply_hash) { - struct net *net = nf_ct_net(ct); - hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, - &net->ct.hash[hash]); + &nf_conntrack_hash[hash]); hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode, - &net->ct.hash[reply_hash]); + &nf_conntrack_hash[reply_hash]); } int @@ -558,7 +568,7 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) local_bh_disable(); do { - sequence = read_seqcount_begin(&net->ct.generation); + sequence = read_seqcount_begin(&nf_conntrack_generation); hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); reply_hash = hash_conntrack(net, @@ -566,17 +576,14 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); /* See if there's one in the list already, including reverse */ - hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) - if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, - &h->tuple) && - nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone, - NF_CT_DIRECTION(h))) + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) + if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + zone, net)) goto out; - hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode) - if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, - &h->tuple) && - nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone, - NF_CT_DIRECTION(h))) + + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) + if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, + zone, net)) goto out; add_timer(&ct->timeout); @@ -597,6 +604,63 @@ out: } EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); +static inline void nf_ct_acct_update(struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned int len) +{ + struct nf_conn_acct *acct; + + acct = nf_conn_acct_find(ct); + if (acct) { + struct nf_conn_counter *counter = acct->counter; + + atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets); + atomic64_add(len, &counter[CTINFO2DIR(ctinfo)].bytes); + } +} + +static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo, + const struct nf_conn *loser_ct) +{ + struct nf_conn_acct *acct; + + acct = nf_conn_acct_find(loser_ct); + if (acct) { + struct nf_conn_counter *counter = acct->counter; + unsigned int bytes; + + /* u32 should be fine since we must have seen one packet. */ + bytes = atomic64_read(&counter[CTINFO2DIR(ctinfo)].bytes); + nf_ct_acct_update(ct, ctinfo, bytes); + } +} + +/* Resolve race on insertion if this protocol allows this. */ +static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb, + enum ip_conntrack_info ctinfo, + struct nf_conntrack_tuple_hash *h) +{ + /* This is the conntrack entry already in hashes that won race. */ + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + struct nf_conntrack_l4proto *l4proto; + + l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); + if (l4proto->allow_clash && + !nfct_nat(ct) && + !nf_ct_is_dying(ct) && + atomic_inc_not_zero(&ct->ct_general.use)) { + nf_ct_acct_merge(ct, ctinfo, (struct nf_conn *)skb->nfct); + nf_conntrack_put(skb->nfct); + /* Assign conntrack already in hashes to this skbuff. Don't + * modify skb->nfctinfo to ensure consistent stateful filtering. + */ + skb->nfct = &ct->ct_general; + return NF_ACCEPT; + } + NF_CT_STAT_INC(net, drop); + return NF_DROP; +} + /* Confirm a connection given skb; places it in hash table */ int __nf_conntrack_confirm(struct sk_buff *skb) @@ -611,6 +675,7 @@ __nf_conntrack_confirm(struct sk_buff *skb) enum ip_conntrack_info ctinfo; struct net *net; unsigned int sequence; + int ret = NF_DROP; ct = nf_ct_get(skb, &ctinfo); net = nf_ct_net(ct); @@ -626,10 +691,10 @@ __nf_conntrack_confirm(struct sk_buff *skb) local_bh_disable(); do { - sequence = read_seqcount_begin(&net->ct.generation); + sequence = read_seqcount_begin(&nf_conntrack_generation); /* reuse the hash saved before */ hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev; - hash = hash_bucket(hash, net); + hash = scale_hash(hash); reply_hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); @@ -653,23 +718,22 @@ __nf_conntrack_confirm(struct sk_buff *skb) */ nf_ct_del_from_dying_or_unconfirmed_list(ct); - if (unlikely(nf_ct_is_dying(ct))) - goto out; + if (unlikely(nf_ct_is_dying(ct))) { + nf_ct_add_to_dying_list(ct); + goto dying; + } /* See if there's one in the list already, including reverse: NAT could have grabbed it without realizing, since we're not in the hash. If there is, we lost race. */ - hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) - if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, - &h->tuple) && - nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone, - NF_CT_DIRECTION(h))) + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) + if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + zone, net)) goto out; - hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode) - if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, - &h->tuple) && - nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone, - NF_CT_DIRECTION(h))) + + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) + if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, + zone, net)) goto out; /* Timer relative to confirmation time, not original @@ -708,10 +772,12 @@ __nf_conntrack_confirm(struct sk_buff *skb) out: nf_ct_add_to_dying_list(ct); + ret = nf_ct_resolve_clash(net, skb, ctinfo, h); +dying: nf_conntrack_double_unlock(hash, reply_hash); NF_CT_STAT_INC(net, insert_failed); local_bh_enable(); - return NF_DROP; + return ret; } EXPORT_SYMBOL_GPL(__nf_conntrack_confirm); @@ -724,29 +790,31 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, struct net *net = nf_ct_net(ignored_conntrack); const struct nf_conntrack_zone *zone; struct nf_conntrack_tuple_hash *h; + struct hlist_nulls_head *ct_hash; + unsigned int hash, sequence; struct hlist_nulls_node *n; struct nf_conn *ct; - unsigned int hash; zone = nf_ct_zone(ignored_conntrack); - hash = hash_conntrack(net, tuple); - /* Disable BHs the entire time since we need to disable them at - * least once for the stats anyway. - */ - rcu_read_lock_bh(); - hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) { + rcu_read_lock(); + do { + sequence = read_seqcount_begin(&nf_conntrack_generation); + hash = hash_conntrack(net, tuple); + ct_hash = nf_conntrack_hash; + } while (read_seqcount_retry(&nf_conntrack_generation, sequence)); + + hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) { ct = nf_ct_tuplehash_to_ctrack(h); if (ct != ignored_conntrack && - nf_ct_tuple_equal(tuple, &h->tuple) && - nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h))) { - NF_CT_STAT_INC(net, found); - rcu_read_unlock_bh(); + nf_ct_key_equal(h, tuple, zone, net)) { + NF_CT_STAT_INC_ATOMIC(net, found); + rcu_read_unlock(); return 1; } - NF_CT_STAT_INC(net, searched); + NF_CT_STAT_INC_ATOMIC(net, searched); } - rcu_read_unlock_bh(); + rcu_read_unlock(); return 0; } @@ -760,71 +828,63 @@ static noinline int early_drop(struct net *net, unsigned int _hash) { /* Use oldest entry, which is roughly LRU */ struct nf_conntrack_tuple_hash *h; - struct nf_conn *ct = NULL, *tmp; + struct nf_conn *tmp; struct hlist_nulls_node *n; - unsigned int i = 0, cnt = 0; - int dropped = 0; - unsigned int hash, sequence; + unsigned int i, hash, sequence; + struct nf_conn *ct = NULL; spinlock_t *lockp; + bool ret = false; + + i = 0; local_bh_disable(); restart: - sequence = read_seqcount_begin(&net->ct.generation); - hash = hash_bucket(_hash, net); - for (; i < net->ct.htable_size; i++) { + sequence = read_seqcount_begin(&nf_conntrack_generation); + for (; i < NF_CT_EVICTION_RANGE; i++) { + hash = scale_hash(_hash++); lockp = &nf_conntrack_locks[hash % CONNTRACK_LOCKS]; nf_conntrack_lock(lockp); - if (read_seqcount_retry(&net->ct.generation, sequence)) { + if (read_seqcount_retry(&nf_conntrack_generation, sequence)) { spin_unlock(lockp); goto restart; } - hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], - hnnode) { + hlist_nulls_for_each_entry_rcu(h, n, &nf_conntrack_hash[hash], + hnnode) { tmp = nf_ct_tuplehash_to_ctrack(h); - if (!test_bit(IPS_ASSURED_BIT, &tmp->status) && - !nf_ct_is_dying(tmp) && - atomic_inc_not_zero(&tmp->ct_general.use)) { + + if (test_bit(IPS_ASSURED_BIT, &tmp->status) || + !net_eq(nf_ct_net(tmp), net) || + nf_ct_is_dying(tmp)) + continue; + + if (atomic_inc_not_zero(&tmp->ct_general.use)) { ct = tmp; break; } - cnt++; } - hash = (hash + 1) % net->ct.htable_size; spin_unlock(lockp); - - if (ct || cnt >= NF_CT_EVICTION_RANGE) + if (ct) break; - } + local_bh_enable(); if (!ct) - return dropped; + return false; - if (del_timer(&ct->timeout)) { + /* kill only if in same netns -- might have moved due to + * SLAB_DESTROY_BY_RCU rules + */ + if (net_eq(nf_ct_net(ct), net) && del_timer(&ct->timeout)) { if (nf_ct_delete(ct, 0, 0)) { - dropped = 1; NF_CT_STAT_INC_ATOMIC(net, early_drop); + ret = true; } } - nf_ct_put(ct); - return dropped; -} -void init_nf_conntrack_hash_rnd(void) -{ - unsigned int rand; - - /* - * Why not initialize nf_conntrack_rnd in a "init()" function ? - * Because there isn't enough entropy when system initializing, - * and we initialize it as late as possible. - */ - do { - get_random_bytes(&rand, sizeof(rand)); - } while (!rand); - cmpxchg(&nf_conntrack_hash_rnd, 0, rand); + nf_ct_put(ct); + return ret; } static struct nf_conn * @@ -836,12 +896,6 @@ __nf_conntrack_alloc(struct net *net, { struct nf_conn *ct; - if (unlikely(!nf_conntrack_hash_rnd)) { - init_nf_conntrack_hash_rnd(); - /* recompute the hash as nf_conntrack_hash_rnd is initialized */ - hash = hash_conntrack_raw(orig); - } - /* We don't want any race condition at early drop stage */ atomic_inc(&net->ct.count); @@ -858,7 +912,7 @@ __nf_conntrack_alloc(struct net *net, * Do not use kmem_cache_zalloc(), as this cache uses * SLAB_DESTROY_BY_RCU. */ - ct = kmem_cache_alloc(net->ct.nf_conntrack_cachep, gfp); + ct = kmem_cache_alloc(nf_conntrack_cachep, gfp); if (ct == NULL) goto out; @@ -885,7 +939,7 @@ __nf_conntrack_alloc(struct net *net, atomic_set(&ct->ct_general.use, 0); return ct; out_free: - kmem_cache_free(net->ct.nf_conntrack_cachep, ct); + kmem_cache_free(nf_conntrack_cachep, ct); out: atomic_dec(&net->ct.count); return ERR_PTR(-ENOMEM); @@ -912,7 +966,7 @@ void nf_conntrack_free(struct nf_conn *ct) nf_ct_ext_destroy(ct); nf_ct_ext_free(ct); - kmem_cache_free(net->ct.nf_conntrack_cachep, ct); + kmem_cache_free(nf_conntrack_cachep, ct); smp_mb__before_atomic(); atomic_dec(&net->ct.count); } @@ -966,7 +1020,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, if (!l4proto->new(ct, skb, dataoff, timeouts)) { nf_conntrack_free(ct); - pr_debug("init conntrack: can't track with proto module\n"); + pr_debug("can't track with proto module\n"); return NULL; } @@ -988,7 +1042,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, spin_lock(&nf_conntrack_expect_lock); exp = nf_ct_find_expectation(net, zone, tuple); if (exp) { - pr_debug("conntrack: expectation arrives ct=%p exp=%p\n", + pr_debug("expectation arrives ct=%p exp=%p\n", ct, exp); /* Welcome, Mr. Bond. We've been expecting you... */ __set_bit(IPS_EXPECTED_BIT, &ct->status); @@ -1053,13 +1107,13 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl, if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num, protonum, net, &tuple, l3proto, l4proto)) { - pr_debug("resolve_normal_ct: Can't get tuple\n"); + pr_debug("Can't get tuple\n"); return NULL; } /* look for tuple match */ zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); - hash = hash_conntrack_raw(&tuple); + hash = hash_conntrack_raw(&tuple, net); h = __nf_conntrack_find_get(net, zone, &tuple, hash); if (!h) { h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto, @@ -1079,14 +1133,13 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl, } else { /* Once we've had two way comms, always ESTABLISHED. */ if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { - pr_debug("nf_conntrack_in: normal packet for %p\n", ct); + pr_debug("normal packet for %p\n", ct); *ctinfo = IP_CT_ESTABLISHED; } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) { - pr_debug("nf_conntrack_in: related packet for %p\n", - ct); + pr_debug("related packet for %p\n", ct); *ctinfo = IP_CT_RELATED; } else { - pr_debug("nf_conntrack_in: new packet for %p\n", ct); + pr_debug("new packet for %p\n", ct); *ctinfo = IP_CT_NEW; } *set_reply = 0; @@ -1269,17 +1322,8 @@ void __nf_ct_refresh_acct(struct nf_conn *ct, } acct: - if (do_acct) { - struct nf_conn_acct *acct; - - acct = nf_conn_acct_find(ct); - if (acct) { - struct nf_conn_counter *counter = acct->counter; - - atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets); - atomic64_add(skb->len, &counter[CTINFO2DIR(ctinfo)].bytes); - } - } + if (do_acct) + nf_ct_acct_update(ct, ctinfo, skb->len); } EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct); @@ -1288,18 +1332,8 @@ bool __nf_ct_kill_acct(struct nf_conn *ct, const struct sk_buff *skb, int do_acct) { - if (do_acct) { - struct nf_conn_acct *acct; - - acct = nf_conn_acct_find(ct); - if (acct) { - struct nf_conn_counter *counter = acct->counter; - - atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets); - atomic64_add(skb->len - skb_network_offset(skb), - &counter[CTINFO2DIR(ctinfo)].bytes); - } - } + if (do_acct) + nf_ct_acct_update(ct, ctinfo, skb->len); if (del_timer(&ct->timeout)) { ct->timeout.function((unsigned long)ct); @@ -1395,16 +1429,17 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data), int cpu; spinlock_t *lockp; - for (; *bucket < net->ct.htable_size; (*bucket)++) { + for (; *bucket < nf_conntrack_htable_size; (*bucket)++) { lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS]; local_bh_disable(); nf_conntrack_lock(lockp); - if (*bucket < net->ct.htable_size) { - hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) { + if (*bucket < nf_conntrack_htable_size) { + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnnode) { if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) continue; ct = nf_ct_tuplehash_to_ctrack(h); - if (iter(ct, data)) + if (net_eq(nf_ct_net(ct), net) && + iter(ct, data)) goto found; } } @@ -1442,6 +1477,9 @@ void nf_ct_iterate_cleanup(struct net *net, might_sleep(); + if (atomic_read(&net->ct.count) == 0) + return; + while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) { /* Time to push up daises... */ if (del_timer(&ct->timeout)) @@ -1493,6 +1531,8 @@ void nf_conntrack_cleanup_end(void) while (untrack_refs() > 0) schedule(); + nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size); + #ifdef CONFIG_NF_CONNTRACK_ZONES nf_ct_extend_unregister(&nf_ct_zone_extend); #endif @@ -1505,6 +1545,8 @@ void nf_conntrack_cleanup_end(void) nf_conntrack_tstamp_fini(); nf_conntrack_acct_fini(); nf_conntrack_expect_fini(); + + kmem_cache_destroy(nf_conntrack_cachep); } /* @@ -1543,15 +1585,12 @@ i_see_dead_people: } list_for_each_entry(net, net_exit_list, exit_list) { - nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size); nf_conntrack_proto_pernet_fini(net); nf_conntrack_helper_pernet_fini(net); nf_conntrack_ecache_pernet_fini(net); nf_conntrack_tstamp_pernet_fini(net); nf_conntrack_acct_pernet_fini(net); nf_conntrack_expect_pernet_fini(net); - kmem_cache_destroy(net->ct.nf_conntrack_cachep); - kfree(net->ct.slabname); free_percpu(net->ct.stat); free_percpu(net->ct.pcpu_lists); } @@ -1563,8 +1602,15 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) unsigned int nr_slots, i; size_t sz; + if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head))) + return NULL; + BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head)); nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head)); + + if (nr_slots > (UINT_MAX / sizeof(struct hlist_nulls_head))) + return NULL; + sz = nr_slots * sizeof(struct hlist_nulls_head); hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO, get_order(sz)); @@ -1606,7 +1652,7 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) local_bh_disable(); nf_conntrack_all_lock(); - write_seqcount_begin(&init_net.ct.generation); + write_seqcount_begin(&nf_conntrack_generation); /* Lookups in the old hash might happen in parallel, which means we * might get false negatives during connection lookup. New connections @@ -1614,26 +1660,28 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) * though since that required taking the locks. */ - for (i = 0; i < init_net.ct.htable_size; i++) { - while (!hlist_nulls_empty(&init_net.ct.hash[i])) { - h = hlist_nulls_entry(init_net.ct.hash[i].first, - struct nf_conntrack_tuple_hash, hnnode); + for (i = 0; i < nf_conntrack_htable_size; i++) { + while (!hlist_nulls_empty(&nf_conntrack_hash[i])) { + h = hlist_nulls_entry(nf_conntrack_hash[i].first, + struct nf_conntrack_tuple_hash, hnnode); ct = nf_ct_tuplehash_to_ctrack(h); hlist_nulls_del_rcu(&h->hnnode); - bucket = __hash_conntrack(&h->tuple, hashsize); + bucket = __hash_conntrack(nf_ct_net(ct), + &h->tuple, hashsize); hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]); } } - old_size = init_net.ct.htable_size; - old_hash = init_net.ct.hash; + old_size = nf_conntrack_htable_size; + old_hash = nf_conntrack_hash; - init_net.ct.htable_size = nf_conntrack_htable_size = hashsize; - init_net.ct.hash = hash; + nf_conntrack_hash = hash; + nf_conntrack_htable_size = hashsize; - write_seqcount_end(&init_net.ct.generation); + write_seqcount_end(&nf_conntrack_generation); nf_conntrack_all_unlock(); local_bh_enable(); + synchronize_net(); nf_ct_free_hashtable(old_hash, old_size); return 0; } @@ -1654,7 +1702,10 @@ EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or); int nf_conntrack_init_start(void) { int max_factor = 8; - int i, ret, cpu; + int ret = -ENOMEM; + int i, cpu; + + seqcount_init(&nf_conntrack_generation); for (i = 0; i < CONNTRACK_LOCKS; i++) spin_lock_init(&nf_conntrack_locks[i]); @@ -1681,8 +1732,19 @@ int nf_conntrack_init_start(void) * entries. */ max_factor = 4; } + + nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1); + if (!nf_conntrack_hash) + return -ENOMEM; + nf_conntrack_max = max_factor * nf_conntrack_htable_size; + nf_conntrack_cachep = kmem_cache_create("nf_conntrack", + sizeof(struct nf_conn), 0, + SLAB_DESTROY_BY_RCU, NULL); + if (!nf_conntrack_cachep) + goto err_cachep; + printk(KERN_INFO "nf_conntrack version %s (%u buckets, %d max)\n", NF_CONNTRACK_VERSION, nf_conntrack_htable_size, nf_conntrack_max); @@ -1759,6 +1821,9 @@ err_tstamp: err_acct: nf_conntrack_expect_fini(); err_expect: + kmem_cache_destroy(nf_conntrack_cachep); +err_cachep: + nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size); return ret; } @@ -1778,12 +1843,10 @@ void nf_conntrack_init_end(void) int nf_conntrack_init_net(struct net *net) { - static atomic64_t unique_id; int ret = -ENOMEM; int cpu; atomic_set(&net->ct.count, 0); - seqcount_init(&net->ct.generation); net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu); if (!net->ct.pcpu_lists) @@ -1801,25 +1864,6 @@ int nf_conntrack_init_net(struct net *net) if (!net->ct.stat) goto err_pcpu_lists; - net->ct.slabname = kasprintf(GFP_KERNEL, "nf_conntrack_%llu", - (u64)atomic64_inc_return(&unique_id)); - if (!net->ct.slabname) - goto err_slabname; - - net->ct.nf_conntrack_cachep = kmem_cache_create(net->ct.slabname, - sizeof(struct nf_conn), 0, - SLAB_DESTROY_BY_RCU, NULL); - if (!net->ct.nf_conntrack_cachep) { - printk(KERN_ERR "Unable to create nf_conn slab cache\n"); - goto err_cache; - } - - net->ct.htable_size = nf_conntrack_htable_size; - net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size, 1); - if (!net->ct.hash) { - printk(KERN_ERR "Unable to create nf_conntrack_hash\n"); - goto err_hash; - } ret = nf_conntrack_expect_pernet_init(net); if (ret < 0) goto err_expect; @@ -1851,12 +1895,6 @@ err_tstamp: err_acct: nf_conntrack_expect_pernet_fini(net); err_expect: - nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size); -err_hash: - kmem_cache_destroy(net->ct.nf_conntrack_cachep); -err_cache: - kfree(net->ct.slabname); -err_slabname: free_percpu(net->ct.stat); err_pcpu_lists: free_percpu(net->ct.pcpu_lists); diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index 4e78c57b8..d28011b42 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -113,6 +113,60 @@ static void ecache_work(struct work_struct *work) schedule_delayed_work(&ctnet->ecache_dwork, delay); } +int nf_conntrack_eventmask_report(unsigned int eventmask, struct nf_conn *ct, + u32 portid, int report) +{ + int ret = 0; + struct net *net = nf_ct_net(ct); + struct nf_ct_event_notifier *notify; + struct nf_conntrack_ecache *e; + + rcu_read_lock(); + notify = rcu_dereference(net->ct.nf_conntrack_event_cb); + if (!notify) + goto out_unlock; + + e = nf_ct_ecache_find(ct); + if (!e) + goto out_unlock; + + if (nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct)) { + struct nf_ct_event item = { + .ct = ct, + .portid = e->portid ? e->portid : portid, + .report = report + }; + /* This is a resent of a destroy event? If so, skip missed */ + unsigned long missed = e->portid ? 0 : e->missed; + + if (!((eventmask | missed) & e->ctmask)) + goto out_unlock; + + ret = notify->fcn(eventmask | missed, &item); + if (unlikely(ret < 0 || missed)) { + spin_lock_bh(&ct->lock); + if (ret < 0) { + /* This is a destroy event that has been + * triggered by a process, we store the PORTID + * to include it in the retransmission. + */ + if (eventmask & (1 << IPCT_DESTROY) && + e->portid == 0 && portid != 0) + e->portid = portid; + else + e->missed |= eventmask; + } else { + e->missed &= ~missed; + } + spin_unlock_bh(&ct->lock); + } + } +out_unlock: + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(nf_conntrack_eventmask_report); + /* deliver cached events and clear cache entry - must be called with locally * disabled softirqs */ void nf_ct_deliver_cached_events(struct nf_conn *ct) @@ -167,6 +221,36 @@ out_unlock: } EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events); +void nf_ct_expect_event_report(enum ip_conntrack_expect_events event, + struct nf_conntrack_expect *exp, + u32 portid, int report) + +{ + struct net *net = nf_ct_exp_net(exp); + struct nf_exp_event_notifier *notify; + struct nf_conntrack_ecache *e; + + rcu_read_lock(); + notify = rcu_dereference(net->ct.nf_expect_event_cb); + if (!notify) + goto out_unlock; + + e = nf_ct_ecache_find(exp->master); + if (!e) + goto out_unlock; + + if (e->expmask & (1 << event)) { + struct nf_exp_event item = { + .exp = exp, + .portid = portid, + .report = report + }; + notify->fcn(1 << event, &item); + } +out_unlock: + rcu_read_unlock(); +} + int nf_conntrack_register_notifier(struct net *net, struct nf_ct_event_notifier *new) { diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 278927ab0..9e3693128 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -24,6 +24,7 @@ #include <linux/moduleparam.h> #include <linux/export.h> #include <net/net_namespace.h> +#include <net/netns/hash.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> @@ -35,9 +36,13 @@ unsigned int nf_ct_expect_hsize __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_expect_hsize); +struct hlist_head *nf_ct_expect_hash __read_mostly; +EXPORT_SYMBOL_GPL(nf_ct_expect_hash); + unsigned int nf_ct_expect_max __read_mostly; static struct kmem_cache *nf_ct_expect_cachep __read_mostly; +static unsigned int nf_ct_expect_hashrnd __read_mostly; /* nf_conntrack_expect helper functions */ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, @@ -72,21 +77,32 @@ static void nf_ct_expectation_timed_out(unsigned long ul_expect) nf_ct_expect_put(exp); } -static unsigned int nf_ct_expect_dst_hash(const struct nf_conntrack_tuple *tuple) +static unsigned int nf_ct_expect_dst_hash(const struct net *n, const struct nf_conntrack_tuple *tuple) { - unsigned int hash; + unsigned int hash, seed; - if (unlikely(!nf_conntrack_hash_rnd)) { - init_nf_conntrack_hash_rnd(); - } + get_random_once(&nf_ct_expect_hashrnd, sizeof(nf_ct_expect_hashrnd)); + + seed = nf_ct_expect_hashrnd ^ net_hash_mix(n); hash = jhash2(tuple->dst.u3.all, ARRAY_SIZE(tuple->dst.u3.all), (((tuple->dst.protonum ^ tuple->src.l3num) << 16) | - (__force __u16)tuple->dst.u.all) ^ nf_conntrack_hash_rnd); + (__force __u16)tuple->dst.u.all) ^ seed); return reciprocal_scale(hash, nf_ct_expect_hsize); } +static bool +nf_ct_exp_equal(const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_expect *i, + const struct nf_conntrack_zone *zone, + const struct net *net) +{ + return nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) && + net_eq(net, nf_ct_net(i->master)) && + nf_ct_zone_equal_any(i->master, zone); +} + struct nf_conntrack_expect * __nf_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone, @@ -98,10 +114,9 @@ __nf_ct_expect_find(struct net *net, if (!net->ct.expect_count) return NULL; - h = nf_ct_expect_dst_hash(tuple); - hlist_for_each_entry_rcu(i, &net->ct.expect_hash[h], hnode) { - if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) && - nf_ct_zone_equal_any(i->master, zone)) + h = nf_ct_expect_dst_hash(net, tuple); + hlist_for_each_entry_rcu(i, &nf_ct_expect_hash[h], hnode) { + if (nf_ct_exp_equal(tuple, i, zone, net)) return i; } return NULL; @@ -139,11 +154,10 @@ nf_ct_find_expectation(struct net *net, if (!net->ct.expect_count) return NULL; - h = nf_ct_expect_dst_hash(tuple); - hlist_for_each_entry(i, &net->ct.expect_hash[h], hnode) { + h = nf_ct_expect_dst_hash(net, tuple); + hlist_for_each_entry(i, &nf_ct_expect_hash[h], hnode) { if (!(i->flags & NF_CT_EXPECT_INACTIVE) && - nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) && - nf_ct_zone_equal_any(i->master, zone)) { + nf_ct_exp_equal(tuple, i, zone, net)) { exp = i; break; } @@ -223,6 +237,7 @@ static inline int expect_clash(const struct nf_conntrack_expect *a, } return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask) && + net_eq(nf_ct_net(a->master), nf_ct_net(b->master)) && nf_ct_zone_equal_any(a->master, nf_ct_zone(b->master)); } @@ -232,6 +247,7 @@ static inline int expect_matches(const struct nf_conntrack_expect *a, return a->master == b->master && a->class == b->class && nf_ct_tuple_equal(&a->tuple, &b->tuple) && nf_ct_tuple_mask_equal(&a->mask, &b->mask) && + net_eq(nf_ct_net(a->master), nf_ct_net(b->master)) && nf_ct_zone_equal_any(a->master, nf_ct_zone(b->master)); } @@ -342,7 +358,7 @@ static int nf_ct_expect_insert(struct nf_conntrack_expect *exp) struct nf_conn_help *master_help = nfct_help(exp->master); struct nf_conntrack_helper *helper; struct net *net = nf_ct_exp_net(exp); - unsigned int h = nf_ct_expect_dst_hash(&exp->tuple); + unsigned int h = nf_ct_expect_dst_hash(net, &exp->tuple); /* two references : one for hash insert, one for the timer */ atomic_add(2, &exp->use); @@ -350,7 +366,7 @@ static int nf_ct_expect_insert(struct nf_conntrack_expect *exp) hlist_add_head(&exp->lnode, &master_help->expectations); master_help->expecting[exp->class]++; - hlist_add_head_rcu(&exp->hnode, &net->ct.expect_hash[h]); + hlist_add_head_rcu(&exp->hnode, &nf_ct_expect_hash[h]); net->ct.expect_count++; setup_timer(&exp->timeout, nf_ct_expectation_timed_out, @@ -401,8 +417,8 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) ret = -ESHUTDOWN; goto out; } - h = nf_ct_expect_dst_hash(&expect->tuple); - hlist_for_each_entry_safe(i, next, &net->ct.expect_hash[h], hnode) { + h = nf_ct_expect_dst_hash(net, &expect->tuple); + hlist_for_each_entry_safe(i, next, &nf_ct_expect_hash[h], hnode) { if (expect_matches(i, expect)) { if (del_timer(&i->timeout)) { nf_ct_unlink_expect(i); @@ -468,12 +484,11 @@ struct ct_expect_iter_state { static struct hlist_node *ct_expect_get_first(struct seq_file *seq) { - struct net *net = seq_file_net(seq); struct ct_expect_iter_state *st = seq->private; struct hlist_node *n; for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { - n = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket])); + n = rcu_dereference(hlist_first_rcu(&nf_ct_expect_hash[st->bucket])); if (n) return n; } @@ -483,14 +498,13 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq) static struct hlist_node *ct_expect_get_next(struct seq_file *seq, struct hlist_node *head) { - struct net *net = seq_file_net(seq); struct ct_expect_iter_state *st = seq->private; head = rcu_dereference(hlist_next_rcu(head)); while (head == NULL) { if (++st->bucket >= nf_ct_expect_hsize) return NULL; - head = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket])); + head = rcu_dereference(hlist_first_rcu(&nf_ct_expect_hash[st->bucket])); } return head; } @@ -623,28 +637,13 @@ module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0400); int nf_conntrack_expect_pernet_init(struct net *net) { - int err = -ENOMEM; - net->ct.expect_count = 0; - net->ct.expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, 0); - if (net->ct.expect_hash == NULL) - goto err1; - - err = exp_proc_init(net); - if (err < 0) - goto err2; - - return 0; -err2: - nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize); -err1: - return err; + return exp_proc_init(net); } void nf_conntrack_expect_pernet_fini(struct net *net) { exp_proc_remove(net); - nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize); } int nf_conntrack_expect_init(void) @@ -660,6 +659,13 @@ int nf_conntrack_expect_init(void) 0, 0, NULL); if (!nf_ct_expect_cachep) return -ENOMEM; + + nf_ct_expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, 0); + if (!nf_ct_expect_hash) { + kmem_cache_destroy(nf_ct_expect_cachep); + return -ENOMEM; + } + return 0; } @@ -667,4 +673,5 @@ void nf_conntrack_expect_fini(void) { rcu_barrier(); /* Wait for call_rcu() before destroy */ kmem_cache_destroy(nf_ct_expect_cachep); + nf_ct_free_hashtable(nf_ct_expect_hash, nf_ct_expect_hsize); } diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c index 883c691ec..19efeba02 100644 --- a/net/netfilter/nf_conntrack_ftp.c +++ b/net/netfilter/nf_conntrack_ftp.c @@ -632,6 +632,7 @@ static int __init nf_conntrack_ftp_init(void) if (ret) { pr_err("failed to register helper for pf: %d port: %d\n", ftp[i][j].tuple.src.l3num, ports[i]); + ports_c = i; nf_conntrack_ftp_fini(); return ret; } diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 3b40ec575..196cb3964 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -38,10 +38,10 @@ unsigned int nf_ct_helper_hsize __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_helper_hsize); static unsigned int nf_ct_helper_count __read_mostly; -static bool nf_ct_auto_assign_helper __read_mostly = true; +static bool nf_ct_auto_assign_helper __read_mostly = false; module_param_named(nf_conntrack_helper, nf_ct_auto_assign_helper, bool, 0644); MODULE_PARM_DESC(nf_conntrack_helper, - "Enable automatic conntrack helper assignment (default 1)"); + "Enable automatic conntrack helper assignment (default 0)"); #ifdef CONFIG_SYSCTL static struct ctl_table helper_sysctl_table[] = { @@ -361,9 +361,10 @@ EXPORT_SYMBOL_GPL(nf_ct_helper_log); int nf_conntrack_helper_register(struct nf_conntrack_helper *me) { - int ret = 0; - struct nf_conntrack_helper *cur; + struct nf_conntrack_tuple_mask mask = { .src.u.all = htons(0xFFFF) }; unsigned int h = helper_hash(&me->tuple); + struct nf_conntrack_helper *cur; + int ret = 0; BUG_ON(me->expect_policy == NULL); BUG_ON(me->expect_class_max >= NF_CT_MAX_EXPECT_CLASSES); @@ -371,9 +372,7 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me) mutex_lock(&nf_ct_helper_mutex); hlist_for_each_entry(cur, &nf_ct_helper_hash[h], hnode) { - if (strncmp(cur->name, me->name, NF_CT_HELPER_NAME_LEN) == 0 && - cur->tuple.src.l3num == me->tuple.src.l3num && - cur->tuple.dst.protonum == me->tuple.dst.protonum) { + if (nf_ct_tuple_src_mask_cmp(&cur->tuple, &me->tuple, &mask)) { ret = -EEXIST; goto out; } @@ -400,7 +399,7 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me, spin_lock_bh(&nf_conntrack_expect_lock); for (i = 0; i < nf_ct_expect_hsize; i++) { hlist_for_each_entry_safe(exp, next, - &net->ct.expect_hash[i], hnode) { + &nf_ct_expect_hash[i], hnode) { struct nf_conn_help *help = nfct_help(exp->master); if ((rcu_dereference_protected( help->helper, @@ -424,10 +423,10 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me, spin_unlock_bh(&pcpu->lock); } local_bh_disable(); - for (i = 0; i < net->ct.htable_size; i++) { + for (i = 0; i < nf_conntrack_htable_size; i++) { nf_conntrack_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); - if (i < net->ct.htable_size) { - hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode) + if (i < nf_conntrack_htable_size) { + hlist_nulls_for_each_entry(h, nn, &nf_conntrack_hash[i], hnnode) unhelp(h, me); } spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c index 8b6da2719..f97ac61d2 100644 --- a/net/netfilter/nf_conntrack_irc.c +++ b/net/netfilter/nf_conntrack_irc.c @@ -271,6 +271,7 @@ static int __init nf_conntrack_irc_init(void) if (ret) { pr_err("failed to register helper for pf: %u port: %u\n", irc[i].tuple.src.l3num, ports[i]); + ports_c = i; nf_conntrack_irc_fini(); return ret; } diff --git a/net/netfilter/nf_conntrack_labels.c b/net/netfilter/nf_conntrack_labels.c index 3ce5c314e..252e6a7cd 100644 --- a/net/netfilter/nf_conntrack_labels.c +++ b/net/netfilter/nf_conntrack_labels.c @@ -16,28 +16,11 @@ static spinlock_t nf_connlabels_lock; -static unsigned int label_bits(const struct nf_conn_labels *l) -{ - unsigned int longs = l->words; - return longs * BITS_PER_LONG; -} - -bool nf_connlabel_match(const struct nf_conn *ct, u16 bit) -{ - struct nf_conn_labels *labels = nf_ct_labels_find(ct); - - if (!labels) - return false; - - return bit < label_bits(labels) && test_bit(bit, labels->bits); -} -EXPORT_SYMBOL_GPL(nf_connlabel_match); - int nf_connlabel_set(struct nf_conn *ct, u16 bit) { struct nf_conn_labels *labels = nf_ct_labels_find(ct); - if (!labels || bit >= label_bits(labels)) + if (!labels || BIT_WORD(bit) >= labels->words) return -ENOSPC; if (test_bit(bit, labels->bits)) @@ -50,14 +33,18 @@ int nf_connlabel_set(struct nf_conn *ct, u16 bit) } EXPORT_SYMBOL_GPL(nf_connlabel_set); -static void replace_u32(u32 *address, u32 mask, u32 new) +static int replace_u32(u32 *address, u32 mask, u32 new) { u32 old, tmp; do { old = *address; tmp = (old & mask) ^ new; + if (old == tmp) + return 0; } while (cmpxchg(address, old, tmp) != old); + + return 1; } int nf_connlabels_replace(struct nf_conn *ct, @@ -66,6 +53,7 @@ int nf_connlabels_replace(struct nf_conn *ct, { struct nf_conn_labels *labels; unsigned int size, i; + int changed = 0; u32 *dst; labels = nf_ct_labels_find(ct); @@ -77,29 +65,27 @@ int nf_connlabels_replace(struct nf_conn *ct, words32 = size / sizeof(u32); dst = (u32 *) labels->bits; - if (words32) { - for (i = 0; i < words32; i++) - replace_u32(&dst[i], mask ? ~mask[i] : 0, data[i]); - } + for (i = 0; i < words32; i++) + changed |= replace_u32(&dst[i], mask ? ~mask[i] : 0, data[i]); size /= sizeof(u32); for (i = words32; i < size; i++) /* pad */ replace_u32(&dst[i], 0, 0); - nf_conntrack_event_cache(IPCT_LABEL, ct); + if (changed) + nf_conntrack_event_cache(IPCT_LABEL, ct); return 0; } EXPORT_SYMBOL_GPL(nf_connlabels_replace); -int nf_connlabels_get(struct net *net, unsigned int n_bits) +int nf_connlabels_get(struct net *net, unsigned int bits) { size_t words; - if (n_bits > (NF_CT_LABELS_MAX_SIZE * BITS_PER_BYTE)) + words = BIT_WORD(bits) + 1; + if (words > NF_CT_LABELS_MAX_SIZE / sizeof(long)) return -ERANGE; - words = BITS_TO_LONGS(n_bits); - spin_lock(&nf_connlabels_lock); net->ct.labels_used++; if (words > net->ct.label_words) @@ -128,6 +114,8 @@ static struct nf_ct_ext_type labels_extend __read_mostly = { int nf_conntrack_labels_init(void) { + BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE / sizeof(long) >= U8_MAX); + spin_lock_init(&nf_connlabels_lock); return nf_ct_extend_register(&labels_extend); } diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 355e8552f..a18d1ceab 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -58,10 +58,9 @@ MODULE_LICENSE("GPL"); static char __initdata version[] = "0.93"; -static inline int -ctnetlink_dump_tuples_proto(struct sk_buff *skb, - const struct nf_conntrack_tuple *tuple, - struct nf_conntrack_l4proto *l4proto) +static int ctnetlink_dump_tuples_proto(struct sk_buff *skb, + const struct nf_conntrack_tuple *tuple, + struct nf_conntrack_l4proto *l4proto) { int ret = 0; struct nlattr *nest_parms; @@ -83,10 +82,9 @@ nla_put_failure: return -1; } -static inline int -ctnetlink_dump_tuples_ip(struct sk_buff *skb, - const struct nf_conntrack_tuple *tuple, - struct nf_conntrack_l3proto *l3proto) +static int ctnetlink_dump_tuples_ip(struct sk_buff *skb, + const struct nf_conntrack_tuple *tuple, + struct nf_conntrack_l3proto *l3proto) { int ret = 0; struct nlattr *nest_parms; @@ -106,9 +104,8 @@ nla_put_failure: return -1; } -static int -ctnetlink_dump_tuples(struct sk_buff *skb, - const struct nf_conntrack_tuple *tuple) +static int ctnetlink_dump_tuples(struct sk_buff *skb, + const struct nf_conntrack_tuple *tuple) { int ret; struct nf_conntrack_l3proto *l3proto; @@ -127,9 +124,8 @@ ctnetlink_dump_tuples(struct sk_buff *skb, return ret; } -static inline int -ctnetlink_dump_zone_id(struct sk_buff *skb, int attrtype, - const struct nf_conntrack_zone *zone, int dir) +static int ctnetlink_dump_zone_id(struct sk_buff *skb, int attrtype, + const struct nf_conntrack_zone *zone, int dir) { if (zone->id == NF_CT_DEFAULT_ZONE_ID || zone->dir != dir) return 0; @@ -141,8 +137,7 @@ nla_put_failure: return -1; } -static inline int -ctnetlink_dump_status(struct sk_buff *skb, const struct nf_conn *ct) +static int ctnetlink_dump_status(struct sk_buff *skb, const struct nf_conn *ct) { if (nla_put_be32(skb, CTA_STATUS, htonl(ct->status))) goto nla_put_failure; @@ -152,8 +147,7 @@ nla_put_failure: return -1; } -static inline int -ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct) +static int ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct) { long timeout = ((long)ct->timeout.expires - (long)jiffies) / HZ; @@ -168,8 +162,7 @@ nla_put_failure: return -1; } -static inline int -ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct) +static int ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct) { struct nf_conntrack_l4proto *l4proto; struct nlattr *nest_proto; @@ -193,8 +186,8 @@ nla_put_failure: return -1; } -static inline int -ctnetlink_dump_helpinfo(struct sk_buff *skb, const struct nf_conn *ct) +static int ctnetlink_dump_helpinfo(struct sk_buff *skb, + const struct nf_conn *ct) { struct nlattr *nest_helper; const struct nf_conn_help *help = nfct_help(ct); @@ -245,8 +238,10 @@ dump_counters(struct sk_buff *skb, struct nf_conn_acct *acct, if (!nest_count) goto nla_put_failure; - if (nla_put_be64(skb, CTA_COUNTERS_PACKETS, cpu_to_be64(pkts)) || - nla_put_be64(skb, CTA_COUNTERS_BYTES, cpu_to_be64(bytes))) + if (nla_put_be64(skb, CTA_COUNTERS_PACKETS, cpu_to_be64(pkts), + CTA_COUNTERS_PAD) || + nla_put_be64(skb, CTA_COUNTERS_BYTES, cpu_to_be64(bytes), + CTA_COUNTERS_PAD)) goto nla_put_failure; nla_nest_end(skb, nest_count); @@ -287,9 +282,11 @@ ctnetlink_dump_timestamp(struct sk_buff *skb, const struct nf_conn *ct) if (!nest_count) goto nla_put_failure; - if (nla_put_be64(skb, CTA_TIMESTAMP_START, cpu_to_be64(tstamp->start)) || + if (nla_put_be64(skb, CTA_TIMESTAMP_START, cpu_to_be64(tstamp->start), + CTA_TIMESTAMP_PAD) || (tstamp->stop != 0 && nla_put_be64(skb, CTA_TIMESTAMP_STOP, - cpu_to_be64(tstamp->stop)))) + cpu_to_be64(tstamp->stop), + CTA_TIMESTAMP_PAD))) goto nla_put_failure; nla_nest_end(skb, nest_count); @@ -300,8 +297,7 @@ nla_put_failure: } #ifdef CONFIG_NF_CONNTRACK_MARK -static inline int -ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct) +static int ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct) { if (nla_put_be32(skb, CTA_MARK, htonl(ct->mark))) goto nla_put_failure; @@ -315,8 +311,7 @@ nla_put_failure: #endif #ifdef CONFIG_NF_CONNTRACK_SECMARK -static inline int -ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct) +static int ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct) { struct nlattr *nest_secctx; int len, ret; @@ -345,7 +340,7 @@ nla_put_failure: #endif #ifdef CONFIG_NF_CONNTRACK_LABELS -static int ctnetlink_label_size(const struct nf_conn *ct) +static inline int ctnetlink_label_size(const struct nf_conn *ct) { struct nf_conn_labels *labels = nf_ct_labels_find(ct); @@ -380,8 +375,7 @@ ctnetlink_dump_labels(struct sk_buff *skb, const struct nf_conn *ct) #define master_tuple(ct) &(ct->master->tuplehash[IP_CT_DIR_ORIGINAL].tuple) -static inline int -ctnetlink_dump_master(struct sk_buff *skb, const struct nf_conn *ct) +static int ctnetlink_dump_master(struct sk_buff *skb, const struct nf_conn *ct) { struct nlattr *nest_parms; @@ -426,8 +420,8 @@ nla_put_failure: return -1; } -static inline int -ctnetlink_dump_ct_seq_adj(struct sk_buff *skb, const struct nf_conn *ct) +static int ctnetlink_dump_ct_seq_adj(struct sk_buff *skb, + const struct nf_conn *ct) { struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); struct nf_ct_seqadj *seq; @@ -446,8 +440,7 @@ ctnetlink_dump_ct_seq_adj(struct sk_buff *skb, const struct nf_conn *ct) return 0; } -static inline int -ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct) +static int ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct) { if (nla_put_be32(skb, CTA_ID, htonl((unsigned long)ct))) goto nla_put_failure; @@ -457,8 +450,7 @@ nla_put_failure: return -1; } -static inline int -ctnetlink_dump_use(struct sk_buff *skb, const struct nf_conn *ct) +static int ctnetlink_dump_use(struct sk_buff *skb, const struct nf_conn *ct) { if (nla_put_be32(skb, CTA_USE, htonl(atomic_read(&ct->ct_general.use)))) goto nla_put_failure; @@ -538,8 +530,7 @@ nla_put_failure: return -1; } -static inline size_t -ctnetlink_proto_size(const struct nf_conn *ct) +static inline size_t ctnetlink_proto_size(const struct nf_conn *ct) { struct nf_conntrack_l3proto *l3proto; struct nf_conntrack_l4proto *l4proto; @@ -556,19 +547,17 @@ ctnetlink_proto_size(const struct nf_conn *ct) return len; } -static inline size_t -ctnetlink_acct_size(const struct nf_conn *ct) +static inline size_t ctnetlink_acct_size(const struct nf_conn *ct) { if (!nf_ct_ext_exist(ct, NF_CT_EXT_ACCT)) return 0; return 2 * nla_total_size(0) /* CTA_COUNTERS_ORIG|REPL */ - + 2 * nla_total_size(sizeof(uint64_t)) /* CTA_COUNTERS_PACKETS */ - + 2 * nla_total_size(sizeof(uint64_t)) /* CTA_COUNTERS_BYTES */ + + 2 * nla_total_size_64bit(sizeof(uint64_t)) /* CTA_COUNTERS_PACKETS */ + + 2 * nla_total_size_64bit(sizeof(uint64_t)) /* CTA_COUNTERS_BYTES */ ; } -static inline int -ctnetlink_secctx_size(const struct nf_conn *ct) +static inline int ctnetlink_secctx_size(const struct nf_conn *ct) { #ifdef CONFIG_NF_CONNTRACK_SECMARK int len, ret; @@ -584,20 +573,19 @@ ctnetlink_secctx_size(const struct nf_conn *ct) #endif } -static inline size_t -ctnetlink_timestamp_size(const struct nf_conn *ct) +static inline size_t ctnetlink_timestamp_size(const struct nf_conn *ct) { #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP if (!nf_ct_ext_exist(ct, NF_CT_EXT_TSTAMP)) return 0; - return nla_total_size(0) + 2 * nla_total_size(sizeof(uint64_t)); + return nla_total_size(0) + 2 * nla_total_size_64bit(sizeof(uint64_t)); #else return 0; #endif } -static inline size_t -ctnetlink_nlmsg_size(const struct nf_conn *ct) +#ifdef CONFIG_NF_CONNTRACK_EVENTS +static size_t ctnetlink_nlmsg_size(const struct nf_conn *ct) { return NLMSG_ALIGN(sizeof(struct nfgenmsg)) + 3 * nla_total_size(0) /* CTA_TUPLE_ORIG|REPL|MASTER */ @@ -628,7 +616,6 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct) ; } -#ifdef CONFIG_NF_CONNTRACK_EVENTS static int ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) { @@ -837,19 +824,22 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) last = (struct nf_conn *)cb->args[1]; local_bh_disable(); - for (; cb->args[0] < net->ct.htable_size; cb->args[0]++) { + for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++) { restart: lockp = &nf_conntrack_locks[cb->args[0] % CONNTRACK_LOCKS]; nf_conntrack_lock(lockp); - if (cb->args[0] >= net->ct.htable_size) { + if (cb->args[0] >= nf_conntrack_htable_size) { spin_unlock(lockp); goto out; } - hlist_nulls_for_each_entry(h, n, &net->ct.hash[cb->args[0]], - hnnode) { + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[cb->args[0]], + hnnode) { if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) continue; ct = nf_ct_tuplehash_to_ctrack(h); + if (!net_eq(net, nf_ct_net(ct))) + continue; + /* Dump entries of a given L3 protocol number. * If it is not specified, ie. l3proto == 0, * then dump everything. */ @@ -891,8 +881,8 @@ out: return skb->len; } -static inline int -ctnetlink_parse_tuple_ip(struct nlattr *attr, struct nf_conntrack_tuple *tuple) +static int ctnetlink_parse_tuple_ip(struct nlattr *attr, + struct nf_conntrack_tuple *tuple) { struct nlattr *tb[CTA_IP_MAX+1]; struct nf_conntrack_l3proto *l3proto; @@ -921,9 +911,8 @@ static const struct nla_policy proto_nla_policy[CTA_PROTO_MAX+1] = { [CTA_PROTO_NUM] = { .type = NLA_U8 }, }; -static inline int -ctnetlink_parse_tuple_proto(struct nlattr *attr, - struct nf_conntrack_tuple *tuple) +static int ctnetlink_parse_tuple_proto(struct nlattr *attr, + struct nf_conntrack_tuple *tuple) { struct nlattr *tb[CTA_PROTO_MAX+1]; struct nf_conntrack_l4proto *l4proto; @@ -1050,9 +1039,8 @@ static const struct nla_policy help_nla_policy[CTA_HELP_MAX+1] = { .len = NF_CT_HELPER_NAME_LEN - 1 }, }; -static inline int -ctnetlink_parse_help(const struct nlattr *attr, char **helper_name, - struct nlattr **helpinfo) +static int ctnetlink_parse_help(const struct nlattr *attr, char **helper_name, + struct nlattr **helpinfo) { int err; struct nlattr *tb[CTA_HELP_MAX+1]; @@ -1463,8 +1451,8 @@ ctnetlink_setup_nat(struct nf_conn *ct, const struct nlattr * const cda[]) #endif } -static inline int -ctnetlink_change_helper(struct nf_conn *ct, const struct nlattr * const cda[]) +static int ctnetlink_change_helper(struct nf_conn *ct, + const struct nlattr * const cda[]) { struct nf_conntrack_helper *helper; struct nf_conn_help *help = nfct_help(ct); @@ -1524,8 +1512,8 @@ ctnetlink_change_helper(struct nf_conn *ct, const struct nlattr * const cda[]) return -EOPNOTSUPP; } -static inline int -ctnetlink_change_timeout(struct nf_conn *ct, const struct nlattr * const cda[]) +static int ctnetlink_change_timeout(struct nf_conn *ct, + const struct nlattr * const cda[]) { u_int32_t timeout = ntohl(nla_get_be32(cda[CTA_TIMEOUT])); @@ -1544,8 +1532,8 @@ static const struct nla_policy protoinfo_policy[CTA_PROTOINFO_MAX+1] = { [CTA_PROTOINFO_SCTP] = { .type = NLA_NESTED }, }; -static inline int -ctnetlink_change_protoinfo(struct nf_conn *ct, const struct nlattr * const cda[]) +static int ctnetlink_change_protoinfo(struct nf_conn *ct, + const struct nlattr * const cda[]) { const struct nlattr *attr = cda[CTA_PROTOINFO]; struct nlattr *tb[CTA_PROTOINFO_MAX+1]; @@ -1571,8 +1559,8 @@ static const struct nla_policy seqadj_policy[CTA_SEQADJ_MAX+1] = { [CTA_SEQADJ_OFFSET_AFTER] = { .type = NLA_U32 }, }; -static inline int -change_seq_adj(struct nf_ct_seqadj *seq, const struct nlattr * const attr) +static int change_seq_adj(struct nf_ct_seqadj *seq, + const struct nlattr * const attr) { int err; struct nlattr *cda[CTA_SEQADJ_MAX+1]; @@ -2405,10 +2393,9 @@ static struct nfnl_ct_hook ctnetlink_glue_hook = { * EXPECT ***********************************************************************/ -static inline int -ctnetlink_exp_dump_tuple(struct sk_buff *skb, - const struct nf_conntrack_tuple *tuple, - enum ctattr_expect type) +static int ctnetlink_exp_dump_tuple(struct sk_buff *skb, + const struct nf_conntrack_tuple *tuple, + enum ctattr_expect type) { struct nlattr *nest_parms; @@ -2425,10 +2412,9 @@ nla_put_failure: return -1; } -static inline int -ctnetlink_exp_dump_mask(struct sk_buff *skb, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple_mask *mask) +static int ctnetlink_exp_dump_mask(struct sk_buff *skb, + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple_mask *mask) { int ret; struct nf_conntrack_l3proto *l3proto; @@ -2646,10 +2632,14 @@ ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb) last = (struct nf_conntrack_expect *)cb->args[1]; for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) { restart: - hlist_for_each_entry(exp, &net->ct.expect_hash[cb->args[0]], + hlist_for_each_entry(exp, &nf_ct_expect_hash[cb->args[0]], hnode) { if (l3proto && exp->tuple.src.l3num != l3proto) continue; + + if (!net_eq(nf_ct_net(exp->master), net)) + continue; + if (cb->args[1]) { if (exp != last) continue; @@ -2900,8 +2890,12 @@ static int ctnetlink_del_expect(struct net *net, struct sock *ctnl, spin_lock_bh(&nf_conntrack_expect_lock); for (i = 0; i < nf_ct_expect_hsize; i++) { hlist_for_each_entry_safe(exp, next, - &net->ct.expect_hash[i], + &nf_ct_expect_hash[i], hnode) { + + if (!net_eq(nf_ct_exp_net(exp), net)) + continue; + m_help = nfct_help(exp->master); if (!strcmp(m_help->helper->name, name) && del_timer(&exp->timeout)) { @@ -2918,8 +2912,12 @@ static int ctnetlink_del_expect(struct net *net, struct sock *ctnl, spin_lock_bh(&nf_conntrack_expect_lock); for (i = 0; i < nf_ct_expect_hsize; i++) { hlist_for_each_entry_safe(exp, next, - &net->ct.expect_hash[i], + &nf_ct_expect_hash[i], hnode) { + + if (!net_eq(nf_ct_exp_net(exp), net)) + continue; + if (del_timer(&exp->timeout)) { nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).portid, diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index fce1b1cca..399a38fd6 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -645,7 +645,8 @@ static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, nla_put_u8(skb, CTA_PROTOINFO_DCCP_ROLE, ct->proto.dccp.role[IP_CT_DIR_ORIGINAL]) || nla_put_be64(skb, CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ, - cpu_to_be64(ct->proto.dccp.handshake_seq))) + cpu_to_be64(ct->proto.dccp.handshake_seq), + CTA_PROTOINFO_DCCP_PAD)) goto nla_put_failure; nla_nest_end(skb, nest_parms); spin_unlock_bh(&ct->lock); @@ -660,6 +661,7 @@ static const struct nla_policy dccp_nla_policy[CTA_PROTOINFO_DCCP_MAX + 1] = { [CTA_PROTOINFO_DCCP_STATE] = { .type = NLA_U8 }, [CTA_PROTOINFO_DCCP_ROLE] = { .type = NLA_U8 }, [CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ] = { .type = NLA_U64 }, + [CTA_PROTOINFO_DCCP_PAD] = { .type = NLA_UNSPEC }, }; static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct) diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 9578a7c37..1d7ab960a 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -191,13 +191,7 @@ static void sctp_print_tuple(struct seq_file *s, /* Print out the private part of the conntrack. */ static void sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct) { - enum sctp_conntrack state; - - spin_lock_bh(&ct->lock); - state = ct->proto.sctp.state; - spin_unlock_bh(&ct->lock); - - seq_printf(s, "%s ", sctp_conntrack_names[state]); + seq_printf(s, "%s ", sctp_conntrack_names[ct->proto.sctp.state]); } #define for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) \ diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 7cc1d9c22..70c838164 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -313,13 +313,7 @@ static void tcp_print_tuple(struct seq_file *s, /* Print out the private part of the conntrack. */ static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct) { - enum tcp_conntrack state; - - spin_lock_bh(&ct->lock); - state = ct->proto.tcp.state; - spin_unlock_bh(&ct->lock); - - seq_printf(s, "%s ", tcp_conntrack_names[state]); + seq_printf(s, "%s ", tcp_conntrack_names[ct->proto.tcp.state]); } static unsigned int get_conntrack_index(const struct tcphdr *tcph) diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 478f92f83..4fd040575 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -309,6 +309,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 __read_mostly = .l3proto = PF_INET, .l4proto = IPPROTO_UDP, .name = "udp", + .allow_clash = true, .pkt_to_tuple = udp_pkt_to_tuple, .invert_tuple = udp_invert_tuple, .print_tuple = udp_print_tuple, @@ -341,6 +342,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly = .l3proto = PF_INET6, .l4proto = IPPROTO_UDP, .name = "udp", + .allow_clash = true, .pkt_to_tuple = udp_pkt_to_tuple, .invert_tuple = udp_invert_tuple, .print_tuple = udp_print_tuple, diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c index 1ac8ee13a..9d692f5ad 100644 --- a/net/netfilter/nf_conntrack_proto_udplite.c +++ b/net/netfilter/nf_conntrack_proto_udplite.c @@ -274,6 +274,7 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly = .l3proto = PF_INET, .l4proto = IPPROTO_UDPLITE, .name = "udplite", + .allow_clash = true, .pkt_to_tuple = udplite_pkt_to_tuple, .invert_tuple = udplite_invert_tuple, .print_tuple = udplite_print_tuple, @@ -306,6 +307,7 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly = .l3proto = PF_INET6, .l4proto = IPPROTO_UDPLITE, .name = "udplite", + .allow_clash = true, .pkt_to_tuple = udplite_pkt_to_tuple, .invert_tuple = udplite_invert_tuple, .print_tuple = udplite_print_tuple, diff --git a/net/netfilter/nf_conntrack_sane.c b/net/netfilter/nf_conntrack_sane.c index 7523a575f..3fcbaab83 100644 --- a/net/netfilter/nf_conntrack_sane.c +++ b/net/netfilter/nf_conntrack_sane.c @@ -223,6 +223,7 @@ static int __init nf_conntrack_sane_init(void) if (ret) { pr_err("failed to register helper for pf: %d port: %d\n", sane[i][j].tuple.src.l3num, ports[i]); + ports_c = i; nf_conntrack_sane_fini(); return ret; } diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 3e0640273..f72ba5587 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -1669,6 +1669,7 @@ static int __init nf_conntrack_sip_init(void) if (ret) { pr_err("failed to register helper for pf: %u port: %u\n", sip[i][j].tuple.src.l3num, ports[i]); + ports_c = i; nf_conntrack_sip_fini(); return ret; } diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 0f1a45bca..c026c472e 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -54,14 +54,13 @@ struct ct_iter_state { static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) { - struct net *net = seq_file_net(seq); struct ct_iter_state *st = seq->private; struct hlist_nulls_node *n; for (st->bucket = 0; - st->bucket < net->ct.htable_size; + st->bucket < nf_conntrack_htable_size; st->bucket++) { - n = rcu_dereference(hlist_nulls_first_rcu(&net->ct.hash[st->bucket])); + n = rcu_dereference(hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket])); if (!is_a_nulls(n)) return n; } @@ -71,18 +70,17 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) static struct hlist_nulls_node *ct_get_next(struct seq_file *seq, struct hlist_nulls_node *head) { - struct net *net = seq_file_net(seq); struct ct_iter_state *st = seq->private; head = rcu_dereference(hlist_nulls_next_rcu(head)); while (is_a_nulls(head)) { if (likely(get_nulls_value(head) == st->bucket)) { - if (++st->bucket >= net->ct.htable_size) + if (++st->bucket >= nf_conntrack_htable_size) return NULL; } head = rcu_dereference( hlist_nulls_first_rcu( - &net->ct.hash[st->bucket])); + &nf_conntrack_hash[st->bucket])); } return head; } @@ -458,7 +456,7 @@ static struct ctl_table nf_ct_sysctl_table[] = { }, { .procname = "nf_conntrack_buckets", - .data = &init_net.ct.htable_size, + .data = &nf_conntrack_htable_size, .maxlen = sizeof(unsigned int), .mode = 0444, .proc_handler = proc_dointvec, @@ -489,8 +487,6 @@ static struct ctl_table nf_ct_sysctl_table[] = { { } }; -#define NET_NF_CONNTRACK_MAX 2089 - static struct ctl_table nf_ct_netfilter_table[] = { { .procname = "nf_conntrack_max", @@ -512,7 +508,6 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net) goto out_kmemdup; table[1].data = &net->ct.count; - table[2].data = &net->ct.htable_size; table[3].data = &net->ct.sysctl_checksum; table[4].data = &net->ct.sysctl_log_invalid; diff --git a/net/netfilter/nf_conntrack_tftp.c b/net/netfilter/nf_conntrack_tftp.c index 36f964066..2e65b5430 100644 --- a/net/netfilter/nf_conntrack_tftp.c +++ b/net/netfilter/nf_conntrack_tftp.c @@ -142,6 +142,7 @@ static int __init nf_conntrack_tftp_init(void) if (ret) { pr_err("failed to register helper for pf: %u port: %u\n", tftp[i][j].tuple.src.l3num, ports[i]); + ports_c = i; nf_conntrack_tftp_fini(); return ret; } diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 06a9f4577..6877a396f 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -38,6 +38,9 @@ static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO] static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO] __read_mostly; +static struct hlist_head *nf_nat_bysource __read_mostly; +static unsigned int nf_nat_htable_size __read_mostly; +static unsigned int nf_nat_hash_rnd __read_mostly; inline const struct nf_nat_l3proto * __nf_nat_l3proto_find(u8 family) @@ -118,15 +121,17 @@ EXPORT_SYMBOL(nf_xfrm_me_harder); /* We keep an extra hash for each conntrack, for fast searching. */ static inline unsigned int -hash_by_src(const struct net *net, const struct nf_conntrack_tuple *tuple) +hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple) { unsigned int hash; + get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd)); + /* Original src, to ensure we map it consistently if poss. */ hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32), - tuple->dst.protonum ^ nf_conntrack_hash_rnd); + tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n)); - return reciprocal_scale(hash, net->ct.nat_htable_size); + return reciprocal_scale(hash, nf_nat_htable_size); } /* Is this tuple already taken? (not by us) */ @@ -196,9 +201,10 @@ find_appropriate_src(struct net *net, const struct nf_conn_nat *nat; const struct nf_conn *ct; - hlist_for_each_entry_rcu(nat, &net->ct.nat_bysource[h], bysource) { + hlist_for_each_entry_rcu(nat, &nf_nat_bysource[h], bysource) { ct = nat->ct; if (same_src(ct, tuple) && + net_eq(net, nf_ct_net(ct)) && nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) { /* Copy source part from reply tuple. */ nf_ct_invert_tuplepr(result, @@ -431,7 +437,7 @@ nf_nat_setup_info(struct nf_conn *ct, nat = nfct_nat(ct); nat->ct = ct; hlist_add_head_rcu(&nat->bysource, - &net->ct.nat_bysource[srchash]); + &nf_nat_bysource[srchash]); spin_unlock_bh(&nf_nat_lock); } @@ -819,27 +825,14 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct, } #endif -static int __net_init nf_nat_net_init(struct net *net) -{ - /* Leave them the same for the moment. */ - net->ct.nat_htable_size = net->ct.htable_size; - net->ct.nat_bysource = nf_ct_alloc_hashtable(&net->ct.nat_htable_size, 0); - if (!net->ct.nat_bysource) - return -ENOMEM; - return 0; -} - static void __net_exit nf_nat_net_exit(struct net *net) { struct nf_nat_proto_clean clean = {}; nf_ct_iterate_cleanup(net, nf_nat_proto_clean, &clean, 0, 0); - synchronize_rcu(); - nf_ct_free_hashtable(net->ct.nat_bysource, net->ct.nat_htable_size); } static struct pernet_operations nf_nat_net_ops = { - .init = nf_nat_net_init, .exit = nf_nat_net_exit, }; @@ -852,8 +845,16 @@ static int __init nf_nat_init(void) { int ret; + /* Leave them the same for the moment. */ + nf_nat_htable_size = nf_conntrack_htable_size; + + nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0); + if (!nf_nat_bysource) + return -ENOMEM; + ret = nf_ct_extend_register(&nat_extend); if (ret < 0) { + nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); printk(KERN_ERR "nf_nat_core: Unable to register extension\n"); return ret; } @@ -877,6 +878,7 @@ static int __init nf_nat_init(void) return 0; cleanup_extend: + nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); nf_ct_extend_unregister(&nat_extend); return ret; } @@ -895,6 +897,7 @@ static void __exit nf_nat_cleanup(void) for (i = 0; i < NFPROTO_NUMPROTO; i++) kfree(nf_nat_l4protos[i]); synchronize_net(); + nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); } MODULE_LICENSE("GPL"); diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 5baa8e24e..b19ad20a7 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -26,23 +26,21 @@ * Once the queue is registered it must reinject all packets it * receives, no matter what. */ -static const struct nf_queue_handler __rcu *queue_handler __read_mostly; /* return EBUSY when somebody else is registered, return EEXIST if the * same handler is registered, return 0 in case of success. */ -void nf_register_queue_handler(const struct nf_queue_handler *qh) +void nf_register_queue_handler(struct net *net, const struct nf_queue_handler *qh) { /* should never happen, we only have one queueing backend in kernel */ - WARN_ON(rcu_access_pointer(queue_handler)); - rcu_assign_pointer(queue_handler, qh); + WARN_ON(rcu_access_pointer(net->nf.queue_handler)); + rcu_assign_pointer(net->nf.queue_handler, qh); } EXPORT_SYMBOL(nf_register_queue_handler); /* The caller must flush their queue before this */ -void nf_unregister_queue_handler(void) +void nf_unregister_queue_handler(struct net *net) { - RCU_INIT_POINTER(queue_handler, NULL); - synchronize_rcu(); + RCU_INIT_POINTER(net->nf.queue_handler, NULL); } EXPORT_SYMBOL(nf_unregister_queue_handler); @@ -103,7 +101,7 @@ void nf_queue_nf_hook_drop(struct net *net, struct nf_hook_ops *ops) const struct nf_queue_handler *qh; rcu_read_lock(); - qh = rcu_dereference(queue_handler); + qh = rcu_dereference(net->nf.queue_handler); if (qh) qh->nf_hook_drop(net, ops); rcu_read_unlock(); @@ -122,9 +120,10 @@ int nf_queue(struct sk_buff *skb, struct nf_queue_entry *entry = NULL; const struct nf_afinfo *afinfo; const struct nf_queue_handler *qh; + struct net *net = state->net; /* QUEUE == DROP if no one is waiting, to be safe. */ - qh = rcu_dereference(queue_handler); + qh = rcu_dereference(net->nf.queue_handler); if (!qh) { status = -ESRCH; goto err; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 2011977cd..cf7c74599 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -944,8 +944,10 @@ static int nft_dump_stats(struct sk_buff *skb, struct nft_stats __percpu *stats) if (nest == NULL) goto nla_put_failure; - if (nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(total.pkts)) || - nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(total.bytes))) + if (nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(total.pkts), + NFTA_COUNTER_PAD) || + nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(total.bytes), + NFTA_COUNTER_PAD)) goto nla_put_failure; nla_nest_end(skb, nest); @@ -975,7 +977,8 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, if (nla_put_string(skb, NFTA_CHAIN_TABLE, table->name)) goto nla_put_failure; - if (nla_put_be64(skb, NFTA_CHAIN_HANDLE, cpu_to_be64(chain->handle))) + if (nla_put_be64(skb, NFTA_CHAIN_HANDLE, cpu_to_be64(chain->handle), + NFTA_CHAIN_PAD)) goto nla_put_failure; if (nla_put_string(skb, NFTA_CHAIN_NAME, chain->name)) goto nla_put_failure; @@ -1721,9 +1724,11 @@ struct nft_expr *nft_expr_init(const struct nft_ctx *ctx, err = nf_tables_newexpr(ctx, &info, expr); if (err < 0) - goto err2; + goto err3; return expr; +err3: + kfree(expr); err2: module_put(info.ops->type->owner); err1: @@ -1803,13 +1808,15 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, goto nla_put_failure; if (nla_put_string(skb, NFTA_RULE_CHAIN, chain->name)) goto nla_put_failure; - if (nla_put_be64(skb, NFTA_RULE_HANDLE, cpu_to_be64(rule->handle))) + if (nla_put_be64(skb, NFTA_RULE_HANDLE, cpu_to_be64(rule->handle), + NFTA_RULE_PAD)) goto nla_put_failure; if ((event != NFT_MSG_DELRULE) && (rule->list.prev != &chain->rules)) { prule = list_entry(rule->list.prev, struct nft_rule, list); if (nla_put_be64(skb, NFTA_RULE_POSITION, - cpu_to_be64(prule->handle))) + cpu_to_be64(prule->handle), + NFTA_RULE_PAD)) goto nla_put_failure; } @@ -2312,7 +2319,7 @@ nft_select_set_ops(const struct nlattr * const nla[], static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = { [NFTA_SET_TABLE] = { .type = NLA_STRING }, [NFTA_SET_NAME] = { .type = NLA_STRING, - .len = IFNAMSIZ - 1 }, + .len = NFT_SET_MAXNAMELEN - 1 }, [NFTA_SET_FLAGS] = { .type = NLA_U32 }, [NFTA_SET_KEY_TYPE] = { .type = NLA_U32 }, [NFTA_SET_KEY_LEN] = { .type = NLA_U32 }, @@ -2396,7 +2403,7 @@ static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set, unsigned long *inuse; unsigned int n = 0, min = 0; - p = strnchr(name, IFNAMSIZ, '%'); + p = strnchr(name, NFT_SET_MAXNAMELEN, '%'); if (p != NULL) { if (p[1] != 'd' || strchr(p + 2, '%')) return -EINVAL; @@ -2473,7 +2480,8 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, } if (set->timeout && - nla_put_be64(skb, NFTA_SET_TIMEOUT, cpu_to_be64(set->timeout))) + nla_put_be64(skb, NFTA_SET_TIMEOUT, cpu_to_be64(set->timeout), + NFTA_SET_PAD)) goto nla_put_failure; if (set->gc_int && nla_put_be32(skb, NFTA_SET_GC_INTERVAL, htonl(set->gc_int))) @@ -2641,6 +2649,8 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk, /* Only accept unspec with dump */ if (nfmsg->nfgen_family == NFPROTO_UNSPEC) return -EAFNOSUPPORT; + if (!nla[NFTA_SET_TABLE]) + return -EINVAL; set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME]); if (IS_ERR(set)) @@ -2690,7 +2700,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, struct nft_table *table; struct nft_set *set; struct nft_ctx ctx; - char name[IFNAMSIZ]; + char name[NFT_SET_MAXNAMELEN]; unsigned int size; bool create; u64 timeout; @@ -2938,24 +2948,20 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, * jumps are already validated for that chain. */ list_for_each_entry(i, &set->bindings, list) { - if (binding->flags & NFT_SET_MAP && + if (i->flags & NFT_SET_MAP && i->chain == binding->chain) goto bind; } + iter.genmask = nft_genmask_next(ctx->net); iter.skip = 0; iter.count = 0; iter.err = 0; iter.fn = nf_tables_bind_check_setelem; set->ops->walk(ctx, set, &iter); - if (iter.err < 0) { - /* Destroy anonymous sets if binding fails */ - if (set->flags & NFT_SET_ANONYMOUS) - nf_tables_set_destroy(ctx, set); - + if (iter.err < 0) return iter.err; - } } bind: binding->chain = ctx->chain; @@ -3076,7 +3082,8 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) && nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT, - cpu_to_be64(*nft_set_ext_timeout(ext)))) + cpu_to_be64(*nft_set_ext_timeout(ext)), + NFTA_SET_ELEM_PAD)) goto nla_put_failure; if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { @@ -3089,7 +3096,8 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, expires = 0; if (nla_put_be64(skb, NFTA_SET_ELEM_EXPIRATION, - cpu_to_be64(jiffies_to_msecs(expires)))) + cpu_to_be64(jiffies_to_msecs(expires)), + NFTA_SET_ELEM_PAD)) goto nla_put_failure; } @@ -3182,12 +3190,13 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) if (nest == NULL) goto nla_put_failure; - args.cb = cb; - args.skb = skb; - args.iter.skip = cb->args[0]; - args.iter.count = 0; - args.iter.err = 0; - args.iter.fn = nf_tables_dump_setelem; + args.cb = cb; + args.skb = skb; + args.iter.genmask = nft_genmask_cur(ctx.net); + args.iter.skip = cb->args[0]; + args.iter.count = 0; + args.iter.err = 0; + args.iter.fn = nf_tables_dump_setelem; set->ops->walk(&ctx, set, &args.iter); nla_nest_end(skb, nest); @@ -3367,6 +3376,22 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem) } EXPORT_SYMBOL_GPL(nft_set_elem_destroy); +static int nft_setelem_parse_flags(const struct nft_set *set, + const struct nlattr *attr, u32 *flags) +{ + if (attr == NULL) + return 0; + + *flags = ntohl(nla_get_be32(attr)); + if (*flags & ~NFT_SET_ELEM_INTERVAL_END) + return -EINVAL; + if (!(set->flags & NFT_SET_INTERVAL) && + *flags & NFT_SET_ELEM_INTERVAL_END) + return -EINVAL; + + return 0; +} + static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr) { @@ -3380,8 +3405,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, struct nft_data data; enum nft_registers dreg; struct nft_trans *trans; + u32 flags = 0; u64 timeout; - u32 flags; u8 ulen; int err; @@ -3395,17 +3420,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, nft_set_ext_prepare(&tmpl); - flags = 0; - if (nla[NFTA_SET_ELEM_FLAGS] != NULL) { - flags = ntohl(nla_get_be32(nla[NFTA_SET_ELEM_FLAGS])); - if (flags & ~NFT_SET_ELEM_INTERVAL_END) - return -EINVAL; - if (!(set->flags & NFT_SET_INTERVAL) && - flags & NFT_SET_ELEM_INTERVAL_END) - return -EINVAL; - if (flags != 0) - nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS); - } + err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags); + if (err < 0) + return err; + if (flags != 0) + nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS); if (set->flags & NFT_SET_MAP) { if (nla[NFTA_SET_ELEM_DATA] == NULL && @@ -3574,9 +3593,13 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr) { struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; + struct nft_set_ext_tmpl tmpl; struct nft_data_desc desc; struct nft_set_elem elem; + struct nft_set_ext *ext; struct nft_trans *trans; + u32 flags = 0; + void *priv; int err; err = nla_parse_nested(nla, NFTA_SET_ELEM_MAX, attr, @@ -3588,6 +3611,14 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, if (nla[NFTA_SET_ELEM_KEY] == NULL) goto err1; + nft_set_ext_prepare(&tmpl); + + err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags); + if (err < 0) + return err; + if (flags != 0) + nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS); + err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &desc, nla[NFTA_SET_ELEM_KEY]); if (err < 0) @@ -3597,24 +3628,40 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, if (desc.type != NFT_DATA_VALUE || desc.len != set->klen) goto err2; + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, desc.len); + + err = -ENOMEM; + elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, NULL, 0, + GFP_KERNEL); + if (elem.priv == NULL) + goto err2; + + ext = nft_set_elem_ext(set, elem.priv); + if (flags) + *nft_set_ext_flags(ext) = flags; + trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set); if (trans == NULL) { err = -ENOMEM; - goto err2; + goto err3; } - elem.priv = set->ops->deactivate(set, &elem); - if (elem.priv == NULL) { + priv = set->ops->deactivate(set, &elem); + if (priv == NULL) { err = -ENOENT; - goto err3; + goto err4; } + kfree(elem.priv); + elem.priv = priv; nft_trans_elem(trans) = elem; list_add_tail(&trans->list, &ctx->net->nft.commit_list); return 0; -err3: +err4: kfree(trans); +err3: + kfree(elem.priv); err2: nft_data_uninit(&elem.key.val, desc.type); err1: @@ -4236,6 +4283,7 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx, binding->chain != chain) continue; + iter.genmask = nft_genmask_next(ctx->net); iter.skip = 0; iter.count = 0; iter.err = 0; diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index e9f8dffcc..fb8b5892b 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -143,7 +143,7 @@ next_rule: list_for_each_entry_continue_rcu(rule, &chain->rules, list) { /* This rule is not active, skip. */ - if (unlikely(rule->genmask & (1 << gencursor))) + if (unlikely(rule->genmask & gencursor)) continue; rulenum++; diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c index e9e959f65..39eb1cc62 100644 --- a/net/netfilter/nf_tables_trace.c +++ b/net/netfilter/nf_tables_trace.c @@ -156,7 +156,8 @@ static int nf_trace_fill_rule_info(struct sk_buff *nlskb, return 0; return nla_put_be64(nlskb, NFTA_TRACE_RULE_HANDLE, - cpu_to_be64(info->rule->handle)); + cpu_to_be64(info->rule->handle), + NFTA_TRACE_PAD); } void nft_trace_notify(struct nft_traceinfo *info) @@ -174,7 +175,7 @@ void nft_trace_notify(struct nft_traceinfo *info) size = nlmsg_total_size(sizeof(struct nfgenmsg)) + nla_total_size(NFT_TABLE_MAXNAMELEN) + nla_total_size(NFT_CHAIN_MAXNAMELEN) + - nla_total_size(sizeof(__be64)) + /* rule handle */ + nla_total_size_64bit(sizeof(__be64)) + /* rule handle */ nla_total_size(sizeof(__be32)) + /* trace type */ nla_total_size(0) + /* VERDICT, nested */ nla_total_size(sizeof(u32)) + /* verdict code */ diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index dbd0803b1..1b4de4bd6 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -162,15 +162,18 @@ nfnl_acct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, pkts = atomic64_read(&acct->pkts); bytes = atomic64_read(&acct->bytes); } - if (nla_put_be64(skb, NFACCT_PKTS, cpu_to_be64(pkts)) || - nla_put_be64(skb, NFACCT_BYTES, cpu_to_be64(bytes)) || + if (nla_put_be64(skb, NFACCT_PKTS, cpu_to_be64(pkts), + NFACCT_PAD) || + nla_put_be64(skb, NFACCT_BYTES, cpu_to_be64(bytes), + NFACCT_PAD) || nla_put_be32(skb, NFACCT_USE, htonl(atomic_read(&acct->refcnt)))) goto nla_put_failure; if (acct->flags & NFACCT_F_QUOTA) { u64 *quota = (u64 *)acct->data; if (nla_put_be32(skb, NFACCT_FLAGS, htonl(old_flags)) || - nla_put_be64(skb, NFACCT_QUOTA, cpu_to_be64(*quota))) + nla_put_be64(skb, NFACCT_QUOTA, cpu_to_be64(*quota), + NFACCT_PAD)) goto nla_put_failure; } nlmsg_end(skb, nlh); diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 2671b9deb..3c84f1432 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -306,10 +306,10 @@ static void ctnl_untimeout(struct net *net, struct ctnl_timeout *timeout) int i; local_bh_disable(); - for (i = 0; i < net->ct.htable_size; i++) { + for (i = 0; i < nf_conntrack_htable_size; i++) { nf_conntrack_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); - if (i < net->ct.htable_size) { - hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode) + if (i < nf_conntrack_htable_size) { + hlist_nulls_for_each_entry(h, nn, &nf_conntrack_hash[i], hnnode) untimeout(h, timeout); } spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index cb5b630a6..5d36a0926 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -295,6 +295,59 @@ static u32 nfqnl_get_sk_secctx(struct sk_buff *skb, char **secdata) return seclen; } +static u32 nfqnl_get_bridge_size(struct nf_queue_entry *entry) +{ + struct sk_buff *entskb = entry->skb; + u32 nlalen = 0; + + if (entry->state.pf != PF_BRIDGE || !skb_mac_header_was_set(entskb)) + return 0; + + if (skb_vlan_tag_present(entskb)) + nlalen += nla_total_size(nla_total_size(sizeof(__be16)) + + nla_total_size(sizeof(__be16))); + + if (entskb->network_header > entskb->mac_header) + nlalen += nla_total_size((entskb->network_header - + entskb->mac_header)); + + return nlalen; +} + +static int nfqnl_put_bridge(struct nf_queue_entry *entry, struct sk_buff *skb) +{ + struct sk_buff *entskb = entry->skb; + + if (entry->state.pf != PF_BRIDGE || !skb_mac_header_was_set(entskb)) + return 0; + + if (skb_vlan_tag_present(entskb)) { + struct nlattr *nest; + + nest = nla_nest_start(skb, NFQA_VLAN | NLA_F_NESTED); + if (!nest) + goto nla_put_failure; + + if (nla_put_be16(skb, NFQA_VLAN_TCI, htons(entskb->vlan_tci)) || + nla_put_be16(skb, NFQA_VLAN_PROTO, entskb->vlan_proto)) + goto nla_put_failure; + + nla_nest_end(skb, nest); + } + + if (entskb->mac_header < entskb->network_header) { + int len = (int)(entskb->network_header - entskb->mac_header); + + if (nla_put(skb, NFQA_L2HDR, len, skb_mac_header(entskb))) + goto nla_put_failure; + } + + return 0; + +nla_put_failure: + return -1; +} + static struct sk_buff * nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, struct nf_queue_entry *entry, @@ -334,6 +387,8 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, if (entskb->tstamp.tv64) size += nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp)); + size += nfqnl_get_bridge_size(entry); + if (entry->state.hook <= NF_INET_FORWARD || (entry->state.hook == NF_INET_POST_ROUTING && entskb->sk == NULL)) csum_verify = !skb_csum_unnecessary(entskb); @@ -497,9 +552,12 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, } } + if (nfqnl_put_bridge(entry, skb) < 0) + goto nla_put_failure; + if (entskb->tstamp.tv64) { struct nfqnl_msg_packet_timestamp ts; - struct timespec64 kts = ktime_to_timespec64(skb->tstamp); + struct timespec64 kts = ktime_to_timespec64(entskb->tstamp); ts.sec = cpu_to_be64(kts.tv_sec); ts.usec = cpu_to_be64(kts.tv_nsec / NSEC_PER_USEC); @@ -911,12 +969,18 @@ static struct notifier_block nfqnl_rtnl_notifier = { .notifier_call = nfqnl_rcv_nl_event, }; +static const struct nla_policy nfqa_vlan_policy[NFQA_VLAN_MAX + 1] = { + [NFQA_VLAN_TCI] = { .type = NLA_U16}, + [NFQA_VLAN_PROTO] = { .type = NLA_U16}, +}; + static const struct nla_policy nfqa_verdict_policy[NFQA_MAX+1] = { [NFQA_VERDICT_HDR] = { .len = sizeof(struct nfqnl_msg_verdict_hdr) }, [NFQA_MARK] = { .type = NLA_U32 }, [NFQA_PAYLOAD] = { .type = NLA_UNSPEC }, [NFQA_CT] = { .type = NLA_UNSPEC }, [NFQA_EXP] = { .type = NLA_UNSPEC }, + [NFQA_VLAN] = { .type = NLA_NESTED }, }; static const struct nla_policy nfqa_verdict_batch_policy[NFQA_MAX+1] = { @@ -1030,6 +1094,40 @@ static struct nf_conn *nfqnl_ct_parse(struct nfnl_ct_hook *nfnl_ct, return ct; } +static int nfqa_parse_bridge(struct nf_queue_entry *entry, + const struct nlattr * const nfqa[]) +{ + if (nfqa[NFQA_VLAN]) { + struct nlattr *tb[NFQA_VLAN_MAX + 1]; + int err; + + err = nla_parse_nested(tb, NFQA_VLAN_MAX, nfqa[NFQA_VLAN], + nfqa_vlan_policy); + if (err < 0) + return err; + + if (!tb[NFQA_VLAN_TCI] || !tb[NFQA_VLAN_PROTO]) + return -EINVAL; + + entry->skb->vlan_tci = ntohs(nla_get_be16(tb[NFQA_VLAN_TCI])); + entry->skb->vlan_proto = nla_get_be16(tb[NFQA_VLAN_PROTO]); + } + + if (nfqa[NFQA_L2HDR]) { + int mac_header_len = entry->skb->network_header - + entry->skb->mac_header; + + if (mac_header_len != nla_len(nfqa[NFQA_L2HDR])) + return -EINVAL; + else if (mac_header_len > 0) + memcpy(skb_mac_header(entry->skb), + nla_data(nfqa[NFQA_L2HDR]), + mac_header_len); + } + + return 0; +} + static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, @@ -1045,6 +1143,7 @@ static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl, struct nfnl_ct_hook *nfnl_ct; struct nf_conn *ct = NULL; struct nfnl_queue_net *q = nfnl_queue_pernet(net); + int err; queue = instance_lookup(q, queue_num); if (!queue) @@ -1071,6 +1170,12 @@ static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl, ct = nfqnl_ct_parse(nfnl_ct, nlh, nfqa, entry, &ctinfo); } + if (entry->state.pf == PF_BRIDGE) { + err = nfqa_parse_bridge(entry, nfqa); + if (err < 0) + return err; + } + if (nfqa[NFQA_PAYLOAD]) { u16 payload_len = nla_len(nfqa[NFQA_PAYLOAD]); int diff = payload_len - entry->skb->len; @@ -1377,21 +1482,29 @@ static int __net_init nfnl_queue_net_init(struct net *net) net->nf.proc_netfilter, &nfqnl_file_ops)) return -ENOMEM; #endif + nf_register_queue_handler(net, &nfqh); return 0; } static void __net_exit nfnl_queue_net_exit(struct net *net) { + nf_unregister_queue_handler(net); #ifdef CONFIG_PROC_FS remove_proc_entry("nfnetlink_queue", net->nf.proc_netfilter); #endif } +static void nfnl_queue_net_exit_batch(struct list_head *net_exit_list) +{ + synchronize_rcu(); +} + static struct pernet_operations nfnl_queue_net_ops = { - .init = nfnl_queue_net_init, - .exit = nfnl_queue_net_exit, - .id = &nfnl_queue_net_id, - .size = sizeof(struct nfnl_queue_net), + .init = nfnl_queue_net_init, + .exit = nfnl_queue_net_exit, + .exit_batch = nfnl_queue_net_exit_batch, + .id = &nfnl_queue_net_id, + .size = sizeof(struct nfnl_queue_net), }; static int __init nfnetlink_queue_init(void) @@ -1412,7 +1525,6 @@ static int __init nfnetlink_queue_init(void) } register_netdevice_notifier(&nfqnl_dev_notifier); - nf_register_queue_handler(&nfqh); return status; cleanup_netlink_notifier: @@ -1424,7 +1536,6 @@ out: static void __exit nfnetlink_queue_fini(void) { - nf_unregister_queue_handler(); unregister_netdevice_notifier(&nfqnl_dev_notifier); nfnetlink_subsys_unregister(&nfqnl_subsys); netlink_unregister_notifier(&nfqnl_rtnl_notifier); diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c index c9743f78f..77db8358a 100644 --- a/net/netfilter/nft_counter.c +++ b/net/netfilter/nft_counter.c @@ -76,8 +76,10 @@ static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr) nft_counter_fetch(priv->counter, &total); - if (nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(total.bytes)) || - nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(total.packets))) + if (nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(total.bytes), + NFTA_COUNTER_PAD) || + nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(total.packets), + NFTA_COUNTER_PAD)) goto nla_put_failure; return 0; diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index d4a4619fc..81fbb4507 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -54,7 +54,6 @@ static void nft_ct_get_eval(const struct nft_expr *expr, const struct nf_conn_help *help; const struct nf_conntrack_tuple *tuple; const struct nf_conntrack_helper *helper; - long diff; unsigned int state; ct = nf_ct_get(pkt->skb, &ctinfo); @@ -94,10 +93,7 @@ static void nft_ct_get_eval(const struct nft_expr *expr, return; #endif case NFT_CT_EXPIRATION: - diff = (long)jiffies - (long)ct->timeout.expires; - if (diff < 0) - diff = 0; - *dest = jiffies_to_msecs(diff); + *dest = jiffies_to_msecs(nf_ct_expires(ct)); return; case NFT_CT_HELPER: if (ct->master == NULL) @@ -198,6 +194,14 @@ static void nft_ct_set_eval(const struct nft_expr *expr, } break; #endif +#ifdef CONFIG_NF_CONNTRACK_LABELS + case NFT_CT_LABELS: + nf_connlabels_replace(ct, + ®s->data[priv->sreg], + ®s->data[priv->sreg], + NF_CT_LABELS_MAX_SIZE / sizeof(u32)); + break; +#endif default: break; } @@ -365,6 +369,16 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, len = FIELD_SIZEOF(struct nf_conn, mark); break; #endif +#ifdef CONFIG_NF_CONNTRACK_LABELS + case NFT_CT_LABELS: + if (tb[NFTA_CT_DIRECTION]) + return -EINVAL; + len = NF_CT_LABELS_MAX_SIZE; + err = nf_connlabels_get(ctx->net, (len * BITS_PER_BYTE) - 1); + if (err) + return err; + break; +#endif default: return -EOPNOTSUPP; } @@ -384,6 +398,18 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, static void nft_ct_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { + struct nft_ct *priv = nft_expr_priv(expr); + + switch (priv->key) { +#ifdef CONFIG_NF_CONNTRACK_LABELS + case NFT_CT_LABELS: + nf_connlabels_put(ctx->net); + break; +#endif + default: + break; + } + nft_ct_l3proto_module_put(ctx->afi->family); } @@ -484,6 +510,8 @@ static struct nft_expr_type nft_ct_type __read_mostly = { static int __init nft_ct_module_init(void) { + BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE > NFT_REG_SIZE); + return nft_register_expr(&nft_ct_type); } diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 9dec3bd1b..78d4914fb 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -227,7 +227,8 @@ static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr) goto nla_put_failure; if (nla_put_string(skb, NFTA_DYNSET_SET_NAME, priv->set->name)) goto nla_put_failure; - if (nla_put_be64(skb, NFTA_DYNSET_TIMEOUT, cpu_to_be64(priv->timeout))) + if (nla_put_be64(skb, NFTA_DYNSET_TIMEOUT, cpu_to_be64(priv->timeout), + NFTA_DYNSET_PAD)) goto nla_put_failure; if (priv->expr && nft_expr_dump(skb, NFTA_DYNSET_EXPR, priv->expr)) goto nla_put_failure; diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index 3f9d45d3d..f39c53a15 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -189,10 +189,9 @@ static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set, struct nft_hash_elem *he; struct rhashtable_iter hti; struct nft_set_elem elem; - u8 genmask = nft_genmask_cur(read_pnet(&set->pnet)); int err; - err = rhashtable_walk_init(&priv->ht, &hti); + err = rhashtable_walk_init(&priv->ht, &hti, GFP_KERNEL); iter->err = err; if (err) return; @@ -218,7 +217,7 @@ static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set, goto cont; if (nft_set_elem_expired(&he->ext)) goto cont; - if (!nft_set_elem_active(&he->ext, genmask)) + if (!nft_set_elem_active(&he->ext, iter->genmask)) goto cont; elem.priv = he; @@ -248,7 +247,7 @@ static void nft_hash_gc(struct work_struct *work) priv = container_of(work, struct nft_hash, gc_work.work); set = nft_set_container_of(priv); - err = rhashtable_walk_init(&priv->ht, &hti); + err = rhashtable_walk_init(&priv->ht, &hti, GFP_KERNEL); if (err) goto schedule; diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c index 99d18578a..070b98938 100644 --- a/net/netfilter/nft_limit.c +++ b/net/netfilter/nft_limit.c @@ -97,8 +97,10 @@ static int nft_limit_dump(struct sk_buff *skb, const struct nft_limit *limit, u64 secs = div_u64(limit->nsecs, NSEC_PER_SEC); u64 rate = limit->rate - limit->burst; - if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(rate)) || - nla_put_be64(skb, NFTA_LIMIT_UNIT, cpu_to_be64(secs)) || + if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(rate), + NFTA_LIMIT_PAD) || + nla_put_be64(skb, NFTA_LIMIT_UNIT, cpu_to_be64(secs), + NFTA_LIMIT_PAD) || nla_put_be32(skb, NFTA_LIMIT_BURST, htonl(limit->burst)) || nla_put_be32(skb, NFTA_LIMIT_TYPE, htonl(type)) || nla_put_be32(skb, NFTA_LIMIT_FLAGS, htonl(flags))) diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 16c50b0dd..f4bad9dc1 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -227,7 +227,7 @@ void nft_meta_set_eval(const struct nft_expr *expr, skb->pkt_type = value; break; case NFT_META_NFTRACE: - skb->nf_trace = 1; + skb->nf_trace = !!value; break; default: WARN_ON(1); diff --git a/net/netfilter/nft_rbtree.c b/net/netfilter/nft_rbtree.c index 1c30f41cf..7201d57b5 100644 --- a/net/netfilter/nft_rbtree.c +++ b/net/netfilter/nft_rbtree.c @@ -29,6 +29,17 @@ struct nft_rbtree_elem { struct nft_set_ext ext; }; +static bool nft_rbtree_interval_end(const struct nft_rbtree_elem *rbe) +{ + return nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) && + (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END); +} + +static bool nft_rbtree_equal(const struct nft_set *set, const void *this, + const struct nft_rbtree_elem *interval) +{ + return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0; +} static bool nft_rbtree_lookup(const struct nft_set *set, const u32 *key, const struct nft_set_ext **ext) @@ -37,6 +48,7 @@ static bool nft_rbtree_lookup(const struct nft_set *set, const u32 *key, const struct nft_rbtree_elem *rbe, *interval = NULL; const struct rb_node *parent; u8 genmask = nft_genmask_cur(read_pnet(&set->pnet)); + const void *this; int d; spin_lock_bh(&nft_rbtree_lock); @@ -44,9 +56,16 @@ static bool nft_rbtree_lookup(const struct nft_set *set, const u32 *key, while (parent != NULL) { rbe = rb_entry(parent, struct nft_rbtree_elem, node); - d = memcmp(nft_set_ext_key(&rbe->ext), key, set->klen); + this = nft_set_ext_key(&rbe->ext); + d = memcmp(this, key, set->klen); if (d < 0) { parent = parent->rb_left; + /* In case of adjacent ranges, we always see the high + * part of the range in first place, before the low one. + * So don't update interval if the keys are equal. + */ + if (interval && nft_rbtree_equal(set, this, interval)) + continue; interval = rbe; } else if (d > 0) parent = parent->rb_right; @@ -56,9 +75,7 @@ found: parent = parent->rb_left; continue; } - if (nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) && - *nft_set_ext_flags(&rbe->ext) & - NFT_SET_ELEM_INTERVAL_END) + if (nft_rbtree_interval_end(rbe)) goto out; spin_unlock_bh(&nft_rbtree_lock); @@ -98,9 +115,16 @@ static int __nft_rbtree_insert(const struct nft_set *set, else if (d > 0) p = &parent->rb_right; else { - if (nft_set_elem_active(&rbe->ext, genmask)) - return -EEXIST; - p = &parent->rb_left; + if (nft_set_elem_active(&rbe->ext, genmask)) { + if (nft_rbtree_interval_end(rbe) && + !nft_rbtree_interval_end(new)) + p = &parent->rb_left; + else if (!nft_rbtree_interval_end(rbe) && + nft_rbtree_interval_end(new)) + p = &parent->rb_right; + else + return -EEXIST; + } } } rb_link_node(&new->node, parent, p); @@ -145,7 +169,7 @@ static void *nft_rbtree_deactivate(const struct nft_set *set, { const struct nft_rbtree *priv = nft_set_priv(set); const struct rb_node *parent = priv->root.rb_node; - struct nft_rbtree_elem *rbe; + struct nft_rbtree_elem *rbe, *this = elem->priv; u8 genmask = nft_genmask_cur(read_pnet(&set->pnet)); int d; @@ -163,6 +187,15 @@ static void *nft_rbtree_deactivate(const struct nft_set *set, parent = parent->rb_left; continue; } + if (nft_rbtree_interval_end(rbe) && + !nft_rbtree_interval_end(this)) { + parent = parent->rb_left; + continue; + } else if (!nft_rbtree_interval_end(rbe) && + nft_rbtree_interval_end(this)) { + parent = parent->rb_right; + continue; + } nft_set_elem_change_active(set, &rbe->ext); return rbe; } @@ -178,7 +211,6 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, struct nft_rbtree_elem *rbe; struct nft_set_elem elem; struct rb_node *node; - u8 genmask = nft_genmask_cur(read_pnet(&set->pnet)); spin_lock_bh(&nft_rbtree_lock); for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) { @@ -186,7 +218,7 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, if (iter->count < iter->skip) goto cont; - if (!nft_set_elem_active(&rbe->ext, genmask)) + if (!nft_set_elem_active(&rbe->ext, iter->genmask)) goto cont; elem.priv = rbe; diff --git a/net/netfilter/xt_connlabel.c b/net/netfilter/xt_connlabel.c index bb9cbeb18..a79af2555 100644 --- a/net/netfilter/xt_connlabel.c +++ b/net/netfilter/xt_connlabel.c @@ -18,6 +18,16 @@ MODULE_DESCRIPTION("Xtables: add/match connection trackling labels"); MODULE_ALIAS("ipt_connlabel"); MODULE_ALIAS("ip6t_connlabel"); +static bool connlabel_match(const struct nf_conn *ct, u16 bit) +{ + struct nf_conn_labels *labels = nf_ct_labels_find(ct); + + if (!labels) + return false; + + return BIT_WORD(bit) < labels->words && test_bit(bit, labels->bits); +} + static bool connlabel_mt(const struct sk_buff *skb, struct xt_action_param *par) { @@ -33,7 +43,7 @@ connlabel_mt(const struct sk_buff *skb, struct xt_action_param *par) if (info->options & XT_CONNLABEL_OP_SET) return (nf_connlabel_set(ct, info->bit) == 0) ^ invert; - return nf_connlabel_match(ct, info->bit) ^ invert; + return connlabel_match(ct, info->bit) ^ invert; } static int connlabel_mt_check(const struct xt_mtchk_param *par) @@ -55,7 +65,7 @@ static int connlabel_mt_check(const struct xt_mtchk_param *par) return ret; } - ret = nf_connlabels_get(par->net, info->bit + 1); + ret = nf_connlabels_get(par->net, info->bit); if (ret < 0) nf_ct_l3proto_module_put(par->family); return ret; diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 49d14ecad..b10ade272 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -120,9 +120,9 @@ xt_socket_get_sock_v4(struct net *net, struct sk_buff *skb, const int doff, { switch (protocol) { case IPPROTO_TCP: - return __inet_lookup(net, &tcp_hashinfo, skb, doff, - saddr, sport, daddr, dport, - in->ifindex); + return inet_lookup(net, &tcp_hashinfo, skb, doff, + saddr, sport, daddr, dport, + in->ifindex); case IPPROTO_UDP: return udp4_lib_lookup(net, saddr, sport, daddr, dport, in->ifindex); diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c index 28cddc85b..1325776da 100644 --- a/net/netlabel/netlabel_kapi.c +++ b/net/netlabel/netlabel_kapi.c @@ -677,7 +677,7 @@ int netlbl_catmap_setrng(struct netlbl_lsm_catmap **catmap, u32 spot = start; while (rc == 0 && spot <= end) { - if (((spot & (BITS_PER_LONG - 1)) != 0) && + if (((spot & (BITS_PER_LONG - 1)) == 0) && ((end - spot) > BITS_PER_LONG)) { rc = netlbl_catmap_setlong(catmap, spot, diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index f48e3b3ae..627f898c0 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2346,7 +2346,8 @@ static int netlink_walk_start(struct nl_seq_iter *iter) { int err; - err = rhashtable_walk_init(&nl_table[iter->link].hash, &iter->hti); + err = rhashtable_walk_init(&nl_table[iter->link].hash, &iter->hti, + GFP_KERNEL); if (err) { iter->link = MAX_LINKS; return err; diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index fbb7a2b57..61fff4224 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -64,18 +64,26 @@ struct nci_conn_info *nci_get_conn_info_by_conn_id(struct nci_dev *ndev, return NULL; } -int nci_get_conn_info_by_id(struct nci_dev *ndev, u8 id) +int nci_get_conn_info_by_dest_type_params(struct nci_dev *ndev, u8 dest_type, + struct dest_spec_params *params) { struct nci_conn_info *conn_info; list_for_each_entry(conn_info, &ndev->conn_info_list, list) { - if (conn_info->id == id) - return conn_info->conn_id; + if (conn_info->dest_type == dest_type) { + if (!params) + return conn_info->conn_id; + if (conn_info) { + if (params->id == conn_info->dest_params->id && + params->protocol == conn_info->dest_params->protocol) + return conn_info->conn_id; + } + } } return -EINVAL; } -EXPORT_SYMBOL(nci_get_conn_info_by_id); +EXPORT_SYMBOL(nci_get_conn_info_by_dest_type_params); /* ---- NCI requests ---- */ @@ -392,6 +400,83 @@ int nci_core_init(struct nci_dev *ndev) } EXPORT_SYMBOL(nci_core_init); +struct nci_loopback_data { + u8 conn_id; + struct sk_buff *data; +}; + +static void nci_send_data_req(struct nci_dev *ndev, unsigned long opt) +{ + struct nci_loopback_data *data = (struct nci_loopback_data *)opt; + + nci_send_data(ndev, data->conn_id, data->data); +} + +static void nci_nfcc_loopback_cb(void *context, struct sk_buff *skb, int err) +{ + struct nci_dev *ndev = (struct nci_dev *)context; + struct nci_conn_info *conn_info; + + conn_info = nci_get_conn_info_by_conn_id(ndev, ndev->cur_conn_id); + if (!conn_info) { + nci_req_complete(ndev, NCI_STATUS_REJECTED); + return; + } + + conn_info->rx_skb = skb; + + nci_req_complete(ndev, NCI_STATUS_OK); +} + +int nci_nfcc_loopback(struct nci_dev *ndev, void *data, size_t data_len, + struct sk_buff **resp) +{ + int r; + struct nci_loopback_data loopback_data; + struct nci_conn_info *conn_info; + struct sk_buff *skb; + int conn_id = nci_get_conn_info_by_dest_type_params(ndev, + NCI_DESTINATION_NFCC_LOOPBACK, NULL); + + if (conn_id < 0) { + r = nci_core_conn_create(ndev, NCI_DESTINATION_NFCC_LOOPBACK, + 0, 0, NULL); + if (r != NCI_STATUS_OK) + return r; + + conn_id = nci_get_conn_info_by_dest_type_params(ndev, + NCI_DESTINATION_NFCC_LOOPBACK, + NULL); + } + + conn_info = nci_get_conn_info_by_conn_id(ndev, conn_id); + if (!conn_info) + return -EPROTO; + + /* store cb and context to be used on receiving data */ + conn_info->data_exchange_cb = nci_nfcc_loopback_cb; + conn_info->data_exchange_cb_context = ndev; + + skb = nci_skb_alloc(ndev, NCI_DATA_HDR_SIZE + data_len, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + skb_reserve(skb, NCI_DATA_HDR_SIZE); + memcpy(skb_put(skb, data_len), data, data_len); + + loopback_data.conn_id = conn_id; + loopback_data.data = skb; + + ndev->cur_conn_id = conn_id; + r = nci_request(ndev, nci_send_data_req, (unsigned long)&loopback_data, + msecs_to_jiffies(NCI_DATA_TIMEOUT)); + if (r == NCI_STATUS_OK && resp) + *resp = conn_info->rx_skb; + + return r; +} +EXPORT_SYMBOL(nci_nfcc_loopback); + static int nci_open_device(struct nci_dev *ndev) { int rc = 0; @@ -610,9 +695,6 @@ int nci_core_conn_create(struct nci_dev *ndev, u8 destination_type, struct nci_core_conn_create_cmd *cmd; struct core_conn_create_data data; - if (!number_destination_params) - return -EINVAL; - data.length = params_len + sizeof(struct nci_core_conn_create_cmd); cmd = kzalloc(data.length, GFP_KERNEL); if (!cmd) @@ -620,17 +702,23 @@ int nci_core_conn_create(struct nci_dev *ndev, u8 destination_type, cmd->destination_type = destination_type; cmd->number_destination_params = number_destination_params; - memcpy(cmd->params, params, params_len); data.cmd = cmd; - if (params->length > 0) - ndev->cur_id = params->value[DEST_SPEC_PARAMS_ID_INDEX]; - else - ndev->cur_id = 0; + if (params) { + memcpy(cmd->params, params, params_len); + if (params->length > 0) + memcpy(&ndev->cur_params, + ¶ms->value[DEST_SPEC_PARAMS_ID_INDEX], + sizeof(struct dest_spec_params)); + else + ndev->cur_params.id = 0; + } else { + ndev->cur_params.id = 0; + } + ndev->cur_dest_type = destination_type; - r = __nci_request(ndev, nci_core_conn_create_req, - (unsigned long)&data, + r = __nci_request(ndev, nci_core_conn_create_req, (unsigned long)&data, msecs_to_jiffies(NCI_CMD_TIMEOUT)); kfree(cmd); return r; @@ -646,6 +734,7 @@ static void nci_core_conn_close_req(struct nci_dev *ndev, unsigned long opt) int nci_core_conn_close(struct nci_dev *ndev, u8 conn_id) { + ndev->cur_conn_id = conn_id; return __nci_request(ndev, nci_core_conn_close_req, conn_id, msecs_to_jiffies(NCI_CMD_TIMEOUT)); } diff --git a/net/nfc/nci/ntf.c b/net/nfc/nci/ntf.c index 2ada2b39e..1e8c1a12a 100644 --- a/net/nfc/nci/ntf.c +++ b/net/nfc/nci/ntf.c @@ -734,7 +734,7 @@ static void nci_nfcee_discover_ntf_packet(struct nci_dev *ndev, * “HCI Access”, even if the HCI Network contains multiple NFCEEs. */ ndev->hci_dev->nfcee_id = nfcee_ntf->nfcee_id; - ndev->cur_id = nfcee_ntf->nfcee_id; + ndev->cur_params.id = nfcee_ntf->nfcee_id; nci_req_complete(ndev, status); } diff --git a/net/nfc/nci/rsp.c b/net/nfc/nci/rsp.c index 9b6eb913d..e3bbf1937 100644 --- a/net/nfc/nci/rsp.c +++ b/net/nfc/nci/rsp.c @@ -226,7 +226,7 @@ static void nci_core_conn_create_rsp_packet(struct nci_dev *ndev, struct sk_buff *skb) { __u8 status = skb->data[0]; - struct nci_conn_info *conn_info; + struct nci_conn_info *conn_info = NULL; struct nci_core_conn_create_rsp *rsp; pr_debug("status 0x%x\n", status); @@ -241,7 +241,17 @@ static void nci_core_conn_create_rsp_packet(struct nci_dev *ndev, goto exit; } - conn_info->id = ndev->cur_id; + conn_info->dest_params = devm_kzalloc(&ndev->nfc_dev->dev, + sizeof(struct dest_spec_params), + GFP_KERNEL); + if (!conn_info->dest_params) { + status = NCI_STATUS_REJECTED; + goto free_conn_info; + } + + conn_info->dest_type = ndev->cur_dest_type; + conn_info->dest_params->id = ndev->cur_params.id; + conn_info->dest_params->protocol = ndev->cur_params.protocol; conn_info->conn_id = rsp->conn_id; /* Note: data_exchange_cb and data_exchange_cb_context need to @@ -251,7 +261,7 @@ static void nci_core_conn_create_rsp_packet(struct nci_dev *ndev, INIT_LIST_HEAD(&conn_info->list); list_add(&conn_info->list, &ndev->conn_info_list); - if (ndev->cur_id == ndev->hci_dev->nfcee_id) + if (ndev->cur_params.id == ndev->hci_dev->nfcee_id) ndev->hci_dev->conn_info = conn_info; conn_info->conn_id = rsp->conn_id; @@ -259,7 +269,11 @@ static void nci_core_conn_create_rsp_packet(struct nci_dev *ndev, atomic_set(&conn_info->credits_cnt, rsp->credits_cnt); } +free_conn_info: + if (status == NCI_STATUS_REJECTED) + devm_kfree(&ndev->nfc_dev->dev, conn_info); exit: + nci_req_complete(ndev, status); } @@ -271,7 +285,8 @@ static void nci_core_conn_close_rsp_packet(struct nci_dev *ndev, pr_debug("status 0x%x\n", status); if (status == NCI_STATUS_OK) { - conn_info = nci_get_conn_info_by_conn_id(ndev, ndev->cur_id); + conn_info = nci_get_conn_info_by_conn_id(ndev, + ndev->cur_conn_id); if (conn_info) { list_del(&conn_info->list); devm_kfree(&ndev->nfc_dev->dev, conn_info); diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 879185fe1..9a3eb7a0e 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -137,11 +137,23 @@ static bool is_flow_key_valid(const struct sw_flow_key *key) return !!key->eth.type; } +static void update_ethertype(struct sk_buff *skb, struct ethhdr *hdr, + __be16 ethertype) +{ + if (skb->ip_summed == CHECKSUM_COMPLETE) { + __be16 diff[] = { ~(hdr->h_proto), ethertype }; + + skb->csum = ~csum_partial((char *)diff, sizeof(diff), + ~skb->csum); + } + + hdr->h_proto = ethertype; +} + static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key, const struct ovs_action_push_mpls *mpls) { __be32 *new_mpls_lse; - struct ethhdr *hdr; /* Networking stack do not allow simultaneous Tunnel and MPLS GSO. */ if (skb->encapsulation) @@ -160,9 +172,7 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key, skb_postpush_rcsum(skb, new_mpls_lse, MPLS_HLEN); - hdr = eth_hdr(skb); - hdr->h_proto = mpls->mpls_ethertype; - + update_ethertype(skb, eth_hdr(skb), mpls->mpls_ethertype); if (!skb->inner_protocol) skb_set_inner_protocol(skb, skb->protocol); skb->protocol = mpls->mpls_ethertype; @@ -193,7 +203,7 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key, * field correctly in the presence of VLAN tags. */ hdr = (struct ethhdr *)(skb_mpls_header(skb) - ETH_HLEN); - hdr->h_proto = ethertype; + update_ethertype(skb, hdr, ethertype); if (eth_p_mpls(skb->protocol)) skb->protocol = ethertype; diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 10c84d882..d84312584 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -439,20 +439,12 @@ ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone, u8 protonum; l3proto = __nf_ct_l3proto_find(l3num); - if (!l3proto) { - pr_debug("ovs_ct_find_existing: Can't get l3proto\n"); - return NULL; - } if (l3proto->get_l4proto(skb, skb_network_offset(skb), &dataoff, &protonum) <= 0) { pr_debug("ovs_ct_find_existing: Can't get protonum\n"); return NULL; } l4proto = __nf_ct_l4proto_find(l3num, protonum); - if (!l4proto) { - pr_debug("ovs_ct_find_existing: Can't get l4proto\n"); - return NULL; - } if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num, protonum, net, &tuple, l3proto, l4proto)) { pr_debug("ovs_ct_find_existing: Can't get tuple\n"); @@ -826,8 +818,18 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key, */ state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED; __ovs_ct_update_key(key, state, &info->zone, exp->master); - } else - return __ovs_ct_lookup(net, key, info, skb); + } else { + struct nf_conn *ct; + int err; + + err = __ovs_ct_lookup(net, key, info, skb); + if (err) + return err; + + ct = (struct nf_conn *)skb->nfct; + if (ct) + nf_ct_deliver_cached_events(ct); + } return 0; } @@ -1358,7 +1360,7 @@ void ovs_ct_init(struct net *net) unsigned int n_bits = sizeof(struct ovs_key_ct_labels) * BITS_PER_BYTE; struct ovs_net *ovs_net = net_generic(net, ovs_net_id); - if (nf_connlabels_get(net, n_bits)) { + if (nf_connlabels_get(net, n_bits - 1)) { ovs_net->xt_label = false; OVS_NLERR(true, "Failed to set connlabel length"); } else { diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 0cc66a4e4..856bd8dba 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -738,9 +738,9 @@ static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts, len += nla_total_size(acts->orig_len); return len - + nla_total_size(sizeof(struct ovs_flow_stats)) /* OVS_FLOW_ATTR_STATS */ + + nla_total_size_64bit(sizeof(struct ovs_flow_stats)) /* OVS_FLOW_ATTR_STATS */ + nla_total_size(1) /* OVS_FLOW_ATTR_TCP_FLAGS */ - + nla_total_size(8); /* OVS_FLOW_ATTR_USED */ + + nla_total_size_64bit(8); /* OVS_FLOW_ATTR_USED */ } /* Called with ovs_mutex or RCU read lock. */ @@ -754,11 +754,14 @@ static int ovs_flow_cmd_fill_stats(const struct sw_flow *flow, ovs_flow_stats_get(flow, &stats, &used, &tcp_flags); if (used && - nla_put_u64(skb, OVS_FLOW_ATTR_USED, ovs_flow_used_time(used))) + nla_put_u64_64bit(skb, OVS_FLOW_ATTR_USED, ovs_flow_used_time(used), + OVS_FLOW_ATTR_PAD)) return -EMSGSIZE; if (stats.n_packets && - nla_put(skb, OVS_FLOW_ATTR_STATS, sizeof(struct ovs_flow_stats), &stats)) + nla_put_64bit(skb, OVS_FLOW_ATTR_STATS, + sizeof(struct ovs_flow_stats), &stats, + OVS_FLOW_ATTR_PAD)) return -EMSGSIZE; if ((u8)ntohs(tcp_flags) && @@ -1434,8 +1437,8 @@ static size_t ovs_dp_cmd_msg_size(void) size_t msgsize = NLMSG_ALIGN(sizeof(struct ovs_header)); msgsize += nla_total_size(IFNAMSIZ); - msgsize += nla_total_size(sizeof(struct ovs_dp_stats)); - msgsize += nla_total_size(sizeof(struct ovs_dp_megaflow_stats)); + msgsize += nla_total_size_64bit(sizeof(struct ovs_dp_stats)); + msgsize += nla_total_size_64bit(sizeof(struct ovs_dp_megaflow_stats)); msgsize += nla_total_size(sizeof(u32)); /* OVS_DP_ATTR_USER_FEATURES */ return msgsize; @@ -1462,13 +1465,13 @@ static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb, goto nla_put_failure; get_dp_stats(dp, &dp_stats, &dp_megaflow_stats); - if (nla_put(skb, OVS_DP_ATTR_STATS, sizeof(struct ovs_dp_stats), - &dp_stats)) + if (nla_put_64bit(skb, OVS_DP_ATTR_STATS, sizeof(struct ovs_dp_stats), + &dp_stats, OVS_DP_ATTR_PAD)) goto nla_put_failure; - if (nla_put(skb, OVS_DP_ATTR_MEGAFLOW_STATS, - sizeof(struct ovs_dp_megaflow_stats), - &dp_megaflow_stats)) + if (nla_put_64bit(skb, OVS_DP_ATTR_MEGAFLOW_STATS, + sizeof(struct ovs_dp_megaflow_stats), + &dp_megaflow_stats, OVS_DP_ATTR_PAD)) goto nla_put_failure; if (nla_put_u32(skb, OVS_DP_ATTR_USER_FEATURES, dp->user_features)) @@ -1837,8 +1840,9 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, goto nla_put_failure; ovs_vport_get_stats(vport, &vport_stats); - if (nla_put(skb, OVS_VPORT_ATTR_STATS, sizeof(struct ovs_vport_stats), - &vport_stats)) + if (nla_put_64bit(skb, OVS_VPORT_ATTR_STATS, + sizeof(struct ovs_vport_stats), &vport_stats, + OVS_VPORT_ATTR_PAD)) goto nla_put_failure; if (ovs_vport_get_upcall_portids(vport, skb)) diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 689c17264..0bb650f4f 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -261,7 +261,7 @@ size_t ovs_tun_key_attr_size(void) /* Whenever adding new OVS_TUNNEL_KEY_ FIELDS, we should consider * updating this function. */ - return nla_total_size(8) /* OVS_TUNNEL_KEY_ATTR_ID */ + return nla_total_size_64bit(8) /* OVS_TUNNEL_KEY_ATTR_ID */ + nla_total_size(16) /* OVS_TUNNEL_KEY_ATTR_IPV[46]_SRC */ + nla_total_size(16) /* OVS_TUNNEL_KEY_ATTR_IPV[46]_DST */ + nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TOS */ @@ -720,7 +720,8 @@ static int __ip_tun_to_nlattr(struct sk_buff *skb, unsigned short tun_proto) { if (output->tun_flags & TUNNEL_KEY && - nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id)) + nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id, + OVS_TUNNEL_KEY_ATTR_PAD)) return -EMSGSIZE; switch (tun_proto) { case AF_INET: diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index 7c8b90bf0..2ee48e447 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -165,11 +165,10 @@ static void do_setup(struct net_device *netdev) netdev->priv_flags &= ~IFF_TX_SKB_SHARING; netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH | - IFF_PHONY_HEADROOM; + IFF_PHONY_HEADROOM | IFF_NO_QUEUE; netdev->destructor = internal_dev_destructor; netdev->ethtool_ops = &internal_dev_ethtool_ops; netdev->rtnl_link_ops = &internal_dev_link_ops; - netdev->tx_queue_len = 0; netdev->features = NETIF_F_LLTX | NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA | NETIF_F_HW_CSUM | diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 8012f67ca..b43c4015b 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -93,6 +93,7 @@ #include <net/inet_common.h> #endif #include <linux/bpf.h> +#include <net/compat.h> #include "internal.h" @@ -1837,6 +1838,7 @@ static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg, DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name); struct sk_buff *skb = NULL; struct net_device *dev; + struct sockcm_cookie sockc; __be16 proto = 0; int err; int extra_len = 0; @@ -1925,12 +1927,19 @@ retry: goto out_unlock; } + sockc.tsflags = sk->sk_tsflags; + if (msg->msg_controllen) { + err = sock_cmsg_send(sk, msg, &sockc); + if (unlikely(err)) + goto out_unlock; + } + skb->protocol = proto; skb->dev = dev; skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; - sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags); + sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags); if (unlikely(extra_len == 4)) skb->no_fcs = 1; @@ -2042,6 +2051,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, u8 *skb_head = skb->data; int skb_len = skb->len; unsigned int snaplen, res; + bool is_drop_n_account = false; if (skb->pkt_type == PACKET_LOOPBACK) goto drop; @@ -2130,6 +2140,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, return 0; drop_n_acct: + is_drop_n_account = true; spin_lock(&sk->sk_receive_queue.lock); po->stats.stats1.tp_drops++; atomic_inc(&sk->sk_drops); @@ -2141,7 +2152,10 @@ drop_n_restore: skb->len = skb_len; } drop: - consume_skb(skb); + if (!is_drop_n_account) + consume_skb(skb); + else + kfree_skb(skb); return 0; } @@ -2160,6 +2174,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct sk_buff *copy_skb = NULL; struct timespec ts; __u32 ts_status; + bool is_drop_n_account = false; /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT. * We may add members to them until current aligned size without forcing @@ -2367,10 +2382,14 @@ drop_n_restore: skb->len = skb_len; } drop: - kfree_skb(skb); + if (!is_drop_n_account) + consume_skb(skb); + else + kfree_skb(skb); return 0; drop_n_account: + is_drop_n_account = true; po->stats.stats1.tp_drops++; spin_unlock(&sk->sk_receive_queue.lock); @@ -2486,7 +2505,8 @@ static int packet_snd_vnet_gso(struct sk_buff *skb, static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, void *frame, struct net_device *dev, void *data, int tp_len, - __be16 proto, unsigned char *addr, int hlen, int copylen) + __be16 proto, unsigned char *addr, int hlen, int copylen, + const struct sockcm_cookie *sockc) { union tpacket_uhdr ph; int to_write, offset, len, nr_frags, len_max; @@ -2500,7 +2520,7 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, skb->dev = dev; skb->priority = po->sk.sk_priority; skb->mark = po->sk.sk_mark; - sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags); + sock_tx_timestamp(&po->sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags); skb_shinfo(skb)->destructor_arg = ph.raw; skb_reserve(skb, hlen); @@ -2624,6 +2644,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) struct sk_buff *skb; struct net_device *dev; struct virtio_net_hdr *vnet_hdr = NULL; + struct sockcm_cookie sockc; __be16 proto; int err, reserve = 0; void *ph; @@ -2655,6 +2676,13 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex); } + sockc.tsflags = po->sk.sk_tsflags; + if (msg->msg_controllen) { + err = sock_cmsg_send(&po->sk, msg, &sockc); + if (unlikely(err)) + goto out; + } + err = -ENXIO; if (unlikely(dev == NULL)) goto out; @@ -2712,7 +2740,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) goto out_status; } tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto, - addr, hlen, copylen); + addr, hlen, copylen, &sockc); if (likely(tp_len >= 0) && tp_len > dev->mtu + reserve && !po->has_vnet_hdr && @@ -2851,6 +2879,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) if (unlikely(!(dev->flags & IFF_UP))) goto out_unlock; + sockc.tsflags = sk->sk_tsflags; sockc.mark = sk->sk_mark; if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); @@ -2908,7 +2937,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) goto out_free; } - sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags); + sock_tx_timestamp(sk, sockc.tsflags, &skb_shinfo(skb)->tx_flags); if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) && !packet_extra_vlan_len_allowed(dev, skb)) { @@ -3910,6 +3939,27 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, } +#ifdef CONFIG_COMPAT +static int compat_packet_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct packet_sock *po = pkt_sk(sock->sk); + + if (level != SOL_PACKET) + return -ENOPROTOOPT; + + if (optname == PACKET_FANOUT_DATA && + po->fanout && po->fanout->type == PACKET_FANOUT_CBPF) { + optval = (char __user *)get_compat_bpf_fprog(optval); + if (!optval) + return -EFAULT; + optlen = sizeof(struct sock_fprog); + } + + return packet_setsockopt(sock, level, optname, optval, optlen); +} +#endif + static int packet_notifier(struct notifier_block *this, unsigned long msg, void *ptr) { @@ -4386,6 +4436,9 @@ static const struct proto_ops packet_ops = { .shutdown = sock_no_shutdown, .setsockopt = packet_setsockopt, .getsockopt = packet_getsockopt, +#ifdef CONFIG_COMPAT + .compat_setsockopt = compat_packet_setsockopt, +#endif .sendmsg = packet_sendmsg, .recvmsg = packet_recvmsg, .mmap = packet_mmap, diff --git a/net/qrtr/Kconfig b/net/qrtr/Kconfig new file mode 100644 index 000000000..b83c6807a --- /dev/null +++ b/net/qrtr/Kconfig @@ -0,0 +1,24 @@ +# Qualcomm IPC Router configuration +# + +config QRTR + tristate "Qualcomm IPC Router support" + depends on ARCH_QCOM || COMPILE_TEST + ---help--- + Say Y if you intend to use Qualcomm IPC router protocol. The + protocol is used to communicate with services provided by other + hardware blocks in the system. + + In order to do service lookups, a userspace daemon is required to + maintain a service listing. + +if QRTR + +config QRTR_SMD + tristate "SMD IPC Router channels" + depends on QCOM_SMD || (COMPILE_TEST && QCOM_SMD=n) + ---help--- + Say Y here to support SMD based ipcrouter channels. SMD is the + most common transport for IPC Router. + +endif # QRTR diff --git a/net/qrtr/Makefile b/net/qrtr/Makefile new file mode 100644 index 000000000..ab09e40f7 --- /dev/null +++ b/net/qrtr/Makefile @@ -0,0 +1,4 @@ +obj-$(CONFIG_QRTR) := qrtr.o + +obj-$(CONFIG_QRTR_SMD) += qrtr-smd.o +qrtr-smd-y := smd.o diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c new file mode 100644 index 000000000..c985ecbe9 --- /dev/null +++ b/net/qrtr/qrtr.c @@ -0,0 +1,1007 @@ +/* + * Copyright (c) 2015, Sony Mobile Communications Inc. + * Copyright (c) 2013, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/qrtr.h> +#include <linux/termios.h> /* For TIOCINQ/OUTQ */ + +#include <net/sock.h> + +#include "qrtr.h" + +#define QRTR_PROTO_VER 1 + +/* auto-bind range */ +#define QRTR_MIN_EPH_SOCKET 0x4000 +#define QRTR_MAX_EPH_SOCKET 0x7fff + +enum qrtr_pkt_type { + QRTR_TYPE_DATA = 1, + QRTR_TYPE_HELLO = 2, + QRTR_TYPE_BYE = 3, + QRTR_TYPE_NEW_SERVER = 4, + QRTR_TYPE_DEL_SERVER = 5, + QRTR_TYPE_DEL_CLIENT = 6, + QRTR_TYPE_RESUME_TX = 7, + QRTR_TYPE_EXIT = 8, + QRTR_TYPE_PING = 9, +}; + +/** + * struct qrtr_hdr - (I|R)PCrouter packet header + * @version: protocol version + * @type: packet type; one of QRTR_TYPE_* + * @src_node_id: source node + * @src_port_id: source port + * @confirm_rx: boolean; whether a resume-tx packet should be send in reply + * @size: length of packet, excluding this header + * @dst_node_id: destination node + * @dst_port_id: destination port + */ +struct qrtr_hdr { + __le32 version; + __le32 type; + __le32 src_node_id; + __le32 src_port_id; + __le32 confirm_rx; + __le32 size; + __le32 dst_node_id; + __le32 dst_port_id; +} __packed; + +#define QRTR_HDR_SIZE sizeof(struct qrtr_hdr) +#define QRTR_NODE_BCAST ((unsigned int)-1) +#define QRTR_PORT_CTRL ((unsigned int)-2) + +struct qrtr_sock { + /* WARNING: sk must be the first member */ + struct sock sk; + struct sockaddr_qrtr us; + struct sockaddr_qrtr peer; +}; + +static inline struct qrtr_sock *qrtr_sk(struct sock *sk) +{ + BUILD_BUG_ON(offsetof(struct qrtr_sock, sk) != 0); + return container_of(sk, struct qrtr_sock, sk); +} + +static unsigned int qrtr_local_nid = -1; + +/* for node ids */ +static RADIX_TREE(qrtr_nodes, GFP_KERNEL); +/* broadcast list */ +static LIST_HEAD(qrtr_all_nodes); +/* lock for qrtr_nodes, qrtr_all_nodes and node reference */ +static DEFINE_MUTEX(qrtr_node_lock); + +/* local port allocation management */ +static DEFINE_IDR(qrtr_ports); +static DEFINE_MUTEX(qrtr_port_lock); + +/** + * struct qrtr_node - endpoint node + * @ep_lock: lock for endpoint management and callbacks + * @ep: endpoint + * @ref: reference count for node + * @nid: node id + * @rx_queue: receive queue + * @work: scheduled work struct for recv work + * @item: list item for broadcast list + */ +struct qrtr_node { + struct mutex ep_lock; + struct qrtr_endpoint *ep; + struct kref ref; + unsigned int nid; + + struct sk_buff_head rx_queue; + struct work_struct work; + struct list_head item; +}; + +/* Release node resources and free the node. + * + * Do not call directly, use qrtr_node_release. To be used with + * kref_put_mutex. As such, the node mutex is expected to be locked on call. + */ +static void __qrtr_node_release(struct kref *kref) +{ + struct qrtr_node *node = container_of(kref, struct qrtr_node, ref); + + if (node->nid != QRTR_EP_NID_AUTO) + radix_tree_delete(&qrtr_nodes, node->nid); + + list_del(&node->item); + mutex_unlock(&qrtr_node_lock); + + skb_queue_purge(&node->rx_queue); + kfree(node); +} + +/* Increment reference to node. */ +static struct qrtr_node *qrtr_node_acquire(struct qrtr_node *node) +{ + if (node) + kref_get(&node->ref); + return node; +} + +/* Decrement reference to node and release as necessary. */ +static void qrtr_node_release(struct qrtr_node *node) +{ + if (!node) + return; + kref_put_mutex(&node->ref, __qrtr_node_release, &qrtr_node_lock); +} + +/* Pass an outgoing packet socket buffer to the endpoint driver. */ +static int qrtr_node_enqueue(struct qrtr_node *node, struct sk_buff *skb) +{ + int rc = -ENODEV; + + mutex_lock(&node->ep_lock); + if (node->ep) + rc = node->ep->xmit(node->ep, skb); + else + kfree_skb(skb); + mutex_unlock(&node->ep_lock); + + return rc; +} + +/* Lookup node by id. + * + * callers must release with qrtr_node_release() + */ +static struct qrtr_node *qrtr_node_lookup(unsigned int nid) +{ + struct qrtr_node *node; + + mutex_lock(&qrtr_node_lock); + node = radix_tree_lookup(&qrtr_nodes, nid); + node = qrtr_node_acquire(node); + mutex_unlock(&qrtr_node_lock); + + return node; +} + +/* Assign node id to node. + * + * This is mostly useful for automatic node id assignment, based on + * the source id in the incoming packet. + */ +static void qrtr_node_assign(struct qrtr_node *node, unsigned int nid) +{ + if (node->nid != QRTR_EP_NID_AUTO || nid == QRTR_EP_NID_AUTO) + return; + + mutex_lock(&qrtr_node_lock); + radix_tree_insert(&qrtr_nodes, nid, node); + node->nid = nid; + mutex_unlock(&qrtr_node_lock); +} + +/** + * qrtr_endpoint_post() - post incoming data + * @ep: endpoint handle + * @data: data pointer + * @len: size of data in bytes + * + * Return: 0 on success; negative error code on failure + */ +int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len) +{ + struct qrtr_node *node = ep->node; + const struct qrtr_hdr *phdr = data; + struct sk_buff *skb; + unsigned int psize; + unsigned int size; + unsigned int type; + unsigned int ver; + unsigned int dst; + + if (len < QRTR_HDR_SIZE || len & 3) + return -EINVAL; + + ver = le32_to_cpu(phdr->version); + size = le32_to_cpu(phdr->size); + type = le32_to_cpu(phdr->type); + dst = le32_to_cpu(phdr->dst_port_id); + + psize = (size + 3) & ~3; + + if (ver != QRTR_PROTO_VER) + return -EINVAL; + + if (len != psize + QRTR_HDR_SIZE) + return -EINVAL; + + if (dst != QRTR_PORT_CTRL && type != QRTR_TYPE_DATA) + return -EINVAL; + + skb = netdev_alloc_skb(NULL, len); + if (!skb) + return -ENOMEM; + + skb_reset_transport_header(skb); + memcpy(skb_put(skb, len), data, len); + + skb_queue_tail(&node->rx_queue, skb); + schedule_work(&node->work); + + return 0; +} +EXPORT_SYMBOL_GPL(qrtr_endpoint_post); + +/* Allocate and construct a resume-tx packet. */ +static struct sk_buff *qrtr_alloc_resume_tx(u32 src_node, + u32 dst_node, u32 port) +{ + const int pkt_len = 20; + struct qrtr_hdr *hdr; + struct sk_buff *skb; + u32 *buf; + + skb = alloc_skb(QRTR_HDR_SIZE + pkt_len, GFP_KERNEL); + if (!skb) + return NULL; + skb_reset_transport_header(skb); + + hdr = (struct qrtr_hdr *)skb_put(skb, QRTR_HDR_SIZE); + hdr->version = cpu_to_le32(QRTR_PROTO_VER); + hdr->type = cpu_to_le32(QRTR_TYPE_RESUME_TX); + hdr->src_node_id = cpu_to_le32(src_node); + hdr->src_port_id = cpu_to_le32(QRTR_PORT_CTRL); + hdr->confirm_rx = cpu_to_le32(0); + hdr->size = cpu_to_le32(pkt_len); + hdr->dst_node_id = cpu_to_le32(dst_node); + hdr->dst_port_id = cpu_to_le32(QRTR_PORT_CTRL); + + buf = (u32 *)skb_put(skb, pkt_len); + memset(buf, 0, pkt_len); + buf[0] = cpu_to_le32(QRTR_TYPE_RESUME_TX); + buf[1] = cpu_to_le32(src_node); + buf[2] = cpu_to_le32(port); + + return skb; +} + +static struct qrtr_sock *qrtr_port_lookup(int port); +static void qrtr_port_put(struct qrtr_sock *ipc); + +/* Handle and route a received packet. + * + * This will auto-reply with resume-tx packet as necessary. + */ +static void qrtr_node_rx_work(struct work_struct *work) +{ + struct qrtr_node *node = container_of(work, struct qrtr_node, work); + struct sk_buff *skb; + + while ((skb = skb_dequeue(&node->rx_queue)) != NULL) { + const struct qrtr_hdr *phdr; + u32 dst_node, dst_port; + struct qrtr_sock *ipc; + u32 src_node; + int confirm; + + phdr = (const struct qrtr_hdr *)skb_transport_header(skb); + src_node = le32_to_cpu(phdr->src_node_id); + dst_node = le32_to_cpu(phdr->dst_node_id); + dst_port = le32_to_cpu(phdr->dst_port_id); + confirm = !!phdr->confirm_rx; + + qrtr_node_assign(node, src_node); + + ipc = qrtr_port_lookup(dst_port); + if (!ipc) { + kfree_skb(skb); + } else { + if (sock_queue_rcv_skb(&ipc->sk, skb)) + kfree_skb(skb); + + qrtr_port_put(ipc); + } + + if (confirm) { + skb = qrtr_alloc_resume_tx(dst_node, node->nid, dst_port); + if (!skb) + break; + if (qrtr_node_enqueue(node, skb)) + break; + } + } +} + +/** + * qrtr_endpoint_register() - register a new endpoint + * @ep: endpoint to register + * @nid: desired node id; may be QRTR_EP_NID_AUTO for auto-assignment + * Return: 0 on success; negative error code on failure + * + * The specified endpoint must have the xmit function pointer set on call. + */ +int qrtr_endpoint_register(struct qrtr_endpoint *ep, unsigned int nid) +{ + struct qrtr_node *node; + + if (!ep || !ep->xmit) + return -EINVAL; + + node = kzalloc(sizeof(*node), GFP_KERNEL); + if (!node) + return -ENOMEM; + + INIT_WORK(&node->work, qrtr_node_rx_work); + kref_init(&node->ref); + mutex_init(&node->ep_lock); + skb_queue_head_init(&node->rx_queue); + node->nid = QRTR_EP_NID_AUTO; + node->ep = ep; + + qrtr_node_assign(node, nid); + + mutex_lock(&qrtr_node_lock); + list_add(&node->item, &qrtr_all_nodes); + mutex_unlock(&qrtr_node_lock); + ep->node = node; + + return 0; +} +EXPORT_SYMBOL_GPL(qrtr_endpoint_register); + +/** + * qrtr_endpoint_unregister - unregister endpoint + * @ep: endpoint to unregister + */ +void qrtr_endpoint_unregister(struct qrtr_endpoint *ep) +{ + struct qrtr_node *node = ep->node; + + mutex_lock(&node->ep_lock); + node->ep = NULL; + mutex_unlock(&node->ep_lock); + + qrtr_node_release(node); + ep->node = NULL; +} +EXPORT_SYMBOL_GPL(qrtr_endpoint_unregister); + +/* Lookup socket by port. + * + * Callers must release with qrtr_port_put() + */ +static struct qrtr_sock *qrtr_port_lookup(int port) +{ + struct qrtr_sock *ipc; + + if (port == QRTR_PORT_CTRL) + port = 0; + + mutex_lock(&qrtr_port_lock); + ipc = idr_find(&qrtr_ports, port); + if (ipc) + sock_hold(&ipc->sk); + mutex_unlock(&qrtr_port_lock); + + return ipc; +} + +/* Release acquired socket. */ +static void qrtr_port_put(struct qrtr_sock *ipc) +{ + sock_put(&ipc->sk); +} + +/* Remove port assignment. */ +static void qrtr_port_remove(struct qrtr_sock *ipc) +{ + int port = ipc->us.sq_port; + + if (port == QRTR_PORT_CTRL) + port = 0; + + __sock_put(&ipc->sk); + + mutex_lock(&qrtr_port_lock); + idr_remove(&qrtr_ports, port); + mutex_unlock(&qrtr_port_lock); +} + +/* Assign port number to socket. + * + * Specify port in the integer pointed to by port, and it will be adjusted + * on return as necesssary. + * + * Port may be: + * 0: Assign ephemeral port in [QRTR_MIN_EPH_SOCKET, QRTR_MAX_EPH_SOCKET] + * <QRTR_MIN_EPH_SOCKET: Specified; requires CAP_NET_ADMIN + * >QRTR_MIN_EPH_SOCKET: Specified; available to all + */ +static int qrtr_port_assign(struct qrtr_sock *ipc, int *port) +{ + int rc; + + mutex_lock(&qrtr_port_lock); + if (!*port) { + rc = idr_alloc(&qrtr_ports, ipc, + QRTR_MIN_EPH_SOCKET, QRTR_MAX_EPH_SOCKET + 1, + GFP_ATOMIC); + if (rc >= 0) + *port = rc; + } else if (*port < QRTR_MIN_EPH_SOCKET && !capable(CAP_NET_ADMIN)) { + rc = -EACCES; + } else if (*port == QRTR_PORT_CTRL) { + rc = idr_alloc(&qrtr_ports, ipc, 0, 1, GFP_ATOMIC); + } else { + rc = idr_alloc(&qrtr_ports, ipc, *port, *port + 1, GFP_ATOMIC); + if (rc >= 0) + *port = rc; + } + mutex_unlock(&qrtr_port_lock); + + if (rc == -ENOSPC) + return -EADDRINUSE; + else if (rc < 0) + return rc; + + sock_hold(&ipc->sk); + + return 0; +} + +/* Bind socket to address. + * + * Socket should be locked upon call. + */ +static int __qrtr_bind(struct socket *sock, + const struct sockaddr_qrtr *addr, int zapped) +{ + struct qrtr_sock *ipc = qrtr_sk(sock->sk); + struct sock *sk = sock->sk; + int port; + int rc; + + /* rebinding ok */ + if (!zapped && addr->sq_port == ipc->us.sq_port) + return 0; + + port = addr->sq_port; + rc = qrtr_port_assign(ipc, &port); + if (rc) + return rc; + + /* unbind previous, if any */ + if (!zapped) + qrtr_port_remove(ipc); + ipc->us.sq_port = port; + + sock_reset_flag(sk, SOCK_ZAPPED); + + return 0; +} + +/* Auto bind to an ephemeral port. */ +static int qrtr_autobind(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct sockaddr_qrtr addr; + + if (!sock_flag(sk, SOCK_ZAPPED)) + return 0; + + addr.sq_family = AF_QIPCRTR; + addr.sq_node = qrtr_local_nid; + addr.sq_port = 0; + + return __qrtr_bind(sock, &addr, 1); +} + +/* Bind socket to specified sockaddr. */ +static int qrtr_bind(struct socket *sock, struct sockaddr *saddr, int len) +{ + DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, saddr); + struct qrtr_sock *ipc = qrtr_sk(sock->sk); + struct sock *sk = sock->sk; + int rc; + + if (len < sizeof(*addr) || addr->sq_family != AF_QIPCRTR) + return -EINVAL; + + if (addr->sq_node != ipc->us.sq_node) + return -EINVAL; + + lock_sock(sk); + rc = __qrtr_bind(sock, addr, sock_flag(sk, SOCK_ZAPPED)); + release_sock(sk); + + return rc; +} + +/* Queue packet to local peer socket. */ +static int qrtr_local_enqueue(struct qrtr_node *node, struct sk_buff *skb) +{ + const struct qrtr_hdr *phdr; + struct qrtr_sock *ipc; + + phdr = (const struct qrtr_hdr *)skb_transport_header(skb); + + ipc = qrtr_port_lookup(le32_to_cpu(phdr->dst_port_id)); + if (!ipc || &ipc->sk == skb->sk) { /* do not send to self */ + kfree_skb(skb); + return -ENODEV; + } + + if (sock_queue_rcv_skb(&ipc->sk, skb)) { + qrtr_port_put(ipc); + kfree_skb(skb); + return -ENOSPC; + } + + qrtr_port_put(ipc); + + return 0; +} + +/* Queue packet for broadcast. */ +static int qrtr_bcast_enqueue(struct qrtr_node *node, struct sk_buff *skb) +{ + struct sk_buff *skbn; + + mutex_lock(&qrtr_node_lock); + list_for_each_entry(node, &qrtr_all_nodes, item) { + skbn = skb_clone(skb, GFP_KERNEL); + if (!skbn) + break; + skb_set_owner_w(skbn, skb->sk); + qrtr_node_enqueue(node, skbn); + } + mutex_unlock(&qrtr_node_lock); + + qrtr_local_enqueue(node, skb); + + return 0; +} + +static int qrtr_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) +{ + DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, msg->msg_name); + int (*enqueue_fn)(struct qrtr_node *, struct sk_buff *); + struct qrtr_sock *ipc = qrtr_sk(sock->sk); + struct sock *sk = sock->sk; + struct qrtr_node *node; + struct qrtr_hdr *hdr; + struct sk_buff *skb; + size_t plen; + int rc; + + if (msg->msg_flags & ~(MSG_DONTWAIT)) + return -EINVAL; + + if (len > 65535) + return -EMSGSIZE; + + lock_sock(sk); + + if (addr) { + if (msg->msg_namelen < sizeof(*addr)) { + release_sock(sk); + return -EINVAL; + } + + if (addr->sq_family != AF_QIPCRTR) { + release_sock(sk); + return -EINVAL; + } + + rc = qrtr_autobind(sock); + if (rc) { + release_sock(sk); + return rc; + } + } else if (sk->sk_state == TCP_ESTABLISHED) { + addr = &ipc->peer; + } else { + release_sock(sk); + return -ENOTCONN; + } + + node = NULL; + if (addr->sq_node == QRTR_NODE_BCAST) { + enqueue_fn = qrtr_bcast_enqueue; + } else if (addr->sq_node == ipc->us.sq_node) { + enqueue_fn = qrtr_local_enqueue; + } else { + enqueue_fn = qrtr_node_enqueue; + node = qrtr_node_lookup(addr->sq_node); + if (!node) { + release_sock(sk); + return -ECONNRESET; + } + } + + plen = (len + 3) & ~3; + skb = sock_alloc_send_skb(sk, plen + QRTR_HDR_SIZE, + msg->msg_flags & MSG_DONTWAIT, &rc); + if (!skb) + goto out_node; + + skb_reset_transport_header(skb); + skb_put(skb, len + QRTR_HDR_SIZE); + + hdr = (struct qrtr_hdr *)skb_transport_header(skb); + hdr->version = cpu_to_le32(QRTR_PROTO_VER); + hdr->src_node_id = cpu_to_le32(ipc->us.sq_node); + hdr->src_port_id = cpu_to_le32(ipc->us.sq_port); + hdr->confirm_rx = cpu_to_le32(0); + hdr->size = cpu_to_le32(len); + hdr->dst_node_id = cpu_to_le32(addr->sq_node); + hdr->dst_port_id = cpu_to_le32(addr->sq_port); + + rc = skb_copy_datagram_from_iter(skb, QRTR_HDR_SIZE, + &msg->msg_iter, len); + if (rc) { + kfree_skb(skb); + goto out_node; + } + + if (plen != len) { + skb_pad(skb, plen - len); + skb_put(skb, plen - len); + } + + if (ipc->us.sq_port == QRTR_PORT_CTRL) { + if (len < 4) { + rc = -EINVAL; + kfree_skb(skb); + goto out_node; + } + + /* control messages already require the type as 'command' */ + skb_copy_bits(skb, QRTR_HDR_SIZE, &hdr->type, 4); + } else { + hdr->type = cpu_to_le32(QRTR_TYPE_DATA); + } + + rc = enqueue_fn(node, skb); + if (rc >= 0) + rc = len; + +out_node: + qrtr_node_release(node); + release_sock(sk); + + return rc; +} + +static int qrtr_recvmsg(struct socket *sock, struct msghdr *msg, + size_t size, int flags) +{ + DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, msg->msg_name); + const struct qrtr_hdr *phdr; + struct sock *sk = sock->sk; + struct sk_buff *skb; + int copied, rc; + + lock_sock(sk); + + if (sock_flag(sk, SOCK_ZAPPED)) { + release_sock(sk); + return -EADDRNOTAVAIL; + } + + skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT, + flags & MSG_DONTWAIT, &rc); + if (!skb) { + release_sock(sk); + return rc; + } + + phdr = (const struct qrtr_hdr *)skb_transport_header(skb); + copied = le32_to_cpu(phdr->size); + if (copied > size) { + copied = size; + msg->msg_flags |= MSG_TRUNC; + } + + rc = skb_copy_datagram_msg(skb, QRTR_HDR_SIZE, msg, copied); + if (rc < 0) + goto out; + rc = copied; + + if (addr) { + addr->sq_family = AF_QIPCRTR; + addr->sq_node = le32_to_cpu(phdr->src_node_id); + addr->sq_port = le32_to_cpu(phdr->src_port_id); + msg->msg_namelen = sizeof(*addr); + } + +out: + skb_free_datagram(sk, skb); + release_sock(sk); + + return rc; +} + +static int qrtr_connect(struct socket *sock, struct sockaddr *saddr, + int len, int flags) +{ + DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, saddr); + struct qrtr_sock *ipc = qrtr_sk(sock->sk); + struct sock *sk = sock->sk; + int rc; + + if (len < sizeof(*addr) || addr->sq_family != AF_QIPCRTR) + return -EINVAL; + + lock_sock(sk); + + sk->sk_state = TCP_CLOSE; + sock->state = SS_UNCONNECTED; + + rc = qrtr_autobind(sock); + if (rc) { + release_sock(sk); + return rc; + } + + ipc->peer = *addr; + sock->state = SS_CONNECTED; + sk->sk_state = TCP_ESTABLISHED; + + release_sock(sk); + + return 0; +} + +static int qrtr_getname(struct socket *sock, struct sockaddr *saddr, + int *len, int peer) +{ + struct qrtr_sock *ipc = qrtr_sk(sock->sk); + struct sockaddr_qrtr qaddr; + struct sock *sk = sock->sk; + + lock_sock(sk); + if (peer) { + if (sk->sk_state != TCP_ESTABLISHED) { + release_sock(sk); + return -ENOTCONN; + } + + qaddr = ipc->peer; + } else { + qaddr = ipc->us; + } + release_sock(sk); + + *len = sizeof(qaddr); + qaddr.sq_family = AF_QIPCRTR; + + memcpy(saddr, &qaddr, sizeof(qaddr)); + + return 0; +} + +static int qrtr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + struct qrtr_sock *ipc = qrtr_sk(sock->sk); + struct sock *sk = sock->sk; + struct sockaddr_qrtr *sq; + struct sk_buff *skb; + struct ifreq ifr; + long len = 0; + int rc = 0; + + lock_sock(sk); + + switch (cmd) { + case TIOCOUTQ: + len = sk->sk_sndbuf - sk_wmem_alloc_get(sk); + if (len < 0) + len = 0; + rc = put_user(len, (int __user *)argp); + break; + case TIOCINQ: + skb = skb_peek(&sk->sk_receive_queue); + if (skb) + len = skb->len - QRTR_HDR_SIZE; + rc = put_user(len, (int __user *)argp); + break; + case SIOCGIFADDR: + if (copy_from_user(&ifr, argp, sizeof(ifr))) { + rc = -EFAULT; + break; + } + + sq = (struct sockaddr_qrtr *)&ifr.ifr_addr; + *sq = ipc->us; + if (copy_to_user(argp, &ifr, sizeof(ifr))) { + rc = -EFAULT; + break; + } + break; + case SIOCGSTAMP: + rc = sock_get_timestamp(sk, argp); + break; + case SIOCADDRT: + case SIOCDELRT: + case SIOCSIFADDR: + case SIOCGIFDSTADDR: + case SIOCSIFDSTADDR: + case SIOCGIFBRDADDR: + case SIOCSIFBRDADDR: + case SIOCGIFNETMASK: + case SIOCSIFNETMASK: + rc = -EINVAL; + break; + default: + rc = -ENOIOCTLCMD; + break; + } + + release_sock(sk); + + return rc; +} + +static int qrtr_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct qrtr_sock *ipc; + + if (!sk) + return 0; + + lock_sock(sk); + + ipc = qrtr_sk(sk); + sk->sk_shutdown = SHUTDOWN_MASK; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_state_change(sk); + + sock_set_flag(sk, SOCK_DEAD); + sock->sk = NULL; + + if (!sock_flag(sk, SOCK_ZAPPED)) + qrtr_port_remove(ipc); + + skb_queue_purge(&sk->sk_receive_queue); + + release_sock(sk); + sock_put(sk); + + return 0; +} + +static const struct proto_ops qrtr_proto_ops = { + .owner = THIS_MODULE, + .family = AF_QIPCRTR, + .bind = qrtr_bind, + .connect = qrtr_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .listen = sock_no_listen, + .sendmsg = qrtr_sendmsg, + .recvmsg = qrtr_recvmsg, + .getname = qrtr_getname, + .ioctl = qrtr_ioctl, + .poll = datagram_poll, + .shutdown = sock_no_shutdown, + .setsockopt = sock_no_setsockopt, + .getsockopt = sock_no_getsockopt, + .release = qrtr_release, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static struct proto qrtr_proto = { + .name = "QIPCRTR", + .owner = THIS_MODULE, + .obj_size = sizeof(struct qrtr_sock), +}; + +static int qrtr_create(struct net *net, struct socket *sock, + int protocol, int kern) +{ + struct qrtr_sock *ipc; + struct sock *sk; + + if (sock->type != SOCK_DGRAM) + return -EPROTOTYPE; + + sk = sk_alloc(net, AF_QIPCRTR, GFP_KERNEL, &qrtr_proto, kern); + if (!sk) + return -ENOMEM; + + sock_set_flag(sk, SOCK_ZAPPED); + + sock_init_data(sock, sk); + sock->ops = &qrtr_proto_ops; + + ipc = qrtr_sk(sk); + ipc->us.sq_family = AF_QIPCRTR; + ipc->us.sq_node = qrtr_local_nid; + ipc->us.sq_port = 0; + + return 0; +} + +static const struct nla_policy qrtr_policy[IFA_MAX + 1] = { + [IFA_LOCAL] = { .type = NLA_U32 }, +}; + +static int qrtr_addr_doit(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + struct nlattr *tb[IFA_MAX + 1]; + struct ifaddrmsg *ifm; + int rc; + + if (!netlink_capable(skb, CAP_NET_ADMIN)) + return -EPERM; + + if (!netlink_capable(skb, CAP_SYS_ADMIN)) + return -EPERM; + + ASSERT_RTNL(); + + rc = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, qrtr_policy); + if (rc < 0) + return rc; + + ifm = nlmsg_data(nlh); + if (!tb[IFA_LOCAL]) + return -EINVAL; + + qrtr_local_nid = nla_get_u32(tb[IFA_LOCAL]); + return 0; +} + +static const struct net_proto_family qrtr_family = { + .owner = THIS_MODULE, + .family = AF_QIPCRTR, + .create = qrtr_create, +}; + +static int __init qrtr_proto_init(void) +{ + int rc; + + rc = proto_register(&qrtr_proto, 1); + if (rc) + return rc; + + rc = sock_register(&qrtr_family); + if (rc) { + proto_unregister(&qrtr_proto); + return rc; + } + + rtnl_register(PF_QIPCRTR, RTM_NEWADDR, qrtr_addr_doit, NULL, NULL); + + return 0; +} +module_init(qrtr_proto_init); + +static void __exit qrtr_proto_fini(void) +{ + rtnl_unregister(PF_QIPCRTR, RTM_NEWADDR); + sock_unregister(qrtr_family.family); + proto_unregister(&qrtr_proto); +} +module_exit(qrtr_proto_fini); + +MODULE_DESCRIPTION("Qualcomm IPC-router driver"); +MODULE_LICENSE("GPL v2"); diff --git a/net/qrtr/qrtr.h b/net/qrtr/qrtr.h new file mode 100644 index 000000000..2b848718f --- /dev/null +++ b/net/qrtr/qrtr.h @@ -0,0 +1,31 @@ +#ifndef __QRTR_H_ +#define __QRTR_H_ + +#include <linux/types.h> + +struct sk_buff; + +/* endpoint node id auto assignment */ +#define QRTR_EP_NID_AUTO (-1) + +/** + * struct qrtr_endpoint - endpoint handle + * @xmit: Callback for outgoing packets + * + * The socket buffer passed to the xmit function becomes owned by the endpoint + * driver. As such, when the driver is done with the buffer, it should + * call kfree_skb() on failure, or consume_skb() on success. + */ +struct qrtr_endpoint { + int (*xmit)(struct qrtr_endpoint *ep, struct sk_buff *skb); + /* private: not for endpoint use */ + struct qrtr_node *node; +}; + +int qrtr_endpoint_register(struct qrtr_endpoint *ep, unsigned int nid); + +void qrtr_endpoint_unregister(struct qrtr_endpoint *ep); + +int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len); + +#endif diff --git a/net/qrtr/smd.c b/net/qrtr/smd.c new file mode 100644 index 000000000..0d11132b3 --- /dev/null +++ b/net/qrtr/smd.c @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2015, Sony Mobile Communications Inc. + * Copyright (c) 2013, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/soc/qcom/smd.h> + +#include "qrtr.h" + +struct qrtr_smd_dev { + struct qrtr_endpoint ep; + struct qcom_smd_channel *channel; + struct device *dev; +}; + +/* from smd to qrtr */ +static int qcom_smd_qrtr_callback(struct qcom_smd_channel *channel, + const void *data, size_t len) +{ + struct qrtr_smd_dev *qdev = qcom_smd_get_drvdata(channel); + int rc; + + if (!qdev) + return -EAGAIN; + + rc = qrtr_endpoint_post(&qdev->ep, data, len); + if (rc == -EINVAL) { + dev_err(qdev->dev, "invalid ipcrouter packet\n"); + /* return 0 to let smd drop the packet */ + rc = 0; + } + + return rc; +} + +/* from qrtr to smd */ +static int qcom_smd_qrtr_send(struct qrtr_endpoint *ep, struct sk_buff *skb) +{ + struct qrtr_smd_dev *qdev = container_of(ep, struct qrtr_smd_dev, ep); + int rc; + + rc = skb_linearize(skb); + if (rc) + goto out; + + rc = qcom_smd_send(qdev->channel, skb->data, skb->len); + +out: + if (rc) + kfree_skb(skb); + else + consume_skb(skb); + return rc; +} + +static int qcom_smd_qrtr_probe(struct qcom_smd_device *sdev) +{ + struct qrtr_smd_dev *qdev; + int rc; + + qdev = devm_kzalloc(&sdev->dev, sizeof(*qdev), GFP_KERNEL); + if (!qdev) + return -ENOMEM; + + qdev->channel = sdev->channel; + qdev->dev = &sdev->dev; + qdev->ep.xmit = qcom_smd_qrtr_send; + + rc = qrtr_endpoint_register(&qdev->ep, QRTR_EP_NID_AUTO); + if (rc) + return rc; + + qcom_smd_set_drvdata(sdev->channel, qdev); + dev_set_drvdata(&sdev->dev, qdev); + + dev_dbg(&sdev->dev, "Qualcomm SMD QRTR driver probed\n"); + + return 0; +} + +static void qcom_smd_qrtr_remove(struct qcom_smd_device *sdev) +{ + struct qrtr_smd_dev *qdev = dev_get_drvdata(&sdev->dev); + + qrtr_endpoint_unregister(&qdev->ep); + + dev_set_drvdata(&sdev->dev, NULL); +} + +static const struct qcom_smd_id qcom_smd_qrtr_smd_match[] = { + { "IPCRTR" }, + {} +}; + +static struct qcom_smd_driver qcom_smd_qrtr_driver = { + .probe = qcom_smd_qrtr_probe, + .remove = qcom_smd_qrtr_remove, + .callback = qcom_smd_qrtr_callback, + .smd_match_table = qcom_smd_qrtr_smd_match, + .driver = { + .name = "qcom_smd_qrtr", + .owner = THIS_MODULE, + }, +}; + +module_qcom_smd_driver(qcom_smd_qrtr_driver); + +MODULE_DESCRIPTION("Qualcomm IPC-Router SMD interface driver"); +MODULE_LICENSE("GPL v2"); diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 310cabce2..7c2a65a6a 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -111,7 +111,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even } } - if (conn->c_version < RDS_PROTOCOL(3,1)) { + if (conn->c_version < RDS_PROTOCOL(3, 1)) { printk(KERN_NOTICE "RDS/IB: Connection to %pI4 version %u.%u failed," " no longer supported\n", &conn->c_faddr, diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c index 93ff038ea..d921adc62 100644 --- a/net/rds/ib_frmr.c +++ b/net/rds/ib_frmr.c @@ -111,7 +111,7 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr) cpu_relax(); } - ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_len, PAGE_SIZE); + ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_len, 0, PAGE_SIZE); if (unlikely(ret != ibmr->sg_len)) return ret < 0 ? ret : -EINVAL; diff --git a/net/rds/loop.c b/net/rds/loop.c index 6b12b6854..814173b46 100644 --- a/net/rds/loop.c +++ b/net/rds/loop.c @@ -95,8 +95,9 @@ out: */ static void rds_loop_inc_free(struct rds_incoming *inc) { - struct rds_message *rm = container_of(inc, struct rds_message, m_inc); - rds_message_put(rm); + struct rds_message *rm = container_of(inc, struct rds_message, m_inc); + + rds_message_put(rm); } /* we need to at least give the thread something to succeed */ diff --git a/net/rds/rds.h b/net/rds/rds.h index 80256b08e..387df5f32 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -74,6 +74,7 @@ enum { RDS_CONN_CONNECTING, RDS_CONN_DISCONNECTING, RDS_CONN_UP, + RDS_CONN_RESETTING, RDS_CONN_ERROR, }; @@ -813,6 +814,7 @@ void rds_connect_worker(struct work_struct *); void rds_shutdown_worker(struct work_struct *); void rds_send_worker(struct work_struct *); void rds_recv_worker(struct work_struct *); +void rds_connect_path_complete(struct rds_connection *conn, int curr); void rds_connect_complete(struct rds_connection *conn); /* transport.c */ diff --git a/net/rds/recv.c b/net/rds/recv.c index c0be1ecd1..8413f6c99 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -561,5 +561,7 @@ void rds_inc_info_copy(struct rds_incoming *inc, minfo.fport = inc->i_hdr.h_dport; } + minfo.flags = 0; + rds_info_copy(iter, &minfo, sizeof(minfo)); } diff --git a/net/rds/send.c b/net/rds/send.c index c9cdb358e..b1962f8e3 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -99,6 +99,7 @@ void rds_send_reset(struct rds_connection *conn) list_splice_init(&conn->c_retrans, &conn->c_send_queue); spin_unlock_irqrestore(&conn->c_lock, flags); } +EXPORT_SYMBOL_GPL(rds_send_reset); static int acquire_in_xmit(struct rds_connection *conn) { diff --git a/net/rds/sysctl.c b/net/rds/sysctl.c index c173f69e1..e381bbcd9 100644 --- a/net/rds/sysctl.c +++ b/net/rds/sysctl.c @@ -102,7 +102,8 @@ int rds_sysctl_init(void) rds_sysctl_reconnect_min = msecs_to_jiffies(1); rds_sysctl_reconnect_min_jiffies = rds_sysctl_reconnect_min; - rds_sysctl_reg_table = register_net_sysctl(&init_net,"net/rds", rds_sysctl_rds_table); + rds_sysctl_reg_table = + register_net_sysctl(&init_net, "net/rds", rds_sysctl_rds_table); if (!rds_sysctl_reg_table) return -ENOMEM; return 0; diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 86187dad1..c8a7b4c90 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -126,9 +126,81 @@ void rds_tcp_restore_callbacks(struct socket *sock, } /* - * This is the only path that sets tc->t_sock. Send and receive trust that - * it is set. The RDS_CONN_UP bit protects those paths from being - * called while it isn't set. + * rds_tcp_reset_callbacks() switches the to the new sock and + * returns the existing tc->t_sock. + * + * The only functions that set tc->t_sock are rds_tcp_set_callbacks + * and rds_tcp_reset_callbacks. Send and receive trust that + * it is set. The absence of RDS_CONN_UP bit protects those paths + * from being called while it isn't set. + */ +void rds_tcp_reset_callbacks(struct socket *sock, + struct rds_connection *conn) +{ + struct rds_tcp_connection *tc = conn->c_transport_data; + struct socket *osock = tc->t_sock; + + if (!osock) + goto newsock; + + /* Need to resolve a duelling SYN between peers. + * We have an outstanding SYN to this peer, which may + * potentially have transitioned to the RDS_CONN_UP state, + * so we must quiesce any send threads before resetting + * c_transport_data. We quiesce these threads by setting + * c_state to something other than RDS_CONN_UP, and then + * waiting for any existing threads in rds_send_xmit to + * complete release_in_xmit(). (Subsequent threads entering + * rds_send_xmit() will bail on !rds_conn_up(). + * + * However an incoming syn-ack at this point would end up + * marking the conn as RDS_CONN_UP, and would again permit + * rds_send_xmi() threads through, so ideally we would + * synchronize on RDS_CONN_UP after lock_sock(), but cannot + * do that: waiting on !RDS_IN_XMIT after lock_sock() may + * end up deadlocking with tcp_sendmsg(), and the RDS_IN_XMIT + * would not get set. As a result, we set c_state to + * RDS_CONN_RESETTTING, to ensure that rds_tcp_state_change + * cannot mark rds_conn_path_up() in the window before lock_sock() + */ + atomic_set(&conn->c_state, RDS_CONN_RESETTING); + wait_event(conn->c_waitq, !test_bit(RDS_IN_XMIT, &conn->c_flags)); + lock_sock(osock->sk); + /* reset receive side state for rds_tcp_data_recv() for osock */ + if (tc->t_tinc) { + rds_inc_put(&tc->t_tinc->ti_inc); + tc->t_tinc = NULL; + } + tc->t_tinc_hdr_rem = sizeof(struct rds_header); + tc->t_tinc_data_rem = 0; + tc->t_sock = NULL; + + write_lock_bh(&osock->sk->sk_callback_lock); + + osock->sk->sk_user_data = NULL; + osock->sk->sk_data_ready = tc->t_orig_data_ready; + osock->sk->sk_write_space = tc->t_orig_write_space; + osock->sk->sk_state_change = tc->t_orig_state_change; + write_unlock_bh(&osock->sk->sk_callback_lock); + release_sock(osock->sk); + sock_release(osock); +newsock: + rds_send_reset(conn); + lock_sock(sock->sk); + write_lock_bh(&sock->sk->sk_callback_lock); + tc->t_sock = sock; + sock->sk->sk_user_data = conn; + sock->sk->sk_data_ready = rds_tcp_data_ready; + sock->sk->sk_write_space = rds_tcp_write_space; + sock->sk->sk_state_change = rds_tcp_state_change; + + write_unlock_bh(&sock->sk->sk_callback_lock); + release_sock(sock->sk); +} + +/* Add tc to rds_tcp_tc_list and set tc->t_sock. See comments + * above rds_tcp_reset_callbacks for notes about synchronization + * with data path */ void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn) { @@ -544,7 +616,7 @@ static int rds_tcp_init(void) ret = rds_tcp_recv_init(); if (ret) - goto out_slab; + goto out_pernet; ret = rds_trans_register(&rds_tcp_transport); if (ret) @@ -556,8 +628,9 @@ static int rds_tcp_init(void) out_recv: rds_tcp_recv_exit(); -out_slab: +out_pernet: unregister_pernet_subsys(&rds_tcp_net_ops); +out_slab: kmem_cache_destroy(rds_tcp_conn_slab); out: return ret; diff --git a/net/rds/tcp.h b/net/rds/tcp.h index 41c228300..7940babf6 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -50,6 +50,7 @@ struct rds_tcp_statistics { void rds_tcp_tune(struct socket *sock); void rds_tcp_nonagle(struct socket *sock); void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn); +void rds_tcp_reset_callbacks(struct socket *sock, struct rds_connection *conn); void rds_tcp_restore_callbacks(struct socket *sock, struct rds_tcp_connection *tc); u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc); @@ -82,7 +83,7 @@ int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to); void rds_tcp_xmit_prepare(struct rds_connection *conn); void rds_tcp_xmit_complete(struct rds_connection *conn); int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, - unsigned int hdr_off, unsigned int sg, unsigned int off); + unsigned int hdr_off, unsigned int sg, unsigned int off); void rds_tcp_write_space(struct sock *sk); /* tcp_stats.c */ diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index 49a3fcfed..f6e95d60d 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -43,7 +43,7 @@ void rds_tcp_state_change(struct sock *sk) struct rds_connection *conn; struct rds_tcp_connection *tc; - read_lock(&sk->sk_callback_lock); + read_lock_bh(&sk->sk_callback_lock); conn = sk->sk_user_data; if (!conn) { state_change = sk->sk_state_change; @@ -54,22 +54,22 @@ void rds_tcp_state_change(struct sock *sk) rdsdebug("sock %p state_change to %d\n", tc->t_sock, sk->sk_state); - switch(sk->sk_state) { - /* ignore connecting sockets as they make progress */ - case TCP_SYN_SENT: - case TCP_SYN_RECV: - break; - case TCP_ESTABLISHED: - rds_connect_complete(conn); - break; - case TCP_CLOSE_WAIT: - case TCP_CLOSE: - rds_conn_drop(conn); - default: - break; + switch (sk->sk_state) { + /* ignore connecting sockets as they make progress */ + case TCP_SYN_SENT: + case TCP_SYN_RECV: + break; + case TCP_ESTABLISHED: + rds_connect_path_complete(conn, RDS_CONN_CONNECTING); + break; + case TCP_CLOSE_WAIT: + case TCP_CLOSE: + rds_conn_drop(conn); + default: + break; } out: - read_unlock(&sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); state_change(sk); } diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index be263cdf2..245542ca4 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -78,7 +78,9 @@ int rds_tcp_accept_one(struct socket *sock) struct inet_sock *inet; struct rds_tcp_connection *rs_tcp = NULL; int conn_state; - struct sock *nsk; + + if (!sock) /* module unload or netns delete in progress */ + return -ENETUNREACH; ret = sock_create_kern(sock_net(sock->sk), sock->sk->sk_family, sock->sk->sk_type, sock->sk->sk_protocol, @@ -129,28 +131,25 @@ int rds_tcp_accept_one(struct socket *sock) * so we must quiesce any send threads before resetting * c_transport_data. */ - wait_event(conn->c_waitq, - !test_bit(RDS_IN_XMIT, &conn->c_flags)); - if (ntohl(inet->inet_saddr) < ntohl(inet->inet_daddr)) { + if (ntohl(inet->inet_saddr) < ntohl(inet->inet_daddr) || + !conn->c_outgoing) { goto rst_nsk; - } else if (rs_tcp->t_sock) { - rds_tcp_restore_callbacks(rs_tcp->t_sock, rs_tcp); + } else { + rds_tcp_reset_callbacks(new_sock, conn); conn->c_outgoing = 0; + /* rds_connect_path_complete() marks RDS_CONN_UP */ + rds_connect_path_complete(conn, RDS_CONN_RESETTING); } + } else { + rds_tcp_set_callbacks(new_sock, conn); + rds_connect_path_complete(conn, RDS_CONN_CONNECTING); } - rds_tcp_set_callbacks(new_sock, conn); - rds_connect_complete(conn); /* marks RDS_CONN_UP */ new_sock = NULL; ret = 0; goto out; rst_nsk: /* reset the newly returned accept sock and bail */ - nsk = new_sock->sk; - rds_tcp_stats_inc(s_tcp_listen_closed_stale); - nsk->sk_user_data = NULL; - nsk->sk_prot->disconnect(nsk, 0); - tcp_done(nsk); - new_sock = NULL; + kernel_sock_shutdown(new_sock, SHUT_RDWR); ret = 0; out: if (rs_tcp) @@ -166,7 +165,7 @@ void rds_tcp_listen_data_ready(struct sock *sk) rdsdebug("listen data ready sk %p\n", sk); - read_lock(&sk->sk_callback_lock); + read_lock_bh(&sk->sk_callback_lock); ready = sk->sk_user_data; if (!ready) { /* check for teardown race */ ready = sk->sk_data_ready; @@ -183,7 +182,7 @@ void rds_tcp_listen_data_ready(struct sock *sk) rds_tcp_accept_work(sk); out: - read_unlock(&sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); ready(sk); } diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c index 27a992154..6e6a7111a 100644 --- a/net/rds/tcp_recv.c +++ b/net/rds/tcp_recv.c @@ -171,7 +171,7 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, while (left) { if (!tinc) { tinc = kmem_cache_alloc(rds_tcp_incoming_slab, - arg->gfp); + arg->gfp); if (!tinc) { desc->error = -ENOMEM; goto out; @@ -207,22 +207,14 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, } if (left && tc->t_tinc_data_rem) { - clone = skb_clone(skb, arg->gfp); + to_copy = min(tc->t_tinc_data_rem, left); + + clone = pskb_extract(skb, offset, to_copy, arg->gfp); if (!clone) { desc->error = -ENOMEM; goto out; } - to_copy = min(tc->t_tinc_data_rem, left); - if (!pskb_pull(clone, offset) || - pskb_trim(clone, to_copy)) { - pr_warn("rds_tcp_data_recv: pull/trim failed " - "left %zu data_rem %zu skb_len %d\n", - left, tc->t_tinc_data_rem, skb->len); - kfree_skb(clone); - desc->error = -ENOMEM; - goto out; - } skb_queue_tail(&tinc->ti_skb_list, clone); rdsdebug("skb %p data %p len %d off %u to_copy %zu -> " @@ -309,7 +301,7 @@ void rds_tcp_data_ready(struct sock *sk) rdsdebug("data ready sk %p\n", sk); - read_lock(&sk->sk_callback_lock); + read_lock_bh(&sk->sk_callback_lock); conn = sk->sk_user_data; if (!conn) { /* check for teardown race */ ready = sk->sk_data_ready; @@ -323,7 +315,7 @@ void rds_tcp_data_ready(struct sock *sk) if (rds_tcp_read_sock(conn, GFP_ATOMIC) == -ENOMEM) queue_delayed_work(rds_wq, &conn->c_recv_w, 0); out: - read_unlock(&sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); ready(sk); } diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c index 2894e6095..618be69c9 100644 --- a/net/rds/tcp_send.c +++ b/net/rds/tcp_send.c @@ -66,19 +66,19 @@ void rds_tcp_xmit_complete(struct rds_connection *conn) static int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len) { struct kvec vec = { - .iov_base = data, - .iov_len = len, + .iov_base = data, + .iov_len = len, + }; + struct msghdr msg = { + .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL, }; - struct msghdr msg = { - .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL, - }; return kernel_sendmsg(sock, &msg, &vec, 1, vec.iov_len); } /* the core send_sem serializes this with other xmit and shutdown */ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, - unsigned int hdr_off, unsigned int sg, unsigned int off) + unsigned int hdr_off, unsigned int sg, unsigned int off) { struct rds_tcp_connection *tc = conn->c_transport_data; int done = 0; @@ -180,7 +180,7 @@ void rds_tcp_write_space(struct sock *sk) struct rds_connection *conn; struct rds_tcp_connection *tc; - read_lock(&sk->sk_callback_lock); + read_lock_bh(&sk->sk_callback_lock); conn = sk->sk_user_data; if (!conn) { write_space = sk->sk_write_space; @@ -196,11 +196,11 @@ void rds_tcp_write_space(struct sock *sk) tc->t_last_seen_una = rds_tcp_snd_una(tc); rds_send_drop_acked(conn, rds_tcp_snd_una(tc), rds_tcp_is_acked); - if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) + if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) queue_delayed_work(rds_wq, &conn->c_send_w, 0); out: - read_unlock(&sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); /* * write_space is only called when data leaves tcp's send queue if diff --git a/net/rds/threads.c b/net/rds/threads.c index 454aa6d23..4a3230457 100644 --- a/net/rds/threads.c +++ b/net/rds/threads.c @@ -71,9 +71,9 @@ struct workqueue_struct *rds_wq; EXPORT_SYMBOL_GPL(rds_wq); -void rds_connect_complete(struct rds_connection *conn) +void rds_connect_path_complete(struct rds_connection *conn, int curr) { - if (!rds_conn_transition(conn, RDS_CONN_CONNECTING, RDS_CONN_UP)) { + if (!rds_conn_transition(conn, curr, RDS_CONN_UP)) { printk(KERN_WARNING "%s: Cannot transition to state UP, " "current state is %d\n", __func__, @@ -90,6 +90,12 @@ void rds_connect_complete(struct rds_connection *conn) queue_delayed_work(rds_wq, &conn->c_send_w, 0); queue_delayed_work(rds_wq, &conn->c_recv_w, 0); } +EXPORT_SYMBOL_GPL(rds_connect_path_complete); + +void rds_connect_complete(struct rds_connection *conn) +{ + rds_connect_path_complete(conn, RDS_CONN_CONNECTING); +} EXPORT_SYMBOL_GPL(rds_connect_complete); /* diff --git a/net/rds/transport.c b/net/rds/transport.c index f3afd1d60..2ffd3e30c 100644 --- a/net/rds/transport.c +++ b/net/rds/transport.c @@ -140,8 +140,7 @@ unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter, rds_info_iter_unmap(iter); down_read(&rds_trans_sem); - for (i = 0; i < RDS_TRANS_COUNT; i++) - { + for (i = 0; i < RDS_TRANS_COUNT; i++) { trans = transports[i]; if (!trans || !trans->stats_info_copy) continue; diff --git a/net/rfkill/core.c b/net/rfkill/core.c index 03f26e3a6..884027f62 100644 --- a/net/rfkill/core.c +++ b/net/rfkill/core.c @@ -1141,6 +1141,7 @@ static ssize_t rfkill_fop_write(struct file *file, const char __user *buf, { struct rfkill *rfkill; struct rfkill_event ev; + int ret; /* we don't need the 'hard' variable but accept it */ if (count < RFKILL_EVENT_SIZE_V1 - 1) @@ -1155,29 +1156,36 @@ static ssize_t rfkill_fop_write(struct file *file, const char __user *buf, if (copy_from_user(&ev, buf, count)) return -EFAULT; - if (ev.op != RFKILL_OP_CHANGE && ev.op != RFKILL_OP_CHANGE_ALL) - return -EINVAL; - if (ev.type >= NUM_RFKILL_TYPES) return -EINVAL; mutex_lock(&rfkill_global_mutex); - if (ev.op == RFKILL_OP_CHANGE_ALL) + switch (ev.op) { + case RFKILL_OP_CHANGE_ALL: rfkill_update_global_state(ev.type, ev.soft); - - list_for_each_entry(rfkill, &rfkill_list, node) { - if (rfkill->idx != ev.idx && ev.op != RFKILL_OP_CHANGE_ALL) - continue; - - if (rfkill->type != ev.type && ev.type != RFKILL_TYPE_ALL) - continue; - - rfkill_set_block(rfkill, ev.soft); + list_for_each_entry(rfkill, &rfkill_list, node) + if (rfkill->type == ev.type || + ev.type == RFKILL_TYPE_ALL) + rfkill_set_block(rfkill, ev.soft); + ret = 0; + break; + case RFKILL_OP_CHANGE: + list_for_each_entry(rfkill, &rfkill_list, node) + if (rfkill->idx == ev.idx && + (rfkill->type == ev.type || + ev.type == RFKILL_TYPE_ALL)) + rfkill_set_block(rfkill, ev.soft); + ret = 0; + break; + default: + ret = -EINVAL; + break; } + mutex_unlock(&rfkill_global_mutex); - return count; + return ret ?: count; } static int rfkill_fop_release(struct inode *inode, struct file *file) diff --git a/net/rose/rose_in.c b/net/rose/rose_in.c index 79c4abcfa..0a6394754 100644 --- a/net/rose/rose_in.c +++ b/net/rose/rose_in.c @@ -164,7 +164,8 @@ static int rose_state3_machine(struct sock *sk, struct sk_buff *skb, int framety rose_frames_acked(sk, nr); if (ns == rose->vr) { rose_start_idletimer(sk); - if (sock_queue_rcv_skb(sk, skb) == 0) { + if (sk_filter_trim_cap(sk, skb, ROSE_MIN_LEN) == 0 && + __sock_queue_rcv_skb(sk, skb) == 0) { rose->vr = (rose->vr + 1) % ROSE_MODULUS; queued = 1; } else { diff --git a/net/rxrpc/Kconfig b/net/rxrpc/Kconfig index 23dcef12b..784c53163 100644 --- a/net/rxrpc/Kconfig +++ b/net/rxrpc/Kconfig @@ -30,7 +30,7 @@ config AF_RXRPC_DEBUG config RXKAD - tristate "RxRPC Kerberos security" + bool "RxRPC Kerberos security" depends on AF_RXRPC select CRYPTO select CRYPTO_MANAGER diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile index ec126f912..e05a06ef2 100644 --- a/net/rxrpc/Makefile +++ b/net/rxrpc/Makefile @@ -18,11 +18,12 @@ af-rxrpc-y := \ ar-recvmsg.o \ ar-security.o \ ar-skbuff.o \ - ar-transport.o + ar-transport.o \ + insecure.o \ + misc.o af-rxrpc-$(CONFIG_PROC_FS) += ar-proc.o +af-rxrpc-$(CONFIG_RXKAD) += rxkad.o af-rxrpc-$(CONFIG_SYSCTL) += sysctl.o obj-$(CONFIG_AF_RXRPC) += af-rxrpc.o - -obj-$(CONFIG_RXKAD) += rxkad.o diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 9d935fa5a..e45e94ca0 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -806,6 +806,12 @@ static int __init af_rxrpc_init(void) goto error_work_queue; } + ret = rxrpc_init_security(); + if (ret < 0) { + printk(KERN_CRIT "RxRPC: Cannot initialise security\n"); + goto error_security; + } + ret = proto_register(&rxrpc_proto, 1); if (ret < 0) { printk(KERN_CRIT "RxRPC: Cannot register protocol\n"); @@ -853,6 +859,8 @@ error_sock: proto_unregister(&rxrpc_proto); error_proto: destroy_workqueue(rxrpc_workqueue); +error_security: + rxrpc_exit_security(); error_work_queue: kmem_cache_destroy(rxrpc_call_jar); error_call_jar: @@ -883,6 +891,7 @@ static void __exit af_rxrpc_exit(void) remove_proc_entry("rxrpc_conns", init_net.proc_net); remove_proc_entry("rxrpc_calls", init_net.proc_net); destroy_workqueue(rxrpc_workqueue); + rxrpc_exit_security(); kmem_cache_destroy(rxrpc_call_jar); _leave(""); } diff --git a/net/rxrpc/ar-accept.c b/net/rxrpc/ar-accept.c index 277731a5e..e7a7f05f1 100644 --- a/net/rxrpc/ar-accept.c +++ b/net/rxrpc/ar-accept.c @@ -108,7 +108,7 @@ static int rxrpc_accept_incoming_call(struct rxrpc_local *local, goto error; } - conn = rxrpc_incoming_connection(trans, &sp->hdr, GFP_NOIO); + conn = rxrpc_incoming_connection(trans, &sp->hdr); rxrpc_put_transport(trans); if (IS_ERR(conn)) { _debug("no conn"); @@ -116,7 +116,7 @@ static int rxrpc_accept_incoming_call(struct rxrpc_local *local, goto error; } - call = rxrpc_incoming_call(rx, conn, &sp->hdr, GFP_NOIO); + call = rxrpc_incoming_call(rx, conn, &sp->hdr); rxrpc_put_connection(conn); if (IS_ERR(call)) { _debug("no call"); diff --git a/net/rxrpc/ar-ack.c b/net/rxrpc/ar-ack.c index 16d967075..374478e00 100644 --- a/net/rxrpc/ar-ack.c +++ b/net/rxrpc/ar-ack.c @@ -20,74 +20,6 @@ #include "ar-internal.h" /* - * How long to wait before scheduling ACK generation after seeing a - * packet with RXRPC_REQUEST_ACK set (in jiffies). - */ -unsigned int rxrpc_requested_ack_delay = 1; - -/* - * How long to wait before scheduling an ACK with subtype DELAY (in jiffies). - * - * We use this when we've received new data packets. If those packets aren't - * all consumed within this time we will send a DELAY ACK if an ACK was not - * requested to let the sender know it doesn't need to resend. - */ -unsigned int rxrpc_soft_ack_delay = 1 * HZ; - -/* - * How long to wait before scheduling an ACK with subtype IDLE (in jiffies). - * - * We use this when we've consumed some previously soft-ACK'd packets when - * further packets aren't immediately received to decide when to send an IDLE - * ACK let the other end know that it can free up its Tx buffer space. - */ -unsigned int rxrpc_idle_ack_delay = 0.5 * HZ; - -/* - * Receive window size in packets. This indicates the maximum number of - * unconsumed received packets we're willing to retain in memory. Once this - * limit is hit, we should generate an EXCEEDS_WINDOW ACK and discard further - * packets. - */ -unsigned int rxrpc_rx_window_size = 32; - -/* - * Maximum Rx MTU size. This indicates to the sender the size of jumbo packet - * made by gluing normal packets together that we're willing to handle. - */ -unsigned int rxrpc_rx_mtu = 5692; - -/* - * The maximum number of fragments in a received jumbo packet that we tell the - * sender that we're willing to handle. - */ -unsigned int rxrpc_rx_jumbo_max = 4; - -static const char *rxrpc_acks(u8 reason) -{ - static const char *const str[] = { - "---", "REQ", "DUP", "OOS", "WIN", "MEM", "PNG", "PNR", "DLY", - "IDL", "-?-" - }; - - if (reason >= ARRAY_SIZE(str)) - reason = ARRAY_SIZE(str) - 1; - return str[reason]; -} - -static const s8 rxrpc_ack_priority[] = { - [0] = 0, - [RXRPC_ACK_DELAY] = 1, - [RXRPC_ACK_REQUESTED] = 2, - [RXRPC_ACK_IDLE] = 3, - [RXRPC_ACK_PING_RESPONSE] = 4, - [RXRPC_ACK_DUPLICATE] = 5, - [RXRPC_ACK_OUT_OF_SEQUENCE] = 6, - [RXRPC_ACK_EXCEEDS_WINDOW] = 7, - [RXRPC_ACK_NOSPACE] = 8, -}; - -/* * propose an ACK be sent */ void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, @@ -426,7 +358,7 @@ static void rxrpc_rotate_tx_window(struct rxrpc_call *call, u32 hard) int tail = call->acks_tail, old_tail; int win = CIRC_CNT(call->acks_head, tail, call->acks_winsz); - kenter("{%u,%u},%u", call->acks_hard, win, hard); + _enter("{%u,%u},%u", call->acks_hard, win, hard); ASSERTCMP(hard - call->acks_hard, <=, win); @@ -656,7 +588,8 @@ process_further: _proto("OOSQ DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq); /* secured packets must be verified and possibly decrypted */ - if (rxrpc_verify_packet(call, skb, _abort_code) < 0) + if (call->conn->security->verify_packet(call, skb, + _abort_code) < 0) goto protocol_error; rxrpc_insert_oos_packet(call, skb); @@ -901,8 +834,8 @@ void rxrpc_process_call(struct work_struct *work) /* there's a good chance we're going to have to send a message, so set * one up in advance */ - msg.msg_name = &call->conn->trans->peer->srx.transport.sin; - msg.msg_namelen = sizeof(call->conn->trans->peer->srx.transport.sin); + msg.msg_name = &call->conn->trans->peer->srx.transport; + msg.msg_namelen = call->conn->trans->peer->srx.transport_len; msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; @@ -973,7 +906,7 @@ void rxrpc_process_call(struct work_struct *work) ECONNABORTED, true) < 0) goto no_mem; whdr.type = RXRPC_PACKET_TYPE_ABORT; - data = htonl(call->abort_code); + data = htonl(call->local_abort); iov[1].iov_base = &data; iov[1].iov_len = sizeof(data); genbit = RXRPC_CALL_EV_ABORT; @@ -1036,7 +969,7 @@ void rxrpc_process_call(struct work_struct *work) write_lock_bh(&call->state_lock); if (call->state <= RXRPC_CALL_COMPLETE) { call->state = RXRPC_CALL_LOCALLY_ABORTED; - call->abort_code = RX_CALL_TIMEOUT; + call->local_abort = RX_CALL_TIMEOUT; set_bit(RXRPC_CALL_EV_ABORT, &call->events); } write_unlock_bh(&call->state_lock); diff --git a/net/rxrpc/ar-call.c b/net/rxrpc/ar-call.c index 7c8d300ad..571a41fd5 100644 --- a/net/rxrpc/ar-call.c +++ b/net/rxrpc/ar-call.c @@ -411,18 +411,17 @@ found_extant_second: */ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx, struct rxrpc_connection *conn, - struct rxrpc_host_header *hdr, - gfp_t gfp) + struct rxrpc_host_header *hdr) { struct rxrpc_call *call, *candidate; struct rb_node **p, *parent; u32 call_id; - _enter(",%d,,%x", conn->debug_id, gfp); + _enter(",%d", conn->debug_id); ASSERT(rx != NULL); - candidate = rxrpc_alloc_call(gfp); + candidate = rxrpc_alloc_call(GFP_NOIO); if (!candidate) return ERR_PTR(-EBUSY); @@ -682,7 +681,7 @@ void rxrpc_release_call(struct rxrpc_call *call) call->state != RXRPC_CALL_CLIENT_FINAL_ACK) { _debug("+++ ABORTING STATE %d +++\n", call->state); call->state = RXRPC_CALL_LOCALLY_ABORTED; - call->abort_code = RX_CALL_DEAD; + call->local_abort = RX_CALL_DEAD; set_bit(RXRPC_CALL_EV_ABORT, &call->events); rxrpc_queue_call(call); } @@ -758,7 +757,7 @@ static void rxrpc_mark_call_released(struct rxrpc_call *call) if (call->state < RXRPC_CALL_COMPLETE) { _debug("abort call %p", call); call->state = RXRPC_CALL_LOCALLY_ABORTED; - call->abort_code = RX_CALL_DEAD; + call->local_abort = RX_CALL_DEAD; if (!test_and_set_bit(RXRPC_CALL_EV_ABORT, &call->events)) sched = true; } diff --git a/net/rxrpc/ar-connection.c b/net/rxrpc/ar-connection.c index 9942da1ed..97f4fae74 100644 --- a/net/rxrpc/ar-connection.c +++ b/net/rxrpc/ar-connection.c @@ -207,6 +207,7 @@ static struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp) INIT_LIST_HEAD(&conn->bundle_link); conn->calls = RB_ROOT; skb_queue_head_init(&conn->rx_queue); + conn->security = &rxrpc_no_security; rwlock_init(&conn->lock); spin_lock_init(&conn->state_lock); atomic_set(&conn->usage, 1); @@ -564,8 +565,7 @@ int rxrpc_connect_call(struct rxrpc_sock *rx, candidate->debug_id, candidate->trans->debug_id); rxrpc_assign_connection_id(candidate); - if (candidate->security) - candidate->security->prime_packet_security(candidate); + candidate->security->prime_packet_security(candidate); /* leave the candidate lurking in zombie mode attached to the * bundle until we're ready for it */ @@ -619,8 +619,7 @@ interrupted: */ struct rxrpc_connection * rxrpc_incoming_connection(struct rxrpc_transport *trans, - struct rxrpc_host_header *hdr, - gfp_t gfp) + struct rxrpc_host_header *hdr) { struct rxrpc_connection *conn, *candidate = NULL; struct rb_node *p, **pp; @@ -659,7 +658,7 @@ rxrpc_incoming_connection(struct rxrpc_transport *trans, /* not yet present - create a candidate for a new record and then * redo the search */ - candidate = rxrpc_alloc_connection(gfp); + candidate = rxrpc_alloc_connection(GFP_NOIO); if (!candidate) { _leave(" = -ENOMEM"); return ERR_PTR(-ENOMEM); @@ -831,7 +830,10 @@ static void rxrpc_destroy_connection(struct rxrpc_connection *conn) ASSERT(RB_EMPTY_ROOT(&conn->calls)); rxrpc_purge_queue(&conn->rx_queue); - rxrpc_clear_conn_security(conn); + conn->security->clear(conn); + key_put(conn->key); + key_put(conn->server_key); + rxrpc_put_transport(conn->trans); kfree(conn); _leave(""); diff --git a/net/rxrpc/ar-connevent.c b/net/rxrpc/ar-connevent.c index 1bdaaed8c..5f9563968 100644 --- a/net/rxrpc/ar-connevent.c +++ b/net/rxrpc/ar-connevent.c @@ -40,11 +40,13 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn, int state, write_lock(&call->state_lock); if (call->state <= RXRPC_CALL_COMPLETE) { call->state = state; - call->abort_code = abort_code; - if (state == RXRPC_CALL_LOCALLY_ABORTED) + if (state == RXRPC_CALL_LOCALLY_ABORTED) { + call->local_abort = conn->local_abort; set_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events); - else + } else { + call->remote_abort = conn->remote_abort; set_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events); + } rxrpc_queue_call(call); } write_unlock(&call->state_lock); @@ -84,8 +86,8 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn, rxrpc_abort_calls(conn, RXRPC_CALL_LOCALLY_ABORTED, abort_code); - msg.msg_name = &conn->trans->peer->srx.transport.sin; - msg.msg_namelen = sizeof(conn->trans->peer->srx.transport.sin); + msg.msg_name = &conn->trans->peer->srx.transport; + msg.msg_namelen = conn->trans->peer->srx.transport_len; msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; @@ -101,7 +103,7 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn, whdr._rsvd = 0; whdr.serviceId = htons(conn->service_id); - word = htonl(abort_code); + word = htonl(conn->local_abort); iov[0].iov_base = &whdr; iov[0].iov_len = sizeof(whdr); @@ -112,7 +114,7 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn, serial = atomic_inc_return(&conn->serial); whdr.serial = htonl(serial); - _proto("Tx CONN ABORT %%%u { %d }", serial, abort_code); + _proto("Tx CONN ABORT %%%u { %d }", serial, conn->local_abort); ret = kernel_sendmsg(conn->trans->local->socket, &msg, iov, 2, len); if (ret < 0) { @@ -172,15 +174,10 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, return -ECONNABORTED; case RXRPC_PACKET_TYPE_CHALLENGE: - if (conn->security) - return conn->security->respond_to_challenge( - conn, skb, _abort_code); - return -EPROTO; + return conn->security->respond_to_challenge(conn, skb, + _abort_code); case RXRPC_PACKET_TYPE_RESPONSE: - if (!conn->security) - return -EPROTO; - ret = conn->security->verify_response(conn, skb, _abort_code); if (ret < 0) return ret; @@ -236,8 +233,6 @@ static void rxrpc_secure_connection(struct rxrpc_connection *conn) } } - ASSERT(conn->security != NULL); - if (conn->security->issue_challenge(conn) < 0) { abort_code = RX_CALL_DEAD; ret = -ENOMEM; diff --git a/net/rxrpc/ar-input.c b/net/rxrpc/ar-input.c index 63ed75c40..6ff97412a 100644 --- a/net/rxrpc/ar-input.c +++ b/net/rxrpc/ar-input.c @@ -25,12 +25,6 @@ #include <net/net_namespace.h> #include "ar-internal.h" -const char *rxrpc_pkts[] = { - "?00", - "DATA", "ACK", "BUSY", "ABORT", "ACKALL", "CHALL", "RESP", "DEBUG", - "?09", "?10", "?11", "?12", "VERSION", "?14", "?15" -}; - /* * queue a packet for recvmsg to pass to userspace * - the caller must hold a lock on call->lock @@ -199,7 +193,7 @@ static int rxrpc_fast_process_data(struct rxrpc_call *call, /* if the packet need security things doing to it, then it goes down * the slow path */ - if (call->conn->security) + if (call->conn->security_ix) goto enqueue_packet; sp->call = call; @@ -355,7 +349,7 @@ void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb) write_lock_bh(&call->state_lock); if (call->state < RXRPC_CALL_COMPLETE) { call->state = RXRPC_CALL_REMOTELY_ABORTED; - call->abort_code = abort_code; + call->remote_abort = abort_code; set_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events); rxrpc_queue_call(call); } @@ -428,7 +422,7 @@ protocol_error: protocol_error_locked: if (call->state <= RXRPC_CALL_COMPLETE) { call->state = RXRPC_CALL_LOCALLY_ABORTED; - call->abort_code = RX_PROTOCOL_ERROR; + call->local_abort = RX_PROTOCOL_ERROR; set_bit(RXRPC_CALL_EV_ABORT, &call->events); rxrpc_queue_call(call); } @@ -500,7 +494,7 @@ protocol_error: write_lock_bh(&call->state_lock); if (call->state <= RXRPC_CALL_COMPLETE) { call->state = RXRPC_CALL_LOCALLY_ABORTED; - call->abort_code = RX_PROTOCOL_ERROR; + call->local_abort = RX_PROTOCOL_ERROR; set_bit(RXRPC_CALL_EV_ABORT, &call->events); rxrpc_queue_call(call); } @@ -612,9 +606,9 @@ int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb) struct rxrpc_wire_header whdr; /* dig out the RxRPC connection details */ - if (skb_copy_bits(skb, sizeof(struct udphdr), &whdr, sizeof(whdr)) < 0) + if (skb_copy_bits(skb, 0, &whdr, sizeof(whdr)) < 0) return -EBADMSG; - if (!pskb_pull(skb, sizeof(struct udphdr) + sizeof(whdr))) + if (!pskb_pull(skb, sizeof(whdr))) BUG(); memset(sp, 0, sizeof(*sp)); @@ -704,12 +698,12 @@ void rxrpc_data_ready(struct sock *sk) if (skb_checksum_complete(skb)) { rxrpc_free_skb(skb); rxrpc_put_local(local); - UDP_INC_STATS_BH(&init_net, UDP_MIB_INERRORS, 0); + __UDP_INC_STATS(&init_net, UDP_MIB_INERRORS, 0); _leave(" [CSUM failed]"); return; } - UDP_INC_STATS_BH(&init_net, UDP_MIB_INDATAGRAMS, 0); + __UDP_INC_STATS(&init_net, UDP_MIB_INDATAGRAMS, 0); /* The socket buffer we have is owned by UDP, with UDP's data all over * it, but we really want our own data there. diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index cd6cdbe87..f0b807a16 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -9,6 +9,7 @@ * 2 of the License, or (at your option) any later version. */ +#include <net/sock.h> #include <rxrpc/packet.h> #if 0 @@ -124,11 +125,15 @@ enum rxrpc_command { * RxRPC security module interface */ struct rxrpc_security { - struct module *owner; /* providing module */ - struct list_head link; /* link in master list */ const char *name; /* name of this service */ u8 security_index; /* security type provided */ + /* Initialise a security service */ + int (*init)(void); + + /* Clean up a security service */ + void (*exit)(void); + /* initialise a connection's security */ int (*init_connection_security)(struct rxrpc_connection *); @@ -268,7 +273,7 @@ struct rxrpc_connection { struct rb_root calls; /* calls on this connection */ struct sk_buff_head rx_queue; /* received conn-level packets */ struct rxrpc_call *channels[RXRPC_MAXCALLS]; /* channels (active calls) */ - struct rxrpc_security *security; /* applied security module */ + const struct rxrpc_security *security; /* applied security module */ struct key *key; /* security for this connection (client) */ struct key *server_key; /* security for this service */ struct crypto_skcipher *cipher; /* encryption handle */ @@ -289,7 +294,9 @@ struct rxrpc_connection { RXRPC_CONN_LOCALLY_ABORTED, /* - conn aborted locally */ RXRPC_CONN_NETWORK_ERROR, /* - conn terminated by network error */ } state; - int error; /* error code for local abort */ + u32 local_abort; /* local abort code */ + u32 remote_abort; /* remote abort code */ + int error; /* local error incurred */ int debug_id; /* debug ID for printks */ unsigned int call_counter; /* call ID counter */ atomic_t serial; /* packet serial number counter */ @@ -399,7 +406,9 @@ struct rxrpc_call { rwlock_t state_lock; /* lock for state transition */ atomic_t usage; atomic_t sequence; /* Tx data packet sequence counter */ - u32 abort_code; /* local/remote abort code */ + u32 local_abort; /* local abort code */ + u32 remote_abort; /* remote abort code */ + int error; /* local error incurred */ enum rxrpc_call_state state : 8; /* current state of call */ int debug_id; /* debug ID for printks */ u8 channel; /* connection channel occupied by this call */ @@ -453,7 +462,7 @@ static inline void rxrpc_abort_call(struct rxrpc_call *call, u32 abort_code) { write_lock_bh(&call->state_lock); if (call->state < RXRPC_CALL_COMPLETE) { - call->abort_code = abort_code; + call->local_abort = abort_code; call->state = RXRPC_CALL_LOCALLY_ABORTED; set_bit(RXRPC_CALL_EV_ABORT, &call->events); } @@ -478,13 +487,6 @@ int rxrpc_reject_call(struct rxrpc_sock *); /* * ar-ack.c */ -extern unsigned int rxrpc_requested_ack_delay; -extern unsigned int rxrpc_soft_ack_delay; -extern unsigned int rxrpc_idle_ack_delay; -extern unsigned int rxrpc_rx_window_size; -extern unsigned int rxrpc_rx_mtu; -extern unsigned int rxrpc_rx_jumbo_max; - void __rxrpc_propose_ACK(struct rxrpc_call *, u8, u32, bool); void rxrpc_propose_ACK(struct rxrpc_call *, u8, u32, bool); void rxrpc_process_call(struct work_struct *); @@ -506,7 +508,7 @@ struct rxrpc_call *rxrpc_get_client_call(struct rxrpc_sock *, unsigned long, int, gfp_t); struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *, struct rxrpc_connection *, - struct rxrpc_host_header *, gfp_t); + struct rxrpc_host_header *); struct rxrpc_call *rxrpc_find_server_call(struct rxrpc_sock *, unsigned long); void rxrpc_release_call(struct rxrpc_call *); void rxrpc_release_calls_on_socket(struct rxrpc_sock *); @@ -531,8 +533,7 @@ void __exit rxrpc_destroy_all_connections(void); struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_transport *, struct rxrpc_host_header *); extern struct rxrpc_connection * -rxrpc_incoming_connection(struct rxrpc_transport *, struct rxrpc_host_header *, - gfp_t); +rxrpc_incoming_connection(struct rxrpc_transport *, struct rxrpc_host_header *); /* * ar-connevent.c @@ -550,8 +551,6 @@ void rxrpc_UDP_error_handler(struct work_struct *); /* * ar-input.c */ -extern const char *rxrpc_pkts[]; - void rxrpc_data_ready(struct sock *); int rxrpc_queue_rcv_skb(struct rxrpc_call *, struct sk_buff *, bool, bool); void rxrpc_fast_process_packet(struct rxrpc_call *, struct sk_buff *); @@ -610,14 +609,10 @@ int rxrpc_recvmsg(struct socket *, struct msghdr *, size_t, int); /* * ar-security.c */ -int rxrpc_register_security(struct rxrpc_security *); -void rxrpc_unregister_security(struct rxrpc_security *); +int __init rxrpc_init_security(void); +void rxrpc_exit_security(void); int rxrpc_init_client_conn_security(struct rxrpc_connection *); int rxrpc_init_server_conn_security(struct rxrpc_connection *); -int rxrpc_secure_packet(const struct rxrpc_call *, struct sk_buff *, size_t, - void *); -int rxrpc_verify_packet(const struct rxrpc_call *, struct sk_buff *, u32 *); -void rxrpc_clear_conn_security(struct rxrpc_connection *); /* * ar-skbuff.c @@ -637,6 +632,33 @@ struct rxrpc_transport *rxrpc_find_transport(struct rxrpc_local *, struct rxrpc_peer *); /* + * insecure.c + */ +extern const struct rxrpc_security rxrpc_no_security; + +/* + * misc.c + */ +extern unsigned int rxrpc_requested_ack_delay; +extern unsigned int rxrpc_soft_ack_delay; +extern unsigned int rxrpc_idle_ack_delay; +extern unsigned int rxrpc_rx_window_size; +extern unsigned int rxrpc_rx_mtu; +extern unsigned int rxrpc_rx_jumbo_max; + +extern const char *const rxrpc_pkts[]; +extern const s8 rxrpc_ack_priority[]; + +extern const char *rxrpc_acks(u8 reason); + +/* + * rxkad.c + */ +#ifdef CONFIG_RXKAD +extern const struct rxrpc_security rxkad; +#endif + +/* * sysctl.c */ #ifdef CONFIG_SYSCTL diff --git a/net/rxrpc/ar-key.c b/net/rxrpc/ar-key.c index 3fb492eed..1021b4c0b 100644 --- a/net/rxrpc/ar-key.c +++ b/net/rxrpc/ar-key.c @@ -965,7 +965,7 @@ int rxrpc_get_server_data_key(struct rxrpc_connection *conn, key = key_alloc(&key_type_rxrpc, "x", GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, 0, - KEY_ALLOC_NOT_IN_QUOTA); + KEY_ALLOC_NOT_IN_QUOTA, NULL); if (IS_ERR(key)) { _leave(" = -ENOMEM [alloc %ld]", PTR_ERR(key)); return -ENOMEM; @@ -1012,7 +1012,7 @@ struct key *rxrpc_get_null_key(const char *keyname) key = key_alloc(&key_type_rxrpc, keyname, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, - KEY_POS_SEARCH, KEY_ALLOC_NOT_IN_QUOTA); + KEY_POS_SEARCH, KEY_ALLOC_NOT_IN_QUOTA, NULL); if (IS_ERR(key)) return key; diff --git a/net/rxrpc/ar-output.c b/net/rxrpc/ar-output.c index d36fb6e1a..51cb10062 100644 --- a/net/rxrpc/ar-output.c +++ b/net/rxrpc/ar-output.c @@ -110,7 +110,7 @@ static void rxrpc_send_abort(struct rxrpc_call *call, u32 abort_code) if (call->state <= RXRPC_CALL_COMPLETE) { call->state = RXRPC_CALL_LOCALLY_ABORTED; - call->abort_code = abort_code; + call->local_abort = abort_code; set_bit(RXRPC_CALL_EV_ABORT, &call->events); del_timer_sync(&call->resend_timer); del_timer_sync(&call->ack_timer); @@ -663,7 +663,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, size_t pad; /* pad out if we're using security */ - if (conn->security) { + if (conn->security_ix) { pad = conn->security_size + skb->mark; pad = conn->size_align - pad; pad &= conn->size_align - 1; @@ -695,7 +695,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, if (more && seq & 1) sp->hdr.flags |= RXRPC_REQUEST_ACK; - ret = rxrpc_secure_packet( + ret = conn->security->secure_packet( call, skb, skb->mark, skb->head + sizeof(struct rxrpc_wire_header)); if (ret < 0) diff --git a/net/rxrpc/ar-proc.c b/net/rxrpc/ar-proc.c index 525b2ba5a..225163bc6 100644 --- a/net/rxrpc/ar-proc.c +++ b/net/rxrpc/ar-proc.c @@ -80,7 +80,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) call->conn->in_clientflag ? "Svc" : "Clt", atomic_read(&call->usage), rxrpc_call_states[call->state], - call->abort_code, + call->remote_abort ?: call->local_abort, call->user_call_ID); return 0; diff --git a/net/rxrpc/ar-recvmsg.c b/net/rxrpc/ar-recvmsg.c index 64facba24..160f0927a 100644 --- a/net/rxrpc/ar-recvmsg.c +++ b/net/rxrpc/ar-recvmsg.c @@ -288,7 +288,11 @@ receive_non_data_message: ret = put_cmsg(msg, SOL_RXRPC, RXRPC_BUSY, 0, &abort_code); break; case RXRPC_SKB_MARK_REMOTE_ABORT: - abort_code = call->abort_code; + abort_code = call->remote_abort; + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &abort_code); + break; + case RXRPC_SKB_MARK_LOCAL_ABORT: + abort_code = call->local_abort; ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &abort_code); break; case RXRPC_SKB_MARK_NET_ERROR: @@ -303,6 +307,7 @@ receive_non_data_message: &abort_code); break; default: + pr_err("RxRPC: Unknown packet mark %u\n", skb->mark); BUG(); break; } @@ -401,9 +406,14 @@ u32 rxrpc_kernel_get_abort_code(struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - ASSERTCMP(skb->mark, ==, RXRPC_SKB_MARK_REMOTE_ABORT); - - return sp->call->abort_code; + switch (skb->mark) { + case RXRPC_SKB_MARK_REMOTE_ABORT: + return sp->call->remote_abort; + case RXRPC_SKB_MARK_LOCAL_ABORT: + return sp->call->local_abort; + default: + BUG(); + } } EXPORT_SYMBOL(rxrpc_kernel_get_abort_code); diff --git a/net/rxrpc/ar-security.c b/net/rxrpc/ar-security.c index ceff6394a..d223253b2 100644 --- a/net/rxrpc/ar-security.c +++ b/net/rxrpc/ar-security.c @@ -22,109 +22,60 @@ static LIST_HEAD(rxrpc_security_methods); static DECLARE_RWSEM(rxrpc_security_sem); -/* - * get an RxRPC security module - */ -static struct rxrpc_security *rxrpc_security_get(struct rxrpc_security *sec) -{ - return try_module_get(sec->owner) ? sec : NULL; -} - -/* - * release an RxRPC security module - */ -static void rxrpc_security_put(struct rxrpc_security *sec) +static const struct rxrpc_security *rxrpc_security_types[] = { + [RXRPC_SECURITY_NONE] = &rxrpc_no_security, +#ifdef CONFIG_RXKAD + [RXRPC_SECURITY_RXKAD] = &rxkad, +#endif +}; + +int __init rxrpc_init_security(void) { - module_put(sec->owner); -} - -/* - * look up an rxrpc security module - */ -static struct rxrpc_security *rxrpc_security_lookup(u8 security_index) -{ - struct rxrpc_security *sec = NULL; - - _enter(""); + int i, ret; - down_read(&rxrpc_security_sem); - - list_for_each_entry(sec, &rxrpc_security_methods, link) { - if (sec->security_index == security_index) { - if (unlikely(!rxrpc_security_get(sec))) - break; - goto out; + for (i = 0; i < ARRAY_SIZE(rxrpc_security_types); i++) { + if (rxrpc_security_types[i]) { + ret = rxrpc_security_types[i]->init(); + if (ret < 0) + goto failed; } } - sec = NULL; -out: - up_read(&rxrpc_security_sem); - _leave(" = %p [%s]", sec, sec ? sec->name : ""); - return sec; + return 0; + +failed: + for (i--; i >= 0; i--) + if (rxrpc_security_types[i]) + rxrpc_security_types[i]->exit(); + return ret; } -/** - * rxrpc_register_security - register an RxRPC security handler - * @sec: security module - * - * register an RxRPC security handler for use by RxRPC - */ -int rxrpc_register_security(struct rxrpc_security *sec) +void rxrpc_exit_security(void) { - struct rxrpc_security *psec; - int ret; + int i; - _enter(""); - down_write(&rxrpc_security_sem); - - ret = -EEXIST; - list_for_each_entry(psec, &rxrpc_security_methods, link) { - if (psec->security_index == sec->security_index) - goto out; - } - - list_add(&sec->link, &rxrpc_security_methods); - - printk(KERN_NOTICE "RxRPC: Registered security type %d '%s'\n", - sec->security_index, sec->name); - ret = 0; - -out: - up_write(&rxrpc_security_sem); - _leave(" = %d", ret); - return ret; + for (i = 0; i < ARRAY_SIZE(rxrpc_security_types); i++) + if (rxrpc_security_types[i]) + rxrpc_security_types[i]->exit(); } -EXPORT_SYMBOL_GPL(rxrpc_register_security); - -/** - * rxrpc_unregister_security - unregister an RxRPC security handler - * @sec: security module - * - * unregister an RxRPC security handler +/* + * look up an rxrpc security module */ -void rxrpc_unregister_security(struct rxrpc_security *sec) +static const struct rxrpc_security *rxrpc_security_lookup(u8 security_index) { - - _enter(""); - down_write(&rxrpc_security_sem); - list_del_init(&sec->link); - up_write(&rxrpc_security_sem); - - printk(KERN_NOTICE "RxRPC: Unregistered security type %d '%s'\n", - sec->security_index, sec->name); + if (security_index >= ARRAY_SIZE(rxrpc_security_types)) + return NULL; + return rxrpc_security_types[security_index]; } -EXPORT_SYMBOL_GPL(rxrpc_unregister_security); - /* * initialise the security on a client connection */ int rxrpc_init_client_conn_security(struct rxrpc_connection *conn) { + const struct rxrpc_security *sec; struct rxrpc_key_token *token; - struct rxrpc_security *sec; struct key *key = conn->key; int ret; @@ -148,8 +99,7 @@ int rxrpc_init_client_conn_security(struct rxrpc_connection *conn) ret = conn->security->init_connection_security(conn); if (ret < 0) { - rxrpc_security_put(conn->security); - conn->security = NULL; + conn->security = &rxrpc_no_security; return ret; } @@ -162,7 +112,7 @@ int rxrpc_init_client_conn_security(struct rxrpc_connection *conn) */ int rxrpc_init_server_conn_security(struct rxrpc_connection *conn) { - struct rxrpc_security *sec; + const struct rxrpc_security *sec; struct rxrpc_local *local = conn->trans->local; struct rxrpc_sock *rx; struct key *key; @@ -188,14 +138,12 @@ int rxrpc_init_server_conn_security(struct rxrpc_connection *conn) /* the service appears to have died */ read_unlock_bh(&local->services_lock); - rxrpc_security_put(sec); _leave(" = -ENOENT"); return -ENOENT; found_service: if (!rx->securities) { read_unlock_bh(&local->services_lock); - rxrpc_security_put(sec); _leave(" = -ENOKEY"); return -ENOKEY; } @@ -205,7 +153,6 @@ found_service: &key_type_rxrpc_s, kdesc); if (IS_ERR(kref)) { read_unlock_bh(&local->services_lock); - rxrpc_security_put(sec); _leave(" = %ld [search]", PTR_ERR(kref)); return PTR_ERR(kref); } @@ -219,46 +166,3 @@ found_service: _leave(" = 0"); return 0; } - -/* - * secure a packet prior to transmission - */ -int rxrpc_secure_packet(const struct rxrpc_call *call, - struct sk_buff *skb, - size_t data_size, - void *sechdr) -{ - if (call->conn->security) - return call->conn->security->secure_packet( - call, skb, data_size, sechdr); - return 0; -} - -/* - * secure a packet prior to transmission - */ -int rxrpc_verify_packet(const struct rxrpc_call *call, struct sk_buff *skb, - u32 *_abort_code) -{ - if (call->conn->security) - return call->conn->security->verify_packet( - call, skb, _abort_code); - return 0; -} - -/* - * clear connection security - */ -void rxrpc_clear_conn_security(struct rxrpc_connection *conn) -{ - _enter("{%d}", conn->debug_id); - - if (conn->security) { - conn->security->clear(conn); - rxrpc_security_put(conn->security); - conn->security = NULL; - } - - key_put(conn->key); - key_put(conn->server_key); -} diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c new file mode 100644 index 000000000..e57140361 --- /dev/null +++ b/net/rxrpc/insecure.c @@ -0,0 +1,83 @@ +/* Null security operations. + * + * Copyright (C) 2016 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <net/af_rxrpc.h> +#include "ar-internal.h" + +static int none_init_connection_security(struct rxrpc_connection *conn) +{ + return 0; +} + +static void none_prime_packet_security(struct rxrpc_connection *conn) +{ +} + +static int none_secure_packet(const struct rxrpc_call *call, + struct sk_buff *skb, + size_t data_size, + void *sechdr) +{ + return 0; +} + +static int none_verify_packet(const struct rxrpc_call *call, + struct sk_buff *skb, + u32 *_abort_code) +{ + return 0; +} + +static int none_respond_to_challenge(struct rxrpc_connection *conn, + struct sk_buff *skb, + u32 *_abort_code) +{ + *_abort_code = RX_PROTOCOL_ERROR; + return -EPROTO; +} + +static int none_verify_response(struct rxrpc_connection *conn, + struct sk_buff *skb, + u32 *_abort_code) +{ + *_abort_code = RX_PROTOCOL_ERROR; + return -EPROTO; +} + +static void none_clear(struct rxrpc_connection *conn) +{ +} + +static int none_init(void) +{ + return 0; +} + +static void none_exit(void) +{ +} + +/* + * RxRPC Kerberos-based security + */ +const struct rxrpc_security rxrpc_no_security = { + .name = "none", + .security_index = RXRPC_SECURITY_NONE, + .init = none_init, + .exit = none_exit, + .init_connection_security = none_init_connection_security, + .prime_packet_security = none_prime_packet_security, + .secure_packet = none_secure_packet, + .verify_packet = none_verify_packet, + .respond_to_challenge = none_respond_to_challenge, + .verify_response = none_verify_response, + .clear = none_clear, +}; diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c new file mode 100644 index 000000000..1afe9876e --- /dev/null +++ b/net/rxrpc/misc.c @@ -0,0 +1,89 @@ +/* Miscellaneous bits + * + * Copyright (C) 2016 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <net/sock.h> +#include <net/af_rxrpc.h> +#include "ar-internal.h" + +/* + * How long to wait before scheduling ACK generation after seeing a + * packet with RXRPC_REQUEST_ACK set (in jiffies). + */ +unsigned int rxrpc_requested_ack_delay = 1; + +/* + * How long to wait before scheduling an ACK with subtype DELAY (in jiffies). + * + * We use this when we've received new data packets. If those packets aren't + * all consumed within this time we will send a DELAY ACK if an ACK was not + * requested to let the sender know it doesn't need to resend. + */ +unsigned int rxrpc_soft_ack_delay = 1 * HZ; + +/* + * How long to wait before scheduling an ACK with subtype IDLE (in jiffies). + * + * We use this when we've consumed some previously soft-ACK'd packets when + * further packets aren't immediately received to decide when to send an IDLE + * ACK let the other end know that it can free up its Tx buffer space. + */ +unsigned int rxrpc_idle_ack_delay = 0.5 * HZ; + +/* + * Receive window size in packets. This indicates the maximum number of + * unconsumed received packets we're willing to retain in memory. Once this + * limit is hit, we should generate an EXCEEDS_WINDOW ACK and discard further + * packets. + */ +unsigned int rxrpc_rx_window_size = 32; + +/* + * Maximum Rx MTU size. This indicates to the sender the size of jumbo packet + * made by gluing normal packets together that we're willing to handle. + */ +unsigned int rxrpc_rx_mtu = 5692; + +/* + * The maximum number of fragments in a received jumbo packet that we tell the + * sender that we're willing to handle. + */ +unsigned int rxrpc_rx_jumbo_max = 4; + +const char *const rxrpc_pkts[] = { + "?00", + "DATA", "ACK", "BUSY", "ABORT", "ACKALL", "CHALL", "RESP", "DEBUG", + "?09", "?10", "?11", "?12", "VERSION", "?14", "?15" +}; + +const s8 rxrpc_ack_priority[] = { + [0] = 0, + [RXRPC_ACK_DELAY] = 1, + [RXRPC_ACK_REQUESTED] = 2, + [RXRPC_ACK_IDLE] = 3, + [RXRPC_ACK_PING_RESPONSE] = 4, + [RXRPC_ACK_DUPLICATE] = 5, + [RXRPC_ACK_OUT_OF_SEQUENCE] = 6, + [RXRPC_ACK_EXCEEDS_WINDOW] = 7, + [RXRPC_ACK_NOSPACE] = 8, +}; + +const char *rxrpc_acks(u8 reason) +{ + static const char *const str[] = { + "---", "REQ", "DUP", "OOS", "WIN", "MEM", "PNG", "PNR", "DLY", + "IDL", "-?-" + }; + + if (reason >= ARRAY_SIZE(str)) + reason = ARRAY_SIZE(str) - 1; + return str[reason]; +} diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index f0aeb8163..bab56ed64 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -20,7 +20,6 @@ #include <net/sock.h> #include <net/af_rxrpc.h> #include <keys/rxrpc-type.h> -#define rxrpc_debug rxkad_debug #include "ar-internal.h" #define RXKAD_VERSION 2 @@ -31,10 +30,6 @@ #define REALM_SZ 40 /* size of principal's auth domain */ #define SNAME_SZ 40 /* size of service name */ -unsigned int rxrpc_debug; -module_param_named(debug, rxrpc_debug, uint, S_IWUSR | S_IRUGO); -MODULE_PARM_DESC(debug, "rxkad debugging mask"); - struct rxkad_level1_hdr { __be32 data_size; /* true data size (excluding padding) */ }; @@ -44,10 +39,6 @@ struct rxkad_level2_hdr { __be32 checksum; /* decrypted data checksum */ }; -MODULE_DESCRIPTION("RxRPC network protocol type-2 security (Kerberos 4)"); -MODULE_AUTHOR("Red Hat, Inc."); -MODULE_LICENSE("GPL"); - /* * this holds a pinned cipher so that keventd doesn't get called by the cipher * alloc routine, but since we have it to hand, we use it to decrypt RESPONSE @@ -1164,12 +1155,33 @@ static void rxkad_clear(struct rxrpc_connection *conn) } /* + * Initialise the rxkad security service. + */ +static int rxkad_init(void) +{ + /* pin the cipher we need so that the crypto layer doesn't invoke + * keventd to go get it */ + rxkad_ci = crypto_alloc_skcipher("pcbc(fcrypt)", 0, CRYPTO_ALG_ASYNC); + return PTR_ERR_OR_ZERO(rxkad_ci); +} + +/* + * Clean up the rxkad security service. + */ +static void rxkad_exit(void) +{ + if (rxkad_ci) + crypto_free_skcipher(rxkad_ci); +} + +/* * RxRPC Kerberos-based security */ -static struct rxrpc_security rxkad = { - .owner = THIS_MODULE, +const struct rxrpc_security rxkad = { .name = "rxkad", .security_index = RXRPC_SECURITY_RXKAD, + .init = rxkad_init, + .exit = rxkad_exit, .init_connection_security = rxkad_init_connection_security, .prime_packet_security = rxkad_prime_packet_security, .secure_packet = rxkad_secure_packet, @@ -1179,28 +1191,3 @@ static struct rxrpc_security rxkad = { .verify_response = rxkad_verify_response, .clear = rxkad_clear, }; - -static __init int rxkad_init(void) -{ - _enter(""); - - /* pin the cipher we need so that the crypto layer doesn't invoke - * keventd to go get it */ - rxkad_ci = crypto_alloc_skcipher("pcbc(fcrypt)", 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(rxkad_ci)) - return PTR_ERR(rxkad_ci); - - return rxrpc_register_security(&rxkad); -} - -module_init(rxkad_init); - -static __exit void rxkad_exit(void) -{ - _enter(""); - - rxrpc_unregister_security(&rxkad); - crypto_free_skcipher(rxkad_ci); -} - -module_exit(rxkad_exit); diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 96066665e..c7a0b0d48 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -657,12 +657,15 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a, if (compat_mode) { if (a->type == TCA_OLD_COMPAT) err = gnet_stats_start_copy_compat(skb, 0, - TCA_STATS, TCA_XSTATS, &p->tcfc_lock, &d); + TCA_STATS, + TCA_XSTATS, + &p->tcfc_lock, &d, + TCA_PAD); else return 0; } else err = gnet_stats_start_copy(skb, TCA_ACT_STATS, - &p->tcfc_lock, &d); + &p->tcfc_lock, &d, TCA_ACT_PAD); if (err < 0) goto errout; @@ -1115,7 +1118,7 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) nla_nest_end(skb, nest); ret = skb->len; } else - nla_nest_cancel(skb, nest); + nlmsg_trim(skb, b); nlh->nlmsg_len = skb_tail_pointer(skb) - b; if (NETLINK_CB(cb->skb).portid && ret) diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 8c9f1f045..c7123e01c 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -53,9 +53,11 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, filter = rcu_dereference(prog->filter); if (at_ingress) { __skb_push(skb, skb->mac_len); + bpf_compute_data_end(skb); filter_res = BPF_PROG_RUN(filter, skb); __skb_pull(skb, skb->mac_len); } else { + bpf_compute_data_end(skb); filter_res = BPF_PROG_RUN(filter, skb); } rcu_read_unlock(); @@ -156,7 +158,8 @@ static int tcf_bpf_dump(struct sk_buff *skb, struct tc_action *act, tm.lastuse = jiffies_to_clock_t(jiffies - prog->tcf_tm.lastuse); tm.expires = jiffies_to_clock_t(prog->tcf_tm.expires); - if (nla_put(skb, TCA_ACT_BPF_TM, sizeof(tm), &tm)) + if (nla_put_64bit(skb, TCA_ACT_BPF_TM, sizeof(tm), &tm, + TCA_ACT_BPF_PAD)) goto nla_put_failure; return skb->len; diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index c0ed93ce2..2ba700c76 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -163,7 +163,8 @@ static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a, t.install = jiffies_to_clock_t(jiffies - ci->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - ci->tcf_tm.lastuse); t.expires = jiffies_to_clock_t(ci->tcf_tm.expires); - if (nla_put(skb, TCA_CONNMARK_TM, sizeof(t), &t)) + if (nla_put_64bit(skb, TCA_CONNMARK_TM, sizeof(t), &t, + TCA_CONNMARK_PAD)) goto nla_put_failure; return skb->len; diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index d22426cde..28e934ed0 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -549,7 +549,7 @@ static int tcf_csum_dump(struct sk_buff *skb, t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse); t.expires = jiffies_to_clock_t(p->tcf_tm.expires); - if (nla_put(skb, TCA_CSUM_TM, sizeof(t), &t)) + if (nla_put_64bit(skb, TCA_CSUM_TM, sizeof(t), &t, TCA_CSUM_PAD)) goto nla_put_failure; return skb->len; diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 887fc1f20..ec5cc8435 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -148,6 +148,20 @@ static int tcf_gact(struct sk_buff *skb, const struct tc_action *a, return action; } +static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets, + u64 lastuse) +{ + struct tcf_gact *gact = a->priv; + int action = READ_ONCE(gact->tcf_action); + struct tcf_t *tm = &gact->tcf_tm; + + _bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats), bytes, packets); + if (action == TC_ACT_SHOT) + this_cpu_ptr(gact->common.cpu_qstats)->drops += packets; + + tm->lastuse = lastuse; +} + static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); @@ -177,7 +191,7 @@ static int tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int t.install = jiffies_to_clock_t(jiffies - gact->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - gact->tcf_tm.lastuse); t.expires = jiffies_to_clock_t(gact->tcf_tm.expires); - if (nla_put(skb, TCA_GACT_TM, sizeof(t), &t)) + if (nla_put_64bit(skb, TCA_GACT_TM, sizeof(t), &t, TCA_GACT_PAD)) goto nla_put_failure; return skb->len; @@ -207,6 +221,7 @@ static struct tc_action_ops act_gact_ops = { .type = TCA_ACT_GACT, .owner = THIS_MODULE, .act = tcf_gact, + .stats_update = tcf_gact_stats_update, .dump = tcf_gact_dump, .init = tcf_gact_init, .walk = tcf_gact_walker, diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 343d011aa..ea4a2fef1 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -106,9 +106,9 @@ int ife_get_meta_u16(struct sk_buff *skb, struct tcf_meta_info *mi) } EXPORT_SYMBOL_GPL(ife_get_meta_u16); -int ife_alloc_meta_u32(struct tcf_meta_info *mi, void *metaval) +int ife_alloc_meta_u32(struct tcf_meta_info *mi, void *metaval, gfp_t gfp) { - mi->metaval = kmemdup(metaval, sizeof(u32), GFP_KERNEL); + mi->metaval = kmemdup(metaval, sizeof(u32), gfp); if (!mi->metaval) return -ENOMEM; @@ -116,9 +116,9 @@ int ife_alloc_meta_u32(struct tcf_meta_info *mi, void *metaval) } EXPORT_SYMBOL_GPL(ife_alloc_meta_u32); -int ife_alloc_meta_u16(struct tcf_meta_info *mi, void *metaval) +int ife_alloc_meta_u16(struct tcf_meta_info *mi, void *metaval, gfp_t gfp) { - mi->metaval = kmemdup(metaval, sizeof(u16), GFP_KERNEL); + mi->metaval = kmemdup(metaval, sizeof(u16), gfp); if (!mi->metaval) return -ENOMEM; @@ -240,10 +240,10 @@ static int ife_validate_metatype(struct tcf_meta_ops *ops, void *val, int len) } /* called when adding new meta information - * under ife->tcf_lock + * under ife->tcf_lock for existing action */ static int load_metaops_and_vet(struct tcf_ife_info *ife, u32 metaid, - void *val, int len) + void *val, int len, bool exists) { struct tcf_meta_ops *ops = find_ife_oplist(metaid); int ret = 0; @@ -251,11 +251,13 @@ static int load_metaops_and_vet(struct tcf_ife_info *ife, u32 metaid, if (!ops) { ret = -ENOENT; #ifdef CONFIG_MODULES - spin_unlock_bh(&ife->tcf_lock); + if (exists) + spin_unlock_bh(&ife->tcf_lock); rtnl_unlock(); request_module("ifemeta%u", metaid); rtnl_lock(); - spin_lock_bh(&ife->tcf_lock); + if (exists) + spin_lock_bh(&ife->tcf_lock); ops = find_ife_oplist(metaid); #endif } @@ -272,10 +274,10 @@ static int load_metaops_and_vet(struct tcf_ife_info *ife, u32 metaid, } /* called when adding new meta information - * under ife->tcf_lock + * under ife->tcf_lock for existing action */ static int add_metainfo(struct tcf_ife_info *ife, u32 metaid, void *metaval, - int len) + int len, bool atomic) { struct tcf_meta_info *mi = NULL; struct tcf_meta_ops *ops = find_ife_oplist(metaid); @@ -284,7 +286,7 @@ static int add_metainfo(struct tcf_ife_info *ife, u32 metaid, void *metaval, if (!ops) return -ENOENT; - mi = kzalloc(sizeof(*mi), GFP_KERNEL); + mi = kzalloc(sizeof(*mi), atomic ? GFP_ATOMIC : GFP_KERNEL); if (!mi) { /*put back what find_ife_oplist took */ module_put(ops->owner); @@ -294,7 +296,7 @@ static int add_metainfo(struct tcf_ife_info *ife, u32 metaid, void *metaval, mi->metaid = metaid; mi->ops = ops; if (len > 0) { - ret = ops->alloc(mi, metaval); + ret = ops->alloc(mi, metaval, atomic ? GFP_ATOMIC : GFP_KERNEL); if (ret != 0) { kfree(mi); module_put(ops->owner); @@ -313,11 +315,13 @@ static int use_all_metadata(struct tcf_ife_info *ife) int rc = 0; int installed = 0; + read_lock(&ife_mod_lock); list_for_each_entry(o, &ifeoplist, list) { - rc = add_metainfo(ife, o->metaid, NULL, 0); + rc = add_metainfo(ife, o->metaid, NULL, 0, true); if (rc == 0) installed += 1; } + read_unlock(&ife_mod_lock); if (installed) return 0; @@ -385,8 +389,9 @@ static void tcf_ife_cleanup(struct tc_action *a, int bind) spin_unlock_bh(&ife->tcf_lock); } -/* under ife->tcf_lock */ -static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb) +/* under ife->tcf_lock for existing action */ +static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb, + bool exists) { int len = 0; int rc = 0; @@ -398,11 +403,11 @@ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb) val = nla_data(tb[i]); len = nla_len(tb[i]); - rc = load_metaops_and_vet(ife, i, val, len); + rc = load_metaops_and_vet(ife, i, val, len, exists); if (rc != 0) return rc; - rc = add_metainfo(ife, i, val, len); + rc = add_metainfo(ife, i, val, len, exists); if (rc) return rc; } @@ -474,7 +479,8 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, saddr = nla_data(tb[TCA_IFE_SMAC]); } - spin_lock_bh(&ife->tcf_lock); + if (exists) + spin_lock_bh(&ife->tcf_lock); ife->tcf_action = parm->action; if (parm->flags & IFE_ENCODE) { @@ -504,11 +510,12 @@ metadata_parse_err: if (ret == ACT_P_CREATED) _tcf_ife_cleanup(a, bind); - spin_unlock_bh(&ife->tcf_lock); + if (exists) + spin_unlock_bh(&ife->tcf_lock); return err; } - err = populate_metalist(ife, tb2); + err = populate_metalist(ife, tb2, exists); if (err) goto metadata_parse_err; @@ -523,12 +530,14 @@ metadata_parse_err: if (ret == ACT_P_CREATED) _tcf_ife_cleanup(a, bind); - spin_unlock_bh(&ife->tcf_lock); + if (exists) + spin_unlock_bh(&ife->tcf_lock); return err; } } - spin_unlock_bh(&ife->tcf_lock); + if (exists) + spin_unlock_bh(&ife->tcf_lock); if (ret == ACT_P_CREATED) tcf_hash_insert(tn, a); @@ -556,7 +565,7 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind, t.install = jiffies_to_clock_t(jiffies - ife->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - ife->tcf_tm.lastuse); t.expires = jiffies_to_clock_t(ife->tcf_tm.expires); - if (nla_put(skb, TCA_IFE_TM, sizeof(t), &t)) + if (nla_put_64bit(skb, TCA_IFE_TM, sizeof(t), &t, TCA_IFE_PAD)) goto nla_put_failure; if (!is_zero_ether_addr(ife->eth_dst)) { diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 606323339..d4bd19ee5 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -283,7 +283,7 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int tm.install = jiffies_to_clock_t(jiffies - ipt->tcf_tm.install); tm.lastuse = jiffies_to_clock_t(jiffies - ipt->tcf_tm.lastuse); tm.expires = jiffies_to_clock_t(ipt->tcf_tm.expires); - if (nla_put(skb, TCA_IPT_TM, sizeof (tm), &tm)) + if (nla_put_64bit(skb, TCA_IPT_TM, sizeof(tm), &tm, TCA_IPT_PAD)) goto nla_put_failure; kfree(t); return skb->len; diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 934336e12..1f5bd6ccb 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -36,14 +36,15 @@ static DEFINE_SPINLOCK(mirred_list_lock); static void tcf_mirred_release(struct tc_action *a, int bind) { struct tcf_mirred *m = to_mirred(a); - struct net_device *dev = rcu_dereference_protected(m->tcfm_dev, 1); + struct net_device *dev; /* We could be called either in a RCU callback or with RTNL lock held. */ spin_lock_bh(&mirred_list_lock); list_del(&m->tcfm_list); - spin_unlock_bh(&mirred_list_lock); + dev = rcu_dereference_protected(m->tcfm_dev, 1); if (dev) dev_put(dev); + spin_unlock_bh(&mirred_list_lock); } static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = { @@ -221,7 +222,7 @@ static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, i t.install = jiffies_to_clock_t(jiffies - m->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - m->tcf_tm.lastuse); t.expires = jiffies_to_clock_t(m->tcf_tm.expires); - if (nla_put(skb, TCA_MIRRED_TM, sizeof(t), &t)) + if (nla_put_64bit(skb, TCA_MIRRED_TM, sizeof(t), &t, TCA_MIRRED_PAD)) goto nla_put_failure; return skb->len; diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 0f65cdfbf..c0a879f94 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -267,7 +267,7 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a, t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse); t.expires = jiffies_to_clock_t(p->tcf_tm.expires); - if (nla_put(skb, TCA_NAT_TM, sizeof(t), &t)) + if (nla_put_64bit(skb, TCA_NAT_TM, sizeof(t), &t, TCA_NAT_PAD)) goto nla_put_failure; return skb->len; diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 429c3ab65..c6e18f230 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -203,7 +203,7 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse); t.expires = jiffies_to_clock_t(p->tcf_tm.expires); - if (nla_put(skb, TCA_PEDIT_TM, sizeof(t), &t)) + if (nla_put_64bit(skb, TCA_PEDIT_TM, sizeof(t), &t, TCA_PEDIT_PAD)) goto nla_put_failure; kfree(opt); return skb->len; diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 330f14e30..c55778976 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -38,7 +38,7 @@ struct tcf_police { bool peak_present; }; #define to_police(pc) \ - container_of(pc, struct tcf_police, common) + container_of(pc->priv, struct tcf_police, common) #define POL_TAB_MASK 15 @@ -119,14 +119,12 @@ static int tcf_act_police_locate(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action *a, int ovr, int bind) { - unsigned int h; int ret = 0, err; struct nlattr *tb[TCA_POLICE_MAX + 1]; struct tc_police *parm; struct tcf_police *police; struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL; struct tc_action_net *tn = net_generic(net, police_net_id); - struct tcf_hashinfo *hinfo = tn->hinfo; int size; if (nla == NULL) @@ -145,7 +143,7 @@ static int tcf_act_police_locate(struct net *net, struct nlattr *nla, if (parm->index) { if (tcf_hash_search(tn, a, parm->index)) { - police = to_police(a->priv); + police = to_police(a); if (bind) { police->tcf_bindcnt += 1; police->tcf_refcnt += 1; @@ -156,16 +154,15 @@ static int tcf_act_police_locate(struct net *net, struct nlattr *nla, /* not replacing */ return -EEXIST; } + } else { + ret = tcf_hash_create(tn, parm->index, NULL, a, + sizeof(*police), bind, false); + if (ret) + return ret; + ret = ACT_P_CREATED; } - police = kzalloc(sizeof(*police), GFP_KERNEL); - if (police == NULL) - return -ENOMEM; - ret = ACT_P_CREATED; - police->tcf_refcnt = 1; - spin_lock_init(&police->tcf_lock); - if (bind) - police->tcf_bindcnt = 1; + police = to_police(a); override: if (parm->rate.rate) { err = -ENOMEM; @@ -237,14 +234,8 @@ override: return ret; police->tcfp_t_c = ktime_get_ns(); - police->tcf_index = parm->index ? parm->index : - tcf_hash_new_index(tn); - h = tcf_hash(police->tcf_index, POL_TAB_MASK); - spin_lock_bh(&hinfo->lock); - hlist_add_head(&police->tcf_head, &hinfo->htab[h]); - spin_unlock_bh(&hinfo->lock); + tcf_hash_insert(tn, a); - a->priv = police; return ret; failure_unlock: @@ -253,7 +244,7 @@ failure: qdisc_put_rtab(P_tab); qdisc_put_rtab(R_tab); if (ret == ACT_P_CREATED) - kfree(police); + tcf_hash_cleanup(a, est); return err; } @@ -268,6 +259,7 @@ static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a, spin_lock(&police->tcf_lock); bstats_update(&police->tcf_bstats, skb); + tcf_lastuse_update(&police->tcf_tm); if (police->tcfp_ewma_rate && police->tcf_rate_est.bps >= police->tcfp_ewma_rate) { @@ -327,6 +319,7 @@ tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) .refcnt = police->tcf_refcnt - ref, .bindcnt = police->tcf_bindcnt - bind, }; + struct tcf_t t; if (police->rate_present) psched_ratecfg_getrate(&opt.rate, &police->rate); @@ -340,6 +333,13 @@ tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) if (police->tcfp_ewma_rate && nla_put_u32(skb, TCA_POLICE_AVRATE, police->tcfp_ewma_rate)) goto nla_put_failure; + + t.install = jiffies_to_clock_t(jiffies - police->tcf_tm.install); + t.lastuse = jiffies_to_clock_t(jiffies - police->tcf_tm.lastuse); + t.expires = jiffies_to_clock_t(police->tcf_tm.expires); + if (nla_put_64bit(skb, TCA_POLICE_TM, sizeof(t), &t, TCA_POLICE_PAD)) + goto nla_put_failure; + return skb->len; nla_put_failure: diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 3a33fb648..e42f8daca 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -161,7 +161,7 @@ static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a, t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse); t.expires = jiffies_to_clock_t(d->tcf_tm.expires); - if (nla_put(skb, TCA_DEF_TM, sizeof(t), &t)) + if (nla_put_64bit(skb, TCA_DEF_TM, sizeof(t), &t, TCA_DEF_PAD)) goto nla_put_failure; return skb->len; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 69da5a8f0..e92880296 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -171,7 +171,7 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, t.install = jiffies_to_clock_t(jiffies - d->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - d->tcf_tm.lastuse); t.expires = jiffies_to_clock_t(d->tcf_tm.expires); - if (nla_put(skb, TCA_SKBEDIT_TM, sizeof(t), &t)) + if (nla_put_64bit(skb, TCA_SKBEDIT_TM, sizeof(t), &t, TCA_SKBEDIT_PAD)) goto nla_put_failure; return skb->len; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index c45f926da..ac4adc812 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -185,7 +185,7 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, t.install = jiffies_to_clock_t(jiffies - v->tcf_tm.install); t.lastuse = jiffies_to_clock_t(jiffies - v->tcf_tm.lastuse); t.expires = jiffies_to_clock_t(v->tcf_tm.expires); - if (nla_put(skb, TCA_VLAN_TM, sizeof(t), &t)) + if (nla_put_64bit(skb, TCA_VLAN_TM, sizeof(t), &t, TCA_VLAN_PAD)) goto nla_put_failure; return skb->len; diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 425fe6a0e..7b342c779 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -96,9 +96,11 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, if (at_ingress) { /* It is safe to push/pull even if skb_shared() */ __skb_push(skb, skb->mac_len); + bpf_compute_data_end(skb); filter_res = BPF_PROG_RUN(prog->filter, skb); __skb_pull(skb, skb->mac_len); } else { + bpf_compute_data_end(skb); filter_res = BPF_PROG_RUN(prog->filter, skb); } diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 2181ffc76..b3b7978f4 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -171,7 +171,7 @@ static void fl_hw_destroy_filter(struct tcf_proto *tp, unsigned long cookie) struct tc_cls_flower_offload offload = {0}; struct tc_to_netdev tc; - if (!tc_should_offload(dev, 0)) + if (!tc_should_offload(dev, tp, 0)) return; offload.command = TC_CLSFLOWER_DESTROY; @@ -194,7 +194,7 @@ static void fl_hw_replace_filter(struct tcf_proto *tp, struct tc_cls_flower_offload offload = {0}; struct tc_to_netdev tc; - if (!tc_should_offload(dev, flags)) + if (!tc_should_offload(dev, tp, flags)) return; offload.command = TC_CLSFLOWER_REPLACE; @@ -210,6 +210,25 @@ static void fl_hw_replace_filter(struct tcf_proto *tp, dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc); } +static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f) +{ + struct net_device *dev = tp->q->dev_queue->dev; + struct tc_cls_flower_offload offload = {0}; + struct tc_to_netdev tc; + + if (!tc_should_offload(dev, tp, 0)) + return; + + offload.command = TC_CLSFLOWER_STATS; + offload.cookie = (unsigned long)f; + offload.exts = &f->exts; + + tc.type = TC_SETUP_CLSFLOWER; + tc.cls_flower = &offload; + + dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc); +} + static bool fl_destroy(struct tcf_proto *tp, bool force) { struct cls_fl_head *head = rtnl_dereference(tp->root); @@ -662,6 +681,8 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, goto nla_put_failure; } + fl_hw_update_stats(tp, f); + if (fl_dump_key_val(skb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST, mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK, sizeof(key->eth.dst)) || diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 563cdad76..ffe593efe 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -134,6 +134,11 @@ next_knode: j = 0; #endif + if (tc_skip_sw(n->flags)) { + n = rcu_dereference_bh(n->next); + goto next_knode; + } + #ifdef CONFIG_CLS_U32_MARK if ((skb->mark & n->mask) != n->val) { n = rcu_dereference_bh(n->next); @@ -435,7 +440,7 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle) offload.type = TC_SETUP_CLSU32; offload.cls_u32 = &u32_offload; - if (tc_should_offload(dev, 0)) { + if (tc_should_offload(dev, tp, 0)) { offload.cls_u32->command = TC_CLSU32_DELETE_KNODE; offload.cls_u32->knode.handle = handle; dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, @@ -443,26 +448,32 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle) } } -static void u32_replace_hw_hnode(struct tcf_proto *tp, +static int u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h, u32 flags) { struct net_device *dev = tp->q->dev_queue->dev; struct tc_cls_u32_offload u32_offload = {0}; struct tc_to_netdev offload; + int err; + + if (!tc_should_offload(dev, tp, flags)) + return tc_skip_sw(flags) ? -EINVAL : 0; offload.type = TC_SETUP_CLSU32; offload.cls_u32 = &u32_offload; - if (tc_should_offload(dev, flags)) { - offload.cls_u32->command = TC_CLSU32_NEW_HNODE; - offload.cls_u32->hnode.divisor = h->divisor; - offload.cls_u32->hnode.handle = h->handle; - offload.cls_u32->hnode.prio = h->prio; + offload.cls_u32->command = TC_CLSU32_NEW_HNODE; + offload.cls_u32->hnode.divisor = h->divisor; + offload.cls_u32->hnode.handle = h->handle; + offload.cls_u32->hnode.prio = h->prio; - dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, - tp->protocol, &offload); - } + err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, + tp->protocol, &offload); + if (tc_skip_sw(flags)) + return err; + + return 0; } static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h) @@ -474,7 +485,7 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h) offload.type = TC_SETUP_CLSU32; offload.cls_u32 = &u32_offload; - if (tc_should_offload(dev, 0)) { + if (tc_should_offload(dev, tp, 0)) { offload.cls_u32->command = TC_CLSU32_DELETE_HNODE; offload.cls_u32->hnode.divisor = h->divisor; offload.cls_u32->hnode.handle = h->handle; @@ -485,36 +496,42 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h) } } -static void u32_replace_hw_knode(struct tcf_proto *tp, +static int u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n, u32 flags) { struct net_device *dev = tp->q->dev_queue->dev; struct tc_cls_u32_offload u32_offload = {0}; struct tc_to_netdev offload; + int err; offload.type = TC_SETUP_CLSU32; offload.cls_u32 = &u32_offload; - if (tc_should_offload(dev, flags)) { - offload.cls_u32->command = TC_CLSU32_REPLACE_KNODE; - offload.cls_u32->knode.handle = n->handle; - offload.cls_u32->knode.fshift = n->fshift; + if (!tc_should_offload(dev, tp, flags)) + return tc_skip_sw(flags) ? -EINVAL : 0; + + offload.cls_u32->command = TC_CLSU32_REPLACE_KNODE; + offload.cls_u32->knode.handle = n->handle; + offload.cls_u32->knode.fshift = n->fshift; #ifdef CONFIG_CLS_U32_MARK - offload.cls_u32->knode.val = n->val; - offload.cls_u32->knode.mask = n->mask; + offload.cls_u32->knode.val = n->val; + offload.cls_u32->knode.mask = n->mask; #else - offload.cls_u32->knode.val = 0; - offload.cls_u32->knode.mask = 0; + offload.cls_u32->knode.val = 0; + offload.cls_u32->knode.mask = 0; #endif - offload.cls_u32->knode.sel = &n->sel; - offload.cls_u32->knode.exts = &n->exts; - if (n->ht_down) - offload.cls_u32->knode.link_handle = n->ht_down->handle; + offload.cls_u32->knode.sel = &n->sel; + offload.cls_u32->knode.exts = &n->exts; + if (n->ht_down) + offload.cls_u32->knode.link_handle = n->ht_down->handle; - dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, - tp->protocol, &offload); - } + err = dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, + tp->protocol, &offload); + if (tc_skip_sw(flags)) + return err; + + return 0; } static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) @@ -845,8 +862,11 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, if (err < 0) return err; - if (tb[TCA_U32_FLAGS]) + if (tb[TCA_U32_FLAGS]) { flags = nla_get_u32(tb[TCA_U32_FLAGS]); + if (!tc_flags_valid(flags)) + return -EINVAL; + } n = (struct tc_u_knode *)*arg; if (n) { @@ -871,10 +891,15 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, return err; } + err = u32_replace_hw_knode(tp, new, flags); + if (err) { + u32_destroy_key(tp, new, false); + return err; + } + u32_replace_knode(tp, tp_c, new); tcf_unbind_filter(tp, &n->res); call_rcu(&n->rcu, u32_delete_key_rcu); - u32_replace_hw_knode(tp, new, flags); return 0; } @@ -898,11 +923,17 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, ht->divisor = divisor; ht->handle = handle; ht->prio = tp->prio; + + err = u32_replace_hw_hnode(tp, ht, flags); + if (err) { + kfree(ht); + return err; + } + RCU_INIT_POINTER(ht->next, tp_c->hlist); rcu_assign_pointer(tp_c->hlist, ht); *arg = (unsigned long)ht; - u32_replace_hw_hnode(tp, ht, flags); return 0; } @@ -978,6 +1009,10 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, struct tc_u_knode __rcu **ins; struct tc_u_knode *pins; + err = u32_replace_hw_knode(tp, n, flags); + if (err) + goto errhw; + ins = &ht->ht[TC_U32_HASH(handle)]; for (pins = rtnl_dereference(*ins); pins; ins = &pins->next, pins = rtnl_dereference(*ins)) @@ -986,11 +1021,11 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, RCU_INIT_POINTER(n->next, pins); rcu_assign_pointer(*ins, n); - u32_replace_hw_knode(tp, n, flags); *arg = (unsigned long)n; return 0; } +errhw: #ifdef CONFIG_CLS_U32_MARK free_percpu(n->pcpu_success); errout: @@ -1140,9 +1175,10 @@ static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, gpf->kcnts[i] += pf->kcnts[i]; } - if (nla_put(skb, TCA_U32_PCNT, - sizeof(struct tc_u32_pcnt) + n->sel.nkeys*sizeof(u64), - gpf)) { + if (nla_put_64bit(skb, TCA_U32_PCNT, + sizeof(struct tc_u32_pcnt) + + n->sel.nkeys * sizeof(u64), + gpf, TCA_U32_PAD)) { kfree(gpf); goto nla_put_failure; } diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index f2aabc008..a309a07cc 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -796,7 +796,7 @@ struct meta_type_ops { int (*dump)(struct sk_buff *, struct meta_value *, int); }; -static struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX + 1] = { +static const struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX + 1] = { [TCF_META_TYPE_VAR] = { .destroy = meta_var_destroy, .compare = meta_var_compare, @@ -812,7 +812,7 @@ static struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX + 1] = { } }; -static inline struct meta_type_ops *meta_type_ops(struct meta_value *v) +static inline const struct meta_type_ops *meta_type_ops(struct meta_value *v) { return &__meta_type_ops[meta_type(v)]; } @@ -870,7 +870,7 @@ static int em_meta_match(struct sk_buff *skb, struct tcf_ematch *m, static void meta_delete(struct meta_match *meta) { if (meta) { - struct meta_type_ops *ops = meta_type_ops(&meta->lvalue); + const struct meta_type_ops *ops = meta_type_ops(&meta->lvalue); if (ops && ops->destroy) { ops->destroy(&meta->lvalue); @@ -964,7 +964,7 @@ static int em_meta_dump(struct sk_buff *skb, struct tcf_ematch *em) { struct meta_match *meta = (struct meta_match *) em->data; struct tcf_meta_hdr hdr; - struct meta_type_ops *ops; + const struct meta_type_ops *ops; memset(&hdr, 0, sizeof(hdr)); memcpy(&hdr.left, &meta->lvalue.hdr, sizeof(hdr.left)); diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 3b180ff72..ddf047df5 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -607,6 +607,10 @@ void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires, bool thr if (throttle) qdisc_throttled(wd->qdisc); + if (wd->last_expires == expires) + return; + + wd->last_expires = expires; hrtimer_start(&wd->timer, ns_to_ktime(expires), HRTIMER_MODE_ABS_PINNED); @@ -1365,7 +1369,8 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, goto nla_put_failure; if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS, - qdisc_root_sleeping_lock(q), &d) < 0) + qdisc_root_sleeping_lock(q), &d, + TCA_PAD) < 0) goto nla_put_failure; if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0) @@ -1679,7 +1684,8 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q, goto nla_put_failure; if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS, - qdisc_root_sleeping_lock(q), &d) < 0) + qdisc_root_sleeping_lock(q), &d, + TCA_PAD) < 0) goto nla_put_failure; if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0) diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index 9b7e2980e..dddf3bb65 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -49,6 +49,8 @@ #include <linux/prefetch.h> #include <net/pkt_sched.h> #include <net/codel.h> +#include <net/codel_impl.h> +#include <net/codel_qdisc.h> #define DEFAULT_CODEL_LIMIT 1000 @@ -64,20 +66,33 @@ struct codel_sched_data { * to dequeue a packet from queue. Note: backlog is handled in * codel, we dont need to reduce it here. */ -static struct sk_buff *dequeue(struct codel_vars *vars, struct Qdisc *sch) +static struct sk_buff *dequeue_func(struct codel_vars *vars, void *ctx) { + struct Qdisc *sch = ctx; struct sk_buff *skb = __skb_dequeue(&sch->q); + if (skb) + sch->qstats.backlog -= qdisc_pkt_len(skb); + prefetch(&skb->end); /* we'll need skb_shinfo() */ return skb; } +static void drop_func(struct sk_buff *skb, void *ctx) +{ + struct Qdisc *sch = ctx; + + qdisc_drop(skb, sch); +} + static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch) { struct codel_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; - skb = codel_dequeue(sch, &q->params, &q->vars, &q->stats, dequeue); + skb = codel_dequeue(sch, &sch->qstats.backlog, &q->params, &q->vars, + &q->stats, qdisc_pkt_len, codel_get_enqueue_time, + drop_func, dequeue_func); /* We cant call qdisc_tree_reduce_backlog() if our qlen is 0, * or HTB crashes. Defer it for next round. @@ -173,9 +188,10 @@ static int codel_init(struct Qdisc *sch, struct nlattr *opt) sch->limit = DEFAULT_CODEL_LIMIT; - codel_params_init(&q->params, sch); + codel_params_init(&q->params); codel_vars_init(&q->vars); codel_stats_init(&q->stats); + q->params.mtu = psched_mtu(qdisc_dev(sch)); if (opt) { int err = codel_change(sch, opt); diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index a63e879e8..bf8af2c43 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -375,6 +375,7 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch) cl->deficit = cl->quantum; } + qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; return err; } @@ -407,6 +408,7 @@ static struct sk_buff *drr_dequeue(struct Qdisc *sch) bstats_update(&cl->bstats, skb); qdisc_bstats_update(sch, skb); + qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; return skb; } @@ -428,6 +430,7 @@ static unsigned int drr_drop(struct Qdisc *sch) if (cl->qdisc->ops->drop) { len = cl->qdisc->ops->drop(cl->qdisc); if (len > 0) { + sch->qstats.backlog -= len; sch->q.qlen--; if (cl->qdisc->q.qlen == 0) list_del(&cl->alist); @@ -463,6 +466,7 @@ static void drr_reset_qdisc(struct Qdisc *sch) qdisc_reset(cl->qdisc); } } + sch->qstats.backlog = 0; sch->q.qlen = 0; } diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index d3fc8f9dd..da250b2e0 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -24,6 +24,8 @@ #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/codel.h> +#include <net/codel_impl.h> +#include <net/codel_qdisc.h> /* Fair Queue CoDel. * @@ -57,8 +59,12 @@ struct fq_codel_sched_data { u32 flows_cnt; /* number of flows */ u32 perturbation; /* hash perturbation */ u32 quantum; /* psched_mtu(qdisc_dev(sch)); */ + u32 drop_batch_size; + u32 memory_limit; struct codel_params cparams; struct codel_stats cstats; + u32 memory_usage; + u32 drop_overmemory; u32 drop_overlimit; u32 new_flow_count; @@ -133,17 +139,21 @@ static inline void flow_queue_add(struct fq_codel_flow *flow, skb->next = NULL; } -static unsigned int fq_codel_drop(struct Qdisc *sch) +static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets) { struct fq_codel_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; unsigned int maxbacklog = 0, idx = 0, i, len; struct fq_codel_flow *flow; + unsigned int threshold; + unsigned int mem = 0; - /* Queue is full! Find the fat flow and drop packet from it. + /* Queue is full! Find the fat flow and drop packet(s) from it. * This might sound expensive, but with 1024 flows, we scan * 4KB of memory, and we dont need to handle a complex tree * in fast path (packet queue/enqueue) with many cache misses. + * In stress mode, we'll try to drop 64 packets from the flow, + * amortizing this linear lookup to one cache line per drop. */ for (i = 0; i < q->flows_cnt; i++) { if (q->backlogs[i] > maxbacklog) { @@ -151,15 +161,26 @@ static unsigned int fq_codel_drop(struct Qdisc *sch) idx = i; } } + + /* Our goal is to drop half of this fat flow backlog */ + threshold = maxbacklog >> 1; + flow = &q->flows[idx]; - skb = dequeue_head(flow); - len = qdisc_pkt_len(skb); + len = 0; + i = 0; + do { + skb = dequeue_head(flow); + len += qdisc_pkt_len(skb); + mem += skb->truesize; + kfree_skb(skb); + } while (++i < max_packets && len < threshold); + + flow->dropped += i; q->backlogs[idx] -= len; - sch->q.qlen--; - qdisc_qstats_drop(sch); - qdisc_qstats_backlog_dec(sch, skb); - kfree_skb(skb); - flow->dropped++; + q->memory_usage -= mem; + sch->qstats.drops += i; + sch->qstats.backlog -= len; + sch->q.qlen -= i; return idx; } @@ -168,16 +189,18 @@ static unsigned int fq_codel_qdisc_drop(struct Qdisc *sch) unsigned int prev_backlog; prev_backlog = sch->qstats.backlog; - fq_codel_drop(sch); + fq_codel_drop(sch, 1U); return prev_backlog - sch->qstats.backlog; } static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct fq_codel_sched_data *q = qdisc_priv(sch); - unsigned int idx, prev_backlog; + unsigned int idx, prev_backlog, prev_qlen; struct fq_codel_flow *flow; int uninitialized_var(ret); + unsigned int pkt_len; + bool memory_limited; idx = fq_codel_classify(skb, sch, &ret); if (idx == 0) { @@ -200,19 +223,39 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) flow->deficit = q->quantum; flow->dropped = 0; } - if (++sch->q.qlen <= sch->limit) + q->memory_usage += skb->truesize; + memory_limited = q->memory_usage > q->memory_limit; + if (++sch->q.qlen <= sch->limit && !memory_limited) return NET_XMIT_SUCCESS; prev_backlog = sch->qstats.backlog; - q->drop_overlimit++; - /* Return Congestion Notification only if we dropped a packet - * from this flow. + prev_qlen = sch->q.qlen; + + /* save this packet length as it might be dropped by fq_codel_drop() */ + pkt_len = qdisc_pkt_len(skb); + /* fq_codel_drop() is quite expensive, as it performs a linear search + * in q->backlogs[] to find a fat flow. + * So instead of dropping a single packet, drop half of its backlog + * with a 64 packets limit to not add a too big cpu spike here. */ - if (fq_codel_drop(sch) == idx) - return NET_XMIT_CN; + ret = fq_codel_drop(sch, q->drop_batch_size); - /* As we dropped a packet, better let upper stack know this */ - qdisc_tree_reduce_backlog(sch, 1, prev_backlog - sch->qstats.backlog); + prev_qlen -= sch->q.qlen; + prev_backlog -= sch->qstats.backlog; + q->drop_overlimit += prev_qlen; + if (memory_limited) + q->drop_overmemory += prev_qlen; + + /* As we dropped packet(s), better let upper stack know this. + * If we dropped a packet for this flow, return NET_XMIT_CN, + * but in this case, our parents wont increase their backlogs. + */ + if (ret == idx) { + qdisc_tree_reduce_backlog(sch, prev_qlen - 1, + prev_backlog - pkt_len); + return NET_XMIT_CN; + } + qdisc_tree_reduce_backlog(sch, prev_qlen, prev_backlog); return NET_XMIT_SUCCESS; } @@ -220,8 +263,9 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) * to dequeue a packet from queue. Note: backlog is handled in * codel, we dont need to reduce it here. */ -static struct sk_buff *dequeue(struct codel_vars *vars, struct Qdisc *sch) +static struct sk_buff *dequeue_func(struct codel_vars *vars, void *ctx) { + struct Qdisc *sch = ctx; struct fq_codel_sched_data *q = qdisc_priv(sch); struct fq_codel_flow *flow; struct sk_buff *skb = NULL; @@ -230,11 +274,20 @@ static struct sk_buff *dequeue(struct codel_vars *vars, struct Qdisc *sch) if (flow->head) { skb = dequeue_head(flow); q->backlogs[flow - q->flows] -= qdisc_pkt_len(skb); + q->memory_usage -= skb->truesize; sch->q.qlen--; + sch->qstats.backlog -= qdisc_pkt_len(skb); } return skb; } +static void drop_func(struct sk_buff *skb, void *ctx) +{ + struct Qdisc *sch = ctx; + + qdisc_drop(skb, sch); +} + static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch) { struct fq_codel_sched_data *q = qdisc_priv(sch); @@ -263,8 +316,9 @@ begin: prev_ecn_mark = q->cstats.ecn_mark; prev_backlog = sch->qstats.backlog; - skb = codel_dequeue(sch, &q->cparams, &flow->cvars, &q->cstats, - dequeue); + skb = codel_dequeue(sch, &sch->qstats.backlog, &q->cparams, + &flow->cvars, &q->cstats, qdisc_pkt_len, + codel_get_enqueue_time, drop_func, dequeue_func); flow->dropped += q->cstats.drop_count - prev_drop_count; flow->dropped += q->cstats.ecn_mark - prev_ecn_mark; @@ -313,6 +367,7 @@ static void fq_codel_reset(struct Qdisc *sch) } memset(q->backlogs, 0, q->flows_cnt * sizeof(u32)); sch->q.qlen = 0; + q->memory_usage = 0; } static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = { @@ -323,6 +378,8 @@ static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = { [TCA_FQ_CODEL_FLOWS] = { .type = NLA_U32 }, [TCA_FQ_CODEL_QUANTUM] = { .type = NLA_U32 }, [TCA_FQ_CODEL_CE_THRESHOLD] = { .type = NLA_U32 }, + [TCA_FQ_CODEL_DROP_BATCH_SIZE] = { .type = NLA_U32 }, + [TCA_FQ_CODEL_MEMORY_LIMIT] = { .type = NLA_U32 }, }; static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt) @@ -374,7 +431,14 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt) if (tb[TCA_FQ_CODEL_QUANTUM]) q->quantum = max(256U, nla_get_u32(tb[TCA_FQ_CODEL_QUANTUM])); - while (sch->q.qlen > sch->limit) { + if (tb[TCA_FQ_CODEL_DROP_BATCH_SIZE]) + q->drop_batch_size = min(1U, nla_get_u32(tb[TCA_FQ_CODEL_DROP_BATCH_SIZE])); + + if (tb[TCA_FQ_CODEL_MEMORY_LIMIT]) + q->memory_limit = min(1U << 31, nla_get_u32(tb[TCA_FQ_CODEL_MEMORY_LIMIT])); + + while (sch->q.qlen > sch->limit || + q->memory_usage > q->memory_limit) { struct sk_buff *skb = fq_codel_dequeue(sch); q->cstats.drop_len += qdisc_pkt_len(skb); @@ -419,13 +483,16 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt) sch->limit = 10*1024; q->flows_cnt = 1024; + q->memory_limit = 32 << 20; /* 32 MBytes */ + q->drop_batch_size = 64; q->quantum = psched_mtu(qdisc_dev(sch)); q->perturbation = prandom_u32(); INIT_LIST_HEAD(&q->new_flows); INIT_LIST_HEAD(&q->old_flows); - codel_params_init(&q->cparams, sch); + codel_params_init(&q->cparams); codel_stats_init(&q->cstats); q->cparams.ecn = true; + q->cparams.mtu = psched_mtu(qdisc_dev(sch)); if (opt) { int err = fq_codel_change(sch, opt); @@ -476,6 +543,10 @@ static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb) q->cparams.ecn) || nla_put_u32(skb, TCA_FQ_CODEL_QUANTUM, q->quantum) || + nla_put_u32(skb, TCA_FQ_CODEL_DROP_BATCH_SIZE, + q->drop_batch_size) || + nla_put_u32(skb, TCA_FQ_CODEL_MEMORY_LIMIT, + q->memory_limit) || nla_put_u32(skb, TCA_FQ_CODEL_FLOWS, q->flows_cnt)) goto nla_put_failure; @@ -504,6 +575,8 @@ static int fq_codel_dump_stats(struct Qdisc *sch, struct gnet_dump *d) st.qdisc_stats.ecn_mark = q->cstats.ecn_mark; st.qdisc_stats.new_flow_count = q->new_flow_count; st.qdisc_stats.ce_mark = q->cstats.ce_mark; + st.qdisc_stats.memory_usage = q->memory_usage; + st.qdisc_stats.drop_overmemory = q->drop_overmemory; list_for_each(pos, &q->new_flows) st.qdisc_stats.new_flows_len++; @@ -588,7 +661,7 @@ static int fq_codel_dump_class_stats(struct Qdisc *sch, unsigned long cl, qs.backlog = q->backlogs[idx]; qs.drops = flow->dropped; } - if (gnet_stats_copy_queue(d, NULL, &qs, 0) < 0) + if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0) return -1; if (idx < q->flows_cnt) return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 80742edea..f9e0e9c03 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -49,6 +49,7 @@ static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) { q->gso_skb = skb; q->qstats.requeues++; + qdisc_qstats_backlog_inc(q, skb); q->q.qlen++; /* it's still part of the queue */ __netif_schedule(q); @@ -92,6 +93,7 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate, txq = skb_get_tx_queue(txq->dev, skb); if (!netif_xmit_frozen_or_stopped(txq)) { q->gso_skb = NULL; + qdisc_qstats_backlog_dec(q, skb); q->q.qlen--; } else skb = NULL; @@ -108,35 +110,6 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate, return skb; } -static inline int handle_dev_cpu_collision(struct sk_buff *skb, - struct netdev_queue *dev_queue, - struct Qdisc *q) -{ - int ret; - - if (unlikely(dev_queue->xmit_lock_owner == smp_processor_id())) { - /* - * Same CPU holding the lock. It may be a transient - * configuration error, when hard_start_xmit() recurses. We - * detect it by checking xmit owner and drop the packet when - * deadloop is detected. Return OK to try the next skb. - */ - kfree_skb_list(skb); - net_warn_ratelimited("Dead loop on netdevice %s, fix it urgently!\n", - dev_queue->dev->name); - ret = qdisc_qlen(q); - } else { - /* - * Another cpu is holding lock, requeue & delay xmits for - * some time. - */ - __this_cpu_inc(softnet_data.cpu_collision); - ret = dev_requeue_skb(skb, q); - } - - return ret; -} - /* * Transmit possibly several skbs, and handle the return status as * required. Holding the __QDISC___STATE_RUNNING bit guarantees that @@ -174,9 +147,6 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, if (dev_xmit_complete(ret)) { /* Driver sent out skb successfully or skb was consumed */ ret = qdisc_qlen(q); - } else if (ret == NETDEV_TX_LOCKED) { - /* Driver try lock failed */ - ret = handle_dev_cpu_collision(skb, txq, q); } else { /* Driver returned NETDEV_TX_BUSY - requeue skb */ if (unlikely(ret != NETDEV_TX_BUSY)) @@ -259,13 +229,12 @@ unsigned long dev_trans_start(struct net_device *dev) if (is_vlan_dev(dev)) dev = vlan_dev_real_dev(dev); - res = dev->trans_start; - for (i = 0; i < dev->num_tx_queues; i++) { + res = netdev_get_tx_queue(dev, 0)->trans_start; + for (i = 1; i < dev->num_tx_queues; i++) { val = netdev_get_tx_queue(dev, i)->trans_start; if (val && time_after(val, res)) res = val; } - dev->trans_start = res; return res; } @@ -288,10 +257,7 @@ static void dev_watchdog(unsigned long arg) struct netdev_queue *txq; txq = netdev_get_tx_queue(dev, i); - /* - * old device drivers set dev->trans_start - */ - trans_start = txq->trans_start ? : dev->trans_start; + trans_start = txq->trans_start; if (netif_xmit_stopped(txq) && time_after(jiffies, (trans_start + dev->watchdog_timeo))) { @@ -807,7 +773,7 @@ void dev_activate(struct net_device *dev) transition_one_qdisc(dev, dev_ingress_queue(dev), NULL); if (need_watchdog) { - dev->trans_start = jiffies; + netif_trans_update(dev); dev_watchdog_up(dev); } } diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index d783d7cc3..1ac9f9f03 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1529,6 +1529,7 @@ hfsc_reset_qdisc(struct Qdisc *sch) q->eligible = RB_ROOT; INIT_LIST_HEAD(&q->droplist); qdisc_watchdog_cancel(&q->watchdog); + sch->qstats.backlog = 0; sch->q.qlen = 0; } @@ -1559,14 +1560,6 @@ hfsc_dump_qdisc(struct Qdisc *sch, struct sk_buff *skb) struct hfsc_sched *q = qdisc_priv(sch); unsigned char *b = skb_tail_pointer(skb); struct tc_hfsc_qopt qopt; - struct hfsc_class *cl; - unsigned int i; - - sch->qstats.backlog = 0; - for (i = 0; i < q->clhash.hashsize; i++) { - hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode) - sch->qstats.backlog += cl->qdisc->qstats.backlog; - } qopt.defcls = q->defcls; if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt)) @@ -1604,6 +1597,7 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (cl->qdisc->q.qlen == 1) set_active(cl, qdisc_pkt_len(skb)); + qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; return NET_XMIT_SUCCESS; @@ -1672,6 +1666,7 @@ hfsc_dequeue(struct Qdisc *sch) qdisc_unthrottled(sch); qdisc_bstats_update(sch, skb); + qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; return skb; @@ -1695,6 +1690,7 @@ hfsc_drop(struct Qdisc *sch) } cl->qstats.drops++; qdisc_qstats_drop(sch); + sch->qstats.backlog -= len; sch->q.qlen--; return len; } diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 87b02ed3d..052f84d6c 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -928,17 +928,10 @@ ok: } } qdisc_qstats_overlimit(sch); - if (likely(next_event > q->now)) { - if (!test_bit(__QDISC_STATE_DEACTIVATED, - &qdisc_root_sleeping(q->watchdog.qdisc)->state)) { - ktime_t time = ns_to_ktime(next_event); - qdisc_throttled(q->watchdog.qdisc); - hrtimer_start(&q->watchdog.timer, time, - HRTIMER_MODE_ABS_PINNED); - } - } else { + if (likely(next_event > q->now)) + qdisc_watchdog_schedule_ns(&q->watchdog, next_event, true); + else schedule_work(&q->work); - } fin: return skb; } @@ -1014,7 +1007,9 @@ static void htb_work_func(struct work_struct *work) struct htb_sched *q = container_of(work, struct htb_sched, work); struct Qdisc *sch = q->watchdog.qdisc; + rcu_read_lock(); __netif_schedule(qdisc_root(sch)); + rcu_read_unlock(); } static int htb_init(struct Qdisc *sch, struct nlattr *opt) @@ -1122,10 +1117,12 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg, if (nla_put(skb, TCA_HTB_PARMS, sizeof(opt), &opt)) goto nla_put_failure; if ((cl->rate.rate_bytes_ps >= (1ULL << 32)) && - nla_put_u64(skb, TCA_HTB_RATE64, cl->rate.rate_bytes_ps)) + nla_put_u64_64bit(skb, TCA_HTB_RATE64, cl->rate.rate_bytes_ps, + TCA_HTB_PAD)) goto nla_put_failure; if ((cl->ceil.rate_bytes_ps >= (1ULL << 32)) && - nla_put_u64(skb, TCA_HTB_CEIL64, cl->ceil.rate_bytes_ps)) + nla_put_u64_64bit(skb, TCA_HTB_CEIL64, cl->ceil.rate_bytes_ps, + TCA_HTB_PAD)) goto nla_put_failure; return nla_nest_end(skb, nest); @@ -1143,8 +1140,10 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) if (!cl->level && cl->un.leaf.q) qlen = cl->un.leaf.q->q.qlen; - cl->xstats.tokens = PSCHED_NS2TICKS(cl->tokens); - cl->xstats.ctokens = PSCHED_NS2TICKS(cl->ctokens); + cl->xstats.tokens = clamp_t(s64, PSCHED_NS2TICKS(cl->tokens), + INT_MIN, INT_MAX); + cl->xstats.ctokens = clamp_t(s64, PSCHED_NS2TICKS(cl->ctokens), + INT_MIN, INT_MAX); if (gnet_stats_copy_basic(d, NULL, &cl->bstats) < 0 || gnet_stats_copy_rate_est(d, NULL, &cl->rate_est) < 0 || diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index 10adbc617..8fe6999b6 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -27,6 +27,11 @@ static unsigned long ingress_get(struct Qdisc *sch, u32 classid) return TC_H_MIN(classid) + 1; } +static bool ingress_cl_offload(u32 classid) +{ + return true; +} + static unsigned long ingress_bind_filter(struct Qdisc *sch, unsigned long parent, u32 classid) { @@ -86,6 +91,7 @@ static const struct Qdisc_class_ops ingress_class_ops = { .put = ingress_put, .walk = ingress_walk, .tcf_chain = ingress_find_tcf, + .tcf_cl_offload = ingress_cl_offload, .bind_tcf = ingress_bind_filter, .unbind_tcf = ingress_put, }; @@ -110,6 +116,11 @@ static unsigned long clsact_get(struct Qdisc *sch, u32 classid) } } +static bool clsact_cl_offload(u32 classid) +{ + return TC_H_MIN(classid) == TC_H_MIN(TC_H_MIN_INGRESS); +} + static unsigned long clsact_bind_filter(struct Qdisc *sch, unsigned long parent, u32 classid) { @@ -158,6 +169,7 @@ static const struct Qdisc_class_ops clsact_class_ops = { .put = ingress_put, .walk = ingress_walk, .tcf_chain = clsact_find_tcf, + .tcf_cl_offload = clsact_cl_offload, .bind_tcf = clsact_bind_filter, .unbind_tcf = ingress_put, }; diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index b7c29d5b6..178f1630a 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -1051,7 +1051,8 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) goto nla_put_failure; if (q->rate >= (1ULL << 32)) { - if (nla_put_u64(skb, TCA_NETEM_RATE64, q->rate)) + if (nla_put_u64_64bit(skb, TCA_NETEM_RATE64, q->rate, + TCA_NETEM_PAD)) goto nla_put_failure; rate.rate = ~0U; } else { diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index fee1b1550..a356450b7 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -85,6 +85,7 @@ prio_enqueue(struct sk_buff *skb, struct Qdisc *sch) ret = qdisc_enqueue(skb, qdisc); if (ret == NET_XMIT_SUCCESS) { + qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; return NET_XMIT_SUCCESS; } @@ -117,6 +118,7 @@ static struct sk_buff *prio_dequeue(struct Qdisc *sch) struct sk_buff *skb = qdisc_dequeue_peeked(qdisc); if (skb) { qdisc_bstats_update(sch, skb); + qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; return skb; } @@ -135,6 +137,7 @@ static unsigned int prio_drop(struct Qdisc *sch) for (prio = q->bands-1; prio >= 0; prio--) { qdisc = q->queues[prio]; if (qdisc->ops->drop && (len = qdisc->ops->drop(qdisc)) != 0) { + sch->qstats.backlog -= len; sch->q.qlen--; return len; } @@ -151,6 +154,7 @@ prio_reset(struct Qdisc *sch) for (prio = 0; prio < q->bands; prio++) qdisc_reset(q->queues[prio]); + sch->qstats.backlog = 0; sch->q.qlen = 0; } @@ -168,8 +172,9 @@ prio_destroy(struct Qdisc *sch) static int prio_tune(struct Qdisc *sch, struct nlattr *opt) { struct prio_sched_data *q = qdisc_priv(sch); + struct Qdisc *queues[TCQ_PRIO_BANDS]; + int oldbands = q->bands, i; struct tc_prio_qopt *qopt; - int i; if (nla_len(opt) < sizeof(*qopt)) return -EINVAL; @@ -183,62 +188,42 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt) return -EINVAL; } + /* Before commit, make sure we can allocate all new qdiscs */ + for (i = oldbands; i < qopt->bands; i++) { + queues[i] = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + TC_H_MAKE(sch->handle, i + 1)); + if (!queues[i]) { + while (i > oldbands) + qdisc_destroy(queues[--i]); + return -ENOMEM; + } + } + sch_tree_lock(sch); q->bands = qopt->bands; memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1); - for (i = q->bands; i < TCQ_PRIO_BANDS; i++) { + for (i = q->bands; i < oldbands; i++) { struct Qdisc *child = q->queues[i]; - q->queues[i] = &noop_qdisc; - if (child != &noop_qdisc) { - qdisc_tree_reduce_backlog(child, child->q.qlen, child->qstats.backlog); - qdisc_destroy(child); - } - } - sch_tree_unlock(sch); - for (i = 0; i < q->bands; i++) { - if (q->queues[i] == &noop_qdisc) { - struct Qdisc *child, *old; - - child = qdisc_create_dflt(sch->dev_queue, - &pfifo_qdisc_ops, - TC_H_MAKE(sch->handle, i + 1)); - if (child) { - sch_tree_lock(sch); - old = q->queues[i]; - q->queues[i] = child; - - if (old != &noop_qdisc) { - qdisc_tree_reduce_backlog(old, - old->q.qlen, - old->qstats.backlog); - qdisc_destroy(old); - } - sch_tree_unlock(sch); - } - } + qdisc_tree_reduce_backlog(child, child->q.qlen, + child->qstats.backlog); + qdisc_destroy(child); } + + for (i = oldbands; i < q->bands; i++) + q->queues[i] = queues[i]; + + sch_tree_unlock(sch); return 0; } static int prio_init(struct Qdisc *sch, struct nlattr *opt) { - struct prio_sched_data *q = qdisc_priv(sch); - int i; - - for (i = 0; i < TCQ_PRIO_BANDS; i++) - q->queues[i] = &noop_qdisc; - - if (opt == NULL) { + if (!opt) return -EINVAL; - } else { - int err; - if ((err = prio_tune(sch, opt)) != 0) - return err; - } - return 0; + return prio_tune(sch, opt); } static int prio_dump(struct Qdisc *sch, struct sk_buff *skb) diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 8d2d8d953..f18857feb 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -1235,8 +1235,10 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) cl->agg->lmax, qdisc_pkt_len(skb), cl->common.classid); err = qfq_change_agg(sch, cl, cl->agg->class_weight, qdisc_pkt_len(skb)); - if (err) - return err; + if (err) { + cl->qstats.drops++; + return qdisc_drop(skb, sch); + } } err = qdisc_enqueue(skb, cl->qdisc); diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 8c0508c0e..91578bdd3 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -97,6 +97,7 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch) ret = qdisc_enqueue(skb, child); if (likely(ret == NET_XMIT_SUCCESS)) { + qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; } else if (net_xmit_drop_count(ret)) { q->stats.pdrop++; @@ -118,6 +119,7 @@ static struct sk_buff *red_dequeue(struct Qdisc *sch) skb = child->dequeue(child); if (skb) { qdisc_bstats_update(sch, skb); + qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; } else { if (!red_is_idling(&q->vars)) @@ -143,6 +145,7 @@ static unsigned int red_drop(struct Qdisc *sch) if (child->ops->drop && (len = child->ops->drop(child)) > 0) { q->stats.other++; qdisc_qstats_drop(sch); + sch->qstats.backlog -= len; sch->q.qlen--; return len; } @@ -158,6 +161,7 @@ static void red_reset(struct Qdisc *sch) struct red_sched_data *q = qdisc_priv(sch); qdisc_reset(q->qdisc); + sch->qstats.backlog = 0; sch->q.qlen = 0; red_restart(&q->vars); } diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index c2fbde742..3161e4919 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -207,6 +207,7 @@ static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch) return ret; } + qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; return NET_XMIT_SUCCESS; } @@ -217,6 +218,7 @@ static unsigned int tbf_drop(struct Qdisc *sch) unsigned int len = 0; if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) { + sch->qstats.backlog -= len; sch->q.qlen--; qdisc_qstats_drop(sch); } @@ -263,6 +265,7 @@ static struct sk_buff *tbf_dequeue(struct Qdisc *sch) q->t_c = now; q->tokens = toks; q->ptokens = ptoks; + qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; qdisc_unthrottled(sch); qdisc_bstats_update(sch, skb); @@ -294,6 +297,7 @@ static void tbf_reset(struct Qdisc *sch) struct tbf_sched_data *q = qdisc_priv(sch); qdisc_reset(q->qdisc); + sch->qstats.backlog = 0; sch->q.qlen = 0; q->t_c = ktime_get_ns(); q->tokens = q->buffer; @@ -472,11 +476,13 @@ static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb) if (nla_put(skb, TCA_TBF_PARMS, sizeof(opt), &opt)) goto nla_put_failure; if (q->rate.rate_bytes_ps >= (1ULL << 32) && - nla_put_u64(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps)) + nla_put_u64_64bit(skb, TCA_TBF_RATE64, q->rate.rate_bytes_ps, + TCA_TBF_PAD)) goto nla_put_failure; if (tbf_peak_present(q) && q->peak.rate_bytes_ps >= (1ULL << 32) && - nla_put_u64(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps)) + nla_put_u64_64bit(skb, TCA_TBF_PRATE64, q->peak.rate_bytes_ps, + TCA_TBF_PAD)) goto nla_put_failure; return nla_nest_end(skb, nest); diff --git a/net/sctp/Kconfig b/net/sctp/Kconfig index 71c1a598d..d9c04dc1b 100644 --- a/net/sctp/Kconfig +++ b/net/sctp/Kconfig @@ -99,5 +99,9 @@ config SCTP_COOKIE_HMAC_SHA1 select CRYPTO_HMAC if SCTP_COOKIE_HMAC_SHA1 select CRYPTO_SHA1 if SCTP_COOKIE_HMAC_SHA1 +config INET_SCTP_DIAG + depends on INET_DIAG + def_tristate INET_DIAG + endif # IP_SCTP diff --git a/net/sctp/Makefile b/net/sctp/Makefile index 3b4ffb021..0fca5824a 100644 --- a/net/sctp/Makefile +++ b/net/sctp/Makefile @@ -4,6 +4,7 @@ obj-$(CONFIG_IP_SCTP) += sctp.o obj-$(CONFIG_NET_SCTPPROBE) += sctp_probe.o +obj-$(CONFIG_INET_SCTP_DIAG) += sctp_diag.o sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \ protocol.o endpointola.o associola.o \ diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index 958ef5f33..1eb94bf18 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -239,7 +239,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, offset = 0; if ((whole > 1) || (whole && over)) - SCTP_INC_STATS_USER(sock_net(asoc->base.sk), SCTP_MIB_FRAGUSRMSGS); + SCTP_INC_STATS(sock_net(asoc->base.sk), SCTP_MIB_FRAGUSRMSGS); /* Create chunks for all the full sized DATA chunks. */ for (i = 0, len = first_len; i < whole; i++) { diff --git a/net/sctp/input.c b/net/sctp/input.c index 00b844536..f09332256 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -84,7 +84,7 @@ static inline int sctp_rcv_checksum(struct net *net, struct sk_buff *skb) if (val != cmp) { /* CRC failure, dump it. */ - SCTP_INC_STATS_BH(net, SCTP_MIB_CHECKSUMERRORS); + __SCTP_INC_STATS(net, SCTP_MIB_CHECKSUMERRORS); return -1; } return 0; @@ -112,7 +112,6 @@ int sctp_rcv(struct sk_buff *skb) struct sctp_ep_common *rcvr; struct sctp_transport *transport = NULL; struct sctp_chunk *chunk; - struct sctphdr *sh; union sctp_addr src; union sctp_addr dest; int family; @@ -122,13 +121,11 @@ int sctp_rcv(struct sk_buff *skb) if (skb->pkt_type != PACKET_HOST) goto discard_it; - SCTP_INC_STATS_BH(net, SCTP_MIB_INSCTPPACKS); + __SCTP_INC_STATS(net, SCTP_MIB_INSCTPPACKS); if (skb_linearize(skb)) goto discard_it; - sh = sctp_hdr(skb); - /* Pull up the IP and SCTP headers. */ __skb_pull(skb, skb_transport_offset(skb)); if (skb->len < sizeof(struct sctphdr)) @@ -208,7 +205,7 @@ int sctp_rcv(struct sk_buff *skb) */ if (!asoc) { if (sctp_rcv_ootb(skb)) { - SCTP_INC_STATS_BH(net, SCTP_MIB_OUTOFBLUES); + __SCTP_INC_STATS(net, SCTP_MIB_OUTOFBLUES); goto discard_release; } } @@ -230,7 +227,7 @@ int sctp_rcv(struct sk_buff *skb) chunk->rcvr = rcvr; /* Remember the SCTP header. */ - chunk->sctp_hdr = sh; + chunk->sctp_hdr = sctp_hdr(skb); /* Set the source and destination addresses of the incoming chunk. */ sctp_init_addrs(chunk, &src, &dest); @@ -264,9 +261,9 @@ int sctp_rcv(struct sk_buff *skb) skb = NULL; /* sctp_chunk_free already freed the skb */ goto discard_release; } - SCTP_INC_STATS_BH(net, SCTP_MIB_IN_PKT_BACKLOG); + __SCTP_INC_STATS(net, SCTP_MIB_IN_PKT_BACKLOG); } else { - SCTP_INC_STATS_BH(net, SCTP_MIB_IN_PKT_SOFTIRQ); + __SCTP_INC_STATS(net, SCTP_MIB_IN_PKT_SOFTIRQ); sctp_inq_push(&chunk->rcvr->inqueue, chunk); } @@ -281,7 +278,7 @@ int sctp_rcv(struct sk_buff *skb) return 0; discard_it: - SCTP_INC_STATS_BH(net, SCTP_MIB_IN_PKT_DISCARDS); + __SCTP_INC_STATS(net, SCTP_MIB_IN_PKT_DISCARDS); kfree_skb(skb); return 0; @@ -331,6 +328,7 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb) */ sk = rcvr->sk; + local_bh_disable(); bh_lock_sock(sk); if (sock_owned_by_user(sk)) { @@ -342,6 +340,7 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb) sctp_inq_push(inqueue, chunk); bh_unlock_sock(sk); + local_bh_enable(); /* If the chunk was backloged again, don't drop refs */ if (backloged) @@ -532,7 +531,7 @@ struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *skb, * servers this needs to be solved differently. */ if (sock_owned_by_user(sk)) - NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); + __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); *app = asoc; *tpp = transport; @@ -589,7 +588,7 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info) skb->network_header = saveip; skb->transport_header = savesctp; if (!sk) { - ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); + __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return; } /* Warning: The sock lock is held. Remember to call diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c index 7e8a16c77..b335ffcef 100644 --- a/net/sctp/inqueue.c +++ b/net/sctp/inqueue.c @@ -163,6 +163,9 @@ struct sctp_chunk *sctp_inq_pop(struct sctp_inq *queue) chunk->singleton = 1; ch = (sctp_chunkhdr_t *) chunk->skb->data; chunk->data_accepted = 0; + + if (chunk->asoc) + sock_rps_save_rxhash(chunk->asoc->base.sk, chunk->skb); } chunk->chunk_hdr = ch; diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index ce46f1c7f..0657d18a8 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -162,7 +162,7 @@ static void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, skb->network_header = saveip; skb->transport_header = savesctp; if (!sk) { - ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_INERRORS); + __ICMP6_INC_STATS(net, idev, ICMP6_MIB_INERRORS); goto out; } diff --git a/net/sctp/proc.c b/net/sctp/proc.c index 5cfac8d5d..4cb5aedfe 100644 --- a/net/sctp/proc.c +++ b/net/sctp/proc.c @@ -280,82 +280,38 @@ void sctp_eps_proc_exit(struct net *net) struct sctp_ht_iter { struct seq_net_private p; struct rhashtable_iter hti; + int start_fail; }; -static struct sctp_transport *sctp_transport_get_next(struct seq_file *seq) +static void *sctp_transport_seq_start(struct seq_file *seq, loff_t *pos) { struct sctp_ht_iter *iter = seq->private; - struct sctp_transport *t; - - t = rhashtable_walk_next(&iter->hti); - for (; t; t = rhashtable_walk_next(&iter->hti)) { - if (IS_ERR(t)) { - if (PTR_ERR(t) == -EAGAIN) - continue; - break; - } + int err = sctp_transport_walk_start(&iter->hti); - if (net_eq(sock_net(t->asoc->base.sk), seq_file_net(seq)) && - t->asoc->peer.primary_path == t) - break; + if (err) { + iter->start_fail = 1; + return ERR_PTR(err); } - return t; + return sctp_transport_get_idx(seq_file_net(seq), &iter->hti, *pos); } -static struct sctp_transport *sctp_transport_get_idx(struct seq_file *seq, - loff_t pos) -{ - void *obj = SEQ_START_TOKEN; - - while (pos && (obj = sctp_transport_get_next(seq)) && !IS_ERR(obj)) - pos--; - - return obj; -} - -static int sctp_transport_walk_start(struct seq_file *seq) +static void sctp_transport_seq_stop(struct seq_file *seq, void *v) { struct sctp_ht_iter *iter = seq->private; - int err; - - err = rhashtable_walk_init(&sctp_transport_hashtable, &iter->hti); - if (err) - return err; - - err = rhashtable_walk_start(&iter->hti); - return err == -EAGAIN ? 0 : err; + if (iter->start_fail) + return; + sctp_transport_walk_stop(&iter->hti); } -static void sctp_transport_walk_stop(struct seq_file *seq) +static void *sctp_transport_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct sctp_ht_iter *iter = seq->private; - rhashtable_walk_stop(&iter->hti); - rhashtable_walk_exit(&iter->hti); -} - -static void *sctp_assocs_seq_start(struct seq_file *seq, loff_t *pos) -{ - int err = sctp_transport_walk_start(seq); - - if (err) - return ERR_PTR(err); - - return sctp_transport_get_idx(seq, *pos); -} - -static void sctp_assocs_seq_stop(struct seq_file *seq, void *v) -{ - sctp_transport_walk_stop(seq); -} - -static void *sctp_assocs_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ ++*pos; - return sctp_transport_get_next(seq); + return sctp_transport_get_next(seq_file_net(seq), &iter->hti); } /* Display sctp associations (/proc/net/sctp/assocs). */ @@ -416,9 +372,9 @@ static int sctp_assocs_seq_show(struct seq_file *seq, void *v) } static const struct seq_operations sctp_assoc_ops = { - .start = sctp_assocs_seq_start, - .next = sctp_assocs_seq_next, - .stop = sctp_assocs_seq_stop, + .start = sctp_transport_seq_start, + .next = sctp_transport_seq_next, + .stop = sctp_transport_seq_stop, .show = sctp_assocs_seq_show, }; @@ -455,28 +411,6 @@ void sctp_assocs_proc_exit(struct net *net) remove_proc_entry("assocs", net->sctp.proc_net_sctp); } -static void *sctp_remaddr_seq_start(struct seq_file *seq, loff_t *pos) -{ - int err = sctp_transport_walk_start(seq); - - if (err) - return ERR_PTR(err); - - return sctp_transport_get_idx(seq, *pos); -} - -static void *sctp_remaddr_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - ++*pos; - - return sctp_transport_get_next(seq); -} - -static void sctp_remaddr_seq_stop(struct seq_file *seq, void *v) -{ - sctp_transport_walk_stop(seq); -} - static int sctp_remaddr_seq_show(struct seq_file *seq, void *v) { struct sctp_association *assoc; @@ -550,9 +484,9 @@ static int sctp_remaddr_seq_show(struct seq_file *seq, void *v) } static const struct seq_operations sctp_remaddr_ops = { - .start = sctp_remaddr_seq_start, - .next = sctp_remaddr_seq_next, - .stop = sctp_remaddr_seq_stop, + .start = sctp_transport_seq_start, + .next = sctp_transport_seq_next, + .stop = sctp_transport_seq_stop, .show = sctp_remaddr_seq_show, }; diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c new file mode 100644 index 000000000..f69edcf21 --- /dev/null +++ b/net/sctp/sctp_diag.c @@ -0,0 +1,497 @@ +#include <linux/module.h> +#include <linux/inet_diag.h> +#include <linux/sock_diag.h> +#include <net/sctp/sctp.h> + +static void sctp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, + void *info); + +/* define some functions to make asoc/ep fill look clean */ +static void inet_diag_msg_sctpasoc_fill(struct inet_diag_msg *r, + struct sock *sk, + struct sctp_association *asoc) +{ + union sctp_addr laddr, paddr; + struct dst_entry *dst; + + laddr = list_entry(asoc->base.bind_addr.address_list.next, + struct sctp_sockaddr_entry, list)->a; + paddr = asoc->peer.primary_path->ipaddr; + dst = asoc->peer.primary_path->dst; + + r->idiag_family = sk->sk_family; + r->id.idiag_sport = htons(asoc->base.bind_addr.port); + r->id.idiag_dport = htons(asoc->peer.port); + r->id.idiag_if = dst ? dst->dev->ifindex : 0; + sock_diag_save_cookie(sk, r->id.idiag_cookie); + +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) { + *(struct in6_addr *)r->id.idiag_src = laddr.v6.sin6_addr; + *(struct in6_addr *)r->id.idiag_dst = paddr.v6.sin6_addr; + } else +#endif + { + memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src)); + memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst)); + + r->id.idiag_src[0] = laddr.v4.sin_addr.s_addr; + r->id.idiag_dst[0] = paddr.v4.sin_addr.s_addr; + } + + r->idiag_state = asoc->state; + r->idiag_timer = SCTP_EVENT_TIMEOUT_T3_RTX; + r->idiag_retrans = asoc->rtx_data_chunks; + r->idiag_expires = jiffies_to_msecs( + asoc->timeouts[SCTP_EVENT_TIMEOUT_T3_RTX] - jiffies); +} + +static int inet_diag_msg_sctpladdrs_fill(struct sk_buff *skb, + struct list_head *address_list) +{ + struct sctp_sockaddr_entry *laddr; + int addrlen = sizeof(struct sockaddr_storage); + int addrcnt = 0; + struct nlattr *attr; + void *info = NULL; + + list_for_each_entry_rcu(laddr, address_list, list) + addrcnt++; + + attr = nla_reserve(skb, INET_DIAG_LOCALS, addrlen * addrcnt); + if (!attr) + return -EMSGSIZE; + + info = nla_data(attr); + list_for_each_entry_rcu(laddr, address_list, list) { + memcpy(info, &laddr->a, addrlen); + info += addrlen; + } + + return 0; +} + +static int inet_diag_msg_sctpaddrs_fill(struct sk_buff *skb, + struct sctp_association *asoc) +{ + int addrlen = sizeof(struct sockaddr_storage); + struct sctp_transport *from; + struct nlattr *attr; + void *info = NULL; + + attr = nla_reserve(skb, INET_DIAG_PEERS, + addrlen * asoc->peer.transport_count); + if (!attr) + return -EMSGSIZE; + + info = nla_data(attr); + list_for_each_entry(from, &asoc->peer.transport_addr_list, + transports) { + memcpy(info, &from->ipaddr, addrlen); + info += addrlen; + } + + return 0; +} + +/* sctp asoc/ep fill*/ +static int inet_sctp_diag_fill(struct sock *sk, struct sctp_association *asoc, + struct sk_buff *skb, + const struct inet_diag_req_v2 *req, + struct user_namespace *user_ns, + int portid, u32 seq, u16 nlmsg_flags, + const struct nlmsghdr *unlh) +{ + struct sctp_endpoint *ep = sctp_sk(sk)->ep; + struct list_head *addr_list; + struct inet_diag_msg *r; + struct nlmsghdr *nlh; + int ext = req->idiag_ext; + struct sctp_infox infox; + void *info = NULL; + + nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r), + nlmsg_flags); + if (!nlh) + return -EMSGSIZE; + + r = nlmsg_data(nlh); + BUG_ON(!sk_fullsock(sk)); + + if (asoc) { + inet_diag_msg_sctpasoc_fill(r, sk, asoc); + } else { + inet_diag_msg_common_fill(r, sk); + r->idiag_state = sk->sk_state; + r->idiag_timer = 0; + r->idiag_retrans = 0; + } + + if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns)) + goto errout; + + if (ext & (1 << (INET_DIAG_SKMEMINFO - 1))) { + u32 mem[SK_MEMINFO_VARS]; + int amt; + + if (asoc && asoc->ep->sndbuf_policy) + amt = asoc->sndbuf_used; + else + amt = sk_wmem_alloc_get(sk); + mem[SK_MEMINFO_WMEM_ALLOC] = amt; + if (asoc && asoc->ep->rcvbuf_policy) + amt = atomic_read(&asoc->rmem_alloc); + else + amt = sk_rmem_alloc_get(sk); + mem[SK_MEMINFO_RMEM_ALLOC] = amt; + mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf; + mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf; + mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc; + mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued; + mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); + mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len; + mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); + + if (nla_put(skb, INET_DIAG_SKMEMINFO, sizeof(mem), &mem) < 0) + goto errout; + } + + if (ext & (1 << (INET_DIAG_INFO - 1))) { + struct nlattr *attr; + + attr = nla_reserve_64bit(skb, INET_DIAG_INFO, + sizeof(struct sctp_info), + INET_DIAG_PAD); + if (!attr) + goto errout; + + info = nla_data(attr); + } + infox.sctpinfo = (struct sctp_info *)info; + infox.asoc = asoc; + sctp_diag_get_info(sk, r, &infox); + + addr_list = asoc ? &asoc->base.bind_addr.address_list + : &ep->base.bind_addr.address_list; + if (inet_diag_msg_sctpladdrs_fill(skb, addr_list)) + goto errout; + + if (asoc && (ext & (1 << (INET_DIAG_CONG - 1)))) + if (nla_put_string(skb, INET_DIAG_CONG, "reno") < 0) + goto errout; + + if (asoc && inet_diag_msg_sctpaddrs_fill(skb, asoc)) + goto errout; + + nlmsg_end(skb, nlh); + return 0; + +errout: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +/* callback and param */ +struct sctp_comm_param { + struct sk_buff *skb; + struct netlink_callback *cb; + const struct inet_diag_req_v2 *r; + const struct nlmsghdr *nlh; +}; + +static size_t inet_assoc_attr_size(struct sctp_association *asoc) +{ + int addrlen = sizeof(struct sockaddr_storage); + int addrcnt = 0; + struct sctp_sockaddr_entry *laddr; + + list_for_each_entry_rcu(laddr, &asoc->base.bind_addr.address_list, + list) + addrcnt++; + + return nla_total_size(sizeof(struct sctp_info)) + + nla_total_size(1) /* INET_DIAG_SHUTDOWN */ + + nla_total_size(1) /* INET_DIAG_TOS */ + + nla_total_size(1) /* INET_DIAG_TCLASS */ + + nla_total_size(addrlen * asoc->peer.transport_count) + + nla_total_size(addrlen * addrcnt) + + nla_total_size(sizeof(struct inet_diag_meminfo)) + + nla_total_size(sizeof(struct inet_diag_msg)) + + 64; +} + +static int sctp_tsp_dump_one(struct sctp_transport *tsp, void *p) +{ + struct sctp_association *assoc = tsp->asoc; + struct sock *sk = tsp->asoc->base.sk; + struct sctp_comm_param *commp = p; + struct sk_buff *in_skb = commp->skb; + const struct inet_diag_req_v2 *req = commp->r; + const struct nlmsghdr *nlh = commp->nlh; + struct net *net = sock_net(in_skb->sk); + struct sk_buff *rep; + int err; + + err = sock_diag_check_cookie(sk, req->id.idiag_cookie); + if (err) + goto out; + + err = -ENOMEM; + rep = nlmsg_new(inet_assoc_attr_size(assoc), GFP_KERNEL); + if (!rep) + goto out; + + lock_sock(sk); + if (sk != assoc->base.sk) { + release_sock(sk); + sk = assoc->base.sk; + lock_sock(sk); + } + err = inet_sctp_diag_fill(sk, assoc, rep, req, + sk_user_ns(NETLINK_CB(in_skb).sk), + NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq, 0, nlh); + release_sock(sk); + if (err < 0) { + WARN_ON(err == -EMSGSIZE); + kfree_skb(rep); + goto out; + } + + err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid, + MSG_DONTWAIT); + if (err > 0) + err = 0; +out: + return err; +} + +static int sctp_tsp_dump(struct sctp_transport *tsp, void *p) +{ + struct sctp_endpoint *ep = tsp->asoc->ep; + struct sctp_comm_param *commp = p; + struct sock *sk = ep->base.sk; + struct sk_buff *skb = commp->skb; + struct netlink_callback *cb = commp->cb; + const struct inet_diag_req_v2 *r = commp->r; + struct sctp_association *assoc = + list_entry(ep->asocs.next, struct sctp_association, asocs); + int err = 0; + + /* find the ep only once through the transports by this condition */ + if (tsp->asoc != assoc) + goto out; + + if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family) + goto out; + + lock_sock(sk); + if (sk != assoc->base.sk) + goto release; + list_for_each_entry(assoc, &ep->asocs, asocs) { + if (cb->args[4] < cb->args[1]) + goto next; + + if (r->id.idiag_sport != htons(assoc->base.bind_addr.port) && + r->id.idiag_sport) + goto next; + if (r->id.idiag_dport != htons(assoc->peer.port) && + r->id.idiag_dport) + goto next; + + if (!cb->args[3] && + inet_sctp_diag_fill(sk, NULL, skb, r, + sk_user_ns(NETLINK_CB(cb->skb).sk), + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, cb->nlh) < 0) { + cb->args[3] = 1; + err = 2; + goto release; + } + cb->args[3] = 1; + + if (inet_sctp_diag_fill(sk, assoc, skb, r, + sk_user_ns(NETLINK_CB(cb->skb).sk), + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, 0, cb->nlh) < 0) { + err = 2; + goto release; + } +next: + cb->args[4]++; + } + cb->args[1] = 0; + cb->args[2]++; + cb->args[3] = 0; + cb->args[4] = 0; +release: + release_sock(sk); + return err; +out: + cb->args[2]++; + return err; +} + +static int sctp_ep_dump(struct sctp_endpoint *ep, void *p) +{ + struct sctp_comm_param *commp = p; + struct sock *sk = ep->base.sk; + struct sk_buff *skb = commp->skb; + struct netlink_callback *cb = commp->cb; + const struct inet_diag_req_v2 *r = commp->r; + struct net *net = sock_net(skb->sk); + struct inet_sock *inet = inet_sk(sk); + int err = 0; + + if (!net_eq(sock_net(sk), net)) + goto out; + + if (cb->args[4] < cb->args[1]) + goto next; + + if ((r->idiag_states & ~TCPF_LISTEN) && !list_empty(&ep->asocs)) + goto next; + + if (r->sdiag_family != AF_UNSPEC && + sk->sk_family != r->sdiag_family) + goto next; + + if (r->id.idiag_sport != inet->inet_sport && + r->id.idiag_sport) + goto next; + + if (r->id.idiag_dport != inet->inet_dport && + r->id.idiag_dport) + goto next; + + if (inet_sctp_diag_fill(sk, NULL, skb, r, + sk_user_ns(NETLINK_CB(cb->skb).sk), + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + cb->nlh) < 0) { + err = 2; + goto out; + } +next: + cb->args[4]++; +out: + return err; +} + +/* define the functions for sctp_diag_handler*/ +static void sctp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, + void *info) +{ + struct sctp_infox *infox = (struct sctp_infox *)info; + + if (infox->asoc) { + r->idiag_rqueue = atomic_read(&infox->asoc->rmem_alloc); + r->idiag_wqueue = infox->asoc->sndbuf_used; + } else { + r->idiag_rqueue = sk->sk_ack_backlog; + r->idiag_wqueue = sk->sk_max_ack_backlog; + } + if (infox->sctpinfo) + sctp_get_sctp_info(sk, infox->asoc, infox->sctpinfo); +} + +static int sctp_diag_dump_one(struct sk_buff *in_skb, + const struct nlmsghdr *nlh, + const struct inet_diag_req_v2 *req) +{ + struct net *net = sock_net(in_skb->sk); + union sctp_addr laddr, paddr; + struct sctp_comm_param commp = { + .skb = in_skb, + .r = req, + .nlh = nlh, + }; + + if (req->sdiag_family == AF_INET) { + laddr.v4.sin_port = req->id.idiag_sport; + laddr.v4.sin_addr.s_addr = req->id.idiag_src[0]; + laddr.v4.sin_family = AF_INET; + + paddr.v4.sin_port = req->id.idiag_dport; + paddr.v4.sin_addr.s_addr = req->id.idiag_dst[0]; + paddr.v4.sin_family = AF_INET; + } else { + laddr.v6.sin6_port = req->id.idiag_sport; + memcpy(&laddr.v6.sin6_addr, req->id.idiag_src, 64); + laddr.v6.sin6_family = AF_INET6; + + paddr.v6.sin6_port = req->id.idiag_dport; + memcpy(&paddr.v6.sin6_addr, req->id.idiag_dst, 64); + paddr.v6.sin6_family = AF_INET6; + } + + return sctp_transport_lookup_process(sctp_tsp_dump_one, + net, &laddr, &paddr, &commp); +} + +static void sctp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, + const struct inet_diag_req_v2 *r, struct nlattr *bc) +{ + u32 idiag_states = r->idiag_states; + struct net *net = sock_net(skb->sk); + struct sctp_comm_param commp = { + .skb = skb, + .cb = cb, + .r = r, + }; + + /* eps hashtable dumps + * args: + * 0 : if it will traversal listen sock + * 1 : to record the sock pos of this time's traversal + * 4 : to work as a temporary variable to traversal list + */ + if (cb->args[0] == 0) { + if (!(idiag_states & TCPF_LISTEN)) + goto skip; + if (sctp_for_each_endpoint(sctp_ep_dump, &commp)) + goto done; +skip: + cb->args[0] = 1; + cb->args[1] = 0; + cb->args[4] = 0; + } + + /* asocs by transport hashtable dump + * args: + * 1 : to record the assoc pos of this time's traversal + * 2 : to record the transport pos of this time's traversal + * 3 : to mark if we have dumped the ep info of the current asoc + * 4 : to work as a temporary variable to traversal list + */ + if (!(idiag_states & ~TCPF_LISTEN)) + goto done; + sctp_for_each_transport(sctp_tsp_dump, net, cb->args[2], &commp); +done: + cb->args[1] = cb->args[4]; + cb->args[4] = 0; +} + +static const struct inet_diag_handler sctp_diag_handler = { + .dump = sctp_diag_dump, + .dump_one = sctp_diag_dump_one, + .idiag_get_info = sctp_diag_get_info, + .idiag_type = IPPROTO_SCTP, + .idiag_info_size = sizeof(struct sctp_info), +}; + +static int __init sctp_diag_init(void) +{ + return inet_diag_register(&sctp_diag_handler); +} + +static void __exit sctp_diag_exit(void) +{ + inet_diag_unregister(&sctp_diag_handler); +} + +module_init(sctp_diag_init); +module_exit(sctp_diag_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-132); diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 41b081a64..aa3712259 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -1218,6 +1218,8 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, sctp_cmd_seq_t *commands, gfp_t gfp) { + struct sock *sk = ep->base.sk; + struct sctp_sock *sp = sctp_sk(sk); int error = 0; int force; sctp_cmd_t *cmd; @@ -1738,6 +1740,10 @@ out: error = sctp_outq_uncork(&asoc->outqueue, gfp); } else if (local_cork) error = sctp_outq_uncork(&asoc->outqueue, gfp); + + if (sp->data_ready_signalled) + sp->data_ready_signalled = 0; + return error; nomem: error = -ENOMEM; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 878d28eda..7f5689a93 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -4202,6 +4202,224 @@ static void sctp_shutdown(struct sock *sk, int how) } } +int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc, + struct sctp_info *info) +{ + struct sctp_transport *prim; + struct list_head *pos; + int mask; + + memset(info, 0, sizeof(*info)); + if (!asoc) { + struct sctp_sock *sp = sctp_sk(sk); + + info->sctpi_s_autoclose = sp->autoclose; + info->sctpi_s_adaptation_ind = sp->adaptation_ind; + info->sctpi_s_pd_point = sp->pd_point; + info->sctpi_s_nodelay = sp->nodelay; + info->sctpi_s_disable_fragments = sp->disable_fragments; + info->sctpi_s_v4mapped = sp->v4mapped; + info->sctpi_s_frag_interleave = sp->frag_interleave; + info->sctpi_s_type = sp->type; + + return 0; + } + + info->sctpi_tag = asoc->c.my_vtag; + info->sctpi_state = asoc->state; + info->sctpi_rwnd = asoc->a_rwnd; + info->sctpi_unackdata = asoc->unack_data; + info->sctpi_penddata = sctp_tsnmap_pending(&asoc->peer.tsn_map); + info->sctpi_instrms = asoc->c.sinit_max_instreams; + info->sctpi_outstrms = asoc->c.sinit_num_ostreams; + list_for_each(pos, &asoc->base.inqueue.in_chunk_list) + info->sctpi_inqueue++; + list_for_each(pos, &asoc->outqueue.out_chunk_list) + info->sctpi_outqueue++; + info->sctpi_overall_error = asoc->overall_error_count; + info->sctpi_max_burst = asoc->max_burst; + info->sctpi_maxseg = asoc->frag_point; + info->sctpi_peer_rwnd = asoc->peer.rwnd; + info->sctpi_peer_tag = asoc->c.peer_vtag; + + mask = asoc->peer.ecn_capable << 1; + mask = (mask | asoc->peer.ipv4_address) << 1; + mask = (mask | asoc->peer.ipv6_address) << 1; + mask = (mask | asoc->peer.hostname_address) << 1; + mask = (mask | asoc->peer.asconf_capable) << 1; + mask = (mask | asoc->peer.prsctp_capable) << 1; + mask = (mask | asoc->peer.auth_capable); + info->sctpi_peer_capable = mask; + mask = asoc->peer.sack_needed << 1; + mask = (mask | asoc->peer.sack_generation) << 1; + mask = (mask | asoc->peer.zero_window_announced); + info->sctpi_peer_sack = mask; + + info->sctpi_isacks = asoc->stats.isacks; + info->sctpi_osacks = asoc->stats.osacks; + info->sctpi_opackets = asoc->stats.opackets; + info->sctpi_ipackets = asoc->stats.ipackets; + info->sctpi_rtxchunks = asoc->stats.rtxchunks; + info->sctpi_outofseqtsns = asoc->stats.outofseqtsns; + info->sctpi_idupchunks = asoc->stats.idupchunks; + info->sctpi_gapcnt = asoc->stats.gapcnt; + info->sctpi_ouodchunks = asoc->stats.ouodchunks; + info->sctpi_iuodchunks = asoc->stats.iuodchunks; + info->sctpi_oodchunks = asoc->stats.oodchunks; + info->sctpi_iodchunks = asoc->stats.iodchunks; + info->sctpi_octrlchunks = asoc->stats.octrlchunks; + info->sctpi_ictrlchunks = asoc->stats.ictrlchunks; + + prim = asoc->peer.primary_path; + memcpy(&info->sctpi_p_address, &prim->ipaddr, + sizeof(struct sockaddr_storage)); + info->sctpi_p_state = prim->state; + info->sctpi_p_cwnd = prim->cwnd; + info->sctpi_p_srtt = prim->srtt; + info->sctpi_p_rto = jiffies_to_msecs(prim->rto); + info->sctpi_p_hbinterval = prim->hbinterval; + info->sctpi_p_pathmaxrxt = prim->pathmaxrxt; + info->sctpi_p_sackdelay = jiffies_to_msecs(prim->sackdelay); + info->sctpi_p_ssthresh = prim->ssthresh; + info->sctpi_p_partial_bytes_acked = prim->partial_bytes_acked; + info->sctpi_p_flight_size = prim->flight_size; + info->sctpi_p_error = prim->error_count; + + return 0; +} +EXPORT_SYMBOL_GPL(sctp_get_sctp_info); + +/* use callback to avoid exporting the core structure */ +int sctp_transport_walk_start(struct rhashtable_iter *iter) +{ + int err; + + err = rhashtable_walk_init(&sctp_transport_hashtable, iter, + GFP_KERNEL); + if (err) + return err; + + err = rhashtable_walk_start(iter); + if (err && err != -EAGAIN) { + rhashtable_walk_stop(iter); + rhashtable_walk_exit(iter); + return err; + } + + return 0; +} + +void sctp_transport_walk_stop(struct rhashtable_iter *iter) +{ + rhashtable_walk_stop(iter); + rhashtable_walk_exit(iter); +} + +struct sctp_transport *sctp_transport_get_next(struct net *net, + struct rhashtable_iter *iter) +{ + struct sctp_transport *t; + + t = rhashtable_walk_next(iter); + for (; t; t = rhashtable_walk_next(iter)) { + if (IS_ERR(t)) { + if (PTR_ERR(t) == -EAGAIN) + continue; + break; + } + + if (net_eq(sock_net(t->asoc->base.sk), net) && + t->asoc->peer.primary_path == t) + break; + } + + return t; +} + +struct sctp_transport *sctp_transport_get_idx(struct net *net, + struct rhashtable_iter *iter, + int pos) +{ + void *obj = SEQ_START_TOKEN; + + while (pos && (obj = sctp_transport_get_next(net, iter)) && + !IS_ERR(obj)) + pos--; + + return obj; +} + +int sctp_for_each_endpoint(int (*cb)(struct sctp_endpoint *, void *), + void *p) { + int err = 0; + int hash = 0; + struct sctp_ep_common *epb; + struct sctp_hashbucket *head; + + for (head = sctp_ep_hashtable; hash < sctp_ep_hashsize; + hash++, head++) { + read_lock(&head->lock); + sctp_for_each_hentry(epb, &head->chain) { + err = cb(sctp_ep(epb), p); + if (err) + break; + } + read_unlock(&head->lock); + } + + return err; +} +EXPORT_SYMBOL_GPL(sctp_for_each_endpoint); + +int sctp_transport_lookup_process(int (*cb)(struct sctp_transport *, void *), + struct net *net, + const union sctp_addr *laddr, + const union sctp_addr *paddr, void *p) +{ + struct sctp_transport *transport; + int err = 0; + + rcu_read_lock(); + transport = sctp_addrs_lookup_transport(net, laddr, paddr); + if (!transport || !sctp_transport_hold(transport)) + goto out; + err = cb(transport, p); + sctp_transport_put(transport); + +out: + rcu_read_unlock(); + return err; +} +EXPORT_SYMBOL_GPL(sctp_transport_lookup_process); + +int sctp_for_each_transport(int (*cb)(struct sctp_transport *, void *), + struct net *net, int pos, void *p) { + struct rhashtable_iter hti; + void *obj; + int err; + + err = sctp_transport_walk_start(&hti); + if (err) + return err; + + sctp_transport_get_idx(net, &hti, pos); + obj = sctp_transport_get_next(net, &hti); + for (; obj && !IS_ERR(obj); obj = sctp_transport_get_next(net, &hti)) { + struct sctp_transport *transport = obj; + + if (!sctp_transport_hold(transport)) + continue; + err = cb(transport, p); + sctp_transport_put(transport); + if (err) + break; + } + sctp_transport_walk_stop(&hti); + + return err; +} +EXPORT_SYMBOL_GPL(sctp_for_each_transport); + /* 7.2.1 Association Status (SCTP_STATUS) * Applications can retrieve current status information about an @@ -6430,6 +6648,8 @@ unsigned int sctp_poll(struct file *file, struct socket *sock, poll_table *wait) poll_wait(file, sk_sleep(sk), wait); + sock_rps_record_flow(sk); + /* A TCP-style listening socket becomes readable when the accept queue * is not empty. */ @@ -6764,13 +6984,11 @@ struct sk_buff *sctp_skb_recv_datagram(struct sock *sk, int flags, * However, this function was correct in any case. 8) */ if (flags & MSG_PEEK) { - spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); if (skb) atomic_inc(&skb->users); - spin_unlock_bh(&sk->sk_receive_queue.lock); } else { - skb = skb_dequeue(&sk->sk_receive_queue); + skb = __skb_dequeue(&sk->sk_receive_queue); } if (skb) @@ -7186,6 +7404,7 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk, newsk->sk_lingertime = sk->sk_lingertime; newsk->sk_rcvtimeo = sk->sk_rcvtimeo; newsk->sk_sndtimeo = sk->sk_sndtimeo; + newsk->sk_rxhash = sk->sk_rxhash; newinet = inet_sk(newsk); diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index ce469d648..ec166d2bd 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -141,7 +141,8 @@ int sctp_clear_pd(struct sock *sk, struct sctp_association *asoc) */ if (!skb_queue_empty(&sp->pd_lobby)) { struct list_head *list; - sctp_skb_list_tail(&sp->pd_lobby, &sk->sk_receive_queue); + skb_queue_splice_tail_init(&sp->pd_lobby, + &sk->sk_receive_queue); list = (struct list_head *)&sctp_sk(sk)->pd_lobby; INIT_LIST_HEAD(list); return 1; @@ -193,6 +194,7 @@ static int sctp_ulpq_clear_pd(struct sctp_ulpq *ulpq) int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sock *sk = ulpq->asoc->base.sk; + struct sctp_sock *sp = sctp_sk(sk); struct sk_buff_head *queue, *skb_list; struct sk_buff *skb = sctp_event2skb(event); int clear_pd = 0; @@ -210,7 +212,7 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) sk_incoming_cpu_update(sk); } /* Check if the user wishes to receive this event. */ - if (!sctp_ulpevent_is_enabled(event, &sctp_sk(sk)->subscribe)) + if (!sctp_ulpevent_is_enabled(event, &sp->subscribe)) goto out_free; /* If we are in partial delivery mode, post to the lobby until @@ -218,7 +220,7 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) * the association the cause of the partial delivery. */ - if (atomic_read(&sctp_sk(sk)->pd_mode) == 0) { + if (atomic_read(&sp->pd_mode) == 0) { queue = &sk->sk_receive_queue; } else { if (ulpq->pd_mode) { @@ -230,7 +232,7 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) if ((event->msg_flags & MSG_NOTIFICATION) || (SCTP_DATA_NOT_FRAG == (event->msg_flags & SCTP_DATA_FRAG_MASK))) - queue = &sctp_sk(sk)->pd_lobby; + queue = &sp->pd_lobby; else { clear_pd = event->msg_flags & MSG_EOR; queue = &sk->sk_receive_queue; @@ -241,10 +243,10 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) * can queue this to the receive queue instead * of the lobby. */ - if (sctp_sk(sk)->frag_interleave) + if (sp->frag_interleave) queue = &sk->sk_receive_queue; else - queue = &sctp_sk(sk)->pd_lobby; + queue = &sp->pd_lobby; } } @@ -252,7 +254,7 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) * collected on a list. */ if (skb_list) - sctp_skb_list_tail(skb_list, queue); + skb_queue_splice_tail_init(skb_list, queue); else __skb_queue_tail(queue, skb); @@ -263,8 +265,10 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) if (clear_pd) sctp_ulpq_clear_pd(ulpq); - if (queue == &sk->sk_receive_queue) + if (queue == &sk->sk_receive_queue && !sp->data_ready_signalled) { + sp->data_ready_signalled = 1; sk->sk_data_ready(sk); + } return 1; out_free: @@ -1125,11 +1129,13 @@ void sctp_ulpq_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp) { struct sctp_ulpevent *ev = NULL; struct sock *sk; + struct sctp_sock *sp; if (!ulpq->pd_mode) return; sk = ulpq->asoc->base.sk; + sp = sctp_sk(sk); if (sctp_ulpevent_type_enabled(SCTP_PARTIAL_DELIVERY_EVENT, &sctp_sk(sk)->subscribe)) ev = sctp_ulpevent_make_pdapi(ulpq->asoc, @@ -1139,6 +1145,8 @@ void sctp_ulpq_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp) __skb_queue_tail(&sk->sk_receive_queue, sctp_event2skb(ev)); /* If there is data waiting, send it up the socket now. */ - if (sctp_ulpq_clear_pd(ulpq) || ev) + if ((sctp_ulpq_clear_pd(ulpq) || ev) && !sp->data_ready_signalled) { + sp->data_ready_signalled = 1; sk->sk_data_ready(sk); + } } diff --git a/net/socket.c b/net/socket.c index 5f77a8e93..a1bd16106 100644 --- a/net/socket.c +++ b/net/socket.c @@ -466,7 +466,7 @@ static struct socket *sockfd_lookup_light(int fd, int *err, int *fput_needed) #define XATTR_SOCKPROTONAME_SUFFIX "sockprotoname" #define XATTR_NAME_SOCKPROTONAME (XATTR_SYSTEM_PREFIX XATTR_SOCKPROTONAME_SUFFIX) #define XATTR_NAME_SOCKPROTONAME_LEN (sizeof(XATTR_NAME_SOCKPROTONAME)-1) -static ssize_t sockfs_getxattr(struct dentry *dentry, +static ssize_t sockfs_getxattr(struct dentry *dentry, struct inode *inode, const char *name, void *value, size_t size) { const char *proto_name; @@ -587,22 +587,19 @@ void sock_release(struct socket *sock) } EXPORT_SYMBOL(sock_release); -void __sock_tx_timestamp(const struct sock *sk, __u8 *tx_flags) +void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags) { u8 flags = *tx_flags; - if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_HARDWARE) + if (tsflags & SOF_TIMESTAMPING_TX_HARDWARE) flags |= SKBTX_HW_TSTAMP; - if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_SOFTWARE) + if (tsflags & SOF_TIMESTAMPING_TX_SOFTWARE) flags |= SKBTX_SW_TSTAMP; - if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_SCHED) + if (tsflags & SOF_TIMESTAMPING_TX_SCHED) flags |= SKBTX_SCHED_TSTAMP; - if (sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK) - flags |= SKBTX_ACK_TSTAMP; - *tx_flags = flags; } EXPORT_SYMBOL(__sock_tx_timestamp); @@ -709,17 +706,16 @@ void __sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, EXPORT_SYMBOL_GPL(__sock_recv_ts_and_drops); static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg, - size_t size, int flags) + int flags) { - return sock->ops->recvmsg(sock, msg, size, flags); + return sock->ops->recvmsg(sock, msg, msg_data_left(msg), flags); } -int sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, - int flags) +int sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags) { - int err = security_socket_recvmsg(sock, msg, size, flags); + int err = security_socket_recvmsg(sock, msg, msg_data_left(msg), flags); - return err ?: sock_recvmsg_nosec(sock, msg, size, flags); + return err ?: sock_recvmsg_nosec(sock, msg, flags); } EXPORT_SYMBOL(sock_recvmsg); @@ -746,7 +742,7 @@ int kernel_recvmsg(struct socket *sock, struct msghdr *msg, iov_iter_kvec(&msg->msg_iter, READ | ITER_KVEC, vec, num, size); set_fs(KERNEL_DS); - result = sock_recvmsg(sock, msg, size, flags); + result = sock_recvmsg(sock, msg, flags); set_fs(oldfs); return result; } @@ -796,7 +792,7 @@ static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to) if (!iov_iter_count(to)) /* Match SYS5 behaviour */ return 0; - res = sock_recvmsg(sock, &msg, iov_iter_count(to), msg.msg_flags); + res = sock_recvmsg(sock, &msg, msg.msg_flags); *to = msg.msg_iter; return res; } @@ -1046,7 +1042,7 @@ static int sock_fasync(int fd, struct file *filp, int on) return -EINVAL; lock_sock(sk); - wq = rcu_dereference_protected(sock->wq, sock_owned_by_user(sk)); + wq = rcu_dereference_protected(sock->wq, lockdep_sock_is_held(sk)); fasync_helper(fd, filp, on, &wq->fasync_list); if (!wq->fasync_list) @@ -1696,7 +1692,7 @@ SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size, msg.msg_iocb = NULL; if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT; - err = sock_recvmsg(sock, &msg, iov_iter_count(&msg.msg_iter), flags); + err = sock_recvmsg(sock, &msg, flags); if (err >= 0 && addr != NULL) { err2 = move_addr_to_user(&address, @@ -2073,7 +2069,7 @@ static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg, struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; unsigned long cmsg_ptr; - int total_len, len; + int len; ssize_t err; /* kernel mode address */ @@ -2091,7 +2087,6 @@ static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg, err = copy_msghdr_from_user(msg_sys, msg, &uaddr, &iov); if (err < 0) return err; - total_len = iov_iter_count(&msg_sys->msg_iter); cmsg_ptr = (unsigned long)msg_sys->msg_control; msg_sys->msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT); @@ -2101,8 +2096,7 @@ static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg, if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT; - err = (nosec ? sock_recvmsg_nosec : sock_recvmsg)(sock, msg_sys, - total_len, flags); + err = (nosec ? sock_recvmsg_nosec : sock_recvmsg)(sock, msg_sys, flags); if (err < 0) goto out_freeiov; len = err; @@ -2174,7 +2168,8 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, struct mmsghdr __user *entry; struct compat_mmsghdr __user *compat_entry; struct msghdr msg_sys; - struct timespec end_time; + struct timespec64 end_time; + struct timespec64 timeout64; if (timeout && poll_select_set_timeout(&end_time, timeout->tv_sec, @@ -2226,8 +2221,9 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, flags |= MSG_DONTWAIT; if (timeout) { - ktime_get_ts(timeout); - *timeout = timespec_sub(end_time, *timeout); + ktime_get_ts64(&timeout64); + *timeout = timespec64_to_timespec( + timespec64_sub(end_time, timeout64)); if (timeout->tv_sec < 0) { timeout->tv_sec = timeout->tv_nsec = 0; break; diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 02f53674d..040ff627c 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -543,7 +543,7 @@ rpcauth_cache_enforce_limit(void) */ struct rpc_cred * rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred, - int flags) + int flags, gfp_t gfp) { LIST_HEAD(free); struct rpc_cred_cache *cache = auth->au_credcache; @@ -580,7 +580,7 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred, if (flags & RPCAUTH_LOOKUP_RCU) return ERR_PTR(-ECHILD); - new = auth->au_ops->crcreate(auth, acred, flags); + new = auth->au_ops->crcreate(auth, acred, flags, gfp); if (IS_ERR(new)) { cred = new; goto out; @@ -703,8 +703,7 @@ rpcauth_bindcred(struct rpc_task *task, struct rpc_cred *cred, int flags) new = rpcauth_bind_new_cred(task, lookupflags); if (IS_ERR(new)) return PTR_ERR(new); - if (req->rq_cred != NULL) - put_rpccred(req->rq_cred); + put_rpccred(req->rq_cred); req->rq_cred = new; return 0; } @@ -712,6 +711,8 @@ rpcauth_bindcred(struct rpc_task *task, struct rpc_cred *cred, int flags) void put_rpccred(struct rpc_cred *cred) { + if (cred == NULL) + return; /* Fast path for unhashed credentials */ if (test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) == 0) { if (atomic_dec_and_test(&cred->cr_count)) diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c index 41248b182..54dd3fdea 100644 --- a/net/sunrpc/auth_generic.c +++ b/net/sunrpc/auth_generic.c @@ -38,6 +38,13 @@ struct rpc_cred *rpc_lookup_cred(void) } EXPORT_SYMBOL_GPL(rpc_lookup_cred); +struct rpc_cred * +rpc_lookup_generic_cred(struct auth_cred *acred, int flags, gfp_t gfp) +{ + return rpcauth_lookup_credcache(&generic_auth, acred, flags, gfp); +} +EXPORT_SYMBOL_GPL(rpc_lookup_generic_cred); + struct rpc_cred *rpc_lookup_cred_nonblock(void) { return rpcauth_lookupcred(&generic_auth, RPCAUTH_LOOKUP_RCU); @@ -77,15 +84,15 @@ static struct rpc_cred *generic_bind_cred(struct rpc_task *task, static struct rpc_cred * generic_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) { - return rpcauth_lookup_credcache(&generic_auth, acred, flags); + return rpcauth_lookup_credcache(&generic_auth, acred, flags, GFP_KERNEL); } static struct rpc_cred * -generic_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) +generic_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t gfp) { struct generic_cred *gcred; - gcred = kmalloc(sizeof(*gcred), GFP_KERNEL); + gcred = kmalloc(sizeof(*gcred), gfp); if (gcred == NULL) return ERR_PTR(-ENOMEM); diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 15612ffa8..e64ae93d5 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -1299,11 +1299,11 @@ gss_destroy_cred(struct rpc_cred *cred) static struct rpc_cred * gss_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) { - return rpcauth_lookup_credcache(auth, acred, flags); + return rpcauth_lookup_credcache(auth, acred, flags, GFP_NOFS); } static struct rpc_cred * -gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) +gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t gfp) { struct gss_auth *gss_auth = container_of(auth, struct gss_auth, rpc_auth); struct gss_cred *cred = NULL; @@ -1313,7 +1313,7 @@ gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) __func__, from_kuid(&init_user_ns, acred->uid), auth->au_flavor); - if (!(cred = kzalloc(sizeof(*cred), GFP_NOFS))) + if (!(cred = kzalloc(sizeof(*cred), gfp))) goto out_err; rpcauth_init_cred(&cred->gc_base, acred, auth, &gss_credops); diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 4605dc73d..e085f5ae1 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -569,10 +569,9 @@ gss_svc_searchbyctx(struct cache_detail *cd, struct xdr_netobj *handle) struct rsc *found; memset(&rsci, 0, sizeof(rsci)); - if (dup_to_netobj(&rsci.handle, handle->data, handle->len)) - return NULL; + rsci.handle.data = handle->data; + rsci.handle.len = handle->len; found = rsc_lookup(cd, &rsci); - rsc_free(&rsci); if (!found) return NULL; if (cache_check(cd, &found->h, NULL)) diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c index 0d3dd364c..9f65452b7 100644 --- a/net/sunrpc/auth_unix.c +++ b/net/sunrpc/auth_unix.c @@ -52,11 +52,11 @@ unx_destroy(struct rpc_auth *auth) static struct rpc_cred * unx_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) { - return rpcauth_lookup_credcache(auth, acred, flags); + return rpcauth_lookup_credcache(auth, acred, flags, GFP_NOFS); } static struct rpc_cred * -unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) +unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t gfp) { struct unx_cred *cred; unsigned int groups = 0; @@ -66,7 +66,7 @@ unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) from_kuid(&init_user_ns, acred->uid), from_kgid(&init_user_ns, acred->gid)); - if (!(cred = kmalloc(sizeof(*cred), GFP_NOFS))) + if (!(cred = kmalloc(sizeof(*cred), gfp))) return ERR_PTR(-ENOMEM); rpcauth_init_cred(&cred->uc_base, acred, auth, &unix_credops); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 837dd910a..2808d550d 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1433,6 +1433,23 @@ size_t rpc_max_payload(struct rpc_clnt *clnt) EXPORT_SYMBOL_GPL(rpc_max_payload); /** + * rpc_max_bc_payload - Get maximum backchannel payload size, in bytes + * @clnt: RPC client to query + */ +size_t rpc_max_bc_payload(struct rpc_clnt *clnt) +{ + struct rpc_xprt *xprt; + size_t ret; + + rcu_read_lock(); + xprt = rcu_dereference(clnt->cl_xprt); + ret = xprt->ops->bc_maxpayload(xprt); + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(rpc_max_bc_payload); + +/** * rpc_get_timeout - Get timeout for transport in units of HZ * @clnt: RPC client to query */ diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c index de70c7802..f217c348b 100644 --- a/net/sunrpc/socklib.c +++ b/net/sunrpc/socklib.c @@ -155,7 +155,7 @@ int csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb) struct xdr_skb_reader desc; desc.skb = skb; - desc.offset = sizeof(struct udphdr); + desc.offset = 0; desc.count = skb->len - desc.offset; if (skb_csum_unnecessary(skb)) diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 7231cb413..4f01f6310 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -246,13 +246,12 @@ void svc_add_new_perm_xprt(struct svc_serv *serv, struct svc_xprt *new) svc_xprt_received(new); } -int svc_create_xprt(struct svc_serv *serv, const char *xprt_name, +int _svc_create_xprt(struct svc_serv *serv, const char *xprt_name, struct net *net, const int family, const unsigned short port, int flags) { struct svc_xprt_class *xcl; - dprintk("svc: creating transport %s[%d]\n", xprt_name, port); spin_lock(&svc_xprt_class_lock); list_for_each_entry(xcl, &svc_xprt_class_list, xcl_list) { struct svc_xprt *newxprt; @@ -276,12 +275,28 @@ int svc_create_xprt(struct svc_serv *serv, const char *xprt_name, } err: spin_unlock(&svc_xprt_class_lock); - dprintk("svc: transport %s not found\n", xprt_name); - /* This errno is exposed to user space. Provide a reasonable * perror msg for a bad transport. */ return -EPROTONOSUPPORT; } + +int svc_create_xprt(struct svc_serv *serv, const char *xprt_name, + struct net *net, const int family, + const unsigned short port, int flags) +{ + int err; + + dprintk("svc: creating transport %s[%d]\n", xprt_name, port); + err = _svc_create_xprt(serv, xprt_name, net, family, port, flags); + if (err == -EPROTONOSUPPORT) { + request_module("svc%s", xprt_name); + err = _svc_create_xprt(serv, xprt_name, net, family, port, flags); + } + if (err) + dprintk("svc: transport %s not found, err %d\n", + xprt_name, err); + return err; +} EXPORT_SYMBOL_GPL(svc_create_xprt); /* diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 1413cdcc1..dadfec66d 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -85,8 +85,7 @@ static void svc_reclassify_socket(struct socket *sock) { struct sock *sk = sock->sk; - WARN_ON_ONCE(sock_owned_by_user(sk)); - if (sock_owned_by_user(sk)) + if (WARN_ON_ONCE(!sock_allow_reclassification(sk))) return; switch (sk->sk_family) { @@ -617,7 +616,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp) svsk->sk_sk->sk_stamp = skb->tstamp; set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* there may be more data... */ - len = skb->len - sizeof(struct udphdr); + len = skb->len; rqstp->rq_arg.len = len; rqstp->rq_prot = IPPROTO_UDP; @@ -641,8 +640,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp) skb_free_datagram_locked(svsk->sk_sk, skb); } else { /* we can use it in-place */ - rqstp->rq_arg.head[0].iov_base = skb->data + - sizeof(struct udphdr); + rqstp->rq_arg.head[0].iov_base = skb->data; rqstp->rq_arg.head[0].iov_len = len; if (skb_checksum_complete(skb)) goto out_free; diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 6bdb38652..c4f3cc0c0 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -797,6 +797,8 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p) xdr_set_iov(xdr, buf->head, buf->len); else if (buf->page_len != 0) xdr_set_page_base(xdr, 0, buf->len); + else + xdr_set_iov(xdr, buf->head, buf->len); if (p != NULL && p > xdr->p && xdr->end >= p) { xdr->nwords -= p - xdr->p; xdr->p = p; diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 2dcd7640e..87762d976 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -192,6 +192,22 @@ int xprt_rdma_bc_up(struct svc_serv *serv, struct net *net) } /** + * xprt_rdma_bc_maxpayload - Return maximum backchannel message size + * @xprt: transport + * + * Returns maximum size, in bytes, of a backchannel message + */ +size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; + size_t maxmsg; + + maxmsg = min_t(unsigned int, cdata->inline_rsize, cdata->inline_wsize); + return maxmsg - RPCRDMA_HDRLEN_MIN; +} + +/** * rpcrdma_bc_marshal_reply - Send backwards direction reply * @rqst: buffer containing RPC reply data * diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index b289e1065..6326ebe8b 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -35,10 +35,71 @@ /* Maximum scatter/gather per FMR */ #define RPCRDMA_MAX_FMR_SGES (64) +static struct workqueue_struct *fmr_recovery_wq; + +#define FMR_RECOVERY_WQ_FLAGS (WQ_UNBOUND) + +int +fmr_alloc_recovery_wq(void) +{ + fmr_recovery_wq = alloc_workqueue("fmr_recovery", WQ_UNBOUND, 0); + return !fmr_recovery_wq ? -ENOMEM : 0; +} + +void +fmr_destroy_recovery_wq(void) +{ + struct workqueue_struct *wq; + + if (!fmr_recovery_wq) + return; + + wq = fmr_recovery_wq; + fmr_recovery_wq = NULL; + destroy_workqueue(wq); +} + +static int +__fmr_unmap(struct rpcrdma_mw *mw) +{ + LIST_HEAD(l); + + list_add(&mw->fmr.fmr->list, &l); + return ib_unmap_fmr(&l); +} + +/* Deferred reset of a single FMR. Generate a fresh rkey by + * replacing the MR. There's no recovery if this fails. + */ +static void +__fmr_recovery_worker(struct work_struct *work) +{ + struct rpcrdma_mw *mw = container_of(work, struct rpcrdma_mw, + mw_work); + struct rpcrdma_xprt *r_xprt = mw->mw_xprt; + + __fmr_unmap(mw); + rpcrdma_put_mw(r_xprt, mw); + return; +} + +/* A broken MR was discovered in a context that can't sleep. + * Defer recovery to the recovery worker. + */ +static void +__fmr_queue_recovery(struct rpcrdma_mw *mw) +{ + INIT_WORK(&mw->mw_work, __fmr_recovery_worker); + queue_work(fmr_recovery_wq, &mw->mw_work); +} + static int fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, struct rpcrdma_create_data_internal *cdata) { + rpcrdma_set_max_header_sizes(ia, cdata, max_t(unsigned int, 1, + RPCRDMA_MAX_DATA_SEGS / + RPCRDMA_MAX_FMR_SGES)); return 0; } @@ -48,7 +109,7 @@ static size_t fmr_op_maxpages(struct rpcrdma_xprt *r_xprt) { return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, - rpcrdma_max_segments(r_xprt) * RPCRDMA_MAX_FMR_SGES); + RPCRDMA_MAX_HDR_SEGS * RPCRDMA_MAX_FMR_SGES); } static int @@ -89,6 +150,7 @@ fmr_op_init(struct rpcrdma_xprt *r_xprt) if (IS_ERR(r->fmr.fmr)) goto out_fmr_err; + r->mw_xprt = r_xprt; list_add(&r->mw_list, &buf->rb_mws); list_add(&r->mw_all, &buf->rb_all); } @@ -104,15 +166,6 @@ out: return rc; } -static int -__fmr_unmap(struct rpcrdma_mw *r) -{ - LIST_HEAD(l); - - list_add(&r->fmr.fmr->list, &l); - return ib_unmap_fmr(&l); -} - /* Use the ib_map_phys_fmr() verb to register a memory region * for remote access via RDMA READ or RDMA WRITE. */ @@ -183,15 +236,10 @@ static void __fmr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) { struct ib_device *device = r_xprt->rx_ia.ri_device; - struct rpcrdma_mw *mw = seg->rl_mw; int nsegs = seg->mr_nsegs; - seg->rl_mw = NULL; - while (nsegs--) rpcrdma_unmap_one(device, seg++); - - rpcrdma_put_mw(r_xprt, mw); } /* Invalidate all memory regions that were registered for "req". @@ -234,42 +282,50 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) seg = &req->rl_segments[i]; __fmr_dma_unmap(r_xprt, seg); + rpcrdma_put_mw(r_xprt, seg->rl_mw); i += seg->mr_nsegs; seg->mr_nsegs = 0; + seg->rl_mw = NULL; } req->rl_nchunks = 0; } -/* Use the ib_unmap_fmr() verb to prevent further remote - * access via RDMA READ or RDMA WRITE. +/* Use a slow, safe mechanism to invalidate all memory regions + * that were registered for "req". + * + * In the asynchronous case, DMA unmapping occurs first here + * because the rpcrdma_mr_seg is released immediately after this + * call. It's contents won't be available in __fmr_dma_unmap later. + * FIXME. */ -static int -fmr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) +static void +fmr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, + bool sync) { - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rpcrdma_mr_seg *seg1 = seg; - struct rpcrdma_mw *mw = seg1->rl_mw; - int rc, nsegs = seg->mr_nsegs; + struct rpcrdma_mr_seg *seg; + struct rpcrdma_mw *mw; + unsigned int i; - dprintk("RPC: %s: FMR %p\n", __func__, mw); + for (i = 0; req->rl_nchunks; req->rl_nchunks--) { + seg = &req->rl_segments[i]; + mw = seg->rl_mw; - seg1->rl_mw = NULL; - while (seg1->mr_nsegs--) - rpcrdma_unmap_one(ia->ri_device, seg++); - rc = __fmr_unmap(mw); - if (rc) - goto out_err; - rpcrdma_put_mw(r_xprt, mw); - return nsegs; + if (sync) { + /* ORDER */ + __fmr_unmap(mw); + __fmr_dma_unmap(r_xprt, seg); + rpcrdma_put_mw(r_xprt, mw); + } else { + __fmr_dma_unmap(r_xprt, seg); + __fmr_queue_recovery(mw); + } -out_err: - /* The FMR is abandoned, but remains in rb_all. fmr_op_destroy - * will attempt to release it when the transport is destroyed. - */ - dprintk("RPC: %s: ib_unmap_fmr status %i\n", __func__, rc); - return nsegs; + i += seg->mr_nsegs; + seg->mr_nsegs = 0; + seg->rl_mw = NULL; + } } static void @@ -295,7 +351,7 @@ fmr_op_destroy(struct rpcrdma_buffer *buf) const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { .ro_map = fmr_op_map, .ro_unmap_sync = fmr_op_unmap_sync, - .ro_unmap = fmr_op_unmap, + .ro_unmap_safe = fmr_op_unmap_safe, .ro_open = fmr_op_open, .ro_maxpages = fmr_op_maxpages, .ro_init = fmr_op_init, diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index c250924a9..c0947544b 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -98,6 +98,47 @@ frwr_destroy_recovery_wq(void) destroy_workqueue(wq); } +static int +__frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r) +{ + struct rpcrdma_frmr *f = &r->frmr; + int rc; + + rc = ib_dereg_mr(f->fr_mr); + if (rc) { + pr_warn("rpcrdma: ib_dereg_mr status %d, frwr %p orphaned\n", + rc, r); + return rc; + } + + f->fr_mr = ib_alloc_mr(ia->ri_pd, IB_MR_TYPE_MEM_REG, + ia->ri_max_frmr_depth); + if (IS_ERR(f->fr_mr)) { + pr_warn("rpcrdma: ib_alloc_mr status %ld, frwr %p orphaned\n", + PTR_ERR(f->fr_mr), r); + return PTR_ERR(f->fr_mr); + } + + dprintk("RPC: %s: recovered FRMR %p\n", __func__, r); + f->fr_state = FRMR_IS_INVALID; + return 0; +} + +static void +__frwr_reset_and_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_frmr *f = &mw->frmr; + int rc; + + rc = __frwr_reset_mr(ia, mw); + ib_dma_unmap_sg(ia->ri_device, f->fr_sg, f->fr_nents, f->fr_dir); + if (rc) + return; + + rpcrdma_put_mw(r_xprt, mw); +} + /* Deferred reset of a single FRMR. Generate a fresh rkey by * replacing the MR. * @@ -109,26 +150,10 @@ static void __frwr_recovery_worker(struct work_struct *work) { struct rpcrdma_mw *r = container_of(work, struct rpcrdma_mw, - frmr.fr_work); - struct rpcrdma_xprt *r_xprt = r->frmr.fr_xprt; - unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth; - struct ib_pd *pd = r_xprt->rx_ia.ri_pd; - - if (ib_dereg_mr(r->frmr.fr_mr)) - goto out_fail; + mw_work); - r->frmr.fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth); - if (IS_ERR(r->frmr.fr_mr)) - goto out_fail; - - dprintk("RPC: %s: recovered FRMR %p\n", __func__, r); - r->frmr.fr_state = FRMR_IS_INVALID; - rpcrdma_put_mw(r_xprt, r); + __frwr_reset_and_unmap(r->mw_xprt, r); return; - -out_fail: - pr_warn("RPC: %s: FRMR %p unrecovered\n", - __func__, r); } /* A broken MR was discovered in a context that can't sleep. @@ -137,8 +162,8 @@ out_fail: static void __frwr_queue_recovery(struct rpcrdma_mw *r) { - INIT_WORK(&r->frmr.fr_work, __frwr_recovery_worker); - queue_work(frwr_recovery_wq, &r->frmr.fr_work); + INIT_WORK(&r->mw_work, __frwr_recovery_worker); + queue_work(frwr_recovery_wq, &r->mw_work); } static int @@ -152,11 +177,11 @@ __frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device, if (IS_ERR(f->fr_mr)) goto out_mr_err; - f->sg = kcalloc(depth, sizeof(*f->sg), GFP_KERNEL); - if (!f->sg) + f->fr_sg = kcalloc(depth, sizeof(*f->fr_sg), GFP_KERNEL); + if (!f->fr_sg) goto out_list_err; - sg_init_table(f->sg, depth); + sg_init_table(f->fr_sg, depth); init_completion(&f->fr_linv_done); @@ -185,7 +210,7 @@ __frwr_release(struct rpcrdma_mw *r) if (rc) dprintk("RPC: %s: ib_dereg_mr status %i\n", __func__, rc); - kfree(r->frmr.sg); + kfree(r->frmr.fr_sg); } static int @@ -231,6 +256,9 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, depth; } + rpcrdma_set_max_header_sizes(ia, cdata, max_t(unsigned int, 1, + RPCRDMA_MAX_DATA_SEGS / + ia->ri_max_frmr_depth)); return 0; } @@ -243,7 +271,7 @@ frwr_op_maxpages(struct rpcrdma_xprt *r_xprt) struct rpcrdma_ia *ia = &r_xprt->rx_ia; return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, - rpcrdma_max_segments(r_xprt) * ia->ri_max_frmr_depth); + RPCRDMA_MAX_HDR_SEGS * ia->ri_max_frmr_depth); } static void @@ -350,9 +378,9 @@ frwr_op_init(struct rpcrdma_xprt *r_xprt) return rc; } + r->mw_xprt = r_xprt; list_add(&r->mw_list, &buf->rb_mws); list_add(&r->mw_all, &buf->rb_all); - r->frmr.fr_xprt = r_xprt; } return 0; @@ -396,12 +424,12 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, for (i = 0; i < nsegs;) { if (seg->mr_page) - sg_set_page(&frmr->sg[i], + sg_set_page(&frmr->fr_sg[i], seg->mr_page, seg->mr_len, offset_in_page(seg->mr_offset)); else - sg_set_buf(&frmr->sg[i], seg->mr_offset, + sg_set_buf(&frmr->fr_sg[i], seg->mr_offset, seg->mr_len); ++seg; @@ -412,25 +440,26 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) break; } - frmr->sg_nents = i; + frmr->fr_nents = i; + frmr->fr_dir = direction; - dma_nents = ib_dma_map_sg(device, frmr->sg, frmr->sg_nents, direction); + dma_nents = ib_dma_map_sg(device, frmr->fr_sg, frmr->fr_nents, direction); if (!dma_nents) { pr_err("RPC: %s: failed to dma map sg %p sg_nents %u\n", - __func__, frmr->sg, frmr->sg_nents); + __func__, frmr->fr_sg, frmr->fr_nents); return -ENOMEM; } - n = ib_map_mr_sg(mr, frmr->sg, frmr->sg_nents, PAGE_SIZE); - if (unlikely(n != frmr->sg_nents)) { + n = ib_map_mr_sg(mr, frmr->fr_sg, frmr->fr_nents, NULL, PAGE_SIZE); + if (unlikely(n != frmr->fr_nents)) { pr_err("RPC: %s: failed to map mr %p (%u/%u)\n", - __func__, frmr->fr_mr, n, frmr->sg_nents); + __func__, frmr->fr_mr, n, frmr->fr_nents); rc = n < 0 ? n : -EINVAL; goto out_senderr; } dprintk("RPC: %s: Using frmr %p to map %u segments (%u bytes)\n", - __func__, mw, frmr->sg_nents, mr->length); + __func__, mw, frmr->fr_nents, mr->length); key = (u8)(mr->rkey & 0x000000FF); ib_update_fast_reg_key(mr, ++key); @@ -452,18 +481,16 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, if (rc) goto out_senderr; - seg1->mr_dir = direction; seg1->rl_mw = mw; seg1->mr_rkey = mr->rkey; seg1->mr_base = mr->iova; - seg1->mr_nsegs = frmr->sg_nents; + seg1->mr_nsegs = frmr->fr_nents; seg1->mr_len = mr->length; - return frmr->sg_nents; + return frmr->fr_nents; out_senderr: dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc); - ib_dma_unmap_sg(device, frmr->sg, dma_nents, direction); __frwr_queue_recovery(mw); return rc; } @@ -487,24 +514,6 @@ __frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg) return invalidate_wr; } -static void -__frwr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, - int rc) -{ - struct ib_device *device = r_xprt->rx_ia.ri_device; - struct rpcrdma_mw *mw = seg->rl_mw; - struct rpcrdma_frmr *f = &mw->frmr; - - seg->rl_mw = NULL; - - ib_dma_unmap_sg(device, f->sg, f->sg_nents, seg->mr_dir); - - if (!rc) - rpcrdma_put_mw(r_xprt, mw); - else - __frwr_queue_recovery(mw); -} - /* Invalidate all memory regions that were registered for "req". * * Sleeps until it is safe for the host CPU to access the @@ -518,6 +527,7 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) struct rpcrdma_mr_seg *seg; unsigned int i, nchunks; struct rpcrdma_frmr *f; + struct rpcrdma_mw *mw; int rc; dprintk("RPC: %s: req %p\n", __func__, req); @@ -558,11 +568,8 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * unless ri_id->qp is a valid pointer. */ rc = ib_post_send(ia->ri_id->qp, invalidate_wrs, &bad_wr); - if (rc) { - pr_warn("%s: ib_post_send failed %i\n", __func__, rc); - rdma_disconnect(ia->ri_id); - goto unmap; - } + if (rc) + goto reset_mrs; wait_for_completion(&f->fr_linv_done); @@ -572,56 +579,65 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) unmap: for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { seg = &req->rl_segments[i]; + mw = seg->rl_mw; + seg->rl_mw = NULL; - __frwr_dma_unmap(r_xprt, seg, rc); + ib_dma_unmap_sg(ia->ri_device, f->fr_sg, f->fr_nents, + f->fr_dir); + rpcrdma_put_mw(r_xprt, mw); i += seg->mr_nsegs; seg->mr_nsegs = 0; } req->rl_nchunks = 0; -} + return; -/* Post a LOCAL_INV Work Request to prevent further remote access - * via RDMA READ or RDMA WRITE. - */ -static int -frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) -{ - struct rpcrdma_mr_seg *seg1 = seg; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rpcrdma_mw *mw = seg1->rl_mw; - struct rpcrdma_frmr *frmr = &mw->frmr; - struct ib_send_wr *invalidate_wr, *bad_wr; - int rc, nsegs = seg->mr_nsegs; +reset_mrs: + pr_warn("%s: ib_post_send failed %i\n", __func__, rc); - dprintk("RPC: %s: FRMR %p\n", __func__, mw); + /* Find and reset the MRs in the LOCAL_INV WRs that did not + * get posted. This is synchronous, and slow. + */ + for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) { + seg = &req->rl_segments[i]; + mw = seg->rl_mw; + f = &mw->frmr; - seg1->rl_mw = NULL; - frmr->fr_state = FRMR_IS_INVALID; - invalidate_wr = &mw->frmr.fr_invwr; + if (mw->frmr.fr_mr->rkey == bad_wr->ex.invalidate_rkey) { + __frwr_reset_mr(ia, mw); + bad_wr = bad_wr->next; + } - memset(invalidate_wr, 0, sizeof(*invalidate_wr)); - frmr->fr_cqe.done = frwr_wc_localinv; - invalidate_wr->wr_cqe = &frmr->fr_cqe; - invalidate_wr->opcode = IB_WR_LOCAL_INV; - invalidate_wr->ex.invalidate_rkey = frmr->fr_mr->rkey; - DECR_CQCOUNT(&r_xprt->rx_ep); + i += seg->mr_nsegs; + } + goto unmap; +} - ib_dma_unmap_sg(ia->ri_device, frmr->sg, frmr->sg_nents, seg1->mr_dir); - read_lock(&ia->ri_qplock); - rc = ib_post_send(ia->ri_id->qp, invalidate_wr, &bad_wr); - read_unlock(&ia->ri_qplock); - if (rc) - goto out_err; +/* Use a slow, safe mechanism to invalidate all memory regions + * that were registered for "req". + */ +static void +frwr_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, + bool sync) +{ + struct rpcrdma_mr_seg *seg; + struct rpcrdma_mw *mw; + unsigned int i; - rpcrdma_put_mw(r_xprt, mw); - return nsegs; + for (i = 0; req->rl_nchunks; req->rl_nchunks--) { + seg = &req->rl_segments[i]; + mw = seg->rl_mw; -out_err: - dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc); - __frwr_queue_recovery(mw); - return nsegs; + if (sync) + __frwr_reset_and_unmap(r_xprt, mw); + else + __frwr_queue_recovery(mw); + + i += seg->mr_nsegs; + seg->mr_nsegs = 0; + seg->rl_mw = NULL; + } } static void @@ -643,7 +659,7 @@ frwr_op_destroy(struct rpcrdma_buffer *buf) const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { .ro_map = frwr_op_map, .ro_unmap_sync = frwr_op_unmap_sync, - .ro_unmap = frwr_op_unmap, + .ro_unmap_safe = frwr_op_unmap_safe, .ro_open = frwr_op_open, .ro_maxpages = frwr_op_maxpages, .ro_init = frwr_op_init, diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c index 481b9b6f4..3750596cc 100644 --- a/net/sunrpc/xprtrdma/physical_ops.c +++ b/net/sunrpc/xprtrdma/physical_ops.c @@ -36,8 +36,11 @@ physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, __func__, PTR_ERR(mr)); return -ENOMEM; } - ia->ri_dma_mr = mr; + + rpcrdma_set_max_header_sizes(ia, cdata, min_t(unsigned int, + RPCRDMA_MAX_DATA_SEGS, + RPCRDMA_MAX_HDR_SEGS)); return 0; } @@ -47,7 +50,7 @@ static size_t physical_op_maxpages(struct rpcrdma_xprt *r_xprt) { return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, - rpcrdma_max_segments(r_xprt)); + RPCRDMA_MAX_HDR_SEGS); } static int @@ -71,17 +74,6 @@ physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, return 1; } -/* Unmap a memory region, but leave it registered. - */ -static int -physical_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) -{ - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - - rpcrdma_unmap_one(ia->ri_device, seg); - return 1; -} - /* DMA unmap all memory regions that were mapped for "req". */ static void @@ -94,6 +86,25 @@ physical_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) rpcrdma_unmap_one(device, &req->rl_segments[i++]); } +/* Use a slow, safe mechanism to invalidate all memory regions + * that were registered for "req". + * + * For physical memory registration, there is no good way to + * fence a single MR that has been advertised to the server. The + * client has already handed the server an R_key that cannot be + * invalidated and is shared by all MRs on this connection. + * Tearing down the PD might be the only safe choice, but it's + * not clear that a freshly acquired DMA R_key would be different + * than the one used by the PD that was just destroyed. + * FIXME. + */ +static void +physical_op_unmap_safe(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, + bool sync) +{ + physical_op_unmap_sync(r_xprt, req); +} + static void physical_op_destroy(struct rpcrdma_buffer *buf) { @@ -102,7 +113,7 @@ physical_op_destroy(struct rpcrdma_buffer *buf) const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = { .ro_map = physical_op_map, .ro_unmap_sync = physical_op_unmap_sync, - .ro_unmap = physical_op_unmap, + .ro_unmap_safe = physical_op_unmap_safe, .ro_open = physical_op_open, .ro_maxpages = physical_op_maxpages, .ro_init = physical_op_init, diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 888823bb6..35a81096e 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -61,26 +61,84 @@ enum rpcrdma_chunktype { rpcrdma_replych }; -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) static const char transfertypes[][12] = { - "pure inline", /* no chunks */ - " read chunk", /* some argument via rdma read */ - "*read chunk", /* entire request via rdma read */ - "write chunk", /* some result via rdma write */ + "inline", /* no chunks */ + "read list", /* some argument via rdma read */ + "*read list", /* entire request via rdma read */ + "write list", /* some result via rdma write */ "reply chunk" /* entire reply via rdma write */ }; -#endif + +/* Returns size of largest RPC-over-RDMA header in a Call message + * + * The largest Call header contains a full-size Read list and a + * minimal Reply chunk. + */ +static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs) +{ + unsigned int size; + + /* Fixed header fields and list discriminators */ + size = RPCRDMA_HDRLEN_MIN; + + /* Maximum Read list size */ + maxsegs += 2; /* segment for head and tail buffers */ + size = maxsegs * sizeof(struct rpcrdma_read_chunk); + + /* Minimal Read chunk size */ + size += sizeof(__be32); /* segment count */ + size += sizeof(struct rpcrdma_segment); + size += sizeof(__be32); /* list discriminator */ + + dprintk("RPC: %s: max call header size = %u\n", + __func__, size); + return size; +} + +/* Returns size of largest RPC-over-RDMA header in a Reply message + * + * There is only one Write list or one Reply chunk per Reply + * message. The larger list is the Write list. + */ +static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs) +{ + unsigned int size; + + /* Fixed header fields and list discriminators */ + size = RPCRDMA_HDRLEN_MIN; + + /* Maximum Write list size */ + maxsegs += 2; /* segment for head and tail buffers */ + size = sizeof(__be32); /* segment count */ + size += maxsegs * sizeof(struct rpcrdma_segment); + size += sizeof(__be32); /* list discriminator */ + + dprintk("RPC: %s: max reply header size = %u\n", + __func__, size); + return size; +} + +void rpcrdma_set_max_header_sizes(struct rpcrdma_ia *ia, + struct rpcrdma_create_data_internal *cdata, + unsigned int maxsegs) +{ + ia->ri_max_inline_write = cdata->inline_wsize - + rpcrdma_max_call_header_size(maxsegs); + ia->ri_max_inline_read = cdata->inline_rsize - + rpcrdma_max_reply_header_size(maxsegs); +} /* The client can send a request inline as long as the RPCRDMA header * plus the RPC call fit under the transport's inline limit. If the * combined call message size exceeds that limit, the client must use * the read chunk list for this operation. */ -static bool rpcrdma_args_inline(struct rpc_rqst *rqst) +static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt, + struct rpc_rqst *rqst) { - unsigned int callsize = RPCRDMA_HDRLEN_MIN + rqst->rq_snd_buf.len; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; - return callsize <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst); + return rqst->rq_snd_buf.len <= ia->ri_max_inline_write; } /* The client can't know how large the actual reply will be. Thus it @@ -89,11 +147,12 @@ static bool rpcrdma_args_inline(struct rpc_rqst *rqst) * limit, the client must provide a write list or a reply chunk for * this request. */ -static bool rpcrdma_results_inline(struct rpc_rqst *rqst) +static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt, + struct rpc_rqst *rqst) { - unsigned int repsize = RPCRDMA_HDRLEN_MIN + rqst->rq_rcv_buf.buflen; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; - return repsize <= RPCRDMA_INLINE_READ_THRESHOLD(rqst); + return rqst->rq_rcv_buf.buflen <= ia->ri_max_inline_read; } static int @@ -226,23 +285,16 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, return n; } -/* - * Create read/write chunk lists, and reply chunks, for RDMA - * - * Assume check against THRESHOLD has been done, and chunks are required. - * Assume only encoding one list entry for read|write chunks. The NFSv3 - * protocol is simple enough to allow this as it only has a single "bulk - * result" in each procedure - complicated NFSv4 COMPOUNDs are not. (The - * RDMA/Sessions NFSv4 proposal addresses this for future v4 revs.) - * - * When used for a single reply chunk (which is a special write - * chunk used for the entire reply, rather than just the data), it - * is used primarily for READDIR and READLINK which would otherwise - * be severely size-limited by a small rdma inline read max. The server - * response will come back as an RDMA Write, followed by a message - * of type RDMA_NOMSG carrying the xid and length. As a result, reply - * chunks do not provide data alignment, however they do not require - * "fixup" (moving the response to the upper layer buffer) either. +static inline __be32 * +xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mr_seg *seg) +{ + *iptr++ = cpu_to_be32(seg->mr_rkey); + *iptr++ = cpu_to_be32(seg->mr_len); + return xdr_encode_hyper(iptr, seg->mr_base); +} + +/* XDR-encode the Read list. Supports encoding a list of read + * segments that belong to a single read chunk. * * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64): * @@ -250,131 +302,190 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos, * N elements, position P (same P for all chunks of same arg!): * 1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0 * + * Returns a pointer to the XDR word in the RDMA header following + * the end of the Read list, or an error pointer. + */ +static __be32 * +rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, struct rpc_rqst *rqst, + __be32 *iptr, enum rpcrdma_chunktype rtype) +{ + struct rpcrdma_mr_seg *seg = req->rl_nextseg; + unsigned int pos; + int n, nsegs; + + if (rtype == rpcrdma_noch) { + *iptr++ = xdr_zero; /* item not present */ + return iptr; + } + + pos = rqst->rq_snd_buf.head[0].iov_len; + if (rtype == rpcrdma_areadch) + pos = 0; + nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg, + RPCRDMA_MAX_SEGS - req->rl_nchunks); + if (nsegs < 0) + return ERR_PTR(nsegs); + + do { + n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, false); + if (n <= 0) + return ERR_PTR(n); + + *iptr++ = xdr_one; /* item present */ + + /* All read segments in this chunk + * have the same "position". + */ + *iptr++ = cpu_to_be32(pos); + iptr = xdr_encode_rdma_segment(iptr, seg); + + dprintk("RPC: %5u %s: read segment pos %u " + "%d@0x%016llx:0x%08x (%s)\n", + rqst->rq_task->tk_pid, __func__, pos, + seg->mr_len, (unsigned long long)seg->mr_base, + seg->mr_rkey, n < nsegs ? "more" : "last"); + + r_xprt->rx_stats.read_chunk_count++; + req->rl_nchunks++; + seg += n; + nsegs -= n; + } while (nsegs); + req->rl_nextseg = seg; + + /* Finish Read list */ + *iptr++ = xdr_zero; /* Next item not present */ + return iptr; +} + +/* XDR-encode the Write list. Supports encoding a list containing + * one array of plain segments that belong to a single write chunk. + * + * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64): + * * Write chunklist (a list of (one) counted array): * N elements: * 1 - N - HLOO - HLOO - ... - HLOO - 0 * + * Returns a pointer to the XDR word in the RDMA header following + * the end of the Write list, or an error pointer. + */ +static __be32 * +rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, + struct rpc_rqst *rqst, __be32 *iptr, + enum rpcrdma_chunktype wtype) +{ + struct rpcrdma_mr_seg *seg = req->rl_nextseg; + int n, nsegs, nchunks; + __be32 *segcount; + + if (wtype != rpcrdma_writech) { + *iptr++ = xdr_zero; /* no Write list present */ + return iptr; + } + + nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, + rqst->rq_rcv_buf.head[0].iov_len, + wtype, seg, + RPCRDMA_MAX_SEGS - req->rl_nchunks); + if (nsegs < 0) + return ERR_PTR(nsegs); + + *iptr++ = xdr_one; /* Write list present */ + segcount = iptr++; /* save location of segment count */ + + nchunks = 0; + do { + n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true); + if (n <= 0) + return ERR_PTR(n); + + iptr = xdr_encode_rdma_segment(iptr, seg); + + dprintk("RPC: %5u %s: write segment " + "%d@0x016%llx:0x%08x (%s)\n", + rqst->rq_task->tk_pid, __func__, + seg->mr_len, (unsigned long long)seg->mr_base, + seg->mr_rkey, n < nsegs ? "more" : "last"); + + r_xprt->rx_stats.write_chunk_count++; + r_xprt->rx_stats.total_rdma_request += seg->mr_len; + req->rl_nchunks++; + nchunks++; + seg += n; + nsegs -= n; + } while (nsegs); + req->rl_nextseg = seg; + + /* Update count of segments in this Write chunk */ + *segcount = cpu_to_be32(nchunks); + + /* Finish Write list */ + *iptr++ = xdr_zero; /* Next item not present */ + return iptr; +} + +/* XDR-encode the Reply chunk. Supports encoding an array of plain + * segments that belong to a single write (reply) chunk. + * + * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64): + * * Reply chunk (a counted array): * N elements: * 1 - N - HLOO - HLOO - ... - HLOO * - * Returns positive RPC/RDMA header size, or negative errno. + * Returns a pointer to the XDR word in the RDMA header following + * the end of the Reply chunk, or an error pointer. */ - -static ssize_t -rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, - struct rpcrdma_msg *headerp, enum rpcrdma_chunktype type) +static __be32 * +rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, struct rpc_rqst *rqst, + __be32 *iptr, enum rpcrdma_chunktype wtype) { - struct rpcrdma_req *req = rpcr_to_rdmar(rqst); - struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); - int n, nsegs, nchunks = 0; - unsigned int pos; - struct rpcrdma_mr_seg *seg = req->rl_segments; - struct rpcrdma_read_chunk *cur_rchunk = NULL; - struct rpcrdma_write_array *warray = NULL; - struct rpcrdma_write_chunk *cur_wchunk = NULL; - __be32 *iptr = headerp->rm_body.rm_chunks; - int (*map)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *, int, bool); - - if (type == rpcrdma_readch || type == rpcrdma_areadch) { - /* a read chunk - server will RDMA Read our memory */ - cur_rchunk = (struct rpcrdma_read_chunk *) iptr; - } else { - /* a write or reply chunk - server will RDMA Write our memory */ - *iptr++ = xdr_zero; /* encode a NULL read chunk list */ - if (type == rpcrdma_replych) - *iptr++ = xdr_zero; /* a NULL write chunk list */ - warray = (struct rpcrdma_write_array *) iptr; - cur_wchunk = (struct rpcrdma_write_chunk *) (warray + 1); - } + struct rpcrdma_mr_seg *seg = req->rl_nextseg; + int n, nsegs, nchunks; + __be32 *segcount; - if (type == rpcrdma_replych || type == rpcrdma_areadch) - pos = 0; - else - pos = target->head[0].iov_len; + if (wtype != rpcrdma_replych) { + *iptr++ = xdr_zero; /* no Reply chunk present */ + return iptr; + } - nsegs = rpcrdma_convert_iovs(target, pos, type, seg, RPCRDMA_MAX_SEGS); + nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg, + RPCRDMA_MAX_SEGS - req->rl_nchunks); if (nsegs < 0) - return nsegs; + return ERR_PTR(nsegs); - map = r_xprt->rx_ia.ri_ops->ro_map; + *iptr++ = xdr_one; /* Reply chunk present */ + segcount = iptr++; /* save location of segment count */ + + nchunks = 0; do { - n = map(r_xprt, seg, nsegs, cur_wchunk != NULL); + n = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, true); if (n <= 0) - goto out; - if (cur_rchunk) { /* read */ - cur_rchunk->rc_discrim = xdr_one; - /* all read chunks have the same "position" */ - cur_rchunk->rc_position = cpu_to_be32(pos); - cur_rchunk->rc_target.rs_handle = - cpu_to_be32(seg->mr_rkey); - cur_rchunk->rc_target.rs_length = - cpu_to_be32(seg->mr_len); - xdr_encode_hyper( - (__be32 *)&cur_rchunk->rc_target.rs_offset, - seg->mr_base); - dprintk("RPC: %s: read chunk " - "elem %d@0x%llx:0x%x pos %u (%s)\n", __func__, - seg->mr_len, (unsigned long long)seg->mr_base, - seg->mr_rkey, pos, n < nsegs ? "more" : "last"); - cur_rchunk++; - r_xprt->rx_stats.read_chunk_count++; - } else { /* write/reply */ - cur_wchunk->wc_target.rs_handle = - cpu_to_be32(seg->mr_rkey); - cur_wchunk->wc_target.rs_length = - cpu_to_be32(seg->mr_len); - xdr_encode_hyper( - (__be32 *)&cur_wchunk->wc_target.rs_offset, - seg->mr_base); - dprintk("RPC: %s: %s chunk " - "elem %d@0x%llx:0x%x (%s)\n", __func__, - (type == rpcrdma_replych) ? "reply" : "write", - seg->mr_len, (unsigned long long)seg->mr_base, - seg->mr_rkey, n < nsegs ? "more" : "last"); - cur_wchunk++; - if (type == rpcrdma_replych) - r_xprt->rx_stats.reply_chunk_count++; - else - r_xprt->rx_stats.write_chunk_count++; - r_xprt->rx_stats.total_rdma_request += seg->mr_len; - } + return ERR_PTR(n); + + iptr = xdr_encode_rdma_segment(iptr, seg); + + dprintk("RPC: %5u %s: reply segment " + "%d@0x%016llx:0x%08x (%s)\n", + rqst->rq_task->tk_pid, __func__, + seg->mr_len, (unsigned long long)seg->mr_base, + seg->mr_rkey, n < nsegs ? "more" : "last"); + + r_xprt->rx_stats.reply_chunk_count++; + r_xprt->rx_stats.total_rdma_request += seg->mr_len; + req->rl_nchunks++; nchunks++; seg += n; nsegs -= n; } while (nsegs); + req->rl_nextseg = seg; - /* success. all failures return above */ - req->rl_nchunks = nchunks; - - /* - * finish off header. If write, marshal discrim and nchunks. - */ - if (cur_rchunk) { - iptr = (__be32 *) cur_rchunk; - *iptr++ = xdr_zero; /* finish the read chunk list */ - *iptr++ = xdr_zero; /* encode a NULL write chunk list */ - *iptr++ = xdr_zero; /* encode a NULL reply chunk */ - } else { - warray->wc_discrim = xdr_one; - warray->wc_nchunks = cpu_to_be32(nchunks); - iptr = (__be32 *) cur_wchunk; - if (type == rpcrdma_writech) { - *iptr++ = xdr_zero; /* finish the write chunk list */ - *iptr++ = xdr_zero; /* encode a NULL reply chunk */ - } - } - - /* - * Return header size. - */ - return (unsigned char *)iptr - (unsigned char *)headerp; + /* Update count of segments in the Reply chunk */ + *segcount = cpu_to_be32(nchunks); -out: - for (pos = 0; nchunks--;) - pos += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt, - &req->rl_segments[pos]); - return n; + return iptr; } /* @@ -440,13 +551,10 @@ static void rpcrdma_inline_pullup(struct rpc_rqst *rqst) * Marshal a request: the primary job of this routine is to choose * the transfer modes. See comments below. * - * Uses multiple RDMA IOVs for a request: - * [0] -- RPC RDMA header, which uses memory from the *start* of the - * preregistered buffer that already holds the RPC data in - * its middle. - * [1] -- the RPC header/data, marshaled by RPC and the NFS protocol. - * [2] -- optional padding. - * [3] -- if padded, header only in [1] and data here. + * Prepares up to two IOVs per Call message: + * + * [0] -- RPC RDMA header + * [1] -- the RPC header/data * * Returns zero on success, otherwise a negative errno. */ @@ -457,24 +565,17 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) struct rpc_xprt *xprt = rqst->rq_xprt; struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); struct rpcrdma_req *req = rpcr_to_rdmar(rqst); - char *base; - size_t rpclen; - ssize_t hdrlen; enum rpcrdma_chunktype rtype, wtype; struct rpcrdma_msg *headerp; + ssize_t hdrlen; + size_t rpclen; + __be32 *iptr; #if defined(CONFIG_SUNRPC_BACKCHANNEL) if (test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state)) return rpcrdma_bc_marshal_reply(rqst); #endif - /* - * rpclen gets amount of data in first buffer, which is the - * pre-registered buffer. - */ - base = rqst->rq_svec[0].iov_base; - rpclen = rqst->rq_svec[0].iov_len; - headerp = rdmab_to_msg(req->rl_rdmabuf); /* don't byte-swap XID, it's already done in request */ headerp->rm_xid = rqst->rq_xid; @@ -485,15 +586,16 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) /* * Chunks needed for results? * - * o Read ops return data as write chunk(s), header as inline. * o If the expected result is under the inline threshold, all ops * return as inline. + * o Large read ops return data as write chunk(s), header as + * inline. * o Large non-read ops return as a single reply chunk. */ - if (rqst->rq_rcv_buf.flags & XDRBUF_READ) - wtype = rpcrdma_writech; - else if (rpcrdma_results_inline(rqst)) + if (rpcrdma_results_inline(r_xprt, rqst)) wtype = rpcrdma_noch; + else if (rqst->rq_rcv_buf.flags & XDRBUF_READ) + wtype = rpcrdma_writech; else wtype = rpcrdma_replych; @@ -511,10 +613,14 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * that both has a data payload, and whose non-data arguments * by themselves are larger than the inline threshold. */ - if (rpcrdma_args_inline(rqst)) { + if (rpcrdma_args_inline(r_xprt, rqst)) { rtype = rpcrdma_noch; + rpcrdma_inline_pullup(rqst); + rpclen = rqst->rq_svec[0].iov_len; } else if (rqst->rq_snd_buf.flags & XDRBUF_WRITE) { rtype = rpcrdma_readch; + rpclen = rqst->rq_svec[0].iov_len; + rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf); } else { r_xprt->rx_stats.nomsg_call_count++; headerp->rm_type = htonl(RDMA_NOMSG); @@ -522,57 +628,50 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) rpclen = 0; } - /* The following simplification is not true forever */ - if (rtype != rpcrdma_noch && wtype == rpcrdma_replych) - wtype = rpcrdma_noch; - if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) { - dprintk("RPC: %s: cannot marshal multiple chunk lists\n", - __func__); - return -EIO; - } - - hdrlen = RPCRDMA_HDRLEN_MIN; - - /* - * Pull up any extra send data into the preregistered buffer. - * When padding is in use and applies to the transfer, insert - * it and change the message type. + /* This implementation supports the following combinations + * of chunk lists in one RPC-over-RDMA Call message: + * + * - Read list + * - Write list + * - Reply chunk + * - Read list + Reply chunk + * + * It might not yet support the following combinations: + * + * - Read list + Write list + * + * It does not support the following combinations: + * + * - Write list + Reply chunk + * - Read list + Write list + Reply chunk + * + * This implementation supports only a single chunk in each + * Read or Write list. Thus for example the client cannot + * send a Call message with a Position Zero Read chunk and a + * regular Read chunk at the same time. */ - if (rtype == rpcrdma_noch) { - - rpcrdma_inline_pullup(rqst); - - headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero; - headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero; - headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero; - /* new length after pullup */ - rpclen = rqst->rq_svec[0].iov_len; - } else if (rtype == rpcrdma_readch) - rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf); - if (rtype != rpcrdma_noch) { - hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf, - headerp, rtype); - wtype = rtype; /* simplify dprintk */ - - } else if (wtype != rpcrdma_noch) { - hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf, - headerp, wtype); - } - if (hdrlen < 0) - return hdrlen; + req->rl_nchunks = 0; + req->rl_nextseg = req->rl_segments; + iptr = headerp->rm_body.rm_chunks; + iptr = rpcrdma_encode_read_list(r_xprt, req, rqst, iptr, rtype); + if (IS_ERR(iptr)) + goto out_unmap; + iptr = rpcrdma_encode_write_list(r_xprt, req, rqst, iptr, wtype); + if (IS_ERR(iptr)) + goto out_unmap; + iptr = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, iptr, wtype); + if (IS_ERR(iptr)) + goto out_unmap; + hdrlen = (unsigned char *)iptr - (unsigned char *)headerp; + + if (hdrlen + rpclen > RPCRDMA_INLINE_WRITE_THRESHOLD(rqst)) + goto out_overflow; + + dprintk("RPC: %5u %s: %s/%s: hdrlen %zd rpclen %zd\n", + rqst->rq_task->tk_pid, __func__, + transfertypes[rtype], transfertypes[wtype], + hdrlen, rpclen); - dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd" - " headerp 0x%p base 0x%p lkey 0x%x\n", - __func__, transfertypes[wtype], hdrlen, rpclen, - headerp, base, rdmab_lkey(req->rl_rdmabuf)); - - /* - * initialize send_iov's - normally only two: rdma chunk header and - * single preregistered RPC header buffer, but if padding is present, - * then use a preregistered (and zeroed) pad buffer between the RPC - * header and any write data. In all non-rdma cases, any following - * data has been copied into the RPC header buffer. - */ req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf); req->rl_send_iov[0].length = hdrlen; req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf); @@ -587,6 +686,18 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) req->rl_niovs = 2; return 0; + +out_overflow: + pr_err("rpcrdma: send overflow: hdrlen %zd rpclen %zu %s/%s\n", + hdrlen, rpclen, transfertypes[rtype], transfertypes[wtype]); + /* Terminate this RPC. Chunks registered above will be + * released by xprt_release -> xprt_rmda_free . + */ + return -EIO; + +out_unmap: + r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false); + return PTR_ERR(iptr); } /* diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c index 765bca47c..0ba9887f3 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_marshal.c +++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c @@ -145,19 +145,32 @@ static __be32 *decode_reply_array(__be32 *va, __be32 *vaend) return (__be32 *)&ary->wc_array[nchunks]; } -int svc_rdma_xdr_decode_req(struct rpcrdma_msg *rmsgp, struct svc_rqst *rqstp) +/** + * svc_rdma_xdr_decode_req - Parse incoming RPC-over-RDMA header + * @rq_arg: Receive buffer + * + * On entry, xdr->head[0].iov_base points to first byte in the + * RPC-over-RDMA header. + * + * On successful exit, head[0] points to first byte past the + * RPC-over-RDMA header. For RDMA_MSG, this is the RPC message. + * The length of the RPC-over-RDMA header is returned. + */ +int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg) { + struct rpcrdma_msg *rmsgp; __be32 *va, *vaend; unsigned int len; u32 hdr_len; /* Verify that there's enough bytes for header + something */ - if (rqstp->rq_arg.len <= RPCRDMA_HDRLEN_ERR) { + if (rq_arg->len <= RPCRDMA_HDRLEN_ERR) { dprintk("svcrdma: header too short = %d\n", - rqstp->rq_arg.len); + rq_arg->len); return -EINVAL; } + rmsgp = (struct rpcrdma_msg *)rq_arg->head[0].iov_base; if (rmsgp->rm_vers != rpcrdma_version) { dprintk("%s: bad version %u\n", __func__, be32_to_cpu(rmsgp->rm_vers)); @@ -189,10 +202,10 @@ int svc_rdma_xdr_decode_req(struct rpcrdma_msg *rmsgp, struct svc_rqst *rqstp) be32_to_cpu(rmsgp->rm_body.rm_padded.rm_thresh); va = &rmsgp->rm_body.rm_padded.rm_pempty[4]; - rqstp->rq_arg.head[0].iov_base = va; + rq_arg->head[0].iov_base = va; len = (u32)((unsigned long)va - (unsigned long)rmsgp); - rqstp->rq_arg.head[0].iov_len -= len; - if (len > rqstp->rq_arg.len) + rq_arg->head[0].iov_len -= len; + if (len > rq_arg->len) return -EINVAL; return len; default: @@ -205,7 +218,7 @@ int svc_rdma_xdr_decode_req(struct rpcrdma_msg *rmsgp, struct svc_rqst *rqstp) * chunk list and a reply chunk list. */ va = &rmsgp->rm_body.rm_chunks[0]; - vaend = (__be32 *)((unsigned long)rmsgp + rqstp->rq_arg.len); + vaend = (__be32 *)((unsigned long)rmsgp + rq_arg->len); va = decode_read_list(va, vaend); if (!va) { dprintk("svcrdma: failed to decode read list\n"); @@ -222,10 +235,9 @@ int svc_rdma_xdr_decode_req(struct rpcrdma_msg *rmsgp, struct svc_rqst *rqstp) return -EINVAL; } - rqstp->rq_arg.head[0].iov_base = va; + rq_arg->head[0].iov_base = va; hdr_len = (unsigned long)va - (unsigned long)rmsgp; - rqstp->rq_arg.head[0].iov_len -= hdr_len; - + rq_arg->head[0].iov_len -= hdr_len; return hdr_len; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 3b24a646e..2c25606f2 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -281,7 +281,7 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt, } atomic_inc(&xprt->sc_dma_used); - n = ib_map_mr_sg(frmr->mr, frmr->sg, frmr->sg_nents, PAGE_SIZE); + n = ib_map_mr_sg(frmr->mr, frmr->sg, frmr->sg_nents, NULL, PAGE_SIZE); if (unlikely(n != frmr->sg_nents)) { pr_err("svcrdma: failed to map mr %p (%d/%d elements)\n", frmr->mr, n, frmr->sg_nents); @@ -447,10 +447,8 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt, head->arg.len = rqstp->rq_arg.len; head->arg.buflen = rqstp->rq_arg.buflen; - ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; - position = be32_to_cpu(ch->rc_position); - /* RDMA_NOMSG: RDMA READ data should land just after RDMA RECV data */ + position = be32_to_cpu(ch->rc_position); if (position == 0) { head->arg.pages = &head->pages[0]; page_offset = head->byte_len; @@ -488,7 +486,7 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt, if (page_offset & 3) { u32 pad = 4 - (page_offset & 3); - head->arg.page_len += pad; + head->arg.tail[0].iov_len += pad; head->arg.len += pad; head->arg.buflen += pad; page_offset += pad; @@ -510,11 +508,10 @@ static int rdma_read_chunks(struct svcxprt_rdma *xprt, return ret; } -static int rdma_read_complete(struct svc_rqst *rqstp, - struct svc_rdma_op_ctxt *head) +static void rdma_read_complete(struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *head) { int page_no; - int ret; /* Copy RPC pages */ for (page_no = 0; page_no < head->count; page_no++) { @@ -550,23 +547,6 @@ static int rdma_read_complete(struct svc_rqst *rqstp, rqstp->rq_arg.tail[0] = head->arg.tail[0]; rqstp->rq_arg.len = head->arg.len; rqstp->rq_arg.buflen = head->arg.buflen; - - /* Free the context */ - svc_rdma_put_context(head, 0); - - /* XXX: What should this be? */ - rqstp->rq_prot = IPPROTO_MAX; - svc_xprt_copy_addrs(rqstp, rqstp->rq_xprt); - - ret = rqstp->rq_arg.head[0].iov_len - + rqstp->rq_arg.page_len - + rqstp->rq_arg.tail[0].iov_len; - dprintk("svcrdma: deferred read ret=%d, rq_arg.len=%u, " - "rq_arg.head[0].iov_base=%p, rq_arg.head[0].iov_len=%zu\n", - ret, rqstp->rq_arg.len, rqstp->rq_arg.head[0].iov_base, - rqstp->rq_arg.head[0].iov_len); - - return ret; } /* By convention, backchannel calls arrive via rdma_msg type @@ -624,7 +604,8 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) dto_q); list_del_init(&ctxt->dto_q); spin_unlock_bh(&rdma_xprt->sc_rq_dto_lock); - return rdma_read_complete(rqstp, ctxt); + rdma_read_complete(rqstp, ctxt); + goto complete; } else if (!list_empty(&rdma_xprt->sc_rq_dto_q)) { ctxt = list_entry(rdma_xprt->sc_rq_dto_q.next, struct svc_rdma_op_ctxt, @@ -655,7 +636,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) /* Decode the RDMA header. */ rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base; - ret = svc_rdma_xdr_decode_req(rmsgp, rqstp); + ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg); if (ret < 0) goto out_err; if (ret == 0) @@ -682,6 +663,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) return 0; } +complete: ret = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len + rqstp->rq_arg.tail[0].iov_len; diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 4f1b1c4f4..54d533300 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -463,25 +463,21 @@ static int send_reply(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp, struct page *page, struct rpcrdma_msg *rdma_resp, - struct svc_rdma_op_ctxt *ctxt, struct svc_rdma_req_map *vec, int byte_count) { + struct svc_rdma_op_ctxt *ctxt; struct ib_send_wr send_wr; u32 xdr_off; int sge_no; int sge_bytes; int page_no; int pages; - int ret; - - ret = svc_rdma_repost_recv(rdma, GFP_KERNEL); - if (ret) { - svc_rdma_put_context(ctxt, 0); - return -ENOTCONN; - } + int ret = -EIO; /* Prepare the context */ + ctxt = svc_rdma_get_context(rdma); + ctxt->direction = DMA_TO_DEVICE; ctxt->pages[0] = page; ctxt->count = 1; @@ -565,8 +561,7 @@ static int send_reply(struct svcxprt_rdma *rdma, err: svc_rdma_unmap_dma(ctxt); svc_rdma_put_context(ctxt, 1); - pr_err("svcrdma: failed to send reply, rc=%d\n", ret); - return -EIO; + return ret; } void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp) @@ -585,7 +580,6 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) int ret; int inline_bytes; struct page *res_page; - struct svc_rdma_op_ctxt *ctxt; struct svc_rdma_req_map *vec; dprintk("svcrdma: sending response for rqstp=%p\n", rqstp); @@ -598,8 +592,6 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) rp_ary = svc_rdma_get_reply_array(rdma_argp, wr_ary); /* Build an req vec for the XDR */ - ctxt = svc_rdma_get_context(rdma); - ctxt->direction = DMA_TO_DEVICE; vec = svc_rdma_get_req_map(rdma); ret = svc_rdma_map_xdr(rdma, &rqstp->rq_res, vec, wr_ary != NULL); if (ret) @@ -635,7 +627,12 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) inline_bytes -= ret; } - ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, vec, + /* Post a fresh Receive buffer _before_ sending the reply */ + ret = svc_rdma_post_recv(rdma, GFP_KERNEL); + if (ret) + goto err1; + + ret = send_reply(rdma, rqstp, res_page, rdma_resp, vec, inline_bytes); if (ret < 0) goto err1; @@ -648,7 +645,8 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) put_page(res_page); err0: svc_rdma_put_req_map(rdma, vec); - svc_rdma_put_context(ctxt, 0); + pr_err("svcrdma: Could not send reply, err=%d. Closing transport.\n", + ret); set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); return -ENOTCONN; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 90668969d..dd9440137 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -789,7 +789,7 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, int ret; dprintk("svcrdma: Creating RDMA socket\n"); - if (sa->sa_family != AF_INET) { + if ((sa->sa_family != AF_INET) && (sa->sa_family != AF_INET6)) { dprintk("svcrdma: Address family %d is not supported.\n", sa->sa_family); return ERR_PTR(-EAFNOSUPPORT); } @@ -805,6 +805,16 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, goto err0; } + /* Allow both IPv4 and IPv6 sockets to bind a single port + * at the same time. + */ +#if IS_ENABLED(CONFIG_IPV6) + ret = rdma_set_afonly(listen_id, 1); + if (ret) { + dprintk("svcrdma: rdma_set_afonly failed = %d\n", ret); + goto err1; + } +#endif ret = rdma_bind_addr(listen_id, sa); if (ret) { dprintk("svcrdma: rdma_bind_addr failed = %d\n", ret); @@ -1073,7 +1083,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV; /* Post receive buffers */ - for (i = 0; i < newxprt->sc_rq_depth; i++) { + for (i = 0; i < newxprt->sc_max_requests; i++) { ret = svc_rdma_post_recv(newxprt, GFP_KERNEL); if (ret) { dprintk("svcrdma: failure posting receive buffers\n"); @@ -1170,6 +1180,9 @@ static void __svc_rdma_free(struct work_struct *work) dprintk("svcrdma: %s(%p)\n", __func__, rdma); + if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) + ib_drain_qp(rdma->sc_qp); + /* We should only be called from kref_put */ if (atomic_read(&xprt->xpt_ref.refcount) != 0) pr_err("svcrdma: sc_xprt still in use? (%d)\n", diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index b1b009f10..99d2e5b72 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -73,6 +73,8 @@ static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR; static unsigned int min_slot_table_size = RPCRDMA_MIN_SLOT_TABLE; static unsigned int max_slot_table_size = RPCRDMA_MAX_SLOT_TABLE; +static unsigned int min_inline_size = RPCRDMA_MIN_INLINE; +static unsigned int max_inline_size = RPCRDMA_MAX_INLINE; static unsigned int zero; static unsigned int max_padding = PAGE_SIZE; static unsigned int min_memreg = RPCRDMA_BOUNCEBUFFERS; @@ -96,6 +98,8 @@ static struct ctl_table xr_tunables_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec, + .extra1 = &min_inline_size, + .extra2 = &max_inline_size, }, { .procname = "rdma_max_inline_write", @@ -103,6 +107,8 @@ static struct ctl_table xr_tunables_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec, + .extra1 = &min_inline_size, + .extra2 = &max_inline_size, }, { .procname = "rdma_inline_write_padding", @@ -508,6 +514,7 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size) out: dprintk("RPC: %s: size %zd, request 0x%p\n", __func__, size, req); req->rl_connect_cookie = 0; /* our reserved value */ + req->rl_task = task; return req->rl_sendbuf->rg_base; out_rdmabuf: @@ -564,7 +571,6 @@ xprt_rdma_free(void *buffer) struct rpcrdma_req *req; struct rpcrdma_xprt *r_xprt; struct rpcrdma_regbuf *rb; - int i; if (buffer == NULL) return; @@ -578,11 +584,8 @@ xprt_rdma_free(void *buffer) dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply); - for (i = 0; req->rl_nchunks;) { - --req->rl_nchunks; - i += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt, - &req->rl_segments[i]); - } + r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, + !RPC_IS_ASYNC(req->rl_task)); rpcrdma_buffer_put(req); } @@ -707,6 +710,7 @@ static struct rpc_xprt_ops xprt_rdma_procs = { #if defined(CONFIG_SUNRPC_BACKCHANNEL) .bc_setup = xprt_rdma_bc_setup, .bc_up = xprt_rdma_bc_up, + .bc_maxpayload = xprt_rdma_bc_maxpayload, .bc_free_rqst = xprt_rdma_bc_free_rqst, .bc_destroy = xprt_rdma_bc_destroy, #endif diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index f5ed9f982..b044d98a1 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -203,15 +203,6 @@ out_fail: goto out_schedule; } -static void -rpcrdma_flush_cqs(struct rpcrdma_ep *ep) -{ - struct ib_wc wc; - - while (ib_poll_cq(ep->rep_attr.recv_cq, 1, &wc) > 0) - rpcrdma_receive_wc(NULL, &wc); -} - static int rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) { @@ -374,23 +365,6 @@ out: } /* - * Drain any cq, prior to teardown. - */ -static void -rpcrdma_clean_cq(struct ib_cq *cq) -{ - struct ib_wc wc; - int count = 0; - - while (1 == ib_poll_cq(cq, 1, &wc)) - ++count; - - if (count) - dprintk("RPC: %s: flushed %d events (last 0x%x)\n", - __func__, count, wc.opcode); -} - -/* * Exported functions. */ @@ -459,7 +433,6 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) dprintk("RPC: %s: memory registration strategy is '%s'\n", __func__, ia->ri_ops->ro_displayname); - rwlock_init(&ia->ri_qplock); return 0; out3: @@ -515,7 +488,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, __func__); return -ENOMEM; } - max_qp_wr = ia->ri_device->attrs.max_qp_wr - RPCRDMA_BACKWARD_WRS; + max_qp_wr = ia->ri_device->attrs.max_qp_wr - RPCRDMA_BACKWARD_WRS - 1; /* check provider's send/recv wr limits */ if (cdata->max_requests > max_qp_wr) @@ -526,11 +499,13 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, ep->rep_attr.srq = NULL; ep->rep_attr.cap.max_send_wr = cdata->max_requests; ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS; + ep->rep_attr.cap.max_send_wr += 1; /* drain cqe */ rc = ia->ri_ops->ro_open(ia, ep, cdata); if (rc) return rc; ep->rep_attr.cap.max_recv_wr = cdata->max_requests; ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS; + ep->rep_attr.cap.max_recv_wr += 1; /* drain cqe */ ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_IOVS; ep->rep_attr.cap.max_recv_sge = 1; ep->rep_attr.cap.max_inline_data = 0; @@ -578,6 +553,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, ep->rep_attr.recv_cq = recvcq; /* Initialize cma parameters */ + memset(&ep->rep_remote_cma, 0, sizeof(ep->rep_remote_cma)); /* RPC/RDMA does not use private data */ ep->rep_remote_cma.private_data = NULL; @@ -591,7 +567,16 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, ep->rep_remote_cma.responder_resources = ia->ri_device->attrs.max_qp_rd_atom; - ep->rep_remote_cma.retry_count = 7; + /* Limit transport retries so client can detect server + * GID changes quickly. RPC layer handles re-establishing + * transport connection and retransmission. + */ + ep->rep_remote_cma.retry_count = 6; + + /* RPC-over-RDMA handles its own flow control. In addition, + * make all RNR NAKs visible so we know that RPC-over-RDMA + * flow control is working correctly (no NAKs should be seen). + */ ep->rep_remote_cma.flow_control = 0; ep->rep_remote_cma.rnr_retry_count = 0; @@ -622,13 +607,8 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) cancel_delayed_work_sync(&ep->rep_connect_worker); - if (ia->ri_id->qp) - rpcrdma_ep_disconnect(ep, ia); - - rpcrdma_clean_cq(ep->rep_attr.recv_cq); - rpcrdma_clean_cq(ep->rep_attr.send_cq); - if (ia->ri_id->qp) { + rpcrdma_ep_disconnect(ep, ia); rdma_destroy_qp(ia->ri_id); ia->ri_id->qp = NULL; } @@ -659,7 +639,6 @@ retry: dprintk("RPC: %s: reconnecting...\n", __func__); rpcrdma_ep_disconnect(ep, ia); - rpcrdma_flush_cqs(ep); xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); id = rpcrdma_create_id(xprt, ia, @@ -692,10 +671,8 @@ retry: goto out; } - write_lock(&ia->ri_qplock); old = ia->ri_id; ia->ri_id = id; - write_unlock(&ia->ri_qplock); rdma_destroy_qp(old); rpcrdma_destroy_id(old); @@ -785,7 +762,6 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { int rc; - rpcrdma_flush_cqs(ep); rc = rdma_disconnect(ia->ri_id); if (!rc) { /* returns without wait if not connected */ @@ -797,6 +773,8 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc); ep->rep_connected = rc; } + + ib_drain_qp(ia->ri_id->qp); } struct rpcrdma_req * @@ -1271,25 +1249,3 @@ out_rc: rpcrdma_recv_buffer_put(rep); return rc; } - -/* How many chunk list items fit within our inline buffers? - */ -unsigned int -rpcrdma_max_segments(struct rpcrdma_xprt *r_xprt) -{ - struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; - int bytes, segments; - - bytes = min_t(unsigned int, cdata->inline_wsize, cdata->inline_rsize); - bytes -= RPCRDMA_HDRLEN_MIN; - if (bytes < sizeof(struct rpcrdma_segment) * 2) { - pr_warn("RPC: %s: inline threshold too small\n", - __func__); - return 0; - } - - segments = 1 << (fls(bytes / sizeof(struct rpcrdma_segment)) - 1); - dprintk("RPC: %s: max chunk list size = %d segments\n", - __func__, segments); - return segments; -} diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 2ebc743cb..95cdc6622 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -65,7 +65,6 @@ */ struct rpcrdma_ia { const struct rpcrdma_memreg_ops *ri_ops; - rwlock_t ri_qplock; struct ib_device *ri_device; struct rdma_cm_id *ri_id; struct ib_pd *ri_pd; @@ -73,6 +72,8 @@ struct rpcrdma_ia { struct completion ri_done; int ri_async_rc; unsigned int ri_max_frmr_depth; + unsigned int ri_max_inline_write; + unsigned int ri_max_inline_read; struct ib_qp_attr ri_qp_attr; struct ib_qp_init_attr ri_qp_init_attr; }; @@ -144,6 +145,26 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb) #define RPCRDMA_DEF_GFP (GFP_NOIO | __GFP_NOWARN) +/* To ensure a transport can always make forward progress, + * the number of RDMA segments allowed in header chunk lists + * is capped at 8. This prevents less-capable devices and + * memory registrations from overrunning the Send buffer + * while building chunk lists. + * + * Elements of the Read list take up more room than the + * Write list or Reply chunk. 8 read segments means the Read + * list (or Write list or Reply chunk) cannot consume more + * than + * + * ((8 + 2) * read segment size) + 1 XDR words, or 244 bytes. + * + * And the fixed part of the header is another 24 bytes. + * + * The smallest inline threshold is 1024 bytes, ensuring that + * at least 750 bytes are available for RPC messages. + */ +#define RPCRDMA_MAX_HDR_SEGS (8) + /* * struct rpcrdma_rep -- this structure encapsulates state required to recv * and complete a reply, asychronously. It needs several pieces of @@ -162,7 +183,9 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb) */ #define RPCRDMA_MAX_DATA_SEGS ((1 * 1024 * 1024) / PAGE_SIZE) -#define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 2) /* head+tail = 2 */ + +/* data segments + head/tail for Call + head/tail for Reply */ +#define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 4) struct rpcrdma_buffer; @@ -198,14 +221,13 @@ enum rpcrdma_frmr_state { }; struct rpcrdma_frmr { - struct scatterlist *sg; - int sg_nents; + struct scatterlist *fr_sg; + int fr_nents; + enum dma_data_direction fr_dir; struct ib_mr *fr_mr; struct ib_cqe fr_cqe; enum rpcrdma_frmr_state fr_state; struct completion fr_linv_done; - struct work_struct fr_work; - struct rpcrdma_xprt *fr_xprt; union { struct ib_reg_wr fr_regwr; struct ib_send_wr fr_invwr; @@ -222,6 +244,8 @@ struct rpcrdma_mw { struct rpcrdma_fmr fmr; struct rpcrdma_frmr frmr; }; + struct work_struct mw_work; + struct rpcrdma_xprt *mw_xprt; struct list_head mw_list; struct list_head mw_all; }; @@ -270,12 +294,14 @@ struct rpcrdma_req { unsigned int rl_niovs; unsigned int rl_nchunks; unsigned int rl_connect_cookie; + struct rpc_task *rl_task; struct rpcrdma_buffer *rl_buffer; struct rpcrdma_rep *rl_reply;/* holder for reply buffer */ struct ib_sge rl_send_iov[RPCRDMA_MAX_IOVS]; struct rpcrdma_regbuf *rl_rdmabuf; struct rpcrdma_regbuf *rl_sendbuf; struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; + struct rpcrdma_mr_seg *rl_nextseg; struct ib_cqe rl_cqe; struct list_head rl_all; @@ -372,8 +398,8 @@ struct rpcrdma_memreg_ops { struct rpcrdma_mr_seg *, int, bool); void (*ro_unmap_sync)(struct rpcrdma_xprt *, struct rpcrdma_req *); - int (*ro_unmap)(struct rpcrdma_xprt *, - struct rpcrdma_mr_seg *); + void (*ro_unmap_safe)(struct rpcrdma_xprt *, + struct rpcrdma_req *, bool); int (*ro_open)(struct rpcrdma_ia *, struct rpcrdma_ep *, struct rpcrdma_create_data_internal *); @@ -456,7 +482,6 @@ struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *, void rpcrdma_free_regbuf(struct rpcrdma_ia *, struct rpcrdma_regbuf *); -unsigned int rpcrdma_max_segments(struct rpcrdma_xprt *); int rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *, unsigned int); int frwr_alloc_recovery_wq(void); @@ -519,6 +544,9 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *); * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c */ int rpcrdma_marshal_req(struct rpc_rqst *); +void rpcrdma_set_max_header_sizes(struct rpcrdma_ia *, + struct rpcrdma_create_data_internal *, + unsigned int); /* RPC/RDMA module init - xprtrdma/transport.c */ @@ -534,6 +562,7 @@ void xprt_rdma_cleanup(void); #if defined(CONFIG_SUNRPC_BACKCHANNEL) int xprt_rdma_bc_setup(struct rpc_xprt *, unsigned int); int xprt_rdma_bc_up(struct svc_serv *, struct net *); +size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *); int rpcrdma_bc_post_recv(struct rpcrdma_xprt *, unsigned int); void rpcrdma_bc_receive_call(struct rpcrdma_xprt *, struct rpcrdma_rep *); int rpcrdma_bc_marshal_reply(struct rpc_rqst *); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index e9e5dd0dc..7e2b2fa18 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -995,15 +995,14 @@ static void xs_udp_data_read_skb(struct rpc_xprt *xprt, u32 _xid; __be32 *xp; - repsize = skb->len - sizeof(struct udphdr); + repsize = skb->len; if (repsize < 4) { dprintk("RPC: impossible RPC reply size %d!\n", repsize); return; } /* Copy the XID from the skb... */ - xp = skb_header_pointer(skb, sizeof(struct udphdr), - sizeof(_xid), &_xid); + xp = skb_header_pointer(skb, 0, sizeof(_xid), &_xid); if (xp == NULL) return; @@ -1019,11 +1018,11 @@ static void xs_udp_data_read_skb(struct rpc_xprt *xprt, /* Suck it into the iovec, verify checksum if not done by hw. */ if (csum_partial_copy_to_xdr(&rovr->rq_private_buf, skb)) { - UDPX_INC_STATS_BH(sk, UDP_MIB_INERRORS); + __UDPX_INC_STATS(sk, UDP_MIB_INERRORS); goto out_unlock; } - UDPX_INC_STATS_BH(sk, UDP_MIB_INDATAGRAMS); + __UDPX_INC_STATS(sk, UDP_MIB_INDATAGRAMS); xprt_adjust_cwnd(xprt, task, copied); xprt_complete_rqst(task, copied); @@ -1365,6 +1364,11 @@ static int xs_tcp_bc_up(struct svc_serv *serv, struct net *net) return ret; return 0; } + +static size_t xs_tcp_bc_maxpayload(struct rpc_xprt *xprt) +{ + return PAGE_SIZE; +} #else static inline int _xs_tcp_read_data(struct rpc_xprt *xprt, struct xdr_skb_reader *desc) @@ -1881,8 +1885,7 @@ static inline void xs_reclassify_socket6(struct socket *sock) static inline void xs_reclassify_socket(int family, struct socket *sock) { - WARN_ON_ONCE(sock_owned_by_user(sock->sk)); - if (sock_owned_by_user(sock->sk)) + if (WARN_ON_ONCE(!sock_allow_reclassification(sock->sk))) return; switch (family) { @@ -1952,6 +1955,7 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt, sk->sk_user_data = xprt; sk->sk_data_ready = xs_data_ready; sk->sk_write_space = xs_udp_write_space; + sock_set_flag(sk, SOCK_FASYNC); sk->sk_error_report = xs_error_report; sk->sk_allocation = GFP_NOIO; @@ -2138,6 +2142,7 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) sk->sk_user_data = xprt; sk->sk_data_ready = xs_data_ready; sk->sk_write_space = xs_udp_write_space; + sock_set_flag(sk, SOCK_FASYNC); sk->sk_allocation = GFP_NOIO; xprt_set_connected(xprt); @@ -2239,6 +2244,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) sk->sk_data_ready = xs_tcp_data_ready; sk->sk_state_change = xs_tcp_state_change; sk->sk_write_space = xs_tcp_write_space; + sock_set_flag(sk, SOCK_FASYNC); sk->sk_error_report = xs_error_report; sk->sk_allocation = GFP_NOIO; @@ -2660,6 +2666,7 @@ static struct rpc_xprt_ops xs_tcp_ops = { #ifdef CONFIG_SUNRPC_BACKCHANNEL .bc_setup = xprt_setup_bc, .bc_up = xs_tcp_bc_up, + .bc_maxpayload = xs_tcp_bc_maxpayload, .bc_free_rqst = xprt_free_bc_rqst, .bc_destroy = xprt_destroy_bc, #endif diff --git a/net/switchdev/Kconfig b/net/switchdev/Kconfig index 86a47e17c..651fa201a 100644 --- a/net/switchdev/Kconfig +++ b/net/switchdev/Kconfig @@ -3,7 +3,7 @@ # config NET_SWITCHDEV - bool "Switch (and switch-ish) device support (EXPERIMENTAL)" + bool "Switch (and switch-ish) device support" depends on INET ---help--- This module provides glue between core networking code and device diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 27a540621..a597708ae 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -205,6 +205,7 @@ static int tipc_enable_bearer(struct net *net, const char *name, struct tipc_bearer *b; struct tipc_media *m; struct tipc_bearer_names b_names; + struct sk_buff *skb; char addr_string[16]; u32 bearer_id; u32 with_this_prio; @@ -301,7 +302,7 @@ restart: b->net_plane = bearer_id + 'A'; b->priority = priority; - res = tipc_disc_create(net, b, &b->bcast_addr); + res = tipc_disc_create(net, b, &b->bcast_addr, &skb); if (res) { bearer_disable(net, b); pr_warn("Bearer <%s> rejected, discovery object creation failed\n", @@ -310,7 +311,8 @@ restart: } rcu_assign_pointer(tn->bearer_list[bearer_id], b); - + if (skb) + tipc_bearer_xmit_skb(net, bearer_id, skb, &b->bcast_addr); pr_info("Enabled bearer <%s>, discovery domain %s, priority %u\n", name, tipc_addr_string_fill(addr_string, disc_domain), priority); @@ -328,6 +330,21 @@ static int tipc_reset_bearer(struct net *net, struct tipc_bearer *b) return 0; } +/* tipc_bearer_reset_all - reset all links on all bearers + */ +void tipc_bearer_reset_all(struct net *net) +{ + struct tipc_net *tn = tipc_net(net); + struct tipc_bearer *b; + int i; + + for (i = 0; i < MAX_BEARERS; i++) { + b = rcu_dereference_rtnl(tn->bearer_list[i]); + if (b) + tipc_reset_bearer(net, b); + } +} + /** * bearer_disable * @@ -335,23 +352,16 @@ static int tipc_reset_bearer(struct net *net, struct tipc_bearer *b) */ static void bearer_disable(struct net *net, struct tipc_bearer *b) { - struct tipc_net *tn = net_generic(net, tipc_net_id); - u32 i; + struct tipc_net *tn = tipc_net(net); + int bearer_id = b->identity; pr_info("Disabling bearer <%s>\n", b->name); b->media->disable_media(b); - - tipc_node_delete_links(net, b->identity); + tipc_node_delete_links(net, bearer_id); RCU_INIT_POINTER(b->media_ptr, NULL); if (b->link_req) tipc_disc_delete(b->link_req); - - for (i = 0; i < MAX_BEARERS; i++) { - if (b == rtnl_dereference(tn->bearer_list[i])) { - RCU_INIT_POINTER(tn->bearer_list[i], NULL); - break; - } - } + RCU_INIT_POINTER(tn->bearer_list[bearer_id], NULL); kfree_rcu(b, rcu); } @@ -394,7 +404,7 @@ void tipc_disable_l2_media(struct tipc_bearer *b) /** * tipc_l2_send_msg - send a TIPC packet out over an L2 interface - * @buf: the packet to be sent + * @skb: the packet to be sent * @b: the bearer through which the packet is to be sent * @dest: peer destination address */ @@ -403,17 +413,21 @@ int tipc_l2_send_msg(struct net *net, struct sk_buff *skb, { struct net_device *dev; int delta; + void *tipc_ptr; dev = (struct net_device *)rcu_dereference_rtnl(b->media_ptr); if (!dev) return 0; + /* Send RESET message even if bearer is detached from device */ + tipc_ptr = rcu_dereference_rtnl(dev->tipc_ptr); + if (unlikely(!tipc_ptr && !msg_is_reset(buf_msg(skb)))) + goto drop; + delta = dev->hard_header_len - skb_headroom(skb); if ((delta > 0) && - pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) { - kfree_skb(skb); - return 0; - } + pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) + goto drop; skb_reset_network_header(skb); skb->dev = dev; @@ -422,6 +436,9 @@ int tipc_l2_send_msg(struct net *net, struct sk_buff *skb, dev->dev_addr, skb->len); dev_queue_xmit(skb); return 0; +drop: + kfree_skb(skb); + return 0; } int tipc_bearer_mtu(struct net *net, u32 bearer_id) @@ -450,6 +467,8 @@ void tipc_bearer_xmit_skb(struct net *net, u32 bearer_id, b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]); if (likely(b)) b->media->send_msg(net, skb, b, dest); + else + kfree_skb(skb); rcu_read_unlock(); } @@ -468,11 +487,11 @@ void tipc_bearer_xmit(struct net *net, u32 bearer_id, rcu_read_lock(); b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]); - if (likely(b)) { - skb_queue_walk_safe(xmitq, skb, tmp) { - __skb_dequeue(xmitq); - b->media->send_msg(net, skb, b, dst); - } + if (unlikely(!b)) + __skb_queue_purge(xmitq); + skb_queue_walk_safe(xmitq, skb, tmp) { + __skb_dequeue(xmitq); + b->media->send_msg(net, skb, b, dst); } rcu_read_unlock(); } @@ -490,14 +509,14 @@ void tipc_bearer_bc_xmit(struct net *net, u32 bearer_id, rcu_read_lock(); b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]); - if (likely(b)) { - skb_queue_walk_safe(xmitq, skb, tmp) { - hdr = buf_msg(skb); - msg_set_non_seq(hdr, 1); - msg_set_mc_netid(hdr, net_id); - __skb_dequeue(xmitq); - b->media->send_msg(net, skb, b, &b->bcast_addr); - } + if (unlikely(!b)) + __skb_queue_purge(xmitq); + skb_queue_walk_safe(xmitq, skb, tmp) { + hdr = buf_msg(skb); + msg_set_non_seq(hdr, 1); + msg_set_mc_netid(hdr, net_id); + __skb_dequeue(xmitq); + b->media->send_msg(net, skb, b, &b->bcast_addr); } rcu_read_unlock(); } @@ -513,24 +532,21 @@ void tipc_bearer_bc_xmit(struct net *net, u32 bearer_id, * ignores packets sent using interface multicast, and traffic sent to other * nodes (which can happen if interface is running in promiscuous mode). */ -static int tipc_l2_rcv_msg(struct sk_buff *buf, struct net_device *dev, +static int tipc_l2_rcv_msg(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct tipc_bearer *b; rcu_read_lock(); b = rcu_dereference_rtnl(dev->tipc_ptr); - if (likely(b)) { - if (likely(buf->pkt_type <= PACKET_BROADCAST)) { - buf->next = NULL; - tipc_rcv(dev_net(dev), buf, b); - rcu_read_unlock(); - return NET_RX_SUCCESS; - } + if (likely(b && (skb->pkt_type <= PACKET_BROADCAST))) { + skb->next = NULL; + tipc_rcv(dev_net(dev), skb, b); + rcu_read_unlock(); + return NET_RX_SUCCESS; } rcu_read_unlock(); - - kfree_skb(buf); + kfree_skb(skb); return NET_RX_DROP; } @@ -548,9 +564,18 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt, { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); + struct tipc_net *tn = tipc_net(net); struct tipc_bearer *b; + int i; b = rtnl_dereference(dev->tipc_ptr); + if (!b) { + for (i = 0; i < MAX_BEARERS; b = NULL, i++) { + b = rtnl_dereference(tn->bearer_list[i]); + if (b && (b->media_ptr == dev)) + break; + } + } if (!b) return NOTIFY_DONE; @@ -560,13 +585,20 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt, case NETDEV_CHANGE: if (netif_carrier_ok(dev)) break; + case NETDEV_UP: + rcu_assign_pointer(dev->tipc_ptr, b); + break; case NETDEV_GOING_DOWN: + RCU_INIT_POINTER(dev->tipc_ptr, NULL); + synchronize_net(); + tipc_reset_bearer(net, b); + break; case NETDEV_CHANGEMTU: tipc_reset_bearer(net, b); break; case NETDEV_CHANGEADDR: b->media->raw2addr(b, &b->addr, - (char *)dev->dev_addr); + (char *)dev->dev_addr); tipc_reset_bearer(net, b); break; case NETDEV_UNREGISTER: diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index e31820516..60e49c3be 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -42,8 +42,6 @@ #include <net/genetlink.h> #define MAX_MEDIA 3 -#define MAX_NODES 4096 -#define WSIZE 32 /* Identifiers associated with TIPC message header media address info * - address info field is 32 bytes long @@ -62,16 +60,6 @@ #define TIPC_MEDIA_TYPE_UDP 3 /** - * struct tipc_node_map - set of node identifiers - * @count: # of nodes in set - * @map: bitmap of node identifiers that are in the set - */ -struct tipc_node_map { - u32 count; - u32 map[MAX_NODES / WSIZE]; -}; - -/** * struct tipc_media_addr - destination address used by TIPC bearers * @value: address info (format defined by media) * @media_id: TIPC media type identifier @@ -142,7 +130,6 @@ struct tipc_media { * @identity: array index of this bearer within TIPC bearer array * @link_req: ptr to (optional) structure making periodic link setup requests * @net_plane: network plane ('A' through 'H') currently associated with bearer - * @nodes: indicates which nodes in cluster can be reached through bearer * * Note: media-specific code is responsible for initialization of the fields * indicated below when a bearer is enabled; TIPC's generic bearer code takes @@ -163,8 +150,6 @@ struct tipc_bearer { u32 identity; struct tipc_link_req *link_req; char net_plane; - int node_cnt; - struct tipc_node_map nodes; }; struct tipc_bearer_names { @@ -213,6 +198,7 @@ void tipc_bearer_add_dest(struct net *net, u32 bearer_id, u32 dest); void tipc_bearer_remove_dest(struct net *net, u32 bearer_id, u32 dest); struct tipc_bearer *tipc_bearer_find(struct net *net, const char *name); struct tipc_media *tipc_media_find(const char *name); +void tipc_bearer_reset_all(struct net *net); int tipc_bearer_setup(void); void tipc_bearer_cleanup(void); void tipc_bearer_stop(struct net *net); diff --git a/net/tipc/core.c b/net/tipc/core.c index e2bdb07a4..fe1b062c4 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -112,11 +112,9 @@ static int __init tipc_init(void) pr_info("Activated (version " TIPC_MOD_VER ")\n"); - sysctl_tipc_rmem[0] = TIPC_CONN_OVERLOAD_LIMIT >> 4 << - TIPC_LOW_IMPORTANCE; - sysctl_tipc_rmem[1] = TIPC_CONN_OVERLOAD_LIMIT >> 4 << - TIPC_CRITICAL_IMPORTANCE; - sysctl_tipc_rmem[2] = TIPC_CONN_OVERLOAD_LIMIT; + sysctl_tipc_rmem[0] = RCVBUF_MIN; + sysctl_tipc_rmem[1] = RCVBUF_DEF; + sysctl_tipc_rmem[2] = RCVBUF_MAX; err = tipc_netlink_start(); if (err) diff --git a/net/tipc/discover.c b/net/tipc/discover.c index f1e738e80..ad9d477cc 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -268,10 +268,9 @@ exit: * Returns 0 if successful, otherwise -errno. */ int tipc_disc_create(struct net *net, struct tipc_bearer *b, - struct tipc_media_addr *dest) + struct tipc_media_addr *dest, struct sk_buff **skb) { struct tipc_link_req *req; - struct sk_buff *skb; req = kmalloc(sizeof(*req), GFP_ATOMIC); if (!req) @@ -293,9 +292,7 @@ int tipc_disc_create(struct net *net, struct tipc_bearer *b, setup_timer(&req->timer, disc_timeout, (unsigned long)req); mod_timer(&req->timer, jiffies + req->timer_intv); b->link_req = req; - skb = skb_clone(req->buf, GFP_ATOMIC); - if (skb) - tipc_bearer_xmit_skb(net, req->bearer_id, skb, &req->dest); + *skb = skb_clone(req->buf, GFP_ATOMIC); return 0; } diff --git a/net/tipc/discover.h b/net/tipc/discover.h index c9b12770c..b80a33538 100644 --- a/net/tipc/discover.h +++ b/net/tipc/discover.h @@ -40,7 +40,7 @@ struct tipc_link_req; int tipc_disc_create(struct net *net, struct tipc_bearer *b_ptr, - struct tipc_media_addr *dest); + struct tipc_media_addr *dest, struct sk_buff **skb); void tipc_disc_delete(struct tipc_link_req *req); void tipc_disc_reset(struct net *net, struct tipc_bearer *b_ptr); void tipc_disc_add_dest(struct tipc_link_req *req); diff --git a/net/tipc/link.c b/net/tipc/link.c index 7d2bb3e70..7d89f8713 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -140,6 +140,7 @@ struct tipc_link { char if_name[TIPC_MAX_IF_NAME]; u32 priority; char net_plane; + u16 rst_cnt; /* Failover/synch */ u16 drop_point; @@ -348,6 +349,8 @@ void tipc_link_remove_bc_peer(struct tipc_link *snd_l, u16 ack = snd_l->snd_nxt - 1; snd_l->ackers--; + rcv_l->bc_peer_is_up = true; + rcv_l->state = LINK_ESTABLISHED; tipc_link_bc_ack_rcv(rcv_l, ack, xmitq); tipc_link_reset(rcv_l); rcv_l->state = LINK_RESET; @@ -701,40 +704,35 @@ static void link_profile_stats(struct tipc_link *l) /* tipc_link_timeout - perform periodic task as instructed from node timeout */ -/* tipc_link_timeout - perform periodic task as instructed from node timeout - */ int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq) { + int mtyp = 0; int rc = 0; - int mtyp = STATE_MSG; - bool xmit = false; - bool prb = false; + bool state = false; + bool probe = false; + bool setup = false; u16 bc_snt = l->bc_sndlink->snd_nxt - 1; u16 bc_acked = l->bc_rcvlink->acked; - bool bc_up = link_is_up(l->bc_rcvlink); link_profile_stats(l); switch (l->state) { case LINK_ESTABLISHED: case LINK_SYNCHING: - if (!l->silent_intv_cnt) { - if (bc_up && (bc_acked != bc_snt)) - xmit = true; - } else if (l->silent_intv_cnt <= l->abort_limit) { - xmit = true; - prb = true; - } else { - rc |= tipc_link_fsm_evt(l, LINK_FAILURE_EVT); - } + if (l->silent_intv_cnt > l->abort_limit) + return tipc_link_fsm_evt(l, LINK_FAILURE_EVT); + mtyp = STATE_MSG; + state = bc_acked != bc_snt; + probe = l->silent_intv_cnt; l->silent_intv_cnt++; break; case LINK_RESET: - xmit = true; + setup = l->rst_cnt++ <= 4; + setup |= !(l->rst_cnt % 16); mtyp = RESET_MSG; break; case LINK_ESTABLISHING: - xmit = true; + setup = true; mtyp = ACTIVATE_MSG; break; case LINK_PEER_RESET: @@ -745,8 +743,8 @@ int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq) break; } - if (xmit) - tipc_link_build_proto_msg(l, mtyp, prb, 0, 0, 0, xmitq); + if (state || probe || setup) + tipc_link_build_proto_msg(l, mtyp, probe, 0, 0, 0, xmitq); return rc; } @@ -833,6 +831,7 @@ void tipc_link_reset(struct tipc_link *l) l->rcv_nxt = 1; l->acked = 0; l->silent_intv_cnt = 0; + l->rst_cnt = 0; l->stats.recv_info = 0; l->stale_count = 0; l->bc_peer_is_up = false; @@ -1110,12 +1109,12 @@ static bool tipc_link_release_pkts(struct tipc_link *l, u16 acked) return released; } -/* tipc_link_build_ack_msg: prepare link acknowledge message for transmission +/* tipc_link_build_state_msg: prepare link state message for transmission * * Note that sending of broadcast ack is coordinated among nodes, to reduce * risk of ack storms towards the sender */ -int tipc_link_build_ack_msg(struct tipc_link *l, struct sk_buff_head *xmitq) +int tipc_link_build_state_msg(struct tipc_link *l, struct sk_buff_head *xmitq) { if (!l) return 0; @@ -1140,11 +1139,17 @@ int tipc_link_build_ack_msg(struct tipc_link *l, struct sk_buff_head *xmitq) void tipc_link_build_reset_msg(struct tipc_link *l, struct sk_buff_head *xmitq) { int mtyp = RESET_MSG; + struct sk_buff *skb; if (l->state == LINK_ESTABLISHING) mtyp = ACTIVATE_MSG; tipc_link_build_proto_msg(l, mtyp, 0, 0, 0, 0, xmitq); + + /* Inform peer that this endpoint is going down if applicable */ + skb = skb_peek_tail(xmitq); + if (skb && (l->state == LINK_RESET)) + msg_set_peer_stopping(buf_msg(skb), 1); } /* tipc_link_build_nack_msg: prepare link nack message for transmission @@ -1219,7 +1224,7 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, if (!tipc_data_input(l, skb, l->inputq)) rc |= tipc_link_input(l, skb, l->inputq); if (unlikely(++l->rcv_unacked >= TIPC_MIN_LINK_WIN)) - rc |= tipc_link_build_ack_msg(l, xmitq); + rc |= tipc_link_build_state_msg(l, xmitq); if (unlikely(rc & ~TIPC_LINK_SND_BC_ACK)) break; } while ((skb = __skb_dequeue(defq))); @@ -1411,7 +1416,9 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, l->priority = peers_prio; /* ACTIVATE_MSG serves as PEER_RESET if link is already down */ - if ((mtyp == RESET_MSG) || !link_is_up(l)) + if (msg_peer_stopping(hdr)) + rc = tipc_link_fsm_evt(l, LINK_FAILURE_EVT); + else if ((mtyp == RESET_MSG) || !link_is_up(l)) rc = tipc_link_fsm_evt(l, LINK_PEER_RESET_EVT); /* ACTIVATE_MSG takes up link if it was already locally reset */ @@ -1554,7 +1561,12 @@ void tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr, if (!msg_peer_node_is_up(hdr)) return; - l->bc_peer_is_up = true; + /* Open when peer ackowledges our bcast init msg (pkt #1) */ + if (msg_ack(hdr)) + l->bc_peer_is_up = true; + + if (!l->bc_peer_is_up) + return; /* Ignore if peers_snd_nxt goes beyond receive window */ if (more(peers_snd_nxt, l->rcv_nxt + l->window)) diff --git a/net/tipc/link.h b/net/tipc/link.h index 6a94175ee..d7e9d42fc 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -123,7 +123,7 @@ int tipc_nl_parse_link_prop(struct nlattr *prop, struct nlattr *props[]); int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq); int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, struct sk_buff_head *xmitq); -int tipc_link_build_ack_msg(struct tipc_link *l, struct sk_buff_head *xmitq); +int tipc_link_build_state_msg(struct tipc_link *l, struct sk_buff_head *xmitq); void tipc_link_add_bc_peer(struct tipc_link *snd_l, struct tipc_link *uc_l, struct sk_buff_head *xmitq); diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 8740930f0..17201aa84 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -41,6 +41,8 @@ #include "name_table.h" #define MAX_FORWARD_SIZE 1024 +#define BUF_HEADROOM (LL_MAX_HEADER + 48) +#define BUF_TAILROOM 16 static unsigned int align(unsigned int i) { @@ -505,6 +507,10 @@ bool tipc_msg_reverse(u32 own_node, struct sk_buff **skb, int err) msg_set_hdr_sz(hdr, BASIC_H_SIZE); } + if (skb_cloned(_skb) && + pskb_expand_head(_skb, BUF_HEADROOM, BUF_TAILROOM, GFP_KERNEL)) + goto exit; + /* Now reverse the concerned fields */ msg_set_errcode(hdr, err); msg_set_origport(hdr, msg_destport(&ohdr)); diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 55778a0ae..7cf52fb39 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -94,17 +94,6 @@ struct plist; #define TIPC_MEDIA_INFO_OFFSET 5 -/** - * TIPC message buffer code - * - * TIPC message buffer headroom reserves space for the worst-case - * link-level device header (in case the message is sent off-node). - * - * Note: Headroom should be a multiple of 4 to ensure the TIPC header fields - * are word aligned for quicker access - */ -#define BUF_HEADROOM (LL_MAX_HEADER + 48) - struct tipc_skb_cb { void *handle; struct sk_buff *tail; @@ -715,6 +704,16 @@ static inline void msg_set_redundant_link(struct tipc_msg *m, u32 r) msg_set_bits(m, 5, 12, 0x1, r); } +static inline u32 msg_peer_stopping(struct tipc_msg *m) +{ + return msg_bits(m, 5, 13, 0x1); +} + +static inline void msg_set_peer_stopping(struct tipc_msg *m, u32 s) +{ + msg_set_bits(m, 5, 13, 0x1, s); +} + static inline char *msg_media_addr(struct tipc_msg *m) { return (char *)&m->hdr[TIPC_MEDIA_INFO_OFFSET]; @@ -733,16 +732,26 @@ static inline void msg_set_msgcnt(struct tipc_msg *m, u16 n) msg_set_bits(m, 9, 16, 0xffff, n); } -static inline u32 msg_bcast_tag(struct tipc_msg *m) +static inline u32 msg_conn_ack(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); } -static inline void msg_set_bcast_tag(struct tipc_msg *m, u32 n) +static inline void msg_set_conn_ack(struct tipc_msg *m, u32 n) { msg_set_bits(m, 9, 16, 0xffff, n); } +static inline u32 msg_adv_win(struct tipc_msg *m) +{ + return msg_bits(m, 9, 0, 0xffff); +} + +static inline void msg_set_adv_win(struct tipc_msg *m, u32 n) +{ + msg_set_bits(m, 9, 0, 0xffff, n); +} + static inline u32 msg_max_pkt(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff) * 4; @@ -779,6 +788,11 @@ static inline bool msg_peer_node_is_up(struct tipc_msg *m) return msg_redundant_link(m); } +static inline bool msg_is_reset(struct tipc_msg *hdr) +{ + return (msg_user(hdr) == LINK_PROTOCOL) && (msg_type(hdr) == RESET_MSG); +} + struct sk_buff *tipc_buf_acquire(u32 size); bool tipc_msg_validate(struct sk_buff *skb); bool tipc_msg_reverse(u32 own_addr, struct sk_buff **skb, int err); diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index 4dfc5c14f..1fd464764 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -346,9 +346,15 @@ static int tipc_nl_compat_bearer_dump(struct tipc_nl_compat_msg *msg, struct nlattr **attrs) { struct nlattr *bearer[TIPC_NLA_BEARER_MAX + 1]; + int err; + + if (!attrs[TIPC_NLA_BEARER]) + return -EINVAL; - nla_parse_nested(bearer, TIPC_NLA_BEARER_MAX, attrs[TIPC_NLA_BEARER], - NULL); + err = nla_parse_nested(bearer, TIPC_NLA_BEARER_MAX, + attrs[TIPC_NLA_BEARER], NULL); + if (err) + return err; return tipc_add_tlv(msg->rep, TIPC_TLV_BEARER_NAME, nla_data(bearer[TIPC_NLA_BEARER_NAME]), @@ -460,14 +466,31 @@ static int tipc_nl_compat_link_stat_dump(struct tipc_nl_compat_msg *msg, struct nlattr *link[TIPC_NLA_LINK_MAX + 1]; struct nlattr *prop[TIPC_NLA_PROP_MAX + 1]; struct nlattr *stats[TIPC_NLA_STATS_MAX + 1]; + int err; - nla_parse_nested(link, TIPC_NLA_LINK_MAX, attrs[TIPC_NLA_LINK], NULL); + if (!attrs[TIPC_NLA_LINK]) + return -EINVAL; - nla_parse_nested(prop, TIPC_NLA_PROP_MAX, link[TIPC_NLA_LINK_PROP], - NULL); + err = nla_parse_nested(link, TIPC_NLA_LINK_MAX, attrs[TIPC_NLA_LINK], + NULL); + if (err) + return err; + + if (!link[TIPC_NLA_LINK_PROP]) + return -EINVAL; - nla_parse_nested(stats, TIPC_NLA_STATS_MAX, link[TIPC_NLA_LINK_STATS], - NULL); + err = nla_parse_nested(prop, TIPC_NLA_PROP_MAX, + link[TIPC_NLA_LINK_PROP], NULL); + if (err) + return err; + + if (!link[TIPC_NLA_LINK_STATS]) + return -EINVAL; + + err = nla_parse_nested(stats, TIPC_NLA_STATS_MAX, + link[TIPC_NLA_LINK_STATS], NULL); + if (err) + return err; name = (char *)TLV_DATA(msg->req); if (strcmp(name, nla_data(link[TIPC_NLA_LINK_NAME])) != 0) @@ -569,12 +592,20 @@ static int tipc_nl_compat_link_dump(struct tipc_nl_compat_msg *msg, { struct nlattr *link[TIPC_NLA_LINK_MAX + 1]; struct tipc_link_info link_info; + int err; - nla_parse_nested(link, TIPC_NLA_LINK_MAX, attrs[TIPC_NLA_LINK], NULL); + if (!attrs[TIPC_NLA_LINK]) + return -EINVAL; + + err = nla_parse_nested(link, TIPC_NLA_LINK_MAX, attrs[TIPC_NLA_LINK], + NULL); + if (err) + return err; link_info.dest = nla_get_flag(link[TIPC_NLA_LINK_DEST]); link_info.up = htonl(nla_get_flag(link[TIPC_NLA_LINK_UP])); - strcpy(link_info.str, nla_data(link[TIPC_NLA_LINK_NAME])); + nla_strlcpy(link_info.str, link[TIPC_NLA_LINK_NAME], + TIPC_MAX_LINK_NAME); return tipc_add_tlv(msg->rep, TIPC_TLV_LINK_INFO, &link_info, sizeof(link_info)); @@ -758,12 +789,23 @@ static int tipc_nl_compat_name_table_dump(struct tipc_nl_compat_msg *msg, u32 node, depth, type, lowbound, upbound; static const char * const scope_str[] = {"", " zone", " cluster", " node"}; + int err; - nla_parse_nested(nt, TIPC_NLA_NAME_TABLE_MAX, - attrs[TIPC_NLA_NAME_TABLE], NULL); + if (!attrs[TIPC_NLA_NAME_TABLE]) + return -EINVAL; - nla_parse_nested(publ, TIPC_NLA_PUBL_MAX, nt[TIPC_NLA_NAME_TABLE_PUBL], - NULL); + err = nla_parse_nested(nt, TIPC_NLA_NAME_TABLE_MAX, + attrs[TIPC_NLA_NAME_TABLE], NULL); + if (err) + return err; + + if (!nt[TIPC_NLA_NAME_TABLE_PUBL]) + return -EINVAL; + + err = nla_parse_nested(publ, TIPC_NLA_PUBL_MAX, + nt[TIPC_NLA_NAME_TABLE_PUBL], NULL); + if (err) + return err; ntq = (struct tipc_name_table_query *)TLV_DATA(msg->req); @@ -815,8 +857,15 @@ static int __tipc_nl_compat_publ_dump(struct tipc_nl_compat_msg *msg, { u32 type, lower, upper; struct nlattr *publ[TIPC_NLA_PUBL_MAX + 1]; + int err; - nla_parse_nested(publ, TIPC_NLA_PUBL_MAX, attrs[TIPC_NLA_PUBL], NULL); + if (!attrs[TIPC_NLA_PUBL]) + return -EINVAL; + + err = nla_parse_nested(publ, TIPC_NLA_PUBL_MAX, attrs[TIPC_NLA_PUBL], + NULL); + if (err) + return err; type = nla_get_u32(publ[TIPC_NLA_PUBL_TYPE]); lower = nla_get_u32(publ[TIPC_NLA_PUBL_LOWER]); @@ -876,7 +925,13 @@ static int tipc_nl_compat_sk_dump(struct tipc_nl_compat_msg *msg, u32 sock_ref; struct nlattr *sock[TIPC_NLA_SOCK_MAX + 1]; - nla_parse_nested(sock, TIPC_NLA_SOCK_MAX, attrs[TIPC_NLA_SOCK], NULL); + if (!attrs[TIPC_NLA_SOCK]) + return -EINVAL; + + err = nla_parse_nested(sock, TIPC_NLA_SOCK_MAX, attrs[TIPC_NLA_SOCK], + NULL); + if (err) + return err; sock_ref = nla_get_u32(sock[TIPC_NLA_SOCK_REF]); tipc_tlv_sprintf(msg->rep, "%u:", sock_ref); @@ -917,9 +972,15 @@ static int tipc_nl_compat_media_dump(struct tipc_nl_compat_msg *msg, struct nlattr **attrs) { struct nlattr *media[TIPC_NLA_MEDIA_MAX + 1]; + int err; + + if (!attrs[TIPC_NLA_MEDIA]) + return -EINVAL; - nla_parse_nested(media, TIPC_NLA_MEDIA_MAX, attrs[TIPC_NLA_MEDIA], - NULL); + err = nla_parse_nested(media, TIPC_NLA_MEDIA_MAX, attrs[TIPC_NLA_MEDIA], + NULL); + if (err) + return err; return tipc_add_tlv(msg->rep, TIPC_TLV_MEDIA_NAME, nla_data(media[TIPC_NLA_MEDIA_NAME]), @@ -931,8 +992,15 @@ static int tipc_nl_compat_node_dump(struct tipc_nl_compat_msg *msg, { struct tipc_node_info node_info; struct nlattr *node[TIPC_NLA_NODE_MAX + 1]; + int err; - nla_parse_nested(node, TIPC_NLA_NODE_MAX, attrs[TIPC_NLA_NODE], NULL); + if (!attrs[TIPC_NLA_NODE]) + return -EINVAL; + + err = nla_parse_nested(node, TIPC_NLA_NODE_MAX, attrs[TIPC_NLA_NODE], + NULL); + if (err) + return err; node_info.addr = htonl(nla_get_u32(node[TIPC_NLA_NODE_ADDR])); node_info.up = htonl(nla_get_flag(node[TIPC_NLA_NODE_UP])); @@ -971,8 +1039,16 @@ static int tipc_nl_compat_net_dump(struct tipc_nl_compat_msg *msg, { __be32 id; struct nlattr *net[TIPC_NLA_NET_MAX + 1]; + int err; + + if (!attrs[TIPC_NLA_NET]) + return -EINVAL; + + err = nla_parse_nested(net, TIPC_NLA_NET_MAX, attrs[TIPC_NLA_NET], + NULL); + if (err) + return err; - nla_parse_nested(net, TIPC_NLA_NET_MAX, attrs[TIPC_NLA_NET], NULL); id = htonl(nla_get_u32(net[TIPC_NLA_NET_ID])); return tipc_add_tlv(msg->rep, TIPC_TLV_UNSIGNED, &id, sizeof(id)); diff --git a/net/tipc/node.c b/net/tipc/node.c index 9aaa1bc56..23d476184 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1,7 +1,7 @@ /* * net/tipc/node.c: TIPC node management routines * - * Copyright (c) 2000-2006, 2012-2015, Ericsson AB + * Copyright (c) 2000-2006, 2012-2016, Ericsson AB * Copyright (c) 2005-2006, 2010-2014, Wind River Systems * All rights reserved. * @@ -191,6 +191,20 @@ int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel) tipc_node_put(n); return mtu; } + +u16 tipc_node_get_capabilities(struct net *net, u32 addr) +{ + struct tipc_node *n; + u16 caps; + + n = tipc_node_find(net, addr); + if (unlikely(!n)) + return TIPC_NODE_CAPABILITIES; + caps = n->capabilities; + tipc_node_put(n); + return caps; +} + /* * A trivial power-of-two bitmask technique is used for speed, since this * operation is done for every incoming TIPC packet. The number of hash table @@ -304,8 +318,11 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr, u16 capabilities) spin_lock_bh(&tn->node_list_lock); n = tipc_node_find(net, addr); - if (n) + if (n) { + /* Same node may come back with new capabilities */ + n->capabilities = capabilities; goto exit; + } n = kzalloc(sizeof(*n), GFP_ATOMIC); if (!n) { pr_warn("Node creation failed, no memory\n"); @@ -525,7 +542,7 @@ static void __tipc_node_link_up(struct tipc_node *n, int bearer_id, struct tipc_link *ol = node_active_link(n, 0); struct tipc_link *nl = n->links[bearer_id].link; - if (!nl) + if (!nl || tipc_link_is_up(nl)) return; tipc_link_fsm_evt(nl, LINK_ESTABLISH_EVT); @@ -545,12 +562,16 @@ static void __tipc_node_link_up(struct tipc_node *n, int bearer_id, pr_debug("Established link <%s> on network plane %c\n", tipc_link_name(nl), tipc_link_plane(nl)); + /* Ensure that a STATE message goes first */ + tipc_link_build_state_msg(nl, xmitq); + /* First link? => give it both slots */ if (!ol) { *slot0 = bearer_id; *slot1 = bearer_id; tipc_node_fsm_evt(n, SELF_ESTABL_CONTACT_EVT); n->action_flags |= TIPC_NOTIFY_NODE_UP; + tipc_link_set_active(nl, true); tipc_bcast_add_peer(n->net, nl, xmitq); return; } @@ -581,8 +602,12 @@ static void __tipc_node_link_up(struct tipc_node *n, int bearer_id, static void tipc_node_link_up(struct tipc_node *n, int bearer_id, struct sk_buff_head *xmitq) { + struct tipc_media_addr *maddr; + tipc_node_write_lock(n); __tipc_node_link_up(n, bearer_id, xmitq); + maddr = &n->links[bearer_id].maddr; + tipc_bearer_xmit(n->net, bearer_id, xmitq, maddr); tipc_node_write_unlock(n); } @@ -1272,14 +1297,10 @@ static void tipc_node_bc_rcv(struct net *net, struct sk_buff *skb, int bearer_id rc = tipc_bcast_rcv(net, be->link, skb); - /* Broadcast link reset may happen at reassembly failure */ - if (rc & TIPC_LINK_DOWN_EVT) - tipc_node_reset_links(n); - /* Broadcast ACKs are sent on a unicast link */ if (rc & TIPC_LINK_SND_BC_ACK) { tipc_node_read_lock(n); - tipc_link_build_ack_msg(le->link, &xmitq); + tipc_link_build_state_msg(le->link, &xmitq); tipc_node_read_unlock(n); } @@ -1295,6 +1316,17 @@ static void tipc_node_bc_rcv(struct net *net, struct sk_buff *skb, int bearer_id spin_unlock_bh(&be->inputq2.lock); tipc_sk_mcast_rcv(net, &be->arrvq, &be->inputq2); } + + if (rc & TIPC_LINK_DOWN_EVT) { + /* Reception reassembly failure => reset all links to peer */ + if (!tipc_link_is_up(be->link)) + tipc_node_reset_links(n); + + /* Retransmission failure => reset all links to all peers */ + if (!tipc_link_is_up(tipc_bc_sndlink(net))) + tipc_bearer_reset_all(net); + } + tipc_node_put(n); } diff --git a/net/tipc/node.h b/net/tipc/node.h index f39d9d06e..8264b3d97 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -45,10 +45,11 @@ /* Optional capabilities supported by this code version */ enum { - TIPC_BCAST_SYNCH = (1 << 1) + TIPC_BCAST_SYNCH = (1 << 1), + TIPC_BLOCK_FLOWCTL = (2 << 1) }; -#define TIPC_NODE_CAPABILITIES TIPC_BCAST_SYNCH +#define TIPC_NODE_CAPABILITIES (TIPC_BCAST_SYNCH | TIPC_BLOCK_FLOWCTL) #define INVALID_BEARER_ID -1 void tipc_node_stop(struct net *net); @@ -70,6 +71,7 @@ void tipc_node_broadcast(struct net *net, struct sk_buff *skb); int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port); void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port); int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel); +u16 tipc_node_get_capabilities(struct net *net, u32 addr); int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_node_dump_link(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_node_reset_link_stats(struct sk_buff *skb, struct genl_info *info); diff --git a/net/tipc/server.c b/net/tipc/server.c index 2446bfbaa..272d20a79 100644 --- a/net/tipc/server.c +++ b/net/tipc/server.c @@ -86,6 +86,7 @@ struct outqueue_entry { static void tipc_recv_work(struct work_struct *work); static void tipc_send_work(struct work_struct *work); static void tipc_clean_outqueues(struct tipc_conn *con); +static void tipc_sock_release(struct tipc_conn *con); static void tipc_conn_kref_release(struct kref *kref) { @@ -102,6 +103,7 @@ static void tipc_conn_kref_release(struct kref *kref) } saddr->scope = -TIPC_NODE_SCOPE; kernel_bind(sock, (struct sockaddr *)saddr, sizeof(*saddr)); + tipc_sock_release(con); sock_release(sock); con->sock = NULL; } @@ -136,28 +138,28 @@ static void sock_data_ready(struct sock *sk) { struct tipc_conn *con; - read_lock(&sk->sk_callback_lock); + read_lock_bh(&sk->sk_callback_lock); con = sock2con(sk); if (con && test_bit(CF_CONNECTED, &con->flags)) { conn_get(con); if (!queue_work(con->server->rcv_wq, &con->rwork)) conn_put(con); } - read_unlock(&sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); } static void sock_write_space(struct sock *sk) { struct tipc_conn *con; - read_lock(&sk->sk_callback_lock); + read_lock_bh(&sk->sk_callback_lock); con = sock2con(sk); if (con && test_bit(CF_CONNECTED, &con->flags)) { conn_get(con); if (!queue_work(con->server->send_wq, &con->swork)) conn_put(con); } - read_unlock(&sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); } static void tipc_register_callbacks(struct socket *sock, struct tipc_conn *con) @@ -184,26 +186,31 @@ static void tipc_unregister_callbacks(struct tipc_conn *con) write_unlock_bh(&sk->sk_callback_lock); } +static void tipc_sock_release(struct tipc_conn *con) +{ + struct tipc_server *s = con->server; + + if (con->conid) + s->tipc_conn_release(con->conid, con->usr_data); + + tipc_unregister_callbacks(con); +} + static void tipc_close_conn(struct tipc_conn *con) { struct tipc_server *s = con->server; if (test_and_clear_bit(CF_CONNECTED, &con->flags)) { - if (con->conid) - s->tipc_conn_shutdown(con->conid, con->usr_data); spin_lock_bh(&s->idr_lock); idr_remove(&s->conn_idr, con->conid); s->idr_in_use--; spin_unlock_bh(&s->idr_lock); - tipc_unregister_callbacks(con); - /* We shouldn't flush pending works as we may be in the * thread. In fact the races with pending rx/tx work structs * are harmless for us here as we have already deleted this - * connection from server connection list and set - * sk->sk_user_data to 0 before releasing connection object. + * connection from server connection list. */ kernel_sock_shutdown(con->sock, SHUT_RDWR); diff --git a/net/tipc/server.h b/net/tipc/server.h index 9015faedb..34f8055af 100644 --- a/net/tipc/server.h +++ b/net/tipc/server.h @@ -53,7 +53,7 @@ * @send_wq: send workqueue * @max_rcvbuf_size: maximum permitted receive message length * @tipc_conn_new: callback will be called when new connection is incoming - * @tipc_conn_shutdown: callback will be called when connection is shut down + * @tipc_conn_release: callback will be called before releasing the connection * @tipc_conn_recvmsg: callback will be called when message arrives * @saddr: TIPC server address * @name: server name @@ -70,7 +70,7 @@ struct tipc_server { struct workqueue_struct *send_wq; int max_rcvbuf_size; void *(*tipc_conn_new)(int conid); - void (*tipc_conn_shutdown)(int conid, void *usr_data); + void (*tipc_conn_release)(int conid, void *usr_data); void (*tipc_conn_recvmsg)(struct net *net, int conid, struct sockaddr_tipc *addr, void *usr_data, void *buf, size_t len); diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 5f80d3fa9..c49b8df43 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -96,8 +96,11 @@ struct tipc_sock { uint conn_timeout; atomic_t dupl_rcvcnt; bool link_cong; - uint sent_unacked; - uint rcv_unacked; + u16 snt_unacked; + u16 snd_win; + u16 peer_caps; + u16 rcv_unacked; + u16 rcv_win; struct sockaddr_tipc remote; struct rhash_head node; struct rcu_head rcu; @@ -227,9 +230,29 @@ static struct tipc_sock *tipc_sk(const struct sock *sk) return container_of(sk, struct tipc_sock, sk); } -static int tsk_conn_cong(struct tipc_sock *tsk) +static bool tsk_conn_cong(struct tipc_sock *tsk) { - return tsk->sent_unacked >= TIPC_FLOWCTRL_WIN; + return tsk->snt_unacked >= tsk->snd_win; +} + +/* tsk_blocks(): translate a buffer size in bytes to number of + * advertisable blocks, taking into account the ratio truesize(len)/len + * We can trust that this ratio is always < 4 for len >= FLOWCTL_BLK_SZ + */ +static u16 tsk_adv_blocks(int len) +{ + return len / FLOWCTL_BLK_SZ / 4; +} + +/* tsk_inc(): increment counter for sent or received data + * - If block based flow control is not supported by peer we + * fall back to message based ditto, incrementing the counter + */ +static u16 tsk_inc(struct tipc_sock *tsk, int msglen) +{ + if (likely(tsk->peer_caps & TIPC_BLOCK_FLOWCTL)) + return ((msglen / FLOWCTL_BLK_SZ) + 1); + return 1; } /** @@ -366,7 +389,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock, sock->state = state; sock_init_data(sock, sk); if (tipc_sk_insert(tsk)) { - pr_warn("Socket create failed; port numbrer exhausted\n"); + pr_warn("Socket create failed; port number exhausted\n"); return -EINVAL; } msg_set_origport(msg, tsk->portid); @@ -377,9 +400,12 @@ static int tipc_sk_create(struct net *net, struct socket *sock, sk->sk_write_space = tipc_write_space; sk->sk_destruct = tipc_sock_destruct; tsk->conn_timeout = CONN_TIMEOUT_DEFAULT; - tsk->sent_unacked = 0; atomic_set(&tsk->dupl_rcvcnt, 0); + /* Start out with safe limits until we receive an advertised window */ + tsk->snd_win = tsk_adv_blocks(RCVBUF_MIN); + tsk->rcv_win = tsk->snd_win; + if (sock->state == SS_READY) { tsk_set_unreturnable(tsk, true); if (sock->type == SOCK_DGRAM) @@ -770,12 +796,14 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, * @tsk: receiving socket * @skb: pointer to message buffer. */ -static void tipc_sk_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb) +static void tipc_sk_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb, + struct sk_buff_head *xmitq) { struct sock *sk = &tsk->sk; + u32 onode = tsk_own_node(tsk); struct tipc_msg *hdr = buf_msg(skb); int mtyp = msg_type(hdr); - int conn_cong; + bool conn_cong; /* Ignore if connection cannot be validated: */ if (!tsk_peer_msg(tsk, hdr)) @@ -785,11 +813,14 @@ static void tipc_sk_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb) if (mtyp == CONN_PROBE) { msg_set_type(hdr, CONN_PROBE_REPLY); - tipc_sk_respond(sk, skb, TIPC_OK); + if (tipc_msg_reverse(onode, &skb, TIPC_OK)) + __skb_queue_tail(xmitq, skb); return; } else if (mtyp == CONN_ACK) { conn_cong = tsk_conn_cong(tsk); - tsk->sent_unacked -= msg_msgcnt(hdr); + tsk->snt_unacked -= msg_conn_ack(hdr); + if (tsk->peer_caps & TIPC_BLOCK_FLOWCTL) + tsk->snd_win = msg_adv_win(hdr); if (conn_cong) sk->sk_write_space(sk); } else if (mtyp != CONN_PROBE_REPLY) { @@ -1020,12 +1051,14 @@ static int __tipc_send_stream(struct socket *sock, struct msghdr *m, size_t dsz) u32 dnode; uint mtu, send, sent = 0; struct iov_iter save; + int hlen = MIN_H_SIZE; /* Handle implied connection establishment */ if (unlikely(dest)) { rc = __tipc_sendmsg(sock, m, dsz); + hlen = msg_hdr_sz(mhdr); if (dsz && (dsz == rc)) - tsk->sent_unacked = 1; + tsk->snt_unacked = tsk_inc(tsk, dsz + hlen); return rc; } if (dsz > (uint)INT_MAX) @@ -1054,7 +1087,7 @@ next: if (likely(!tsk_conn_cong(tsk))) { rc = tipc_node_xmit(net, &pktchain, dnode, portid); if (likely(!rc)) { - tsk->sent_unacked++; + tsk->snt_unacked += tsk_inc(tsk, send + hlen); sent += send; if (sent == dsz) return dsz; @@ -1118,6 +1151,13 @@ static void tipc_sk_finish_conn(struct tipc_sock *tsk, u32 peer_port, sk_reset_timer(sk, &sk->sk_timer, jiffies + tsk->probing_intv); tipc_node_add_conn(net, peer_node, tsk->portid, peer_port); tsk->max_pkt = tipc_node_get_mtu(net, peer_node, tsk->portid); + tsk->peer_caps = tipc_node_get_capabilities(net, peer_node); + if (tsk->peer_caps & TIPC_BLOCK_FLOWCTL) + return; + + /* Fall back to message based flow control */ + tsk->rcv_win = FLOWCTL_MSG_WIN; + tsk->snd_win = FLOWCTL_MSG_WIN; } /** @@ -1214,7 +1254,7 @@ static int tipc_sk_anc_data_recv(struct msghdr *m, struct tipc_msg *msg, return 0; } -static void tipc_sk_send_ack(struct tipc_sock *tsk, uint ack) +static void tipc_sk_send_ack(struct tipc_sock *tsk) { struct net *net = sock_net(&tsk->sk); struct sk_buff *skb = NULL; @@ -1230,7 +1270,14 @@ static void tipc_sk_send_ack(struct tipc_sock *tsk, uint ack) if (!skb) return; msg = buf_msg(skb); - msg_set_msgcnt(msg, ack); + msg_set_conn_ack(msg, tsk->rcv_unacked); + tsk->rcv_unacked = 0; + + /* Adjust to and advertize the correct window limit */ + if (tsk->peer_caps & TIPC_BLOCK_FLOWCTL) { + tsk->rcv_win = tsk_adv_blocks(tsk->sk.sk_rcvbuf); + msg_set_adv_win(msg, tsk->rcv_win); + } tipc_node_xmit_skb(net, skb, dnode, msg_link_selector(msg)); } @@ -1288,7 +1335,7 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, size_t buf_len, long timeo; unsigned int sz; u32 err; - int res; + int res, hlen; /* Catch invalid receive requests */ if (unlikely(!buf_len)) @@ -1313,6 +1360,7 @@ restart: buf = skb_peek(&sk->sk_receive_queue); msg = buf_msg(buf); sz = msg_data_sz(msg); + hlen = msg_hdr_sz(msg); err = msg_errcode(msg); /* Discard an empty non-errored message & try again */ @@ -1335,7 +1383,7 @@ restart: sz = buf_len; m->msg_flags |= MSG_TRUNC; } - res = skb_copy_datagram_msg(buf, msg_hdr_sz(msg), m, sz); + res = skb_copy_datagram_msg(buf, hlen, m, sz); if (res) goto exit; res = sz; @@ -1347,15 +1395,15 @@ restart: res = -ECONNRESET; } - /* Consume received message (optional) */ - if (likely(!(flags & MSG_PEEK))) { - if ((sock->state != SS_READY) && - (++tsk->rcv_unacked >= TIPC_CONNACK_INTV)) { - tipc_sk_send_ack(tsk, tsk->rcv_unacked); - tsk->rcv_unacked = 0; - } - tsk_advance_rx_queue(sk); + if (unlikely(flags & MSG_PEEK)) + goto exit; + + if (likely(sock->state != SS_READY)) { + tsk->rcv_unacked += tsk_inc(tsk, hlen + sz); + if (unlikely(tsk->rcv_unacked >= (tsk->rcv_win / 4))) + tipc_sk_send_ack(tsk); } + tsk_advance_rx_queue(sk); exit: release_sock(sk); return res; @@ -1384,7 +1432,7 @@ static int tipc_recv_stream(struct socket *sock, struct msghdr *m, int sz_to_copy, target, needed; int sz_copied = 0; u32 err; - int res = 0; + int res = 0, hlen; /* Catch invalid receive attempts */ if (unlikely(!buf_len)) @@ -1410,6 +1458,7 @@ restart: buf = skb_peek(&sk->sk_receive_queue); msg = buf_msg(buf); sz = msg_data_sz(msg); + hlen = msg_hdr_sz(msg); err = msg_errcode(msg); /* Discard an empty non-errored message & try again */ @@ -1434,8 +1483,7 @@ restart: needed = (buf_len - sz_copied); sz_to_copy = (sz <= needed) ? sz : needed; - res = skb_copy_datagram_msg(buf, msg_hdr_sz(msg) + offset, - m, sz_to_copy); + res = skb_copy_datagram_msg(buf, hlen + offset, m, sz_to_copy); if (res) goto exit; @@ -1457,20 +1505,18 @@ restart: res = -ECONNRESET; } - /* Consume received message (optional) */ - if (likely(!(flags & MSG_PEEK))) { - if (unlikely(++tsk->rcv_unacked >= TIPC_CONNACK_INTV)) { - tipc_sk_send_ack(tsk, tsk->rcv_unacked); - tsk->rcv_unacked = 0; - } - tsk_advance_rx_queue(sk); - } + if (unlikely(flags & MSG_PEEK)) + goto exit; + + tsk->rcv_unacked += tsk_inc(tsk, hlen + sz); + if (unlikely(tsk->rcv_unacked >= (tsk->rcv_win / 4))) + tipc_sk_send_ack(tsk); + tsk_advance_rx_queue(sk); /* Loop around if more data is required */ if ((sz_copied < buf_len) && /* didn't get all requested data */ (!skb_queue_empty(&sk->sk_receive_queue) || (sz_copied < target)) && /* and more is ready or required */ - (!(flags & MSG_PEEK)) && /* and aren't just peeking at data */ (!err)) /* and haven't reached a FIN */ goto restart; @@ -1602,30 +1648,33 @@ static bool filter_connect(struct tipc_sock *tsk, struct sk_buff *skb) /** * rcvbuf_limit - get proper overload limit of socket receive queue * @sk: socket - * @buf: message + * @skb: message * - * For all connection oriented messages, irrespective of importance, - * the default overload value (i.e. 67MB) is set as limit. + * For connection oriented messages, irrespective of importance, + * default queue limit is 2 MB. * - * For all connectionless messages, by default new queue limits are - * as belows: + * For connectionless messages, queue limits are based on message + * importance as follows: * - * TIPC_LOW_IMPORTANCE (4 MB) - * TIPC_MEDIUM_IMPORTANCE (8 MB) - * TIPC_HIGH_IMPORTANCE (16 MB) - * TIPC_CRITICAL_IMPORTANCE (32 MB) + * TIPC_LOW_IMPORTANCE (2 MB) + * TIPC_MEDIUM_IMPORTANCE (4 MB) + * TIPC_HIGH_IMPORTANCE (8 MB) + * TIPC_CRITICAL_IMPORTANCE (16 MB) * * Returns overload limit according to corresponding message importance */ -static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *buf) +static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *skb) { - struct tipc_msg *msg = buf_msg(buf); + struct tipc_sock *tsk = tipc_sk(sk); + struct tipc_msg *hdr = buf_msg(skb); - if (msg_connected(msg)) - return sysctl_tipc_rmem[2]; + if (unlikely(!msg_connected(hdr))) + return sk->sk_rcvbuf << msg_importance(hdr); - return sk->sk_rcvbuf >> TIPC_CRITICAL_IMPORTANCE << - msg_importance(msg); + if (likely(tsk->peer_caps & TIPC_BLOCK_FLOWCTL)) + return sk->sk_rcvbuf; + + return FLOWCTL_MSG_LIM; } /** @@ -1640,7 +1689,8 @@ static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *buf) * * Returns true if message was added to socket receive queue, otherwise false */ -static bool filter_rcv(struct sock *sk, struct sk_buff *skb) +static bool filter_rcv(struct sock *sk, struct sk_buff *skb, + struct sk_buff_head *xmitq) { struct socket *sock = sk->sk_socket; struct tipc_sock *tsk = tipc_sk(sk); @@ -1650,7 +1700,7 @@ static bool filter_rcv(struct sock *sk, struct sk_buff *skb) int usr = msg_user(hdr); if (unlikely(msg_user(hdr) == CONN_MANAGER)) { - tipc_sk_proto_rcv(tsk, skb); + tipc_sk_proto_rcv(tsk, skb, xmitq); return false; } @@ -1693,7 +1743,8 @@ static bool filter_rcv(struct sock *sk, struct sk_buff *skb) return true; reject: - tipc_sk_respond(sk, skb, err); + if (tipc_msg_reverse(tsk_own_node(tsk), &skb, err)) + __skb_queue_tail(xmitq, skb); return false; } @@ -1709,9 +1760,24 @@ reject: static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *skb) { unsigned int truesize = skb->truesize; + struct sk_buff_head xmitq; + u32 dnode, selector; + + __skb_queue_head_init(&xmitq); - if (likely(filter_rcv(sk, skb))) + if (likely(filter_rcv(sk, skb, &xmitq))) { atomic_add(truesize, &tipc_sk(sk)->dupl_rcvcnt); + return 0; + } + + if (skb_queue_empty(&xmitq)) + return 0; + + /* Send response/rejected message */ + skb = __skb_dequeue(&xmitq); + dnode = msg_destnode(buf_msg(skb)); + selector = msg_origport(buf_msg(skb)); + tipc_node_xmit_skb(sock_net(sk), skb, dnode, selector); return 0; } @@ -1725,12 +1791,13 @@ static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *skb) * Caller must hold socket lock */ static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk, - u32 dport) + u32 dport, struct sk_buff_head *xmitq) { + unsigned long time_limit = jiffies + 2; + struct sk_buff *skb; unsigned int lim; atomic_t *dcnt; - struct sk_buff *skb; - unsigned long time_limit = jiffies + 2; + u32 onode; while (skb_queue_len(inputq)) { if (unlikely(time_after_eq(jiffies, time_limit))) @@ -1742,20 +1809,22 @@ static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk, /* Add message directly to receive queue if possible */ if (!sock_owned_by_user(sk)) { - filter_rcv(sk, skb); + filter_rcv(sk, skb, xmitq); continue; } /* Try backlog, compensating for double-counted bytes */ dcnt = &tipc_sk(sk)->dupl_rcvcnt; - if (sk->sk_backlog.len) + if (!sk->sk_backlog.len) atomic_set(dcnt, 0); lim = rcvbuf_limit(sk, skb) + atomic_read(dcnt); if (likely(!sk_add_backlog(sk, skb, lim))) continue; /* Overload => reject message back to sender */ - tipc_sk_respond(sk, skb, TIPC_ERR_OVERLOAD); + onode = tipc_own_addr(sock_net(sk)); + if (tipc_msg_reverse(onode, &skb, TIPC_ERR_OVERLOAD)) + __skb_queue_tail(xmitq, skb); break; } } @@ -1768,12 +1837,14 @@ static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk, */ void tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq) { + struct sk_buff_head xmitq; u32 dnode, dport = 0; int err; struct tipc_sock *tsk; struct sock *sk; struct sk_buff *skb; + __skb_queue_head_init(&xmitq); while (skb_queue_len(inputq)) { dport = tipc_skb_peek_port(inputq, dport); tsk = tipc_sk_lookup(net, dport); @@ -1781,9 +1852,14 @@ void tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq) if (likely(tsk)) { sk = &tsk->sk; if (likely(spin_trylock_bh(&sk->sk_lock.slock))) { - tipc_sk_enqueue(inputq, sk, dport); + tipc_sk_enqueue(inputq, sk, dport, &xmitq); spin_unlock_bh(&sk->sk_lock.slock); } + /* Send pending response/rejected messages, if any */ + while ((skb = __skb_dequeue(&xmitq))) { + dnode = msg_destnode(buf_msg(skb)); + tipc_node_xmit_skb(net, skb, dnode, dport); + } sock_put(sk); continue; } diff --git a/net/tipc/socket.h b/net/tipc/socket.h index 4241f2206..06fb5944c 100644 --- a/net/tipc/socket.h +++ b/net/tipc/socket.h @@ -1,6 +1,6 @@ /* net/tipc/socket.h: Include file for TIPC socket code * - * Copyright (c) 2014-2015, Ericsson AB + * Copyright (c) 2014-2016, Ericsson AB * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -38,10 +38,17 @@ #include <net/sock.h> #include <net/genetlink.h> -#define TIPC_CONNACK_INTV 256 -#define TIPC_FLOWCTRL_WIN (TIPC_CONNACK_INTV * 2) -#define TIPC_CONN_OVERLOAD_LIMIT ((TIPC_FLOWCTRL_WIN * 2 + 1) * \ - SKB_TRUESIZE(TIPC_MAX_USER_MSG_SIZE)) +/* Compatibility values for deprecated message based flow control */ +#define FLOWCTL_MSG_WIN 512 +#define FLOWCTL_MSG_LIM ((FLOWCTL_MSG_WIN * 2 + 1) * SKB_TRUESIZE(MAX_MSG_SIZE)) + +#define FLOWCTL_BLK_SZ 1024 + +/* Socket receive buffer sizes */ +#define RCVBUF_MIN (FLOWCTL_BLK_SZ * 512) +#define RCVBUF_DEF (FLOWCTL_BLK_SZ * 1024 * 2) +#define RCVBUF_MAX (FLOWCTL_BLK_SZ * 1024 * 16) + int tipc_socket_init(void); void tipc_socket_stop(void); void tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq); diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index e6cb386fb..0dd02244e 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -302,7 +302,7 @@ static void tipc_subscrp_subscribe(struct net *net, struct tipc_subscr *s, } /* Handle one termination request for the subscriber */ -static void tipc_subscrb_shutdown_cb(int conid, void *usr_data) +static void tipc_subscrb_release_cb(int conid, void *usr_data) { tipc_subscrb_delete((struct tipc_subscriber *)usr_data); } @@ -326,8 +326,7 @@ static void tipc_subscrb_rcv_cb(struct net *net, int conid, return tipc_subscrp_cancel(s, subscriber); } - if (s) - tipc_subscrp_subscribe(net, s, subscriber, swap); + tipc_subscrp_subscribe(net, s, subscriber, swap); } /* Handle one request to establish a new subscriber */ @@ -365,7 +364,7 @@ int tipc_topsrv_start(struct net *net) topsrv->max_rcvbuf_size = sizeof(struct tipc_subscr); topsrv->tipc_conn_recvmsg = tipc_subscrb_rcv_cb; topsrv->tipc_conn_new = tipc_subscrb_connect_cb; - topsrv->tipc_conn_shutdown = tipc_subscrb_shutdown_cb; + topsrv->tipc_conn_release = tipc_subscrb_release_cb; strncpy(topsrv->name, name, strlen(name) + 1); tn->topsrv = topsrv; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 7748199b3..735362c26 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -953,7 +953,7 @@ fail: return NULL; } -static int unix_mknod(struct dentry *dentry, struct path *path, umode_t mode, +static int unix_mknod(struct dentry *dentry, const struct path *path, umode_t mode, struct path *res) { int err; diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index b5f1221f4..b96ac918e 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -61,6 +61,14 @@ * function will also cleanup rejected sockets, those that reach the connected * state but leave it before they have been accepted. * + * - Lock ordering for pending or accept queue sockets is: + * + * lock_sock(listener); + * lock_sock_nested(pending, SINGLE_DEPTH_NESTING); + * + * Using explicit nested locking keeps lockdep happy since normally only one + * lock of a given class may be taken at a time. + * * - Sockets created by user action will be cleaned up when the user process * calls close(2), causing our release implementation to be called. Our release * implementation will perform some cleanup then drop the last reference so our @@ -443,7 +451,7 @@ void vsock_pending_work(struct work_struct *work) cleanup = true; lock_sock(listener); - lock_sock(sk); + lock_sock_nested(sk, SINGLE_DEPTH_NESTING); if (vsock_is_pending(sk)) { vsock_remove_pending(listener, sk); @@ -1292,7 +1300,7 @@ static int vsock_accept(struct socket *sock, struct socket *newsock, int flags) if (connected) { listener->sk_ack_backlog--; - lock_sock(connected); + lock_sock_nested(connected, SINGLE_DEPTH_NESTING); vconnected = vsock_sk(connected); /* If the listener socket has received an error, then we should diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 56214736f..4120b7a53 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -2051,7 +2051,7 @@ static u32 vmci_transport_get_local_cid(void) return vmci_get_context_id(); } -static struct vsock_transport vmci_transport = { +static const struct vsock_transport vmci_transport = { .init = vmci_transport_socket_init, .destruct = vmci_transport_destruct, .release = vmci_transport_release, diff --git a/net/wireless/chan.c b/net/wireless/chan.c index 59cabc9bc..da49c0b1f 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -739,7 +739,7 @@ static bool cfg80211_ir_permissive_chan(struct wiphy *wiphy, * and thus fail the GO instantiation, consider only the interfaces of * the current registered device. */ - list_for_each_entry(wdev, &rdev->wdev_list, list) { + list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { struct ieee80211_channel *other_chan = NULL; int r1, r2; @@ -768,7 +768,7 @@ static bool cfg80211_ir_permissive_chan(struct wiphy *wiphy, if (chan == other_chan) return true; - if (chan->band != IEEE80211_BAND_5GHZ) + if (chan->band != NL80211_BAND_5GHZ) continue; r1 = cfg80211_get_unii(chan->center_freq); diff --git a/net/wireless/core.c b/net/wireless/core.c index c878045d1..ecca3896b 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -3,6 +3,7 @@ * * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH + * Copyright 2015 Intel Deutschland GmbH */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -157,7 +158,7 @@ int cfg80211_switch_netns(struct cfg80211_registered_device *rdev, if (!(rdev->wiphy.flags & WIPHY_FLAG_NETNS_OK)) return -EOPNOTSUPP; - list_for_each_entry(wdev, &rdev->wdev_list, list) { + list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { if (!wdev->netdev) continue; wdev->netdev->features &= ~NETIF_F_NETNS_LOCAL; @@ -171,7 +172,8 @@ int cfg80211_switch_netns(struct cfg80211_registered_device *rdev, /* failed -- clean up to old netns */ net = wiphy_net(&rdev->wiphy); - list_for_each_entry_continue_reverse(wdev, &rdev->wdev_list, + list_for_each_entry_continue_reverse(wdev, + &rdev->wiphy.wdev_list, list) { if (!wdev->netdev) continue; @@ -230,7 +232,7 @@ void cfg80211_shutdown_all_interfaces(struct wiphy *wiphy) ASSERT_RTNL(); - list_for_each_entry(wdev, &rdev->wdev_list, list) { + list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { if (wdev->netdev) { dev_close(wdev->netdev); continue; @@ -298,7 +300,8 @@ void cfg80211_destroy_ifaces(struct cfg80211_registered_device *rdev) kfree(item); spin_unlock_irq(&rdev->destroy_list_lock); - list_for_each_entry_safe(wdev, tmp, &rdev->wdev_list, list) { + list_for_each_entry_safe(wdev, tmp, + &rdev->wiphy.wdev_list, list) { if (nlportid == wdev->owner_nlportid) rdev_del_virtual_intf(rdev, wdev); } @@ -408,7 +411,7 @@ use_default_name: dev_set_name(&rdev->wiphy.dev, PHY_NAME "%d", rdev->wiphy_idx); } - INIT_LIST_HEAD(&rdev->wdev_list); + INIT_LIST_HEAD(&rdev->wiphy.wdev_list); INIT_LIST_HEAD(&rdev->beacon_registrations); spin_lock_init(&rdev->beacon_registrations_lock); spin_lock_init(&rdev->bss_lock); @@ -555,7 +558,7 @@ int wiphy_register(struct wiphy *wiphy) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); int res; - enum ieee80211_band band; + enum nl80211_band band; struct ieee80211_supported_band *sband; bool have_band = false; int i; @@ -624,6 +627,13 @@ int wiphy_register(struct wiphy *wiphy) !rdev->ops->set_mac_acl))) return -EINVAL; + /* assure only valid behaviours are flagged by driver + * hence subtract 2 as bit 0 is invalid. + */ + if (WARN_ON(wiphy->bss_select_support && + (wiphy->bss_select_support & ~(BIT(__NL80211_BSS_SELECT_ATTR_AFTER_LAST) - 2)))) + return -EINVAL; + if (wiphy->addresses) memcpy(wiphy->perm_addr, wiphy->addresses[0].addr, ETH_ALEN); @@ -638,7 +648,7 @@ int wiphy_register(struct wiphy *wiphy) return res; /* sanity check supported bands/channels */ - for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + for (band = 0; band < NUM_NL80211_BANDS; band++) { sband = wiphy->bands[band]; if (!sband) continue; @@ -650,7 +660,7 @@ int wiphy_register(struct wiphy *wiphy) * on 60GHz band, there are no legacy rates, so * n_bitrates is 0 */ - if (WARN_ON(band != IEEE80211_BAND_60GHZ && + if (WARN_ON(band != NL80211_BAND_60GHZ && !sband->n_bitrates)) return -EINVAL; @@ -660,7 +670,7 @@ int wiphy_register(struct wiphy *wiphy) * global structure for that. */ if (cfg80211_disable_40mhz_24ghz && - band == IEEE80211_BAND_2GHZ && + band == NL80211_BAND_2GHZ && sband->ht_cap.ht_supported) { sband->ht_cap.cap &= ~IEEE80211_HT_CAP_SUP_WIDTH_20_40; sband->ht_cap.cap &= ~IEEE80211_HT_CAP_SGI_40; @@ -790,7 +800,7 @@ void wiphy_unregister(struct wiphy *wiphy) nl80211_notify_wiphy(rdev, NL80211_CMD_DEL_WIPHY); rdev->wiphy.registered = false; - WARN_ON(!list_empty(&rdev->wdev_list)); + WARN_ON(!list_empty(&rdev->wiphy.wdev_list)); /* * First remove the hardware from everywhere, this makes @@ -1012,7 +1022,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, spin_lock_init(&wdev->mgmt_registrations_lock); wdev->identifier = ++rdev->wdev_id; - list_add_rcu(&wdev->list, &rdev->wdev_list); + list_add_rcu(&wdev->list, &rdev->wiphy.wdev_list); rdev->devlist_generation++; /* can only change netns with wiphy */ dev->features |= NETIF_F_NETNS_LOCAL; diff --git a/net/wireless/core.h b/net/wireless/core.h index 022ccad06..025b7a5d5 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -50,10 +50,9 @@ struct cfg80211_registered_device { /* wiphy index, internal only */ int wiphy_idx; - /* associated wireless interfaces, protected by rtnl or RCU */ - struct list_head wdev_list; + /* protected by RTNL */ int devlist_generation, wdev_id; - int opencount; /* also protected by devlist_mtx */ + int opencount; wait_queue_head_t dev_wait; struct list_head beacon_registrations; @@ -214,6 +213,7 @@ struct cfg80211_event { const u8 *resp_ie; size_t req_ie_len; size_t resp_ie_len; + struct cfg80211_bss *bss; u16 status; } cr; struct { diff --git a/net/wireless/debugfs.c b/net/wireless/debugfs.c index 454157717..5d453916a 100644 --- a/net/wireless/debugfs.c +++ b/net/wireless/debugfs.c @@ -69,7 +69,7 @@ static ssize_t ht40allow_map_read(struct file *file, struct wiphy *wiphy = file->private_data; char *buf; unsigned int offset = 0, buf_size = PAGE_SIZE, i, r; - enum ieee80211_band band; + enum nl80211_band band; struct ieee80211_supported_band *sband; buf = kzalloc(buf_size, GFP_KERNEL); @@ -78,7 +78,7 @@ static ssize_t ht40allow_map_read(struct file *file, rtnl_lock(); - for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + for (band = 0; band < NUM_NL80211_BANDS; band++) { sband = wiphy->bands[band]; if (!sband) continue; diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c index 4c55fab9b..4a4dda53b 100644 --- a/net/wireless/ibss.c +++ b/net/wireless/ibss.c @@ -104,7 +104,7 @@ static int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev, struct ieee80211_supported_band *sband = rdev->wiphy.bands[params->chandef.chan->band]; int j; - u32 flag = params->chandef.chan->band == IEEE80211_BAND_5GHZ ? + u32 flag = params->chandef.chan->band == NL80211_BAND_5GHZ ? IEEE80211_RATE_MANDATORY_A : IEEE80211_RATE_MANDATORY_B; @@ -236,7 +236,7 @@ int cfg80211_ibss_wext_join(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { struct cfg80211_cached_keys *ck = NULL; - enum ieee80211_band band; + enum nl80211_band band; int i, err; ASSERT_WDEV_LOCK(wdev); @@ -248,7 +248,7 @@ int cfg80211_ibss_wext_join(struct cfg80211_registered_device *rdev, if (!wdev->wext.ibss.chandef.chan) { struct ieee80211_channel *new_chan = NULL; - for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + for (band = 0; band < NUM_NL80211_BANDS; band++) { struct ieee80211_supported_band *sband; struct ieee80211_channel *chan; diff --git a/net/wireless/mesh.c b/net/wireless/mesh.c index 092300b30..fa2066b56 100644 --- a/net/wireless/mesh.c +++ b/net/wireless/mesh.c @@ -128,9 +128,9 @@ int __cfg80211_join_mesh(struct cfg80211_registered_device *rdev, if (!setup->chandef.chan) { /* if we don't have that either, use the first usable channel */ - enum ieee80211_band band; + enum nl80211_band band; - for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + for (band = 0; band < NUM_NL80211_BANDS; band++) { struct ieee80211_supported_band *sband; struct ieee80211_channel *chan; int i; diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index ff328250b..c284d883c 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -726,7 +726,7 @@ void cfg80211_dfs_channels_update_work(struct work_struct *work) wiphy = &rdev->wiphy; rtnl_lock(); - for (bandid = 0; bandid < IEEE80211_NUM_BANDS; bandid++) { + for (bandid = 0; bandid < NUM_NL80211_BANDS; bandid++) { sband = wiphy->bands[bandid]; if (!sband) continue; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 056a73078..7d7228390 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -103,7 +103,7 @@ __cfg80211_wdev_from_attrs(struct net *netns, struct nlattr **attrs) if (have_wdev_id && rdev->wiphy_idx != wiphy_idx) continue; - list_for_each_entry(wdev, &rdev->wdev_list, list) { + list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { if (have_ifidx && wdev->netdev && wdev->netdev->ifindex == ifidx) { result = wdev; @@ -149,7 +149,7 @@ __cfg80211_rdev_from_attrs(struct net *netns, struct nlattr **attrs) tmp = cfg80211_rdev_by_wiphy_idx(wdev_id >> 32); if (tmp) { /* make sure wdev exists */ - list_for_each_entry(wdev, &tmp->wdev_list, list) { + list_for_each_entry(wdev, &tmp->wiphy.wdev_list, list) { if (wdev->identifier != (u32)wdev_id) continue; found = true; @@ -402,6 +402,8 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_SCHED_SCAN_DELAY] = { .type = NLA_U32 }, [NL80211_ATTR_REG_INDOOR] = { .type = NLA_FLAG }, [NL80211_ATTR_PBSS] = { .type = NLA_FLAG }, + [NL80211_ATTR_BSS_SELECT] = { .type = NLA_NESTED }, + [NL80211_ATTR_STA_SUPPORT_P2P_PS] = { .type = NLA_U8 }, }; /* policy for the key attributes */ @@ -486,6 +488,15 @@ nl80211_plan_policy[NL80211_SCHED_SCAN_PLAN_MAX + 1] = { [NL80211_SCHED_SCAN_PLAN_ITERATIONS] = { .type = NLA_U32 }, }; +static const struct nla_policy +nl80211_bss_select_policy[NL80211_BSS_SELECT_ATTR_MAX + 1] = { + [NL80211_BSS_SELECT_ATTR_RSSI] = { .type = NLA_FLAG }, + [NL80211_BSS_SELECT_ATTR_BAND_PREF] = { .type = NLA_U32 }, + [NL80211_BSS_SELECT_ATTR_RSSI_ADJUST] = { + .len = sizeof(struct nl80211_bss_select_rssi_adjust) + }, +}; + static int nl80211_prepare_wdev_dump(struct sk_buff *skb, struct netlink_callback *cb, struct cfg80211_registered_device **rdev, @@ -524,7 +535,7 @@ static int nl80211_prepare_wdev_dump(struct sk_buff *skb, *rdev = wiphy_to_rdev(wiphy); *wdev = NULL; - list_for_each_entry(tmp, &(*rdev)->wdev_list, list) { + list_for_each_entry(tmp, &(*rdev)->wiphy.wdev_list, list) { if (tmp->identifier == cb->args[1]) { *wdev = tmp; break; @@ -1266,7 +1277,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, struct nlattr *nl_bands, *nl_band; struct nlattr *nl_freqs, *nl_freq; struct nlattr *nl_cmds; - enum ieee80211_band band; + enum nl80211_band band; struct ieee80211_channel *chan; int i; const struct ieee80211_txrx_stypes *mgmt_stypes = @@ -1399,7 +1410,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, goto nla_put_failure; for (band = state->band_start; - band < IEEE80211_NUM_BANDS; band++) { + band < NUM_NL80211_BANDS; band++) { struct ieee80211_supported_band *sband; sband = rdev->wiphy.bands[band]; @@ -1461,7 +1472,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, } nla_nest_end(msg, nl_bands); - if (band < IEEE80211_NUM_BANDS) + if (band < NUM_NL80211_BANDS) state->band_start = band + 1; else state->band_start = 0; @@ -1731,6 +1742,25 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, rdev->wiphy.ext_features)) goto nla_put_failure; + if (rdev->wiphy.bss_select_support) { + struct nlattr *nested; + u32 bss_select_support = rdev->wiphy.bss_select_support; + + nested = nla_nest_start(msg, NL80211_ATTR_BSS_SELECT); + if (!nested) + goto nla_put_failure; + + i = 0; + while (bss_select_support) { + if ((bss_select_support & 1) && + nla_put_flag(msg, i)) + goto nla_put_failure; + i++; + bss_select_support >>= 1; + } + nla_nest_end(msg, nested); + } + /* done */ state->split_start = 0; break; @@ -2399,7 +2429,8 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || nla_put_u32(msg, NL80211_ATTR_IFTYPE, wdev->iftype) || - nla_put_u64(msg, NL80211_ATTR_WDEV, wdev_id(wdev)) || + nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), + NL80211_ATTR_PAD) || nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, wdev_address(wdev)) || nla_put_u32(msg, NL80211_ATTR_GENERATION, rdev->devlist_generation ^ @@ -2459,7 +2490,7 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback * } if_idx = 0; - list_for_each_entry(wdev, &rdev->wdev_list, list) { + list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { if (if_idx < if_start) { if_idx++; continue; @@ -2731,7 +2762,7 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) spin_lock_init(&wdev->mgmt_registrations_lock); wdev->identifier = ++rdev->wdev_id; - list_add_rcu(&wdev->list, &rdev->wdev_list); + list_add_rcu(&wdev->list, &rdev->wiphy.wdev_list); rdev->devlist_generation++; break; default: @@ -3267,7 +3298,7 @@ static bool nl80211_get_ap_channel(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev; bool ret = false; - list_for_each_entry(wdev, &rdev->wdev_list, list) { + list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { if (wdev->iftype != NL80211_IFTYPE_AP && wdev->iftype != NL80211_IFTYPE_P2P_GO) continue; @@ -3456,16 +3487,16 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) params.smps_mode = NL80211_SMPS_OFF; } + params.pbss = nla_get_flag(info->attrs[NL80211_ATTR_PBSS]); + if (params.pbss && !rdev->wiphy.bands[NL80211_BAND_60GHZ]) + return -EOPNOTSUPP; + if (info->attrs[NL80211_ATTR_ACL_POLICY]) { params.acl = parse_acl_data(&rdev->wiphy, info); if (IS_ERR(params.acl)) return PTR_ERR(params.acl); } - params.pbss = nla_get_flag(info->attrs[NL80211_ATTR_PBSS]); - if (params.pbss && !rdev->wiphy.bands[IEEE80211_BAND_60GHZ]) - return -EOPNOTSUPP; - wdev_lock(wdev); err = rdev_start_ap(rdev, dev, ¶ms); if (!err) { @@ -3724,11 +3755,18 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, goto nla_put_failure; #define PUT_SINFO(attr, memb, type) do { \ - if (sinfo->filled & BIT(NL80211_STA_INFO_ ## attr) && \ + BUILD_BUG_ON(sizeof(type) == sizeof(u64)); \ + if (sinfo->filled & (1ULL << NL80211_STA_INFO_ ## attr) && \ nla_put_ ## type(msg, NL80211_STA_INFO_ ## attr, \ sinfo->memb)) \ goto nla_put_failure; \ } while (0) +#define PUT_SINFO_U64(attr, memb) do { \ + if (sinfo->filled & (1ULL << NL80211_STA_INFO_ ## attr) && \ + nla_put_u64_64bit(msg, NL80211_STA_INFO_ ## attr, \ + sinfo->memb, NL80211_STA_INFO_PAD)) \ + goto nla_put_failure; \ + } while (0) PUT_SINFO(CONNECTED_TIME, connected_time, u32); PUT_SINFO(INACTIVE_TIME, inactive_time, u32); @@ -3745,11 +3783,12 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, (u32)sinfo->tx_bytes)) goto nla_put_failure; - PUT_SINFO(RX_BYTES64, rx_bytes, u64); - PUT_SINFO(TX_BYTES64, tx_bytes, u64); + PUT_SINFO_U64(RX_BYTES64, rx_bytes); + PUT_SINFO_U64(TX_BYTES64, tx_bytes); PUT_SINFO(LLID, llid, u16); PUT_SINFO(PLID, plid, u16); PUT_SINFO(PLINK_STATE, plink_state, u8); + PUT_SINFO_U64(RX_DURATION, rx_duration); switch (rdev->wiphy.signal_type) { case CFG80211_SIGNAL_TYPE_MBM: @@ -3817,12 +3856,13 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, &sinfo->sta_flags)) goto nla_put_failure; - PUT_SINFO(T_OFFSET, t_offset, u64); - PUT_SINFO(RX_DROP_MISC, rx_dropped_misc, u64); - PUT_SINFO(BEACON_RX, rx_beacon, u64); + PUT_SINFO_U64(T_OFFSET, t_offset); + PUT_SINFO_U64(RX_DROP_MISC, rx_dropped_misc); + PUT_SINFO_U64(BEACON_RX, rx_beacon); PUT_SINFO(BEACON_SIGNAL_AVG, rx_beacon_signal_avg, u8); #undef PUT_SINFO +#undef PUT_SINFO_U64 if (sinfo->filled & BIT(NL80211_STA_INFO_TID_STATS)) { struct nlattr *tidsattr; @@ -3845,19 +3885,19 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, if (!tidattr) goto nla_put_failure; -#define PUT_TIDVAL(attr, memb, type) do { \ +#define PUT_TIDVAL_U64(attr, memb) do { \ if (tidstats->filled & BIT(NL80211_TID_STATS_ ## attr) && \ - nla_put_ ## type(msg, NL80211_TID_STATS_ ## attr, \ - tidstats->memb)) \ + nla_put_u64_64bit(msg, NL80211_TID_STATS_ ## attr, \ + tidstats->memb, NL80211_TID_STATS_PAD)) \ goto nla_put_failure; \ } while (0) - PUT_TIDVAL(RX_MSDU, rx_msdu, u64); - PUT_TIDVAL(TX_MSDU, tx_msdu, u64); - PUT_TIDVAL(TX_MSDU_RETRIES, tx_msdu_retries, u64); - PUT_TIDVAL(TX_MSDU_FAILED, tx_msdu_failed, u64); + PUT_TIDVAL_U64(RX_MSDU, rx_msdu); + PUT_TIDVAL_U64(TX_MSDU, tx_msdu); + PUT_TIDVAL_U64(TX_MSDU_RETRIES, tx_msdu_retries); + PUT_TIDVAL_U64(TX_MSDU_FAILED, tx_msdu_failed); -#undef PUT_TIDVAL +#undef PUT_TIDVAL_U64 nla_nest_end(msg, tidattr); } @@ -3977,6 +4017,10 @@ int cfg80211_check_station_change(struct wiphy *wiphy, statype != CFG80211_STA_AP_CLIENT_UNASSOC) return -EINVAL; + if (params->support_p2p_ps != -1 && + statype != CFG80211_STA_AP_CLIENT_UNASSOC) + return -EINVAL; + if (params->aid && !(params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER)) && statype != CFG80211_STA_AP_CLIENT_UNASSOC) @@ -4270,6 +4314,18 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info) else params.listen_interval = -1; + if (info->attrs[NL80211_ATTR_STA_SUPPORT_P2P_PS]) { + u8 tmp; + + tmp = nla_get_u8(info->attrs[NL80211_ATTR_STA_SUPPORT_P2P_PS]); + if (tmp >= NUM_NL80211_P2P_PS_STATUS) + return -EINVAL; + + params.support_p2p_ps = tmp; + } else { + params.support_p2p_ps = -1; + } + if (!info->attrs[NL80211_ATTR_MAC]) return -EINVAL; @@ -4393,6 +4449,23 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) params.listen_interval = nla_get_u16(info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL]); + if (info->attrs[NL80211_ATTR_STA_SUPPORT_P2P_PS]) { + u8 tmp; + + tmp = nla_get_u8(info->attrs[NL80211_ATTR_STA_SUPPORT_P2P_PS]); + if (tmp >= NUM_NL80211_P2P_PS_STATUS) + return -EINVAL; + + params.support_p2p_ps = tmp; + } else { + /* + * if not specified, assume it's supported for P2P GO interface, + * and is NOT supported for AP interface + */ + params.support_p2p_ps = + dev->ieee80211_ptr->iftype == NL80211_IFTYPE_P2P_GO; + } + if (info->attrs[NL80211_ATTR_PEER_AID]) params.aid = nla_get_u16(info->attrs[NL80211_ATTR_PEER_AID]); else @@ -5758,6 +5831,73 @@ static int validate_scan_freqs(struct nlattr *freqs) return n_channels; } +static bool is_band_valid(struct wiphy *wiphy, enum nl80211_band b) +{ + return b < NUM_NL80211_BANDS && wiphy->bands[b]; +} + +static int parse_bss_select(struct nlattr *nla, struct wiphy *wiphy, + struct cfg80211_bss_selection *bss_select) +{ + struct nlattr *attr[NL80211_BSS_SELECT_ATTR_MAX + 1]; + struct nlattr *nest; + int err; + bool found = false; + int i; + + /* only process one nested attribute */ + nest = nla_data(nla); + if (!nla_ok(nest, nla_len(nest))) + return -EINVAL; + + err = nla_parse(attr, NL80211_BSS_SELECT_ATTR_MAX, nla_data(nest), + nla_len(nest), nl80211_bss_select_policy); + if (err) + return err; + + /* only one attribute may be given */ + for (i = 0; i <= NL80211_BSS_SELECT_ATTR_MAX; i++) { + if (attr[i]) { + if (found) + return -EINVAL; + found = true; + } + } + + bss_select->behaviour = __NL80211_BSS_SELECT_ATTR_INVALID; + + if (attr[NL80211_BSS_SELECT_ATTR_RSSI]) + bss_select->behaviour = NL80211_BSS_SELECT_ATTR_RSSI; + + if (attr[NL80211_BSS_SELECT_ATTR_BAND_PREF]) { + bss_select->behaviour = NL80211_BSS_SELECT_ATTR_BAND_PREF; + bss_select->param.band_pref = + nla_get_u32(attr[NL80211_BSS_SELECT_ATTR_BAND_PREF]); + if (!is_band_valid(wiphy, bss_select->param.band_pref)) + return -EINVAL; + } + + if (attr[NL80211_BSS_SELECT_ATTR_RSSI_ADJUST]) { + struct nl80211_bss_select_rssi_adjust *adj_param; + + adj_param = nla_data(attr[NL80211_BSS_SELECT_ATTR_RSSI_ADJUST]); + bss_select->behaviour = NL80211_BSS_SELECT_ATTR_RSSI_ADJUST; + bss_select->param.adjust.band = adj_param->band; + bss_select->param.adjust.delta = adj_param->delta; + if (!is_band_valid(wiphy, bss_select->param.adjust.band)) + return -EINVAL; + } + + /* user-space did not provide behaviour attribute */ + if (bss_select->behaviour == __NL80211_BSS_SELECT_ATTR_INVALID) + return -EINVAL; + + if (!(wiphy->bss_select_support & BIT(bss_select->behaviour))) + return -EINVAL; + + return 0; +} + static int nl80211_parse_random_mac(struct nlattr **attrs, u8 *mac_addr, u8 *mac_addr_mask) { @@ -5888,10 +6028,10 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) i++; } } else { - enum ieee80211_band band; + enum nl80211_band band; /* all channels */ - for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + for (band = 0; band < NUM_NL80211_BANDS; band++) { int j; if (!wiphy->bands[band]) continue; @@ -5936,7 +6076,7 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) request->ie_len); } - for (i = 0; i < IEEE80211_NUM_BANDS; i++) + for (i = 0; i < NUM_NL80211_BANDS; i++) if (wiphy->bands[i]) request->rates[i] = (1 << wiphy->bands[i]->n_bitrates) - 1; @@ -5945,9 +6085,9 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) nla_for_each_nested(attr, info->attrs[NL80211_ATTR_SCAN_SUPP_RATES], tmp) { - enum ieee80211_band band = nla_type(attr); + enum nl80211_band band = nla_type(attr); - if (band < 0 || band >= IEEE80211_NUM_BANDS) { + if (band < 0 || band >= NUM_NL80211_BANDS) { err = -EINVAL; goto out_free; } @@ -5996,6 +6136,12 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) request->no_cck = nla_get_flag(info->attrs[NL80211_ATTR_TX_NO_CCK_RATE]); + if (info->attrs[NL80211_ATTR_MAC]) + memcpy(request->bssid, nla_data(info->attrs[NL80211_ATTR_MAC]), + ETH_ALEN); + else + eth_broadcast_addr(request->bssid); + request->wdev = wdev; request->wiphy = &rdev->wiphy; request->scan_start = jiffies; @@ -6129,7 +6275,7 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev, struct cfg80211_sched_scan_request *request; struct nlattr *attr; int err, tmp, n_ssids = 0, n_match_sets = 0, n_channels, i, n_plans = 0; - enum ieee80211_band band; + enum nl80211_band band; size_t ie_len; struct nlattr *tb[NL80211_SCHED_SCAN_MATCH_ATTR_MAX + 1]; s32 default_match_rssi = NL80211_SCAN_RSSI_THOLD_OFF; @@ -6294,7 +6440,7 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev, } } else { /* all channels */ - for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + for (band = 0; band < NUM_NL80211_BANDS; band++) { int j; if (!wiphy->bands[band]) continue; @@ -6738,7 +6884,8 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb, if (wdev->netdev && nla_put_u32(msg, NL80211_ATTR_IFINDEX, wdev->netdev->ifindex)) goto nla_put_failure; - if (nla_put_u64(msg, NL80211_ATTR_WDEV, wdev_id(wdev))) + if (nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), + NL80211_ATTR_PAD)) goto nla_put_failure; bss = nla_nest_start(msg, NL80211_ATTR_BSS); @@ -6759,7 +6906,8 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb, */ ies = rcu_dereference(res->ies); if (ies) { - if (nla_put_u64(msg, NL80211_BSS_TSF, ies->tsf)) + if (nla_put_u64_64bit(msg, NL80211_BSS_TSF, ies->tsf, + NL80211_BSS_PAD)) goto fail_unlock_rcu; if (ies->len && nla_put(msg, NL80211_BSS_INFORMATION_ELEMENTS, ies->len, ies->data)) @@ -6769,7 +6917,8 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb, /* and this pointer is always (unless driver didn't know) beacon data */ ies = rcu_dereference(res->beacon_ies); if (ies && ies->from_beacon) { - if (nla_put_u64(msg, NL80211_BSS_BEACON_TSF, ies->tsf)) + if (nla_put_u64_64bit(msg, NL80211_BSS_BEACON_TSF, ies->tsf, + NL80211_BSS_PAD)) goto fail_unlock_rcu; if (ies->len && nla_put(msg, NL80211_BSS_BEACON_IES, ies->len, ies->data)) @@ -6788,8 +6937,8 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb, goto nla_put_failure; if (intbss->ts_boottime && - nla_put_u64(msg, NL80211_BSS_LAST_SEEN_BOOTTIME, - intbss->ts_boottime)) + nla_put_u64_64bit(msg, NL80211_BSS_LAST_SEEN_BOOTTIME, + intbss->ts_boottime, NL80211_BSS_PAD)) goto nla_put_failure; switch (rdev->wiphy.signal_type) { @@ -6909,28 +7058,28 @@ static int nl80211_send_survey(struct sk_buff *msg, u32 portid, u32 seq, nla_put_flag(msg, NL80211_SURVEY_INFO_IN_USE)) goto nla_put_failure; if ((survey->filled & SURVEY_INFO_TIME) && - nla_put_u64(msg, NL80211_SURVEY_INFO_TIME, - survey->time)) + nla_put_u64_64bit(msg, NL80211_SURVEY_INFO_TIME, + survey->time, NL80211_SURVEY_INFO_PAD)) goto nla_put_failure; if ((survey->filled & SURVEY_INFO_TIME_BUSY) && - nla_put_u64(msg, NL80211_SURVEY_INFO_TIME_BUSY, - survey->time_busy)) + nla_put_u64_64bit(msg, NL80211_SURVEY_INFO_TIME_BUSY, + survey->time_busy, NL80211_SURVEY_INFO_PAD)) goto nla_put_failure; if ((survey->filled & SURVEY_INFO_TIME_EXT_BUSY) && - nla_put_u64(msg, NL80211_SURVEY_INFO_TIME_EXT_BUSY, - survey->time_ext_busy)) + nla_put_u64_64bit(msg, NL80211_SURVEY_INFO_TIME_EXT_BUSY, + survey->time_ext_busy, NL80211_SURVEY_INFO_PAD)) goto nla_put_failure; if ((survey->filled & SURVEY_INFO_TIME_RX) && - nla_put_u64(msg, NL80211_SURVEY_INFO_TIME_RX, - survey->time_rx)) + nla_put_u64_64bit(msg, NL80211_SURVEY_INFO_TIME_RX, + survey->time_rx, NL80211_SURVEY_INFO_PAD)) goto nla_put_failure; if ((survey->filled & SURVEY_INFO_TIME_TX) && - nla_put_u64(msg, NL80211_SURVEY_INFO_TIME_TX, - survey->time_tx)) + nla_put_u64_64bit(msg, NL80211_SURVEY_INFO_TIME_TX, + survey->time_tx, NL80211_SURVEY_INFO_PAD)) goto nla_put_failure; if ((survey->filled & SURVEY_INFO_TIME_SCAN) && - nla_put_u64(msg, NL80211_SURVEY_INFO_TIME_SCAN, - survey->time_scan)) + nla_put_u64_64bit(msg, NL80211_SURVEY_INFO_TIME_SCAN, + survey->time_scan, NL80211_SURVEY_INFO_PAD)) goto nla_put_failure; nla_nest_end(msg, infoattr); @@ -7402,14 +7551,14 @@ static int nl80211_disassociate(struct sk_buff *skb, struct genl_info *info) static bool nl80211_parse_mcast_rate(struct cfg80211_registered_device *rdev, - int mcast_rate[IEEE80211_NUM_BANDS], + int mcast_rate[NUM_NL80211_BANDS], int rateval) { struct wiphy *wiphy = &rdev->wiphy; bool found = false; int band, i; - for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + for (band = 0; band < NUM_NL80211_BANDS; band++) { struct ieee80211_supported_band *sband; sband = wiphy->bands[band]; @@ -7589,7 +7738,7 @@ static int nl80211_set_mcast_rate(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; - int mcast_rate[IEEE80211_NUM_BANDS]; + int mcast_rate[NUM_NL80211_BANDS]; u32 nla_rate; int err; @@ -7650,8 +7799,8 @@ __cfg80211_alloc_vendor_skb(struct cfg80211_registered_device *rdev, } if (wdev) { - if (nla_put_u64(skb, NL80211_ATTR_WDEV, - wdev_id(wdev))) + if (nla_put_u64_64bit(skb, NL80211_ATTR_WDEV, + wdev_id(wdev), NL80211_ATTR_PAD)) goto nla_put_failure; if (wdev->netdev && nla_put_u32(skb, NL80211_ATTR_IFINDEX, @@ -7922,6 +8071,10 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) connect.mfp = NL80211_MFP_NO; } + if (info->attrs[NL80211_ATTR_PREV_BSSID]) + connect.prev_bssid = + nla_data(info->attrs[NL80211_ATTR_PREV_BSSID]); + if (info->attrs[NL80211_ATTR_WIPHY_FREQ]) { connect.channel = nl80211_get_valid_chan( wiphy, info->attrs[NL80211_ATTR_WIPHY_FREQ]); @@ -7990,13 +8143,29 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) } connect.pbss = nla_get_flag(info->attrs[NL80211_ATTR_PBSS]); - if (connect.pbss && !rdev->wiphy.bands[IEEE80211_BAND_60GHZ]) { + if (connect.pbss && !rdev->wiphy.bands[NL80211_BAND_60GHZ]) { kzfree(connkeys); return -EOPNOTSUPP; } + if (info->attrs[NL80211_ATTR_BSS_SELECT]) { + /* bss selection makes no sense if bssid is set */ + if (connect.bssid) { + kzfree(connkeys); + return -EINVAL; + } + + err = parse_bss_select(info->attrs[NL80211_ATTR_BSS_SELECT], + wiphy, &connect.bss_select); + if (err) { + kzfree(connkeys); + return err; + } + } + wdev_lock(dev->ieee80211_ptr); - err = cfg80211_connect(rdev, dev, &connect, connkeys, NULL); + err = cfg80211_connect(rdev, dev, &connect, connkeys, + connect.prev_bssid); wdev_unlock(dev->ieee80211_ptr); if (err) kzfree(connkeys); @@ -8224,7 +8393,8 @@ static int nl80211_remain_on_channel(struct sk_buff *skb, if (err) goto free_msg; - if (nla_put_u64(msg, NL80211_ATTR_COOKIE, cookie)) + if (nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, cookie, + NL80211_ATTR_PAD)) goto nla_put_failure; genlmsg_end(msg, hdr); @@ -8394,7 +8564,7 @@ static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb, memset(&mask, 0, sizeof(mask)); /* Default to all rates enabled */ - for (i = 0; i < IEEE80211_NUM_BANDS; i++) { + for (i = 0; i < NUM_NL80211_BANDS; i++) { sband = rdev->wiphy.bands[i]; if (!sband) @@ -8418,14 +8588,14 @@ static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb, /* * The nested attribute uses enum nl80211_band as the index. This maps - * directly to the enum ieee80211_band values used in cfg80211. + * directly to the enum nl80211_band values used in cfg80211. */ BUILD_BUG_ON(NL80211_MAX_SUPP_HT_RATES > IEEE80211_HT_MCS_MASK_LEN * 8); nla_for_each_nested(tx_rates, info->attrs[NL80211_ATTR_TX_RATES], rem) { - enum ieee80211_band band = nla_type(tx_rates); + enum nl80211_band band = nla_type(tx_rates); int err; - if (band < 0 || band >= IEEE80211_NUM_BANDS) + if (band < 0 || band >= NUM_NL80211_BANDS) return -EINVAL; sband = rdev->wiphy.bands[band]; if (sband == NULL) @@ -8636,7 +8806,8 @@ static int nl80211_tx_mgmt(struct sk_buff *skb, struct genl_info *info) goto free_msg; if (msg) { - if (nla_put_u64(msg, NL80211_ATTR_COOKIE, cookie)) + if (nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, cookie, + NL80211_ATTR_PAD)) goto nla_put_failure; genlmsg_end(msg, hdr); @@ -9922,7 +10093,8 @@ static int nl80211_probe_client(struct sk_buff *skb, if (err) goto free_msg; - if (nla_put_u64(msg, NL80211_ATTR_COOKIE, cookie)) + if (nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, cookie, + NL80211_ATTR_PAD)) goto nla_put_failure; genlmsg_end(msg, hdr); @@ -10220,7 +10392,7 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb, *wdev = NULL; if (cb->args[1]) { - list_for_each_entry(tmp, &(*rdev)->wdev_list, list) { + list_for_each_entry(tmp, &wiphy->wdev_list, list) { if (tmp->identifier == cb->args[1] - 1) { *wdev = tmp; break; @@ -10347,8 +10519,9 @@ static int nl80211_vendor_cmd_dump(struct sk_buff *skb, break; if (nla_put_u32(skb, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || - (wdev && nla_put_u64(skb, NL80211_ATTR_WDEV, - wdev_id(wdev)))) { + (wdev && nla_put_u64_64bit(skb, NL80211_ATTR_WDEV, + wdev_id(wdev), + NL80211_ATTR_PAD))) { genlmsg_cancel(skb, hdr); break; } @@ -10590,7 +10763,7 @@ static int nl80211_tdls_channel_switch(struct sk_buff *skb, * section 10.22.6.2.1. Disallow 5/10Mhz channels as well for now, the * specification is not defined for them. */ - if (chandef.chan->band == IEEE80211_BAND_2GHZ && + if (chandef.chan->band == NL80211_BAND_2GHZ && chandef.width != NL80211_CHAN_WIDTH_20_NOHT && chandef.width != NL80211_CHAN_WIDTH_20) return -EINVAL; @@ -11555,7 +11728,8 @@ static int nl80211_send_scan_msg(struct sk_buff *msg, if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || (wdev->netdev && nla_put_u32(msg, NL80211_ATTR_IFINDEX, wdev->netdev->ifindex)) || - nla_put_u64(msg, NL80211_ATTR_WDEV, wdev_id(wdev))) + nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), + NL80211_ATTR_PAD)) goto nla_put_failure; /* ignore errors and send incomplete event anyway */ @@ -12222,11 +12396,13 @@ static void nl80211_send_remain_on_chan_event( if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || (wdev->netdev && nla_put_u32(msg, NL80211_ATTR_IFINDEX, wdev->netdev->ifindex)) || - nla_put_u64(msg, NL80211_ATTR_WDEV, wdev_id(wdev)) || + nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), + NL80211_ATTR_PAD) || nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, chan->center_freq) || nla_put_u32(msg, NL80211_ATTR_WIPHY_CHANNEL_TYPE, NL80211_CHAN_NO_HT) || - nla_put_u64(msg, NL80211_ATTR_COOKIE, cookie)) + nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, cookie, + NL80211_ATTR_PAD)) goto nla_put_failure; if (cmd == NL80211_CMD_REMAIN_ON_CHANNEL && @@ -12460,7 +12636,8 @@ int nl80211_send_mgmt(struct cfg80211_registered_device *rdev, if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || (netdev && nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex)) || - nla_put_u64(msg, NL80211_ATTR_WDEV, wdev_id(wdev)) || + nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), + NL80211_ATTR_PAD) || nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, freq) || (sig_dbm && nla_put_u32(msg, NL80211_ATTR_RX_SIGNAL_DBM, sig_dbm)) || @@ -12503,9 +12680,11 @@ void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || (netdev && nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex)) || - nla_put_u64(msg, NL80211_ATTR_WDEV, wdev_id(wdev)) || + nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), + NL80211_ATTR_PAD) || nla_put(msg, NL80211_ATTR_FRAME, len, buf) || - nla_put_u64(msg, NL80211_ATTR_COOKIE, cookie) || + nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, cookie, + NL80211_ATTR_PAD) || (ack && nla_put_flag(msg, NL80211_ATTR_ACK))) goto nla_put_failure; @@ -12885,7 +13064,8 @@ nl80211_radar_notify(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev = netdev->ieee80211_ptr; if (nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex) || - nla_put_u64(msg, NL80211_ATTR_WDEV, wdev_id(wdev))) + nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), + NL80211_ATTR_PAD)) goto nla_put_failure; } @@ -12930,7 +13110,8 @@ void cfg80211_probe_status(struct net_device *dev, const u8 *addr, if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex) || nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, addr) || - nla_put_u64(msg, NL80211_ATTR_COOKIE, cookie) || + nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, cookie, + NL80211_ATTR_PAD) || (acked && nla_put_flag(msg, NL80211_ATTR_ACK))) goto nla_put_failure; @@ -13075,7 +13256,8 @@ void cfg80211_report_wowlan_wakeup(struct wireless_dev *wdev, goto free_msg; if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || - nla_put_u64(msg, NL80211_ATTR_WDEV, wdev_id(wdev))) + nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), + NL80211_ATTR_PAD)) goto free_msg; if (wdev->netdev && nla_put_u32(msg, NL80211_ATTR_IFINDEX, @@ -13231,7 +13413,7 @@ static int nl80211_netlink_notify(struct notifier_block * nb, sched_scan_req->owner_nlportid == notify->portid) schedule_scan_stop = true; - list_for_each_entry_rcu(wdev, &rdev->wdev_list, list) { + list_for_each_entry_rcu(wdev, &rdev->wiphy.wdev_list, list) { cfg80211_mlme_unregister_socket(wdev, notify->portid); if (wdev->owner_nlportid == notify->portid) @@ -13350,7 +13532,8 @@ void cfg80211_crit_proto_stopped(struct wireless_dev *wdev, gfp_t gfp) goto nla_put_failure; if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || - nla_put_u64(msg, NL80211_ATTR_WDEV, wdev_id(wdev))) + nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), + NL80211_ATTR_PAD)) goto nla_put_failure; genlmsg_end(msg, hdr); @@ -13383,7 +13566,8 @@ void nl80211_send_ap_stopped(struct wireless_dev *wdev) if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || nla_put_u32(msg, NL80211_ATTR_IFINDEX, wdev->netdev->ifindex) || - nla_put_u64(msg, NL80211_ATTR_WDEV, wdev_id(wdev))) + nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), + NL80211_ATTR_PAD)) goto out; genlmsg_end(msg, hdr); diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 8ae0c04f9..85ff30bee 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -1048,7 +1048,7 @@ rdev_start_radar_detection(struct cfg80211_registered_device *rdev, static inline int rdev_set_mcast_rate(struct cfg80211_registered_device *rdev, struct net_device *dev, - int mcast_rate[IEEE80211_NUM_BANDS]) + int mcast_rate[NUM_NL80211_BANDS]) { int ret = -ENOTSUPP; diff --git a/net/wireless/reg.c b/net/wireless/reg.c index c5fb317ee..5dbac3749 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -1546,12 +1546,12 @@ static void reg_process_ht_flags_band(struct wiphy *wiphy, static void reg_process_ht_flags(struct wiphy *wiphy) { - enum ieee80211_band band; + enum nl80211_band band; if (!wiphy) return; - for (band = 0; band < IEEE80211_NUM_BANDS; band++) + for (band = 0; band < NUM_NL80211_BANDS; band++) reg_process_ht_flags_band(wiphy, wiphy->bands[band]); } @@ -1639,7 +1639,7 @@ static void reg_leave_invalid_chans(struct wiphy *wiphy) ASSERT_RTNL(); - list_for_each_entry(wdev, &rdev->wdev_list, list) + list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) if (!reg_wdev_chan_valid(wiphy, wdev)) cfg80211_leave(rdev, wdev); } @@ -1673,7 +1673,7 @@ static void reg_check_channels(void) static void wiphy_update_regulatory(struct wiphy *wiphy, enum nl80211_reg_initiator initiator) { - enum ieee80211_band band; + enum nl80211_band band; struct regulatory_request *lr = get_last_request(); if (ignore_reg_update(wiphy, initiator)) { @@ -1690,7 +1690,7 @@ static void wiphy_update_regulatory(struct wiphy *wiphy, lr->dfs_region = get_cfg80211_regdom()->dfs_region; - for (band = 0; band < IEEE80211_NUM_BANDS; band++) + for (band = 0; band < NUM_NL80211_BANDS; band++) handle_band(wiphy, initiator, wiphy->bands[band]); reg_process_beacons(wiphy); @@ -1786,14 +1786,14 @@ static void handle_band_custom(struct wiphy *wiphy, void wiphy_apply_custom_regulatory(struct wiphy *wiphy, const struct ieee80211_regdomain *regd) { - enum ieee80211_band band; + enum nl80211_band band; unsigned int bands_set = 0; WARN(!(wiphy->regulatory_flags & REGULATORY_CUSTOM_REG), "wiphy should have REGULATORY_CUSTOM_REG\n"); wiphy->regulatory_flags |= REGULATORY_CUSTOM_REG; - for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + for (band = 0; band < NUM_NL80211_BANDS; band++) { if (!wiphy->bands[band]) continue; handle_band_custom(wiphy, wiphy->bands[band], regd); @@ -2228,7 +2228,7 @@ static void reg_process_self_managed_hints(void) struct wiphy *wiphy; const struct ieee80211_regdomain *tmp; const struct ieee80211_regdomain *regd; - enum ieee80211_band band; + enum nl80211_band band; struct regulatory_request request = {}; list_for_each_entry(rdev, &cfg80211_rdev_list, list) { @@ -2246,7 +2246,7 @@ static void reg_process_self_managed_hints(void) rcu_assign_pointer(wiphy->regd, regd); rcu_free_regdom(tmp); - for (band = 0; band < IEEE80211_NUM_BANDS; band++) + for (band = 0; band < NUM_NL80211_BANDS; band++) handle_band_custom(wiphy, wiphy->bands[band], regd); reg_process_ht_flags(wiphy); @@ -2404,7 +2404,7 @@ int regulatory_hint(struct wiphy *wiphy, const char *alpha2) } EXPORT_SYMBOL(regulatory_hint); -void regulatory_hint_country_ie(struct wiphy *wiphy, enum ieee80211_band band, +void regulatory_hint_country_ie(struct wiphy *wiphy, enum nl80211_band band, const u8 *country_ie, u8 country_ie_len) { char alpha2[2]; @@ -2504,11 +2504,11 @@ static void restore_alpha2(char *alpha2, bool reset_user) static void restore_custom_reg_settings(struct wiphy *wiphy) { struct ieee80211_supported_band *sband; - enum ieee80211_band band; + enum nl80211_band band; struct ieee80211_channel *chan; int i; - for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + for (band = 0; band < NUM_NL80211_BANDS; band++) { sband = wiphy->bands[band]; if (!sband) continue; @@ -2623,9 +2623,9 @@ void regulatory_hint_disconnect(void) static bool freq_is_chan_12_13_14(u16 freq) { - if (freq == ieee80211_channel_to_frequency(12, IEEE80211_BAND_2GHZ) || - freq == ieee80211_channel_to_frequency(13, IEEE80211_BAND_2GHZ) || - freq == ieee80211_channel_to_frequency(14, IEEE80211_BAND_2GHZ)) + if (freq == ieee80211_channel_to_frequency(12, NL80211_BAND_2GHZ) || + freq == ieee80211_channel_to_frequency(13, NL80211_BAND_2GHZ) || + freq == ieee80211_channel_to_frequency(14, NL80211_BAND_2GHZ)) return true; return false; } @@ -2650,7 +2650,7 @@ int regulatory_hint_found_beacon(struct wiphy *wiphy, if (beacon_chan->beacon_found || beacon_chan->flags & IEEE80211_CHAN_RADAR || - (beacon_chan->band == IEEE80211_BAND_2GHZ && + (beacon_chan->band == NL80211_BAND_2GHZ && !freq_is_chan_12_13_14(beacon_chan->center_freq))) return 0; diff --git a/net/wireless/reg.h b/net/wireless/reg.h index 9f495d76e..f6ced316b 100644 --- a/net/wireless/reg.h +++ b/net/wireless/reg.h @@ -104,7 +104,7 @@ int regulatory_hint_found_beacon(struct wiphy *wiphy, * information for a band the BSS is not present in it will be ignored. */ void regulatory_hint_country_ie(struct wiphy *wiphy, - enum ieee80211_band band, + enum nl80211_band band, const u8 *country_ie, u8 country_ie_len); diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 14d5369eb..ef2955c89 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -364,13 +364,16 @@ const u8 *cfg80211_find_ie(u8 eid, const u8 *ies, int len) } EXPORT_SYMBOL(cfg80211_find_ie); -const u8 *cfg80211_find_vendor_ie(unsigned int oui, u8 oui_type, +const u8 *cfg80211_find_vendor_ie(unsigned int oui, int oui_type, const u8 *ies, int len) { struct ieee80211_vendor_ie *ie; const u8 *pos = ies, *end = ies + len; int ie_oui; + if (WARN_ON(oui_type > 0xff)) + return NULL; + while (pos < end) { pos = cfg80211_find_ie(WLAN_EID_VENDOR_SPECIFIC, pos, end - pos); @@ -386,7 +389,8 @@ const u8 *cfg80211_find_vendor_ie(unsigned int oui, u8 oui_type, goto cont; ie_oui = ie->oui[0] << 16 | ie->oui[1] << 8 | ie->oui[2]; - if (ie_oui == oui && ie->oui_type == oui_type) + if (ie_oui == oui && + (oui_type < 0 || ie->oui_type == oui_type)) return pos; cont: pos += 2 + ie->len; @@ -531,7 +535,7 @@ static int cmp_bss(struct cfg80211_bss *a, } static bool cfg80211_bss_type_match(u16 capability, - enum ieee80211_band band, + enum nl80211_band band, enum ieee80211_bss_type bss_type) { bool ret = true; @@ -540,7 +544,7 @@ static bool cfg80211_bss_type_match(u16 capability, if (bss_type == IEEE80211_BSS_TYPE_ANY) return ret; - if (band == IEEE80211_BAND_60GHZ) { + if (band == NL80211_BAND_60GHZ) { mask = WLAN_CAPABILITY_DMG_TYPE_MASK; switch (bss_type) { case IEEE80211_BSS_TYPE_ESS: @@ -1006,7 +1010,7 @@ cfg80211_inform_bss_data(struct wiphy *wiphy, if (!res) return NULL; - if (channel->band == IEEE80211_BAND_60GHZ) { + if (channel->band == NL80211_BAND_60GHZ) { bss_type = res->pub.capability & WLAN_CAPABILITY_DMG_TYPE_MASK; if (bss_type == WLAN_CAPABILITY_DMG_TYPE_AP || bss_type == WLAN_CAPABILITY_DMG_TYPE_PBSS) @@ -1089,7 +1093,7 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy, if (!res) return NULL; - if (channel->band == IEEE80211_BAND_60GHZ) { + if (channel->band == NL80211_BAND_60GHZ) { bss_type = res->pub.capability & WLAN_CAPABILITY_DMG_TYPE_MASK; if (bss_type == WLAN_CAPABILITY_DMG_TYPE_AP || bss_type == WLAN_CAPABILITY_DMG_TYPE_PBSS) @@ -1185,7 +1189,7 @@ int cfg80211_wext_siwscan(struct net_device *dev, struct iw_scan_req *wreq = NULL; struct cfg80211_scan_request *creq = NULL; int i, err, n_channels = 0; - enum ieee80211_band band; + enum nl80211_band band; if (!netif_running(dev)) return -ENETDOWN; @@ -1229,7 +1233,7 @@ int cfg80211_wext_siwscan(struct net_device *dev, /* translate "Scan on frequencies" request */ i = 0; - for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + for (band = 0; band < NUM_NL80211_BANDS; band++) { int j; if (!wiphy->bands[band]) @@ -1289,10 +1293,12 @@ int cfg80211_wext_siwscan(struct net_device *dev, creq->n_ssids = 0; } - for (i = 0; i < IEEE80211_NUM_BANDS; i++) + for (i = 0; i < NUM_NL80211_BANDS; i++) if (wiphy->bands[i]) creq->rates[i] = (1 << wiphy->bands[i]->n_bitrates) - 1; + eth_broadcast_addr(creq->bssid); + rdev->scan_req = creq; err = rdev_scan(rdev, creq); if (err) { diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 544558171..584fdc347 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -81,7 +81,7 @@ static int cfg80211_conn_scan(struct wireless_dev *wdev) return -ENOMEM; if (wdev->conn->params.channel) { - enum ieee80211_band band = wdev->conn->params.channel->band; + enum nl80211_band band = wdev->conn->params.channel->band; struct ieee80211_supported_band *sband = wdev->wiphy->bands[band]; @@ -93,11 +93,11 @@ static int cfg80211_conn_scan(struct wireless_dev *wdev) request->rates[band] = (1 << sband->n_bitrates) - 1; } else { int i = 0, j; - enum ieee80211_band band; + enum nl80211_band band; struct ieee80211_supported_band *bands; struct ieee80211_channel *channel; - for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + for (band = 0; band < NUM_NL80211_BANDS; band++) { bands = wdev->wiphy->bands[band]; if (!bands) continue; @@ -119,6 +119,8 @@ static int cfg80211_conn_scan(struct wireless_dev *wdev) wdev->conn->params.ssid_len); request->ssids[0].ssid_len = wdev->conn->params.ssid_len; + eth_broadcast_addr(request->bssid); + request->wdev = wdev; request->wiphy = &rdev->wiphy; request->scan_start = jiffies; @@ -221,7 +223,7 @@ void cfg80211_conn_work(struct work_struct *work) rtnl_lock(); - list_for_each_entry(wdev, &rdev->wdev_list, list) { + list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { if (!wdev->netdev) continue; @@ -490,8 +492,18 @@ static int cfg80211_sme_connect(struct wireless_dev *wdev, if (!rdev->ops->auth || !rdev->ops->assoc) return -EOPNOTSUPP; - if (wdev->current_bss) - return -EALREADY; + if (wdev->current_bss) { + if (!prev_bssid) + return -EALREADY; + if (prev_bssid && + !ether_addr_equal(prev_bssid, wdev->current_bss->pub.bssid)) + return -ENOTCONN; + cfg80211_unhold_bss(wdev->current_bss); + cfg80211_put_bss(wdev->wiphy, &wdev->current_bss->pub); + wdev->current_bss = NULL; + + cfg80211_sme_free(wdev); + } if (WARN_ON(wdev->conn)) return -EINPROGRESS; @@ -605,7 +617,7 @@ static bool cfg80211_is_all_idle(void) * count as new regulatory hints. */ list_for_each_entry(rdev, &cfg80211_rdev_list, list) { - list_for_each_entry(wdev, &rdev->wdev_list, list) { + list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { wdev_lock(wdev); if (wdev->conn || wdev->current_bss) is_all_idle = false; @@ -741,19 +753,32 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid, kfree(country_ie); } -void cfg80211_connect_result(struct net_device *dev, const u8 *bssid, - const u8 *req_ie, size_t req_ie_len, - const u8 *resp_ie, size_t resp_ie_len, - u16 status, gfp_t gfp) +/* Consumes bss object one way or another */ +void cfg80211_connect_bss(struct net_device *dev, const u8 *bssid, + struct cfg80211_bss *bss, const u8 *req_ie, + size_t req_ie_len, const u8 *resp_ie, + size_t resp_ie_len, u16 status, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_event *ev; unsigned long flags; + if (bss) { + /* Make sure the bss entry provided by the driver is valid. */ + struct cfg80211_internal_bss *ibss = bss_from_pub(bss); + + if (WARN_ON(list_empty(&ibss->list))) { + cfg80211_put_bss(wdev->wiphy, bss); + return; + } + } + ev = kzalloc(sizeof(*ev) + req_ie_len + resp_ie_len, gfp); - if (!ev) + if (!ev) { + cfg80211_put_bss(wdev->wiphy, bss); return; + } ev->type = EVENT_CONNECT_RESULT; if (bssid) @@ -768,6 +793,9 @@ void cfg80211_connect_result(struct net_device *dev, const u8 *bssid, ev->cr.resp_ie_len = resp_ie_len; memcpy((void *)ev->cr.resp_ie, resp_ie, resp_ie_len); } + if (bss) + cfg80211_hold_bss(bss_from_pub(bss)); + ev->cr.bss = bss; ev->cr.status = status; spin_lock_irqsave(&wdev->event_lock, flags); @@ -775,7 +803,7 @@ void cfg80211_connect_result(struct net_device *dev, const u8 *bssid, spin_unlock_irqrestore(&wdev->event_lock, flags); queue_work(cfg80211_wq, &rdev->event_work); } -EXPORT_SYMBOL(cfg80211_connect_result); +EXPORT_SYMBOL(cfg80211_connect_bss); /* Consumes bss object one way or another */ void __cfg80211_roamed(struct wireless_dev *wdev, diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c index 9cee02206..e46469bc1 100644 --- a/net/wireless/sysfs.c +++ b/net/wireless/sysfs.c @@ -91,7 +91,7 @@ static void cfg80211_leave_all(struct cfg80211_registered_device *rdev) { struct wireless_dev *wdev; - list_for_each_entry(wdev, &rdev->wdev_list, list) + list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) cfg80211_leave(rdev, wdev); } diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 09b242b09..3c1091ae6 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -110,7 +110,7 @@ conf->dot11MeshHWMPconfirmationInterval; \ } while (0) -#define CHAN_ENTRY __field(enum ieee80211_band, band) \ +#define CHAN_ENTRY __field(enum nl80211_band, band) \ __field(u16, center_freq) #define CHAN_ASSIGN(chan) \ do { \ @@ -125,7 +125,7 @@ #define CHAN_PR_FMT "band: %d, freq: %u" #define CHAN_PR_ARG __entry->band, __entry->center_freq -#define CHAN_DEF_ENTRY __field(enum ieee80211_band, band) \ +#define CHAN_DEF_ENTRY __field(enum nl80211_band, band) \ __field(u32, control_freq) \ __field(u32, width) \ __field(u32, center_freq1) \ @@ -1259,6 +1259,7 @@ TRACE_EVENT(rdev_connect, __field(bool, privacy) __field(u32, wpa_versions) __field(u32, flags) + MAC_ENTRY(prev_bssid) ), TP_fast_assign( WIPHY_ASSIGN; @@ -1270,13 +1271,14 @@ TRACE_EVENT(rdev_connect, __entry->privacy = sme->privacy; __entry->wpa_versions = sme->crypto.wpa_versions; __entry->flags = sme->flags; + MAC_ASSIGN(prev_bssid, sme->prev_bssid); ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: " MAC_PR_FMT ", ssid: %s, auth type: %d, privacy: %s, wpa versions: %u, " - "flags: %u", + "flags: %u, previous bssid: " MAC_PR_FMT, WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(bssid), __entry->ssid, __entry->auth_type, BOOL_TO_STR(__entry->privacy), - __entry->wpa_versions, __entry->flags) + __entry->wpa_versions, __entry->flags, MAC_PR_ARG(prev_bssid)) ); TRACE_EVENT(rdev_set_cqm_rssi_config, @@ -2645,7 +2647,7 @@ TRACE_EVENT(cfg80211_scan_done, TP_STRUCT__entry( __field(u32, n_channels) __dynamic_array(u8, ie, request ? request->ie_len : 0) - __array(u32, rates, IEEE80211_NUM_BANDS) + __array(u32, rates, NUM_NL80211_BANDS) __field(u32, wdev_id) MAC_ENTRY(wiphy_mac) __field(bool, no_cck) @@ -2656,7 +2658,7 @@ TRACE_EVENT(cfg80211_scan_done, memcpy(__get_dynamic_array(ie), request->ie, request->ie_len); memcpy(__entry->rates, request->rates, - IEEE80211_NUM_BANDS); + NUM_NL80211_BANDS); __entry->wdev_id = request->wdev ? request->wdev->identifier : 0; if (request->wiphy) @@ -2881,25 +2883,25 @@ TRACE_EVENT(rdev_start_radar_detection, TRACE_EVENT(rdev_set_mcast_rate, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, - int mcast_rate[IEEE80211_NUM_BANDS]), + int mcast_rate[NUM_NL80211_BANDS]), TP_ARGS(wiphy, netdev, mcast_rate), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY - __array(int, mcast_rate, IEEE80211_NUM_BANDS) + __array(int, mcast_rate, NUM_NL80211_BANDS) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; memcpy(__entry->mcast_rate, mcast_rate, - sizeof(int) * IEEE80211_NUM_BANDS); + sizeof(int) * NUM_NL80211_BANDS); ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " "mcast_rates [2.4GHz=0x%x, 5.2GHz=0x%x, 60GHz=0x%x]", WIPHY_PR_ARG, NETDEV_PR_ARG, - __entry->mcast_rate[IEEE80211_BAND_2GHZ], - __entry->mcast_rate[IEEE80211_BAND_5GHZ], - __entry->mcast_rate[IEEE80211_BAND_60GHZ]) + __entry->mcast_rate[NL80211_BAND_2GHZ], + __entry->mcast_rate[NL80211_BAND_5GHZ], + __entry->mcast_rate[NL80211_BAND_60GHZ]) ); TRACE_EVENT(rdev_set_coalesce, diff --git a/net/wireless/util.c b/net/wireless/util.c index 47b917841..b7d1592bd 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -47,7 +47,7 @@ u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband, if (WARN_ON(!sband)) return 1; - if (sband->band == IEEE80211_BAND_2GHZ) { + if (sband->band == NL80211_BAND_2GHZ) { if (scan_width == NL80211_BSS_CHAN_WIDTH_5 || scan_width == NL80211_BSS_CHAN_WIDTH_10) mandatory_flag = IEEE80211_RATE_MANDATORY_G; @@ -65,26 +65,26 @@ u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband, } EXPORT_SYMBOL(ieee80211_mandatory_rates); -int ieee80211_channel_to_frequency(int chan, enum ieee80211_band band) +int ieee80211_channel_to_frequency(int chan, enum nl80211_band band) { /* see 802.11 17.3.8.3.2 and Annex J * there are overlapping channel numbers in 5GHz and 2GHz bands */ if (chan <= 0) return 0; /* not supported */ switch (band) { - case IEEE80211_BAND_2GHZ: + case NL80211_BAND_2GHZ: if (chan == 14) return 2484; else if (chan < 14) return 2407 + chan * 5; break; - case IEEE80211_BAND_5GHZ: + case NL80211_BAND_5GHZ: if (chan >= 182 && chan <= 196) return 4000 + chan * 5; else return 5000 + chan * 5; break; - case IEEE80211_BAND_60GHZ: + case NL80211_BAND_60GHZ: if (chan < 5) return 56160 + chan * 2160; break; @@ -116,11 +116,11 @@ EXPORT_SYMBOL(ieee80211_frequency_to_channel); struct ieee80211_channel *__ieee80211_get_channel(struct wiphy *wiphy, int freq) { - enum ieee80211_band band; + enum nl80211_band band; struct ieee80211_supported_band *sband; int i; - for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + for (band = 0; band < NUM_NL80211_BANDS; band++) { sband = wiphy->bands[band]; if (!sband) @@ -137,12 +137,12 @@ struct ieee80211_channel *__ieee80211_get_channel(struct wiphy *wiphy, EXPORT_SYMBOL(__ieee80211_get_channel); static void set_mandatory_flags_band(struct ieee80211_supported_band *sband, - enum ieee80211_band band) + enum nl80211_band band) { int i, want; switch (band) { - case IEEE80211_BAND_5GHZ: + case NL80211_BAND_5GHZ: want = 3; for (i = 0; i < sband->n_bitrates; i++) { if (sband->bitrates[i].bitrate == 60 || @@ -155,7 +155,7 @@ static void set_mandatory_flags_band(struct ieee80211_supported_band *sband, } WARN_ON(want); break; - case IEEE80211_BAND_2GHZ: + case NL80211_BAND_2GHZ: want = 7; for (i = 0; i < sband->n_bitrates; i++) { if (sband->bitrates[i].bitrate == 10) { @@ -185,12 +185,12 @@ static void set_mandatory_flags_band(struct ieee80211_supported_band *sband, } WARN_ON(want != 0 && want != 3 && want != 6); break; - case IEEE80211_BAND_60GHZ: + case NL80211_BAND_60GHZ: /* check for mandatory HT MCS 1..4 */ WARN_ON(!sband->ht_cap.ht_supported); WARN_ON((sband->ht_cap.mcs.rx_mask[0] & 0x1e) != 0x1e); break; - case IEEE80211_NUM_BANDS: + case NUM_NL80211_BANDS: WARN_ON(1); break; } @@ -198,9 +198,9 @@ static void set_mandatory_flags_band(struct ieee80211_supported_band *sband, void ieee80211_set_bitrate_flags(struct wiphy *wiphy) { - enum ieee80211_band band; + enum nl80211_band band; - for (band = 0; band < IEEE80211_NUM_BANDS; band++) + for (band = 0; band < NUM_NL80211_BANDS; band++) if (wiphy->bands[band]) set_mandatory_flags_band(wiphy->bands[band], band); } @@ -651,7 +651,7 @@ __frame_add_frag(struct sk_buff *skb, struct page *page, struct skb_shared_info *sh = skb_shinfo(skb); int page_offset; - atomic_inc(&page->_count); + page_ref_inc(page); page_offset = ptr - page_address(page); skb_add_rx_frag(skb, sh->nr_frags, page, page_offset, len, size); } @@ -721,6 +721,8 @@ __ieee80211_amsdu_copy(struct sk_buff *skb, unsigned int hlen, * alignment since sizeof(struct ethhdr) is 14. */ frame = dev_alloc_skb(hlen + sizeof(struct ethhdr) + 2 + cur_len); + if (!frame) + return NULL; skb_reserve(frame, hlen + sizeof(struct ethhdr) + 2); skb_copy_bits(skb, offset, skb_put(frame, cur_len), cur_len); @@ -950,7 +952,7 @@ void cfg80211_process_wdev_events(struct wireless_dev *wdev) ev->cr.resp_ie, ev->cr.resp_ie_len, ev->cr.status, ev->cr.status == WLAN_STATUS_SUCCESS, - NULL); + ev->cr.bss); break; case EVENT_ROAMED: __cfg80211_roamed(wdev, ev->rm.bss, ev->rm.req_ie, @@ -986,7 +988,7 @@ void cfg80211_process_rdev_events(struct cfg80211_registered_device *rdev) ASSERT_RTNL(); - list_for_each_entry(wdev, &rdev->wdev_list, list) + list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) cfg80211_process_wdev_events(wdev); } @@ -1399,22 +1401,22 @@ size_t ieee80211_ie_split_ric(const u8 *ies, size_t ielen, EXPORT_SYMBOL(ieee80211_ie_split_ric); bool ieee80211_operating_class_to_band(u8 operating_class, - enum ieee80211_band *band) + enum nl80211_band *band) { switch (operating_class) { case 112: case 115 ... 127: case 128 ... 130: - *band = IEEE80211_BAND_5GHZ; + *band = NL80211_BAND_5GHZ; return true; case 81: case 82: case 83: case 84: - *band = IEEE80211_BAND_2GHZ; + *band = NL80211_BAND_2GHZ; return true; case 180: - *band = IEEE80211_BAND_60GHZ; + *band = NL80211_BAND_60GHZ; return true; } @@ -1560,7 +1562,7 @@ int cfg80211_validate_beacon_int(struct cfg80211_registered_device *rdev, if (!beacon_int) return -EINVAL; - list_for_each_entry(wdev, &rdev->wdev_list, list) { + list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { if (!wdev->beacon_interval) continue; if (wdev->beacon_interval != beacon_int) { @@ -1726,10 +1728,10 @@ int ieee80211_get_ratemask(struct ieee80211_supported_band *sband, unsigned int ieee80211_get_num_supported_channels(struct wiphy *wiphy) { - enum ieee80211_band band; + enum nl80211_band band; unsigned int n_channels = 0; - for (band = 0; band < IEEE80211_NUM_BANDS; band++) + for (band = 0; band < NUM_NL80211_BANDS; band++) if (wiphy->bands[band]) n_channels += wiphy->bands[band]->n_channels; diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index fd682832a..9f27221c8 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -25,42 +25,7 @@ int cfg80211_wext_giwname(struct net_device *dev, struct iw_request_info *info, char *name, char *extra) { - struct wireless_dev *wdev = dev->ieee80211_ptr; - struct ieee80211_supported_band *sband; - bool is_ht = false, is_a = false, is_b = false, is_g = false; - - if (!wdev) - return -EOPNOTSUPP; - - sband = wdev->wiphy->bands[IEEE80211_BAND_5GHZ]; - if (sband) { - is_a = true; - is_ht |= sband->ht_cap.ht_supported; - } - - sband = wdev->wiphy->bands[IEEE80211_BAND_2GHZ]; - if (sband) { - int i; - /* Check for mandatory rates */ - for (i = 0; i < sband->n_bitrates; i++) { - if (sband->bitrates[i].bitrate == 10) - is_b = true; - if (sband->bitrates[i].bitrate == 60) - is_g = true; - } - is_ht |= sband->ht_cap.ht_supported; - } - strcpy(name, "IEEE 802.11"); - if (is_a) - strcat(name, "a"); - if (is_b) - strcat(name, "b"); - if (is_g) - strcat(name, "g"); - if (is_ht) - strcat(name, "n"); - return 0; } EXPORT_WEXT_HANDLER(cfg80211_wext_giwname); @@ -143,7 +108,7 @@ int cfg80211_wext_giwrange(struct net_device *dev, { struct wireless_dev *wdev = dev->ieee80211_ptr; struct iw_range *range = (struct iw_range *) extra; - enum ieee80211_band band; + enum nl80211_band band; int i, c = 0; if (!wdev) @@ -215,7 +180,7 @@ int cfg80211_wext_giwrange(struct net_device *dev, } } - for (band = 0; band < IEEE80211_NUM_BANDS; band ++) { + for (band = 0; band < NUM_NL80211_BANDS; band ++) { struct ieee80211_supported_band *sband; sband = wdev->wiphy->bands[band]; @@ -265,11 +230,11 @@ int cfg80211_wext_freq(struct iw_freq *freq) * -EINVAL for impossible things. */ if (freq->e == 0) { - enum ieee80211_band band = IEEE80211_BAND_2GHZ; + enum nl80211_band band = NL80211_BAND_2GHZ; if (freq->m < 0) return 0; if (freq->m > 14) - band = IEEE80211_BAND_5GHZ; + band = NL80211_BAND_5GHZ; return ieee80211_channel_to_frequency(freq->m, band); } else { int i, div = 1000000; @@ -1245,7 +1210,7 @@ static int cfg80211_wext_siwrate(struct net_device *dev, maxrate = rate->value / 100000; } - for (band = 0; band < IEEE80211_NUM_BANDS; band++) { + for (band = 0; band < NUM_NL80211_BANDS; band++) { sband = wdev->wiphy->bands[band]; if (sband == NULL) continue; diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c index c753211cb..dbb2738e3 100644 --- a/net/wireless/wext-core.c +++ b/net/wireless/wext-core.c @@ -399,7 +399,10 @@ static int __init wireless_nlevent_init(void) if (err) return err; - return register_netdevice_notifier(&wext_netdev_notifier); + err = register_netdevice_notifier(&wext_netdev_notifier); + if (err) + unregister_pernet_subsys(&wext_pernet_ops); + return err; } subsys_initcall(wireless_nlevent_init); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 2cc7af858..d516845e1 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -809,7 +809,8 @@ static int copy_to_user_state_extra(struct xfrm_state *x, goto out; } if (x->lastused) { - ret = nla_put_u64(skb, XFRMA_LASTUSED, x->lastused); + ret = nla_put_u64_64bit(skb, XFRMA_LASTUSED, x->lastused, + XFRMA_PAD); if (ret) goto out; } @@ -1813,7 +1814,7 @@ static inline size_t xfrm_aevent_msgsize(struct xfrm_state *x) return NLMSG_ALIGN(sizeof(struct xfrm_aevent_id)) + nla_total_size(replay_size) - + nla_total_size(sizeof(struct xfrm_lifetime_cur)) + + nla_total_size_64bit(sizeof(struct xfrm_lifetime_cur)) + nla_total_size(sizeof(struct xfrm_mark)) + nla_total_size(4) /* XFRM_AE_RTHR */ + nla_total_size(4); /* XFRM_AE_ETHR */ @@ -1848,7 +1849,8 @@ static int build_aevent(struct sk_buff *skb, struct xfrm_state *x, const struct } if (err) goto out_cancel; - err = nla_put(skb, XFRMA_LTIME_VAL, sizeof(x->curlft), &x->curlft); + err = nla_put_64bit(skb, XFRMA_LTIME_VAL, sizeof(x->curlft), &x->curlft, + XFRMA_PAD); if (err) goto out_cancel; @@ -2617,7 +2619,7 @@ static inline size_t xfrm_sa_len(struct xfrm_state *x) l += nla_total_size(sizeof(x->props.extra_flags)); /* Must count x->lastused as it may become non-zero behind our back. */ - l += nla_total_size(sizeof(u64)); + l += nla_total_size_64bit(sizeof(u64)); return l; } |